# Pandas Intro Exercise

In [3]:
# Make Sure Pandas is Installed 

try:
    import pandas as pd
except ImportError:
    import sys
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
    import pandas as pd

### Note
Titanic Dataset CSV will be used

Refer to [Titanic Dataset](https://www.kaggle.com/datasets/yasserh/titanic-dataset?resource=download) From Kaggle to Download

In [4]:
titanic_df = pd.read_csv(r".\Data\Titanic-Dataset.csv")

In [14]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Q1: Create a Pandas Series named survival_series using the Survived column of the DataFrame

In [8]:
# First method
# We Use .copy() method in order to take a deep copy and have no effect on original dataset
survival_series = titanic_df['Survived'].copy()

# Second Method
# Use pd.Series Method
survival_series_v2 = pd.Series(titanic_df['Survived'], name="survival_series")

print("First Method : \n", survival_series)
print("\nSecond Method : \n", survival_series)

First Method : 
 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

Second Method : 
 0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


### Q2: Display the last 3 rows of the DataFrame and check the data type of the Age column

In [16]:
# Display Last 3 Rows of The Data Frame
last3 = titanic_df.tail(3)

# Print Rows
# Converted to String for Better Formatting
print("Last 3 Rows : \n", last3.to_string())

# Print Type Of Age Columns
print("\n\nType of Age Column : ", titanic_df['Age'].dtype)



Last 3 Rows : 
      PassengerId  Survived  Pclass                                      Name     Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1      2  W./C. 6607  23.45   NaN        S
889          890         1       1                     Behr, Mr. Karl Howell    male  26.0      0      0      111369  30.00  C148        C
890          891         0       3                       Dooley, Mr. Patrick    male  32.0      0      0      370376   7.75   NaN        Q


Type of Age Column :  float64


### Q3 :  Select the Name and Age columns, then filter rows where the passenger’s age is greater than 30 

In [20]:
# Select Name Age Columns
data = titanic_df[['Age', 'Name']]

# Use Boolean Mask Filter
filt = data['Age'] > 30

# Print Date
print(data[filt])

      Age                                               Name
1    38.0  Cumings, Mrs. John Bradley (Florence Briggs Th...
3    35.0       Futrelle, Mrs. Jacques Heath (Lily May Peel)
4    35.0                           Allen, Mr. William Henry
6    54.0                            McCarthy, Mr. Timothy J
11   58.0                           Bonnell, Miss. Elizabeth
..    ...                                                ...
873  47.0                        Vander Cruyssen, Mr. Victor
879  56.0      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
881  33.0                                 Markun, Mr. Johann
885  39.0               Rice, Mrs. William (Margaret Norton)
890  32.0                                Dooley, Mr. Patrick

[305 rows x 2 columns]


### Q4: Add a new column FamilySize to the DataFrame, calculated as SibSp + Parch + 1 (including the passenger themselves)

In [22]:
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1

titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,4
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


### Q5: Fill missing values in the Age column with the mean age then Group the data by Pclass and calculate the average age for each class

In [None]:
# Get Age Mean and Round To 2 Decimal Places
mean = round(titanic_df['Age'].mean(), 2)

# Fill NaN Values - Store Them in A new Column
titanic_df['Age_Filled'] = titanic_df['Age'].fillna(mean)

# Group By Pclass - Use agg to get mean
titanic_df.groupby('Pclass').agg({'Age_Filled': 'mean'})

Unnamed: 0_level_0,Age_Filled
Pclass,Unnamed: 1_level_1
1,37.048241
2,29.867011
3,26.403503


### QBonus: Calculate the survival rate (mean of Survived) for each gender (Sex)

#### Formula For Calculating Rate

For each gender:

$$
\text{Survival Rate}_{\text{gender}} = \frac{\text{Number of Survivors in that gender}}{\text{Total number of passengers in that gender}}
$$

In [None]:
# Fucntion To calculate Rate of Survival
calc_survival_rate =  lambda x: (x.sum() / x.count())

# Use Function in Survived To Calculate For Each Gender (It Calculates For Series and Each Gender will have a series)
titanic_df.groupby('Sex').agg({'Survived' : calc_survival_rate})

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908
