# 1. Data Loading and Setup

In [2]:
import pandas as pd 

In [26]:
# Read the Titanic passenger table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')

In [27]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 2. Data Cleaning

In [35]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [40]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

0

In [33]:
# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

There are 177 missing values in the Age column; we will fill them with the column mean.

In [44]:
# Impute missing ages with the mean of the observed ages.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df['Age']` selection: the in-place form mutates
# an intermediate object and is not guaranteed to write through to `df`
# (it is deprecated and has no effect under pandas Copy-on-Write).
df['Age'] = df['Age'].fillna(df['Age'].mean())

Also, the Cabin column has 687 missing values (out of 891 rows), so we will simply drop the column.

In [51]:
# Cabin is mostly empty, so remove the column altogether.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after Cabin is already gone is a no-op
# instead of raising KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

Also, Embarked has 2 missing values; we will replace them with the mode (the most frequent value).

In [58]:
# Impute the missing Embarked entries with the most frequent value.
# mode() returns a Series (there can be ties), so [0] picks the first mode.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df['Embarked']` selection, which mutates an
# intermediate object and is not guaranteed to update `df` itself.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [59]:
# Re-count missing values per column to verify the cleaning steps.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# 3. Data Exploration

**1. Survival Rate**

In [61]:
# Mean of the Survived flag, expressed as a percentage.
100 * df['Survived'].mean()

38.38383838383838

**2. Class distribution**

In [66]:
# Passenger count per ticket class, most frequent class first.
df.loc[:, 'Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

**3. Average age**

In [67]:
# Mean passenger age (computed on the Age column after the gaps were filled).
df.loc[:, 'Age'].mean()

29.69911764705882

**4. Ports**

In [70]:
# Distinct embarkation codes, in order of first appearance.
pd.unique(df['Embarked'])

array(['S', 'C', 'Q'], dtype=object)

# 4. Data Transformation

In [71]:
# Party size: SibSp + Parch, plus one for the passenger
# (assumes those two columns count relatives travelling with the passenger).
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# Numeric encoding of Sex; any value other than 'male'/'female' maps to NaN.
df['Sex_num'] = df['Sex'].map({'male': 0, 'female': 1})

# Age bands are right-closed: Child = [0, 18], Adult = (18, 60], Senior = (60, 100].
# include_lowest=True closes the first band on the left, so an age of exactly 0
# is labelled 'Child' instead of silently becoming NaN. Ages above 100 still
# fall outside every band and yield NaN.
bins = [0, 18, 60, 100]
labels = ['Child', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

New features created: FamilySize, Sex_num (a numeric encoding of Sex), and AgeGroup.

In [72]:
# Show the first three rows to confirm the new columns were added.
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,Sex_num,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2,0,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2,1,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,1,Adult


# 5. Data Selection & Filtering

In [74]:
# First five passengers older than 50.
df.loc[df['Age'].gt(50)].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,Sex_num,AgeGroup
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S,1,0,Adult
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,S,1,1,Adult
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,S,1,1,Adult
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,S,1,0,Senior
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,C,2,0,Senior


In [75]:
# First five passengers whose fare exceeded 100.
df.loc[df['Fare'].gt(100)].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,Sex_num,AgeGroup
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,S,6,0,Adult
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,29.699118,1,0,PC 17569,146.5208,C,2,1,Adult
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,S,6,1,Adult
118,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,C,2,0,Adult
195,196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,C,1,1,Adult


In [76]:
# Names of the first five female passengers travelling in class 1.
# Row mask and column are selected in a single .loc call.
df.loc[df['Sex'].eq('female') & df['Pclass'].eq(1), 'Name'].head()

1     Cumings, Mrs. John Bradley (Florence Briggs Th...
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)
11                             Bonnell, Miss. Elizabeth
31       Spencer, Mrs. William Augustus (Marie Eugenie)
52             Harper, Mrs. Henry Sleeper (Myna Haxtun)
Name: Name, dtype: object

# 6. Data Aggregation & Grouping

In [79]:
# Survival rate per sex, as a percentage.
df.groupby('Sex')['Survived'].mean().mul(100)

Sex
female    74.203822
male      18.890815
Name: Survived, dtype: float64

In [83]:
# Mean age within each ticket class (Age here includes the mean-imputed values).
df.groupby('Pclass')['Age'].agg('mean')

Pclass
1    37.048118
2    29.866958
3    26.403259
Name: Age, dtype: float64

In [84]:
# Survival rate per embarkation code, as a percentage.
df.groupby('Embarked')['Survived'].mean().mul(100)

Embarked
C    55.357143
Q    38.961039
S    33.900929
Name: Survived, dtype: float64

In [85]:
# Mean fare paid within each ticket class.
df.groupby('Pclass')['Fare'].agg('mean')

Pclass
1    84.154687
2    20.662183
3    13.675550
Name: Fare, dtype: float64