# Titanic Dataset EDA

## 1. Imports

In [399]:
from ydata_profiling import ProfileReport
import category_encoders as ce
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import tqdm

## 2. DataFrame initialization 

In [400]:
df = pd.read_csv(r'.\data\train.csv')

In [401]:
# Index reset in order for it start from 0 (so subtracting 1 from PassengerId)
df["PassengerId"] = df["PassengerId"] - 1

# Setting PassengerId as index
df = df.set_index("PassengerId")

In [402]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 3. EDA

In [403]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [404]:
for el in df.columns:
    print(el, df[el].nunique())

Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3


### 3.1 Data Imputation

#### 3.1.1 Age

In [405]:
# Save original name column to use it later
original_name_column = df["Name"].copy()

In [406]:
# Getting the title from the name
df["Name"] = df["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
print(df["Name"].value_counts())

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Name, dtype: int64


In [407]:
# Store the title in a list
type_of_passenger = (df["Name"].value_counts()).index.tolist()

In [408]:
# Imputation of the missing values per title
for pass_type in type_of_passenger:
    passenger_category = df['Name'] == pass_type
    passenger_type_df = df[passenger_category]
    where_na = passenger_type_df['Age'].isna()
    median_fill = df.loc[passenger_category, 'Age'].median()
    passenger_type_df.loc[where_na, 'Age'] = median_fill 
    df[passenger_category] = passenger_type_df


In [409]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [410]:
# Setting back the original name column
df["Name"] = original_name_column
print(df["Name"])

PassengerId
0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object


#### 3.1.2 Cabin

In [411]:
# Despite the fact that the cabin column has a lot of missing values, we can still extract some information from it
# We can see that the first letter of the cabin is the deck where the passenger was located
# We can extract this information and use it as a feature
df["Cabin"] = df["Cabin"].str[0]
print(df["Cabin"].value_counts())

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64


In [412]:
# But we still have a lot of missing values
print("\nMissing values in Cabin column: ", df["Cabin"].isna().sum())


Missing values in Cabin column:  687


In [413]:
# We can see that the most of the missing values are from the 3rd class
# We can impute the missing values with the most frequent value of the 3rd class

# Printing how many of the na values of the cabin column are of which class
# print(df[df["Cabin"].isna()]["Pclass"].value_counts())

# We can see that the most of the missing values are from the 3rd class
# We can impute those values with the most frequent values of the 3rd class dividing them across the decks

# For now we will just drop the column
df.drop("Cabin", axis=1, inplace=True)


#### 3.1.3 Embarked

In [414]:
# Seeing the most common values for the Embarked column
print(df["Embarked"].value_counts())
print("-"*80, "\n", "Unknowk embark port:", "\n")
print(df[df["Embarked"].isna()])

S    644
C    168
Q     77
Name: Embarked, dtype: int64
-------------------------------------------------------------------------------- 
 Unknowk embark port: 

             Survived  Pclass                                       Name  \
PassengerId                                                                
61                  1       1                        Icard, Miss. Amelie   
829                 1       1  Stone, Mrs. George Nelson (Martha Evelyn)   

                Sex   Age  SibSp  Parch  Ticket  Fare Embarked  
PassengerId                                                     
61           female  38.0      0      0  113572  80.0      NaN  
829          female  62.0      0      0  113572  80.0      NaN  


In [418]:
# Setting the missing values to the most common value
df["Embarked"] = df["Embarked"].fillna("S")
print(f'Na values: {len(df[df["Embarked"].isna()])}')

Na values: 0


#### 3.1.3 Name

In [420]:
# Since name column is not useful for the model, we can drop it
df.drop("Name", axis=1, inplace=True)

#### 3.1.4 Tickets

In [421]:
# We can see that the ticket column has a lot of unique values, so it is not useful for the model
# We can drop it
df.drop("Ticket", axis=1, inplace=True)

#### 3.1.5 Sex

In [423]:
# Substituting "male" and "female" with "M" and "F" respectively
replace_dictionary = {"male": "M", "female": "F"}
df["Sex"] = df["Sex"].replace(replace_dictionary)

#### 3.1.x Summary

In [425]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 94.9+ KB


In [426]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,3,M,22.0,1,0,7.25,S
1,1,1,F,38.0,1,0,71.2833,C
2,1,3,F,26.0,0,0,7.925,S
3,1,1,F,35.0,1,0,53.1,S
4,0,3,M,35.0,0,0,8.05,S


## 4. Setting the proper dtypes

In [458]:
# Setting the categorical columns as category type
df["Survived"] = df["Survived"].astype("category")
df["Pclass"] = df["Pclass"].astype("category")
df["Sex"] = df["Sex"].astype("category")
df["Embarked"] = df["Embarked"].astype("category")

# Setting the numerical columns as float or int type
df["Age"] = df["Age"].astype("int8")
df["SibSp"] = df["SibSp"].astype("int8")
df["Parch"] = df["Parch"].astype("int8")
df["Fare"] = df["Fare"].astype("float16")


In [459]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    category
 1   Pclass    891 non-null    category
 2   Sex       891 non-null    category
 3   Age       891 non-null    int8    
 4   SibSp     891 non-null    int8    
 5   Parch     891 non-null    int8    
 6   Fare      891 non-null    float16 
 7   Embarked  891 non-null    category
dtypes: category(4), float16(1), int8(3)
memory usage: 47.6 KB


## 5. Creating Report

In [178]:
profile = ProfileReport(df, title ='Titanic Report')

In [183]:
profile.to_file(output_file='titanic_report.html')

Export report to file: 100%|██████████| 1/1 [00:00<00:00, 50.64it/s]
