# Titanic Dataset EDA

## 1. Imports

In [None]:
from ydata_profiling import ProfileReport
import category_encoders as ce
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import tqdm

## 2. DataFrame initialization 

In [None]:
from pathlib import Path

# Load the training split. The path is built from its components so the
# notebook also runs on Linux/macOS: the previous literal r'.\data\train.csv'
# relied on Windows-only backslash separators. The location is relative to
# the notebook's working directory.
df = pd.read_csv(Path("data") / "train.csv")

In [None]:
# Shift PassengerId down by one so that it starts from 0, then promote it
# to the DataFrame index.
df = (
    df.assign(PassengerId=df["PassengerId"] - 1)
      .set_index("PassengerId")
)

In [None]:
# Preview the first rows of the re-indexed DataFrame
df.head()

## 3. EDA

In [None]:
# Summary of columns, dtypes and non-null counts before any cleaning
df.info()

In [None]:
# Print every column name next to its number of distinct (non-null) values
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

### 3.1 Data Imputation

#### 3.1.1 Age

In [None]:
# Keep an independent copy of the full names: the Name column is overwritten
# with titles in the next step and restored from this copy afterwards
original_name_column = df.loc[:, "Name"].copy(deep=True)

In [None]:
# Extract the title from the name: take the piece after the first ", ",
# then keep what precedes the first "." in that piece
after_surname = df["Name"].str.split(", ", expand=True)[1]
df["Name"] = after_surname.str.split(".", expand=True)[0]

# Show how often each title occurs
title_counts = df["Name"].value_counts()
print(title_counts)

In [None]:
# Distinct titles as a plain list, ordered from most to least frequent
type_of_passenger = list(df["Name"].value_counts().index)

In [None]:
# Impute the missing ages with the median age of the passengers that share
# the same title. At this point df["Name"] holds the extracted title, not the
# full name.
#
# groupby/transform yields a Series aligned with df's index, so the fill is a
# single assignment on df itself. The previous per-title loop modified a
# filtered slice of df (triggering SettingWithCopyWarning) and then wrote the
# slice back through a boolean mask.
overall_median_age = df["Age"].median()
median_age_by_title = df.groupby("Name")["Age"].transform("median")
df["Age"] = df["Age"].fillna(median_age_by_title)

# A title whose ages are all missing (or a row without a title) has no group
# median; fall back to the overall median so that no NaN survives this step.
df["Age"] = df["Age"].fillna(overall_median_age)


In [None]:
# Re-check the non-null counts after the Age imputation
df.info()

In [None]:
# Restore the full passenger names from the copy saved before the column
# was overwritten with titles, then print the column to verify
df["Name"] = original_name_column
print(df["Name"])

#### 3.1.2 Cabin

In [None]:
# The Cabin column has many missing values, but it still carries usable
# information: the first letter of a cabin is the deck where the passenger
# was located, so only that letter is kept as the feature.
deck_letter = df["Cabin"].str.get(0)
df["Cabin"] = deck_letter

# Number of passengers per deck (missing cabins are not counted)
print(df["Cabin"].value_counts())

In [None]:
# The deck letter does not solve the missing values: count how many remain
missing_cabin_count = df["Cabin"].isna().sum()
print("\nMissing values in Cabin column: ", missing_cabin_count)

In [None]:
# Most of the missing cabins belong to 3rd-class passengers. To see how the
# missing values are split by class, run:
# print(df[df["Cabin"].isna()]["Pclass"].value_counts())
#
# A possible imputation would be to fill those rows with the most frequent
# decks of the 3rd class, dividing them across the decks.
#
# For now the column is simply dropped.
df.drop(columns=["Cabin"], inplace=True)


#### 3.1.3 Embarked

In [None]:
# Frequency of each known embarkation port
print(df["Embarked"].value_counts())

# Separator followed by the rows whose embarkation port is missing
# (the heading previously contained the typo "Unknowk")
print("-"*80, "\n", "Unknown embark port:", "\n")
print(df[df["Embarked"].isna()])

In [None]:
# Fill the missing ports with the most common port. The value is computed
# from the data rather than hard-coded; "S" (the previously hard-coded value)
# is kept only as the fallback for a column with no known port at all.
embarked_modes = df["Embarked"].mode()
most_common_port = embarked_modes.iloc[0] if not embarked_modes.empty else "S"
df["Embarked"] = df["Embarked"].fillna(most_common_port)

# Sanity check: no missing port should remain
print(f'Na values: {df["Embarked"].isna().sum()}')

#### 3.1.3 Name

In [None]:
# The Name column is considered not useful for the model, so it is removed
df.drop(columns=["Name"], inplace=True)

#### 3.1.4 Tickets

In [None]:
# The Ticket column has a lot of unique values, which makes it not useful
# for the model, so it is removed
df.drop(columns=["Ticket"], inplace=True)

#### 3.1.5 Sex

In [None]:
# Shorten the labels: "male" becomes "M" and "female" becomes "F";
# any other value is left untouched by replace()
replace_dictionary = dict(male="M", female="F")
df["Sex"] = df["Sex"].replace(to_replace=replace_dictionary)

#### 3.1.x Summary

In [None]:
# Remaining columns and their non-null counts after imputing and dropping
df.info()

In [None]:
# Preview the first rows of the cleaned DataFrame
df.head()

## 4. Setting the proper dtypes

In [None]:
# Setting the categorical columns as category type
for categorical_column in ("Survived", "Pclass", "Sex", "Embarked"):
    df[categorical_column] = df[categorical_column].astype("category")

# Setting the integer columns as int8 (range -128..127).
# NOTE: casting Age to an integer type truncates any fractional age and
# raises if the column still contains NaN, so Age must be fully imputed first.
df["Age"] = df["Age"].astype("int8")
df["SibSp"] = df["SibSp"].astype("int8")
df["Parch"] = df["Parch"].astype("int8")

# Fare is stored as float32 rather than float16: half precision keeps only
# about three significant decimal digits (a value such as 7.8958 would be
# rounded to roughly 7.895, and values above 512 are stored in steps of 0.5),
# and float16 has limited support across pandas operations.
df["Fare"] = df["Fare"].astype("float32")


In [None]:
# Confirm that the dtype conversions took effect
df.info()

## 5. Creating Report

In [None]:
# Build the profiling report for the cleaned DataFrame
report_title = "Titanic Report"
profile = ProfileReport(df, title=report_title)

In [None]:
# Write the report as an HTML file in the current working directory
profile.to_file("titanic_report.html")