# Importing Required Libraries

In [193]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import matplotlib.pyplot as plt


# Directories and Opening

In [31]:
# Root folder that holds the Titanic CSV files.
# The original machine-specific location is kept as the default, so existing runs behave
# exactly as before; set the TITANIC_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
dataset_root_dir = os.environ.get(
    'TITANIC_DATA_DIR',
    r'E:\University of Kerman\Term 7\Machine Learning\HomeWorks_Repo\ML-2024\Analyzing Titanic Survival Rates\Dataset\titanic',
)

# Full paths of the three CSV files read by the notebook.
train_dataset_dir = os.path.join(dataset_root_dir, 'train.csv')
test_dataset_dir = os.path.join(dataset_root_dir, 'test.csv')
target_dataset_dir = os.path.join(dataset_root_dir, 'gender_submission.csv')

In [32]:
# Read the three CSV files in one pass; each frame is indexed by `PassengerId`.
train_df, test_df, target_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_dataset_dir, test_dataset_dir, target_dataset_dir)
)

# Load and explore

## Concatenating the train and test data frames
In this section I concatenate the train and test data frames: because we are not going to train any model, a separate test set is not needed.

There is **no** `Survived` column in the test data, so we first have to join the test dataset with the gender_submission dataset on the `PassengerId` column, which acts as a foreign key to the test dataset.

In [34]:
# Preview the first five rows of the gender_submission frame.
target_df.head(n=5)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [35]:
# Preview the first five rows of the test frame (note: no `Survived` column yet).
test_df.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [44]:
# Attach the `Survived` column from gender_submission to the test rows, matching on the
# shared `PassengerId` index (default left join: every test row is kept).
# Dropping any existing `Survived` column first makes this cell safe to re-run; without
# it, a second execution raises "columns overlap but no suffix specified".
# NOTE(review): on Kaggle, gender_submission.csv is a sample submission, not ground
# truth. The preview shown for `target_df` is consistent with that (females 1, males 0).
# Confirm these labels are acceptable for a survival-rate analysis.
test_df = test_df.drop(columns='Survived', errors='ignore').join(target_df, on='PassengerId')

In [159]:
# Stack the training rows on top of the test rows to get one frame for the analysis.
df = pd.concat(objs=[train_df, test_df], axis=0)

## A view of the data frame

In [160]:
# First five rows of the combined frame.
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [161]:
# Last five rows of the combined frame (these come from the test set).
df.tail(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


# Data Cleaning

## Handling null values

### Overview on missing values

I will drop the columns `Name`, `Cabin` and `Ticket` because they are obviously not related to the survival of the passenger.

In [162]:
# Remove the columns that are not used in the rest of the analysis.
df = df.drop(columns=['Cabin', 'Ticket', 'Name'])

In [163]:
# Number of missing values per column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

To handle the null values in `Fare`, we can use the **mean** of `Fare` grouped by `Pclass` and `Sex`.

In [164]:
# Average fare for every (Pclass, Sex) combination; missing fares are skipped by the mean.
grouped_series = df.groupby(by=['Pclass', 'Sex'])['Fare'].agg('mean')
grouped_series

Pclass  Sex   
1       female    109.412385
        male       69.888385
2       female     23.234827
        male       19.904946
3       female     15.324250
        male       12.415462
Name: Fare, dtype: float64

In [165]:
# Fill each missing `Fare` with the floored mean fare of the passenger's own
# (Pclass, Sex) group.
# `transform('mean')` returns one group mean per row (ignoring NaN fares), so this works
# for zero, one or many missing fares. The previous `.values[0]` lookup raised an
# IndexError when nothing was missing (e.g. when the cell was re-run) and gave every
# missing row the first row's group mean.
group_mean_fare = df.groupby(['Pclass', 'Sex'])['Fare'].transform('mean')
df['Fare'] = df['Fare'].fillna(np.floor(group_mean_fare))

To handle the `Embarked` column, we group by `Pclass` and replace the `NaN` values with the `Embarked` value that has the largest number of entries.

In [185]:
# Passenger count per (Pclass, Embarked) pair; `SibSp` has no nulls, so counting it
# gives the group size.
embarked_counts = df.groupby(by=['Pclass', 'Embarked'])['SibSp'].count()
embarked_counts

Pclass  Embarked
1       C           141
        Q             3
        S           177
2       C            28
        Q             7
        S           242
3       C           101
        Q           113
        S           495
Name: SibSp, dtype: int64

In [186]:
# Replace the missing ports of embarkation with 'S'.
embarked_missing = df['Embarked'].isna()
df.loc[embarked_missing, 'Embarked'] = 'S'

Converting Categorical data to numerical data

In [198]:
# Inspect the column dtypes to see which columns are still non-numeric (`object`).
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [203]:
# Mark the two text columns as categorical so they can be turned into integer codes.
for column_name in ('Sex', 'Embarked'):
    df[column_name] = df[column_name].astype('category')

In [210]:
# Replace every categorical column with the integer code of its category.
cat_columns = df.select_dtypes(include=['category']).columns
for column_name in cat_columns:
    df[column_name] = df[column_name].cat.codes

We use the `IterativeImputer` method to fill the missing `Age` values.

In [215]:
# Missing values per column before the imputation.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [260]:
# Columns given to the imputer: the other three are used to estimate `Age`.
impute_columns = ['Pclass', 'Sex', 'Fare', 'Age']
X = df[impute_columns]

# Pinning `random_state` keeps the result reproducible if stochastic options
# (e.g. `sample_posterior=True` or a random imputation order) are enabled later.
imputer = IterativeImputer(random_state=0)
imp_vals = imputer.fit_transform(X)

# Write back only the `Age` column. Assigning the whole float array, as before, also
# overwrote `Pclass` and `Sex` and silently turned those integer columns into float64.
df['Age'] = imp_vals[:, impute_columns.index('Age')]

# Floor every age above 1 to a whole number of years; ages of 1 or less are kept as they
# are, so fractional infant ages survive.
# NOTE(review): this also floors originally observed ages such as 34.5, not only the
# imputed ones — confirm that is intended.
df['Age'] = df['Age'].where(df['Age'] <= 1, np.floor(df['Age']))

In [261]:
# Check the per-column missing-value counts again after the imputation.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64