# Titanic data

This dataset comes from [here](https://www.kaggle.com/c/titanic/overview) and will be used for the tutorials.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load the dataset

Make sure to save the dataset in the parent directory or adjust the file path below.

We only read in `titanic_train.csv`, which we'll treat as the entire dataset. The provided test set does not include the ground truth for the outcome, and hence is not useful for us.

In [2]:
# Read the Kaggle training file; it is treated as the complete dataset in this notebook.
df = pd.read_csv(filepath_or_buffer='../datasets/titanic/titanic_train.csv')
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(891, 12)

In [4]:
# Overall survival rate: the mean of the `Survived` column.
df['Survived'].mean()

0.3838383838383838

## Prep data

We loosely follow [this GitHub repo](https://github.com/sasuke96/LIME/tree/main) for pre-processing the data.

We will first create a dictionary in which we store the name of the target to predict, as well as the column names of the numerical and categorical features.

### null values

In [5]:
# Fraction of missing values in each column.
df.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [6]:
# Impute missing Age values (~20% of rows) with the column mean.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Forward-fill the few missing Embarked values: each gap takes the last valid value above it.
# The result is assigned back to the column instead of calling
# `df.Embarked.fillna(method='ffill', inplace=True)`: the `method` argument is deprecated,
# and an in-place call on an attribute-accessed column does not reliably update `df`
# when pandas copy-on-write is active.
# NOTE: a missing value in the very first row would stay missing after a forward fill.
df['Embarked'] = df['Embarked'].ffill()

# Cabin is ~77% null, so it is removed. PassengerId, Name and Ticket are dropped as well;
# none of them appears in the feature lists defined below.
df = df.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])

### define feature types

In [7]:
# Column roles: the prediction target plus the numerical and categorical feature names.
d = dict(
    target='Survived',
    numerical=['Age', 'SibSp', 'Parch', 'Fare'],
    categorical=['Sex', 'Embarked', 'Pclass'],
)

In [8]:
# Separate the target column (y) from the feature columns (X).
y = df[d['target']]
X = df.drop(columns=d['target'])

### Encoding categorical variables

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Positional index within X of every categorical column, in the order listed in d['categorical'].
categorical_features_idx = list(map(X.columns.get_loc, d['categorical']))
categorical_features_idx

[1, 6, 0]

In [11]:
# Label-encode every categorical column and remember, per column position,
# the original class labels (the position of a label in the array is its integer code).
categorical_names = {}
for feature in categorical_features_idx:
    column = X.columns[feature]
    le = LabelEncoder()
    le.fit(X[column].values)
    # Assign by column label so the whole column is replaced with the integer codes.
    # `X.iloc[:, feature] = ...` writes into the existing column in place on recent pandas,
    # which leaves string columns such as Sex/Embarked with `object` dtype.
    X[column] = le.transform(X[column])
    categorical_names[feature] = le.classes_

In [12]:
# Display the mapping: column position in X -> class labels learned by that column's encoder.
categorical_names

{1: array(['female', 'male'], dtype=object),
 6: array(['C', 'Q', 'S'], dtype=object),
 0: array([1, 2, 3])}

---
## Version 1: numerical & categorical variables, no one-hot encoding

### split into train and test data

In [13]:
# Stratified 80/20 train/test split with a fixed seed; each of the four pieces
# gets a fresh 0-based index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,0,29.699118,0,0,7.6292,1
1,2,1,29.699118,0,0,8.05,2
2,2,0,29.699118,0,0,7.75,1
3,0,0,51.0,1,0,77.9583,2
4,2,1,21.0,0,0,7.7333,1


### save .csv files

In [14]:
# Writing the Version 1 splits to disk is disabled; uncomment the lines below to export them.
# X_train.to_csv('../datasets/titanic/titanic_X_train.csv', index=False)
# y_train.to_csv('../datasets/titanic/titanic_y_train.csv', index=False)
# X_test.to_csv('../datasets/titanic/titanic_X_test.csv', index=False)
# y_test.to_csv('../datasets/titanic/titanic_y_test.csv', index=False)

---
## Version 2: numerical & one-hot encoded categorical variables

### One-hot encoding for categorical data 
Then, we will apply one-hot encoding to the categorical variables and split the data into train and test sets.

In [15]:
# Expand each categorical column into 0/1 indicator columns, dropping the first level of each.
X_ = pd.get_dummies(data=X, columns=d['categorical'], dtype=int, drop_first=True)
X_

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_1,Embarked_1,Embarked_2,Pclass_1,Pclass_2
0,22.000000,1,0,7.2500,1,0,1,0,1
1,38.000000,1,0,71.2833,0,0,0,0,0
2,26.000000,0,0,7.9250,0,0,1,0,1
3,35.000000,1,0,53.1000,0,0,1,0,0
4,35.000000,0,0,8.0500,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,27.000000,0,0,13.0000,1,0,1,1,0
887,19.000000,0,0,30.0000,0,0,1,0,0
888,29.699118,1,2,23.4500,0,0,1,0,1
889,26.000000,0,0,30.0000,1,0,0,0,0


### split into train and test data

In [16]:
# Stratified 80/20 train/test split of the one-hot encoded features with a fixed seed;
# each of the four pieces gets a fresh 0-based index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X_, y, test_size=0.2, random_state=0, stratify=y)
)

X_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_1,Embarked_1,Embarked_2,Pclass_1,Pclass_2
0,29.699118,0,0,7.6292,0,1,0,0,1
1,29.699118,0,0,8.05,1,0,1,0,1
2,29.699118,0,0,7.75,0,1,0,0,1
3,51.0,1,0,77.9583,0,0,1,0,0
4,21.0,0,0,7.7333,1,1,0,0,1


### save .csv files

In [17]:
# Writing the one-hot encoded feature splits to disk is disabled; uncomment to export them.
# Only the feature frames are written here, not the targets.
# X_train.to_csv('../datasets/titanic/encoded_titanic_X_train.csv', index=False)
# X_test.to_csv('../datasets/titanic/encoded_titanic_X_test.csv', index=False)

---
## Version 3: Only binary features

### create binary version of dataset

In [18]:
# Bucket every numerical column that has more than two distinct values into 5 bins
# (labelled 1-5); all remaining columns are carried over unchanged.
df_c = pd.DataFrame({
    column: (
        pd.cut(X[column], bins=5, labels=[1, 2, 3, 4, 5])
        if column in d['numerical'] and X[column].nunique(dropna=False) > 2
        else X[column]
    )
    for column in X.columns
})

# One-hot encode every column (first level dropped) so that all resulting features are 0/1.
df_binary = pd.get_dummies(df_c, columns=df_c.columns, dtype=int, drop_first=True)

### split into train and test data

In [19]:
# Stratified 80/20 train/test split of the binary features with a fixed seed;
# each of the four pieces gets a fresh 0-based index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df_binary, y, test_size=0.2, random_state=0, stratify=y)
)

X_train.head()

Unnamed: 0,Pclass_1,Pclass_2,Sex_1,Age_2,Age_3,Age_4,Age_5,SibSp_2,SibSp_3,SibSp_4,...,Parch_2,Parch_3,Parch_4,Parch_5,Fare_2,Fare_3,Fare_4,Fare_5,Embarked_1,Embarked_2
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### save .csv files

In [20]:
# Writing the binary feature splits to disk is disabled; uncomment to export them.
# Only the feature frames are written here, not the targets.
# X_train.to_csv('../datasets/titanic/bin_titanic_X_train.csv', index=False)
# X_test.to_csv('../datasets/titanic/bin_titanic_X_test.csv', index=False)