# Titanic Survivors
####  Predicting how many people survived the titanic catastrophe

In [154]:
import opendatasets as od
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import os
import plotly.express as px
%matplotlib inline

pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 200)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [13]:
od.download('https://www.kaggle.com/c/titanic', force=True)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: dikshantkalotra
Your Kaggle Key: ········


100%|██████████████████████████████████████████████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 306kB/s]

Downloading titanic.zip to .\titanic

Extracting archive .\titanic/titanic.zip to .\titanic





In [14]:
os.listdir('titanic')

['gender_submission.csv', 'test.csv', 'train.csv']

In [155]:
train_df = pd.read_csv('titanic/train.csv')
df = train_df.pop('Survived')
train_df['Survived'] = df

In [156]:
train_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


## Column Description

|Columns | Definition|	Key|
|---------|-----------|----|
|survival	|Survival	|0 = No, 1 = Yes|
|pclass	| Ticket class	|1 = 1st, 2 = 2nd, 3 = 3rd|
|sex	|Sex	|
|Age	|Age in years|	
|sibsp	|# of siblings / spouses aboard the Titanic|	
|parch	|# of parents / children aboard the Titanic|	
|ticket	|Ticket number|	
|fare	|Passenger fare|	
|cabin	|Cabin number|	
|embarked	|Port of Embarkation	|C = Cherbourg, Q = Queenstown, S = Southampton|

In [157]:
train_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208,0.383838
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429,0.486592
min,1.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104,0.0
50%,446.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [158]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
 11  Survived     891 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [159]:
train_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64

In [160]:
train_df.nunique()

PassengerId    891
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
Survived         2
dtype: int64

## Data Visualization

In [131]:
px.scatter(train_df,
           x='Ticket',
           y='Fare')

In [122]:
correlated = train_df.corr()
px.imshow(correlated)

In [109]:
fig = px.histogram(train_df, 
                   x='Embarked', 
                   color='Sex', 
                   nbins=47, 
                   marginal='box')
fig.show()

In [118]:
px.scatter(train_df[train_df.Embarked.notnull()],
           x='Age', 
           y='Fare', 
           color='Embarked', 
           marginal_x='box', 
           marginal_y='violin')

## Input and Target columns

In [391]:
input_cols = list(train_df.columns)[1:-1]
target_cols = 'Survived'

In [392]:
inputs_df = train_df[input_cols].copy()
targets = train_df[target_cols]

In [393]:
inputs_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [394]:
inputs_df = inputs_df.drop(columns=['Name','Ticket', 'Cabin'])

In [395]:
inputs_df.Sex = inputs_df.Sex.map({'male' : 1, 'female' : 0})
inputs_df['Age'] = inputs_df['Age'].groupby([inputs_df['Pclass'], inputs_df['Sex']]).apply(lambda x: x.fillna(x.mean()))

In [396]:
inputs_df = inputs_df.fillna(inputs_df.mode().iloc[0])

In [397]:
inputs_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.00,1,0,7.2500,S
1,1,0,38.00,1,0,71.2833,C
2,3,0,26.00,0,0,7.9250,S
3,1,0,35.00,1,0,53.1000,S
4,3,1,35.00,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,1,27.00,0,0,13.0000,S
887,1,0,19.00,0,0,30.0000,S
888,3,0,21.75,1,2,23.4500,S
889,1,1,26.00,0,0,30.0000,C


In [398]:
inputs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [399]:
numerical_cols = inputs_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = inputs_df.select_dtypes(['object']).columns.tolist()

In [400]:
print(numerical_cols, categorical_cols)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] ['Embarked']


## Manipulating the input dataframe

In [216]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [253]:
scaler = MinMaxScaler(feature_range=(0,1)).fit(inputs_df[numerical_cols])
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(inputs_df[categorical_cols])

In [254]:
inputs_df[numerical_cols] = scaler.transform(inputs_df[numerical_cols])

In [255]:
encoded_cols = list(encoder.get_feature_names(categorical_cols))
inputs_df[encoded_cols] = encoder.transform(inputs_df[categorical_cols])

In [256]:
inputs_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1.0,1.0,0.271174,0.125,0.000000,0.014151,S,0.0,0.0,1.0
1,0.0,0.0,0.472229,0.125,0.000000,0.139136,C,1.0,0.0,0.0
2,1.0,0.0,0.321438,0.000,0.000000,0.015469,S,0.0,0.0,1.0
3,0.0,0.0,0.434531,0.125,0.000000,0.103644,S,0.0,0.0,1.0
4,1.0,1.0,0.434531,0.000,0.000000,0.015713,S,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0.5,1.0,0.334004,0.000,0.000000,0.025374,S,0.0,0.0,1.0
887,0.0,0.0,0.233476,0.000,0.000000,0.058556,S,0.0,0.0,1.0
888,1.0,0.0,0.268032,0.125,0.333333,0.045771,S,0.0,0.0,1.0
889,0.0,1.0,0.321438,0.000,0.000000,0.058556,C,1.0,0.0,0.0


In [257]:
train_inputs, train_targets = inputs_df[numerical_cols + encoded_cols], targets

## Training Model

In [297]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score

In [370]:
model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=200, max_depth=35, max_leaf_nodes=1000)

In [371]:
model.fit(train_inputs, train_targets)

RandomForestClassifier(max_depth=35, max_leaf_nodes=1000, n_estimators=200,
                       n_jobs=-1, random_state=42)

In [372]:
train_preds = model.predict(train_inputs)

In [373]:
accuracy_score(train_preds, train_targets)

0.9820426487093153

In [374]:
confusion_matrix(list(train_targets), list(train_preds))

array([[544,   5],
       [ 11, 331]], dtype=int64)

## Predicting the outputs

In [458]:
test_df = pd.read_csv('titanic/test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [459]:
new_inputs_df = test_df.drop(columns=['Name','Cabin','Ticket'])

In [460]:
new_inputs_df.Sex = new_inputs_df.Sex.map({'male' : 1, 'female' : 0})
new_inputs_df['Age'] = new_inputs_df['Age'].groupby([inputs_df['Pclass'], new_inputs_df['Sex']]).apply(lambda x: x.fillna(x.mean()))
new_inputs_df['Fare'] = new_inputs_df['Fare'].groupby([inputs_df['Pclass'], new_inputs_df['Sex']]).apply(lambda x: x.fillna(x.mean()))

In [461]:
new_inputs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(1)
memory usage: 26.2+ KB


In [462]:
test_inputs = new_inputs_df[numerical_cols + categorical_cols].copy()

In [463]:
print(numerical_cols, categorical_cols)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] ['Embarked']


In [464]:
new_scaler = MinMaxScaler(feature_range=(0,1)).fit(test_inputs[numerical_cols])
new_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(test_inputs[categorical_cols])

In [465]:
test_inputs[numerical_cols] = new_scaler.transform(test_inputs[numerical_cols])
test_inputs[encoded_cols] = new_encoder.transform(test_inputs[categorical_cols])

In [468]:
test_inputs = test_inputs[numerical_cols + encoded_cols]

In [470]:
test_inputs

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1.0,1.0,0.452723,0.000,0.000000,0.015282,0.0,1.0,0.0
1,1.0,0.0,0.617566,0.125,0.000000,0.013663,0.0,0.0,1.0
2,0.5,1.0,0.815377,0.000,0.000000,0.018909,0.0,1.0,0.0
3,1.0,1.0,0.353818,0.000,0.000000,0.016908,0.0,0.0,1.0
4,1.0,0.0,0.287881,0.125,0.111111,0.023984,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
413,1.0,1.0,0.436689,0.000,0.000000,0.015713,0.0,0.0,1.0
414,0.0,0.0,0.512066,0.000,0.000000,0.212559,1.0,0.0,0.0
415,1.0,1.0,0.505473,0.000,0.000000,0.014151,0.0,0.0,1.0
416,1.0,1.0,0.436689,0.000,0.000000,0.015713,0.0,0.0,1.0


In [471]:
test_preds = model.predict(test_inputs)

In [474]:
submission_df = pd.DataFrame(test_df['PassengerId'])
submission_df['Survived'] = pd.Series(test_preds)

In [475]:
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [476]:
gender_submission_df = pd.read_csv('titanic/gender_submission.csv')

In [477]:
gender_submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [485]:
comparison = pd.DataFrame(np.where(gender_submission_df['Survived']==submission_df['Survived'], True, False))
comparison

Unnamed: 0,0
0,True
1,False
2,True
3,True
4,True
...,...
413,True
414,True
415,True
416,True


In [492]:
comparison.value_counts()
print((333/333+85),'% Correct guesses')

86.0 % Correct guesses


In [493]:
submission_df.to_csv('titanic/my_submission_df.csv', index=None)

In [494]:
os.listdir('titanic')

['gender_submission.csv', 'my_submission_df.csv', 'test.csv', 'train.csv']

# ----------------------------------------#################------------------------------------------