### Data Processing of the Titanic Dataset

#### Imports

In [1]:
import pandas as pd
import numpy as np

#### Data Import

In [2]:
# Load the preprocessed Titanic table; later cells only read from `df`.
df = pd.read_csv('TitanicPreprocessed.csv')
# Independent working copy (deep copy is the pandas default), displayed for a first look.
df_pruned = df.copy(deep=True)
df_pruned

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,...,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX,FamilySize,Singleton,SmallFamily,LargeFamily,Survived
0,1,22.0,1,0,7.2500,0,0,1,0,0,...,0,0,0,0,0,2,0,1,0,0
1,0,38.0,1,0,71.2833,0,0,0,1,0,...,0,0,0,0,0,2,0,1,0,1
2,0,26.0,0,0,7.9250,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,35.0,1,0,53.1000,0,0,0,1,0,...,0,0,0,0,1,2,0,1,0,1
4,1,35.0,0,0,8.0500,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,27.0,0,0,13.0000,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
887,0,19.0,0,0,30.0000,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,1
888,0,18.0,1,2,23.4500,0,1,0,0,0,...,0,0,1,0,0,4,0,1,0,0
889,1,26.0,0,0,30.0000,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1


#### Data Processing

In [3]:
# Step 1: discard every column whose name starts with 'Title'.
titles = df.columns.str.startswith('Title')
without_titles = df.loc[:, ~titles]

# Step 2: keep only the columns whose absolute correlation with 'Survived'
# is at least 0.2 ('Survived' itself always qualifies with correlation 1).
survival_corr = without_titles.corr()['Survived'].abs()
strong_enough = survival_corr >= .2
df_pruned = without_titles.loc[:, strong_enough].copy()

# Step 3: 'Pclass_2' is (re)assigned from the source frame unconditionally,
# independent of the correlation threshold above.
df_pruned.loc[:, 'Pclass_2'] = df['Pclass_2'].to_numpy()
df_pruned

Unnamed: 0,Sex,Fare,Cabin_U,Pclass_1,Pclass_3,SmallFamily,Survived,Pclass_2
0,1,7.2500,1,0,1,1,0,0
1,0,71.2833,0,1,0,1,1,0
2,0,7.9250,1,0,1,0,1,0
3,0,53.1000,0,1,0,1,1,0
4,1,8.0500,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
886,1,13.0000,1,0,0,0,0,1
887,0,30.0000,0,1,0,0,1,0
888,0,23.4500,1,0,1,1,0,0
889,1,30.0000,0,1,0,0,1,0


In [4]:
# Replace the continuous Age and Fare values with coarse integer-valued bins.
# Both bins are computed from the raw columns of `df`, so any earlier scaling
# of df_adjusted['Fare'] would be overwritten and is therefore not performed.
AGE_BIN_WIDTH = 10   # one bin per 10 units of Age
FARE_BIN_WIDTH = 20  # one bin per 20 units of Fare

df_adjusted = df_pruned.copy()
# Floor division keeps the float dtype; 'Age' is appended as a new column,
# 'Fare' is replaced in its existing position.
df_adjusted['Age'] = df['Age'] // AGE_BIN_WIDTH
df_adjusted['Fare'] = df['Fare'] // FARE_BIN_WIDTH
df_adjusted.sort_values('Fare')

Unnamed: 0,Sex,Fare,Cabin_U,Pclass_1,Pclass_3,SmallFamily,Survived,Pclass_2,Age
39,0,0.0,1,0,1,1,1,0,1.0
865,0,0.0,1,0,0,0,1,1,4.0
866,0,0.0,1,0,0,1,1,1,2.0
868,1,0.0,1,0,1,0,0,0,2.0
869,1,0.0,1,0,1,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...
341,0,13.0,0,1,0,0,1,0,2.0
438,1,13.0,0,1,0,0,0,0,6.0
737,1,25.0,0,1,0,0,1,0,3.0
258,0,25.0,1,1,0,0,1,0,3.0


In [5]:
# Collapse the one-hot passenger-class and cabin indicators into single
# integer columns. The indicators are read from the untouched `df` and the
# one-hot columns are removed without in-place mutation, so this cell can be
# re-run and does not depend on which indicator columns survived pruning.

# Class: Pclass_1 -> 3, Pclass_2 -> 2, Pclass_3 -> 1 (weight is 4 - i).
class_columns = ['Pclass_' + str(i) for i in range(1, 4)]
df_adjusted['Class'] = sum(df['Pclass_' + str(i)] * (4 - i) for i in range(1, 4))
df_adjusted = df_adjusted.drop(columns=class_columns, errors='ignore')

# Cabin: position of the cabin letter in 'ABCDEFGTU' (A -> 0, ..., U -> 8).
df_adjusted['Cabin'] = sum(df['Cabin_' + c] * i for i, c in enumerate('ABCDEFGTU'))
df_adjusted = df_adjusted.drop(columns=['Cabin_U'], errors='ignore')
df_adjusted

Unnamed: 0,Sex,Fare,SmallFamily,Survived,Age,Class,Cabin
0,1,0.0,1,0,2.0,1,8
1,0,3.0,1,1,3.0,3,2
2,0,0.0,0,1,2.0,1,8
3,0,2.0,1,1,3.0,3,2
4,1,0.0,0,0,3.0,1,8
...,...,...,...,...,...,...,...
886,1,0.0,0,0,2.0,2,8
887,0,1.0,0,1,1.0,3,1
888,0,1.0,1,0,1.0,1,8
889,1,1.0,0,1,2.0,3,2


In [6]:
# Encode the one-hot 'Ticket*' indicator columns as a single integer column 'NT'
# and attach it, together with 'FamilySize', to df_adjusted.
tickets = df.columns.str.startswith('Ticket')
ticket_columns = df.columns[tickets]

# Each indicator contributes (value * column position) % 20, i.e. for 0/1
# indicators the code is the column's position modulo 20.
# NOTE(review): positions that differ by a multiple of 20 share a code, and
# position 0 is indistinguishable from "no ticket column set" - confirm intended.
ticket_code = pd.Series(0, index=df.index)
for i, c in enumerate(ticket_columns):
    ticket_code = ticket_code + (df[c] * i) % 20
df_tickets = ticket_code.rename('NT')

# Assign by column name instead of concatenating, so re-running the cell
# overwrites 'NT' rather than appending a duplicate column.
df_adjusted['FamilySize'] = df['FamilySize']
df_adjusted['NT'] = df_tickets
df_adjusted

Unnamed: 0,Sex,Fare,SmallFamily,Survived,Age,Class,Cabin,FamilySize,NT
0,1,0.0,1,0,2.0,1,8,2,2
1,0,3.0,1,1,3.0,3,2,2,14
2,0,0.0,0,1,2.0,1,8,1,11
3,0,2.0,1,1,3.0,3,2,2,16
4,1,0.0,0,0,3.0,1,8,1,16
...,...,...,...,...,...,...,...,...,...
886,1,0.0,0,0,2.0,2,8,1,16
887,0,1.0,0,1,1.0,3,1,1,16
888,0,1.0,1,0,1.0,1,8,4,14
889,1,1.0,0,1,2.0,3,2,1,16


#### Data Export

In [7]:
# Order the columns alphabetically by name, then write the table without the row index.
export_frame = df_adjusted.sort_index(axis=1)
export_frame.to_csv('titanicfinal.csv', index=False)