In [29]:
import pandas as pd
import random, time
import os
# Assemble the relative path piece by piece, then read the raw Titanic CSV into `data`.
titanic_csv_path = os.path.join('..', '..', 'data', "titanic.csv")
data = pd.read_csv(titanic_csv_path)

# Understanding the data

In [30]:
# Preview the first five passengers of the raw data.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,"Boulos, Mr. Hanna",male,,0.0,0.0,2664,7.225,,C
1,1.0,3.0,"de Mulder, Mr. Theodore",male,30.0,0.0,0.0,345774,9.5,,S
2,0.0,2.0,"Banfield, Mr. Frederick James",male,28.0,0.0,0.0,C.A./SOTON 34068,10.5,,S
3,0.0,2.0,"Eitemiller, Mr. George Floyd",male,23.0,0.0,0.0,29751,13.0,,S
4,1.0,1.0,"Longley, Miss. Gretchen Fiske",female,21.0,0.0,0.0,13502,77.9583,D9,S


In [31]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,0.381971,2.294882,29.881135,0.498854,0.385027,33.295479
std,0.486055,0.837836,14.4135,1.041658,0.86556,51.758668
min,0.0,1.0,0.1667,0.0,0.0,0.0
25%,0.0,2.0,21.0,0.0,0.0,7.8958
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,39.0,1.0,0.0,31.275
max,1.0,3.0,80.0,8.0,9.0,512.3292


In [32]:
# Print each column's dtype and non-null count; columns whose count is below the
# number of rows contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  1309 non-null   float64
 1   Pclass    1309 non-null   float64
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   float64
 6   Parch     1309 non-null   float64
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(6), object(5)
memory usage: 112.6+ KB


# Embarked

As the summary above shows, only 2 values are missing from the 'Embarked' column, so they are filled with the most common value (the mode).

In [33]:
# Show the passengers whose port of embarkation is unknown.
data.loc[data['Embarked'].isna()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
101,1.0,1.0,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0.0,0.0,113572,80.0,B28,
1121,1.0,1.0,"Icard, Miss. Amelie",female,38.0,0.0,0.0,113572,80.0,B28,


In [34]:
# Fill the missing ports of embarkation with the most frequent port (the mode).
# The filled column is assigned back to `data` rather than calling
# fillna(..., inplace=True) on the column selection: the in-place form acts on an
# intermediate Series, is deprecated by pandas, and leaves `data` unchanged once
# copy-on-write is active.
x = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(x)

# Age

The following scatter plot shows that higher values of 'SibSp' have a smaller 'Age' range. For instance, a person with 4 siblings on board the Titanic is more likely to be a child travelling with their family than an adult. As a result, each missing age is filled in by generating a random number within the range of known ages for that particular person's 'SibSp' value.

In [35]:
# Scatter plot of age (x axis) against number of siblings/spouses aboard (y axis).
p = data.plot.scatter(x='Age', y='SibSp')

In [36]:
# Number of passengers for each distinct 'SibSp' value, most frequent first.
sibsp_counts = data['SibSp'].value_counts()
sibsp_counts

0.0    891
1.0    319
2.0     42
4.0     22
3.0     20
8.0      9
5.0      6
Name: SibSp, dtype: int64

The minimum and maximum known ages for each value of 'SibSp' are found, and the missing 'Age' values in the training and testing data are computed from them.

In [37]:
# Observed [min, max] age for every 'SibSp' value, computed from the rows whose
# age is known. min() and max() are evaluated independently, so every age counts
# towards both bounds; a manual if/elif scan lets a row that lowers the minimum
# skip the maximum check and can therefore under-report the maximum.
# Building the table from the data also avoids a KeyError for any 'SibSp' value
# that a hard-coded key list does not anticipate.
known_ages = data.dropna(subset=['Age'])
ageRanges = {
    sibsp: [ages.min(), ages.max()]
    for sibsp, ages in known_ages.groupby('SibSp')['Age']
}

# SibSp == 8 gets a hand-picked range: from the youngest age in the whole data
# set up to the oldest age observed for SibSp == 5.
# NOTE(review): presumably no passenger with SibSp == 8 has a recorded age,
# so no range can be observed for that group -- confirm.
ageRanges[8] = [data['Age'].min(), ageRanges[5][1]]

# A dedicated generator with a fixed seed yields the same imputed ages on every
# "Restart & Run All"; seeding from the clock produced a different data set per run.
age_rng = random.Random(42)

# Replace each missing age with a random whole number drawn from the age range
# of the passenger's 'SibSp' group (bounds are truncated to integers).
for row in data.index[data['Age'].isna()]:
    youngest, oldest = ageRanges[data.loc[row, 'SibSp']]
    data.loc[row, 'Age'] = age_rng.randint(int(youngest), int(oldest))

# Cabin

Extracting the first character of the cabin value since this corresponds to the deck where the cabin was located and adding it as a new column in the dataframe.

In [38]:
def _deck_letter(cabin):
    """Return the first character of a cabin value, or None when the cabin is missing."""
    if pd.isnull(cabin):
        return None
    return cabin[0]


# Derive the new 'Deck' column from the leading character of each cabin value.
data['Deck'] = data['Cabin'].map(_deck_letter)

In [39]:
# Tally the passengers per deck letter and print the counts.
deck_counts = data['Deck'].value_counts()
print(deck_counts)

C    94
B    65
D    46
E    41
A    22
F    21
G     5
T     1
Name: Deck, dtype: int64


Replacing each letter with a corresponding number

In [40]:
# Encode the deck letters as numbers: A -> 0, B -> 1, ..., G -> 6, T -> 7.
# Series.map with a dict returns a numeric column, with NaN for passengers
# without a deck. replace() on an object column only becomes numeric through
# implicit downcasting, which newer pandas versions deprecate; without it the
# column stays object-typed and the later 'Deck_7.0' dummy column name no
# longer matches.
# Note: a letter missing from the mapping would also become NaN.
deck_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
data['Deck'] = data['Deck'].map(deck_codes)

The correlation between Deck and Pclass is about 0.6. The missing 'Deck' values will be computed similarly to how the 'Age' values were worked out. For instance, if a person's Pclass is 3, the Deck will be a random number between 4 and 6, as seen in the scatter plot below. For a Pclass value of 1 the random number will be between 0 and 4; the value 7 (which represents deck 'T' before the letters were changed to numbers) is deliberately excluded because it features only once in the dataset.

In [41]:
# Pairwise correlation of the numeric columns, rendered as a colour-graded table.
# The numeric columns are selected explicitly because the frame still contains
# text columns at this point: pandas >= 2.0 raises on them inside corr(), while
# older versions silently skipped them. Selecting first gives the same table on both.
corr = data.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Deck
Survived,1.0,-0.312469,-0.028136,-0.027825,0.08266,0.244265,0.019677
Pclass,-0.312469,1.0,-0.227817,0.060832,0.018322,-0.558629,0.610843
Age,-0.028136,-0.227817,1.0,-0.246342,-0.149874,0.079827,-0.210115
SibSp,-0.027825,0.060832,-0.246342,1.0,0.373587,0.160238,-0.009289
Parch,0.08266,0.018322,-0.149874,0.373587,1.0,0.221539,0.007602
Fare,0.244265,-0.558629,0.079827,0.160238,0.221539,1.0,-0.297525
Deck,0.019677,0.610843,-0.210115,-0.009289,0.007602,-0.297525,1.0


In [42]:
# Scatter plot of the numeric deck code (x axis) against passenger class (y axis).
p2 = data.plot.scatter(x='Deck', y='Pclass')

Setting the appropriate deck ranges for each class and generating the missing values randomly.

In [43]:
# Inclusive [lowest, highest] deck code that may be drawn for each passenger class.
# The 7 is not included in Pclass 1 because it only featured once in the whole dataset
deckRanges = {1: [0, 4], 2: [3, 5], 3: [4, 6]}

# A dedicated generator with a fixed seed yields the same imputed decks on every
# "Restart & Run All"; seeding from the clock produced a different data set per run.
deck_rng = random.Random(43)

# Replace each missing deck with a random deck code from the range of the
# passenger's class.
for row in data.index[data['Deck'].isna()]:
    lowest, highest = deckRanges[data.loc[row, 'Pclass']]
    data.loc[row, 'Deck'] = deck_rng.randint(lowest, highest)

# Name, Ticket and Cabin

The column 'Cabin' is no longer needed since the 'Deck' column was computed from it. The columns 'Name' and 'Ticket' are also dropped from the dataframe.

In [44]:
# Remove the columns that are not used as features from here on.
unused_columns = ['Name', 'Ticket', 'Cabin']
data = data.drop(columns=unused_columns)

# Fare

In [45]:
# Show the passengers whose fare is unknown.
data.loc[data['Fare'].isna()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
654,0.0,3.0,male,60.5,0.0,0.0,,S,4.0


There is only one missing 'Fare' value and this is found in the testing data. The median Fare of all the male passengers in Pclass 3 is used to fill the missing value. 

In [46]:
# Median fare of the male passengers in third class, looked up by label.
# Indexing the grouped result with [3][1] depended on "male" being the second
# entry of the 'Sex' level and on integer keys falling back to positions on a
# string-labelled index, which pandas deprecates.
m = data.groupby(['Pclass', 'Sex'])['Fare'].median().loc[(3, 'male')]

# Assign the filled column back to `data`; fillna(..., inplace=True) on the
# attribute selection acts on an intermediate Series and leaves `data` unchanged
# once copy-on-write is active.
data['Fare'] = data['Fare'].fillna(m)


# Non-Numerical Values

The categorical values 'Pclass', 'Embarked' and 'Deck' are converted into dummy indicator variables. 

In [47]:
# One-hot encode the categorical columns.
# dtype='uint8' pins the indicator columns to 0/1 integers: pandas >= 2.0
# defaults to booleans, which would write True/False into the saved CSV, whereas
# older versions already produced uint8.
data = pd.get_dummies(data, columns=['Pclass', 'Embarked', 'Deck'], dtype='uint8')

In [48]:
# Inspect the frame after one-hot encoding (pandas abbreviates the rendering of long frames).
data

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1.0,Pclass_2.0,Pclass_3.0,Embarked_C,Embarked_Q,Embarked_S,Deck_0.0,Deck_1.0,Deck_2.0,Deck_3.0,Deck_4.0,Deck_5.0,Deck_6.0,Deck_7.0
0,0.0,male,12.0,0.0,0.0,7.2250,0,0,1,1,0,0,0,0,0,0,1,0,0,0
1,1.0,male,30.0,0.0,0.0,9.5000,0,0,1,0,0,1,0,0,0,0,1,0,0,0
2,0.0,male,28.0,0.0,0.0,10.5000,0,1,0,0,0,1,0,0,0,0,1,0,0,0
3,0.0,male,23.0,0.0,0.0,13.0000,0,1,0,0,0,1,0,0,0,1,0,0,0,0
4,1.0,female,21.0,0.0,0.0,77.9583,1,0,0,0,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1.0,female,40.0,0.0,0.0,153.4625,1,0,0,0,0,1,0,0,1,0,0,0,0,0
1305,0.0,male,71.0,0.0,0.0,7.7250,0,0,1,0,1,0,0,0,0,0,1,0,0,0
1306,1.0,female,24.0,2.0,3.0,18.7500,0,1,0,0,0,1,0,0,0,1,0,0,0,0
1307,1.0,male,23.0,0.0,1.0,63.3583,1,0,0,1,0,0,0,0,0,1,0,0,0,0


Since only n-1 indicator columns are needed to encode a variable with n categories, 'Pclass_3.0', 'Embarked_S' and 'Deck_7.0' are dropped.

In [49]:
# Drop one indicator column per encoded variable.
reference_columns = ['Pclass_3.0', 'Embarked_S', 'Deck_7.0']
data = data.drop(columns=reference_columns)

In [50]:
# Preview the first five rows after removing the reference indicator columns.
data.head(n=5)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1.0,Pclass_2.0,Embarked_C,Embarked_Q,Deck_0.0,Deck_1.0,Deck_2.0,Deck_3.0,Deck_4.0,Deck_5.0,Deck_6.0
0,0.0,male,12.0,0.0,0.0,7.225,0,0,1,0,0,0,0,0,1,0,0
1,1.0,male,30.0,0.0,0.0,9.5,0,0,0,0,0,0,0,0,1,0,0
2,0.0,male,28.0,0.0,0.0,10.5,0,1,0,0,0,0,0,0,1,0,0
3,0.0,male,23.0,0.0,0.0,13.0,0,1,0,0,0,0,0,1,0,0,0
4,1.0,female,21.0,0.0,0.0,77.9583,1,0,0,0,0,0,0,1,0,0,0


Since the categorical column 'Sex' has only 2 values, these are mapped to zero or one. 

In [51]:
# Binary encoding of the 'Sex' column: male becomes 0 and female becomes 1.
sex_values = {"male": 0, "female": 1}
encoded_sex = data["Sex"].map(sex_values)
data = data.assign(Sex=encoded_sex)

In [52]:
# Preview the first five rows of the fully numeric frame.
data.head(n=5)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1.0,Pclass_2.0,Embarked_C,Embarked_Q,Deck_0.0,Deck_1.0,Deck_2.0,Deck_3.0,Deck_4.0,Deck_5.0,Deck_6.0
0,0.0,0,12.0,0.0,0.0,7.225,0,0,1,0,0,0,0,0,1,0,0
1,1.0,0,30.0,0.0,0.0,9.5,0,0,0,0,0,0,0,0,1,0,0
2,0.0,0,28.0,0.0,0.0,10.5,0,1,0,0,0,0,0,0,1,0,0
3,0.0,0,23.0,0.0,0.0,13.0,0,1,0,0,0,0,0,1,0,0,0
4,1.0,1,21.0,0.0,0.0,77.9583,1,0,0,0,0,0,0,1,0,0,0


# Saving the dataframe

In [53]:
# Write the cleaned frame, without the index column, into the 'Variant 3' data folder.
# The folder is created first if needed, because to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = os.path.join('..', '..', "data", 'Variant 3')
os.makedirs(output_dir, exist_ok=True)
data.to_csv(os.path.join(output_dir, "titanic_cleaned.csv"), index=False)