In [92]:
import scipy as sc
import pandas as pd
import numpy as np

# Loading the data

In [93]:
# Read the passenger table from the CSV file (path is relative to the working
# directory) and display it as the cell output.
dataset = pd.read_csv(filepath_or_buffer="Titanic-Dataset.csv")
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Fill in the missing values

## Embarked
Because most values in the `Embarked` column are `S`, it is safe to fill the 2 missing values with `S` as well. Additionally, based on the fare paid, we may assume that these passengers boarded in England.

In [94]:
# Replace missing embarkation ports with 'S'; the per-column mapping leaves
# every other column untouched.
dataset = dataset.fillna({"Embarked": 'S'})
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Age

In [95]:
# Rows without an Age are removed rather than imputed: keep only the rows
# whose Age is present. The original index labels are preserved.
has_age = dataset["Age"].notna()
dataset = dataset.loc[has_age]
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Cabin

In [96]:
# Remove the Cabin column entirely; all other columns and rows are kept.
dataset = dataset.drop(columns=["Cabin"])
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


# One-Hot encoding

In [97]:
# Expand each listed categorical column into one indicator column per
# distinct value (named <column>_<value>); the source columns are replaced.
dataset = pd.get_dummies(
    data=dataset,
    columns=["Pclass", "Embarked", "Sex"],
)
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,True,False,False,True,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,False,True,False,False,True,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,True,False,False,True,True,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,False,False,False,True,True,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,True,False,True,False,True,False
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,False,False,False,True,False,True
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,False,False,False,True,True,False
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,False,True,False,False,False,True


# Feature engineering

## Family size

In [98]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themselves). Plain column arithmetic yields the same values as a row-wise
# apply, without invoking a Python lambda for every row.
dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,FamilySize
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,True,False,False,True,False,True,2
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,False,True,False,False,True,False,2
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,True,False,False,True,True,False,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,False,False,False,True,True,False,2
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,True,False,False,True,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,True,False,True,False,True,False,6
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,False,False,False,True,False,True,1
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,False,False,False,True,True,False,1
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,False,True,False,False,False,True,1


### Is single ?

In [99]:
# Boolean flag: True when FamilySize is exactly 1. A vectorized comparison
# on the column replaces the per-row lambda and yields the same booleans.
dataset["isSingle"] = dataset["FamilySize"].eq(1)
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,FamilySize,isSingle
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,True,False,False,True,False,True,2,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,False,True,False,False,True,False,2,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,True,False,False,True,True,False,1,True
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,False,False,False,True,True,False,2,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,True,False,False,True,False,True,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,True,False,True,False,True,False,6,False
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,False,False,False,True,False,True,1,True
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,False,False,False,True,True,False,1,True
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,False,True,False,False,False,True,1,True


## Title extraction

In [100]:
# Extract the title that sits between the comma after the surname and the
# first period, keeping the period (e.g. "Braund, Mr. Owen Harris" -> "Mr.").
# Matching up to the period instead of cutting at the first space keeps
# multi-word titles intact ("the Countess." rather than "the"), and a name
# that does not follow the "Surname, Title. ..." pattern yields NaN instead
# of raising IndexError.
dataset["officialTitle"] = dataset["Name"].str.extract(r",\s*([^.]+\.)", expand=False)
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,FamilySize,isSingle,officialTitle
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,True,False,False,True,False,True,2,False,Mr.
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,False,True,False,False,True,False,2,False,Mrs.
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,True,False,False,True,True,False,1,True,Miss.
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,False,False,False,True,True,False,2,False,Mrs.
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,True,False,False,True,False,True,1,True,Mr.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,True,False,True,False,True,False,6,False,Mrs.
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,False,False,False,True,False,True,1,True,Rev.
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,False,False,False,True,True,False,1,True,Miss.
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,False,True,False,False,False,True,1,True,Mr.


## Age groups

Age groups, judged by the standards of the 1910s:
* Child 0-21
* Adult 22-60
* Senior 61+

In [101]:
# Bucket Age into three labels, then one-hot encode the label column.
# np.select evaluates the conditions in order and takes the first match, so
# the second condition only applies to ages that are not below 22:
#   Age < 22        -> "Child"
#   22 <= Age < 61  -> "Adult"
#   otherwise       -> "Senior"
# This is the vectorized equivalent of the row-wise conditional expression
# and produces the same labels.
age = dataset["Age"]
dataset["ageGroup"] = np.select(
    [age < 22, age < 61],
    ["Child", "Adult"],
    default="Senior",
)

dataset = pd.get_dummies(dataset, columns=['ageGroup'])
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,...,Embarked_Q,Embarked_S,Sex_female,Sex_male,FamilySize,isSingle,officialTitle,ageGroup_Adult,ageGroup_Child,ageGroup_Senior
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,...,False,True,False,True,2,False,Mr.,True,False,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,...,False,False,True,False,2,False,Mrs.,True,False,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,...,False,True,True,False,1,True,Miss.,True,False,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,...,False,True,True,False,2,False,Mrs.,True,False,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,...,False,True,False,True,1,True,Mr.,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,...,True,False,True,False,6,False,Mrs.,True,False,False
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,...,False,True,False,True,1,True,Rev.,True,False,False
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,...,False,True,True,False,1,True,Miss.,False,True,False
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,...,False,False,False,True,1,True,Mr.,True,False,False


## Fare bins
* Cheap: fare < 17.5
* Medium: 17.5 <= fare < 52.5
* Expensive: fare >= 52.5

In [102]:
# Bucket Fare into three labels, then one-hot encode the label column.
# np.select evaluates the conditions in order and takes the first match:
#   Fare < 17.5           -> "Cheap"
#   17.5 <= Fare < 52.5   -> "Medium"
#   otherwise             -> "Expensive"
# The previous Medium condition carried a stray lower bound of 21, which
# pushed fares from 17.5 up to 21 into "Expensive"; the bins are now
# contiguous.
fare = dataset["Fare"]
dataset["fareBin"] = np.select(
    [fare < 17.5, fare < 52.5],
    ["Cheap", "Medium"],
    default="Expensive",
)

dataset = pd.get_dummies(dataset, columns=['fareBin'])
dataset

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,...,Sex_male,FamilySize,isSingle,officialTitle,ageGroup_Adult,ageGroup_Child,ageGroup_Senior,fareBin_Cheap,fareBin_Expensive,fareBin_Medium
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,False,...,True,2,False,Mr.,True,False,False,True,False,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,...,False,2,False,Mrs.,True,False,False,False,True,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,False,False,...,False,1,True,Miss.,True,False,False,True,False,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,...,False,2,False,Mrs.,True,False,False,False,True,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,False,...,True,1,True,Mr.,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,"Rice, Mrs. William (Margaret Norton)",39.0,0,5,382652,29.1250,False,False,...,False,6,False,Mrs.,True,False,False,False,False,True
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,...,True,1,True,Rev.,True,False,False,True,False,False
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,...,False,1,True,Miss.,False,True,False,False,False,True
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,True,False,...,True,1,True,Mr.,True,False,False,False,False,True
