In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

### Initial Data Exploration

In [2]:
# load data set
df = pd.read_csv("titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
df.shape

(891, 12)

In [7]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
df.drop_duplicates()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Preparing Data for Predictions
Since this will be a Two-Class Classification problem on a data set with a small amount of features, I will be using a two-class support vector machine.

#### PassengerId

In [9]:
# The PassengerId column very likely has no bearing on whether or not a specific passenger survived, so it will be removed
df.drop(columns=["PassengerId"], inplace=True)
df.head() 

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Pclass

In [10]:
df["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [11]:
# The actual numerical value of this column does not make sense, it can be seen more as a categorical column than it could a numerical column. It will be separated using one-hot encoding after the data is completely cleaned
# df = pd.get_dummies(df, columns=["Pclass"])
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Name

In [12]:
# The only potential valuable information to extract from each passenger's name would be their title (Mr., Mrs., etc.) Let's explore the data to see if it would be feasible to extract this
df["Title"] = df["Name"].map(lambda x: x.split(", ")[1].split(" ")[0])
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.


In [13]:
df["Title"].value_counts()

Title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
the            1
Capt.          1
Ms.            1
Sir.           1
Lady.          1
Mme.           1
Don.           1
Jonkheer.      1
Name: count, dtype: int64

In [14]:
# For ease of use. remove the rows where the the title is not Mr., Miss., Mrs., or Master.
df = df[df["Title"].isin(df["Title"].unique()[:4])]
df["Title"].value_counts()

Title
Mr.        517
Miss.      182
Mrs.       125
Master.     40
Name: count, dtype: int64

In [15]:
# The name column can now be dropped
df.drop(columns=["Name"], inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
3,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs.
4,0,3,male,35.0,0,0,373450,8.05,,S,Mr.


#### Sex

In [16]:
df["Sex"].value_counts()

Sex
male      557
female    307
Name: count, dtype: int64

In [17]:
# This column is already binary, I will change it so that it will be 1 if the passenger is Male and 0 if the passenger is female
df["Sex_M"] = df["Sex"].map(lambda x: 1 if x == "male" else 0)
df.drop(columns=["Sex"], inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Sex_M
0,0,3,22.0,1,0,A/5 21171,7.25,,S,Mr.,1
1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,0
2,1,3,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,0
3,1,1,35.0,1,0,113803,53.1,C123,S,Mrs.,0
4,0,3,35.0,0,0,373450,8.05,,S,Mr.,1


#### Age

In [18]:
df["Age"].isnull().sum()

176

In [19]:
# There are 85 null values in this row, as we saw earlier. Instead of outright removing these values, we will replace the null values with the average age of people who share the same title as them
df_grouped_titles = df[~df["Age"].isnull()].groupby(["Title"])["Age"].mean()
average_ages = df_grouped_titles.to_dict()
average_ages

{'Master.': 4.574166666666667,
 'Miss.': 21.773972602739725,
 'Mr.': 32.368090452261306,
 'Mrs.': 35.898148148148145}

In [20]:
df = df.set_index("Title").fillna(value=average_ages).reset_index().head()

In [21]:
df["Age"].isnull().sum()

0

In [22]:
df.head()

Unnamed: 0,Title,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_M
0,Mr.,0,3,22.0,1,0,A/5 21171,7.25,,S,1
1,Mrs.,1,1,38.0,1,0,PC 17599,71.2833,C85,C,0
2,Miss.,1,3,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,Mrs.,1,1,35.0,1,0,113803,53.1,C123,S,0
4,Mr.,0,3,35.0,0,0,373450,8.05,,S,1


#### SibSp, Parch, and Fare are fine and will be left alone

#### Ticket Number is arbitrary and will be removed

In [23]:
df.drop(columns=["Ticket"], inplace=True)
df.head()

Unnamed: 0,Title,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Sex_M
0,Mr.,0,3,22.0,1,0,7.25,,S,1
1,Mrs.,1,1,38.0,1,0,71.2833,C85,C,0
2,Miss.,1,3,26.0,0,0,7.925,,S,0
3,Mrs.,1,1,35.0,1,0,53.1,C123,S,0
4,Mr.,0,3,35.0,0,0,8.05,,S,1


#### Cabin

In [24]:
df["Cabin"].nunique()

2

In [25]:
# all cabin values are null, so the entire column will be removed
df.drop(columns=["Cabin"], inplace=True)
df.head()

Unnamed: 0,Title,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Sex_M
0,Mr.,0,3,22.0,1,0,7.25,S,1
1,Mrs.,1,1,38.0,1,0,71.2833,C,0
2,Miss.,1,3,26.0,0,0,7.925,S,0
3,Mrs.,1,1,35.0,1,0,53.1,S,0
4,Mr.,0,3,35.0,0,0,8.05,S,1


## The entire dataframe should be ready for our model to be trained once we encode all of our categorical columns

In [26]:
df = pd.get_dummies(df, columns=["Title","Pclass", "Embarked"])
df["Embarked_Q"] = False
df["Pclass_2"] = False
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_M,Title_Miss.,Title_Mr.,Title_Mrs.,Pclass_1,Pclass_3,Embarked_C,Embarked_S,Embarked_Q,Pclass_2
0,0,22.0,1,0,7.25,1,False,True,False,False,True,False,True,False,False
1,1,38.0,1,0,71.2833,0,False,False,True,True,False,True,False,False,False
2,1,26.0,0,0,7.925,0,True,False,False,False,True,False,True,False,False
3,1,35.0,1,0,53.1,0,False,False,True,True,False,False,True,False,False
4,0,35.0,0,0,8.05,1,False,True,False,False,True,False,True,False,False


## Creating Model

In [27]:
X = df.drop(columns=["Survived"])
y = df["Survived"]

model = svm.SVC()
model.fit(X, y)

## Loading and Preparing test data

In [28]:
test_df = pd.read_csv("titanic/test.csv")

test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [29]:
test_df["Title"] = test_df["Name"].map(lambda x: x.split(", ")[1].split(" ")[0])
test_df = test_df[test_df["Title"].isin(["Mrs.", "Mr.", "Miss.", "Master."])]

test_df["Sex_M"] = test_df["Sex"].map(lambda x: 1 if x == "male" else 0)
test_df.drop(columns=["Sex"], inplace=True)

df_grouped_titles = test_df[~test_df["Age"].isnull()].groupby(["Title"])["Age"].mean()
average_ages = df_grouped_titles.to_dict()
test_df = test_df.set_index("Title").fillna(value=average_ages).reset_index().head()

test_passengers = test_df["PassengerId"]

test_df.drop(columns=[ "PassengerId", "Name", "Cabin", "Ticket"], inplace=True)
test_df = pd.get_dummies(test_df, columns=["Title", "Pclass", "Embarked"])

# Embarked_C, Pclass_1, and Title_Miss. are all missing
test_df["Embarked_C"] = False
test_df["Pclass_1"] = False
test_df["Title_Miss."] = False

test_df.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_M,Title_Mr.,Title_Mrs.,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Embarked_C,Pclass_1,Title_Miss.
0,34.5,0,0,7.8292,1,True,False,False,True,True,False,False,False,False
1,47.0,1,0,7.0,0,False,True,False,True,False,True,False,False,False
2,62.0,0,0,9.6875,1,True,False,True,False,True,False,False,False,False
3,27.0,0,0,8.6625,1,True,False,False,True,False,True,False,False,False
4,22.0,1,1,12.2875,0,False,True,False,True,False,True,False,False,False


In [30]:
df_columns = df.columns
test_df_columns = test_df.columns

# Reorder columns
test_df = test_df[X.columns]

print(df_columns, test_df_columns)

Index(['Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_M', 'Title_Miss.',
       'Title_Mr.', 'Title_Mrs.', 'Pclass_1', 'Pclass_3', 'Embarked_C',
       'Embarked_S', 'Embarked_Q', 'Pclass_2'],
      dtype='object') Index(['Age', 'SibSp', 'Parch', 'Fare', 'Sex_M', 'Title_Mr.', 'Title_Mrs.',
       'Pclass_2', 'Pclass_3', 'Embarked_Q', 'Embarked_S', 'Embarked_C',
       'Pclass_1', 'Title_Miss.'],
      dtype='object')


## Using model on test data

In [31]:
predictions = model.predict(test_df)
predictions

array([0, 0, 1, 0, 0], dtype=int64)

In [32]:
submission = pd.DataFrame({
    "PassengerId": test_passengers,
    "Survived": predictions
})

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,0


In [33]:
submission.to_csv("output/submission.csv")