In [1]:
#Data Analysis
import pandas as pd
import numpy as np
import random as rnd

#Data Visualisation
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Importing the data using pandas
train_data=pd.read_csv("train.csv")
test_data=pd.read_csv("test.csv")
df=[train_data,test_data]

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Check for null values in the training data
print('Train columns with null values:\n', train_data.isnull().sum())

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
#Check for null values in the test data
print('Test/Validation columns with null values:\n', test_data.isnull().sum())

Test/Validation columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [6]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
#Test to see how well each variable correlate with Survival.
#So that we can match this correlation with modelled correction later into the project.
train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [8]:
train_data[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [9]:
train_data[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [10]:
train_data[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


In [11]:
#Convert categorical features to numerical values as it is required by most machine learning models.
for dataset in df:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [12]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [13]:
#Check for the most frequent Embarked so that we can replace the Null value with it
most_freq=train_data["Embarked"].dropna().mode()[0]

In [14]:
most_freq

'S'

In [15]:
for dataset in df:
    dataset["Embarked"]=dataset["Embarked"].fillna(most_freq)

In [16]:
train_data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [17]:
#Convert categorical features to numerical values as it is required by most machine learning models.
for dataset in df:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, "Q":2} ).astype(int)

In [18]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [19]:
#We will drop off Cabin feature as it is highly incomplete and contain many null values
#Ticket feature will also be dropped as it contains high duplicate and does not really contribute to Survival.
#Dropping the features from both train and test dataset to stay consistent.
train_data.drop(['Cabin'], axis = 1,inplace=True)

In [20]:
test_data.drop(['Cabin'], axis = 1,inplace=True)

In [21]:
train_data.drop(['Ticket'], axis = 1,inplace=True)
test_data.drop(['Ticket'], axis = 1,inplace=True)

In [22]:
df=[train_data,test_data]

In [23]:
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0
5,6,0,3,"Moran, Mr. James",0,,0,0,8.4583,2
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,51.8625,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,21.075,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,11.1333,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,30.0708,1


In [24]:
#Extract the passenger Title using regular expression from the name feature
for dataset in df:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [25]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,Mr


In [26]:
train_data.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [27]:
pd.crosstab(train_data['Title'], train_data['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,182


In [28]:
for dataset in df:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
    'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [29]:
#Test the correlation between Title and Survival
train_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [30]:
#Convert categorical features to numerical values.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in df:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,1


In [31]:
#Since we now have the title feature, we can now drop name and passenger ID from both the train and test dataset
train_data = train_data.drop(['Name', 'PassengerId'], axis=1)
test_data = test_data.drop(['Name'], axis=1)
df=[train_data,test_data]


In [32]:
train_data.shape, test_data.shape

((891, 9), (418, 9))

In [33]:
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,0,1
1,1,1,1,38.0,1,0,71.2833,1,3
2,1,3,1,26.0,0,0,7.925,0,2
3,1,1,1,35.0,1,0,53.1,0,3
4,0,3,0,35.0,0,0,8.05,0,1
5,0,3,0,,0,0,8.4583,2,1
6,0,1,0,54.0,0,0,51.8625,0,1
7,0,3,0,2.0,3,1,21.075,0,4
8,1,3,1,27.0,0,2,11.1333,0,3
9,1,2,1,14.0,1,0,30.0708,1,3


In [34]:
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,3,0,34.5,0,0,7.8292,2,1
1,893,3,1,47.0,1,0,7.0,0,3
2,894,2,0,62.0,0,0,9.6875,2,1
3,895,3,0,27.0,0,0,8.6625,0,1
4,896,3,1,22.0,1,1,12.2875,0,3
5,897,3,0,14.0,0,0,9.225,0,1
6,898,3,1,30.0,0,0,7.6292,2,2
7,899,2,0,26.0,1,1,29.0,0,1
8,900,3,1,18.0,0,0,7.2292,1,3
9,901,3,0,21.0,2,0,24.15,0,1


In [35]:
#Create a new variable FamilySize, adding the siblings,parent and also the passenger itself
for dataset in df:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [36]:
train_data[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [37]:
#Create a new variable isSolo, if they are travelling solo 1 and not solo 0
for dataset in df:
    dataset["IsSolo"]=0
    dataset.loc[dataset['FamilySize'] == 1, 'IsSolo'] = 1

In [38]:
train_data[['IsSolo', 'Survived']].groupby(['IsSolo'], as_index=False).mean()

Unnamed: 0,IsSolo,Survived
0,0,0.50565
1,1,0.303538


In [39]:
#We can now drop Parch,SibSp and FamilySize since we have the information capture in IsSolo.
train_data = train_data.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_data = test_data.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
df = [train_data, test_data]

In [40]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo
0,0,3,0,22.0,7.25,0,1,0
1,1,1,1,38.0,71.2833,1,3,0
2,1,3,1,26.0,7.925,0,2,1
3,1,1,1,35.0,53.1,0,3,0
4,0,3,0,35.0,8.05,0,1,1


In [41]:
#Fill the missing Fare value with the median value
test_data['Fare'].fillna(test_data['Fare'].dropna().median(), inplace=True)

In [42]:
#Create a new variable FareBand to split the Fare in 4 different quartiles
train_data['FareBand'] = pd.qcut(train_data['Fare'], 4)

In [43]:
train_data[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [44]:
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo,FareBand
0,0,3,0,22.0,7.25,0,1,0,"(-0.001, 7.91]"
1,1,1,1,38.0,71.2833,1,3,0,"(31.0, 512.329]"
2,1,3,1,26.0,7.925,0,2,1,"(7.91, 14.454]"
3,1,1,1,35.0,53.1,0,3,0,"(31.0, 512.329]"
4,0,3,0,35.0,8.05,0,1,1,"(7.91, 14.454]"
5,0,3,0,,8.4583,2,1,1,"(7.91, 14.454]"
6,0,1,0,54.0,51.8625,0,1,1,"(31.0, 512.329]"
7,0,3,0,2.0,21.075,0,4,0,"(14.454, 31.0]"
8,1,3,1,27.0,11.1333,0,3,0,"(7.91, 14.454]"
9,1,2,1,14.0,30.0708,1,3,0,"(14.454, 31.0]"


In [45]:
#Now label the Fare 0-3 depending on the different quartile, 0 quartile = Label 0 and etc.
for dataset in df:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

In [46]:
#We can now drop the FareBand since we already capture the information in the Fare variable
train_data = train_data.drop(['FareBand'], axis=1)
df = [train_data, test_data]
    
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo
0,0,3,0,22.0,0,0,1,0
1,1,1,1,38.0,3,1,3,0
2,1,3,1,26.0,1,0,2,1
3,1,1,1,35.0,3,0,3,0
4,0,3,0,35.0,1,0,1,1
5,0,3,0,,1,2,1,1
6,0,1,0,54.0,3,0,1,1
7,0,3,0,2.0,2,0,4,0
8,1,3,1,27.0,1,0,3,0
9,1,2,1,14.0,2,1,3,0


In [47]:
for dataset in df:
    #Get the median age of each PClass base on the different gender, fill the NA with those values
    guess_df=dataset.groupby(['Sex', 'Pclass']).median()['Age']
    for i in range(0,2):
        for j in range(1,4):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex==i) & (dataset.Pclass==j),"Age"]=guess_df[i,j]
    dataset['Age'] = dataset['Age'].astype(int)


In [48]:
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo
0,0,3,0,22,0,0,1,0
1,1,1,1,38,3,1,3,0
2,1,3,1,26,1,0,2,1
3,1,1,1,35,3,0,3,0
4,0,3,0,35,1,0,1,1
5,0,3,0,25,1,2,1,1
6,0,1,0,54,3,0,1,1
7,0,3,0,2,2,0,4,0
8,1,3,1,27,1,0,3,0
9,1,2,1,14,2,1,3,0


In [49]:
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo
0,892,3,0,34,0,2,1,1
1,893,3,1,47,0,0,3,0
2,894,2,0,62,1,2,1,1
3,895,3,0,27,1,0,1,1
4,896,3,1,22,1,0,3,0
5,897,3,0,14,1,0,1,1
6,898,3,1,30,0,2,2,1
7,899,2,0,26,2,0,1,0
8,900,3,1,18,0,1,3,1
9,901,3,0,21,2,0,1,0


In [50]:
#Create a new variable AgeBand, to split the age into 5 equal bin
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)
train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [51]:
for dataset in df:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo,AgeBand
0,0,3,0,1,0,0,1,0,"(16.0, 32.0]"
1,1,1,1,2,3,1,3,0,"(32.0, 48.0]"
2,1,3,1,1,1,0,2,1,"(16.0, 32.0]"
3,1,1,1,2,3,0,3,0,"(32.0, 48.0]"
4,0,3,0,2,1,0,1,1,"(32.0, 48.0]"


In [52]:
#We can now remove the AgeBand, since the information has been already capture in Age variable
train_data = train_data.drop(['AgeBand'], axis=1)
df = [train_data, test_data]
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsSolo
0,0,3,0,1,0,0,1,0
1,1,1,1,2,3,1,3,0
2,1,3,1,1,1,0,2,1
3,1,1,1,2,3,0,3,0
4,0,3,0,2,1,0,1,1


In [53]:
print('Train columns with null values:\n', train_data.isnull().sum())

Train columns with null values:
 Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Title       0
IsSolo      0
dtype: int64


In [54]:
print('Test/Validation columns with null values:\n', test_data.isnull().sum())

Test/Validation columns with null values:
 PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Title          0
IsSolo         0
dtype: int64


In [55]:
"""
We are now ready to train the model and predict the required solution.There are many models out there,
we wil make use of the populars one to model below to predict the solution.
We will also make use of the K Stratified fold to cross validate the data
Logistic Regression
KNN or k-Nearest Neighbors
Support Vector Machines
Naive Bayes classifier
Decision Tree
Random Forrest
"""
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [56]:
X_train=train_data.drop("Survived",axis=1)
Y_train=train_data["Survived"]
X_test  = test_data.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [57]:
classifiers = []
classifiers.append(LogisticRegression())
classifiers.append(SVC())
classifiers.append(LinearSVC())
classifiers.append(RandomForestClassifier())
classifiers.append(KNeighborsClassifier())
classifiers.append(GaussianNB())
classifiers.append(DecisionTreeClassifier())

In [58]:
kfold = StratifiedKFold(n_splits=10)

In [59]:
cv_results = []
for classifier in classifiers :
    cv_results.append(cross_val_score(classifier, X_train, y = Y_train, scoring = "accuracy", cv = kfold, n_jobs=4))

In [60]:
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

In [61]:
cv_res = pd.DataFrame({"Algorithm":["LogisticRegression","SVC","LinearSVC","RandomForestClassifier",
                        "KNeighborsClassifier","GaussianNB","DecisionTree"],"Score":cv_means,"Score Errors": cv_std})

In [62]:
sorted_model=cv_res.sort_values(by='Score', ascending=False)
sorted_model

Unnamed: 0,Algorithm,Score,Score Errors
4,KNeighborsClassifier,0.803695,0.046281
6,DecisionTree,0.801423,0.043922
3,RandomForestClassifier,0.800287,0.042785
2,LinearSVC,0.786742,0.025234
0,LogisticRegression,0.784519,0.027816
1,SVC,0.783408,0.03647
5,GaussianNB,0.763246,0.032593


In [63]:
#From the above result we can see that the best classifier will be the KNN classifier
#We make use of the KNN classifier to predict whether the passenger in the test dataset will survive.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

84.62

In [64]:
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = test_data['PassengerId']
submission_df['Survived'] = Y_pred


In [65]:
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [66]:
submission_df.to_csv('submissions.csv', header=True, index=False)