In [3]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Put this when it's called
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier



from scipy.stats import norm
from scipy import stats
from scipy.stats import norm, skew

In [4]:
# Import data
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')  # Save original data set, just in case.

In [5]:
df_test.shape

(418, 11)

In [6]:
# Overview
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Descriptive statistics
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Analyze missing data
def draw_missing_data_table(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

In [10]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
Cabin,687,0.771044
Age,177,0.198653
Embarked,2,0.002245
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [11]:
draw_missing_data_table(df_test)

Unnamed: 0,Total,Percent
Cabin,327,0.782297
Age,86,0.205742
Fare,1,0.002392
Embarked,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [12]:
# Drop Cabin
df.drop('Cabin', axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [13]:
df_test.drop('Cabin', axis=1, inplace=True)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [14]:
# Fill missing values in Age with a specific value
value = 1000
df['Age'].fillna(1000, inplace=True)
df['Age'].max()

1000.0

In [15]:
value = 1000
df_test['Age'].fillna(1000, inplace=True)
df_test['Age'].max()

1000.0

In [16]:
df_test.fillna(0, inplace=True)

In [17]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [18]:
# Delete observations without Embarked
df.drop(df[pd.isnull(df['Embarked'])].index, inplace=True)  # Get index of points where Embarked is null
df[pd.isnull(df['Embarked'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [19]:
df_test.drop(df_test[pd.isnull(df_test['Embarked'])].index, inplace=True)  # Get index of points where Embarked is null
df_test[pd.isnull(df_test['Embarked'])]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [20]:
# Data types
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [21]:
df_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [22]:
# Drop PassengerId
#PassengerId = df_test.PassengerId
passenger_id = df['PassengerId'].values
df.drop('PassengerId', axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [23]:
passenger_id_test = df_test['PassengerId'].values
df_test.drop('PassengerId', axis=1, inplace=True)
df_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [24]:
# Define categorical variables
df['Sex'] = pd.Categorical(df['Sex'])
df['Embarked'] = pd.Categorical(df['Embarked'])

In [25]:
df_test['Sex'] = pd.Categorical(df_test['Sex'])
df_test['Embarked'] = pd.Categorical(df_test['Embarked'])

In [26]:
# Create Family feature
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [27]:
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
df_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,0
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,1
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,0
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,0
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,2


In [28]:
# Drop SibSp and Parch
df.drop('SibSp',axis=1,inplace=True)
df.drop('Parch',axis=1,inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,FamilySize
0,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,S,0


In [29]:
df_test.drop('SibSp',axis=1,inplace=True)
df_test.drop('Parch',axis=1,inplace=True)
df_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,FamilySize
0,3,"Kelly, Mr. James",male,34.5,330911,7.8292,Q,0
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,363272,7.0,S,1
2,2,"Myles, Mr. Thomas Francis",male,62.0,240276,9.6875,Q,0
3,3,"Wirz, Mr. Albert",male,27.0,315154,8.6625,S,0
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,3101298,12.2875,S,2


In [30]:
# Drop Name and Ticket
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,1
1,1,1,female,38.0,71.2833,C,1
2,1,3,female,26.0,7.925,S,0
3,1,1,female,35.0,53.1,S,1
4,0,3,male,35.0,8.05,S,0


In [31]:
df_test.drop('Name', axis=1, inplace=True)
df_test.drop('Ticket', axis=1, inplace=True)
df_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,male,34.5,7.8292,Q,0
1,3,female,47.0,7.0,S,1
2,2,male,62.0,9.6875,Q,0
3,3,male,27.0,8.6625,S,0
4,3,female,22.0,12.2875,S,2


In [32]:
# Transform categorical variables into dummy variables
df = pd.get_dummies(df, drop_first=True)  # To avoid dummy trap
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,1,1,0,1
1,1,1,38.0,71.2833,1,0,0,0
2,1,3,26.0,7.925,0,0,0,1
3,1,1,35.0,53.1,1,0,0,1
4,0,3,35.0,8.05,0,1,0,1


In [33]:
df_test = pd.get_dummies(df_test, drop_first=True)  # To avoid dummy trap
df_test.head()

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1


In [34]:
df_test.isnull().sum()

Pclass        0
Age           0
Fare          0
FamilySize    0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [35]:
# Create data set to train data imputation methods
X = df[df.loc[:, df.columns != 'Survived'].columns]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [36]:
X

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,7.2500,1,1,0,1
1,1,38.0,71.2833,1,0,0,0
2,3,26.0,7.9250,0,0,0,1
3,1,35.0,53.1000,1,0,0,1
4,3,35.0,8.0500,0,1,0,1
...,...,...,...,...,...,...,...
886,2,27.0,13.0000,0,1,0,1
887,1,19.0,30.0000,0,0,0,1
888,3,1000.0,23.4500,3,0,0,1
889,1,26.0,30.0000,0,1,0,0


In [37]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [38]:
# Fit logistic regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
# Model performance
scores = cross_val_score(logreg, X_train, y_train, cv=10)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))



CV accuracy: 0.786 +/- 0.026




In [40]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [41]:
# Calculate score for testing set
pred_test = logreg.predict(df_test)
#df_test['PassengerId'] = passenger_id_test
#print(rmsle(y_test, pred_test))

In [42]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [43]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [44]:
pred_test = logreg.predict(df_test)
#df_test['PassengerId'] = passenger_id_test

In [45]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, y_train) * 100, 2)
acc_svc



90.44

In [46]:
pred_test_svc = svc.predict(df_test)
#df_test['PassengerId'] = passenger_id_test

In [47]:
pred_test_svc

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [48]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
acc_knn

84.53

In [49]:
X_train

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
121,3,1000.0,8.0500,0,1,0,1
687,3,19.0,10.1708,0,1,0,1
790,3,1000.0,7.7500,0,1,1,0
837,3,1000.0,8.0500,0,1,0,1
659,1,58.0,113.2750,2,1,0,0
...,...,...,...,...,...,...,...
716,1,38.0,227.5250,0,0,0,0
768,3,1000.0,24.1500,1,1,1,0
73,3,26.0,14.4542,1,1,0,0
236,2,44.0,26.0000,1,1,0,1


In [50]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [51]:
# Calculate score for testing set
pred_test_knn = knn.predict(df_test)
#df_test['PassengerId'] = passenger_id_test

In [52]:
pred_test_knn

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,

In [53]:
# Generate submission file
submission = pd.DataFrame({ 'PassengerId': passenger_id_test,
                            'Survived': pred_test})
submission.to_csv("submission.csv", index=False)

In [54]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [55]:
df_test

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,0
1,3,47.0,7.0000,1,0,0,1
2,2,62.0,9.6875,0,1,1,0
3,3,27.0,8.6625,0,1,0,1
4,3,22.0,12.2875,2,0,0,1
...,...,...,...,...,...,...,...
413,3,1000.0,8.0500,0,1,0,1
414,1,39.0,108.9000,0,0,0,0
415,3,38.5,7.2500,0,1,0,1
416,3,1000.0,8.0500,0,1,0,1


In [56]:
submission2 = pd.DataFrame({ 'PassengerId': passenger_id_test,
                            'Survived': pred_test_knn})
submission2.to_csv("submission.csv", index=False)

In [62]:

submission3 = pd.DataFrame({ 'PassengerId': passenger_id_test,
                            'Survived': pred_test_svc})
submission3.to_csv("submission_svc.csv", index=False)

In [60]:
pred_test_svc.shape

(418,)