In [23]:
# Imports. NOTE: train_test_split is imported, but no 20% validation split is performed in the cells shown here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from time import time
from sklearn.model_selection import train_test_split, GridSearchCV

# `display` and `HTML` are part of the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))  # Use all space available in the browser

# Silences pandas' SettingWithCopyWarning (a warning, not an error).
pd.options.mode.chained_assignment = None

In [24]:
# Read both CSV files; the paths are relative to the current working directory.
training, test = (pd.read_csv(csvName) for csvName in ("train.csv", "test.csv"))

In [25]:
''' Remove useless Features: '''

# Keep the label column and the test ids before they are dropped from the frames.
result = training['Survived']                                         # needed later for the training
testIds = test['PassengerId']                                         # needed later for the submission

# Columns removed from both frames; 'Survived' is removed from training only.
droppedFeatures = ['Ticket', 'PassengerId', 'Name', 'Fare', 'Age']
training = training.drop(columns=['Survived'] + droppedFeatures)
test = test.drop(columns=droppedFeatures)

In [26]:
''' transform Features into numbers if necessary (training): '''

# Each column is rebuilt and assigned back in a single step (`frame[col] = ...`).
# The earlier `frame.loc[:, col][positions] = value` form was chained
# assignment: it writes into an intermediate Series, and pandas does not
# guarantee that such a write reaches the frame (with Copy-on-Write enabled it
# never does). It also used positional indices from np.where as index labels.

# Sex: 1 = male, 0 = female (any other value, including NaN, becomes NaN)
training['Sex'] = training['Sex'].map({'male': 1, 'female': 0})

# Cabin: only the leading deck letter is kept and encoded as A=1, B=2, ...
# (ord(letter) - 64); the cabin number is discarded. Missing cabins become -1.
training['Cabin'] = (
    training['Cabin']
    .map(lambda cabin: ord(cabin[:1]) - 64, na_action='ignore')
    .fillna(-1)
    .astype(int)
)

# Embarked: position in embarkedClass, counted from 1. Missing values stay NaN
# (rows with NaN are removed in a later cell); letters outside embarkedClass
# also become NaN. The column is float while it still contains NaN.
embarkedClass = ["C","Q","S"] # C = Class 1, Q = Class 2, S = Class 3
embarkedCodes = {port: code for code, port in enumerate(embarkedClass, start=1)}
training['Embarked'] = training['Embarked'].map(embarkedCodes)

In [27]:
''' transform Features into numbers if necessary (test): '''

# Each column is rebuilt and assigned back in a single step (`frame[col] = ...`).
# The earlier `frame.loc[:, col][positions] = value` form was chained
# assignment: it writes into an intermediate Series, and pandas does not
# guarantee that such a write reaches the frame (with Copy-on-Write enabled it
# never does). It also used positional indices from np.where as index labels.

# Sex: 1 = male, 0 = female (any other value, including NaN, becomes NaN)
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})

# Cabin: only the leading deck letter is kept and encoded as A=1, B=2, ...
# (ord(letter) - 64); the cabin number is discarded. Missing cabins become -1.
test['Cabin'] = (
    test['Cabin']
    .map(lambda cabin: ord(cabin[:1]) - 64, na_action='ignore')
    .fillna(-1)
    .astype(int)
)

# Embarked: position in embarkedClass, counted from 1. Missing values stay NaN
# (rows with NaN are removed in a later cell); letters outside embarkedClass
# also become NaN. The column is float while it still contains NaN.
embarkedClass = ["C","Q","S"] # C = Class 1, Q = Class 2, S = Class 3
embarkedCodes = {port: code for code, port in enumerate(embarkedClass, start=1)}
test['Embarked'] = test['Embarked'].map(embarkedCodes)

In [28]:
# Preview the encoded training frame: the first ten rows by position.
print("Lets look at the first 10 data points")
print(training.iloc[:10])

Lets look at the first 10 data points
   Pclass Sex  SibSp  Parch Cabin Embarked
0       3   1      1      0    -1        3
1       1   0      1      0     3        1
2       3   0      0      0    -1        3
3       1   0      1      0     3        3
4       3   1      0      0    -1        3
5       3   1      0      0    -1        2
6       1   1      0      0     5        3
7       3   1      3      1    -1        3
8       3   0      0      2    -1        3
9       2   0      1      0    -1        1


In [29]:
''' Remove rows that still contain NaN: '''

# Row positions that hold a NaN (a row is listed once per NaN cell it contains).
nullRows = np.nonzero(training.isnull().to_numpy())[0]
nullRowsTest = np.nonzero(test.isnull().to_numpy())[0]

# Remove the same rows from the labels / ids so they stay aligned with the
# frames once those rows are dropped there as well.
result = result.drop(index=result.index[nullRows])
testIds = testIds.drop(index=testIds.index[nullRowsTest])

shapeBefore = len(training)
training = training.dropna()  # default behaviour: drop a row if any value is missing
test = test.dropna()

removedRows = shapeBefore - len(training)
print("Removed", removedRows, "rows from training_clean because at least one feature was None")

Removed 2 rows from training_clean because at least one feature was None


In [32]:

''' Get most meaningful features or combinations of features through PCA: '''
# n_components equals the number of feature columns, so no component is
# discarded: the data is projected onto all principal axes and whitened.
n_components = training.shape[1]

# The randomized SVD solver draws random numbers; fixing the seed makes the
# projection (and everything fitted on it afterwards) reproducible across runs.
PCA_SEED = 0
pca = PCA(
    n_components=n_components,
    whiten=True,
    svd_solver="randomized",
    random_state=PCA_SEED,
).fit(training)
eigenvalues = pca.explained_variance_

# Both sets are projected with the PCA that was fitted on the training frame.
X_train_pca = pca.transform(training)
X_test_pca = pca.transform(test)
print(X_train_pca.shape)

(889, 6)


In [12]:
'''Fitting the classifier to the training set'''

t0 = time()

# Candidate values searched for the RBF-kernel SVM.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

# fit() returns the fitted search object, so construction and fitting are chained.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid).fit(X_train_pca, result)

elapsed = time() - t0
print("done in %0.3fs" % elapsed)

done in 54.405s


In [33]:
'''Predicting alive or dead on the test set'''
y_pred = clf.predict(X_test_pca)

# Two-column table: each test id next to its predicted label.
submission = pd.DataFrame(
    data=np.column_stack([testIds, y_pred]),
    columns=['PassengerId', 'Survived'],
)

# NOTE(review): the preview starts at position 1, so the first row is not printed — confirm this is intended.
print(submission.iloc[1:15])

    PassengerId  Survived
1           893         1
2           894         0
3           895         0
4           896         0
5           897         0
6           898         1
7           899         0
8           900         1
9           901         0
10          902         0
11          903         0
12          904         1
13          905         0
14          906         1


In [22]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission1.csv', index=False) # result: 0.746 !