## Task 7: 
### Learning SVMs on the Titanic dataset. Please report your five-fold cross validation classification accuracies on Titanic training set, with respect to the linear, quadratic, and RBF kernels.

In [1]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [4]:
#Drop columns which may not be useful(Name, Ticket, Cabin)
drop_cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
train_df = train_df.drop(drop_cols, axis=1)
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [5]:
train_df['Embarked'].fillna('S', inplace=True)
train_df['Sex'] = train_df['Sex'].map({'male':0, 'female':1})
train_df['Embarked'] = train_df['Embarked'].map({'C':0, 'Q':1, 'S':2})

In [6]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)

train_df['Age'] = imputer.fit_transform(pd.DataFrame(train_df['Age'])).round(decimals=0)

#Check if any NaN are left
train_df['Age'].isna().any()

False

In [7]:
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

## Using Linear kernel

In [8]:
linear_svc = SVC(kernel='linear')
linear_svc.fit(X, y)

kf = KFold(n_splits=5, random_state=42, shuffle=True)
lsvc_acc_scores = cross_val_score(linear_svc, X, y, cv=kf, scoring='accuracy')
print(f'Average accuracy: {lsvc_acc_scores.mean()}')

Average accuracy: 0.7878852551628899


## Using Quadratic Kernel 

In [10]:
quad_svc = SVC(kernel='poly')
quad_svc.fit(X, y)

kf = KFold(n_splits=5, random_state=42, shuffle=True)
quad_acc_scores = cross_val_score(quad_svc, X, y, cv=kf, scoring='accuracy')
print(f'Average accuracy: {quad_acc_scores.mean()}')

Average accuracy: 0.6419998744586028


## Using RBF Kernel

In [11]:
rbf_svc = SVC(kernel='rbf')
rbf_svc.fit(X, y)

kf = KFold(n_splits=5, random_state=42, shuffle=True)
rbf_acc_scores = cross_val_score(rbf_svc, X, y, cv=kf, scoring='accuracy')
print(f'Average accuracy: {rbf_acc_scores.mean()}')

Average accuracy: 0.6779109911493315
