In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import neighbors
import math

In [2]:
# Load the training data from 'train.csv', resolved relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Discard the 'Cabin', 'Ticket' and 'Name' columns; the result is rebound to `df`.
df = df.drop(columns=['Cabin', 'Ticket', 'Name'])

In [5]:
# Use one encoder per column: a LabelEncoder only remembers the classes from its
# most recent fit, so sharing a single instance would lose the 'Sex' mapping as
# soon as 'Embarked' is fitted (and with it the ability to inverse_transform).
sex_le = preprocessing.LabelEncoder()
df['Sex'] = sex_le.fit_transform(df['Sex'])
# `le` keeps its original name and, as before, ends up fitted on 'Embarked'.
le = preprocessing.LabelEncoder()
df['Embarked'] = le.fit_transform(df['Embarked'])

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


# KNN

In [7]:
# Features: every column except 'PassengerId' and the target itself
# (so 'Survived' is used as a feature here). Target: 'Pclass'.
X = df.drop(columns=['PassengerId', 'Pclass'])
y = df['Pclass']

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
def knn_(k):
    """Fit a k-nearest-neighbours classifier and print its test-set metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the train/test split cell must have been run first.

    Parameters
    ----------
    k : int
        Number of neighbours, passed as ``n_neighbors`` to
        ``neighbors.KNeighborsClassifier``.

    Returns
    -------
    None
        The accuracy (percentage rounded to two decimals) and the confusion
        matrix are printed rather than returned.
    """
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Predict once and reuse the result for both metrics; calling score()
    # followed by predict() would run the neighbour search on X_test twice.
    y_pred = knn.predict(X_test)
    print('Accuracy:', round(accuracy_score(y_test, y_pred)*100,2))
    print('Confusion matrix:\n',confusion_matrix(y_test, y_pred))

In [10]:
knn_(1)

Accuracy: 89.14
Confusion matrix:
 [[ 62   6   2]
 [  3  37   9]
 [  1   8 139]]


In [11]:
knn_(2)

Accuracy: 86.52
Confusion matrix:
 [[ 67   2   1]
 [  8  37   4]
 [  4  17 127]]


In [12]:
knn_(3)

Accuracy: 86.52
Confusion matrix:
 [[ 63   5   2]
 [  7  28  14]
 [  3   5 140]]


In [13]:
knn_(4)

Accuracy: 82.4
Confusion matrix:
 [[ 61   7   2]
 [  8  30  11]
 [  4  15 129]]


# SVM

In [14]:
from sklearn import svm

In [25]:
# Re-read the raw CSV for the SVM section. This rebinds `df`, so the column
# drops and label encoding applied to the previous `df` are not carried over.
df = pd.read_csv('train.csv')

In [30]:
# Build the SVM working frame: the raw data without 'PassengerId', 'Cabin', 'Ticket' and 'Name'.
df1 = df.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])

In [52]:
# Use one encoder per column: a LabelEncoder only remembers the classes from its
# most recent fit, so sharing a single instance would lose the 'Sex' mapping as
# soon as 'Embarked' is fitted (and with it the ability to inverse_transform).
sex_le = preprocessing.LabelEncoder()
df1['Sex'] = sex_le.fit_transform(df1['Sex'])
# `le` keeps its original name and, as before, ends up fitted on 'Embarked'.
le = preprocessing.LabelEncoder()
df1['Embarked'] = le.fit_transform(df1['Embarked'])

In [53]:
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [43]:
df1.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [56]:
# NOTE(review): as far as the visible cells show, X and y are still the
# KNN-section variables (target 'Pclass', built from the earlier `df`), not
# derived from df1, so this repeats the earlier split with the same arguments.
# my_svm() creates its own local train/test split and does not read these
# globals. Confirm whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [74]:
def my_svm(dv, data=None, gamma=0.01, C=100, test_size=0.3, random_state=0):
    """Train an SVM classifier to predict one column from all the others and print a report.

    Parameters
    ----------
    dv : str
        Name of the dependent variable (the target column).
    data : pandas.DataFrame, optional
        Frame holding the target and the feature columns. When omitted, the
        module-level ``df1`` is used.
    gamma, C : optional
        Hyperparameters forwarded to ``svm.SVC`` (defaults 0.01 and 100).
    test_size : float, optional
        Fraction of rows held out for testing (default 0.3).
    random_state : int, optional
        Seed forwarded to ``train_test_split`` (default 0).

    Returns
    -------
    None
        The variable names, the test-set accuracy (percentage rounded to two
        decimals) and the confusion matrix are printed rather than returned.

    Raises
    ------
    ValueError
        If ``dv`` is not one of the frame's columns (raised by ``list.remove``).
    """
    frame = df1 if data is None else data
    feature_names = frame.columns.to_list()
    feature_names.remove(dv)
    rule = '==========================================================='
    print(rule)
    print('Dependent Variable:', dv)
    print('\nIndependent Variables:', *feature_names)
    target = frame[dv]
    features = frame.drop([dv], axis = 1)
    # Local split: these names shadow any module-level X_train/X_test/y_train/y_test.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    # NOTE(review): the features are passed to SVC without any scaling step — confirm this is intended.
    clf = svm.SVC(gamma=gamma, C=C)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy_pct = round(accuracy_score(y_test, y_pred, normalize=True)*100, 2)
    print(f'\n\nThe accuracy of the model for predicting "{dv}" for unseen data is {accuracy_pct}%')
    print('\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))
    print(rule)
    

In [75]:
# Fit one SVM per remaining column. 'Age' and 'Fare' are excluded as targets only:
# my_svm() reads df1, so both columns are still used as features.
df2 = df1.drop(columns=['Age', 'Fare'])
for col in df2.columns:
    my_svm(col)

Dependent Variable: Survived

Independent Variables: Pclass Sex Age SibSp Parch Fare Embarked


The accuracy of the model for predicting "Survived" for unseen data is 74.16%

Confusion matrix:
 [[124  33]
 [ 36  74]]
Dependent Variable: Pclass

Independent Variables: Survived Sex Age SibSp Parch Fare Embarked


The accuracy of the model for predicting "Pclass" for unseen data is 90.26%

Confusion matrix:
 [[ 64   4   2]
 [  4  36   9]
 [  1   6 141]]
Dependent Variable: Sex

Independent Variables: Survived Pclass Age SibSp Parch Fare Embarked


The accuracy of the model for predicting "Sex" for unseen data is 72.66%

Confusion matrix:
 [[ 66  32]
 [ 41 128]]
Dependent Variable: SibSp

Independent Variables: Survived Pclass Sex Age Parch Fare Embarked


The accuracy of the model for predicting "SibSp" for unseen data is 74.91%

Confusion matrix:
 [[150  24   7   0   1   0   0]
 [ 21  39   5   0   0   0   0]
 [  4   1   3   0   0   0   0]
 [  0   2   1   3   1   0   0]
 [  0   0   0   0 