# Predicting Titanic Survival using K-Nearest Neighbors

## Importing the libraries

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [18]:
dataset_train = pd.read_csv('train.csv')
X_train = dataset_train.iloc[:, 1:-1].values
y_train = dataset_train.iloc[:, -1].values

dataset_test = pd.read_csv('test.csv')
X_test = dataset_test.iloc[:, 1:].values

In [19]:
print(X_train)

[[3 'male' 22.0 ... 0 7.25 'S']
 [1 'female' 38.0 ... 0 71.2833 'C']
 [3 'female' 26.0 ... 0 7.925 'S']
 ...
 [3 'female' nan ... 2 23.45 'S']
 [1 'male' 26.0 ... 0 30.0 'C']
 [3 'male' 32.0 ... 0 7.75 'Q']]


In [20]:
print(y_train)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0
 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 

In [21]:
print(X_test)

[[3 'male' 34.5 ... 0 7.8292 'Q']
 [3 'female' 47.0 ... 0 7.0 'S']
 [2 'male' 62.0 ... 0 9.6875 'Q']
 ...
 [3 'male' 38.5 ... 0 7.25 'S']
 [3 'male' nan ... 0 8.05 'S']
 [3 'male' nan ... 1 22.3583 'C']]


# **Taking care of missing data**

In [22]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train[:,3:4])
X_train[:, 2:3] = imputer.transform(X_train[:, 2:3])
X_test[:, 2:3] = imputer.transform(X_test[:, 2:3])
X_test[:, 5:6] = imputer.transform(X_test[:, 5:6])

In [23]:
print(X_train)
print(X_test)

[[3 'male' 22.0 ... 0 7.25 'S']
 [1 'female' 38.0 ... 0 71.2833 'C']
 [3 'female' 26.0 ... 0 7.925 'S']
 ...
 [3 'female' 0.5241844769403825 ... 2 23.45 'S']
 [1 'male' 26.0 ... 0 30.0 'C']
 [3 'male' 32.0 ... 0 7.75 'Q']]
[[3 'male' 34.5 ... 0 7.8292 'Q']
 [3 'female' 47.0 ... 0 7.0 'S']
 [2 'male' 62.0 ... 0 9.6875 'Q']
 ...
 [3 'male' 38.5 ... 0 7.25 'S']
 [3 'male' 0.5241844769403825 ... 0 8.05 'S']
 [3 'male' 0.5241844769403825 ... 1 22.3583 'C']]


# **Encoding categorical data**

# Encoding the Independent Variable

In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_train[:,1] = le.fit_transform(X_train[:,1])
X_test[:,1] = le.fit_transform(X_test[:,1])

In [25]:
print(X_train)
print(X_test)

[[3 1 22.0 ... 0 7.25 'S']
 [1 0 38.0 ... 0 71.2833 'C']
 [3 0 26.0 ... 0 7.925 'S']
 ...
 [3 0 0.5241844769403825 ... 2 23.45 'S']
 [1 1 26.0 ... 0 30.0 'C']
 [3 1 32.0 ... 0 7.75 'Q']]
[[3 1 34.5 ... 0 7.8292 'Q']
 [3 0 47.0 ... 0 7.0 'S']
 [2 1 62.0 ... 0 9.6875 'Q']
 ...
 [3 1 38.5 ... 0 7.25 'S']
 [3 1 0.5241844769403825 ... 0 8.05 'S']
 [3 1 0.5241844769403825 ... 1 22.3583 'C']]


In [26]:
print(X_train[0:5,:])
print(X_test[0:5,:])

[[3 1 22.0 1 0 7.25 'S']
 [1 0 38.0 1 0 71.2833 'C']
 [3 0 26.0 0 0 7.925 'S']
 [1 0 35.0 1 0 53.1 'S']
 [3 1 35.0 0 0 8.05 'S']]
[[3 1 34.5 0 0 7.8292 'Q']
 [3 0 47.0 1 0 7.0 'S']
 [2 1 62.0 0 0 9.6875 'Q']
 [3 1 27.0 0 0 8.6625 'S']
 [3 0 22.0 1 1 12.2875 'S']]


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,6])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.fit_transform(X_test))

In [28]:
print(X_train[0:5,:])
print(X_test[0:5,:])

[[0.0 0.0 1.0 0.0 0.0 1.0 1 22.0 1 0 7.25]
 [1.0 0.0 0.0 1.0 0.0 0.0 0 38.0 1 0 71.2833]
 [0.0 0.0 1.0 0.0 0.0 1.0 0 26.0 0 0 7.925]
 [1.0 0.0 0.0 0.0 0.0 1.0 0 35.0 1 0 53.1]
 [0.0 0.0 1.0 0.0 0.0 1.0 1 35.0 0 0 8.05]]
[[0.0 0.0 1.0 0.0 1.0 0.0 1 34.5 0 0 7.8292]
 [0.0 0.0 1.0 0.0 0.0 1.0 0 47.0 1 0 7.0]
 [0.0 1.0 0.0 0.0 1.0 0.0 1 62.0 0 0 9.6875]
 [0.0 0.0 1.0 0.0 0.0 1.0 1 27.0 0 0 8.6625]
 [0.0 0.0 1.0 0.0 0.0 1.0 0 22.0 1 1 12.2875]]


In [29]:
print(X_train)

[[0.0 0.0 1.0 ... 1 0 7.25]
 [1.0 0.0 0.0 ... 1 0 71.2833]
 [0.0 0.0 1.0 ... 0 0 7.925]
 ...
 [0.0 0.0 1.0 ... 1 2 23.45]
 [1.0 0.0 0.0 ... 0 0 30.0]
 [0.0 0.0 1.0 ... 0 0 7.75]]


In [30]:
print(X_test)

[[0.0 0.0 1.0 ... 0 0 7.8292]
 [0.0 0.0 1.0 ... 1 0 7.0]
 [0.0 1.0 0.0 ... 0 0 9.6875]
 ...
 [0.0 0.0 1.0 ... 0 0 7.25]
 [0.0 0.0 1.0 ... 0 0 8.05]
 [0.0 0.0 1.0 ... 1 1 22.3583]]


## Training the K-NN model on the Training set

In [31]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

## Predicting a new result

In [None]:
#print(classifier.predict(sc.transform([[6.9,3.2,5.7,2.3]])))

['Iris-virginica']


## Predicting the Test set results

In [36]:
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 0 0 0 0
 1 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1
 1 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0
 0 1 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1
 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0
 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0
 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0
 0 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0
 1 1 1 1 1 0 0 1 0 0 1]
