**DATA LOADING AND DATA PREPROCESSING**

In [56]:
import pandas as pd 
import numpy as np
from math import sqrt
from collections import defaultdict

ttn = pd.read_csv("https://github.com/andvise/DataAnalyticsDatasets/blob/16ca8de1233c8643bfe85fcd1cd87c9ff2221312/titanic.csv?raw=True")

In [57]:
###Checking data types
ttn.dtypes

ttn.drop(['PassengerId','Name'], axis=1, inplace=True)


In [58]:
###Checking Missing Values in Variables
ttn.isnull().sum()

###Checking How Many Null Values are Present
ttn['Parents/Children Aboard'].isnull().sum()

###Replacing Missing Values with 0
ttn['Parents/Children Aboard'] = ttn['Parents/Children Aboard'].fillna(0)


**We can see that the Parents/Children Aboard column contains 1 NAN value.**

In [59]:
###Transforming 'Sex' into Numerical
ttn['Sex'] = np.where(ttn['Sex'] == 'female', 0, 1)

In [60]:
###Assigning Predicting Variables to X and Target Variable Y
x = ttn.iloc[:,1:]
y = ttn.iloc[:,:1]


In [62]:
def min_max_scale(column):

    return(column-column.min())/(column.max() - column.min())

#Scaling X and Y values
for xcol in x.columns:
    x[[xcol]] = min_max_scale(x[[xcol]])

for ycol in y.columns:
    y[[ycol]] = min_max_scale(y[[ycol]])

    

In [63]:
###Dividing the data into Training and Testing Set(80% training data and 20% testing data)

X_Train = x.sample(frac=0.8, random_state=1)
X_Test = x.drop(X_Train.index)

Y_Train = y.sample(frac=0.8, random_state=1)
Y_Test = y.drop(Y_Train.index)

In [64]:
#Shape of test and training data
x_trdat_shape = X_Train.shape
x_tedat_shape = X_Test.shape

y_trdat_shape = Y_Train.shape
y_tedat_shape = Y_Test.shape

print("X Train Shape:",x_trdat_shape)
print("Y Train Shape:",y_trdat_shape)
print("X Test Shape:",x_tedat_shape)
print("Y Test Shape:",y_tedat_shape)

X Train Shape: (710, 6)
Y Train Shape: (710, 1)
X Test Shape: (177, 6)
Y Test Shape: (177, 1)


# Part 2 - k-NN implementation


Class to implement KNN

In [84]:
class KNN:

    def __init__(self, k):
        
        self.k = k

    def fit(self, X, y):
        """
        This method fits the training data to the model. We also assume the length of the 
        training data and targets are same, else prediction method will break.
        """
        assert len(X) == len(y)
        self.X = X
        self.y = y
        return self

    def _distance(self, data1, data2):
        """
        Finding the Eucledian distance
        """
        return np.sqrt(sum((data1 - data2)**2))
  
    def _predict_one(self, test):
        """
        Method for fitted model which runs the X Test data comparing
        the Euclidean distances between each point.
        """
        distances = sorted((self._distance(x, test), y) for x, y in zip(self.X, self.y))
        neighbors = distances[:self.k]
        weights_by_class = defaultdict(list)
        for a, b in neighbors:
            weights_by_class[b].append(a)
        return max((sum(val), key) for key, val in weights_by_class.items())[1]

    def predict(self, X):
        """
        Methods for predicting each instance
        """
        return [self._predict_one(x) for x in X]

    def score(self, X, y):
        """
        Method for X Test and y Test, runs the data through the predict method.
        """
        return sum(1 for pred, true in zip(self.predict(X), y) if pred == true) / len(y)

In [85]:
###Method to flatten list of lists to a single list
def flatten(t):
    return [item for sublist in t for item in sublist]

Implementation of KNN with inputs.

Input paramaters 
X as numpy array,
Y as list.



In [86]:
###Instantiate the weighted KNN model with K
knn_test_nbrs = KNN(k=5)

###Fit the model to the training data.
knn_test_nbrs.fit(X_Train.to_numpy(), flatten(Y_Train.values.tolist()))

###Run predictions using the test sample data
prediction = knn_test_nbrs.predict(X_Test.to_numpy())

###Prediction accuracy
knn_pred_acc = knn_test_nbrs.score(X_Test.to_numpy(),flatten(Y_Test.to_numpy()))

data = {'y_Actual':   flatten(Y_Test.values.tolist()),
       'y_Predicted': prediction 
       }
df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], 
                               rownames=['Actual'], colnames=['Predicted'], 
                               margins = True)

print('The accuracy of the model is :', knn_pred_acc)
print("***************************")
print("'Confusion Matrix'")
print("***************************")
print(confusion_matrix)

The accuracy of the model is : 0.864406779661017
***************************
'Confusion Matrix'
***************************
Predicted  0.0  1.0  All
Actual                  
0.0         96    8  104
1.0         16   57   73
All        112   65  177


# Part 3 - Hyperparameters search 


In [89]:
K=[1, 3, 5, 7, 9, 11]

for i in K :
  ###Instantiate the weighted KNN model with K
  knn_test_nbrs = KNN(k=i)

  ###Fit the model to the training data.
  knn_test_nbrs.fit(X_Train.to_numpy(), flatten(Y_Train.values.tolist()))

  ###Run some predictions using the test sample data.
  prediction = knn_test_nbrs.predict(X_Test.to_numpy())

  ###Prediction accuracy.
  knn_pred_acc = knn_test_nbrs.score(X_Test.to_numpy(),flatten(Y_Test.to_numpy()))

  print("The accuracy of the model is %f for K %d" % (knn_pred_acc, i))



The accuracy of the model is 0.819209 for K 1
The accuracy of the model is 0.830508 for K 3
The accuracy of the model is 0.864407 for K 5
The accuracy of the model is 0.824859 for K 7
The accuracy of the model is 0.790960 for K 9
The accuracy of the model is 0.819209 for K 11


# Part 4 - Weighted k-NN 




Class to implement KNN

In [87]:
class WeightedKNN:

    def __init__(self, k):

        """
        Let k = assumed number of classifications.

        """
        self.k = k

    def fit(self, X, y):

        """
        Method to fit Training data to the model. We also assume that 
        then length of the Training data and targets are the same.
        """
        assert len(X) == len(y)
        self.X = X
        self.y = y
        return self

    def _distance(self, data1, data2):

        """
        Finding the Eucledian distance
        """
        return np.sqrt(sum((data1 - data2)**2))
        

    def _compute_weights(self, distances):

       """
       Computing the weights using inverse distance
       (if distance = 0, assign 1)
       """
       matches = [(1, y) for d, y in distances if d == 0]
       return matches if matches else [(1/pow(d, 2), y) for d, y in distances]
  
    def _predict_weight(self, test):

        distances = sorted((self._distance(x, test), y) for x, y in zip(self.X, self.y))
        weights = self._compute_weights(distances[:self.k])
        weights_by_class = defaultdict(list)
        for a, b in weights:
            weights_by_class[b].append(a)
        return max((sum(val), key) for key, val in weights_by_class.items())[1]

    def predict(self, X):

        return [self._predict_weight(x) for x in X]

    def score(self, X, y):

        """
        Method takes the X Test and y Test, runs the data through the predicted method.
        """
        return sum(1 for pred, true in zip(self.predict(X), y) if pred == true) / len(y)

In [88]:
#Instantiate the weighted KNN model with K
wknn_test_nbrs = WeightedKNN(k=5)

# Fit the model to the training data.
wknn_test_nbrs.fit(X_Train.to_numpy(), flatten(Y_Train.values.tolist()))

# Run some predictions using the test sample data.
prediction = wknn_test_nbrs.predict(X_Test.to_numpy())

# Prediction accuracy.
wknn_pred_acc = wknn_test_nbrs.score(X_Test.to_numpy(),flatten(Y_Test.to_numpy()))

data = {'y_Actual':   flatten(Y_Test.values.tolist()),
       'y_Predicted': prediction 
       }
df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
confusion_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'], 
                               rownames=['Actual'], colnames=['Predicted'], 
                               margins = True)

print('The accuracy of the model is :', wknn_pred_acc)
print("***************************")
print("'Confusion Matrix'")
print("***************************")
print(confusion_matrix)


The accuracy of the model is : 0.8418079096045198
***************************
'Confusion Matrix'
***************************
Predicted  0.0  1.0  All
Actual                  
0.0         92   12  104
1.0         16   57   73
All        108   69  177


In [72]:
K=[1, 3, 5, 7, 9, 11]

for i in K :
  #Instantiate the weighted KNN model with K
  wknn_test_nbrs = WeightedKNN(k=i)

  # Fit the model to the training data.
  wknn_test_nbrs.fit(X_Train.to_numpy(), flatten(Y_Train.values.tolist()))

  # Run some predictions using the test sample data.
  prediction = wknn_test_nbrs.predict(X_Test.to_numpy())

  # Prediction accuracy.
  wknn_pred_acc = wknn_test_nbrs.score(X_Test.to_numpy(),flatten(Y_Test.to_numpy()))

  print("The accuracy of the model is %f for K %d" % (wknn_pred_acc, i))

The accuracy of the model is 0.819209 for K 1
The accuracy of the model is 0.830508 for K 3
The accuracy of the model is 0.841808 for K 5
The accuracy of the model is 0.841808 for K 7
The accuracy of the model is 0.841808 for K 9
The accuracy of the model is 0.847458 for K 11


The Weigted-kNN does not outperform kNN for the given data set. 
It might be more suitable for larger data sets.