## This is the notebook for the code in my artcile: "K Nearest Neighbors by hand: opening the “black box” and understanding the algorithm within"

If you run this notebook top to bottom, you will run a KNNClassifier algorithm with scikit-learn and build one from scratch. If you read the article, you will come to understand how a KNN model classifies new data.

### Import, wrangle and split the data

In [2]:
# Standard imports
import numpy as np
import pandas as pd

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# import the titanic CSV. Run .shape and .head to learn the 
# dataframe's dimensions and to take a peek at the first 5 rows

df = pd.read_csv('titanic.csv')
print(df.shape)
df.head()

(887, 8)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [5]:
# Check to see if there are any null values

df.isnull().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [6]:
#### THIRD TEXT INSERT

def titanic_wrangle(df):
    df = df.copy()
    
    # Drop Name because it doesn't help predict
    # Drop fare because it would take more preprocessing that 
    drop_columns = ['Name', 'Fare']
    df = df.drop(drop_columns, axis=1)
    
    # Create X and y
    features = df.columns.drop('Survived')
    target = 'Survived'
    
    X = df[features]
    y = df[target]
    
    # One hot encode the sex column
    dummies = pd.get_dummies(X.Sex)
    X = pd.concat([X, dummies], axis='columns').drop('Sex', axis='columns')
    
    return X, y

X, y = titanic_wrangle(df)

In [7]:
# Check the X dataframe to see if the wrangle function worked correctly

X.head()

Unnamed: 0,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,female,male
0,3,22.0,1,0,0,1
1,1,38.0,1,0,1,0
2,3,26.0,0,0,1,0
3,1,35.0,1,0,1,0
4,3,35.0,0,0,0,1


In [8]:
#### THIRD TEXT INSERT

# Perform the train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
y_train.value_counts()

0    434
1    275
Name: Survived, dtype: int64

### KNNClassifier in scikit-learn

In [10]:
#### FOURTH TEXT INSERT

# Create KNN Classifier object and determine the number of neighbors (n_neighbors) parameter
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
model.predict(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0])

## Algo from scratch

In [11]:
#### FIFTH TEXT INSERT

# Helper functions 

def _sqrt(x):
    return x**.5 

# I couldn't find a better way to calculate Euclidean distance, 
# so I copied this code from Jason Brownlee at machinelearningmastery.com

def euclideanDistance(row1, row2):
    """ 
    Finds the Euclidean distance between two rows. I.e. the squared sum of the distance 
    between each feature in the two rows.
    """
    # Each time we call the function we are setting the "distance" variable
    # to 0.0
    distance = 0.0

    for i in range(len(row1)-1):
        # Add the distance between each feature in the two rows
        distance += (row1[i] - row2[i])**2

    # Return the square root of the distance between the two rows
    return _sqrt(distance)

In [12]:
#### SIXTH TEXT INSERT

class KNNHomebrew:
    
    def __init__(self, k=3):
        self.k = k # number of neighbors
            
    def model_fit(self, X, y):
        """
        Fits the training data to the model.
        
        KNN simply memorizes the data. So fitting the data is simple creating class
        variables for the X_train and y_train.
        """
        self.X_train = X
        self.y_train = y
    
    def model_predict(self, row):
        """
        This method lets us make a prediction on one new row of data.
        """
        
        # First need to find Euclidean distance between rows
        all_distances = {i: euclideanDistance(row, self.X_train.iloc[i]) for i in range(len(self.X_train))}

        sort_orders = [k for k, v in sorted(all_distances.items(), key=lambda item: item[1])][:self.k]

        # Make the prediciton
        output_values = []
        for k in sort_orders:
            output_values.append(self.y_train.iloc[k])

        prediction = max(set(output_values), key=output_values.count)

        return prediction
    
    def predict_all(self, X_test):
        """
        Using predict_all, we can make predictions on an array of new data.
        """

        self.predictions = []

        for i in range(len(X_test)):
            x = self.model_predict(X_test.iloc[i])
            self.predictions.append(x)

        return np.array(self.predictions)

    def model_accuracy(self, X_test, y_test):
        """ 
        Calculates the accuracy score of the new data.
        """
        return sum(self.predictions == y_test) / len(y_test)

In [13]:
#### SEVENTH TEXT INSERT

# Create KNN Classifier object and determine the number of neighbors (n_neighbors) parameter
knn = KNNHomebrew(3)

# Fit the model to the training data
knn.model_fit(X_train, y_train)

# Make predictions on the test set
knn.predict_all(X_test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0])

### Testing our Model's performance

In [14]:
# Start with the baseline

baseline = 434 / (434 + 275)
print(f'Our baseline score tells us that we can expect passengers to survive {baseline*100:.2f}% of the time')

Our baseline score tells us that we can expect passengers to survive 61.21% of the time


In [15]:
# Accuracy score of the KNNClassifier model in scikit-learn

model.score(X_test, y_test)

0.7584269662921348

In [16]:
# Accuracy score of our homebrew model

knn.model_accuracy(X_test, y_test)

0.7247191011235955