In [4]:
import numpy as np
import pandas as pd

In [6]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used by ``compute_distance``. The value is only validated when
        a distance is actually computed.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training samples and labels as NumPy arrays.

        Nothing is learned here; all the work happens in ``predict``.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Labels may be of any sortable type (negative integers, strings, ...);
        they are returned unchanged rather than being cast to ``int``.
        When several labels receive the same number of votes, the smallest
        label (in sorted order) wins.

        Returns
        -------
        list
            One predicted label per row of ``X``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1, or the distance metric is unknown.
        """
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k}")

        X = np.array(X)

        # Encode labels as 0..n_classes-1 so the vote can use bincount no
        # matter what the original label values are.
        classes, encoded = np.unique(self.y_train, return_inverse=True)
        encoded = encoded.reshape(-1)

        pred = []
        for x_test in X:
            distances = self.compute_distance(self.X_train, x_test)
            index = np.argsort(distances)[:self.k]
            votes = np.bincount(encoded[index], minlength=len(classes))
            # argmax returns the first maximum, i.e. the smallest tied label.
            pred.append(classes[np.argmax(votes)])
        return pred

    def compute_distance(self, X1, X2):
        """Return the distance from every row of ``X1`` to the point ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Distance metric not applicable: {self.distance_metric}")

In [7]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return standardised feature matrices.

    Steps:
      1. Drop identifier columns (``id``, ``CustomerId``, ``Surname``) when
         present; they describe the row, not the customer, and would distort
         distance computations.
      2. Encode ``Geography`` and ``Gender`` as integer codes. The mapping is
         learned from the training data and reused for the test data so the
         same string always maps to the same code. Test values not seen in
         training (and missing values) are encoded as -1.
      3. Standardise every feature with the training mean and standard
         deviation. Constant training columns are left centred at 0 instead
         of being divided by a zero standard deviation.

    Parameters
    ----------
    train_path, test_path
        Anything accepted by ``pandas.read_csv``. The training file must
        contain the target column ``Exited``.

    Returns
    -------
    X_train : numpy.ndarray
        Standardised training features.
    y : numpy.ndarray
        Values of the ``Exited`` column.
    X_test : numpy.ndarray
        Test features, with columns in the same order as ``X_train`` and
        scaled with the training statistics.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop identifier columns; errors='ignore' tolerates files lacking some.
    id_columns = ['id', 'CustomerId', 'Surname']
    train_data = train_data.drop(columns=id_columns, errors='ignore')
    test_data = test_data.drop(columns=id_columns, errors='ignore')

    # Encode categorical variables with one shared, train-derived mapping.
    for column in ('Geography', 'Gender'):
        train_categories = train_data[column].astype('category')
        train_data[column] = train_categories.cat.codes
        test_data[column] = pd.Categorical(
            test_data[column], categories=train_categories.cat.categories
        ).codes

    # Separate features and target
    features = train_data.drop(columns='Exited')
    X = features.to_numpy(dtype=float)
    y = train_data['Exited'].to_numpy()
    # Select test columns by name so a different column order in the test
    # file cannot silently misalign features.
    XtestData = test_data[features.columns].to_numpy(dtype=float)

    # Normalize features using training statistics only
    average = X.mean(axis=0)
    standDev = X.std(axis=0)
    # A constant column has zero std; dividing by it would produce NaN.
    standDev[standDev == 0] = 1.0

    X_train = (X - average) / standDev
    X_test = (XtestData - average) / standDev

    return X_train, y, X_test


In [8]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, shuffle=False, random_state=None):
    """Estimate accuracy of ``knn`` with k-fold cross-validation.

    The samples are split into ``n_splits`` contiguous folds of
    ``len(X) // n_splits`` samples; the last fold also takes the remainder.
    Each fold is used once for validation while the model is refitted on the
    remaining samples.

    Parameters
    ----------
    X, y : array-like
        Features and labels; converted with ``np.asarray`` so lists and
        DataFrames are accepted.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place for every fold.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= len(X)``.
    shuffle : bool, default False
        If True, samples are assigned to folds in a random order instead of
        file order (useful when the data is sorted).
    random_state : int or None, default None
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    float
        Unweighted mean of the per-fold accuracies.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    # Fewer than 2 folds leaves no training data; more folds than samples
    # leaves empty validation folds whose accuracy is NaN.
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples "
            f"({n_samples}), got {n_splits}"
        )

    if shuffle:
        order = np.random.default_rng(random_state).permutation(n_samples)
    else:
        order = np.arange(n_samples)

    lenFold = n_samples // n_splits
    calc_acc = []

    for fold in range(n_splits):
        start = fold * lenFold
        # The last fold absorbs the samples left over by integer division.
        stop = start + lenFold if fold != n_splits - 1 else n_samples

        indexVal = order[start:stop]
        indexTrain = np.concatenate([order[:start], order[stop:]])

        knn.fit(X[indexTrain], y[indexTrain])
        predictY = np.asarray(knn.predict(X[indexVal]))

        calc_acc.append(np.mean(predictY == y[indexVal]))

    return np.mean(calc_acc)


In [11]:
# Load and preprocess data
TRAIN_PATH = '/content/train.csv'
TEST_PATH = '/content/test.csv'
DISTANCE_METRIC = 'euclidean'
BASELINE_K = 5  # initial guess; kept unless another k scores strictly higher

X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Hyperparameter tuning: cross-validate every candidate k exactly once.
# (The baseline k is inside the candidate range, so it needs no separate run.)
cv_scores = {
    k: cross_validate(X, y, KNN(k=k, distance_metric=DISTANCE_METRIC))
    for k in range(1, 20)
}

# Cross-validation score of the baseline model
results = cv_scores[BASELINE_K]

# Start from the baseline and switch only on a strict improvement, so ties
# are resolved in favour of the baseline (or the earliest better k).
kHighest = BASELINE_K
scoreHighest = results
for k, score in cv_scores.items():
    if score > scoreHighest:
        scoreHighest = score
        kHighest = k

print(f"Best k = {kHighest} (mean CV accuracy = {scoreHighest:.4f})")

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=kHighest, distance_metric=DISTANCE_METRIC)
knn.fit(X, y)
finalPredictions = knn.predict(X_test)

# Save test predictions
test_ids = pd.read_csv(TEST_PATH)['id']
pd.DataFrame({'id': test_ids, 'Exited': finalPredictions}).to_csv('submissions.csv', index=False)
