In [113]:
import numpy as np
import pandas as pd

In [114]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        distances = self.compute_distance(X, self.X_train)
        index = np.argsort(distances)[:, :self.k]
        k_nearest_labels = self.y_train[index]
        predictions = []
        for labels in k_nearest_labels:
            unique, counts = np.unique(labels, return_counts=True)
            predictions.append(unique[np.argmax(counts)])
        
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
          return np.sqrt(np.sum((X1[:, np.newaxis, :] - X2[np.newaxis, :, :]) ** 2, axis=2))

In [118]:
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    X = train_data.drop('Exited', axis=1)
    y = train_data["Exited"]
    X_test = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    categorical_cols = ['Geography', 'Gender']
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'Balance', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    for col in numerical_cols:
        X.fillna({col: X[col].mean()}, inplace=True)
        X_test.fillna({col: X_test[col].mean()}, inplace=True)

    for col in categorical_cols:
        X.fillna({col: X[col].mode()[0]}, inplace=True)
        X_test.fillna({col: X_test[col].mode()[0]}, inplace=True)

        X = pd.get_dummies(X, columns=[col], drop_first=True, dtype=float)
        X_test = pd.get_dummies(X_test, columns=[col], drop_first=True, dtype=float)

    X[numerical_cols] = (X[numerical_cols] - X[numerical_cols].mean()) / X[numerical_cols].std()
    X_test[numerical_cols] = (X_test[numerical_cols] - X_test[numerical_cols].mean()) / X_test[numerical_cols].std()

    X = X.to_numpy()
    y = y.to_numpy()
    X_test = X_test.to_numpy()

    return X, y, X_test

In [119]:
def cross_validate(X, y, knn, n_splits=5):
    index = np.arange(len(X))
    np.random.shuffle(index)

    fold_size = len(X) // n_splits
    folds = [index[i * fold_size:(i + 1) * fold_size] for i in range(n_splits)]

    auc_scores = []

    for i in range(n_splits):
        val_indices = folds[i]
        train_indices = np.concatenate([folds[j] for j in range(n_splits) if j != i])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        knn.fit(X_train, y_train)

        distances = knn.compute_distance(X_val, X_train)

        knn_indices = np.argsort(distances, axis=1)[:, :knn.k]
        min_distances = np.min(distances[np.arange(distances.shape[0])[:, np.newaxis], knn_indices], axis=1)

        scores = 1 / (min_distances + 1e-8)

        def calculate_roc_auc(y_true, y_scores):

            sorted_indices = np.argsort(y_scores)
            y_true_sorted = y_true[sorted_indices]

            tps = np.cumsum(y_true_sorted)
            fps = np.arange(1, len(y_true_sorted) + 1) - tps

            tpr = tps / tps[-1]
            fpr = fps / fps[-1]

            return np.trapz(tpr, fpr)
        
        auc = calculate_roc_auc(y_val, scores)
        auc_scores.append(auc)

    return np.mean(auc_scores)


In [121]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')
# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')
# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

best_k = None
best_score = -1

for k in range(1, 10, 2):
    knn = KNN(k=k, distance_metric='euclidean')
    score = cross_validate(X, y, knn)
    print(f"k={k}, score={score}")
    if score > best_score:
        best_score = score
        best_k = k
print(f"Best k={best_k}")

knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.6773837072912927
k=1, score=0.67813233228453
k=3, score=0.6781402921788838
k=5, score=0.67770046841282
k=7, score=0.6776701507535391
k=9, score=0.6746320165375661
Best k=3
