In [11]:
import numpy as np
import pandas as pd

In [12]:
# Define the KNN class
class KNN:
    def __init__(self, k=5, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for i, x_test in enumerate(X):
          if self.distance_metric == 'euclidean':
            distances = [np.linalg.norm(x_test - x_train) for x_train in self.X_train]
          elif self.distance_metric == 'manhattan':
            distances = [np.sum(np.abs(x_test - x_train)) for x_train in self.X_train]
          else:
            raise ValueError("Unsupported distance metric")

          # Get the indices of the k nearest neighbors
          sorted_indices = np.argsort(distances)[:self.k]
          nearest_labels = self.y_train[sorted_indices]

          # Determine the most common class among the nearest neighbors
          prediction = np.bincount(nearest_labels.astype(int)).argmax()
          predictions.append(prediction)

          # Print progress every 100 predictions
          if (i + 1) % 1000 == 0:
              print(f"Predicted {i + 1}/{len(X)} samples.")

        return np.array(predictions)



In [13]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # Separate features and target variable
    X = train_data.drop(['CustomerId', 'Surname', 'Exited'], axis=1)
    y = train_data['Exited']
    X_test = test_data.drop(['CustomerId', 'Surname'], axis=1)

    # Combine training and test data for consistent encoding
    combined_data = pd.concat([X, X_test], axis=0)

    # Encode categorical variables manually
    combined_data['Geography'] = combined_data['Geography'].astype('category').cat.codes
    combined_data['Gender'] = combined_data['Gender'].astype('category').cat.codes

    # Split the encoded data back into training and test sets
    X = combined_data.iloc[:len(X), :].values
    X_test = combined_data.iloc[len(X):, :].values

    # Scale features
    X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
    X_test_scaled = (X_test - X.mean(axis=0)) / X.std(axis=0)

    return X_scaled, y.values, X_test_scaled

In [19]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    auc_scores = []
    print("Starting cross-validation with", n_splits, "splits...")
    for i in range(n_splits):
        print("Training on split", i+1)
        val_indices = indices[i * fold_size:(i + 1) * fold_size]
        train_indices = np.setdiff1d(indices, val_indices)
        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        auc = compute_auc(y_val, y_pred)
        print(f"Split {i + 1}/{n_splits}, AUC: {auc}")
        auc_scores.append(auc)
    print("Cross-validation complete, mean of auc scores:", np.mean(auc_scores))
    return np.mean(auc_scores)

# Define a function to compute AUC manually
def compute_auc(y_true, y_pred):
    sorted_indices = np.argsort(y_pred)
    y_true_sorted = y_true[sorted_indices]
    pos_count = np.sum(y_true_sorted)
    neg_count = len(y_true_sorted) - pos_count
    rank_sum = np.sum(np.where(y_true_sorted == 1)[0] + 1)
    auc = (rank_sum - (pos_count * (pos_count + 1) / 2)) / (pos_count * neg_count)
    return auc

# Define a function to compute accuracy manually
def compute_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

In [20]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning for K and distance metric
best_k = 3
best_distance_metric = 'euclidean'
best_auc = 0
for k in [3, 5, 7, 9, 11]:
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        auc = cross_validate(X, y, knn)
        print(f'K: {k}, Metric: {metric}, AUC: {auc}')
        if auc > best_auc:
            best_k = k
            best_distance_metric = metric
            best_auc = auc

print(f'Best K: {best_k}, Best Metric: {best_distance_metric}, Best AUC: {best_auc}')

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Starting cross-validation with 5 splits...
Training on split 1
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3000 samples.
Split 1/5, AUC: 0.7523167044392649
Training on split 2
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3000 samples.
Split 2/5, AUC: 0.7452107617240444
Training on split 3
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3000 samples.
Split 3/5, AUC: 0.7608583510205708
Training on split 4
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3000 samples.
Split 4/5, AUC: 0.7578765196156501
Training on split 5
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3000 samples.
Split 5/5, AUC: 0.7663903587841006
Cross-validation complete, mean of auc scores: 0.7565305391167262
Cross-validation scores: 0.7565305391167262
Starting cross-validation with 5 splits...
Training on split 1
Predicted 1000/3000 samples.
Predicted 2000/3000 samples.
Predicted 3000/3