In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def _predict_single(self, x):
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        else:
            raise ValueError("Unsupported distance metric")

In [4]:
def preprocess_data(train, test):
    # Dropping irrelevant columns: 'id', 'CustomerId', 'Surname'
    train = train.drop(['id', 'CustomerId', 'Surname'], axis=1)
    test = test.drop(['id', 'CustomerId', 'Surname'], axis=1)
    
    # Handle missing values
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    
    train[numerical_cols] = imputer_num.fit_transform(train[numerical_cols])
    train[categorical_cols] = imputer_cat.fit_transform(train[categorical_cols])
    
    test[numerical_cols] = imputer_num.transform(test[numerical_cols])
    test[categorical_cols] = imputer_cat.transform(test[categorical_cols])
    
    # Encoding categorical variables
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        test[col] = le.transform(test[col])
        label_encoders[col] = le
    
    train['Balance_to_Salary'] = train['Balance'] / (train['EstimatedSalary'] + 1)
    test['Balance_to_Salary'] = test['Balance'] / (test['EstimatedSalary'] + 1)
    
    scaler = StandardScaler()
    train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
    test[numerical_cols] = scaler.transform(test[numerical_cols])
    
    X_train = train.drop('Exited', axis=1)
    y_train = train['Exited']
    
    return X_train, y_train, test

In [9]:
def cross_validate_sample(X, y, knn, n_splits=3, sample_size=0.3):
    skf = StratifiedKFold(n_splits=n_splits)
    auc_scores = []

    sample_indices = np.random.choice(len(X), int(sample_size * len(X)), replace=False)
    X_sample = X.iloc[sample_indices]
    y_sample = y.iloc[sample_indices]

    for train_idx, val_idx in skf.split(X_sample, y_sample):
        X_train, X_val = X_sample.iloc[train_idx], X_sample.iloc[val_idx]
        y_train, y_val = y_sample.iloc[train_idx], y_sample.iloc[val_idx]
        
        knn.fit(X_train.values, y_train.values)
        y_val_pred = knn.predict(X_val.values)
        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)

In [11]:
X_train, y_train, X_test = preprocess_data(train_data, test_data)

In [13]:
best_k = 3
best_distance = 'euclidean'
best_auc = 0

for k in range(3, 7):
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        auc_score = cross_validate_sample(X_train, y_train, knn, n_splits=3, sample_size=0.3)
        print(f"k: {k}, Metric: {metric}, AUC: {auc_score}")
        if auc_score > best_auc:
            best_k = k
            best_distance = metric
            best_auc = auc_score

print(f"Best k: {best_k}, Best Distance Metric: {best_distance}, Best AUC: {best_auc}")


k: 3, Metric: euclidean, AUC: 0.7603646264836792
k: 3, Metric: manhattan, AUC: 0.7652486377825815
k: 4, Metric: euclidean, AUC: 0.7441660075040418
k: 4, Metric: manhattan, AUC: 0.7667308316820064
k: 5, Metric: euclidean, AUC: 0.7509164771349983
k: 5, Metric: manhattan, AUC: 0.7804733050370333
k: 6, Metric: euclidean, AUC: 0.775610681826167
k: 6, Metric: manhattan, AUC: 0.748368750068425
Best k: 5, Best Distance Metric: manhattan, Best AUC: 0.7804733050370333


In [15]:
knn_final = KNN(k=best_k, distance_metric=best_distance)
knn_final.fit(X_train.values, y_train.values)

test_predictions = knn_final.predict(X_test.values)

In [16]:
# Prepare the submission file
test_data = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'id': test_data['id'],
    'Exited': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'.")

Predictions saved to 'submission.csv'.
