In [46]:
import numpy as np
import pandas as pd

In [47]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X2 - X1, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError("Invalid distance metric")

    def predict(self, X):
        return np.apply_along_axis(self.pred, 1, X.values)

    def pred(self, x):
        distances = self.compute_distance(x, self.X_train.values)
        k_indices = np.argpartition(distances, self.k)[:self.k]
        k_nearest_labels = self.y_train.iloc[k_indices].values
        k_nearest_distances = distances[k_indices]

        # Weighted average with inverse distance
        weights = 1 / (k_nearest_distances + 1e-5)
        weighted_average = np.dot(k_nearest_labels, weights) / np.sum(weights)

        return weighted_average


In [48]:

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combining training and test data for consistent preprocessing
    combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

    # One-hot encoding for categorical variables
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    # Select features
    features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] + \
               [col for col in combined_data.columns if col.startswith('Geography_') or col.startswith('Gender_')]

    # Normalizing the selected features
    combined_data[features] = (combined_data[features] - combined_data[features].mean()) / combined_data[features].std()

    # Splitting the combined data back into training and test sets
    X_train = combined_data.loc[:len(train_data)-1, features]
    y_train = train_data['Exited']
    X_test = combined_data.loc[len(train_data):, features]

    return X_train, y_train, X_test



In [49]:

def stratified_cross_validate(X, y, knn, n_splits=5):
    np.random.seed(42)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    fold_sizes = np.full(n_splits, len(X) // n_splits)
    fold_sizes[:len(X) % n_splits] += 1
    current = 0
    scores = []
    
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        val_indices = indices[start:stop]
        train_indices = np.concatenate([indices[:start], indices[stop:]])
        
        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
        
        knn.fit(X_train, y_train)
        y_est_prob = knn.predict(X_val)
        score = auc_score(y_val, y_est_prob)
        scores.append(score)
        
        current = stop

    return scores

def auc_score(y_true, y_est_prob):
    positive_indices = np.where(y_true == 1)[0]
    negative_indices = np.where(y_true == 0)[0]

    if len(positive_indices) == 0 or len(negative_indices) == 0:
        return 0.5

    positive_predictions = y_est_prob[positive_indices]
    negative_predictions = y_est_prob[negative_indices]

    correct_order = np.sum(positive_predictions[:, None] > negative_predictions)
    return correct_order / (len(positive_indices) * len(negative_indices))


In [50]:
X_train, y_train, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate the model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = stratified_cross_validate(X_train, y_train, knn)

print("Cross-validation scores:", cv_scores)
print("Mean ROC AUC score:", np.mean(cv_scores))

# Hyperparameter tuning
k_values = [3, 5, 7, 9, 11, 13,15,17]
distance_metrics = ['euclidean', 'manhattan']
best_score = 0
best_params = {}

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = stratified_cross_validate(X_train, y_train, knn)
        mean_score = np.mean(scores)

        if mean_score > best_score:
            best_score = mean_score
            best_params = {'k': k, 'distance_metric': metric}

print("Best parameters:", best_params)
print("Best ROC AUC score:", best_score)

# Train on the full dataset with optimal hyperparameters and make predictions on the test set
best_knn = KNN(**best_params)
best_knn.fit(X_train, y_train)
test_predictions = best_knn.predict(X_test)
rounded_predictions = np.round(test_predictions, 2)

# Save the predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': rounded_predictions.ravel()}).to_csv('submissions.csv', index=False)

Cross-validation scores: [np.float64(0.8032009591733709), np.float64(0.8433986175115208), np.float64(0.8328572068940737), np.float64(0.8091054838880926), np.float64(0.8451011270566188)]
Mean ROC AUC score: 0.8267326789047352
Best parameters: {'k': 17, 'distance_metric': 'manhattan'}
Best ROC AUC score: 0.894169322530578
