In [107]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_nearest_neighbors = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_neighbors]
            k_nearest_distances = distances[k_nearest_neighbors]

            if self.distance_metric == 'euclidean':
                weights = 1 / (k_nearest_distances + 1e-5)  # Add small value to avoid division by zero
            else:
                weights = 1 / (k_nearest_distances + 1e-5)

            # weighted sum of the labels
            weighted_sum = np.sum(k_nearest_labels * weights)
            prediction = 1 if weighted_sum / np.sum(weights) >= 0.35 else 0
            predictions.append(prediction)

        return np.array(predictions)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X2 - X1) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

In [110]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.

    # Dropping unnecessary columns
    train_data = train_data.drop(columns=['id','CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # One-hot encode categorical columns
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'])
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'])

    # Ensure test_data has the same columns as train_data
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

    # Convert 'HasCrCard', 'IsActiveMember', and boolean columns to integers (0 or 1)
    bool_columns = train_data.select_dtypes(include='bool').columns
    train_data[bool_columns] = train_data[bool_columns].astype(int)
    test_data[bool_columns] = test_data[bool_columns].astype(int)

    # Normalize numerical features using Min-Max scaling
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    train_data = min_max_scale_features(train_data, numerical_features)
    test_data = min_max_scale_features(test_data, numerical_features)

    # Oversampling the minority class (Exited=1)
    class_0 = train_data[train_data['Exited'] == 0]
    class_1 = train_data[train_data['Exited'] == 1]

    # Oversample the minority class by duplicating samples
    class_1_oversampled = class_1.sample(len(class_0), replace=True)
    train_data_balanced = pd.concat([class_0, class_1_oversampled], axis=0)

    X_train = train_data_balanced.drop('Exited', axis=1).values
    y_train = train_data_balanced['Exited'].values

    X_test = test_data.drop(['Exited'], axis=1, errors='ignore')

    return X_train, y_train, X_test.values

In [111]:
def min_max_scale_features(data, numerical_features):
    for feature in numerical_features:
        min_val = data[feature].min()
        max_val = data[feature].max()
        data[feature] = (data[feature] - min_val) / (max_val - min_val)
    return data

In [112]:
def roc_auc_score(y_true, y_pred):
    sorted_indices = np.argsort(y_pred)
    y_true_sorted = y_true[sorted_indices]

    tpr = np.cumsum(y_true_sorted) / np.sum(y_true_sorted)
    fpr = np.cumsum(1 - y_true_sorted) / np.sum(1 - y_true_sorted)

    auc = np.trapz(tpr, fpr)
    return auc

In [113]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    fold_size = len(X) // n_splits
    auc_scores = []

    indices = np.arange(len(X))
    np.random.shuffle(indices)  # Shuffling the dataset

    X_shuffled = X[indices]
    y_shuffled = y[indices]

    for i in range(n_splits):
        X_val = X_shuffled[i * fold_size: (i + 1) * fold_size]
        y_val = y_shuffled[i * fold_size: (i + 1) * fold_size]

        X_train = np.concatenate((X_shuffled[:i * fold_size], X_shuffled[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y_shuffled[:i * fold_size], y_shuffled[(i + 1) * fold_size:]), axis=0)

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        auc = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)

In [114]:
def tune_hyperparameters(X, y, k_values, distance_metrics):
    best_k = None
    best_distance = None
    best_auc = 0

    for k in k_values:
        for distance_metric in distance_metrics:
            knn = KNN(k=k, distance_metric=distance_metric)
            auc = cross_validate(X, y, knn)
            print(f"K={k}, Distance={distance_metric}, AUC={auc}")
            if auc > best_auc:
                best_auc = auc
                best_k = k
                best_distance = distance_metric

    return best_k, best_distance, best_auc

In [115]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/drive/MyDrive/CS506/Assignments/Assignment_5/train.csv', '/content/drive/MyDrive/CS506/Assignments/Assignment_5/test.csv')



# TODO: hyperparameters tuning
k_values = [1, 2, 3, 4, 5, 6, 8, 10, 15, 20]
distance_metrics = ['euclidean', 'manhattan']
best_k, best_distance, best_auc = tune_hyperparameters(X, y, k_values, distance_metrics)

print(f"Best K: {best_k}, Best Distance Metric: {best_distance}, Best AUC: {best_auc}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance)
knn.fit(X, y)
test_predictions = knn.predict(X_test)


# Save test predictions
pd.DataFrame({'id': pd.read_csv('/content/drive/MyDrive/CS506/Assignments/Assignment_5/test.csv')['id'], 'Exited': test_predictions}).to_csv('/content/drive/MyDrive/CS506/Assignments/Assignment_5/submissions.csv', index=False)

K=1, Distance=euclidean, AUC=0.06025235746587119
K=1, Distance=manhattan, AUC=0.0569055032788337
K=2, Distance=euclidean, AUC=0.09127529738373803
K=2, Distance=manhattan, AUC=0.08949333974870267
K=3, Distance=euclidean, AUC=0.08719200686122179
K=3, Distance=manhattan, AUC=0.08756635587063959
K=4, Distance=euclidean, AUC=0.10720561755658856
K=4, Distance=manhattan, AUC=0.10522862002993875
K=5, Distance=euclidean, AUC=0.11726115837075021
K=5, Distance=manhattan, AUC=0.11547062651963422
K=6, Distance=euclidean, AUC=0.11569127628107174
K=6, Distance=manhattan, AUC=0.1152225124077731
K=8, Distance=euclidean, AUC=0.12500896764849023
K=8, Distance=manhattan, AUC=0.12403566453191632
K=10, Distance=euclidean, AUC=0.12840742389200271
K=10, Distance=manhattan, AUC=0.1262500792402404
K=15, Distance=euclidean, AUC=0.12657351101240993
K=15, Distance=manhattan, AUC=0.12625775985948826
K=20, Distance=euclidean, AUC=0.12527434648541141
K=20, Distance=manhattan, AUC=0.12583053168763156
Best K: 10, Best 