In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the KNN class
class KNN:
    """k-nearest-neighbours scorer for binary (0/1) labels.

    ``predict`` returns, for every query row, a score in [0, 1]: the
    (optionally distance-weighted) average of the labels of the ``k`` closest
    training rows. With 0/1 labels this is the estimated probability of the
    positive class, which is what a ranking metric such as ROC-AUC needs.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used by ``compute_distance``.
    weights : str
        ``'uniform'`` gives every neighbour the same vote; any other value
        (conventionally ``'distance'``) weights neighbours by 1 / distance.
    """

    def __init__(self, k=3, distance_metric='euclidean', weights='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weights = weights

    def fit(self, X, y):
        """Memorise the training set (KNN has no real training phase).

        Inputs are converted to NumPy arrays so later positional indexing is
        well defined even when a pandas object is passed in. A 1-D ``X`` is
        treated as a single feature column.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        self.X_train = X
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Predict whether each instance in X will churn.

        Returns a 1-D array with one score per row of ``X`` (the average label
        of the k nearest training rows, weighted if ``weights != 'uniform'``).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        predictions = []
        for x in X:
            # One broadcasted call gives the distance to every training row.
            distances = self.compute_distance(self.X_train, x)
            # Stable sort keeps tie-breaking deterministic (lowest index first).
            nearest = np.argsort(distances, kind='stable')[:self.k]
            nearest_labels = self.y_train[nearest]
            nearest_distances = distances[nearest]

            if self.weights == 'uniform':
                prediction = np.mean(nearest_labels)
            else:
                exact_match = nearest_distances == 0
                if np.any(exact_match):
                    # 1 / 0 would produce inf weights and a NaN score; an exact
                    # match is infinitely close, so only those rows vote.
                    prediction = np.mean(nearest_labels[exact_match])
                else:
                    inv_dist = 1.0 / nearest_distances
                    prediction = np.sum(inv_dist * nearest_labels) / np.sum(inv_dist)

            predictions.append(prediction)

        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Distance between ``X1`` and ``X2`` under ``self.distance_metric``.

        The reduction runs over the last axis, so two 1-D vectors yield a
        scalar, while a 2-D matrix against a vector yields one distance per
        matrix row.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=-1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=-1)
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'"
        )

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into numeric feature arrays.

    Steps:
      1. Drop identifier columns (``id``, ``CustomerId``, ``Surname``) when present.
      2. Impute missing numeric values with the *training* medians.
      3. One-hot encode ``Geography`` and ``Gender`` and align the test columns
         to the training columns.
      4. Standardize both sets with the *training* mean and standard deviation.

    Every statistic is learned from the training set only, so train and test
    rows end up in the same feature space and on the same scale.

    Parameters
    ----------
    train_path : str
        CSV containing the features and the ``Exited`` target column.
    test_path : str
        CSV containing the features only.

    Returns
    -------
    X_train : numpy.ndarray of float, shape (n_train, n_features)
    y_train : numpy.ndarray, shape (n_train,)
    X_test : numpy.ndarray of float, shape (n_test, n_features)
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Identifiers carry no predictive signal and would distort distances.
    identifier_columns = ['id', 'CustomerId', 'Surname']
    train_data = train_data.drop(columns=identifier_columns, errors='ignore')
    test_data = test_data.drop(columns=identifier_columns, errors='ignore')

    # numeric_only=True: median() raises on string columns in recent pandas.
    # Keys that do not exist in the test frame (e.g. 'Exited') are ignored by fillna.
    train_medians = train_data.median(numeric_only=True)
    train_data = train_data.fillna(train_medians)
    test_data = test_data.fillna(train_medians)

    # Encode categoricals. The test set keeps every level (no drop_first) and is
    # then aligned to the training columns, so a level that is absent from the
    # test file, or is the only one present, is still encoded correctly.
    categorical_columns = ['Geography', 'Gender']
    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_columns)

    feature_columns = train_data.columns.drop('Exited')
    test_data = test_data.reindex(columns=feature_columns, fill_value=0)

    # Explicit float dtype: boolean dummy columns would otherwise make the
    # mixed-type frame convert to an object array.
    X_train = train_data[feature_columns].to_numpy(dtype=float)
    y_train = train_data['Exited'].to_numpy()
    X_test = test_data.to_numpy(dtype=float)

    # Standardize with training statistics; constant columns keep a unit scale
    # to avoid division by zero.
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    std[std == 0] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test

In [4]:
def compute_roc_auc(y_true, y_scores):
    """Compute the area under the ROC curve from scratch.

    Samples are ranked by descending score and the ROC curve is evaluated at
    every *distinct* score value. Tied scores therefore move the curve along a
    single diagonal segment, which makes the result independent of how the
    sort happens to order the ties.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives, everything else
        is counted as a negative.
    y_scores : array-like
        Predicted scores, higher meaning "more likely positive".

    Returns
    -------
    float
        AUC in [0, 1], or NaN when ``y_true`` does not contain both classes
        (the ROC curve is undefined in that case).
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores, dtype=float)

    # Rank by descending score.
    order = np.argsort(-y_scores, kind='stable')
    ranked_labels = y_true[order]
    ranked_scores = y_scores[order]

    is_positive = ranked_labels == 1
    P = int(np.sum(is_positive))    # Total positives
    N = len(ranked_labels) - P      # Total negatives
    if P == 0 or N == 0:
        return float('nan')

    # Running true/false positive counts as the threshold is lowered.
    tp_cum = np.cumsum(is_positive)
    fp_cum = np.cumsum(~is_positive)

    # Keep only the last sample of each run of equal scores: a threshold
    # cannot separate samples that share a score.
    threshold_idx = np.append(np.flatnonzero(np.diff(ranked_scores)),
                              len(ranked_scores) - 1)

    TPR = np.concatenate(([0.0], tp_cum[threshold_idx] / P))
    FPR = np.concatenate(([0.0], fp_cum[threshold_idx] / N))

    # Trapezoidal rule written out explicitly (np.trapz is deprecated in NumPy 2).
    auc = np.sum(np.diff(FPR) * (TPR[1:] + TPR[:-1]) / 2.0)
    return auc

# Define cross-validation function
def cross_validate(X, y, k, distance_metric, weights, n_splits=5,
                   shuffle=False, random_state=None):
    """Estimate the ROC-AUC of a KNN configuration with K-fold cross-validation.

    The samples are split into ``n_splits`` folds whose sizes differ by at
    most one, so every sample is used for validation exactly once. Each fold
    is scored by a KNN model fitted on the remaining folds.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels with the same number of rows.
    k, distance_metric, weights
        Forwarded to the ``KNN`` constructor.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= len(X)``.
    shuffle : bool, default False
        Shuffle the row order before splitting. Leave it off to keep the
        contiguous folds; turn it on when the rows are stored in a meaningful
        order.
    random_state : int or None
        Seed for the shuffle (only used when ``shuffle`` is True).

    Returns
    -------
    float
        Mean AUC over the folds.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}); "
            f"got {n_splits}"
        )

    indices = np.arange(n_samples)
    if shuffle:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    # array_split spreads the remainder over the first folds instead of
    # leaving the trailing samples out of validation altogether.
    folds = np.array_split(indices, n_splits)

    auc_scores = []
    for i, val_idx in enumerate(folds):
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        knn = KNN(k=k, distance_metric=distance_metric, weights=weights)
        knn.fit(X[train_idx], y[train_idx])
        y_pred = knn.predict(X[val_idx])

        # Compute AUC from scratch
        auc_scores.append(compute_roc_auc(y[val_idx], y_pred))

    return np.mean(auc_scores)

In [None]:
# Load and preprocess data: X / y are the training features and labels,
# X_test holds the features of the unlabeled test set.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# NOTE(review): the commented-out snippet below is stale. cross_validate() now
# takes (X, y, k, distance_metric, weights) rather than a KNN instance; model
# selection is done by hyperparameter_tuning() instead.
# # Create and evaluate model
# knn = KNN(k=5, distance_metric='euclidean')

# # Perform cross-validation
# cv_scores = cross_validate(X, y, knn)

# print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning (grid search over k, distance metric and weighting)
def hyperparameter_tuning(X, y, k_values=None, distance_metrics=None, weights_options=None):
    """Perform grid search for KNN hyperparameters.

    Every combination of ``k``, distance metric and weighting scheme is scored
    with ``cross_validate`` and the combination with the highest mean AUC wins.

    Parameters
    ----------
    X, y : array-like
        Training features and labels.
    k_values : iterable of int, optional
        Candidate neighbour counts. Defaults to ``[3, 5, 7, 9, 11]``.
    distance_metrics : iterable of str, optional
        Candidate metrics. Defaults to ``['euclidean', 'manhattan']``.
    weights_options : iterable of str, optional
        Candidate weighting schemes. Defaults to ``['uniform', 'distance']``.

    Returns
    -------
    dict
        ``{'k': ..., 'distance_metric': ..., 'weights': ...}`` for the best
        configuration. The values stay ``None`` only if no configuration
        produced a comparable (non-NaN) score.
    """
    # Define the hyperparameter grid
    if k_values is None:
        k_values = [3, 5, 7, 9, 11]
    if distance_metrics is None:
        distance_metrics = ['euclidean', 'manhattan']
    if weights_options is None:
        weights_options = ['uniform', 'distance']

    # Start below every reachable score so that even a configuration scoring
    # exactly 0.0 is recorded instead of leaving best_params empty.
    best_score = float('-inf')
    best_params = {'k': None, 'distance_metric': None, 'weights': None}

    # Grid search through each combination of k, distance_metric, and weights
    for k in k_values:
        for distance_metric in distance_metrics:
            for weights in weights_options:
                cv_score = cross_validate(X, y, k, distance_metric, weights)
                print(f"CV Score with k={k}, distance_metric={distance_metric}, weights={weights}: {cv_score}")

                # Update best params if current score is better
                # (a NaN score never compares greater, so it is skipped).
                if cv_score > best_score:
                    best_score = cv_score
                    best_params = {
                        'k': k,
                        'distance_metric': distance_metric,
                        'weights': weights,
                    }

    print(f"Best params: {best_params}, Best CV Score: {best_score}")
    return best_params

best_params = hyperparameter_tuning(X, y)

# Train on the full dataset with the tuned hyperparameters and score the test set.
# All three tuned values are forwarded; leaving out `weights` would silently
# fall back to the 'uniform' default regardless of what the grid search chose.
knn = KNN(
    k=best_params['k'],
    distance_metric=best_params['distance_metric'],
    weights=best_params['weights'],
)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions (one churn score per test-set id)
pd.DataFrame({
    'id': pd.read_csv('test.csv', usecols=['id'])['id'],
    'Exited': test_predictions,
}).to_csv('submissions.csv', index=False)