In [5]:
import numpy as np
import pandas as pd

In [4]:
# Define the KNN class
class KNN:
    """Distance-weighted k-nearest-neighbours scorer.

    For each query point the ``k`` closest training points are found and the
    prediction is the inverse-distance-weighted mean of their labels. With
    0/1 labels the result is therefore a score in ``[0, 1]`` that can be read
    as the estimated probability of class 1.

    Parameters
    ----------
    k : int
        Number of neighbours to use (must be >= 1).
    distance_metric : {'euclidean', 'manhattan', 'minkowski'}
        Distance used to rank neighbours.
    p : float
        Order of the Minkowski distance; only used when
        ``distance_metric == 'minkowski'`` (must be > 0 in that case).
    """

    _SUPPORTED_METRICS = ('euclidean', 'manhattan', 'minkowski')

    def __init__(self, k=3, distance_metric='euclidean', p=2):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if distance_metric not in self._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported distance_metric {distance_metric!r}; "
                f"expected one of {self._SUPPORTED_METRICS}"
            )
        if distance_metric == 'minkowski' and p <= 0:
            raise ValueError(f"p must be positive for Minkowski distance, got {p!r}")

        self.k = k
        self.distance_metric = distance_metric
        self.p = p  # Parameter for Minkowski distance
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data (KNN is a lazy learner, nothing is trained).

        Inputs are converted to NumPy arrays once here so that lists, pandas
        objects and object-dtype arrays all work and so that neighbours are
        always looked up by position.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {X.shape[0]} and {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        """Return the distance from every row of ``X1`` to the point(s) ``X2``.

        ``X2`` may be a single 1-D point, in which case it is broadcast
        against every row of ``X1``. The metric is selected by
        ``self.distance_metric``.
        """
        # asarray avoids re-copying the training matrix on every query
        X1 = np.asarray(X1, dtype=np.float64)
        X2 = np.asarray(X2, dtype=np.float64)

        if X2.ndim == 1:
            X2 = X2.reshape(1, -1)

        diff = X1 - X2
        metric = self.distance_metric

        if metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))

        abs_diff = np.abs(diff)
        if metric == 'manhattan':
            return np.sum(abs_diff, axis=1)
        if metric == 'minkowski':
            if self.p <= 0:
                raise ValueError(f"p must be positive for Minkowski distance, got {self.p!r}")
            return np.sum(abs_diff ** self.p, axis=1) ** (1.0 / self.p)

        # Reached only if the attribute was changed after construction
        raise ValueError(
            f"Unsupported distance_metric {metric!r}; "
            f"expected one of {self._SUPPORTED_METRICS}"
        )

    def predict(self, X):
        """Return one weighted-vote score per sample in ``X``.

        A 1-D ``X`` is treated as a single sample.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict() called before fit()")

        X = np.asarray(X, dtype=np.float64)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Score a single sample as the weighted mean of its neighbours' labels."""
        distances = self.compute_distance(self.X_train, x)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = np.asarray(self.y_train)[k_indices]
        k_nearest_distances = distances[k_indices]
        # Weighted voting: weight inversely proportional to distance.
        # The small constant keeps the weight finite for exact matches.
        weights = 1 / (k_nearest_distances + 1e-5)
        return np.sum(weights * k_nearest_labels) / np.sum(weights)



In [3]:
# Define data preprocessing function

def _add_engineered_features(frame):
    """Return a copy of ``frame`` with the derived churn features appended."""
    frame = frame.copy()

    # Coarse age and tenure buckets
    frame['AgeGroup'] = pd.cut(frame['Age'], bins=[0, 30, 40, 50, 100], labels=[0, 1, 2, 3])
    frame['TenureGroup'] = pd.cut(frame['Tenure'], bins=[-1, 3, 6, 10], labels=[0, 1, 2])

    # +1 in the denominator avoids division by zero for a zero salary
    frame['BalanceSalaryRatio'] = frame['Balance'] / (frame['EstimatedSalary'] + 1)

    # Interaction between Age and Balance
    frame['AgeBalanceInteraction'] = frame['Age'] * frame['Balance']

    # Log transforms of the two monetary columns
    frame['Log_Balance'] = np.log1p(frame['Balance'])
    frame['Log_EstimatedSalary'] = np.log1p(frame['EstimatedSalary'])
    return frame


def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into model-ready arrays.

    Steps: drop identifier-like columns, one-hot encode ``Geography`` and
    ``Gender`` consistently across both files, add engineered features,
    impute and standardise the numeric columns using statistics computed on
    the training set only, and keep the features whose absolute correlation
    with ``Exited`` exceeds 0.05.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files; the training file must contain the ``Exited`` target.

    Returns
    -------
    X : numpy.ndarray (float64)
        Selected training features.
    y : numpy.ndarray
        Training target (``Exited``).
    X_test : numpy.ndarray (float64)
        The same selected features for the test file.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop 'CustomerId' and 'Surname' as they are unlikely to be informative
    identifier_columns = ['CustomerId', 'Surname']
    train_data = train_data.drop(columns=identifier_columns)
    test_data = test_data.drop(columns=identifier_columns)

    # One-hot encode on the concatenation of both files so that train and
    # test end up with exactly the same dummy columns.
    n_train = len(train_data)
    for column in ['Geography', 'Gender']:
        combined = pd.concat([train_data[column], test_data[column]])
        dummies = pd.get_dummies(combined, prefix=column)
        train_dummies = dummies.iloc[:n_train].reset_index(drop=True)
        test_dummies = dummies.iloc[n_train:].reset_index(drop=True)
        train_data = pd.concat([train_data.drop(columns=column), train_dummies], axis=1)
        test_data = pd.concat([test_data.drop(columns=column), test_dummies], axis=1)

    # Feature engineering (identical for both frames)
    train_data = _add_engineered_features(train_data)
    test_data = _add_engineered_features(test_data)

    # Handle missing values and scale numerical features
    numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                       'EstimatedSalary', 'BalanceSalaryRatio', 'AgeBalanceInteraction',
                       'Log_Balance', 'Log_EstimatedSalary']
    for column in numeric_columns:
        # Balance is imputed with the median, every other column with the mean
        if column == 'Balance':
            fill_value = train_data[column].median()
        else:
            fill_value = train_data[column].mean()
        train_data[column] = train_data[column].fillna(fill_value)
        test_data[column] = test_data[column].fillna(fill_value)

        # Standardise with THIS column's training mean/std. The centre must be
        # recomputed for every column, including Balance, otherwise a value
        # left over from the previous iteration would be used.
        mean_value = train_data[column].mean()
        std_value = train_data[column].std()
        if not np.isfinite(std_value) or std_value == 0:
            std_value = 1.0  # constant column: centre only, avoid dividing by zero
        train_data[column] = (train_data[column] - mean_value) / std_value
        test_data[column] = (test_data[column] - mean_value) / std_value

    # Handle NaN values in the bucketed columns and convert to int
    for column in ['AgeGroup', 'TenureGroup']:
        mode_value = train_data[column].mode()[0]  # most frequent training bucket
        train_data[column] = train_data[column].fillna(mode_value).astype(int)
        test_data[column] = test_data[column].fillna(mode_value).astype(int)

    # Feature selection based on correlation with the target
    corr_threshold = 0.05  # Adjust the threshold as needed
    target_corr = train_data.corr()['Exited']
    top_features = target_corr[target_corr.abs() > corr_threshold].index
    top_features = top_features.drop('Exited')

    # Prepare final datasets with selected features. Cast explicitly so that
    # a mix of bool dummies and float columns does not produce object arrays.
    y = train_data['Exited'].values
    X = train_data[top_features].to_numpy(dtype=np.float64)
    X_test = test_data[top_features].to_numpy(dtype=np.float64)

    return np.array(X), np.array(y), np.array(X_test)

In [2]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate ROC AUC with shuffled k-fold cross-validation.

    The samples are shuffled once and cut into ``n_splits`` contiguous folds
    (the last fold absorbs the remainder). For each fold the model is fitted
    on the other folds and scored on the held-out one with ``roc_auc_score``.

    Parameters
    ----------
    X, y : array-like
        Features and binary target with the same number of samples.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int
        Number of folds; must satisfy ``2 <= n_splits <= len(X)``.
    random_state : int or None
        Seed for the shuffle. ``None`` (default) keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    list of float
        One ROC AUC score per fold.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        # Dedicated generator so a seeded run does not disturb global state
        np.random.RandomState(random_state).shuffle(indices)

    fold_size = n_samples // n_splits
    scores = []
    for fold in range(n_splits):
        start = fold * fold_size
        # The last fold takes whatever is left after integer division
        end = start + fold_size if fold < n_splits - 1 else n_samples
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])

        knn.fit(X[train_indices], y[train_indices])
        y_pred = knn.predict(X[val_indices])
        scores.append(roc_auc_score(y[val_indices], y_pred))

    return scores
def roc_auc_score(y_true, y_pred):
    """Area under the ROC curve for binary labels.

    Computed as the fraction of (positive, negative) pairs in which the
    positive sample has the higher score, with tied pairs counting one half.
    Instead of enumerating every pair, the negative scores are sorted once
    and each positive score is located in them by binary search.

    Parameters
    ----------
    y_true : array-like
        Labels; entries equal to 1 are positives, entries equal to 0 are
        negatives (anything else is ignored).
    y_pred : array-like
        Scores, higher meaning more likely positive.

    Returns
    -------
    float
        AUC in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``y_true`` does not contain at least one positive and one negative.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()

    pos_scores = y_pred[y_true == 1]
    neg_scores = np.sort(y_pred[y_true == 0])

    if pos_scores.size == 0 or neg_scores.size == 0:
        raise ValueError(
            "ROC AUC is undefined: y_true must contain both positive (1) and negative (0) samples"
        )

    # For each positive: how many negatives score strictly lower / lower-or-equal
    strictly_below = np.searchsorted(neg_scores, pos_scores, side='left')
    at_or_below = np.searchsorted(neg_scores, pos_scores, side='right')

    wins = strictly_below.sum()
    ties = (at_or_below - strictly_below).sum()

    return float((wins + 0.5 * ties) / (pos_scores.size * neg_scores.size))

In [9]:
# Fit on the full training set with a fixed k and write the test-set scores
# to a submission file. No hyperparameter search is performed in this cell.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

k = 10

knn = KNN(k=k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# The ids are re-read from the raw test file because preprocessing returns
# only the selected feature arrays.
test_ids = pd.read_csv('test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)
print(f"Predictions made using k={k} and saved to submissions.csv")



Predictions made using k=10 and saved to submissions.csv
