In [None]:
import numpy as np
import pandas as pd

class CustomKNN:
    """Brute-force k-nearest-neighbours predictor built on NumPy.

    Predictions are the (optionally distance-weighted) average of the labels
    of the ``k`` closest training rows, so for 0/1 labels the output can be
    read as a score between 0 and 1.

    Args:
        k: Number of neighbours to average over; must be at least 1.
        metric: ``'euclidean'``, ``'manhattan'`` or ``'minkowski'``.
        p: Exponent used by the Minkowski metric.
        weight_type: ``'uniform'`` for a plain mean, ``'distance'`` to weight
            each neighbour by the inverse of its distance.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    # Upper bound on the number of elements of the temporary
    # (queries x train rows x features) difference array built per batch.
    # Without a bound the whole query set is broadcast at once, which
    # exhausts memory long before the data sets become large.
    _MAX_BLOCK_ELEMENTS = 2 ** 24

    def __init__(self, k=3, metric='euclidean', p=2, weight_type='uniform'):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        self.metric = metric
        self.p = p
        self.weight_type = weight_type

    def fit(self, features, labels):
        """Store the training rows and their labels.

        Raises:
            ValueError: If the number of rows and labels differ.
        """
        train_features = np.array(features)
        train_labels = np.array(labels)
        if train_features.shape[0] != train_labels.shape[0]:
            raise ValueError("features and labels must have the same length")
        self.train_features = train_features
        self.train_labels = train_labels

    def calculate_distances(self, X):
        """Return the (n_queries, n_train) matrix of pairwise distances.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric.
        """
        # Accept any array-like; plain lists do not support X[:, np.newaxis].
        X = np.asarray(X)
        if self.metric == 'euclidean':
            return np.sqrt(((X[:, np.newaxis] - self.train_features) ** 2).sum(axis=2))
        elif self.metric == 'manhattan':
            return np.abs(X[:, np.newaxis] - self.train_features).sum(axis=2)
        elif self.metric == 'minkowski':
            return np.sum(np.abs(X[:, np.newaxis] - self.train_features) ** self.p, axis=2) ** (1 / self.p)
        else:
            raise ValueError("Distance metric not supported")

    def _predict_batch(self, X):
        """Predict one batch of query rows (all rows handled in one broadcast)."""
        distances = self.calculate_distances(X)
        nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        nearest_labels = self.train_labels[nearest_indices]

        if self.weight_type == 'uniform':
            return np.mean(nearest_labels, axis=1)

        nearest_distances = np.take_along_axis(distances, nearest_indices, axis=1)
        # The small constant keeps exact matches (distance 0) finite.
        weights = 1 / (nearest_distances + 1e-5)
        return np.sum(weights * nearest_labels, axis=1) / np.sum(weights, axis=1)

    def predict(self, X):
        """Return one averaged neighbour label per row of ``X``.

        Queries are processed in batches so that the temporary difference
        array stays bounded; every row is independent, so the result is the
        same as a single broadcast over all rows.

        Raises:
            ValueError: If ``weight_type`` or ``metric`` is not supported.
            RuntimeError: If called before ``fit``.
        """
        # Fail loudly instead of silently returning None.
        if self.weight_type not in ('uniform', 'distance'):
            raise ValueError("Weight type not supported")
        if getattr(self, 'train_features', None) is None:
            raise RuntimeError("fit must be called before predict")

        X = np.asarray(X)
        n_queries = X.shape[0]
        elements_per_query = max(1, self.train_features.size)
        batch_size = max(1, self._MAX_BLOCK_ELEMENTS // elements_per_query)

        # max(..., 1) keeps one (empty) batch for empty input so the result
        # still has the right shape and dtype.
        parts = [
            self._predict_batch(X[start:start + batch_size])
            for start in range(0, max(n_queries, 1), batch_size)
        ]
        return np.concatenate(parts)

def prepare_data(train_file, test_file):
    """Load the train/test CSV files and return standardised feature arrays.

    The identifier columns (``id``, ``CustomerId``, ``Surname``) are dropped,
    ``Geography`` and ``Gender`` are integer-encoded, and every feature is
    standardised with the mean and standard deviation of the training set.

    Args:
        train_file: Path to the training CSV; must contain an ``Exited`` column.
        test_file: Path to the test CSV.

    Returns:
        A tuple ``(train_features, train_target, test_features, test_ids)``
        where the first three items are NumPy arrays and ``test_ids`` is the
        ``id`` column of the test file as a pandas Series.
    """
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    target_train = train_df['Exited']
    features_train = train_df.drop(['id', 'CustomerId', 'Surname', 'Exited'], axis=1)
    features_test = test_df.drop(['id', 'CustomerId', 'Surname'], axis=1)

    # Encode categorical features.  The mapping is learned on the training
    # data and reused for the test data: factorising each file on its own
    # assigns codes by order of first appearance, so the same category could
    # end up with different codes in train and test.
    for column in ('Geography', 'Gender'):
        codes, categories = pd.factorize(features_train[column])
        features_train[column] = codes
        # Categories unseen during training (and missing values) become -1,
        # the same sentinel factorize uses for missing values.
        features_test[column] = pd.Categorical(
            features_test[column], categories=categories
        ).codes

    # Scale numeric data with training statistics only.
    feature_means = features_train.mean()
    # A constant column has zero spread; dividing by 1 instead keeps it at 0
    # rather than turning the whole column into NaN.
    feature_stds = features_train.std().replace(0, 1)

    features_train = (features_train - feature_means) / feature_stds
    features_test = (features_test - feature_means) / feature_stds

    return features_train.values, target_train.values, features_test.values, test_df['id']

def _roc_auc(y_true, y_score):
    """Return the ROC AUC of ``y_score`` for labels where 1 is the positive class.

    Uses the rank-sum (Mann-Whitney U) formulation with average ranks for
    tied scores.  Returns NaN when only one class is present, because the
    AUC is undefined in that case.
    """
    positives = np.asarray(y_true).ravel() == 1
    scores = np.asarray(y_score, dtype=float).ravel()

    n_pos = int(positives.sum())
    n_neg = positives.size - n_pos
    if n_pos == 0 or n_neg == 0:
        return np.nan

    # Average 1-based rank of every distinct score value.
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    average_rank = last_rank - (counts - 1) / 2.0
    ranks = average_rank[inverse]

    u_statistic = ranks[positives].sum() - n_pos * (n_pos + 1) / 2.0
    return u_statistic / (n_pos * n_neg)


def cross_validation(X, y, knn_model, splits=5):
    """Score ``knn_model`` with contiguous k-fold cross-validation.

    The data is cut into ``splits`` consecutive folds (the last fold takes
    the remainder).  For every fold the model is refitted on the other folds
    and the ROC AUC of its predictions on the held-out fold is computed.

    Args:
        X: Feature array with one row per sample.
        y: Labels; 1 is treated as the positive class.
        knn_model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        splits: Number of folds.

    Returns:
        ``(mean_auc, auc_scores)``.  Folds containing a single class have an
        undefined AUC (NaN) and are left out of the mean.

    Raises:
        ValueError: If ``splits`` is below 2 or exceeds the number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if splits < 2 or splits > n_samples:
        raise ValueError("splits must be between 2 and the number of samples")

    fold_size = n_samples // splits
    auc_scores = []

    for fold in range(splits):
        val_start = fold * fold_size
        # The final fold absorbs the rows left over by the integer division.
        val_end = (fold + 1) * fold_size if fold != splits - 1 else n_samples

        X_train = np.concatenate((X[:val_start], X[val_end:]))
        y_train = np.concatenate((y[:val_start], y[val_end:]))
        X_val = X[val_start:val_end]
        y_val = y[val_start:val_end]

        knn_model.fit(X_train, y_train)
        y_pred = knn_model.predict(X_val)

        # A real ranking metric: thresholding at 0.5 measures accuracy,
        # not the AUC this function reports.
        auc_scores.append(_roc_auc(y_val, y_pred))

    defined_scores = [score for score in auc_scores if not np.isnan(score)]
    mean_auc = np.mean(defined_scores) if defined_scores else np.nan
    return mean_auc, auc_scores

def _fit_and_save_predictions(knn, X, y, X_test, test_ids):
    """Fit ``knn`` on all training data and write its test predictions to CSV."""
    knn.fit(X, y)
    test_preds = knn.predict(X_test)
    output_name = f'results_k{knn.k}_{knn.metric}_{knn.weight_type}.csv'
    pd.DataFrame({'id': test_ids, 'Exited': test_preds}).to_csv(output_name, index=False)


def optimize_hyperparameters(X, y, X_test, test_ids, max_neighbors=20, auc_target=0.9):
    """Grid-search k, distance metric and weighting for ``CustomKNN``.

    Every combination of metric (euclidean, manhattan), weighting (uniform,
    distance) and ``k`` in ``1..max_neighbors`` is scored with
    ``cross_validation``; the score is logged under the label "AUC".  The
    search stops early as soon as a configuration reaches ``auc_target``.

    In both outcomes a ``results_k{k}_{metric}_{weight}.csv`` file with the
    test predictions is written: for the configuration that hit the target,
    or otherwise for the best configuration found, so a finished search
    always leaves predictions behind.

    Args:
        X: Training features.
        y: Training labels.
        X_test: Test features to predict once a configuration is chosen.
        test_ids: Identifiers written to the ``id`` column of the CSV.
        max_neighbors: Largest ``k`` to try.
        auc_target: Score at which the search stops early.

    Returns:
        ``(best_k, best_metric, best_weight, best_score)``.
    """
    best_k = 1
    best_score = 0
    best_metric = 'euclidean'
    best_weight = 'uniform'
    available_metrics = ['euclidean', 'manhattan']
    weight_options = ['uniform', 'distance']

    for metric in available_metrics:
        for weight_type in weight_options:
            for k in range(1, max_neighbors + 1):
                knn = CustomKNN(k=k, metric=metric, weight_type=weight_type)
                mean_auc, _ = cross_validation(X, y, knn)
                print(f"K={k}, Metric={metric}, Weights={weight_type}, AUC={mean_auc:.4f}")

                if mean_auc > best_score:
                    best_score = mean_auc
                    best_k = k
                    best_metric = metric
                    best_weight = weight_type

                if mean_auc >= auc_target:
                    print(f"Target AUC {auc_target} reached. Stopping and saving results.")
                    _fit_and_save_predictions(knn, X, y, X_test, test_ids)
                    return best_k, best_metric, best_weight, best_score

    # The target was never reached: keep the best candidate instead of
    # discarding the whole search.  Skipped when nothing was evaluated.
    if max_neighbors >= 1:
        print(f"Target AUC {auc_target} not reached. Saving results for the best configuration.")
        best_model = CustomKNN(k=best_k, metric=best_metric, weight_type=best_weight)
        _fit_and_save_predictions(best_model, X, y, X_test, test_ids)

    return best_k, best_metric, best_weight, best_score

# Load both CSV files and turn them into standardised numeric arrays.
X_train, y_train, X_test, test_ids = prepare_data('train.csv', 'test.csv')

# Search k / metric / weighting, then report the winning configuration.
best_k, best_metric, best_weight, best_auc = optimize_hyperparameters(
    X_train,
    y_train,
    X_test,
    test_ids,
)
print(f"Optimal K: {best_k}, Metric: {best_metric}, Weights: {best_weight}, Best AUC: {best_auc}")


K=1, Metric=euclidean, Weights=uniform, AUC=0.8472


In [None]:
def prepare_submission(test_file, predictions, output_file='submission.csv'):
    """Write a two-column submission CSV (``id``, ``Exited``).

    Args:
        test_file: Path to a CSV file whose ``id`` column supplies the ids.
        predictions: One predicted value per row of ``test_file``.
        output_file: Destination path of the submission CSV.
    """
    ids = pd.read_csv(test_file)['id']
    frame = pd.DataFrame({'id': ids, 'Exited': predictions})
    frame.to_csv(output_file, index=False)
    print(f"Predictions saved to '{output_file}'.")

# Refit the best configuration on the full training set and predict the test
# rows here: `test_preds` only ever existed as a local variable inside
# optimize_hyperparameters, so it is not available at notebook scope.
final_model = CustomKNN(k=best_k, metric=best_metric, weight_type=best_weight)
final_model.fit(X_train, y_train)
test_preds = final_model.predict(X_test)

# The ids come from the test file; 'submission.csv' is the output, not the input.
prepare_submission('test.csv', test_preds)