In [11]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [12]:
class KNN:
    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def compute_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def get_neighbors(self, test_instance):
        distances = []
        for i in range(len(self.X_train)):
            dist = self.compute_distance(self.X_train[i], test_instance)
            distances.append((dist, self.y_train[i]))
        distances.sort(key=lambda x: x[0])
        neighbors = distances[:self.k]
        return neighbors

    def predict_proba(self, test_instance):
        neighbors = self.get_neighbors(test_instance)
        output_values = [neighbor[1] for neighbor in neighbors]
        probability = sum(output_values) / self.k
        return probability

    def predict(self, X_test):
        predictions = []
        for instance in tqdm(X_test, desc="Predicting", unit="instance", total=len(X_test)):
            prob = self.predict_proba(instance)
            predictions.append(1 if prob >= 0.5 else 0)
        return predictions

In [13]:
def preprocess_data(train_path, test_path):
    """Preprocess the train and test datasets: handle numerical scaling."""
    # Read the CSV files
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    print("Missing values in train data:")
    print(train_df.isnull().sum())
    print("\nMissing values in test data:")
    print(test_df.isnull().sum())

    # Mapping categorical Gender column
    gender_mapping = {'Male': 1, 'Female': 0}
    train_df['Gender'] = train_df['Gender'].map(gender_mapping)
    test_df['Gender'] = test_df['Gender'].map(gender_mapping)

    # One-hot encode Geography
    geography_train = pd.get_dummies(train_df['Geography'], prefix='Geography')
    geography_test = pd.get_dummies(test_df['Geography'], prefix='Geography')

    # Align train and test datasets to ensure they have the same columns
    geography_train, geography_test = geography_train.align(geography_test, join='outer', axis=1, fill_value=0)

    # Drop original Geography columns and concatenate one-hot encoded columns
    train_df = train_df.drop('Geography', axis=1)
    test_df = test_df.drop('Geography', axis=1)

    train_df = pd.concat([train_df, geography_train], axis=1)
    test_df = pd.concat([test_df, geography_test], axis=1)

    # Scale numerical features
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    means = train_df[numerical_features].mean()
    stds = train_df[numerical_features].std()

    train_df[numerical_features] = (train_df[numerical_features] - means) / stds
    test_df[numerical_features] = (test_df[numerical_features] - means) / stds

    # Preserve the test ids for submission
    test_ids = test_df['id']

    # Drop unnecessary columns
    train_df = train_df.drop(['CustomerId', 'Surname', 'id'], axis=1)
    test_df = test_df.drop(['CustomerId', 'Surname', 'id'], axis=1)

    # Separate features and target variable
    X = train_df.drop('Exited', axis=1).values
    y = train_df['Exited'].values
    X_test = test_df.values

    # Return preprocessed data
    return X, y, X_test, test_ids

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score ## This library is not used for KNN algorithm

def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    roc_auc_scores = []

    # tqdm 进度条显示交叉验证的进度
    for train_index, val_index in tqdm(kf.split(X), total=n_splits, desc="Cross-Validation Folds", leave=False):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_val_preds = [knn.predict_proba(instance) for instance in X_val]
        auc = roc_auc_score(y_val, y_val_preds)
        roc_auc_scores.append(auc)

    return np.mean(roc_auc_scores)

In [16]:
# TODO: hyperparamters tuning
def hyperparameter_tuning(X, y, k_values=[6,7,8]):
    """
    Perform hyperparameter tuning to find the best k value for KNN.
    """
    best_k = k_values[0]
    best_score = 0

    for k in k_values:
        knn = KNN(k=k)
        score = cross_validate(X, y, knn)
        print(f"K = {k}, Cross-Validation AUC: {score:.4f}")
        if score > best_score:
            best_score = score
            best_k = k

    return best_k, best_score

# Load and preprocess data
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning to find the best k
best_k, best_score = hyperparameter_tuning(X, y)
print(f"Best k: {best_k}, Best Cross-Validation Score: {best_score}")

# Create and evaluate the model
knn = KNN(k=best_k)
knn.fit(X, y)

# Predict on the test set
test_predictions = knn.predict(X_test)

# Save test predictions to a CSV file in the samplesubmission format
submission = pd.DataFrame({'id': test_ids, 'smoking': test_predictions})
submission.to_csv('submissions.csv', index=False)
print("Predictions saved to 'submissions.csv'")

Missing values in train data:
id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Missing values in test data:
id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


                                                                     

K = 6, Cross-Validation AUC: 0.8876


                                                                     

K = 7, Cross-Validation AUC: 0.8931


                                                                     

K = 8, Cross-Validation AUC: 0.8969
Best k: 8, Best Cross-Validation Score: 0.8968906687840905


Predicting: 100%|██████████| 10000/10000 [06:07<00:00, 27.19instance/s]

Predictions saved to 'submissions.csv'



