In [None]:
import numpy as np
import pandas as pd

In [30]:
# 定义 KNN 类
import numpy as np

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # 存储训练数据
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # 对每个测试样本进行预测
        y_pred = []
        for x in X:
            # 计算与训练数据的距离
            distances = self.compute_distance(self.X_train, x)
            # 获取距离最近的k个邻居的索引
            k_indices = distances.argsort()[:self.k]
            # 获取k个邻居的标签
            k_nearest_labels = self.y_train[k_indices]
            # 多数投票决定预测标签
            labels, counts = np.unique(k_nearest_labels, return_counts=True)
            majority_label = labels[counts.argmax()]
            y_pred.append(majority_label)
        return np.array(y_pred)

    def predict_proba(self, X):
        # 预测每个测试样本属于每个类别的概率
        proba = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # 计算每个类别的概率
            class_counts = np.bincount(k_nearest_labels, minlength=2)
            class_proba = class_counts / self.k
            proba.append(class_proba)
        return np.array(proba)

    def compute_distance(self, X1, x2):
        # 根据距离度量计算距离
        if self.distance_metric == 'euclidean':
            # 欧氏距离
            distances = np.sqrt(np.sum((X1 - x2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # 曼哈顿距离
            distances = np.sum(np.abs(X1 - x2), axis=1)
        else:
            raise ValueError("不支持的距离度量")
        return distances


In [31]:
# 定义数据预处理函数
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(train_path, test_path, sample_submission_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    submission_example = pd.read_csv(sample_submission_path)

    # 使用 sample_submission.csv 中的 'id' 作为测试集的 id
    test_ids = submission_example['id']

    # 合并训练和测试数据以便统一处理，重置索引以防止索引混乱
    data = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)

    # 删除不必要的列
    data.drop(['RowNumber', 'CustomerId', 'Surname', 'id'], axis=1, inplace=True, errors='ignore')

    # 处理分类变量
    categorical_features = ['Geography', 'Gender']
    data = pd.get_dummies(data, columns=categorical_features)

    # 归一化数值特征
    scaler = StandardScaler()
    numeric_features = [col for col in data.columns if col != 'Exited']
    data[numeric_features] = scaler.fit_transform(data[numeric_features])

    # 分割回训练和测试数据
    train_data_processed = data.iloc[:len(train_data)].reset_index(drop=True)
    test_data_processed = data.iloc[len(train_data):].reset_index(drop=True)

    # 提取特征和标签
    X_train = train_data_processed.drop('Exited', axis=1).values
    y_train = train_data_processed['Exited'].values.astype(int)

    X_test = test_data_processed.drop('Exited', axis=1).values

    return X_train, y_train, X_test, test_ids


In [32]:
# 定义交叉验证函数
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # 训练模型
        knn.fit(X_train_fold, y_train_fold)
        # 预测概率
        y_val_proba = knn.predict_proba(X_val_fold)
        # 获取正类的概率（Exited为1的概率）
        y_scores = y_val_proba[:, 1]
        # 计算ROC AUC分数
        auc = roc_auc_score(y_val_fold, y_scores)
        auc_scores.append(auc)

    return auc_scores


In [33]:
# 加载和预处理数据
import numpy as np

X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv', 'sample_submission.csv')

# 定义超参数网格
k_values = [3, 5, 7, 9]
distance_metrics = ['euclidean', 'manhattan']

best_auc = 0
best_k = None
best_metric = None

# 超参数调优
for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        cv_scores = cross_validate(X, y, knn, n_splits=5)
        mean_auc = np.mean(cv_scores)
        print(f"k={k}, metric={metric}, AUC={mean_auc}")
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f"最佳超参数：k={best_k}, metric={best_metric}, AUC={best_auc}")

# 使用最佳超参数在整个训练集上训练模型
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
# 预测测试集的概率
test_proba = knn.predict_proba(X_test)
# 获取正类的概率（Exited为1的概率）
test_predictions = test_proba[:, 1]

# 保存测试集预测结果
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)


k=3, metric=euclidean, AUC=0.8460492466705866
k=3, metric=manhattan, AUC=0.8480913593783352
k=5, metric=euclidean, AUC=0.8707493860020747
k=5, metric=manhattan, AUC=0.8733774498677722
k=7, metric=euclidean, AUC=0.8828354664536786
k=7, metric=manhattan, AUC=0.8844340728234339
k=9, metric=euclidean, AUC=0.8888759509883297
k=9, metric=manhattan, AUC=0.8888079130091873
最佳超参数：k=9, metric=euclidean, AUC=0.8888759509883297
