In [96]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from joblib import Parallel, delayed

In [97]:
# 定义 KNN 类
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', weighted=False):
        self.k = k
        self.distance_metric = distance_metric
        self.weighted = weighted

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        # 计算类别权重
        if self.weighted:
            unique, counts = np.unique(y, return_counts=True)
            total = len(y)
            self.class_weights = {u: total / (len(unique) * c) for u, c in zip(unique, counts)}
        else:
            self.class_weights = None

    def compute_distance(self, X):
        if self.distance_metric == 'euclidean':
            # 高效的向量化欧氏距离计算
            X_squared = np.sum(X**2, axis=1).reshape(-1, 1).astype(np.float32)
            train_squared = np.sum(self.X_train**2, axis=1).reshape(1, -1).astype(np.float32)
            cross_term = np.dot(X, self.X_train.T).astype(np.float32)
            distances = np.sqrt(X_squared + train_squared - 2 * cross_term)
        elif self.distance_metric == 'manhattan':
            # 向量化曼哈顿距离计算，分批处理以节省内存
            n_test = X.shape[0]
            n_train = self.X_train.shape[0]
            distances = np.zeros((n_test, n_train), dtype=np.float32)
            batch_size = 500  # 根据内存情况调整
            for i in range(0, n_test, batch_size):
                end = min(i + batch_size, n_test)
                batch = X[i:end, :]  # (batch_size, n_features)
                # 计算绝对差值并求和
                distances[i:end, :] = np.sum(np.abs(batch[:, np.newaxis, :] - self.X_train), axis=2)
        else:
            raise ValueError("Unsupported distance metric")
        return distances


    def predict_proba(self, X):
        distances = self.compute_distance(X)  # 形状 (n_test, n_train)
        neighbor_indices = np.argsort(distances, axis=1)[:, :self.k]  # 形状 (n_test, k)
        neighbor_labels = self.y_train[neighbor_indices]  # 形状 (n_test, k)
        if self.weighted:
            neighbor_distances = np.take_along_axis(distances, neighbor_indices, axis=1)  # 形状 (n_test, k)
            weights = 1 / (neighbor_distances + 1e-5)  # 避免除以零
            class_weights = np.vectorize(self.class_weights.get)(neighbor_labels)  # 形状 (n_test, k)
            weighted_labels = neighbor_labels * class_weights  # 形状 (n_test, k)
            proba = np.sum(weights * weighted_labels, axis=1) / np.sum(weights * class_weights, axis=1)
        else:
            if self.class_weights:
                class_weights = np.vectorize(self.class_weights.get)(neighbor_labels)  # 形状 (n_test, k)
                weighted_labels = neighbor_labels * class_weights
                proba = np.mean(weighted_labels, axis=1)
            else:
                proba = np.mean(neighbor_labels, axis=1)
        return proba

    def predict(self, X, threshold=0.5):
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

# 定义集成 KNN 类
class EnsembleKNN:
    def __init__(self, knn_models):
        self.knn_models = knn_models

    def fit(self, X, y):
        for knn in self.knn_models:
            knn.fit(X, y)

    def predict_proba(self, X):
        proba = np.mean([knn.predict_proba(X) for knn in self.knn_models], axis=0)
        return proba

    def predict(self, X, threshold=0.5):
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

In [98]:
# 定义数据预处理函数
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # 合并训练和测试数据以确保一致的预处理
    combined = pd.concat([train_data.drop('Exited', axis=1), test_data], axis=0).reset_index(drop=True)

    # 删除不必要的列
    combined = combined.drop(['CustomerId', 'Surname', 'id'], axis=1)

    # 特征工程：创建交互特征
    combined['Age_Tenure_Ratio'] = combined['Age'] / (combined['Tenure'] + 1)
    combined['Balance_EstimatedSalary_Ratio'] = combined['Balance'] / (combined['EstimatedSalary'] + 1)
    combined['NumOfProducts_IsActiveMember'] = combined['NumOfProducts'] * combined['IsActiveMember']
    combined['Age_Balance_Ratio'] = combined['Age'] / (combined['Balance'] + 1)
    combined['CreditScore_Age'] = combined['CreditScore'] * combined['Age']
    combined['CreditScore_Balance'] = combined['CreditScore'] * combined['Balance']

    # 转换偏态分布的特征
    for col in ['Balance', 'EstimatedSalary']:
        combined[col] = combined[col].apply(lambda x: np.log1p(x) if x > 0 else 0)

    # 独热编码分类变量
    combined = pd.get_dummies(combined, columns=['Geography', 'Gender'], drop_first=True)

    # 特征缩放使用 RobustScaler 以减少异常值的影响
    scaler = RobustScaler()
    scaled_features = scaler.fit_transform(combined)

    # 分割回训练和测试数据
    X_train = scaled_features[:train_data.shape[0], :]
    y_train = train_data['Exited'].values
    X_test = scaled_features[train_data.shape[0]:, :]

    return X_train, y_train, X_test

In [99]:
# 定义并行化的交叉验证函数
def cross_validate(X, y, knn, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = Parallel(n_jobs=-1)(delayed(evaluate_fold)(X, y, knn, train_idx, val_idx)
                                     for train_idx, val_idx in skf.split(X, y))
    return auc_scores

def evaluate_fold(X, y, knn, train_idx, val_idx):
    X_train_cv, X_val_cv = X[train_idx], X[val_idx]
    y_train_cv, y_val_cv = y[train_idx], y[val_idx]

    knn.fit(X_train_cv, y_train_cv)
    y_pred_proba = knn.predict_proba(X_val_cv)

    auc = roc_auc_score(y_val_cv, y_pred_proba)
    return auc


In [100]:
# 加载和预处理数据
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# 超参数调优
best_k = 9
best_metric = 'manhattan'
best_weighted = True
best_score = 0

# 探索更广泛的 k 值和不同的距离度量
for k in [3, 5, 7, 9, 11, 13, 15]:
    for metric in ['euclidean', 'manhattan']:
        for weighted in [False, True]:
            knn = KNN(k=k, distance_metric=metric, weighted=weighted)
            cv_scores = cross_validate(X, y, knn)
            avg_score = np.mean(cv_scores)
            print(f"k={k}, metric={metric}, weighted={weighted}, AUC={avg_score:.4f}")
            if avg_score > best_score:
                best_score = avg_score
                best_k = k
                best_metric = metric
                best_weighted = weighted

print(f"Best hyperparameters: k={best_k}, metric={best_metric}, weighted={best_weighted}, AUC={best_score:.4f}")

# 使用最佳超参数创建 KNN 模型并训练
best_knn = KNN(k=best_k, distance_metric=best_metric, weighted=best_weighted)
best_knn.fit(X, y)
test_predictions_proba = best_knn.predict_proba(X_test)

# 如果你选择使用集成模型，可以如下操作：
# knn1 = KNN(k=5, distance_metric='manhattan', weighted=True)
# knn2 = KNN(k=7, distance_metric='manhattan', weighted=True)
# knn3 = KNN(k=9, distance_metric='manhattan', weighted=True)
# ensemble_knn = EnsembleKNN([knn1, knn2, knn3])
# ensemble_knn.fit(X, y)
# test_predictions_proba = ensemble_knn.predict_proba(X_test)

# 生成提交文件
submission = pd.read_csv('test.csv')[['id']]
submission['Exited'] = test_predictions_proba
submission.to_csv('submissions.csv', index=False)

k=3, metric=euclidean, weighted=False, AUC=0.8421
k=3, metric=euclidean, weighted=True, AUC=0.8435


MemoryError: Unable to allocate 4.56 GiB for an array with shape (3000, 12000, 17) and data type float64