In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer  # 处理缺失值
import os


# 1. Load the raw data
file_path = "/Users/cleazhang/Downloads/bank-additional.csv"

# The file is read as comma-separated; header names are stripped of
# surrounding whitespace so later column lookups match exactly.
df = pd.read_csv(file_path, sep=',', engine='python')
df.columns = df.columns.str.strip()

# === Working copy: every later step modifies df_processed, never df ===
df_processed = df.copy()

# === 1. Drop irrelevant columns ===
to_delete_features = ['default','pdays']
# errors='ignore' keeps this from failing when a listed column is absent.
df_processed = df_processed.drop(columns=to_delete_features, errors='ignore')

# === 2. Fill missing values in the categorical columns ===
categorical_features = ["job", "marital", "education", "housing", "loan", "contact", "poutcome"]

# Mode (most-frequent) imputation is the strategy used for categorical data.
mode_imputer = SimpleImputer(strategy="most_frequent")

# The literal string "unknown" is treated as a missing value: turn it into
# NaN first so the imputer replaces it with each column's most common category.
categorical_with_gaps = df_processed[categorical_features].replace("unknown", np.nan)
df_processed[categorical_features] = mode_imputer.fit_transform(categorical_with_gaps)

# === 3. Label Encoding ===
# Integer encodings keyed by column name; each inner dict maps a raw category
# string to the integer stored in its place.
# NOTE(review): the education codes look like approximate years of schooling
# -- confirm the intended scale.
label_encoding_dict = {
    "education": {'illiterate': 0, 'basic.4y': 4, 'basic.6y': 6, 'basic.9y': 9,
                  'high.school': 11, 'professional.course': 13, 'university.degree': 14},
    "housing": {'no': 0, 'yes': 1},
    "loan": {'no': 0, 'yes': 1},
    "contact": {'telephone': 0, 'cellular': 1},
    "month": {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
              'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12},
    "day_of_week": {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5},
    "poutcome": {'nonexistent': 0, 'failure': 1, 'success': 2},
    "y": {'no':0,'yes':1}
}

for col, mapping in label_encoding_dict.items():
    # Columns missing from the frame are skipped.
    if col not in df_processed.columns:
        continue

    original_values = df_processed[col]
    encoded_values = original_values.map(mapping)

    # Series.map() converts every value absent from the mapping into NaN
    # without any warning. Fail loudly instead of letting those NaNs flow into
    # later steps; values that were already missing before mapping are not
    # reported here.
    newly_missing = encoded_values.isna() & original_values.notna()
    if newly_missing.any():
        unmapped = sorted(str(value) for value in original_values[newly_missing].unique())
        raise ValueError(
            f"Column {col!r} contains values with no label encoding: {unmapped}"
        )

    df_processed[col] = encoded_values

# === 4. One-Hot Encoding ===
# These columns are expanded into one 0/1 integer indicator column per category.
one_hot_features = ["job", "marital"]
df_processed = pd.get_dummies(
    data=df_processed,
    columns=one_hot_features,
    dtype=int,
)

# === 5. Numeric features ===
numeric_features = ["age", "campaign", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

# Mean imputation fills gaps in the numeric columns; standardization then
# rescales them with StandardScaler.
# To min-max normalize instead, use MinMaxScaler() in place of StandardScaler().
# NOTE(review): both transformers are fitted on every row of df_processed. If
# the frame is split into train/test afterwards, the held-out rows have
# already influenced these statistics -- consider fitting per training split.
mean_imputer = SimpleImputer(strategy="mean")
scaler_standard = StandardScaler()

# Wrap the imputed array back into a labelled frame so the scaler is fitted on
# the same column names and row index as the source columns.
numeric_filled = pd.DataFrame(
    mean_imputer.fit_transform(df_processed[numeric_features]),
    columns=numeric_features,
    index=df_processed.index,
)
df_processed[numeric_features] = scaler_standard.fit_transform(numeric_filled)

# === 6. K-Fold cross-validation split ===
# Ten shuffled folds; the fixed random_state keeps the split reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
save_dir = "/Users/cleazhang/Downloads/bank-additional-dataset"
os.makedirs(save_dir, exist_ok=True)

# Write one train CSV and one test CSV per fold; file names are numbered from 1.
fold_splits = kf.split(df_processed)
for fold_idx, (train_idx, test_idx) in enumerate(fold_splits):
    train_path = os.path.join(save_dir, f"kfold_fold_{fold_idx+1}_train.csv")
    test_path = os.path.join(save_dir, f"kfold_fold_{fold_idx+1}_test.csv")

    # The row index is not written to the CSV files.
    train_data = df_processed.iloc[train_idx]
    train_data.to_csv(train_path, index=False)

    test_data = df_processed.iloc[test_idx]
    test_data.to_csv(test_path, index=False)
# === Inspect the processed data ===
# Optional diagnostics, left disabled:
#print(df_processed.isna().sum().sum())  # total number of NaN values in the whole DataFrame
#nan_columns = df_processed.columns[df_processed.isna().sum() > 0]  # columns that contain NaN
# print("Columns with NaN values:\n", df_processed[nan_columns].isna().sum())

# print("NaN values:\n", df_processed.isna().sum())  # per-column NaN check
# print("Infinite values:\n", (df_processed == np.inf).sum() + (df_processed == -np.inf).sum())  # per-column +/-inf check

# Heading followed by the first rows of the frame; sep="\n" places each
# argument on its own line.
print("\n最终处理后的数据：", df_processed.head(), sep="\n")



最终处理后的数据：
        age  education  housing  loan  contact  month  day_of_week  campaign  \
0 -0.980752          9        1     0        1      5            5 -0.209228   
1 -0.107991         11        0     0        0      5            5  0.569634   
2 -1.465619         11        1     0        0      6            3 -0.598660   
3 -0.204965          9        1     0        0      6            5  0.180203   
4  0.667795         14        1     0        1     11            1 -0.598660   

   previous  poutcome  ...  job_management  job_retired  job_self-employed  \
0 -0.351356         0  ...               0            0                  0   
1 -0.351356         0  ...               0            0                  0   
2 -0.351356         0  ...               0            0                  0   
3 -0.351356         0  ...               0            0                  0   
4 -0.351356         0  ...               0            0                  0   

   job_services  job_student  job_techn