In [1]:
import time
import psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import os


In [7]:
# --- Run configuration -------------------------------------------------------
# Path of the gzipped HIGGS CSV. The HIGGS_CSV_PATH environment variable
# overrides the machine-specific default, so the notebook can run on another
# machine without editing this cell.
file_path = os.environ.get(
    "HIGGS_CSV_PATH", r"C:\Users\张凤智\Downloads\higgs\HIGGS.csv.gz"
)

# Column layout used when reading the file: the class label first, followed by
# 28 feature columns named feature_1 .. feature_28.
feature_cols = [f"feature_{i}" for i in range(1, 29)]
col_names = ["label"] + feature_cols

# Read features as float32 and the label as int8 to keep each chunk small.
dtypes = {name: "float32" for name in feature_cols}
dtypes["label"] = "int8"

seed = 999            # random_state for the train/test split and the forest
chunksize = 500_000   # rows per chunk read from the CSV
chunk_limit = 3       # number of chunks to train on
dataset_name = "HIGGS"

# Every artefact (models, plots, CSV/Excel summaries) is written below
# <output root>/<dataset_name>. RF_OUTPUT_DIR overrides the default root.
save_dir = os.path.join(
    os.environ.get("RF_OUTPUT_DIR", r"D:\DSS5104\final\randomforest"),
    dataset_name,
)
os.makedirs(save_dir, exist_ok=True)

In [4]:
def split_features(X):
    """Partition the columns of ``X`` by dtype.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to inspect.

    Returns
    -------
    tuple[list, list]
        ``(num_cols, cat_cols)``: names of the columns with a numeric dtype,
        then names of the columns with ``object`` or ``category`` dtype.
        Columns of any other dtype appear in neither list.
    """
    numeric_frame = X.select_dtypes(include=["number"])
    categorical_frame = X.select_dtypes(include=["object", "category"])
    return numeric_frame.columns.tolist(), categorical_frame.columns.tolist()

def build_preprocessor(num_cols, cat_cols):
    """Create the (unfitted) column-wise preprocessing step.

    Parameters
    ----------
    num_cols : list
        Columns routed to a ``StandardScaler``.
    cat_cols : list
        Columns routed to an ``OrdinalEncoder`` configured to map categories
        unknown at transform time to ``-1``.

    Returns
    -------
    ColumnTransformer
        Transformer with the steps ``"num"`` and ``"cat"``, in that order.
    """
    return ColumnTransformer(
        [
            ("num", StandardScaler(), num_cols),
            (
                "cat",
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                cat_cols,
            ),
        ]
    )

In [5]:
# Per-chunk accumulators filled by the training loop: fitted pipelines,
# metric records, and top-feature importance tables.
models, scores, feature_importances = [], [], []

In [17]:
# Train one independent random forest per CSV chunk, evaluate it on a held-out
# split of the same chunk, and persist the model, metrics and an importance plot.

# Start from empty accumulators so that re-running this cell does not append
# duplicate rows to results left over from an earlier run.
models = []
scores = []
feature_importances = []

# Stream the gzipped CSV chunk by chunk. Using the reader as a context manager
# closes the underlying file handle even though only part of the file is read.
with pd.read_csv(file_path, compression="gzip", header=None,
                 names=col_names, chunksize=chunksize, dtype=dtypes) as reader:

    # `range` is deliberately the first argument of zip(): once it is exhausted
    # zip stops without pulling (and parsing) one more chunk from the reader.
    for i, chunk in zip(range(chunk_limit), reader):
        chunk_no = i + 1  # 1-based number used in messages and file names

        print(f"\n🚀 正在处理第 {chunk_no} 个数据块...")

        X_chunk = chunk.drop(columns=["label"])
        y_chunk = chunk["label"].astype(int)

        num_cols, cat_cols = split_features(X_chunk)
        preprocessor = build_preprocessor(num_cols, cat_cols)

        # Stratified 80/20 split inside the chunk.
        X_train, X_test, y_train, y_test = train_test_split(
            X_chunk, y_chunk, test_size=0.2, stratify=y_chunk, random_state=seed
        )

        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,
            random_state=seed,
            n_jobs=-1
        )

        pipeline = Pipeline([
            ("pre", preprocessor),
            ("rf", model),
        ])

        # Resource monitoring. Both figures are system-wide, not per-process.
        mem_before = psutil.virtual_memory().used / 1024**3  # GB
        # Blocking 2 s sample taken before training; it also resets psutil's
        # reference point for the non-blocking call made right after fit().
        cpu_before = psutil.cpu_percent(interval=2)
        start_time = time.perf_counter()

        pipeline.fit(X_train, y_train)

        train_time = time.perf_counter() - start_time
        # Non-blocking call: utilisation since the previous cpu_percent() call,
        # i.e. averaged over the fit above.
        cpu_during_fit = psutil.cpu_percent(interval=None)
        mem_after = psutil.virtual_memory().used / 1024**3
        cpu_after = psutil.cpu_percent(interval=2)

        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")

        print(f"✅ Chunk {chunk_no} 准确率: {acc:.4f}, F1分数: {f1:.4f}, 用时: {train_time:.2f}s")
        print(f"💻 内存: {mem_before:.2f} GB → {mem_after:.2f} GB")
        print(f"🧠 CPU: {cpu_before:.1f}% → {cpu_after:.1f}%")

        model_path = os.path.join(save_dir, f"{dataset_name}_rf_model_chunk_{chunk_no}.pkl")

        joblib.dump(pipeline, model_path)
        print(f"📦 模型已保存到: {model_path}")

        # Record the results.
        models.append(pipeline)
        scores.append({
            "Chunk": chunk_no,
            "Accuracy": acc,
            "F1_score": f1,
            "Train Time (s)": train_time,
            "Memory Before (GB)": mem_before,
            "Memory After (GB)": mem_after,
            "CPU Before (%)": cpu_before,
            "CPU After (%)": cpu_after,
            # Utilisation while training. The difference of the two idle
            # samples above says nothing about the training load.
            "CPU Usage (%)": cpu_during_fit
        })

        # Feature importances. The preprocessor emits the numeric columns
        # followed by the categorical ones, one output column per input
        # column, so the names must be listed in that same order.
        rf_model = pipeline.named_steps["rf"]
        importances = rf_model.feature_importances_
        feature_names = num_cols + cat_cols
        top_idx = np.argsort(importances)[::-1][:10]
        top_feats = np.array(feature_names)[top_idx]
        top_imps = importances[top_idx]

        importance_df = pd.DataFrame({
            "Feature": top_feats,
            "Importance": top_imps,
            "Chunk": chunk_no
        })
        feature_importances.append(importance_df)

        # Draw the top-10 bar chart and save it as a PNG.
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.set_title(f"Top 10 Feature Importances for Chunk {chunk_no}")
        ax.bar(range(len(top_imps)), top_imps)
        ax.set_xticks(range(len(top_feats)))
        ax.set_xticklabels(top_feats, rotation=45)
        ax.set_ylabel("Importance")
        fig.tight_layout()

        plot_path = os.path.join(save_dir, f"{dataset_name}_feature_importance_chunk_{chunk_no}.png")
        fig.savefig(plot_path)
        plt.close(fig)  # release the figure; one is created per chunk
        print(f"📊 特征重要性图已保存到: {plot_path}")


🚀 正在处理第 1 个数据块...
✅ Chunk 1 准确率: 0.7303, F1分数: 0.7294, 用时: 141.10s
💻 内存: 6.24 GB → 6.59 GB
🧠 CPU: 16.7% → 17.4%
📦 模型已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_rf_model_chunk_1.pkl
📊 特征重要性图已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_feature_importance_chunk_1.png

🚀 正在处理第 2 个数据块...
✅ Chunk 2 准确率: 0.7307, F1分数: 0.7296, 用时: 141.62s
💻 内存: 6.47 GB → 6.60 GB
🧠 CPU: 25.1% → 41.6%
📦 模型已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_rf_model_chunk_2.pkl
📊 特征重要性图已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_feature_importance_chunk_2.png

🚀 正在处理第 3 个数据块...
✅ Chunk 3 准确率: 0.7283, F1分数: 0.7272, 用时: 135.22s
💻 内存: 6.05 GB → 6.50 GB
🧠 CPU: 15.8% → 17.2%
📦 模型已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_rf_model_chunk_3.pkl
📊 特征重要性图已保存到: D:\DSS5104\final\randomforest\HIGGS\HIGGS_feature_importance_chunk_3.png


In [18]:
# Combine what the training loop collected: one metrics row per chunk, and the
# per-chunk top-feature tables stacked into a single frame.
results_df = pd.DataFrame(scores)
importance_all = pd.concat(feature_importances, ignore_index=True)

# Output locations inside the run's save directory.
results_csv_path = os.path.join(save_dir, f"{dataset_name}_chunk_results.csv")
importance_csv_path = os.path.join(save_dir, f"{dataset_name}_feature_importances_all.csv")
excel_path = os.path.join(save_dir, f"{dataset_name}_summary.xlsx")

# Save as CSV: one file per table.
for frame, csv_path in (
    (results_df, results_csv_path),
    (importance_all, importance_csv_path),
):
    frame.to_csv(csv_path, index=False)

# Save as Excel: one workbook with a sheet per table.
with pd.ExcelWriter(excel_path, engine="xlsxwriter") as workbook:
    for sheet_name, frame in (
        ("Chunk Results", results_df),
        ("Feature Importances", importance_all),
    ):
        frame.to_excel(workbook, sheet_name=sheet_name, index=False)

print(f"\n📄 所有结果已保存到：\n- {results_csv_path}\n- {importance_csv_path}\n- {excel_path}")


📄 所有结果已保存到：
- D:\DSS5104\final\randomforest\HIGGS\HIGGS_chunk_results.csv
- D:\DSS5104\final\randomforest\HIGGS\HIGGS_feature_importances_all.csv
- D:\DSS5104\final\randomforest\HIGGS\HIGGS_summary.xlsx
