In [1]:
import pandas as pd
import numpy as np
import time
import psutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Location of the HIGGS dataset CSV (the ".gz" suffix indicates a gzip archive).
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of the file before running the notebook elsewhere.
file_path = r"C:\Users\张凤智\Downloads\higgs\HIGGS.csv.gz"

def evaluate_model(X, y):
    """Train a random forest on an 80/20 split and print quality and resource metrics.

    Parameters
    ----------
    X :
        Feature matrix, in any form accepted by ``train_test_split`` and
        ``RandomForestClassifier.fit``.
    y :
        Class labels aligned with ``X``.  The metrics below (``f1_score`` with
        its default settings and column 1 of ``predict_proba``) treat the
        task as binary classification.

    Returns
    -------
    None
        Accuracy, F1, ROC AUC, training wall time, CPU utilisation during
        training and the process RSS are printed, not returned.
    """
    # Train / test split (fixed seed so the split is reproducible).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

    process = psutil.Process()
    # Prime the CPU counter.  With interval=None, cpu_percent() reports the
    # utilisation since the previous call on this Process object, so this
    # first reading is discarded and only serves as the baseline.
    process.cpu_percent(interval=None)

    # Start of the timed section; perf_counter() is monotonic, unlike
    # time.time(), so clock adjustments cannot distort the duration.
    start_time = time.perf_counter()

    # Model construction and training.
    model = RandomForestClassifier(
        n_estimators=100,   # number of trees
        n_jobs=-1,          # use all available cores
        random_state=999
    )

    model.fit(X_train, y_train)

    # End of the timed section.
    elapsed_time = time.perf_counter() - start_time
    # CPU utilisation of this process since the priming call, i.e. over the
    # training window.  Values above 100% are expected when several cores
    # are busy.  (Sampling a fresh 0.1 s interval here would only measure
    # the idle moment after training has already finished.)
    cpu_percent = process.cpu_percent(interval=None)
    # Resident set size of the whole process right now, converted to MB; this
    # is the total footprint, not the increase caused by training alone.
    memory_used = process.memory_info().rss / (1024 ** 2)

    # Predictions on the held-out set: hard labels and positive-class scores.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics.
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    print(f"✅ Accuracy: {acc:.4f}")
    print(f"✅ F1-score: {f1:.4f}")
    print(f"✅ AUC: {auc:.4f}")
    print(f"⏱️ Time: {elapsed_time:.2f} seconds")
    print(f"🧠 CPU Usage: {cpu_percent}%")
    print(f"🧠 Memory Used: {memory_used:.2f} MB")


In [None]:
# Load the data.  Set N_ROWS to an integer (e.g. 1_000_000) to read only the
# first rows for a faster run; None reads the entire file.
N_ROWS = None
print("📥 正在加载数据...")
# header=None: the file is read without a header row, so columns are
# addressed by position.
df = pd.read_csv(file_path, compression='gzip', header=None, nrows=N_ROWS)

# Column 0 is the label; the remaining columns are the features.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]


evaluate_model(X, y)

📥 正在加载数据...
