In [1]:
import pandas as pd
import numpy as np
import time
import psutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

import os

In [2]:

file_path = r"C:\Users\张凤智\Downloads\higgs\HIGGS.csv.gz"

save_path = r"D:\DSS5104\XGboost\xgboost_result\scale\HIGGIS\HIGGIS.csv"

results = []

def evaluate_model(X, y, sample_fraction=1.0):
    print(f"\nSample scale: {sample_fraction}")

    if sample_fraction < 1.0:

        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_fraction, stratify=y, random_state=999)
    else:
        X_sample = X
        y_sample = y
        
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=999)

    #record start time and CPU usage
    start_time = time.time()
    process = psutil.Process()
    process.cpu_percent(interval=None)

    model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        verbosity=0,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    end_time = time.time()
    elapsed_time = end_time - start_time
    cpu_percent = process.cpu_percent(interval=None)
    memory_used = process.memory_info().rss / (1024 ** 2)  

    # model prediction
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # evaluate model performance
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Time: {elapsed_time:.2f} seconds")
    print(f"CPU Usage: {cpu_percent}%")
    print(f"Memory Used: {memory_used:.2f} MB")
    
    results.append({
        'Sample Fraction': sample_fraction,
        'Accuracy': acc,
        'F1-score': f1,
        'AUC': auc,
        'time': elapsed_time,
        'cpu_occupied(%)': cpu_percent,
        'Memory_Used (MB)': memory_used
    })    


In [3]:

df = pd.read_csv(file_path, compression='gzip', header=None)

y = df.iloc[:, 0]
X = df.iloc[:, 1:]

for frac in [0.1, 0.5, 1.0]:
    evaluate_model(X, y, sample_fraction=frac)
    
os.makedirs(os.path.dirname(save_path), exist_ok=True)
    
results_df = pd.DataFrame(results)
results_df.to_csv(save_path, index=False)
print(f"\nSaved all: {save_path}")


Sample scale: 0.1
Accuracy: 0.7408
F1-score: 0.7572
AUC: 0.8222
Time: 11.51 seconds
CPU Usage: 553.4%
Memory Used: 2300.98 MB

Sample scale: 0.5
Accuracy: 0.7408
F1-score: 0.7577
AUC: 0.8226
Time: 55.10 seconds
CPU Usage: 545.9%
Memory Used: 2701.88 MB

Sample scale: 1.0
Accuracy: 0.7411
F1-score: 0.7583
AUC: 0.8228
Time: 133.57 seconds
CPU Usage: 459.6%
Memory Used: 1966.99 MB

Saved all: D:\DSS5104\XGboost\xgboost_result\scale\HIGGIS\HIGGIS.csv
