In [1]:
import time
import psutil
import threading

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score
)
from xgboost import XGBClassifier, XGBRegressor
from dataloader import load_data
import warnings
warnings.simplefilter("ignore")

Data directory: d:\DSS5104\data


In [None]:
def preprocess_tabular_data(X_df, y, cat_cols, cont_cols, is_train=False, scaler=None, encoder=None):
    X = X_df.copy()

    if cat_cols:
        X[cat_cols] = X[cat_cols].astype(str)

        if is_train:
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            X[cat_cols] = encoder.fit_transform(X[cat_cols])
        else:
            X[cat_cols] = encoder.transform(X[cat_cols])
    else:
        encoder = None

    if cont_cols:
        X[cont_cols] = X[cont_cols].astype("float32")
        if is_train:
            scaler = StandardScaler()
            X[cont_cols] = scaler.fit_transform(X[cont_cols])
        else:
            X[cont_cols] = scaler.transform(X[cont_cols])
    else:
        scaler = None

    return X, np.array(y), scaler, encoder


def prepare_data_np(dataset_name: str):
    dataset_name = dataset_name.lower()

    if dataset_name.startswith("adult"):
        '''
        you can select a seed:777, 888, 999, 
        and change the seed 'build_model(task_type: str, seed = 999)' at the same time
        '''
        data_train, data_test = load_data("adult", seed = 777)
        X_train, y_train = data_train.drop(columns=['income']), (data_train['income'] == '>50K').astype(int)
        X_val, y_val = data_test.drop(columns=['income']), (data_test['income'] == '>50K').astype(int)
        task_type = "classification"

    elif dataset_name.startswith("california"):
        X_train, X_val, y_train, y_val = load_data("california", seed = 777)
        task_type = "regression"

    elif dataset_name.startswith("higgs"):
        X_train, X_val, y_train, y_val = load_data("higgs")
        y_train, y_val = (y_train == 1).astype(int), (y_val == 1).astype(int)
        task_type = "classification"

    elif dataset_name.startswith("churn"):
        X_train, X_val, y_train, y_val = load_data("churn")
        y_train, y_val = (y_train == 'Yes').astype(int), (y_val == 'Yes').astype(int)
        task_type = "classification"

    elif dataset_name.startswith("creditcard"):
        X_train, X_val, y_train, y_val = load_data("credit")
        y_train, y_val = (y_train == 1).astype(int), (y_val == 1).astype(int)
        task_type = "classification"

    elif dataset_name.startswith("poker"):
        X_train, X_val, y_train, y_val = load_data("poker")
        task_type = "classification"

    elif dataset_name.startswith("bank"):
        X_train, X_val, y_train, y_val = load_data("bank")
        y_train, y_val = (y_train == 'yes').astype(int), (y_val == 'yes').astype(int)
        task_type = "classification"

    elif dataset_name.startswith("wine"):
        X_train, X_val, y_train, y_val = load_data("wine")
        task_type = "regression"

    elif dataset_name.startswith("covtype"):
        X_train, X_val, y_train, y_val = load_data("covtype")
        task_type = "classification"

    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    cont_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    X_train_processed, y_train_array, scaler, encoder = preprocess_tabular_data(
        X_train, y_train, cat_cols, cont_cols, is_train=True)

    X_val_processed, y_val_array, _, _ = preprocess_tabular_data(
        X_val, y_val, cat_cols, cont_cols, is_train=False, scaler=scaler, encoder=encoder)

    return (X_train_processed, y_train_array), (X_val_processed, y_val_array), task_type


def build_model(task_type: str, seed=999):
    if task_type == "classification":
        model = XGBClassifier(
            n_jobs=-1,
            random_state=seed,
            sub_sample=0.8,
            use_label_encoder=False,
            eval_metric='logloss'
        )
    elif task_type == "regression":
        model = XGBRegressor(
            n_jobs=-1,
            random_state=seed
        )
    else:
        raise ValueError(f"Unsupported task type: {task_type}")
    
    return model


def evaluate_model(model, X_test, y_test, task_type, train_time, avg_cpu_usage):
    y_test_pred = model.predict(X_test)

    results = {
        "train_time_seconds": round(train_time, 4),
        "cpu_usage": round(avg_cpu_usage, 4)
    }

    print("\nEvaluation Results:")

    if task_type == "classification":
        acc = accuracy_score(y_test, y_test_pred)
        f1 = f1_score(y_test, y_test_pred, average='macro')
        results.update({
            "test_accuracy": round(acc, 4),
            "test_f1": round(f1, 4),
        })
        
        print(f"Accuracy: {acc:.4f}")
        print(f"F1 Score: {f1:.4f}")

        try:
            y_proba = model.predict_proba(X_test)
            auc = roc_auc_score(y_test, y_proba[:, 1])
            results["test_auc"] = round(auc, 4)
            print(f"AUC: {auc:.4f}")
        except Exception as e:
            results["test_auc"] = np.nan
            print("AUC can't be calculated:", e)

    else:  # regression
        rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
        mae = mean_absolute_error(y_test, y_test_pred)
        r2 = r2_score(y_test, y_test_pred)
        results.update({
            "test_rmse": round(rmse, 4),
            "test_mae": round(mae, 4),
            "test_r2": round(r2, 4),
        })
        
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")

    return results


def monitor_cpu(interval, usage_list, stop_flag):
    psutil.cpu_percent(interval=None) 
    while not stop_flag.is_set():
        usage = psutil.cpu_percent(interval=interval) 
        usage_list.append(usage)

def run_xgboost_pipeline(dataset_name):
    (X_train, y_train), (X_val, y_val), task_type = prepare_data_np(dataset_name)
    model = build_model(task_type)

    cpu_usages = []
    stop_flag = threading.Event()
    monitor_thread = threading.Thread(target=monitor_cpu, args=(0.1, cpu_usages, stop_flag))

    start_time = time.time()
    monitor_thread.start()
    time.sleep(0.1) 

    model.fit(X_train, y_train)

    stop_flag.set()
    monitor_thread.join()
    train_time = time.time() - start_time

    avg_cpu_usage = sum(cpu_usages) / len(cpu_usages) if cpu_usages else 0

    results = evaluate_model(model, X_val, y_val, task_type, train_time, avg_cpu_usage)
    return model, results

In [None]:
dataset_name = "adult"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
binary classification
(30162, 15) (30162,)
(15060, 15) (15060,)

Evaluation Results:
Accuracy: 0.8655
F1 Score: 0.8099
AUC: 0.9244

Final Metrics Summary:
train_time_seconds: 0.3035
cpu_usage: 67.8333
test_accuracy: 0.8655
test_f1: 0.8099
test_auc: 0.9244


In [None]:
dataset_name = "adult"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 888
binary classification
(30162, 15) (30162,)
(15060, 15) (15060,)

Evaluation Results:
Accuracy: 0.8655
F1 Score: 0.8099
AUC: 0.9244

Final Metrics Summary:
train_time_seconds: 0.3051
cpu_usage: 67.9
test_accuracy: 0.8655
test_f1: 0.8099
test_auc: 0.9244


In [None]:
dataset_name = "adult"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 777
binary classification
(30162, 15) (30162,)
(15060, 15) (15060,)

Evaluation Results:
Accuracy: 0.8655
F1 Score: 0.8099
AUC: 0.9244

Final Metrics Summary:
train_time_seconds: 0.3166
cpu_usage: 73.1333
test_accuracy: 0.8655
test_f1: 0.8099
test_auc: 0.9244


In [None]:
dataset_name = "bank+marketing"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
binary classification
(32950, 20) (32950,)
(8238, 20) (8238,)

Evaluation Results:
Accuracy: 0.9176
F1 Score: 0.7732
AUC: 0.9509

Final Metrics Summary:
train_time_seconds: 0.3037
cpu_usage: 68.0667
test_accuracy: 0.9176
test_f1: 0.7732
test_auc: 0.9509


In [None]:
dataset_name = "covtype"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
multi-class classification
(464809, 12) (464809,)
(116203, 12) (116203,)

Evaluation Results:
Accuracy: 0.8746
F1 Score: 0.8635
AUC can't be calculated: multi_class must be in ('ovo', 'ovr')

Final Metrics Summary:
train_time_seconds: 23.398
cpu_usage: 99.5549
test_accuracy: 0.8746
test_f1: 0.8635
test_auc: nan


In [None]:
#frac = 0.1--change the dataloader.py to load the data with frac = 0.1
dataset_name = "covtype"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
multi-class classification
(46480, 12) (46480,)
(11621, 12) (11621,)

Evaluation Results:
Accuracy: 0.8485
F1 Score: 0.7983
AUC can't be calculated: multi_class must be in ('ovo', 'ovr')

Final Metrics Summary:
train_time_seconds: 1.7298
cpu_usage: 91.9588
test_accuracy: 0.8485
test_f1: 0.7983
test_auc: nan


In [None]:
#frac = 0.5
dataset_name = "covtype"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
multi-class classification
(232404, 12) (232404,)
(58102, 12) (58102,)

Evaluation Results:
Accuracy: 0.8722
F1 Score: 0.8516
AUC can't be calculated: multi_class must be in ('ovo', 'ovr')

Final Metrics Summary:
train_time_seconds: 10.2125
cpu_usage: 98.9303
test_accuracy: 0.8722
test_f1: 0.8516
test_auc: nan


In [None]:
dataset_name = "poker"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
multi-class classification
(25010, 10) (25010,)
(1000000, 10) (1000000,)

Evaluation Results:
Accuracy: 0.7401
F1 Score: 0.1856
AUC can't be calculated: multi_class must be in ('ovo', 'ovr')

Final Metrics Summary:
train_time_seconds: 1.5242
cpu_usage: 93.4533
test_accuracy: 0.7401
test_f1: 0.1856
test_auc: nan


In [None]:
dataset_name = "wine"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
multi-class classification
(2558, 11) (2558,)
(640, 11) (640,)

Evaluation Results:
RMSE: 0.2111
MAE: 0.1055
R²: 0.9315

Final Metrics Summary:
train_time_seconds: 0.3482
cpu_usage: 61.4333
test_rmse: 0.2111
test_mae: 0.1055
test_r2: 0.9315


In [None]:
dataset_name = "california"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
regression
(16512, 8) (16512,)
(4128, 8) (4128,)

Evaluation Results:
RMSE: 0.4737
MAE: 0.3121
R²: 0.8394

Final Metrics Summary:
train_time_seconds: 0.4422
cpu_usage: 84.2
test_rmse: 0.4737
test_mae: 0.3121
test_r2: 0.8394


In [None]:
dataset_name = "california"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 888
regression
(16512, 8) (16512,)
(4128, 8) (4128,)

Evaluation Results:
RMSE: 0.4747
MAE: 0.3087
R²: 0.8271

Final Metrics Summary:
train_time_seconds: 0.4079
cpu_usage: 74.575
test_rmse: 0.4747
test_mae: 0.3087
test_r2: 0.8271


In [None]:
dataset_name = "california"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 777
regression
(16512, 8) (16512,)
(4128, 8) (4128,)

Evaluation Results:
RMSE: 0.4698
MAE: 0.3061
R²: 0.8288

Final Metrics Summary:
train_time_seconds: 0.3044
cpu_usage: 86.3667
test_rmse: 0.4698
test_mae: 0.3061
test_r2: 0.8288


In [None]:
dataset_name = "creditcard"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
binary classification
(227845, 30) (227845,)
(56962, 30) (56962,)

Evaluation Results:
Accuracy: 0.9992
F1 Score: 0.8789
AUC: 0.8773

Final Metrics Summary:
train_time_seconds: 2.0961
cpu_usage: 93.61
test_accuracy: 0.9992
test_f1: 0.8789
test_auc: 0.8773


In [None]:
dataset_name = "churn"  
_, metrics = run_xgboost_pipeline(dataset_name)
print("\nFinal Metrics Summary:")
for k, v in metrics.items():
    print(f"{k}: {v}")

Seeding: 999
binary classification
(5625, 19) (5625,)
(1407, 19) (1407,)

Evaluation Results:
Accuracy: 0.7825
F1 Score: 0.7062
AUC: 0.8184

Final Metrics Summary:
train_time_seconds: 0.3046
cpu_usage: 75.6
test_accuracy: 0.7825
test_f1: 0.7062
test_auc: 0.8184
