In [49]:
import time
import psutil
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report,
    mean_squared_error, mean_absolute_error, r2_score
)

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
import joblib
from ucimlrepo import fetch_ucirepo


In [2]:
# Usage notes for this notebook. They are written as a bare string literal
# rather than as comments, so when the cell runs Jupyter echoes the string's
# repr as the cell output.
'''

we can choose different dataset

if using California housing dataset, please choose load_california_housing()

if using uci dataset, please choose load_data()

seed

seed we choose 777, 888, 999

'''

'\n\nwe can choose different dataset\n\nif using California housing dataset, please choose load_california_housing()\n\nif using uci dataset, please choose load_data()\n\nseed\n\nseed we choose 777, 888, 999\n\n'

In [43]:

# UCI datasets referenced by this notebook (the number in each URL is the
# value to use for `dataset_id`):
#   Covertype       https://archive.ics.uci.edu/dataset/31/covertype
#   Adult           https://archive.ics.uci.edu/dataset/2/adult
#   Bank Marketing  https://archive.ics.uci.edu/dataset/222/bank+marketing
#   Poker Hand      https://archive.ics.uci.edu/dataset/158/poker+hand
#   HIGGS           https://archive.ics.uci.edu/dataset/280/higgs
#   Wine Quality    https://archive.ics.uci.edu/dataset/186/wine+quality

# --- experiment configuration -------------------------------------------------
seed = 999                   # random seed used for sampling, splitting and the model
dataset_name = "Wine Quality"
dataset_id = 186             # id passed to fetch_ucirepo() by load_data()
task_type = "regression"     # or "classification"
scale = 0.5                  # row fraction kept by the sub-sampling load_data()

# Results are written to <result_root>/<seed>/<dataset_name>. The seed folder is
# derived from `seed` instead of being typed a second time, so changing the
# seed can no longer leave the results in another seed's directory.
result_root = r"D:\DSS5104\XGboost\xgboost_result"
save_dir = os.path.join(result_root, str(seed), dataset_name)
#save_dir = os.path.join(result_root, "scale", f"{dataset_name}_scale{scale}")
os.makedirs(save_dir, exist_ok=True)


In [75]:
from sklearn.datasets import fetch_california_housing
# --- California Housing configuration -----------------------------------------
# Running this cell replaces the seed / dataset_name / task_type / save_dir
# values assigned by the UCI configuration cell.
seed = 777


def load_california_housing():
    """Load the California Housing data through scikit-learn.

    Returns
    -------
    X : pandas.DataFrame
        The feature matrix, with columns named after ``data.feature_names``.
    y :
        ``data.target`` exactly as returned by ``fetch_california_housing()``.
    """
    data = fetch_california_housing()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    return X, data.target


dataset_name = "California Housing"
task_type = "regression"

# The seed folder is derived from `seed` rather than hard-coded a second time,
# so the output directory always matches the seed that was actually used.
save_dir = os.path.join(r"D:\DSS5104\XGboost\xgboost_result", str(seed), dataset_name)
os.makedirs(save_dir, exist_ok=True)


In [51]:
def load_data():
    """Fetch the UCI dataset selected by the global ``dataset_id`` and clean it.

    Steps
    -----
    1. Download features and target with ``fetch_ucirepo``.
    2. If the target is not numeric, cast it to ``str``, strip surrounding
       whitespace and remove every ``.`` character (presumably to merge label
       variants that differ only by a trailing period -- TODO confirm).
       Numeric targets are returned untouched so that regression targets stay
       numbers instead of becoming text labels.
    3. If any feature cell equals ``"?"``, treat those cells as missing, drop
       every row that contains a missing value and print how many rows went.

    Returns
    -------
    (X, y)
        ``X`` is the feature DataFrame and ``y`` the target.
        Assumes the dataset has a single target column so that ``squeeze()``
        yields a Series (``y.name`` is used below).
    """
    dataset = fetch_ucirepo(id=dataset_id)
    X, y = dataset.data.features, dataset.data.targets.squeeze()

    # Only text labels need normalising; stringifying a numeric target would
    # hand the regressor and the regression metrics strings instead of numbers.
    if not pd.api.types.is_numeric_dtype(y):
        y = y.astype(str).str.strip().str.replace('.', '', regex=False)

    if (X == "?").any().any():
        X = X.replace("?", np.nan)

        original_shape = X.shape[0]
        # Join features and target so that dropping rows keeps them aligned.
        df = pd.concat([X, y], axis=1)
        df = df.dropna()
        removed = original_shape - df.shape[0]
        print(f"Missing value detected. Remove {removed} records")

        X = df.drop(columns=[y.name])
        y = df[y.name]

    return X, y


In [5]:
# Sub-sampling variant of load_data (sample fractions noted for it: 0.1, 0.5).
# NOTE: it has the same name as the full-data loader defined in another cell,
# so whichever of the two cells was executed last is the one load_data() runs.
def load_data():
    """Fetch the UCI dataset ``dataset_id`` and keep a random fraction of rows.

    Reads three globals: ``dataset_id`` (which dataset to download), ``scale``
    (fraction of rows to keep) and ``seed`` (random state of the sampling).

    Steps
    -----
    1. Download features and target with ``fetch_ucirepo``.
    2. Sample ``scale`` of the rows, features and target together.
    3. If the target is not numeric, cast it to ``str``, strip surrounding
       whitespace and remove every ``.`` character. Numeric targets are left
       untouched so that regression targets stay numbers.
    4. If any feature cell equals ``"?"``, treat those cells as missing and
       drop every row that contains a missing value.

    Returns
    -------
    (X, y)
        The sampled feature DataFrame and the matching target.
        Assumes a single target column so that ``squeeze()`` yields a Series.
    """
    dataset = fetch_ucirepo(id=dataset_id)
    X, y = dataset.data.features, dataset.data.targets.squeeze()

    # Sample on the joined frame so features and target stay row-aligned.
    data = pd.concat([X, y], axis=1)
    data = data.sample(frac=scale, random_state=seed)

    X = data.drop(columns=[y.name])
    y = data[y.name]

    # Only text labels need normalising; stringifying a numeric target would
    # hand the regressor and the regression metrics strings instead of numbers.
    if not pd.api.types.is_numeric_dtype(y):
        y = y.astype(str).str.strip().str.replace('.', '', regex=False)

    if (X == "?").any().any():
        X = X.replace("?", np.nan)
        df = pd.concat([X, y], axis=1).dropna()
        X, y = df.drop(columns=[y.name]), df[y.name]

    return X, y

In [76]:
def split_features(X):
    """Split the columns of ``X`` into numeric and categorical column names.

    Every numeric dtype counts as numeric (not only ``float64``/``int64``), and
    ``object``, ``category`` and ``bool`` columns count as categorical. Columns
    that appear in neither list are not handed to the preprocessor, so covering
    these common dtypes keeps such features from being dropped silently.

    Parameters
    ----------
    X : pandas.DataFrame
        The feature frame.

    Returns
    -------
    (num_cols, cat_cols) : tuple of list
        Column names in their original order: numeric first, categorical second.
    """
    num_cols = X.select_dtypes(include="number").columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    return num_cols, cat_cols

def build_preprocessor(num_cols, cat_cols):
    """Create the column-wise preprocessing step of the pipeline.

    Parameters
    ----------
    num_cols : list
        Columns sent through a ``StandardScaler``.
    cat_cols : list
        Columns sent through an ``OrdinalEncoder``; categories that were not
        seen while fitting are encoded as ``-1``.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    transformers = [
        ('num', StandardScaler(), num_cols),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols),
    ]
    return ColumnTransformer(transformers)


In [77]:
# Load the data (swap the commented line in to use a UCI dataset instead) and
# build the preprocessing step from the detected column types.
X, y = load_california_housing()
#X, y = load_data()
num_cols, cat_cols = split_features(X)
preprocessor = build_preprocessor(num_cols, cat_cols)

# 80/20 hold-out split; class proportions are preserved only for classification.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y if task_type == "classification" else None,
    random_state=seed,
)

print(f"dataset: train={len(X_train)}, test={len(X_test)}")

dataset: train=16512, test=4128


In [78]:
# Pick the XGBoost estimator that matches the task; both use every CPU core
# (n_jobs=-1) and the configured seed.
if task_type == "classification":
    model = XGBClassifier(n_jobs=-1, random_state=seed, use_label_encoder=False, eval_metric='logloss')
else:
    model = XGBRegressor(n_jobs=-1, random_state=seed)

# Preprocessing and model are chained so both are fitted on training data only.
pipeline = Pipeline(steps=[("pre", preprocessor), ("xgb", model)])


In [79]:
# Encode the target before any measurement starts, so that this bookkeeping is
# not counted as training time.
if task_type == "classification":
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)
else:
    y_train_encoded = y_train
    y_test_encoded = y_test

process = psutil.Process()
process.cpu_percent(interval=None)  # primes the per-process counter; the value is not used

# NOTE(review): per the psutil documentation, interval=None compares against the
# previous call, so `cpu_after` presumably reflects system-wide load during the
# fit while `cpu_before` reflects whatever ran before it -- confirm.
cpu_before = psutil.cpu_percent(interval=None)

# Time only the fit itself, with the monotonic high-resolution clock.
start_time = time.perf_counter()
pipeline.fit(X_train, y_train_encoded)
train_time = time.perf_counter() - start_time

cpu_after = psutil.cpu_percent(interval=None)
cpu_occupied = cpu_after - cpu_before
# Resident set size of this process after training, in MiB (an absolute
# figure, not the increase caused by training).
memory_used = process.memory_info().rss / (1024 ** 2)

print(f"time: {train_time:.2f}s, CPU: {cpu_before}% → {cpu_after}%, cpu occupied: {cpu_occupied}%, memory used: {memory_used:.2f}MB")

time: 0.27s, CPU: 17.4% → 96.5%, cpu occupied: 79.1%, memory used: 111.76MB


In [82]:
# evaluation
# Predict on the held-out split and collect task-specific metrics next to the
# resource figures measured during training.
y_test_pred = pipeline.predict(X_test)

results = {
    "train_time_seconds": train_time,
    "cpu_before_percent": cpu_before,
    "cpu_after_percent": cpu_after
}

if task_type == "classification":
    # The model was trained on label-encoded targets, so compare in that space.
    y_test_encoded = label_encoder.transform(y_test)

    test_acc = accuracy_score(y_test_encoded, y_test_pred)
    test_f1 = f1_score(y_test_encoded, y_test_pred, average="macro")

    results.update({
        "test_accuracy": test_acc,
        "test_f1": test_f1,
    })

    print(f"\nevaluation:")
    print(f"Accuracy: {test_acc:.4f}")
    print(f"F1 Score: {test_f1:.4f}")

    # AUC is best-effort: a failure is reported and recorded as NaN instead of
    # aborting the whole evaluation.
    try:
        y_test_proba = pipeline.predict_proba(X_test)
        if y_test_proba.shape[1] == 2:
            # Binary task: score with the probability of the positive class.
            test_auc = roc_auc_score(y_test_encoded, y_test_proba[:, 1])
        else:
            # Multiclass task: a single probability column is not a valid
            # score, so use the full matrix with one-vs-rest averaging.
            test_auc = roc_auc_score(y_test_encoded, y_test_proba, multi_class="ovr")
        print(f"AUC: {test_auc:.4f}")
    except Exception as e:
        test_auc = np.nan
        print("AUC can't be caculated:", e)
    # Always record the key so `results` has the same shape on success and failure.
    results["test_auc"] = test_auc

else:
    test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    results.update({
        "test_rmse": test_rmse,
        "test_mae": test_mae,
        "test_r2": test_r2,
    })

    print(f"\nevaluation:")
    print(f"RMSE: {test_rmse:.4f}")
    print(f"MAE: {test_mae:.4f}")
    print(f"R²: {test_r2:.4f}")



evaluation:
RMSE: 0.4698
MAE: 0.3061
R²: 0.8288


In [81]:
# Assemble one result row: the task-specific quality metrics first, followed by
# the resource figures shared by both task types (column order is preserved).
ordered_results = (
    {"Accuracy": test_acc, "F1_score": test_f1, "AUC": test_auc}
    if task_type == "classification"
    else {"RMSE": test_rmse, "MAE": test_mae, "R2": test_r2}
)
ordered_results.update({
    "time": train_time,
    "cpu_before(%)": cpu_before,
    "cpu_after(%)": cpu_after,
    "cpu_occupied(%)": cpu_after - cpu_before,
    "memory_used(MB)": memory_used,
})

# Write the single-row table to <save_dir>/<dataset_name>_xgboost_results.csv.
result_df = pd.DataFrame([ordered_results])
result_path = os.path.join(save_dir, f"{dataset_name}_xgboost_results.csv")
result_df.to_csv(result_path, index=False)

print("All saved in", save_dir)


All saved in D:\DSS5104\XGboost\xgboost_result\777\California Housing
