In [2]:
import time
import psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score,
    classification_report
)

import joblib
import os

from ucimlrepo import fetch_ucirepo


In [2]:
'''

we can choose different dataset

if using California housing dataset, please choose load_california_housing()

if using uci dataset, please choose load_data()

seed

seed we choose 777, 888, 999

'''

'\n\nwe can choose different dataset\n\nif using California housing dataset, please choose load_california_housing()\n\nif using uci dataset, please choose load_data()\n\nseed\n\nseed we choose 777, 888, 999\n\n'

In [2]:

# california_housing dataset

# scikit-learn's loader for the California Housing data, used by
# load_california_housing() below.
from sklearn.datasets import fetch_california_housing
# Random seed for this run; it is passed as random_state to the train/test
# split and to the random forest further down.
seed = 999

def load_california_housing():
    """Fetch the California Housing data set.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a ``pandas.DataFrame`` whose columns
        are named after the fetched bunch's ``feature_names`` and ``y`` is the
        bunch's ``target`` attribute, returned unchanged.
    """
    bunch = fetch_california_housing()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    return features, bunch.target
# Run configuration for the California Housing experiment.
dataset_name = "California Housing"
task_type = "regression"

# The seed sub-folder is derived from `seed` rather than hard-coded as "999",
# so switching the seed cannot silently overwrite another seed's results.
save_dir = os.path.join(r"D:\DSS5104\RandomForest\randomforest_result", str(seed), dataset_name)
os.makedirs(save_dir, exist_ok=True)



In [11]:
#load uci dataset
'''
[Covertype](https://archive.ics.uci.edu/dataset/31/covertype)

[Adult](https://archive.ics.uci.edu/dataset/2/adult)

[Bank Marketing](https://archive.ics.uci.edu/dataset/222/bank+marketing)

[Poker Hand](https://archive.ics.uci.edu/dataset/158/poker+hand)

[Wine Quality](https://archive.ics.uci.edu/dataset/186/wine+quality)
'''

# Random seed for this run (the notebook is run with 777, 888 and 999).
seed = 999
np.random.seed(seed)

# UCI repository entry to evaluate; dataset_id is the number that appears in
# the dataset's archive.ics.uci.edu URL.
dataset_name = "Poker Hand"
dataset_id = 158
task_type = "classification" # "classification" or "regression"

# The seed sub-folder is derived from `seed` rather than hard-coded as "999",
# so switching the seed cannot silently overwrite another seed's results.
save_dir = os.path.join(r"D:\DSS5104\final\randomforest_result", str(seed), dataset_name)
os.makedirs(save_dir, exist_ok=True)




In [4]:
def load_data():
    """Fetch the UCI data set identified by the module-level ``dataset_id``.

    If any feature cell equals the string "?", those cells are turned into
    NaN and every row that then contains a NaN (in a feature or in the
    target) is dropped; the number of removed rows is printed. When no "?"
    is present the data is returned exactly as fetched.

    Returns:
        tuple: ``(X, y)`` with the feature frame and the squeezed target.
    """
    repo = fetch_ucirepo(id=dataset_id)
    X = repo.data.features
    y = repo.data.targets.squeeze()

    has_placeholder = (X == "?").any().any()
    if not has_placeholder:
        return X, y

    X = X.replace("?", np.nan)
    rows_before = X.shape[0]

    # Join features and target so that rows are dropped from both together.
    merged = pd.concat([X, y], axis=1).dropna()
    removed = rows_before - merged.shape[0]
    print(f"Missing value detected. Remove {removed} records")

    target_name = y.name
    return merged.drop(columns=[target_name]), merged[target_name]

In [5]:

def split_features(X):
    """Partition the columns of ``X`` into numeric and categorical groups.

    Every column ends up in exactly one of the two lists. A fixed dtype
    whitelist (int64/float64 vs. object/category) would leave columns of
    other dtypes (e.g. int32, float32, bool) in neither list, so they would
    never be handed to a transformer in ``build_preprocessor``.

    Args:
        X: ``pandas.DataFrame`` of features.

    Returns:
        tuple: ``(num_cols, cat_cols)`` - lists of column names, each in the
        original column order.
    """
    # include/exclude of the same selector are complementary, which
    # guarantees that no column is left out.
    num_cols = X.select_dtypes(include="number").columns.tolist()
    cat_cols = X.select_dtypes(exclude="number").columns.tolist()
    return num_cols, cat_cols

def build_preprocessor(num_cols, cat_cols):
    """Assemble the column-wise preprocessing step of the pipeline.

    Args:
        num_cols: column names routed to a ``StandardScaler``.
        cat_cols: column names routed to an ``OrdinalEncoder``.

    Returns:
        An unfitted ``ColumnTransformer`` with a "num" and a "cat" entry.
    """
    # Categories not seen during fit are encoded as -1.
    ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    steps = [
        ("num", StandardScaler(), num_cols),
        ("cat", ordinal, cat_cols),
    ]
    return ColumnTransformer(transformers=steps)

In [12]:
# Load features/target for the configured data set.
# For California Housing, call load_california_housing() here instead.
X, y = load_data()

num_cols, cat_cols = split_features(X)
preprocessor = build_preprocessor(num_cols, cat_cols)

# Hold out 20% for testing; stratify on the labels only for classification.
stratify_on = y if task_type == "classification" else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=stratify_on,
    random_state=seed,
)

print(f"dataset: train={len(X_train)}, test={len(X_test)}")


dataset: train=820008, test=205002


In [13]:

# Both estimators share the same hyper-parameters; only the class differs.
rf_params = dict(n_estimators=100, max_depth=None, random_state=seed, n_jobs=-1)
forest_cls = RandomForestClassifier if task_type == "classification" else RandomForestRegressor
model = forest_cls(**rf_params)

# Preprocessing followed by the forest, fitted and applied as one estimator.
pipeline = Pipeline(steps=[("pre", preprocessor), ("rf", model)])

In [14]:
process = psutil.Process()

# Take a blocking baseline sample. A non-blocking cpu_percent(interval=None)
# returns 0.0 on first use (or an average since some earlier, unrelated
# call), which makes the "before" value and the difference below meaningless.
# This call also resets psutil's reference point to just before training.
cpu_before = psutil.cpu_percent(interval=0.5)

# Time only the fit itself, using a monotonic high-resolution clock.
start_time = time.perf_counter()
pipeline.fit(X_train, y_train)
train_time = time.perf_counter() - start_time

# Non-blocking call: system-wide utilisation since the baseline sample above,
# i.e. averaged over the training run.
cpu_after = psutil.cpu_percent(interval=None)
cpu_occupied = cpu_after - cpu_before
# Resident set size of this process after training (a total, not a
# before/after delta), converted from bytes with 1024 ** 2.
memory_used = process.memory_info().rss / (1024 ** 2)

print(f"time: {train_time:.2f}s, CPU: {cpu_before}% → {cpu_after}%, cpu occupied: {cpu_occupied}%, memory used: {memory_used:.2f}MB")

time: 147.68s, CPU: 27.7% → 98.9%, cpu occupied: 71.2%, memory used: 3204.38MB


In [15]:
# Score the fitted pipeline on the held-out split. Any task_type other than
# "classification" is evaluated with regression metrics.
y_test_pred = pipeline.predict(X_test)

if task_type == "classification":

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average="macro")
    print("evaluation:")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")

    # Define the AUC before attempting it: the results-saving cell reads
    # test_auc, so a failed computation must leave NaN rather than an
    # undefined name or a stale value from a previous run.
    test_auc = np.nan
    try:
        y_test_proba = pipeline.predict_proba(X_test)
        # Decide binary vs. multiclass from the probability matrix itself, so
        # the score passed to roc_auc_score always matches the model's classes.
        if y_test_proba.shape[1] == 2:
            test_auc = roc_auc_score(y_test, y_test_proba[:, 1])
        else:
            test_auc = roc_auc_score(y_test, y_test_proba, multi_class="ovr", average="macro")
        print(f"Test AUC: {test_auc:.4f}")
    except Exception as e:
        # Best effort: AUC is optional, the other metrics are still reported.
        print("AUC can't caculate: ", e)

    # zero_division=0 reports 0 for classes that are never predicted (the
    # same value as the default) without emitting a warning per average.
    print("\nClassification Report:\n", classification_report(y_test, y_test_pred, zero_division=0))

else:

    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = test_mse ** 0.5
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print("evaluation:")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Test MAE: {test_mae:.4f}")
    print(f"Test R²: {test_r2:.4f}")

evaluation:
Test Accuracy: 0.7521
Test F1 Score: 0.2208
Test AUC: 0.8735


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.91      0.84    102740
           1       0.72      0.69      0.71     86619
           2       0.59      0.03      0.05      9766
           3       0.82      0.09      0.15      4327
           4       0.45      0.01      0.01       796
           5       0.99      0.28      0.43       410
           6       0.25      0.01      0.01       292
           7       0.00      0.00      0.00        47
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         2

    accuracy                           0.75    205002
   macro avg       0.46      0.20      0.22    205002
weighted avg       0.74      0.75      0.72    205002



In [16]:
# Collect the metrics of this run into a one-row table and write it to
# <save_dir>/<dataset_name>_model_results.csv (overwriting any previous file).
train_metrics = {
    "train_time_seconds": train_time,
    "cpu_before_percent": cpu_before,
    "cpu_after_percent": cpu_after
}

# Resource figures are identical for both task types, so build them once.
resource_metrics = {
    "time": train_time,
    "cpu_before(%)": cpu_before,
    "cpu_after(%)": cpu_after,
    "cpu_occupied(%)": cpu_after - cpu_before,
    "memory_used(MB)": memory_used
}

if task_type == "classification":
    task_metrics = {
        "Accuracy": test_accuracy,
        "F1_score": test_f1,
        "AUC": test_auc
    }
else:
    # The split, model and evaluation cells treat every non-classification
    # task as regression; do the same here so eval_results is always rebuilt
    # instead of being undefined or left over from an earlier run.
    task_metrics = {
        "RMSE": test_rmse,
        "MAE": test_mae,
        "R2": test_r2
    }

# Task metrics first, then resource metrics: this fixes the CSV column order.
eval_results = {**task_metrics, **resource_metrics}
eval_df = pd.DataFrame([eval_results])

csv_filename = os.path.join(save_dir, f"{dataset_name}_model_results.csv")
eval_df.to_csv(csv_filename, index=False)

print(f"saved {csv_filename}")

saved D:\DSS5104\final\randomforest_result\999\Poker Hand\Poker Hand_model_results.csv
