# 03 — Train Models on Balanced Dataset

Uses `../data/poker_balanced_10k.csv`, the balanced dataset generated by Notebook 01.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

# --- Load data ---------------------------------------------------------------
# The ten S*/R* columns are used as features; ORD is the prediction target.
df = pd.read_csv('../data/poker_balanced_10k.csv')
feature_cols = ['S1', 'R1', 'S2', 'R2', 'S3', 'R3', 'S4', 'R4', 'S5', 'R5']
X = df[feature_cols]
y = df['ORD']

# --- Split -------------------------------------------------------------------
# First hold out 30% of the rows, then cut that hold-out in half, giving a
# 70% / 15% / 15% train / validation / test partition. Both splits are
# stratified on the target and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# --- Scale -------------------------------------------------------------------
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s, X_test_s = (scaler.transform(split) for split in (X_val, X_test))
# NOTE(review): X_val_s / y_val are not referenced again in this cell; every
# metric below is computed on the test split. Confirm whether later cells use
# the validation split.

# --- Candidate models --------------------------------------------------------
# (display name, unfitted estimator) pairs, trained in this order.
models = [
    ("KNN (k=1)", KNeighborsClassifier(n_neighbors=1)),
    ("Random Forest (n=300)", RandomForestClassifier(n_estimators=300, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(tree_method='hist', eval_metric='mlogloss', random_state=42)),
]

# --- Fit and score -----------------------------------------------------------
# results      : one (name, accuracy, macro F1, weighted F1) tuple per model
# stored_preds : display name -> test-split predictions of that model
results, stored_preds = [], {}
for name, clf in models:
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = clf.fit(X_train_s, y_train).predict(X_test_s)

    acc = accuracy_score(y_test, y_pred)
    macro, weighted = (
        f1_score(y_test, y_pred, average=avg) for avg in ('macro', 'weighted')
    )

    stored_preds[name] = y_pred
    results.append((name, acc, macro, weighted))

    report = f"\n=== {name} ===\nAccuracy: {acc:.4f}\nMacro F1: {macro:.4f}\nWeighted F1: {weighted:.4f}\n"
    print(report)

# Last expression of the cell: summary table rendered as the cell output.
pd.DataFrame(results, columns=['Model','Accuracy','Macro F1','Weighted F1'])



=== KNN (k=1) ===
Accuracy: 0.6467
Macro F1: 0.6361
Weighted F1: 0.6361


=== Random Forest (n=300) ===
Accuracy: 0.8460
Macro F1: 0.8399
Weighted F1: 0.8399


=== Gradient Boosting ===
Accuracy: 0.8540
Macro F1: 0.8473
Weighted F1: 0.8473


=== XGBoost ===
Accuracy: 0.9400
Macro F1: 0.9384
Weighted F1: 0.9384



Unnamed: 0,Model,Accuracy,Macro F1,Weighted F1
0,KNN (k=1),0.646667,0.636084,0.636084
1,Random Forest (n=300),0.846,0.839927,0.839927
2,Gradient Boosting,0.854,0.84728,0.84728
3,XGBoost,0.94,0.938414,0.938414


In [2]:
# Print the installed xgboost version so the environment can be reproduced.
import xgboost as xgb
print(xgb.__version__)

2.1.4
