In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
data_train = pd.read_csv('./data/salary.train.processed.csv').set_index('id')
data_test = pd.read_csv('./data/salary.test.processed.csv').set_index('id')
# กำหนด target
target = 'label'

# แยกข้อมูลฝึกและทดสอบ
X_train = data_train.drop(columns=[target])
y_train = data_train[target]
X_test = data_test.drop(columns=[target])
y_test = data_test[target]

In [15]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [17]:
import optuna
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# --- Step 1: Define the Advanced Objective Function ---
def objective_with_es(trial):
    """
    Objective function with manual Cross-Validation and Early Stopping.
    """
    
    # 1. Define the search space for the parameters
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
        'use_label_encoder': False,
        'n_estimators': 1000, 
        
        # --- THIS IS THE FIX ---
        'early_stopping_rounds': 50, # MOVED HERE
        
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    
    # 2. Set up 3-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # 3. Manually run the CV loop
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = XGBClassifier(**param)
        
        # --- THIS IS THE FIX ---
        # We pass eval_set, but 'early_stopping_rounds' is already in the model
        model.fit(
            X_train_fold, 
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=False
        )
        
        preds = model.predict(X_val_fold)
        f1 = f1_score(y_val_fold, preds, average='binary')
        scores.append(f1)
    
    return np.mean(scores)

# --- Step 2: Create and Run the Study ---
print("Starting Optuna study with Early Stopping... (This will take several minutes)")
study = optuna.create_study(direction='maximize')
study.optimize(objective_with_es, n_trials=50) # 50 trials
print("Study complete!")

# --- Step 3: Get Best Params ---
print("\nBest trial:")
print(f"  Value (Mean F1): {study.best_value:.4f}")
print("  Best Params: ")
# We need to remove the key we manually added
best_params_from_study = {k: v for k, v in study.best_params.items() if k != 'early_stopping_rounds'}
print(best_params_from_study)


# --- Step 4: Train the FINAL Model (Also Fixed) ---
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Create the final model with the fix
final_xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=1000,
    early_stopping_rounds=50, # --- FIX APPLIED HERE ---
    **best_params_from_study  # Use best params from Optuna
)

# Train it with the fix
final_xgb.fit(
    X_train_final, 
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    verbose=False
)

# --- Step 5: Evaluate the Optuna-Tuned Model on the TEST set ---
y_pred_optuna_es = final_xgb.predict(X_test)

print("\nOptuna-Tuned XGBoost (w/ Early Stopping) Classification Report:")
print(classification_report(y_test, y_pred_optuna_es))

[I 2025-10-21 19:35:57,573] A new study created in memory with name: no-name-d2ce3609-1334-41cd-86c1-292aa5a0137d


Starting Optuna study with Early Stopping... (This will take several minutes)


[I 2025-10-21 19:36:01,144] Trial 0 finished with value: 0.7923960733249218 and parameters: {'learning_rate': 0.03467852056798425, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.6647539487143682, 'colsample_bytree': 0.8001392158689938}. Best is trial 0 with value: 0.7923960733249218.
[I 2025-10-21 19:36:05,429] Trial 1 finished with value: 0.7915544890599041 and parameters: {'learning_rate': 0.024233155106460475, 'max_depth': 9, 'min_child_weight': 9, 'subsample': 0.7830017567512916, 'colsample_bytree': 0.9442662901949559}. Best is trial 0 with value: 0.7923960733249218.
[I 2025-10-21 19:36:06,951] Trial 2 finished with value: 0.7907780103367749 and parameters: {'learning_rate': 0.09131587491543129, 'max_depth': 10, 'min_child_weight': 7, 'subsample': 0.848748604574615, 'colsample_bytree': 0.9213469913676485}. Best is trial 0 with value: 0.7923960733249218.
[I 2025-10-21 19:36:12,529] Trial 3 finished with value: 0.7918808345066847 and parameters: {'learning_rate': 0.01305878946

Study complete!

Best trial:
  Value (Mean F1): 0.7931
  Best Params: 
{'learning_rate': 0.016936740838045274, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.6728417531696815, 'colsample_bytree': 0.8734308880973841}

Optuna-Tuned XGBoost (w/ Early Stopping) Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85      2416
         1.0       0.79      0.81      0.80      1764

    accuracy                           0.83      4180
   macro avg       0.83      0.83      0.83      4180
weighted avg       0.83      0.83      0.83      4180



In [18]:
best_params = {
    'learning_rate': 0.016936740838045274, 
    'max_depth': 6, 
    'min_child_weight':  4, 
    'subsample': 0.6728417531696815, 
    'colsample_bytree': 0.8734308880973841
}

### Class Weight

In [19]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report

print("--- 1. Testing XGBoost with Class Weight ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Calculate Weight ---
scale_pos_weight = len(y_full[y_full == 0]) / len(y_full[y_full == 1])
print(f"Using scale_pos_weight: {scale_pos_weight:.4f}")

# --- Define Parameters ---
best_xgb_params = best_params

# --- Create and Train Model ---
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    scale_pos_weight=scale_pos_weight, # 👈 Add weight
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_full, y_full) # Train on original data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + Class Weight) Report:")
print(df_report)

--- 1. Testing XGBoost with Class Weight ---
Using scale_pos_weight: 1.3882
Model training complete.

XGBoost (Tuned + Class Weight) Report:
              precision    recall  f1-score      support
0.0            0.886023  0.781871  0.830695  2416.000000
1.0            0.742676  0.862245  0.798006  1764.000000
accuracy       0.815789  0.815789  0.815789     0.815789
macro avg      0.814349  0.822058  0.814351  4180.000000
weighted avg   0.825529  0.815789  0.816900  4180.000000


### SMOTE

In [20]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

print("\n--- 2. Testing XGBoost with SMOTE ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply SMOTE ---
print("Applying SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_xgb_params = best_params

# --- Create and Train Model ---
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    # ⚠️ NO 'scale_pos_weight'
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled) # Train on SMOTEd data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + SMOTE) Report:")
print(df_report)


--- 2. Testing XGBoost with SMOTE ---
Applying SMOTE...
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64
Model training complete.

XGBoost (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.884255  0.781043  0.829451  2416.000000
1.0            0.741447  0.859977  0.796325  1764.000000
accuracy       0.814354  0.814354  0.814354     0.814354
macro avg      0.812851  0.820510  0.812888  4180.000000
weighted avg   0.823988  0.814354  0.815471  4180.000000


### SMOTETomek

In [21]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing XGBoost with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_xgb_params = best_params

# --- Create and Train Model ---
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    # ⚠️ NO 'scale_pos_weight'
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing XGBoost with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    8914
0.0    8914
Name: count, dtype: int64
Model training complete.

XGBoost (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.883721  0.786424  0.832238  2416.000000
1.0            0.745813  0.858277  0.798102  1764.000000
accuracy       0.816746  0.816746  0.816746     0.816746
macro avg      0.814767  0.822350  0.815170  4180.000000
weighted avg   0.825522  0.816746  0.817833  4180.000000


### ADASYN

In [22]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing XGBoost with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_xgb_params = best_params

# --- Create and Train Model ---
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    # ⚠️ NO 'scale_pos_weight'
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing XGBoost with ADASYN ---
Applying ADASYN...
New resampled label distribution:
label
1.0    9726
0.0    9719
Name: count, dtype: int64
Model training complete.

XGBoost (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.891398  0.767798  0.824994  2416.000000
1.0            0.732730  0.871882  0.796272  1764.000000
accuracy       0.811722  0.811722  0.811722     0.811722
macro avg      0.812064  0.819840  0.810633  4180.000000
weighted avg   0.824439  0.811722  0.812873  4180.000000
