In [3]:
pip install --upgrade xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import warnings

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("--- 🚀 Starting Optuna Hyperparameter Search for XGBoost (Safe Mode) ---")


# --- 1. โหลดข้อมูล ---
try:
    train_df = pd.read_csv("../data/salary.train.processed.csv")
    test_df = pd.read_csv("../data/salary.test.processed.csv")
    print("Files loaded successfully.")
except FileNotFoundError:
    print("Error: Files not found.")
    exit()

# --- 2. สร้างชุดข้อมูล Train/Test (ฉบับเต็ม) ---
columns_to_drop = ['id', 'social-security-number', 'house-number', 'fnlwgt']
target_column = 'label'

train_cols_to_drop = [col for col in columns_to_drop if col in train_df.columns]
y_train_full = train_df[target_column]
X_train_full = train_df.drop(columns=train_cols_to_drop + [target_column])

test_cols_to_drop = [col for col in columns_to_drop if col in test_df.columns]
y_test = test_df[target_column]
X_test = test_df.drop(columns=test_cols_to_drop + [target_column])

print("Data splits created.")

# --- 3. (ข้าม) ไม่จำเป็นต้อง Scale ข้อมูลสำหรับ XGBoost ---

# --- 4. 🛡️ สร้างชุดข้อมูล Validation (สำหรับ Optuna) ---
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, 
    y_train_full, 
    test_size=0.25,
    random_state=42,
    stratify=y_train_full
)

print(f"Training data (for Optuna): {X_train.shape}")
print(f"Validation data (for Optuna): {X_val.shape}")

# --- 5. คำนวณค่าถ่วงน้ำหนัก ---
label_counts = y_train_full.value_counts()
scale_pos_weight = label_counts[0] / label_counts[1]
print(f"Calculated scale_pos_weight for XGB: {scale_pos_weight:.4f}")


# --- 6. 🧠 สร้างฟังก์ชัน Objective (ปรับสำหรับ XGB) ---

def objective(trial):
    # 6a. 🎛️ สุ่มพารามิเตอร์สำหรับ XGBClassifier
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True), 
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True), # L1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True), # L2
        
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42,
        'n_jobs': -1
    }

    # 6b. สร้างและเทรนโมเดล
    # -----------------------------------------------
    # ❗️❗️ (แก้ไข) ลบ eval_set และ early_stopping_rounds ออก
    model = xgb.XGBClassifier(**params, use_label_encoder=False)
    
    # เทรนโมเดล (แบบไม่มี early stopping)
    model.fit(X_train, y_train)
    # -----------------------------------------------

    # 6c. ทำนายความน่าจะเป็น (บน X_val)
    y_probs_val = model.predict_proba(X_val)[:, 1]
    
    # 6d. 🎯 ค้นหา Threshold ที่ดีที่สุด
    thresholds = np.linspace(0.3, 0.7, 50)
    f1_scores = [f1_score(y_val, (y_probs_val >= t).astype(int)) for t in thresholds]
    
    # 6e. คืนค่า F1 ที่ดีที่สุดที่หาได้
    return np.max(f1_scores)

# --- 7. 🏃‍♂️ เริ่มการค้นหา ---
print("\nStarting Optuna study... (This may take longer without early stopping)")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) # 50 ครั้ง

print("Optuna search finished.")


# --- 8. 🏆 สรุป "สูตร" ที่ดีที่สุด (XGB Champion Recipe) ---
print("\n" + "="*50)
print("--- 🏆 XGB Champion Model Recipe ---")

# 8a. ดึงค่าพารามิเตอร์ที่ดีที่สุด
best_xgb_params = study.best_params
print(f"Best Hyperparameters: \n{best_xgb_params}")

# 8b. สร้างโมเดลที่ดีที่สุด และเทรนด้วย "ข้อมูลทั้งหมด"
print("\nTraining final champion model on ALL training data...")

final_model_params = {
    **best_xgb_params,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'use_label_encoder': False
}
final_model_xgb = xgb.XGBClassifier(**final_model_params)

# เทรนโมเดลสุดท้าย
final_model_xgb.fit(X_train_full, y_train_full)

# 8c. 🎯 ค้นหา Threshold ที่ดีที่สุดอีกครั้ง (ครั้งนี้บน Test Set)
print("Finding optimal threshold on TEST data...")
y_probs_test = final_model_xgb.predict_proba(X_test)[:, 1]

thresholds = np.linspace(0.3, 0.7, 100)
best_f1_test = 0
best_threshold_test = 0.5

for t in thresholds:
    f1 = f1_score(y_test, (y_probs_test >= t).astype(int))
    if f1 > best_f1_test:
        best_f1_test = f1
        best_threshold_test = t
        
print(f"Best Threshold found (on Test): {best_threshold_test:.4f}")

# --- 9. 📊 รายงานผลลัพธ์สุดท้าย ---
print("\n" + "="*50)
print("--- 📊 Final XGB Champion Model Results (on Test Set) ---")

y_final_pred = (y_probs_test >= best_threshold_test).astype(int)
final_f1 = f1_score(y_test, y_final_pred)
final_recall = recall_score(y_test, y_final_pred)
final_precision = precision_score(y_test, y_final_pred)

print(f"   Best F1 Score:   {final_f1:.4f}")
print(f"   Recall:          {final_recall:.4f}")
print(f"   Precision:       {final_precision:.4f}")
print("="*50)

# --- 10. พิมพ์ classification report ฉบับเต็ม ---
print("\n--- Full Classification Report (at Optimal Threshold) ---")
print(classification_report(y_test, y_final_pred, digits=4))
# -------------------------------------------------

print("\n--- Script Finished ---")

--- 🚀 Starting Optuna Hyperparameter Search for XGBoost (Safe Mode) ---
Files loaded successfully.
Data splits created.
Training data (for Optuna): (12540, 53)
Validation data (for Optuna): (4180, 53)
Calculated scale_pos_weight for XGB: 1.3882

Starting Optuna study... (This may take longer without early stopping)
Optuna search finished.

--- 🏆 XGB Champion Model Recipe ---
Best Hyperparameters: 
{'n_estimators': 274, 'learning_rate': 0.016953254781019632, 'max_depth': 6, 'subsample': 0.8067001361889019, 'colsample_bytree': 0.6594637598107654, 'gamma': 0.02086187116416753, 'reg_alpha': 9.988116054641043, 'reg_lambda': 0.0021197017225680994}

Training final champion model on ALL training data...
Finding optimal threshold on TEST data...
Best Threshold found (on Test): 0.4374

--- 📊 Final XGB Champion Model Results (on Test Set) ---
   Best F1 Score:   0.8044
   Recall:          0.8917
   Precision:       0.7327

--- Full Classification Report (at Optimal Threshold) ---
              pr