In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Name of the column that holds the prediction target.
target = 'label'

# Read the processed train/test CSV files, using the 'id' column as the index.
data_train = pd.read_csv('./data/salary.train.processed.csv', index_col='id')
data_test = pd.read_csv('./data/salary.test.processed.csv', index_col='id')

# Separate the target (y) from the feature columns (X) for each split.
y_train = data_train[target]
X_train = data_train.drop(target, axis=1)
y_test = data_test[target]
X_test = data_test.drop(target, axis=1)

In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings

# Silence every warning raised from here on in this kernel session.
warnings.filterwarnings('ignore')

# --- Load Data ---
try:
    train_df = pandas.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pandas.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    # Print a hint, then re-raise so the cell stops instead of running on
    # without data.
    print("Error: Could not find the processed CSV files.")
    raise

# Split each frame into the feature matrix and the 'label' target column.
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']
# --- End of Data Loading ---

# --- Train-Validation Split ---
# Hold out a stratified 20% of the training rows; below they are used only as
# the evaluation set that drives early stopping.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# --- Define the Baseline XGBoost Model ---
# `use_label_encoder` is deliberately not passed: it is deprecated and recent
# XGBoost releases report it as an unused parameter.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_estimators=1000,          # upper bound on trees; early stopping may stop sooner
    early_stopping_rounds=50,   # stop after 50 rounds without logloss improvement
    learning_rate=0.1,          # chosen explicitly; XGBoost's own default is 0.3
    max_depth=6,                # library default
    min_child_weight=1,         # library default
    subsample=1.0,              # library default
    colsample_bytree=1.0        # library default
)

# --- Train the Model with Early Stopping ---
# The held-out validation split is the only evaluation set, so it alone
# decides when boosting stops.
xgb_model.fit(
    X_train_final,
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    verbose=False
)

# --- Evaluate on Test Set ---
y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, digits=6))


XGBoost Classification Report:
              precision    recall  f1-score   support

         0.0   0.858180  0.831540  0.844650      2416
         1.0   0.778684  0.811791  0.794893      1764

    accuracy                       0.823206      4180
   macro avg   0.818432  0.821666  0.819772      4180
weighted avg   0.824632  0.823206  0.823652      4180



In [5]:
import pandas
import optuna
import numpy as np
import sklearn.metrics
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import warnings

# Silence every warning raised from here on in this kernel session.
warnings.filterwarnings('ignore')

# --- Load Data ---
# Read the processed train/test CSV files with 'id' as the index. If either
# file is missing, print a hint and re-raise so the cell stops.
try:
    train_df = pandas.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pandas.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    raise

# Target vectors first, then the matching feature matrices without 'label'.
y_train = train_df['label']
y_test = test_df['label']
X_train = train_df.drop(columns='label')
X_test = test_df.drop(columns='label')
# --- End of Data Loading ---


# --- Step 1: Define the Advanced Objective Function ---
def objective_with_es(trial):
    """
    Optuna objective: mean F1 of an early-stopped XGBoost model over 3-fold CV.

    Reads the module-level ``X_train`` and ``y_train``; they must be defined
    before the study is run.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``learning_rate``, ``max_depth``, ``min_child_weight``,
        ``subsample`` and ``colsample_bytree``.

    Returns
    -------
    float
        Mean of the per-fold binary F1 scores (the value Optuna maximises).
    """
    # Fixed settings plus the sampled search space. `use_label_encoder` is
    # deliberately not passed: it is deprecated and recent XGBoost releases
    # report it as an unused parameter.
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
        'n_estimators': 1000,         # upper bound; early stopping may stop sooner
        'early_stopping_rounds': 50,  # set on the estimator, not in fit()
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }

    # Stratified 3-fold split with a fixed seed, so every trial is scored on
    # the same folds.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**param)

        # NOTE(review): the validation fold both drives early stopping and is
        # scored below, so the reported F1 is likely optimistic. Consider an
        # inner split for early stopping if an unbiased estimate is needed.
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=False
        )

        preds = model.predict(X_val_fold)
        scores.append(sklearn.metrics.f1_score(y_val_fold, preds, average='binary'))

    return np.mean(scores)

# --- Step 2: Create and Run the Study ---
print("Starting Optuna study with Early Stopping... (This will take several minutes)")
# Seed the sampler so repeated runs propose the same sequence of trials.
# TPESampler is Optuna's default sampler, so only the seed is new here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_with_es, n_trials=50)  # 50 trials
print("Study complete!")

# --- Step 3: Get Best Params ---
print("\nBest trial:")
print(f"  Value (Mean F1): {study.best_value:.4f}")
print("  Best Params: ")
# `study.best_params` holds only the values sampled through `trial.suggest_*`,
# so fixed settings such as `early_stopping_rounds` never appear in it and no
# filtering is needed.
best_params = dict(study.best_params)
print(best_params)
print("\nSuccessfully stored parameters in 'best_params' variable.")


# --- Step 4: Train the FINAL Model ---
# Hold out a stratified 20% of the training rows as the early-stopping
# evaluation set for the final fit.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Same fixed settings as the objective function, combined with the tuned
# values. `use_label_encoder` is deliberately not passed: it is deprecated and
# recent XGBoost releases report it as an unused parameter.
final_xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    n_estimators=1000,          # upper bound; early stopping may stop sooner
    early_stopping_rounds=50,   # set on the estimator, not in fit()
    **best_params
)

final_xgb.fit(
    X_train_final,
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    verbose=False
)

# --- Step 5: Evaluate the Optuna-Tuned Model on the TEST set ---
y_pred_optuna_es = final_xgb.predict(X_test)

print("\nOptuna-Tuned XGBoost (w/ Early Stopping) Classification Report:")
print(classification_report(y_test, y_pred_optuna_es, digits=6))

[I 2025-10-22 06:44:57,907] A new study created in memory with name: no-name-da3ba00a-7ad3-4ea6-95ef-cf5c92bf4062


Starting Optuna study with Early Stopping... (This will take several minutes)


[I 2025-10-22 06:45:00,437] Trial 0 finished with value: 0.7899229277888272 and parameters: {'learning_rate': 0.03747553201879854, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.9945720759070314, 'colsample_bytree': 0.8281161569147458}. Best is trial 0 with value: 0.7899229277888272.
[I 2025-10-22 06:45:01,775] Trial 1 finished with value: 0.7909559391006252 and parameters: {'learning_rate': 0.13816489836748594, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.9706146674831886, 'colsample_bytree': 0.9633703784494705}. Best is trial 1 with value: 0.7909559391006252.
[I 2025-10-22 06:45:07,234] Trial 2 finished with value: 0.7885567827087069 and parameters: {'learning_rate': 0.013558447735419735, 'max_depth': 10, 'min_child_weight': 9, 'subsample': 0.9512517865035529, 'colsample_bytree': 0.6557998803383025}. Best is trial 1 with value: 0.7909559391006252.
[I 2025-10-22 06:45:08,822] Trial 3 finished with value: 0.7866435362231391 and parameters: {'learning_rate': 0.055987727

Study complete!

Best trial:
  Value (Mean F1): 0.7923
  Best Params: 
{'learning_rate': 0.16863298232095916, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.8743916337587808, 'colsample_bytree': 0.9728472649390595}

Successfully stored parameters in 'best_params' variable.

Optuna-Tuned XGBoost (w/ Early Stopping) Classification Report:
              precision    recall  f1-score   support

         0.0   0.857263  0.842715  0.849927      2416
         1.0   0.789474  0.807823  0.798543      1764

    accuracy                       0.827990      4180
   macro avg   0.823368  0.825269  0.824235      4180
weighted avg   0.828655  0.827990  0.828242      4180



In [6]:
# Show the tuned hyper-parameters currently stored in `best_params`.
print(best_params)

{'learning_rate': 0.16863298232095916, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.8743916337587808, 'colsample_bytree': 0.9728472649390595}


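Note: the values below differ from the `best_params` printed in the output above, so they appear to come from a different Optuna run. This block is not shown as an executed code cell; confirm which set of parameters the experiments below are meant to use.

best_params = {
    'learning_rate': 0.016936740838045274, 
    'max_depth': 6, 
    'min_child_weight':  4, 
    'subsample': 0.6728417531696815, 
    'colsample_bytree': 0.8734308880973841
}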

### Class Weight

In [7]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report

print("--- 1. Testing XGBoost with Class Weight ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # Re-raise: carrying on would fail with a NameError just below, or
    # silently reuse frames left in the kernel by an earlier run.
    raise

# Split each frame into features and the 'label' target.
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Calculate Weight ---
# Ratio of label-0 rows to label-1 rows in the training set, passed to
# XGBoost as `scale_pos_weight`.
n_negative = int((y_full == 0).sum())
n_positive = int((y_full == 1).sum())
scale_pos_weight = n_negative / n_positive
print(f"Using scale_pos_weight: {scale_pos_weight:.4f}")

# --- Define Parameters ---
# `best_params` must already exist in the kernel (it is not defined here).
best_xgb_params = best_params

# --- Create and Train Model ---
# NOTE(review): `best_xgb_params` carries no `n_estimators`, and no early
# stopping is configured here, whereas the tuning cell searched with
# n_estimators=1000 plus early stopping. Confirm the library-default tree
# count is intended for this refit.
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_full, y_full)  # train on the original (non-resampled) data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
# `digits` is omitted: scikit-learn ignores it when output_dict=True.
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + Class Weight) Report:")
print(df_report)

--- 1. Testing XGBoost with Class Weight ---
Using scale_pos_weight: 1.3882
Model training complete.

XGBoost (Tuned + Class Weight) Report:
              precision    recall  f1-score      support
0.0            0.879413  0.793874  0.834457  2416.000000
1.0            0.750875  0.850907  0.797768  1764.000000
accuracy       0.817943  0.817943  0.817943     0.817943
macro avg      0.815144  0.822391  0.816112  4180.000000
weighted avg   0.825169  0.817943  0.818974  4180.000000


### SMOTE

In [8]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

print("\n--- 2. Testing XGBoost with SMOTE ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # Re-raise: carrying on would fail with a NameError just below, or
    # silently reuse frames left in the kernel by an earlier run.
    raise

# Split each frame into features and the 'label' target.
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply SMOTE ---
print("Applying SMOTE...")
# `n_jobs` is not passed: it has been deprecated on SMOTE since
# imbalanced-learn 0.10 and only affects parallelism, not the generated
# samples. Only the training data is resampled; the test set is untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
# `best_params` must already exist in the kernel (it is not defined here).
best_xgb_params = best_params

# --- Create and Train Model ---
# No `scale_pos_weight` here: the classes were balanced by resampling instead.
# NOTE(review): `best_xgb_params` carries no `n_estimators`, and no early
# stopping is configured here, whereas the tuning cell searched with
# n_estimators=1000 plus early stopping. Confirm the library-default tree
# count is intended for this refit.
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled)  # train on the SMOTE-resampled data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
# `digits` is omitted: scikit-learn ignores it when output_dict=True.
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + SMOTE) Report:")
print(df_report)


--- 2. Testing XGBoost with SMOTE ---
Applying SMOTE...
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64


  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Model training complete.

XGBoost (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.870313  0.816639  0.842622  2416.000000
1.0            0.768427  0.833333  0.799565  1764.000000
accuracy       0.823684  0.823684  0.823684     0.823684
macro avg      0.819370  0.824986  0.821094  4180.000000
weighted avg   0.827316  0.823684  0.824452  4180.000000


### SMOTETomek

In [9]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing XGBoost with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # Re-raise: carrying on would fail with a NameError just below, or
    # silently reuse frames left in the kernel by an earlier run.
    raise

# Split each frame into features and the 'label' target.
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
# Only the training data is resampled; the test set is left untouched.
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
# `best_params` must already exist in the kernel (it is not defined here).
best_xgb_params = best_params

# --- Create and Train Model ---
# No `scale_pos_weight` here: the classes were balanced by resampling instead.
# NOTE(review): `best_xgb_params` carries no `n_estimators`, and no early
# stopping is configured here, whereas the tuning cell searched with
# n_estimators=1000 plus early stopping. Confirm the library-default tree
# count is intended for this refit.
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled)  # train on the SMOTETomek-resampled data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
# `digits` is omitted: scikit-learn ignores it when output_dict=True.
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing XGBoost with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    9220
0.0    9220
Name: count, dtype: int64
Model training complete.

XGBoost (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.869565  0.819536  0.843810  2416.000000
1.0            0.770888  0.831633  0.800109  1764.000000
accuracy       0.824641  0.824641  0.824641     0.824641
macro avg      0.820227  0.825585  0.821960  4180.000000
weighted avg   0.827923  0.824641  0.825368  4180.000000


### ADASYN

In [11]:
import pandas
import xgboost as xgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing XGBoost with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_xgb = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # Re-raise: carrying on would fail with a NameError just below, or
    # silently reuse frames left in the kernel by an earlier run.
    raise

# Split each frame into features and the 'label' target.
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_xgb.drop(['label'], axis='columns')
y_test = data_test_xgb['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
# `n_jobs` is not passed: it has been deprecated on ADASYN since
# imbalanced-learn 0.10 and only affects parallelism, not the generated
# samples. Only the training data is resampled; the test set is untouched.
ada = ADASYN(random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
# `best_params` must already exist in the kernel (it is not defined here).
best_xgb_params = best_params

# --- Create and Train Model ---
# No `scale_pos_weight` here: the classes were balanced by resampling instead.
# NOTE(review): `best_xgb_params` carries no `n_estimators`, and no early
# stopping is configured here, whereas the tuning cell searched with
# n_estimators=1000 plus early stopping. Confirm the library-default tree
# count is intended for this refit.
xgb_model = xgb.XGBClassifier(
    **best_xgb_params,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_resampled, y_resampled)  # train on the ADASYN-resampled data
print("Model training complete.")

# --- Evaluate ---
y_pred = xgb_model.predict(X_test)
# `digits` is omitted: scikit-learn ignores it when output_dict=True.
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nXGBoost (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing XGBoost with ADASYN ---
Applying ADASYN...
New resampled label distribution:
label
0.0    9719
1.0    9698
Name: count, dtype: int64
Model training complete.

XGBoost (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.877708  0.805050  0.839810  2416.000000
1.0            0.760183  0.846372  0.800966  1764.000000
accuracy       0.822488  0.822488  0.822488     0.822488
macro avg      0.818945  0.825711  0.820388  4180.000000
weighted avg   0.828111  0.822488  0.823417  4180.000000
