In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
data_train = pd.read_csv('./data/salary.train.processed.csv').set_index('id')
data_test = pd.read_csv('./data/salary.test.processed.csv').set_index('id')
# กำหนด target
target = 'label'

# แยกข้อมูลฝึกและทดสอบ
X_train = data_train.drop(columns=[target])
y_train = data_train[target]
X_test = data_test.drop(columns=[target])
y_test = data_test[target]

In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


In [4]:
import optuna
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# --- Step 1: Define the Objective Function ---
def objective_lgbm(trial):
    """
    Objective function with manual Cross-Validation and Early Stopping.
    """
    
    # 1. Define the search space for LightGBM
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'n_estimators': 1000,
        'verbose': -1, # Suppress LightGBM's own logging
        
        # These are the key parameters we'll tune
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50), # Key LGBM param
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    
    # 2. Set up 3-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # 3. Manually run the CV loop
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgbm.LGBMClassifier(**param)
        
        # Train with early stopping
        model.fit(
            X_train_fold, 
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='logloss',
            # Use the 'callbacks' argument for early stopping
            callbacks=[lgbm.early_stopping(50, verbose=False)]
        )
        
        preds = model.predict(X_val_fold)
        f1 = f1_score(y_val_fold, preds, average='binary')
        scores.append(f1)
    
    return np.mean(scores)

# --- Step 2: Create and Run the Study ---
print("Starting Optuna study for LightGBM... (This should be fast!)")
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=50) # 50 trials
print("Study complete!")

# --- Step 3: Get Best Params ---
print("\nBest trial:")
print(f"  Value (Mean F1): {study_lgbm.best_value:.4f}")
print("  Best Params: ")
print(study_lgbm.best_params)


# --- Step 4: Train the FINAL Model ---
# 1. Create a new train/validation split
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# 2. Create the final model
final_lgbm = lgbm.LGBMClassifier(
    random_state=42,
    n_estimators=1000,
    **study_lgbm.best_params # Use best params from Optuna
)

# 3. Train it with early stopping
final_lgbm.fit(
    X_train_final, 
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    eval_metric='logloss',
    callbacks=[lgbm.early_stopping(50, verbose=False)]
)

# --- Step 5: Evaluate the Optuna-Tuned Model on the TEST set ---
y_pred_optuna_lgbm = final_lgbm.predict(X_test)

print("\nOptuna-Tuned LightGBM Classification Report:")
print(classification_report(y_test, y_pred_optuna_lgbm))

[I 2025-10-21 19:23:41,377] A new study created in memory with name: no-name-39577086-16a2-4ab3-88e1-5ba6949a13dd


Starting Optuna study for LightGBM... (This should be fast!)


  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[I 2025-10-21 19:23:44,555] Trial 0 finished with value: 0.7911721121847751 and parameters: {'learning_rate': 0.011720722404824632, 'num_leaves': 17, 'max_depth': 5, 'min_child_samples': 20, 'subsample': 0.6562270463965464, 'colsample_bytree': 0.8444479593206774}. Best is trial 0 with value: 0.791172112

Study complete!

Best trial:
  Value (Mean F1): 0.7934
  Best Params: 
{'learning_rate': 0.0161945307562969, 'num_leaves': 36, 'max_depth': 6, 'min_child_samples': 22, 'subsample': 0.9537604399255435, 'colsample_bytree': 0.7231530404355765}

Optuna-Tuned LightGBM Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85      2416
         1.0       0.79      0.81      0.80      1764

    accuracy                           0.83      4180
   macro avg       0.82      0.83      0.82      4180
weighted avg       0.83      0.83      0.83      4180



In [5]:
best_params = {
    'learning_rate':  0.0161945307562969, 
    'num_leaves': 36, 
    'max_depth': 6, 
    'min_child_samples': 22, 
    'subsample': 0.9537604399255435, 
    'colsample_bytree': 0.7231530404355765
}

### ClassWeight

In [6]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report

# --- 1. Load Data (to calculate the weight) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

# --- 2. Calculate scale_pos_weight ---
# This is the same logic as your XGBoost code
scale_pos_weight = len(y_full[y_full == 0]) / len(y_full[y_full == 1])
print(f"Calculated scale_pos_weight: {scale_pos_weight:.4f}")

# --- 3. Define Your Best Parameters ---
# These are the params you provided in your prompt
best_lgbm_params = {
    'learning_rate': 0.11760370695477697, 
    'num_leaves': 28, 
    'max_depth': 7, 
    'min_child_samples': 19, 
    'subsample': 0.6811934251139399, 
    'colsample_bytree': 0.7733362388832487
}

# --- 4. Create and Train the Model ---
print("\nTraining LightGBM model with class weighting...")

lgbm_model_final = lgb.LGBMClassifier(
    **best_lgbm_params,       # Apply all your tuned parameters
    scale_pos_weight=scale_pos_weight, # 👈 Here is the class weight
    random_state=42,
    n_jobs=-1
)

# Train the model on the full training dataset
lgbm_model_final.fit(X_full, y_full)

print("Model training complete.")

# --- 5. Evaluate on Test Data (Recommended) ---
print("\nEvaluating model on test data...")

data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
data_test_lgbm['prediction'] = lgbm_model_final.predict(data_test_lgbm.drop(['label'], axis='columns'))

# Print the report
report_scores_lgbm = sklearn.metrics.classification_report(
    y_true=data_test_lgbm['label'],
    y_pred=data_test_lgbm['prediction'],
    digits=6,
    output_dict=True
)
df_score_lgbm = pandas.DataFrame(report_scores_lgbm).transpose()

print("\nLightGBM (Tuned + Weighted) Report:")
print(df_score_lgbm)

Calculated scale_pos_weight: 1.3882

Training LightGBM model with class weighting...
Model training complete.

Evaluating model on test data...

LightGBM (Tuned + Weighted) Report:
              precision    recall  f1-score      support
0.0            0.885727  0.798841  0.840044  2416.000000
1.0            0.757121  0.858844  0.804781  1764.000000
accuracy       0.824163  0.824163  0.824163     0.824163
macro avg      0.821424  0.828842  0.822412  4180.000000
weighted avg   0.831454  0.824163  0.825162  4180.000000


### SMOTE

In [7]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE  # 👈 1. Import SMOTE

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

# --- 2. Define Your Best Parameters ---
# These are the params you provided
best_lgbm_params = best_params

# --- 3. Apply SMOTE to the Training Data ---
print("\nApplying SMOTE to the training data...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)

print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- 4. Create and Train the Final Model (with NO class_weight) ---
print("\nTraining final LGBM model on SMOTEd data...")

lgbm_model_final = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight' here
    random_state=42,
    n_jobs=-1
)

# 5. Train the model on the NEW resampled data
lgbm_model_final.fit(X_resampled, y_resampled)
print("Model training complete.")

# --- 6. Evaluate on ORIGINAL Test Data ---
print("\nEvaluating model on *original* test data...")

data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')

# IMPORTANT: Do NOT apply SMOTE to the test data.
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

data_test_lgbm['prediction'] = lgbm_model_final.predict(X_test)

# Print the report
report_scores_lgbm = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=data_test_lgbm['prediction'],
    digits=6,
    output_dict=True
)
df_score_lgbm = pandas.DataFrame(report_scores_lgbm).transpose()

print("\nLightGBM (Tuned + SMOTE) Report:")
print(df_score_lgbm)

Original training data shape: (16720, 89)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64

Applying SMOTE to the training data...
New resampled training data shape: (19438, 89)
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64

Training final LGBM model on SMOTEd data...
Model training complete.

Evaluating model on *original* test data...

LightGBM (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.877354  0.790563  0.831700  2416.000000
1.0            0.747379  0.848639  0.794797  1764.000000
accuracy       0.815072  0.815072  0.815072     0.815072
macro avg      0.812367  0.819601  0.813249  4180.000000
weighted avg   0.822503  0.815072  0.816127  4180.000000


### SMOTETomek

In [8]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing LightGBM with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lgbm_params = best_params

# --- Create and Train Model ---
lgbm_model = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight'
    random_state=42,
    n_jobs=-1
)

lgbm_model.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = lgbm_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLightGBM (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing LightGBM with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    8914
0.0    8914
Name: count, dtype: int64
Model training complete.

LightGBM (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.875572  0.792219  0.831812  2416.000000
1.0            0.748245  0.845805  0.794039  1764.000000
accuracy       0.814833  0.814833  0.814833     0.814833
macro avg      0.811908  0.819012  0.812926  4180.000000
weighted avg   0.821839  0.814833  0.815872  4180.000000


### ADASYN

In [9]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing LightGBM with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lgbm_params = best_params

# --- Create and Train Model ---
lgbm_model = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight'
    random_state=42,
    n_jobs=-1
)

lgbm_model.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = lgbm_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLightGBM (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing LightGBM with ADASYN ---
Applying ADASYN...
New resampled label distribution:
label
1.0    9726
0.0    9719
Name: count, dtype: int64
Model training complete.

LightGBM (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.893762  0.759106  0.820949  2416.000000
1.0            0.726504  0.876417  0.794450  1764.000000
accuracy       0.808612  0.808612  0.808612     0.808612
macro avg      0.810133  0.817762  0.807700  4180.000000
weighted avg   0.823178  0.808612  0.809766  4180.000000
