In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
data_train = pd.read_csv('./data/salary.train.processed.csv').set_index('id')
data_test = pd.read_csv('./data/salary.test.processed.csv').set_index('id')
# กำหนด target
target = 'label'

# แยกข้อมูลฝึกและทดสอบ
X_train = data_train.drop(columns=[target])
y_train = data_train[target]
X_test = data_test.drop(columns=[target])
y_test = data_test[target]

In [17]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# --- Load Data ---
try:
    train_df = pandas.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pandas.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    raise

X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']
# --- End of Data Loading ---

# --- Train-Validation Split ---
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# --- Define LightGBM Model with Default Parameters ---
lgbm_model = lgbm.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    random_state=42,
    n_estimators=1000,
    learning_rate=0.1,  # ค่า default
    num_leaves=31,      # ค่า default
    max_depth=-1,       # ค่า default (ไม่จำกัดความลึก)
    min_child_samples=20,  # ค่า default
    subsample=1.0,      # ค่า default
    colsample_bytree=1.0,  # ค่า default
    verbose=-1
)

# --- Train the Model with Early Stopping ---
lgbm_model.fit(
    X_train_final,
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    eval_metric='logloss',
    callbacks=[lgbm.early_stopping(50, verbose=False)]
)

# --- Evaluate on Test Set ---
y_pred_lgbm = lgbm_model.predict(X_test)

print("\nLightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgbm, digits=6))


LightGBM Classification Report:
              precision    recall  f1-score   support

         0.0   0.858539  0.836507  0.847379      2416
         1.0   0.783680  0.811224  0.797214      1764

    accuracy                       0.825837      4180
   macro avg   0.821109  0.823866  0.822297      4180
weighted avg   0.826948  0.825837  0.826209      4180



In [20]:
import pandas
import optuna
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import warnings
import sklearn

# Suppress warnings
warnings.filterwarnings('ignore')

# --- Load Data ---
try:
    train_df = pandas.read_csv('./data/salary.train.processed.csv', index_col='id')
    test_df = pandas.read_csv('./data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    raise

X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']
# --- End of Data Loading ---

# --- Step 1: Define the Objective Function ---
def objective_lgbm(trial):
    """
    Objective function with manual Cross-Validation and Early Stopping.
    """
    # 1. Define the search space for LightGBM
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'n_estimators': 1000,
        'verbose': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    
    # 2. Set up 3-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # 3. Manually run the CV loop
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgbm.LGBMClassifier(**param)
        
        # Train with early stopping
        model.fit(
            X_train_fold, 
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='logloss',
            callbacks=[lgbm.early_stopping(50, verbose=False)]
        )
        
        preds = model.predict(X_val_fold)
        f1 = sklearn.metrics.f1_score(y_val_fold, preds, average='binary')
        scores.append(f1)
    
    return np.mean(scores)

# --- Step 2: Create and Run the Study ---
study_name = "lgbm_salary_tuning"  # ตั้งชื่อโปรเจกต์ (แต่ไม่บันทึกลง storage)

print(f"Starting Optuna study: '{study_name}'")
print(f"Running 200 new trials...")

# สร้าง Study ใหม่โดยไม่ใช้ storage
study_lgbm = optuna.create_study(
    study_name=study_name,
    direction='maximize'
)

# รัน optimization
study_lgbm.optimize(objective_lgbm, n_trials=100)
print("Study complete!")
print(f"Total number of trials in study: {len(study_lgbm.trials)}")

# --- Step 3: Get Best Params ---
print("\nBest trial:")
print(f"  Value (Mean F1): {study_lgbm.best_value:.4f}")
print("  Best Params: ")
print(study_lgbm.best_params)

# เก็บพารามิเตอร์ที่ดีที่สุด
best_params = study_lgbm.best_params
print(f"\nSuccessfully stored parameters in 'best_params' variable.")

# --- Step 4: Train the FINAL Model ---
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

final_lgbm = lgbm.LGBMClassifier(
    random_state=42,
    n_estimators=1000,
    **best_params
)

final_lgbm.fit(
    X_train_final, 
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    eval_metric='logloss',
    callbacks=[lgbm.early_stopping(50, verbose=False)]
)

# --- Step 5: Evaluate the Optuna-Tuned Model on the TEST set ---
y_pred_optuna_lgbm = final_lgbm.predict(X_test)

print("\nOptuna-Tuned LightGBM Classification Report:")
print(classification_report(y_test, y_pred_optuna_lgbm, digits=6))

[I 2025-10-22 06:40:26,329] A new study created in memory with name: lgbm_salary_tuning


Starting Optuna study: 'lgbm_salary_tuning'
Running 200 new trials...


[I 2025-10-22 06:40:26,848] Trial 0 finished with value: 0.7905935909105978 and parameters: {'learning_rate': 0.13087099233680682, 'num_leaves': 24, 'max_depth': 6, 'min_child_samples': 9, 'subsample': 0.9340840184473755, 'colsample_bytree': 0.8102533462424166}. Best is trial 0 with value: 0.7905935909105978.
[I 2025-10-22 06:40:29,930] Trial 1 finished with value: 0.7875580630145745 and parameters: {'learning_rate': 0.016043146631172546, 'num_leaves': 44, 'max_depth': 3, 'min_child_samples': 16, 'subsample': 0.819539102780078, 'colsample_bytree': 0.6909344043303839}. Best is trial 0 with value: 0.7905935909105978.
[I 2025-10-22 06:40:34,878] Trial 2 finished with value: 0.791800765921803 and parameters: {'learning_rate': 0.010205717272192534, 'num_leaves': 20, 'max_depth': 6, 'min_child_samples': 11, 'subsample': 0.7938575279368162, 'colsample_bytree': 0.8247383883688079}. Best is trial 2 with value: 0.791800765921803.
[I 2025-10-22 06:40:37,743] Trial 3 finished with value: 0.7904615

Study complete!
Total number of trials in study: 100

Best trial:
  Value (Mean F1): 0.7931
  Best Params: 
{'learning_rate': 0.06883376056802966, 'num_leaves': 46, 'max_depth': 6, 'min_child_samples': 14, 'subsample': 0.9091917489759059, 'colsample_bytree': 0.6281804943827396}

Successfully stored parameters in 'best_params' variable.

Optuna-Tuned LightGBM Classification Report:
              precision    recall  f1-score   support

         0.0   0.858294  0.837334  0.847685      2416
         1.0   0.784421  0.810658  0.797324      1764

    accuracy                       0.826077      4180
   macro avg   0.821358  0.823996  0.822504      4180
weighted avg   0.827119  0.826077  0.826432      4180



In [21]:
print(best_params)

{'learning_rate': 0.06883376056802966, 'num_leaves': 46, 'max_depth': 6, 'min_child_samples': 14, 'subsample': 0.9091917489759059, 'colsample_bytree': 0.6281804943827396}


### ClassWeight

In [22]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report

# --- 1. Load Data (to calculate the weight) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

# --- 2. Calculate scale_pos_weight ---
# This is the same logic as your XGBoost code
scale_pos_weight = len(y_full[y_full == 0]) / len(y_full[y_full == 1])
print(f"Calculated scale_pos_weight: {scale_pos_weight:.4f}")

# --- 3. Define Your Best Parameters ---
# These are the params you provided in your prompt
best_lgbm_params = best_params

# --- 4. Create and Train the Model ---
print("\nTraining LightGBM model with class weighting...")

lgbm_model_final = lgb.LGBMClassifier(
    **best_lgbm_params,       # Apply all your tuned parameters
    scale_pos_weight=scale_pos_weight, # 👈 Here is the class weight
    random_state=42,
    n_jobs=-1
)

# Train the model on the full training dataset
lgbm_model_final.fit(X_full, y_full)

print("Model training complete.")

# --- 5. Evaluate on Test Data (Recommended) ---
print("\nEvaluating model on test data...")

data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
data_test_lgbm['prediction'] = lgbm_model_final.predict(data_test_lgbm.drop(['label'], axis='columns'))

# Print the report
report_scores_lgbm = sklearn.metrics.classification_report(
    y_true=data_test_lgbm['label'],
    y_pred=data_test_lgbm['prediction'],
    digits=6,
    output_dict=True
)
df_score_lgbm = pandas.DataFrame(report_scores_lgbm).transpose()

print("\nLightGBM (Tuned + Weighted) Report:")
print(df_score_lgbm)

Calculated scale_pos_weight: 1.3882

Training LightGBM model with class weighting...
Model training complete.

Evaluating model on test data...

LightGBM (Tuned + Weighted) Report:
              precision    recall  f1-score      support
0.0            0.887134  0.790563  0.836069  2416.000000
1.0            0.750370  0.862245  0.802427  1764.000000
accuracy       0.820813  0.820813  0.820813     0.820813
macro avg      0.818752  0.826404  0.819248  4180.000000
weighted avg   0.829418  0.820813  0.821872  4180.000000


### SMOTE

In [23]:
import json
import joblib
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE  # 👈 1. Import SMOTE

# --- 1. Load Data (Needed for training) ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"Original training data shape: {X_full.shape}")
print(f"Original label distribution:\n{y_full.value_counts()}")

# --- 2. Define Your Best Parameters ---
# These are the params you provided
best_lgbm_params = best_params

# --- 3. Apply SMOTE to the Training Data ---
print("\nApplying SMOTE to the training data...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)

print(f"New resampled training data shape: {X_resampled.shape}")
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- 4. Create and Train the Final Model (with NO class_weight) ---
print("\nTraining final LGBM model on SMOTEd data...")

lgbm_model_final = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight' here
    random_state=42,
    n_jobs=-1
)

# 5. Train the model on the NEW resampled data
lgbm_model_final.fit(X_resampled, y_resampled)
print("Model training complete.")

# --- 6. Evaluate on ORIGINAL Test Data ---
print("\nEvaluating model on *original* test data...")

data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')

# IMPORTANT: Do NOT apply SMOTE to the test data.
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

data_test_lgbm['prediction'] = lgbm_model_final.predict(X_test)

# Print the report
report_scores_lgbm = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=data_test_lgbm['prediction'],
    digits=6,
    output_dict=True
)
df_score_lgbm = pandas.DataFrame(report_scores_lgbm).transpose()

print("\nLightGBM (Tuned + SMOTE) Report:")
print(df_score_lgbm)
joblib.dump(lgbm_model_final, './model/lgbm/lgbm_model_final_smote.pkl')
with open('./model/lgbm/lgbm_config.json','w')as f:
    json.dump(
        obj=lgbm_model_final.get_params(),
        fp=f,
        indent = 4
    )

Original training data shape: (16720, 56)
Original label distribution:
label
0.0    9719
1.0    7001
Name: count, dtype: int64

Applying SMOTE to the training data...
New resampled training data shape: (19438, 56)
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64

Training final LGBM model on SMOTEd data...
Model training complete.

Evaluating model on *original* test data...

LightGBM (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.876792  0.810017  0.842083  2416.000000
1.0            0.764374  0.844104  0.802263  1764.000000
accuracy       0.824402  0.824402  0.824402     0.824402
macro avg      0.820583  0.827060  0.822173  4180.000000
weighted avg   0.829350  0.824402  0.825278  4180.000000


### SMOTETomek

In [24]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing LightGBM with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lgbm_params = best_params

# --- Create and Train Model ---
lgbm_model = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight'
    random_state=42,
    n_jobs=-1
)

lgbm_model.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = lgbm_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLightGBM (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing LightGBM with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    9220
0.0    9220
Name: count, dtype: int64
Model training complete.

LightGBM (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.876118  0.810844  0.842218  2416.000000
1.0            0.764918  0.842971  0.802050  1764.000000
accuracy       0.824402  0.824402  0.824402     0.824402
macro avg      0.820518  0.826907  0.822134  4180.000000
weighted avg   0.829190  0.824402  0.825267  4180.000000


### ADASYN

In [25]:
import pandas
import lightgbm as lgb
import sklearn.metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing LightGBM with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lgbm = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lgbm.drop(['label'], axis='columns')
y_test = data_test_lgbm['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lgbm_params = best_params

# --- Create and Train Model ---
lgbm_model = lgb.LGBMClassifier(
    **best_lgbm_params,
    # ⚠️ NO 'scale_pos_weight' or 'class_weight'
    random_state=42,
    n_jobs=-1
)

lgbm_model.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = lgbm_model.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLightGBM (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing LightGBM with ADASYN ---
Applying ADASYN...
New resampled label distribution:
label
0.0    9719
1.0    9698
Name: count, dtype: int64
Model training complete.

LightGBM (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.886070  0.795116  0.838133  2416.000000
1.0            0.753976  0.859977  0.803496  1764.000000
accuracy       0.822488  0.822488  0.822488     0.822488
macro avg      0.820023  0.827547  0.820814  4180.000000
weighted avg   0.830325  0.822488  0.823516  4180.000000
