In [1]:
import pandas
import optuna
import sklearn.metrics
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np
import joblib
import json

In [2]:
pip install optuna




In [3]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


### Find-tune using Optuna

- Load data

In [4]:
train_df = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
test_df = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

- seperate the data


In [5]:
X_train = train_df.drop(['label'], axis='columns')
y_train = train_df['label']

In [6]:
train_df['label'].value_counts()

label
0.0    9719
1.0    7001
Name: count, dtype: int64

### Setting optuna
Given the imbalanced class distribution, we will utilize `class_weight` to mitigate model bias. 

- Base Model Configuration
```
random_state=42,
n_estimators=1000,
```
- Hyper parameters
```
'objective': 'binary',
'metric': 'binary_logloss',
'random_state': 42,
'n_estimators': 1000,
'verbose': -1,
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
'num_leaves': trial.suggest_int('num_leaves', 10, 50),
'max_depth': trial.suggest_int('max_depth', 3, 10),
'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
'subsample': trial.suggest_float('subsample', 0.6, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
```
We'll use 3-fold cross-Validation and return the averege F1-Weight Score

In [7]:
warnings.filterwarnings("ignore", category=UserWarning)
def objective_lgbm(trial):
    """
    Objective function with manual Cross-Validation and Early Stopping.
    """
    # 1. Define the search space for LightGBM
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'n_estimators': 1000,
        'verbose': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    
    # 2. Set up 3-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # 3. Manually run the CV loop
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgbm.LGBMClassifier(**param)
        
        # Train with early stopping
        model.fit(
            X_train_fold, 
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='logloss',
            callbacks=[lgbm.early_stopping(50, verbose=False)]
        )
        
        preds = model.predict(X_val_fold)
        f1 = sklearn.metrics.f1_score(y_val_fold, preds, average='binary')
        scores.append(f1)
    
    return np.mean(scores)

# --- Step 2: Create and Run the Study ---
study_name = "lgbm_salary_tuning"  # ตั้งชื่อโปรเจกต์ (แต่ไม่บันทึกลง storage)

print(f"Starting Optuna study: '{study_name}'")
print(f"Running 200 new trials...")

# สร้าง Study ใหม่โดยไม่ใช้ storage
study_lgbm = optuna.create_study(
    study_name=study_name,
    direction='maximize'
)

# รัน optimization
study_lgbm.optimize(objective_lgbm, n_trials=200)
print("Study complete!")
print(f"Total number of trials in study: {len(study_lgbm.trials)}")

# --- Step 3: Get Best Params ---
print("\nBest trial:")
print(f"  Value (Mean F1): {study_lgbm.best_value:.4f}")
print("  Best Params: ")
print(study_lgbm.best_params)
best_params = study_lgbm.best_params

[I 2025-11-06 16:20:16,117] A new study created in memory with name: lgbm_salary_tuning


Starting Optuna study: 'lgbm_salary_tuning'
Running 200 new trials...


  File "c:\Users\natth\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\natth\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[I 2025-11-06 16:20:16,417] Trial 0 finished with value: 0.7903771516829877 and parameters: {'learning_rate': 0.1989276291038046, 'num_leaves': 25, 'max_depth': 7, 'min_child_samples': 6, 'subsample': 0.690317106089212, 'colsample_bytree': 0.8882484555504684}. Best is trial 0 with value: 0.7903771516829

Study complete!
Total number of trials in study: 200

Best trial:
  Value (Mean F1): 0.7944
  Best Params: 
{'learning_rate': 0.014528796765418418, 'num_leaves': 48, 'max_depth': 7, 'min_child_samples': 18, 'subsample': 0.9881082263770181, 'colsample_bytree': 0.7270211902942715}


In [8]:
best_parameter = study_lgbm.best_params
best_parameter

{'learning_rate': 0.014528796765418418,
 'num_leaves': 48,
 'max_depth': 7,
 'min_child_samples': 18,
 'subsample': 0.9881082263770181,
 'colsample_bytree': 0.7270211902942715}

### Best parameter

```
'learning_rate': 0.014528796765418418,
'num_leaves': 48,
'max_depth': 7,
'min_child_samples': 18,
'subsample': 0.9881082263770181,
'colsample_bytree': 0.7270211902942715
```

---

### Train simple model with best parameter

In [9]:
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [10]:
base_lgbm = lgbm.LGBMClassifier(
    random_state=42,
    n_estimators=1000,
    **study_lgbm.best_params
)

In [11]:
base_lgbm.fit(
    X_train_final, 
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    eval_metric='logloss',
    callbacks=[lgbm.early_stopping(50, verbose=False)]
)

### Evaluate 

In [12]:
# X_test = test_df.drop('label', axis=1)
# y_test = test_df['label']

In [13]:
data_test_lgbm = test_df
data_test_lgbm['prediction'] = base_lgbm.predict(data_test_lgbm.drop(['label'],axis='columns'))


In [14]:
data_test_lgbm.head(10)

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
10595,2.0,13.0,5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9962,3.0,13.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4001,4.0,9.0,5,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7002,3.0,5.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23986,2.0,15.0,5,0.0,0.0,2.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [15]:
sklearn.metrics.confusion_matrix(
    y_true=data_test_lgbm['label'],
    y_pred=data_test_lgbm['prediction'],
)

array([[2012,  404],
       [ 332, 1432]], dtype=int64)

In [16]:
report_scores_Baselgbm = sklearn.metrics.classification_report(
    y_true=data_test_lgbm['label'],
    y_pred=data_test_lgbm['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_Baselgbm).transpose()
df_score_rf

Unnamed: 0,precision,recall,f1-score,support
0.0,0.858362,0.832781,0.845378,2416.0
1.0,0.779956,0.811791,0.795556,1764.0
accuracy,0.823923,0.823923,0.823923,0.823923
macro avg,0.819159,0.822286,0.820467,4180.0
weighted avg,0.825274,0.823923,0.824353,4180.0


In [17]:
data_test_lgbm.to_csv('./results/predictions.rf.classWeight-feture-selection.csv')
joblib.dump(
    value = base_lgbm,
    filename = './model/lgbm&FindTune.joblib'
)
with open('./results/scores.lgbm&FindTune.json','w')as f:
    json.dump(
        obj=report_scores_Baselgbm,
        fp=f,
        indent = 4
    )

---