In [2]:
import pandas
import optuna
import sklearn.metrics
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np
import joblib
import json

In [3]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install lightgbm


Note: you may need to restart the kernel to use updated packages.


### Find-tune using Optuna

- Load data

In [5]:
train_df = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
test_df = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

- seperate the data


In [6]:
X_train = train_df.drop(['label'], axis='columns')
y_train = train_df['label']

# Test data
X_test = test_df.drop(['label'], axis='columns')
y_test = test_df['label']

In [7]:
train_df['label'].value_counts()

label
0.0    9719
1.0    7001
Name: count, dtype: int64

### Setting optuna
Given the imbalanced class distribution, we will utilize `class_weight` to mitigate model bias. 

- Base Model Configuration
```
random_state=42,
n_estimators=1000,
```
- Hyper parameters
```
'objective': 'binary',
'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
'num_leaves': trial.suggest_int('num_leaves', 20, 100),
'max_depth': trial.suggest_int('max_depth', 5, 30),
'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
'subsample': trial.suggest_float('subsample', 0.5, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
'random_state': 42,
'n_jobs': -1
```
We'll use 10-fold cross-Validation and return the averege F1-Weight Score

- Split CV & Thresholding

In [8]:
X_train_cv, X_val_thresh, y_train_cv, y_val_thresh = train_test_split(
    X_train, y_train, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_train
)
print(f"Optuna CV set: {X_train_cv.shape[0]} samples")
print(f"Threshold-finding set: {X_val_thresh.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Optuna CV set: 13376 samples
Threshold-finding set: 3344 samples
Test set: 4180 samples


- Calculate scale_pos_weight

In [9]:
label_counts =y_train.value_counts()
scale_pos_weight = label_counts[0] / label_counts[1]
print(f"Calculated scale_pos_weight: {scale_pos_weight:.4f}")

Calculated scale_pos_weight: 1.3882


- Optuna

In [10]:
def objective(trial):
    params = {
        'objective': 'binary',
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 60),        # LOWER: fixes overfitting
        'max_depth': trial.suggest_int('max_depth', 3, 12),           # LOWER: fixes -inf warnings
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10, log=True),  # NEW: more reg
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.5),                # NEW: prevents bad splits
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 3.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1  # NEW: quieter logs
    }
    max_n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    cv_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    f1_scores = []
    for fold, (train_idx, val_idx) in enumerate(cv_folds.split(X_train_cv, y_train_cv)):
        X_train_fold, X_val_fold = X_train_cv.iloc[train_idx], X_train_cv.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_cv.iloc[train_idx], y_train_cv.iloc[val_idx]
        model = lgbm.LGBMClassifier(**params, n_estimators=max_n_estimators)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='auc',
            callbacks=[lgbm.early_stopping(20, verbose=False)]
        )
        y_val_probs = model.predict_proba(X_val_fold)[:, 1]
        # FIXED: Fast + optimal threshold via PR curve (no more slow loops!)
        from sklearn.metrics import precision_recall_curve
        prec, rec, thr = precision_recall_curve(y_val_fold, y_val_probs)
        f1s = 2 * prec * rec / (prec + rec + 1e-8)
        best_idx = np.argmax(f1s)
        best_f1 = f1s[best_idx]
        best_thresh = thr[best_idx] if best_idx < len(thr) else 0.5
        f1_scores.append(best_f1)
        trial.set_user_attr(f'fold_{fold}_threshold', best_thresh)
    mean_f1 = np.mean(f1_scores)
    trial.set_user_attr('best_n_estimators', model.best_iteration_)
    return mean_f1

In [11]:
print("\n--- Starting Optuna optimization (this may take time)... ---")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Optimization finished.")
print(f"Best trial (Mean CV F1): {study.best_value:.4f}")

[I 2025-11-08 14:33:52,902] A new study created in memory with name: no-name-27fc2573-4f27-46b3-9243-e5e14c737a2e



--- Starting Optuna optimization (this may take time)... ---


[I 2025-11-08 14:33:55,535] Trial 0 finished with value: 0.7941347419769651 and parameters: {'learning_rate': 0.005737568605428323, 'num_leaves': 60, 'max_depth': 9, 'min_child_weight': 0.4714872597767069, 'min_split_gain': 0.3116874526920898, 'reg_alpha': 1.766535813085442, 'reg_lambda': 0.009795327959256575, 'subsample': 0.5067130058593013, 'colsample_bytree': 0.6561639425552914, 'min_child_samples': 69, 'scale_pos_weight': 1.5701315466775592, 'n_estimators': 339}. Best is trial 0 with value: 0.7941347419769651.
[I 2025-11-08 14:34:03,119] Trial 1 finished with value: 0.7908230015003113 and parameters: {'learning_rate': 0.0038426937333743906, 'num_leaves': 32, 'max_depth': 10, 'min_child_weight': 0.7377819516101314, 'min_split_gain': 0.21974053671713506, 'reg_alpha': 7.282181726284236, 'reg_lambda': 0.009702191217050833, 'subsample': 0.6374220375330768, 'colsample_bytree': 0.8947739573073413, 'min_child_samples': 16, 'scale_pos_weight': 1.9778096112888812, 'n_estimators': 921}. Best 

Optimization finished.
Best trial (Mean CV F1): 0.8053


- Best parameter

In [12]:
best_parameter = study.best_params
best_n_estimators = study.best_trial.user_attrs['best_n_estimators']

print(f"Found Optimal n_estimators: {best_n_estimators}")
print("Found Best Hyperparameters:")
for key, value in best_parameter.items():
    if key != 'n_estimators':
        print(f"  {key}: {value}")

Found Optimal n_estimators: 18
Found Best Hyperparameters:
  learning_rate: 0.2234996484380029
  num_leaves: 37
  max_depth: 7
  min_child_weight: 0.0024055894077327886
  min_split_gain: 0.15010467481568562
  reg_alpha: 4.5988127050256843e-07
  reg_lambda: 1.9833430894127376e-06
  subsample: 0.6682611578648369
  colsample_bytree: 0.8355051824482783
  min_child_samples: 72
  scale_pos_weight: 1.4137273387602047


### Best parameter

```
    best_lgbm_params = {
    'n_estimators': 102,
    'learning_rate': 0.2699,
    'num_leaves': 22,
    'max_depth': 5,
    'reg_alpha': 8.7e-06,
    'reg_lambda': 3.58,
    'subsample': 0.986,
    'colsample_bytree': 0.755,
    # ADD THESE 2 LINES ONLY
    'min_child_samples': 1,      
    'min_split_gain': 0.0,       
    'verbosity': -1              
}
```

---

### Train simple model with best parameter

- Best parameter

In [24]:
base_lgbm = {
    'n_estimators': 102,
    'learning_rate': 0.2699,
    'num_leaves': 22,
    'max_depth': 5,
    'reg_alpha': 8.7e-06,
    'reg_lambda': 3.58,
    'subsample': 0.986,
    'colsample_bytree': 0.755,
    # ADD THESE 2 LINES ONLY
    'min_child_samples': 1,      # ← was 20 → kills splits
    'min_split_gain': 0.0,       # ← allow any tiny gain
    'verbosity': -1              # ← silence warnings
}

In [3]:
final_model = lgbm.LGBMClassifier(**base_lgbm)
final_model.fit(X_train, y_train)

NameError: name 'base_lgbm' is not defined

- Find best treshold

In [15]:
print("Finding optimal decision threshold (for F1) on held-out validation set...")
y_val_probs = final_model.predict_proba(X_val_thresh)[:, 1]
thresholds = np.arange(0.35, 0.50, 0.001)
best_f1 = 0
best_threshold = 0.5

for thresh in thresholds:
    preds = (y_val_probs >= thresh).astype(int)
    f1 = f1_score(y_val_thresh, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh

print(f"Best Threshold for F1: {best_threshold:.4f} (with F1: {best_f1:.4f} on validation data)")

Finding optimal decision threshold (for F1) on held-out validation set...
Best Threshold for F1: 0.3760 (with F1: 0.8219 on validation data)


In [4]:
best_threshold = 0.4675

### Evaluate 

- Calculate and add prediction column

In [16]:
final_model.predict(
    X = train_df.drop(['label'],axis = 'columns')
)

array([1., 1., 0., ..., 0., 1., 0.])

In [17]:
test_df

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_own-child,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
12348,2.0,13.0,5,4.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22298,3.0,13.0,5,0.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
15636,1.0,10.0,5,3.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [18]:
test_df['prediction'] = final_model.predict(
    X = test_df.drop(columns= ['label'])
)


In [19]:
test_df.head(10)

Unnamed: 0_level_0,age-group,education-num,native-country,capitalgain,capitalloss,hoursperweek,occupation_prof-specialty,occupation_tech-support,occupation_exec-managerial,occupation_machine-op-inspct,...,relationship_unmarried,race_white,race_amer-indian-eskimo,race_black,race_asian-pac-islander,race_other,sex_male,sex_female,label,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,1.0,13.0,5,2.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1460,0.0,1.0,3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13594,2.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
14400,3.0,9.0,5,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14333,2.0,9.0,5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
10595,2.0,13.0,5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9962,3.0,13.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4001,4.0,9.0,5,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
7002,3.0,5.0,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
23986,2.0,15.0,5,0.0,0.0,2.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


- Confusion metric

In [20]:
sklearn.metrics.confusion_matrix(
    y_true=test_df['label'],
    y_pred=test_df['prediction'],
)

array([[2041,  375],
       [ 331, 1433]], dtype=int64)

In [21]:
report_scores_Baselgbm = sklearn.metrics.classification_report(
    y_true=test_df['label'],
    y_pred=test_df['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_Baselgbm).transpose()
df_score_rf

Unnamed: 0,precision,recall,f1-score,support
0.0,0.860455,0.844785,0.852548,2416.0
1.0,0.792588,0.812358,0.802352,1764.0
accuracy,0.8311,0.8311,0.8311,0.8311
macro avg,0.826522,0.828572,0.82745,4180.0
weighted avg,0.831815,0.8311,0.831365,4180.0


In [22]:
test_df.to_csv('./results/predictions.base_lgbm_find_tune.csv')
joblib.dump(
    value = final_model,
    filename = './model/lgbm&FindTune.joblib'
)
with open('./results/scores.lgbm&FindTune.json','w')as f:
    json.dump(
        obj=report_scores_Baselgbm,
        fp=f,
        indent = 4
    )

---