In [1]:
import pandas
import optuna
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

In [2]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


### Find-tune using Optuna

- Load data

In [3]:
data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
data_test_rf = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

- seperate the data


In [4]:
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

In [5]:
data_train_full['label'].value_counts()

label
0.0    9719
1.0    7001
Name: count, dtype: int64

---

### Setting optuna
Given the imbalanced class distribution, we will utilize `class_weight` to mitigate model bias. 

- Base Model Configuration
```
`class_weight = balanced`,
`random_state = 42` ,
`n_jobs = -1`
```
- Hyper parameters
```
'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
'max_depth': trial.suggest_int('max_depth', 3, 15),
'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
'max_features': trial.suggest_float('max_features', 0.1, 1.0) 
```
We'll use 3-fold cross-Validation and return the averege F1-Weight Score

In [6]:
rf_class_weight = 'balanced' 
def objective(trial):    
    param_rf = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0) 
    }
    

    model_rf = RandomForestClassifier(
        **param_rf,
        class_weight=rf_class_weight, 
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(
        model_rf, 
        X_full, 
        y_full, 
        cv=10,                 
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    f1_avg = np.mean(score)
    return f1_avg

print("\nFinding parameters with Optuna...")


pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)


study.optimize(
    objective, 
    n_trials=50,  
    show_progress_bar=True 
)


best_parameter = study.best_params 
print("\n--- Optuna Finished! ---")


print("Best Parameters:")
print(study.best_params)

print(f"\nBest F1-Weighted (From CV): {study.best_value:.6f}")


[I 2025-11-07 13:43:14,088] A new study created in memory with name: no-name-855fdc4e-3b5d-402b-b6c5-d11835956f10



Finding parameters with Optuna...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-07 13:43:25,858] Trial 0 finished with value: 0.760174516636997 and parameters: {'n_estimators': 250, 'max_depth': 4, 'min_samples_leaf': 9, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 0.18061571501102475}. Best is trial 0 with value: 0.760174516636997.
[I 2025-11-07 13:43:59,546] Trial 1 finished with value: 0.8149581784005939 and parameters: {'n_estimators': 700, 'max_depth': 11, 'min_samples_leaf': 6, 'min_samples_split': 8, 'criterion': 'entropy', 'max_features': 0.5628163110642745}. Best is trial 1 with value: 0.8149581784005939.
[I 2025-11-07 13:44:29,810] Trial 2 finished with value: 0.810737583162244 and parameters: {'n_estimators': 600, 'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 9, 'criterion': 'entropy', 'max_features': 0.8029647105941743}. Best is trial 1 with value: 0.8149581784005939.
[I 2025-11-07 13:45:17,043] Trial 3 finished with value: 0.814146399512784 and parameters: {'n_estimators': 950, 'max_depth': 8, 'min_samples_lea

---

### Train simple model with best parameter

In [11]:
rf_model_final = RandomForestClassifier(
    **best_parameter,   
    class_weight=rf_class_weight,
    random_state=42,
    n_jobs=-1
)

In [12]:
rf_model_final.fit(X_full, y_full)
data_test_rf['prediction'] = rf_model_final.predict(data_test_rf.drop(['label'], axis='columns'))

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- prediction


In [None]:
print("\nRandom Forest (Optuna-Tuned) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction']
))

report_scores_rf = sklearn.metrics.classification_report(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()
print("\nRandom Forest (Optuna-Tuned) Report:")
print(df_score_rf)


Random Forest (Optuna-Tuned) Confusion Matrix:
[[1910  506]
 [ 232 1532]]

Random Forest (Optuna-Tuned) Report:
              precision    recall  f1-score      support
0.0            0.891690  0.790563  0.838087  2416.000000
1.0            0.751717  0.868481  0.805892  1764.000000
accuracy       0.823445  0.823445  0.823445     0.823445
macro avg      0.821704  0.829522  0.821989  4180.000000
weighted avg   0.832620  0.823445  0.824500  4180.000000


Best parameter

In [10]:
best_parameter = study.best_params
best_parameter

{'n_estimators': 800,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'criterion': 'entropy',
 'max_features': 0.27802443984172714}

### Best Params
```
'n_estimators': 800,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'criterion': 'entropy',
 'max_features': 0.27802443984172714
 ```