In [4]:
import pandas
import optuna
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

In [5]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


### Find-tune using Optuna

- Load data

In [6]:
data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
data_test_rf = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

- seperate the data


In [7]:
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

In [8]:
data_train_full['label'].value_counts()

label
0.0    9719
1.0    7001
Name: count, dtype: int64

---

### Setting optuna
Given the imbalanced class distribution, we will utilize `class_weight` to mitigate model bias. 

- Base Model Configuration
```
`class_weight = balanced`,
`random_state = 42` ,
`n_jobs = -1`
```
- Hyper parameters
```
'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
'max_depth': trial.suggest_int('max_depth', 3, 15),
'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
'max_features': trial.suggest_float('max_features', 0.1, 1.0) 
```
We'll use 3-fold cross-Validation and return the averege F1-Weight Score

In [9]:
rf_class_weight = 'balanced' 
def objective(trial):    
    param_rf = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0) 
    }
    

    model_rf = RandomForestClassifier(
        **param_rf,
        class_weight=rf_class_weight, 
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(
        model_rf, 
        X_full, 
        y_full, 
        cv=3,                 
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    f1_avg = np.mean(score)
    return f1_avg

print("\nFinding parameters with Optuna...")


pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)


study.optimize(
    objective, 
    n_trials=50,  
    show_progress_bar=True 
)


best_parameter = study.best_params 
print("\n--- Optuna Finished! ---")


print("Best Parameters:")
print(study.best_params)

print(f"\nBest F1-Weighted (From CV): {study.best_value:.6f}")


[I 2025-10-31 13:05:32,620] A new study created in memory with name: no-name-85c17bba-c92d-4f36-8d50-63039ce57eac



Finding parameters with Optuna...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-31 13:05:39,520] Trial 0 finished with value: 0.8063374765161333 and parameters: {'n_estimators': 800, 'max_depth': 9, 'min_samples_leaf': 8, 'min_samples_split': 4, 'criterion': 'entropy', 'max_features': 0.18996872197712816}. Best is trial 0 with value: 0.8063374765161333.
[I 2025-10-31 13:05:50,934] Trial 1 finished with value: 0.8154117756184774 and parameters: {'n_estimators': 700, 'max_depth': 13, 'min_samples_leaf': 4, 'min_samples_split': 13, 'criterion': 'gini', 'max_features': 0.8500353532736623}. Best is trial 1 with value: 0.8154117756184774.
[I 2025-10-31 13:05:53,554] Trial 2 finished with value: 0.8149649015941388 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_leaf': 7, 'min_samples_split': 10, 'criterion': 'gini', 'max_features': 0.5104825877763798}. Best is trial 1 with value: 0.8154117756184774.
[I 2025-10-31 13:06:00,892] Trial 3 finished with value: 0.8147831749789254 and parameters: {'n_estimators': 850, 'max_depth': 10, 'min_samples

---

### Train simple model with best parameter

In [10]:
rf_model_final = RandomForestClassifier(
    **best_parameter,   
    class_weight=rf_class_weight,
    random_state=42,
    n_jobs=-1
)

In [11]:
rf_model_final.fit(X_full, y_full)
data_test_rf['prediction'] = rf_model_final.predict(data_test_rf.drop(['label'], axis='columns'))

In [12]:
print("\nRandom Forest (Optuna-Tuned) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction']
))

report_scores_rf = sklearn.metrics.classification_report(
    y_true=data_test_rf['label'],
    y_pred=data_test_rf['prediction'],
    digits=6,
    output_dict=True
)
df_score_rf = pandas.DataFrame(report_scores_rf).transpose()
print("\nRandom Forest (Optuna-Tuned) Report:")
print(df_score_rf)


Random Forest (Optuna-Tuned) Confusion Matrix:
[[1910  506]
 [ 248 1516]]

Random Forest (Optuna-Tuned) Report:
              precision    recall  f1-score      support
0.0            0.885079  0.790563  0.835155  2416.000000
1.0            0.749753  0.859410  0.800845  1764.000000
accuracy       0.819617  0.819617  0.819617     0.819617
macro avg      0.817416  0.824987  0.818000  4180.000000
weighted avg   0.827970  0.819617  0.820676  4180.000000


Best parameter

In [13]:
best_parameter = study.best_params
best_parameter

{'n_estimators': 350,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'criterion': 'entropy',
 'max_features': 0.475336315954348}

### Best Params
```
'n_estimators': 350,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'criterion': 'entropy',
 'max_features': 0.475336315954348
 ```