In [13]:
import json
from pathlib import Path

import joblib
import numpy as np
import optuna
import pandas
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [14]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


### Fine-tune using Optuna

- Load data

In [15]:
# Read the processed train/test CSV files and index both frames by their `id` column.
train_csv_path = '../data/salary.train.processed.csv'
test_csv_path = '../data/salary.test.processed.csv'

data_train_full = pandas.read_csv(train_csv_path).set_index('id')
data_test_rf = pandas.read_csv(test_csv_path).set_index('id')

- Separate the data into features and label


In [16]:
# Target is the `label` column; every other column of the training frame is a feature.
target_column = 'label'
y_full = data_train_full[target_column]
X_full = data_train_full.drop(columns=[target_column])

In [17]:
# Number of training rows per value of the `label` column.
label_counts = data_train_full['label'].value_counts()
label_counts

label
0.0    9719
1.0    7001
Name: count, dtype: int64

---

### Setting optuna
Given the imbalanced class distribution, we will utilize `class_weight` to mitigate model bias. 

- Base Model Configuration
```
class_weight = 'balanced',
random_state = 42,
n_jobs = -1
```
- Hyper parameters
```
'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
'max_depth': trial.suggest_int('max_depth', 3, 15),
'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
'max_features': trial.suggest_float('max_features', 0.1, 1.0) 
```
We'll use 10-fold cross-validation (`cv=10`) and return the average weighted F1 score (`scoring='f1_weighted'`).

In [18]:
rf_class_weight = 'balanced'


def objective(trial):
    """Optuna objective: mean weighted F1 of a random forest over 10 CV folds.

    The hyper-parameters are sampled from ``trial``; ``class_weight``,
    ``random_state`` and ``n_jobs`` are fixed. The folds are evaluated one
    after another so that the running mean can be reported to Optuna after
    each fold; without ``trial.report`` / ``trial.should_prune`` a pruner
    attached to the study never has anything to act on.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters and to report progress.

    Returns
    -------
    float
        Mean ``f1_weighted`` score over all folds on ``X_full`` / ``y_full``.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides the trial should stop early.
    """
    param_rf = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 14),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    # The forest is the only parallel layer (n_jobs=-1); the folds run
    # sequentially, which avoids nesting one worker pool inside another.
    model_rf = RandomForestClassifier(
        **param_rf,
        class_weight=rf_class_weight,
        random_state=42,
        n_jobs=-1,
    )

    # Unshuffled stratified 10-fold split: the splitter that cv=10 resolves to
    # for a classifier in cross_val_score.
    splitter = StratifiedKFold(n_splits=10)

    fold_scores = []
    for step, (train_idx, valid_idx) in enumerate(splitter.split(X_full, y_full)):
        # fit() retrains from scratch on each call (warm_start is not enabled).
        model_rf.fit(X_full.iloc[train_idx], y_full.iloc[train_idx])
        fold_prediction = model_rf.predict(X_full.iloc[valid_idx])
        fold_scores.append(
            sklearn.metrics.f1_score(
                y_full.iloc[valid_idx],
                fold_prediction,
                average='weighted',
            )
        )

        # Report the running mean so the pruner can compare this trial with
        # earlier trials at the same fold index.
        trial.report(float(np.mean(fold_scores)), step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    f1_avg = float(np.mean(fold_scores))
    return f1_avg

print("\nFinding parameters with Optuna...")

# Seed the sampler so that re-running this cell proposes the same sequence of
# trials; with the default unseeded sampler every run can end on different
# "best" parameters. TPESampler is the sampler Optuna uses by default.
sampler = optuna.samplers.TPESampler(seed=42)

# NOTE: MedianPruner only acts on trials whose objective reports intermediate
# values through trial.report() and checks trial.should_prune().
pruner = optuna.pruners.MedianPruner()

study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    pruner=pruner,
)

study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

# Keep the winning hyper-parameters for the final model trained further below.
best_parameter = study.best_params
print("\n--- Optuna Finished! ---")

print("Best Parameters:")
print(best_parameter)

print(f"\nBest F1-Weighted (From CV): {study.best_value:.6f}")


Finding parameters with Optuna...


[I 2025-11-07 14:11:51,006] A new study created in memory with name: no-name-5d96e936-1718-46c8-b9f1-04350fae8362


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-07 14:12:01,176] Trial 0 finished with value: 0.8146787877699353 and parameters: {'n_estimators': 150, 'max_depth': 11, 'min_samples_leaf': 4, 'min_samples_split': 7, 'criterion': 'entropy', 'max_features': 0.21335167593226934}. Best is trial 0 with value: 0.8146787877699353.
[I 2025-11-07 14:12:11,953] Trial 1 finished with value: 0.8063896098894281 and parameters: {'n_estimators': 250, 'max_depth': 7, 'min_samples_leaf': 6, 'min_samples_split': 8, 'criterion': 'entropy', 'max_features': 0.9309389902825984}. Best is trial 0 with value: 0.8146787877699353.
[I 2025-11-07 14:12:14,136] Trial 2 finished with value: 0.7694430856997504 and parameters: {'n_estimators': 150, 'max_depth': 5, 'min_samples_leaf': 9, 'min_samples_split': 9, 'criterion': 'gini', 'max_features': 0.1832598728992213}. Best is trial 0 with value: 0.8146787877699353.
[I 2025-11-07 14:12:48,672] Trial 3 finished with value: 0.8143524160783547 and parameters: {'n_estimators': 700, 'max_depth': 14, 'min_samples

---

### Train simple model with best parameters

In [19]:
# Final estimator: the fixed settings used during the search (class weighting,
# seed, all cores) combined with the hyper-parameters stored in `best_parameter`.
rf_model_base_findTune = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    class_weight=rf_class_weight,
    **best_parameter,
)

In [20]:
# Fit the tuned forest on the full training set.
rf_model_base_findTune.fit(X_full, y_full)

# Select exactly the columns the model was trained on. Dropping only 'label'
# would leave the 'prediction' column (added below) in the feature matrix when
# this cell is run a second time, which makes the cell fail on re-run.
X_test_rf = data_test_rf[X_full.columns]
data_test_rf['prediction'] = rf_model_base_findTune.predict(X_test_rf)

In [21]:
# Persist the test frame (including the new `prediction` column) as CSV.
# NOTE(review): the file name contains a double dot ("test..csv"); it is kept
# unchanged here because other files may read this exact path -- confirm.
predictions_csv_path = './results/predictions.rf.base-tuned.test..csv'
data_test_rf.to_csv(predictions_csv_path)

In [22]:
# Confusion matrix on the test split, built from the `label` (true) and
# `prediction` (predicted) columns of the test frame.
test_confusion = sklearn.metrics.confusion_matrix(
    data_test_rf['label'],
    data_test_rf['prediction'],
)
print(test_confusion)

[[1905  511]
 [ 242 1522]]


In [26]:
# Per-class precision / recall / F1 on the test split, returned as a nested dict
# (output_dict=True) so it can be both displayed and serialised later.
test_labels = data_test_rf['label']
test_predictions = data_test_rf['prediction']

report_scores_rf = sklearn.metrics.classification_report(
    y_true=test_labels,
    y_pred=test_predictions,
    digits=6,
    output_dict=True,
)

# One row per report entry, one column per metric.
df_score_rf = pandas.DataFrame(report_scores_rf).T
df_score_rf

Unnamed: 0,precision,recall,f1-score,support
0.0,0.887285,0.788493,0.834977,2416.0
1.0,0.748647,0.862812,0.801686,1764.0
accuracy,0.819856,0.819856,0.819856,0.819856
macro avg,0.817966,0.825653,0.818331,4180.0
weighted avg,0.828778,0.819856,0.820928,4180.0


Best parameter

In [24]:
# Hyper-parameters of the best trial found by the study (same dict as
# `study.best_params`), shown as the cell output.
best_parameter = study.best_trial.params
best_parameter

{'n_estimators': 550,
 'max_depth': 14,
 'min_samples_leaf': 9,
 'min_samples_split': 12,
 'criterion': 'entropy',
 'max_features': 0.3036744227936187}

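### Best Params

Note: the values below do not match the cell output above; they appear to have been recorded from a different run of the search.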
```
'n_estimators': 800,
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'criterion': 'entropy',
 'max_features': 0.27802443984172714
 ```

### Save model & results

In [28]:
# Destination files for the fitted model and its test-set scores.
model_file = './model/rf_model_base-findTune.joblib'
scores_file = './results/scores.rf_model_base-findTune.json'

# Create the output directories first so that saving does not fail with
# FileNotFoundError on a fresh checkout where they do not exist yet.
for output_file in (model_file, scores_file):
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Serialise the fitted random forest.
joblib.dump(
    value=rf_model_base_findTune,
    filename=model_file,
)

# Store the classification report (nested dict) as pretty-printed JSON.
with open(scores_file, 'w', encoding='utf-8') as f:
    json.dump(
        obj=report_scores_rf,
        fp=f,
        indent=4,
    )