In [1]:
import pandas
import optuna
import numpy as np
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

### Load data

In [6]:
# Read the pre-processed train and test CSVs, then key each frame by its `id` column.
data_train_full = pandas.read_csv('../data/salary.train.processed.csv')
data_train_full = data_train_full.set_index('id')

data_test_lr = pandas.read_csv('../data/salary.test.processed.csv')
data_test_lr = data_test_lr.set_index('id')

- Split the data into features (X) and label (y)

In [7]:
# Training set: every column except `label` is a feature; `label` is the target.
y_full = data_train_full['label']
X_full = data_train_full.drop(columns=['label'])

# Held-out test set, separated the same way.
y_test = data_test_lr['label']
X_test = data_test_lr.drop(columns=['label'])

In [4]:
def objective(trial):
    """Optuna objective: cross-validated weighted F1 of a scaled logistic regression.

    Samples ``C``, ``penalty``, ``l1_ratio`` (elastic-net only) and the weight of
    class 1, builds a ``StandardScaler`` + ``LogisticRegression(solver='saga')``
    pipeline, and collects out-of-fold ``predict_proba`` scores on the notebook
    globals ``X_full`` / ``y_full`` with a 10-fold stratified split.  The scores
    are then cut at 100 evenly spaced thresholds in [0.01, 0.99].

    Side effect:
        Stores the threshold with the highest weighted F1 on the trial as the
        user attribute ``"best_threshold"``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        The highest weighted F1 score obtained over the threshold grid.
    """
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])

    # l1_ratio is only sampled for elastic-net; other penalties get None.
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
    weight_for_class_1 = trial.suggest_float('weight_for_class_1', 1.0, 5.0)
    class_weight = {0: 1.0, 1: weight_for_class_1}

    pipeline_lr = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(
            C=C,
            penalty=penalty,
            l1_ratio=l1_ratio,
            solver='saga',
            class_weight=class_weight,
            random_state=42,
            max_iter=5000,
            n_jobs=-1
        ))
    ])

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Out-of-fold scores taken from column 1 of predict_proba.
    y_probs = cross_val_predict(
        pipeline_lr,
        X_full,
        y_full,
        cv=cv,
        method='predict_proba',
        n_jobs=-1
    )[:, 1]

    # Score every candidate threshold exactly once; the same list yields both
    # the trial value and the threshold that produced it.
    thresholds = np.linspace(0.01, 0.99, 100)
    f1_per_threshold = [
        sklearn.metrics.f1_score(
            y_full, (y_probs >= t).astype(int), average='weighted'
        )
        for t in thresholds
    ]

    # argmax returns the first maximum, matching a strict ">" scan over the grid.
    best_idx = int(np.argmax(f1_per_threshold))

    trial.set_user_attr("best_threshold", thresholds[best_idx])
    return f1_per_threshold[best_idx]

- Run Optuna

In [5]:
# Seed the sampler so that Restart & Run All replays the same sequence of trials
# (and therefore the same best parameters and threshold).
sampler = optuna.samplers.TPESampler(seed=42)

# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so this pruner presumably has no intermediate values to act on - confirm
# whether it is still wanted.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)

# Run 100 trials, maximising the value returned by objective().
study.optimize(
    objective,
    n_trials=100,
    show_progress_bar=True
)

[I 2025-11-08 22:34:36,344] A new study created in memory with name: no-name-17d630b8-25bc-402a-b7aa-4bdeb15cb40a


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-08 22:35:04,460] Trial 0 finished with value: 0.8141272117630708 and parameters: {'C': 11.865828155244406, 'penalty': 'l2', 'weight_for_class_1': 4.000386305906321}. Best is trial 0 with value: 0.8141272117630708.
[I 2025-11-08 22:35:08,213] Trial 1 finished with value: 0.791153076546506 and parameters: {'C': 0.0005654339031528567, 'penalty': 'elasticnet', 'l1_ratio': 0.8700932679391642, 'weight_for_class_1': 2.351294391609015}. Best is trial 0 with value: 0.8141272117630708.
[I 2025-11-08 22:35:32,611] Trial 2 finished with value: 0.8140942158397853 and parameters: {'C': 74.40528755218807, 'penalty': 'l1', 'weight_for_class_1': 4.746921978412088}. Best is trial 0 with value: 0.8141272117630708.
[I 2025-11-08 22:35:39,776] Trial 3 finished with value: 0.8142720773518896 and parameters: {'C': 0.024256272439582144, 'penalty': 'l1', 'weight_for_class_1': 2.8899960806948624}. Best is trial 3 with value: 0.8142720773518896.
[I 2025-11-08 22:35:42,733] Trial 4 finished with value:

- Set the best params

In [15]:
# Decision threshold recorded by the best trial while it was being scored.
best_threshold = study.best_trial.user_attrs['best_threshold']

# Hyper-parameters of the best trial, plus the class-weight mapping rebuilt
# from the tuned weight of class 1 (class 0 stays at 1.0).
best_lr_params = study.best_params
final_class_weight = {
    0: 1.0,
    1: best_lr_params.get('weight_for_class_1'),
}

In [26]:
# Final model: scaler + saga logistic regression, configured with the best
# hyper-parameters found by the study.
final_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        penalty=best_lr_params.get('penalty'),
        C=best_lr_params.get('C'),
        # .get() yields None when the best trial did not sample an l1_ratio
        l1_ratio=best_lr_params.get('l1_ratio'),
        class_weight=final_class_weight,  # use the tuned class_weight
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1,
    )),
])

- Train the pipeline on all training data (X_full, y_full)

In [17]:
# Fit the scaler and the classifier on the complete training set.
final_pipeline.fit(X=X_full, y=y_full)

- Predict probabilities on test set

In [18]:
# Keep column 1 of predict_proba as the score for the test rows
# (presumably the probability of label 1 - verify against final_pipeline.classes_).
y_probs_test = final_pipeline.predict_proba(X=X_test)[:, 1]


- Predict with the best threshold

In [19]:
# A row is predicted as class 1 when its score reaches the tuned threshold.
y_pred_final = np.greater_equal(y_probs_test, best_threshold).astype(int)

In [20]:
# Confusion matrix of the thresholded test predictions.
print("\nLogistic Regression (Optuna-Tuned + Best Threshold) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(y_test, y_pred_final))

# Per-class and averaged precision / recall / F1 as a dict, reshaped so that
# each class or average becomes one row of the DataFrame.
# NOTE(review): scikit-learn's docs say `digits` is ignored when
# output_dict=True - confirm and drop it if so.
report_scores_lr = sklearn.metrics.classification_report(
    y_test,
    y_pred_final,
    digits=6,
    output_dict=True,
)
df_score_lr = pandas.DataFrame(report_scores_lr).T

print("\nLogistic Regression (Optuna-Tuned + Best Threshold) Report:")
print(df_score_lr)


Logistic Regression (Optuna-Tuned + Best Threshold) Confusion Matrix:
[[2131  285]
 [ 500 1264]]

Logistic Regression (Optuna-Tuned + Best Threshold) Report:
              precision    recall  f1-score      support
0.0            0.809958  0.882036  0.844462  2416.000000
1.0            0.816010  0.716553  0.763055  1764.000000
accuracy       0.812201  0.812201  0.812201     0.812201
macro avg      0.812984  0.799295  0.803758  4180.000000
weighted avg   0.812512  0.812201  0.810107  4180.000000


In [22]:
# Show the hyper-parameters of the best trial as the cell output.
best_lr_params

{'C': 0.3265000362645873,
 'penalty': 'l2',
 'weight_for_class_1': 1.3963084513738258}