In [None]:
import pandas as pd
import optuna
import numpy as np
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict # use StratifiedKFold to handle class imbalance

# --- 1. Load the training data ---
# The file imports pandas under the alias `pd`, so the alias must be used here.
try:
    data_train_full = pd.read_csv('../data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # Re-raise: every statement below needs `data_train_full`, so continuing
    # would only fail later with a less helpful NameError.
    raise

# Split the frame into the feature matrix and the 'label' target column.
X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"‡πÉ‡∏ä‡πâ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ó‡∏£‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(y_full)} records ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏à‡∏π‡∏ô (K-Fold CV)")

# --- 2. Define the objective function (the heart of Optuna) ---

def objective(trial):
    """Score one Optuna trial for a scaled logistic-regression pipeline.

    Samples ``C``, ``penalty`` (plus ``l1_ratio`` for elastic net) and the
    class-1 weight, collects out-of-fold class-1 probabilities with a
    3-fold stratified CV over the module-level ``X_full`` / ``y_full``,
    and then sweeps 100 decision thresholds to find the one with the
    highest weighted F1.

    Args:
        trial: The Optuna trial used to sample hyperparameters. The winning
            threshold is stored on it as the user attribute
            ``"best_threshold"``.

    Returns:
        The highest weighted F1 found over the threshold sweep.

    Note:
        The threshold is picked on the same out-of-fold predictions that
        produce the returned score, so the value is an optimistic estimate.
    """

    # 1. Hyperparameter search space.
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])

    # l1_ratio is only sampled when the elastic-net penalty is selected.
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0, 1)

    # Tune the class-1 weight between 1 (no re-weighting) and 5 instead of
    # relying on a fixed 'balanced' setting.
    weight_for_class_1 = trial.suggest_float('weight_for_class_1', 1.0, 5.0)
    class_weight = {0: 1.0, 1: weight_for_class_1}

    # 2. Pipeline: standardize features, then fit the logistic regression.
    pipeline_lr = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(
            C=C,
            penalty=penalty,
            l1_ratio=l1_ratio,
            solver='saga',
            class_weight=class_weight,
            random_state=42,
            max_iter=5000,
            n_jobs=-1
        ))
    ])

    # 3. Out-of-fold probabilities for every training row.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_probs = cross_val_predict(
        pipeline_lr,
        X_full,
        y_full,
        cv=cv,
        method='predict_proba',
        n_jobs=-1
    )[:, 1]  # keep only the class-1 column

    # 4. Sweep the thresholds ONCE and reuse the scores for both the
    #    returned value and the stored threshold.
    thresholds = np.linspace(0.01, 0.99, 100)
    f1_by_threshold = np.array([
        sklearn.metrics.f1_score(
            y_full, (y_probs >= t).astype(int), average='weighted'
        )
        for t in thresholds
    ])

    # np.argmax returns the first maximum, i.e. the lowest threshold among ties.
    best_idx = int(np.argmax(f1_by_threshold))

    # Store a plain float so the attribute does not carry a NumPy scalar type.
    trial.set_user_attr("best_threshold", float(thresholds[best_idx]))

    # 5. Best weighted F1 reached in this trial.
    return float(f1_by_threshold[best_idx])

# --- 3. Run the search (study) ---

print("\n‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡πÅ‡∏•‡∏∞ Threshold ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Logistic Regression ‡∏î‡πâ‡∏ß‡∏¢ Optuna...")

# Seed the sampler so repeated runs propose the same trials; the CV splitter
# and the model are already pinned to random_state=42.
sampler = optuna.samplers.TPESampler(seed=42)

# NOTE: a pruner only acts on intermediate values reported through
# trial.report(); `objective` never reports any, so no trial is pruned here.
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)

study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True
)

# --- 4. Show the results ---

print("\n--- Optuna ‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô! ---")

print("‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (Best Model Hyperparameters):")
print(study.best_params)
print(f"\nThreshold ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (Best Threshold): {study.best_trial.user_attrs['best_threshold']:.4f}")
print(f"\nF1-Weighted ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£ CV): {study.best_value:.6f}")

# --- 5. (Next step) Fit the final model with the best hyperparameters ---

print("\n‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏ó‡∏£‡∏ô Pipeline ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡∏î‡πâ‡∏ß‡∏¢‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î...")

# Pull the winning configuration and its stored threshold out of the study.
best_lr_params = study.best_params
best_threshold = study.best_trial.user_attrs['best_threshold']

# Class 0 keeps unit weight; class 1 receives the tuned weight.
final_class_weight = {0: 1.0, 1: best_lr_params.get('weight_for_class_1')}

# Collect the estimator arguments first, then build the two-step pipeline.
final_lr_kwargs = {
    'C': best_lr_params.get('C'),
    'penalty': best_lr_params.get('penalty'),
    # .get() yields None when the best trial did not sample 'l1_ratio'.
    'l1_ratio': best_lr_params.get('l1_ratio'),
    'solver': 'saga',
    'class_weight': final_class_weight,
    'random_state': 42,
    'max_iter': 5000,
    'n_jobs': -1,
}
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(**final_lr_kwargs)),
])

# Fit on the ENTIRE training set (X_full, y_full), not on a CV fold.
final_pipeline.fit(X_full, y_full)

print("‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô! ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡∏ö‡∏ô Test Set...")

# Load the test split. pandas is imported under the alias `pd`, so the bare
# name `pandas` is undefined in this file.
data_test_lr = pd.read_csv('../data/salary.test.processed.csv').set_index('id')
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# Class-1 probabilities on the test set.
y_probs_test = final_pipeline.predict_proba(X_test)[:, 1]

# Convert probabilities to labels with the threshold chosen during tuning.
y_pred_final = (y_probs_test >= best_threshold).astype(int)

# Report the test-set results.
print("\nLogistic Regression (Optuna-Tuned + Best Threshold) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_final
))

report_scores_lr = sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred_final,
    digits=6,
    output_dict=True
)
# Transpose so each class / average becomes a row and each metric a column.
df_score_lr = pd.DataFrame(report_scores_lr).transpose()
print("\nLogistic Regression (Optuna-Tuned + Best Threshold) Report:")
print(df_score_lr)