In [92]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- Step 1: Load Your PROCESSED Data ---
# We use the clean CSVs you just saved
try:
    train_df = pd.read_csv('../data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('../data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    print("Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './data/' folder.")
    # Stop here if files aren't found
    raise
    

# --- Step 2: Separate Features (X) and Target (y) ---
# The 'label' column is our target (y)
# All other columns are our features (X)

# Training data
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

# Test data
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")


# --- Step 3: Initialize and Train the Model ---
# We add max_iter=1000 to help the model solve for the best fit
# 'random_state=42' just ensures you get the same result as me
log_reg = LogisticRegression(max_iter=1000, random_state=42)

print("\nTraining the model...")
log_reg.fit(X_train, y_train)
print("Model training complete!")


# --- Step 4: Evaluate the Model ---
# Make predictions on the (unseen) test data
y_pred = log_reg.predict(X_test)

# Check the accuracy
# This tells us what percentage of predictions were correct
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Data: {accuracy * 100:.2f}%")

# Get a detailed report (precision, recall, f1-score)
# This is much more useful than just accuracy
print("\nClassification Report:")
print(classification_report(y_test, y_pred,digits=4))

Training features shape: (16720, 89)
Testing features shape: (4180, 89)

Training the model...
Model training complete!

Model Accuracy on Test Data: 81.94%

Classification Report:
              precision    recall  f1-score   support

         0.0     0.8405    0.8485    0.8445      2416
         1.0     0.7898    0.7795    0.7846      1764

    accuracy                         0.8194      4180
   macro avg     0.8151    0.8140    0.8145      4180
weighted avg     0.8191    0.8194    0.8192      4180



In [93]:
import pandas
import optuna
import numpy as np
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline # üëà ‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡∏°‡∏≤‡∏Å
from sklearn.model_selection import cross_val_score

# --- 1. ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• (‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡πÄ‡∏î‡∏¥‡∏°) ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"‡πÉ‡∏ä‡πâ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ó‡∏£‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(y_full)} records ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏à‡∏π‡∏ô (K-Fold CV)")


# --- 2. ‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô Objective (‡∏´‡∏±‡∏ß‡πÉ‡∏à‡∏Ç‡∏≠‡∏á Optuna) ---

def objective(trial):
    """
    ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏à‡∏∞‡∏ñ‡∏π‡∏Å‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÇ‡∏î‡∏¢ Optuna ‡πÉ‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞ "trial" (‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏•‡∏≠‡∏á)
    """
    
    # 1. ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡∏ä‡πà‡∏ß‡∏á‡∏Ç‡∏≠‡∏á‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå
    
    # C (Regularization strength) - ‡∏™‡∏∏‡πà‡∏°‡πÅ‡∏ö‡∏ö log (1e-4 ‡∏ñ‡∏∂‡∏á 1e2)
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    
    # Penalty - 'saga' solver ‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    
    # l1_ratio - ‡∏à‡∏∞‡∏ñ‡∏π‡∏Å‡πÉ‡∏ä‡πâ *‡∏ï‡πà‡∏≠‡πÄ‡∏°‡∏∑‡πà‡∏≠* penalty ‡πÄ‡∏õ‡πá‡∏ô 'elasticnet'
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0, 1)

    
    # 2. ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏ó‡∏µ‡πà‡∏£‡∏ß‡∏° Scaler ‡πÅ‡∏•‡∏∞ Model ‡πÄ‡∏Ç‡πâ‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô
    pipeline_lr = Pipeline([
        ('scaler', StandardScaler()), # ‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà 1: ‡∏õ‡∏£‡∏±‡∏ö‡∏™‡πÄ‡∏Å‡∏•‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
        ('model', LogisticRegression(   # ‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà 2: ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•
            C=C,
            penalty=penalty,
            l1_ratio=l1_ratio,
            solver='saga',         # 'saga' ‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ó‡∏∏‡∏Å penalty
            class_weight='balanced', # ‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£ imbalance
            random_state=42,
            max_iter=5000,         # 'saga' ‡∏≠‡∏≤‡∏à‡∏ï‡πâ‡∏≠‡∏á‡πÉ‡∏ä‡πâ iter ‡πÄ‡∏¢‡∏≠‡∏∞‡∏´‡∏ô‡πà‡∏≠‡∏¢
            n_jobs=-1
        ))
    ])
    
    # 3. ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• Pipeline ‡∏î‡πâ‡∏ß‡∏¢ Cross-validation
    score = cross_val_score(
        pipeline_lr, # üëà ‡πÉ‡∏ä‡πâ pipeline ‡πÅ‡∏ó‡∏ô model ‡∏ï‡∏£‡∏á‡πÜ
        X_full, 
        y_full, 
        cv=3,                 
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    # 4. ‡∏Ñ‡∏∑‡∏ô‡∏Ñ‡πà‡∏≤ F1 ‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ‡πÉ‡∏´‡πâ Optuna
    f1_avg = np.mean(score)
    return f1_avg

# --- 3. ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤ (Study) ---

print("\n‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Logistic Regression ‡∏î‡πâ‡∏ß‡∏¢ Optuna...")

# ‡∏™‡∏£‡πâ‡∏≤‡∏á study object, ‡∏ö‡∏≠‡∏Å‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£ 'maximize' (‡∏´‡∏≤‡∏Ñ‡πà‡∏≤ F1 ‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)

# ‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤ (optimize) 50 ‡∏Ñ‡∏£‡∏±‡πâ‡∏á
study.optimize(
    objective, 
    n_trials=50, 
    show_progress_bar=True
)

# --- 4. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå ---

print("\n--- Optuna ‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô! ---")

print("‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (Best Parameters):")
print(study.best_params)
print(f"\nF1-Weighted ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (‡∏à‡∏≤‡∏Å‡∏Å‡∏≤‡∏£ CV): {study.best_value:.6f}")


# --- 5. (‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ï‡πà‡∏≠‡πÑ‡∏õ) ‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡∏ö‡∏ô Test Set ---

print("\n‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏ó‡∏£‡∏ô Pipeline ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡∏î‡πâ‡∏ß‡∏¢‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î...")

# ‡∏î‡∏∂‡∏á‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡∏°‡∏≤
best_lr_params = study.best_params

# ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢ (‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ scaler ‡∏î‡πâ‡∏ß‡∏¢)
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        C=best_lr_params.get('C'),
        penalty=best_lr_params.get('penalty'),
        l1_ratio=best_lr_params.get('l1_ratio'), # ‡∏à‡∏∞‡πÄ‡∏õ‡πá‡∏ô None ‡∏ñ‡πâ‡∏≤ penalty ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà 'elasticnet'
        solver='saga',
        class_weight='balanced',
        random_state=42,
        max_iter=5000,
        n_jobs=-1
    ))
])

# ‡πÄ‡∏ó‡∏£‡∏ô Pipeline ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡∏î‡πâ‡∏ß‡∏¢‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• "‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î" (X_full, y_full)
final_pipeline.fit(X_full, y_full)

print("‡πÄ‡∏ó‡∏£‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô! ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡∏ö‡∏ô Test Set...")

# ‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Test
data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

# ‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏ú‡∏•‡∏î‡πâ‡∏ß‡∏¢ Pipeline (‡∏°‡∏±‡∏ô‡∏à‡∏∞ scale ‡πÅ‡∏•‡∏∞ predict ‡πÉ‡∏´‡πâ‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥)
data_test_lr['prediction'] = final_pipeline.predict(data_test_lr.drop(['label'], axis='columns'))

# ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ö‡∏ô Test Set
print("\nLogistic Regression (Optuna-Tuned) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=data_test_lr['label'],
    y_pred=data_test_lr['prediction']
))

report_scores_lr = sklearn.metrics.classification_report(
    y_true=data_test_lr['label'],
    y_pred=data_test_lr['prediction'],
    digits=6,
    output_dict=True
)
df_score_lr = pandas.DataFrame(report_scores_lr).transpose()
print("\nLogistic Regression (Optuna-Tuned) Report:")
print(df_score_lr)

[I 2025-10-21 19:55:23,244] A new study created in memory with name: no-name-17991bf4-de1c-424c-9319-79f3eb1cb23a


‡πÉ‡∏ä‡πâ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏ó‡∏£‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 16720 records ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡∏à‡∏π‡∏ô (K-Fold CV)

‡∏Å‡∏≥‡∏•‡∏±‡∏á‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Logistic Regression ‡∏î‡πâ‡∏ß‡∏¢ Optuna...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-21 19:55:32,933] Trial 0 finished with value: 0.8132164063781692 and parameters: {'C': 3.5518274294091126, 'penalty': 'l2'}. Best is trial 0 with value: 0.8132164063781692.
[I 2025-10-21 19:55:37,781] Trial 1 finished with value: 0.809392227702531 and parameters: {'C': 0.005018906875357333, 'penalty': 'elasticnet', 'l1_ratio': 0.5420208302897495}. Best is trial 0 with value: 0.8132164063781692.
[I 2025-10-21 19:55:44,454] Trial 2 finished with value: 0.8129823272647618 and parameters: {'C': 0.12343627546992857, 'penalty': 'l2'}. Best is trial 0 with value: 0.8132164063781692.
[I 2025-10-21 19:55:45,625] Trial 3 finished with value: 0.8102552199537056 and parameters: {'C': 0.003933986242833884, 'penalty': 'l2'}. Best is trial 0 with value: 0.8132164063781692.
[I 2025-10-21 19:55:53,683] Trial 4 finished with value: 0.8132164063781692 and parameters: {'C': 81.54029874770107, 'penalty': 'l1'}. Best is trial 0 with value: 0.8132164063781692.
[I 2025-10-21 19:56:02,111] Trial 5 f

In [94]:
best_params = {
    'C': 3.5518274294091126, 
    'penalty': 'l2', 
    # 'l1_ratio': 0.349891944349909
}

### Class weight

In [95]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("--- 1. Testing LogReg with Class Weight ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Define Parameters ---
best_lr_params = best_params
# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        class_weight='balanced', # üëà Add weight
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_full, y_full) # Train on original data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + Class Weight) Report:")
print(df_report)

--- 1. Testing LogReg with Class Weight ---
Model training complete.

LogReg (Tuned + Class Weight) Report:
              precision    recall  f1-score      support
0.0            0.873703  0.801738  0.836175  2416.000000
1.0            0.755986  0.841270  0.796351  1764.000000
accuracy       0.818421  0.818421  0.818421     0.818421
macro avg      0.814844  0.821504  0.816263  4180.000000
weighted avg   0.824025  0.818421  0.819369  4180.000000


### SMOTE

In [96]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

print("\n--- 2. Testing LogReg with SMOTE ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply SMOTE ---
print("Applying SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ‚ö†Ô∏è NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTEd data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + SMOTE) Report:")
print(df_report)


--- 2. Testing LogReg with SMOTE ---
Applying SMOTE...
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.874322  0.800497  0.835782  2416.000000
1.0            0.755081  0.842404  0.796356  1764.000000
accuracy       0.818182  0.818182  0.818182     0.818182
macro avg      0.814702  0.821450  0.816069  4180.000000
weighted avg   0.824001  0.818182  0.819144  4180.000000


### SMOTETomek

In [97]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing LogReg with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ‚ö†Ô∏è NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing LogReg with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    8914
0.0    8914
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.881084  0.794288  0.835438  2416.000000
1.0            0.751748  0.853175  0.799257  1764.000000
accuracy       0.819139  0.819139  0.819139     0.819139
macro avg      0.816416  0.823731  0.817347  4180.000000
weighted avg   0.826503  0.819139  0.820169  4180.000000


### ADASYN

In [98]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing LogReg with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ‚ö†Ô∏è NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing LogReg with ADASYN ---
Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå salary.train.processed.csv ‡∏Å‡∏£‡∏∏‡∏ì‡∏≤‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö path
Applying ADASYN...
New resampled label distribution:
label
1.0    9726
0.0    9719
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.892278  0.774834  0.829420  2416.000000
1.0            0.738713  0.871882  0.799792  1764.000000
accuracy       0.815789  0.815789  0.815789     0.815789
macro avg      0.815496  0.823358  0.814606  4180.000000
weighted avg   0.827472  0.815789  0.816916  4180.000000
