In [13]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- Step 1: Load Your PROCESSED Data ---
# We use the clean CSVs you just saved
try:
    train_df = pd.read_csv('../data/salary.train.processed.csv', index_col='id')
    test_df = pd.read_csv('../data/salary.test.processed.csv', index_col='id')
except FileNotFoundError:
    print("Error: Could not find the processed CSV files.")
    print("Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './data/' folder.")
    # Stop here if files aren't found
    raise
    

# --- Step 2: Separate Features (X) and Target (y) ---
# The 'label' column is our target (y)
# All other columns are our features (X)

# Training data
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

# Test data
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")


# --- Step 3: Initialize and Train the Model ---
# We add max_iter=1000 to help the model solve for the best fit
# 'random_state=42' just ensures you get the same result as me
log_reg = LogisticRegression(max_iter=1000, random_state=42)

print("\nTraining the model...")
log_reg.fit(X_train, y_train)
print("Model training complete!")


# --- Step 4: Evaluate the Model ---
# Make predictions on the (unseen) test data
y_pred = log_reg.predict(X_test)

# Check the accuracy
# This tells us what percentage of predictions were correct
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Data: {accuracy * 100:.2f}%")

# Get a detailed report (precision, recall, f1-score)
# This is much more useful than just accuracy
print("\nClassification Report:")
print(classification_report(y_test, y_pred,digits=4))

Training features shape: (16720, 56)
Testing features shape: (4180, 56)

Training the model...
Model training complete!

Model Accuracy on Test Data: 82.11%

Classification Report:
              precision    recall  f1-score   support

         0.0     0.8432    0.8481    0.8456      2416
         1.0     0.7903    0.7840    0.7871      1764

    accuracy                         0.8211      4180
   macro avg     0.8167    0.8161    0.8164      4180
weighted avg     0.8209    0.8211    0.8210      4180



In [14]:
import pandas
import optuna
import numpy as np
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline # 👈 สำคัญมาก
from sklearn.model_selection import cross_val_score

# --- 1. โหลดข้อมูล (เหมือนเดิม) ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']

print(f"ใช้ข้อมูลเทรนทั้งหมด {len(y_full)} records สำหรับการจูน (K-Fold CV)")


# --- 2. สร้างฟังก์ชัน Objective (หัวใจของ Optuna) ---

def objective(trial):
    """
    ฟังก์ชันนี้จะถูกเรียกโดย Optuna ในแต่ละ "trial" (การทดลอง)
    """
    
    # 1. กำหนดช่วงของพารามิเตอร์
    
    # C (Regularization strength) - สุ่มแบบ log (1e-4 ถึง 1e2)
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)
    
    # Penalty - 'saga' solver รองรับทั้งหมด
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    
    # l1_ratio - จะถูกใช้ *ต่อเมื่อ* penalty เป็น 'elasticnet'
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0, 1)

    
    # 2. สร้าง Pipeline ที่รวม Scaler และ Model เข้าด้วยกัน
    pipeline_lr = Pipeline([
        ('scaler', StandardScaler()), # ขั้นตอนที่ 1: ปรับสเกลข้อมูล
        ('model', LogisticRegression(   # ขั้นตอนที่ 2: เทรนโมเดล
            C=C,
            penalty=penalty,
            l1_ratio=l1_ratio,
            solver='saga',         # 'saga' รองรับทุก penalty
            class_weight='balanced', # จัดการ imbalance
            random_state=42,
            max_iter=5000,         # 'saga' อาจต้องใช้ iter เยอะหน่อย
            n_jobs=-1
        ))
    ])
    
    # 3. ประเมินผล Pipeline ด้วย Cross-validation
    score = cross_val_score(
        pipeline_lr, # 👈 ใช้ pipeline แทน model ตรงๆ
        X_full, 
        y_full, 
        cv=3,                 
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    # 4. คืนค่า F1 เฉลี่ยกลับไปให้ Optuna
    f1_avg = np.mean(score)
    return f1_avg

# --- 3. เริ่มการค้นหา (Study) ---

print("\nกำลังเริ่มการค้นหาพารามิเตอร์สำหรับ Logistic Regression ด้วย Optuna...")

# สร้าง study object, บอกว่าเราต้องการ 'maximize' (หาค่า F1 สูงสุด)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)

# สั่งให้เริ่มค้นหา (optimize) 50 ครั้ง
study.optimize(
    objective, 
    n_trials=50, 
    show_progress_bar=True
)

# --- 4. แสดงผลลัพธ์ ---

print("\n--- Optuna ค้นหาเสร็จสิ้น! ---")

print("พารามิเตอร์ที่ดีที่สุด (Best Parameters):")
print(study.best_params)
print(f"\nF1-Weighted ที่ดีที่สุด (จากการ CV): {study.best_value:.6f}")


# --- 5. (ขั้นตอนต่อไป) เทรนโมเดลสุดท้ายและประเมินผลบน Test Set ---

print("\nกำลังเทรน Pipeline สุดท้ายด้วยพารามิเตอร์ที่ดีที่สุด...")

# ดึงพารามิเตอร์ที่ดีที่สุดมา
best_lr_params = study.best_params

# สร้าง Pipeline สุดท้าย (ต้องมี scaler ด้วย)
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        C=best_lr_params.get('C'),
        penalty=best_lr_params.get('penalty'),
        l1_ratio=best_lr_params.get('l1_ratio'), # จะเป็น None ถ้า penalty ไม่ใช่ 'elasticnet'
        solver='saga',
        class_weight='balanced',
        random_state=42,
        max_iter=5000,
        n_jobs=-1
    ))
])

# เทรน Pipeline สุดท้ายด้วยข้อมูล "ทั้งหมด" (X_full, y_full)
final_pipeline.fit(X_full, y_full)

print("เทรนโมเดลสุดท้ายเสร็จสิ้น! กำลังประเมินผลบน Test Set...")

# โหลดข้อมูล Test
data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')

# ทำนายผลด้วย Pipeline (มันจะ scale และ predict ให้อัตโนมัติ)
data_test_lr['prediction'] = final_pipeline.predict(data_test_lr.drop(['label'], axis='columns'))

# แสดงผลลัพธ์บน Test Set
print("\nLogistic Regression (Optuna-Tuned) Confusion Matrix:")
print(sklearn.metrics.confusion_matrix(
    y_true=data_test_lr['label'],
    y_pred=data_test_lr['prediction']
))

report_scores_lr = sklearn.metrics.classification_report(
    y_true=data_test_lr['label'],
    y_pred=data_test_lr['prediction'],
    digits=6,
    output_dict=True
)
df_score_lr = pandas.DataFrame(report_scores_lr).transpose()
print("\nLogistic Regression (Optuna-Tuned) Report:")
print(df_score_lr)


[I 2025-10-22 07:00:50,735] A new study created in memory with name: no-name-42188d89-0f6e-4a93-8e40-bc2db57dac50


ใช้ข้อมูลเทรนทั้งหมด 16720 records สำหรับการจูน (K-Fold CV)

กำลังเริ่มการค้นหาพารามิเตอร์สำหรับ Logistic Regression ด้วย Optuna...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-22 07:00:54,165] Trial 0 finished with value: 0.4273580268070503 and parameters: {'C': 0.00014333704798952278, 'penalty': 'elasticnet', 'l1_ratio': 0.7652225025342507}. Best is trial 0 with value: 0.4273580268070503.
[I 2025-10-22 07:01:01,784] Trial 1 finished with value: 0.8125680134395764 and parameters: {'C': 14.522194869951448, 'penalty': 'l1'}. Best is trial 1 with value: 0.8125680134395764.
[I 2025-10-22 07:01:08,390] Trial 2 finished with value: 0.8125680134395764 and parameters: {'C': 11.708657930091368, 'penalty': 'elasticnet', 'l1_ratio': 0.4529649291521767}. Best is trial 1 with value: 0.8125680134395764.
[I 2025-10-22 07:01:13,881] Trial 3 finished with value: 0.8125680134395764 and parameters: {'C': 36.34301570037935, 'penalty': 'l1'}. Best is trial 1 with value: 0.8125680134395764.
[I 2025-10-22 07:01:14,342] Trial 4 finished with value: 0.8051782639387195 and parameters: {'C': 0.001298501656926325, 'penalty': 'l2'}. Best is trial 1 with value: 0.8125680134395

In [15]:
best_params = best_lr_params

### Class weight

In [16]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("--- 1. Testing LogReg with Class Weight ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Define Parameters ---
best_lr_params = best_params
# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        class_weight='balanced', # 👈 Add weight
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_full, y_full) # Train on original data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + Class Weight) Report:")
print(df_report)

--- 1. Testing LogReg with Class Weight ---
Model training complete.

LogReg (Tuned + Class Weight) Report:
              precision    recall  f1-score      support
0.0            0.878292  0.800497  0.837592  2416.000000
1.0            0.756320  0.848073  0.799572  1764.000000
accuracy       0.820574  0.820574  0.820574     0.820574
macro avg      0.817306  0.824285  0.818582  4180.000000
weighted avg   0.826819  0.820574  0.821547  4180.000000


### SMOTE

In [17]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

print("\n--- 2. Testing LogReg with SMOTE ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply SMOTE ---
print("Applying SMOTE...")
smote = SMOTE(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smote.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ⚠️ NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTEd data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + SMOTE) Report:")
print(df_report)


--- 2. Testing LogReg with SMOTE ---
Applying SMOTE...
New resampled label distribution:
label
1.0    9719
0.0    9719
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + SMOTE) Report:
              precision    recall  f1-score      support
0.0            0.878126  0.799255  0.836836  2416.000000
1.0            0.755174  0.848073  0.798932  1764.000000
accuracy       0.819856  0.819856  0.819856     0.819856
macro avg      0.816650  0.823664  0.817884  4180.000000
weighted avg   0.826239  0.819856  0.820840  4180.000000


### SMOTETomek

In [18]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

print("\n--- 3. Testing LogReg with SMOTETomek ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('../data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('../data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply SMOTETomek ---
print("Applying SMOTETomek...")
smt = SMOTETomek(random_state=42, n_jobs=-1)
X_resampled, y_resampled = smt.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ⚠️ NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on SMOTETomek data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + SMOTETomek) Report:")
print(df_report)


--- 3. Testing LogReg with SMOTETomek ---
Applying SMOTETomek...
New resampled label distribution:
label
1.0    9220
0.0    9220
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + SMOTETomek) Report:
              precision    recall  f1-score      support
0.0            0.878304  0.797599  0.836009  2416.000000
1.0            0.753776  0.848639  0.798400  1764.000000
accuracy       0.819139  0.819139  0.819139     0.819139
macro avg      0.816040  0.823119  0.817204  4180.000000
weighted avg   0.825752  0.819139  0.820137  4180.000000


### ADASYN

In [19]:
import pandas
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN

print("\n--- 4. Testing LogReg with ADASYN ---")

# --- Load Data ---
try:
    data_train_full = pandas.read_csv('./data/salary.train.processed.csv').set_index('id')
    data_test_lr = pandas.read_csv('./data/salary.test.processed.csv').set_index('id')
except FileNotFoundError:
    print("Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path")
    # exit() 

X_full = data_train_full.drop(['label'], axis='columns')
y_full = data_train_full['label']
X_test = data_test_lr.drop(['label'], axis='columns')
y_test = data_test_lr['label']

# --- Apply ADASYN ---
print("Applying ADASYN...")
ada = ADASYN(random_state=42, n_jobs=-1)
X_resampled, y_resampled = ada.fit_resample(X_full, y_full)
print(f"New resampled label distribution:\n{y_resampled.value_counts()}")

# --- Define Parameters ---
best_lr_params = best_params

# --- Create and Train Pipeline ---
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Step 1: Scale
    ('model', LogisticRegression(
        **best_lr_params,
        # ⚠️ NO 'class_weight'
        solver='saga',
        max_iter=5000,
        random_state=42,
        n_jobs=-1
    ))
])

pipeline.fit(X_resampled, y_resampled) # Train on ADASYN data
print("Model training complete.")

# --- Evaluate ---
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred, digits=6, output_dict=True)
df_report = pandas.DataFrame(report).transpose()

print("\nLogReg (Tuned + ADASYN) Report:")
print(df_report)


--- 4. Testing LogReg with ADASYN ---
Error: ไม่พบไฟล์ salary.train.processed.csv กรุณาตรวจสอบ path
Applying ADASYN...
New resampled label distribution:
label
0.0    9719
1.0    9698
Name: count, dtype: int64
Model training complete.

LogReg (Tuned + ADASYN) Report:
              precision    recall  f1-score      support
0.0            0.892344  0.771937  0.827785  2416.000000
1.0            0.736364  0.872449  0.798651  1764.000000
accuracy       0.814354  0.814354  0.814354     0.814354
macro avg      0.814354  0.822193  0.813218  4180.000000
weighted avg   0.826519  0.814354  0.815490  4180.000000
