# Logistic Regression with Optuna

This notebook trains a multinomial logistic regression classifier on the engineered LR/SVM feature set. Optuna sweeps the regularization strength, penalty, and solver, selecting the configuration that maximizes macro F1 on the validation split.


In [1]:
import optuna
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
DATA_DIR = Path("/Users/aaryan/Desktop/ML_multi_class/preprocessed_csv")
OUTPUT_DIR = Path("/Users/aaryan/Desktop/ML_multi_class")

X_train = pd.read_csv(DATA_DIR / "X_train_lr_svm_smote.csv")
y_train = pd.read_csv(DATA_DIR / "y_train_smote.csv").squeeze()

X_val = pd.read_csv(DATA_DIR / "X_val_lr_svm.csv")
y_val = pd.read_csv(DATA_DIR / "y_val.csv").squeeze()

X_test = pd.read_csv(DATA_DIR / "X_test_lr_svm.csv")
test_ids = pd.read_csv(DATA_DIR / "test_ids.csv")

label_map = pd.read_csv(DATA_DIR / "label_encoder_mapping.csv")
encoded_to_cluster = dict(zip(label_map["encoded_value"], label_map["cluster_name"]))

print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")


Train shape: (3895, 26), Val shape: (383, 26), Test shape: (479, 26)


In [6]:
def build_pipeline(trial: optuna.Trial) -> Pipeline:
    solver = trial.suggest_categorical("solver", ["lbfgs", "saga"])
    if solver == "lbfgs":
        penalty = "l2"
    else:  # saga
        penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])

    C = trial.suggest_float("C", 1e-3, 100.0, log=True)
    max_iter = trial.suggest_int("max_iter", 200, 600, step=100)

    lr_kwargs = {
        "penalty": penalty,
        "solver": solver,
        "C": C,
        "max_iter": max_iter,
        "n_jobs": -1,
    }
    if penalty == "elasticnet":
        lr_kwargs["l1_ratio"] = trial.suggest_float("l1_ratio", 0.0, 1.0)

    model = LogisticRegression(**lr_kwargs)

    return Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", model),
    ])


def objective(trial: optuna.Trial) -> float:
    pipeline = build_pipeline(trial)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    score = f1_score(y_val, preds, average="macro")
    trial.set_user_attr("accuracy", accuracy_score(y_val, preds))
    return score


In [7]:
study = optuna.create_study(direction="maximize", study_name="logreg_macro_f1")
study.optimize(objective, n_trials=30, timeout=1800)

print(f"Best macro F1: {study.best_value:.4f}")
print("Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")
print(f"Validation accuracy: {study.best_trial.user_attrs['accuracy']:.4f}")


[I 2025-11-27 14:05:37,355] A new study created in memory with name: logreg_macro_f1
[I 2025-11-27 14:05:37,453] Trial 0 finished with value: 0.5045185594576086 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.09218867170595231, 'max_iter': 500}. Best is trial 0 with value: 0.5045185594576086.
[I 2025-11-27 14:05:38,202] Trial 1 finished with value: 0.49565628185995625 and parameters: {'solver': 'saga', 'penalty': 'elasticnet', 'C': 1.149471168170171, 'max_iter': 200, 'l1_ratio': 0.765793664330111}. Best is trial 0 with value: 0.5045185594576086.
[I 2025-11-27 14:05:40,209] Trial 2 finished with value: 0.45608305970821716 and parameters: {'solver': 'lbfgs', 'C': 0.015290228890881186, 'max_iter': 600}. Best is trial 0 with value: 0.5045185594576086.
[I 2025-11-27 14:05:41,573] Trial 3 finished with value: 0.4961251038417388 and parameters: {'solver': 'lbfgs', 'C': 3.3348631184747113, 'max_iter': 600}. Best is trial 0 with value: 0.5045185594576086.
[I 2025-11-27 14:05:43,082] 

Best macro F1: 0.5191
Best params:
  solver: saga
  penalty: l1
  C: 0.13212390872294072
  max_iter: 600
Validation accuracy: 0.6606


In [14]:
fixed_trial = optuna.trial.FixedTrial(study.best_params)
best_pipeline = build_pipeline(fixed_trial)
best_pipeline.fit(pd.concat([X_train, X_val], axis=0), pd.concat([y_train, y_val], axis=0))

val_preds = best_pipeline.predict(X_val)
print(classification_report(y_val, val_preds))

# Generate submission predictions aligned with sample_submission.csv
test_labels = best_pipeline.predict(X_test)
submission = pd.DataFrame(
    {
        "participant_id": test_ids.squeeze(),
        "personality_cluster": [encoded_to_cluster[int(label)] for label in test_labels],
    }
)

logreg_pred_path = OUTPUT_DIR / "logreg_submission.csv"
submission.to_csv(logreg_pred_path, index=False)
print(f"Saved submission to {logreg_pred_path}")

submission.head()


              precision    recall  f1-score   support

           0       0.27      0.53      0.36        17
           1       0.43      0.36      0.40        44
           2       0.51      0.41      0.45        61
           3       0.57      0.71      0.63        66
           4       0.92      0.86      0.89       195

    accuracy                           0.69       383
   macro avg       0.54      0.57      0.55       383
weighted avg       0.71      0.69      0.69       383

Saved submission to /Users/aaryan/Desktop/ML_multi_class/logreg_submission.csv


Unnamed: 0,participant_id,personality_cluster
0,1005,Cluster_E
1,197,Cluster_C
2,2343,Cluster_E
3,1709,Cluster_B
4,436,Cluster_E
