# Linear SVM with Optuna

Optuna tunes a linear-kernel SVM on the LR/SVM feature matrices, selecting hyperparameters that maximize macro F1 on the validation split.


In [1]:
import optuna
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
DATA_DIR = Path("/Users/aaryan/Desktop/ML_multi_class/preprocessed_csv")
OUTPUT_DIR = Path("/Users/aaryan/Desktop/ML_multi_class")

X_train = pd.read_csv(DATA_DIR / "X_train_lr_svm_smote.csv")
y_train = pd.read_csv(DATA_DIR / "y_train_smote.csv").squeeze()

X_val = pd.read_csv(DATA_DIR / "X_val_lr_svm.csv")
y_val = pd.read_csv(DATA_DIR / "y_val.csv").squeeze()

X_test = pd.read_csv(DATA_DIR / "X_test_lr_svm.csv")
test_ids = pd.read_csv(DATA_DIR / "test_ids.csv")

label_map = pd.read_csv(DATA_DIR / "label_encoder_mapping.csv")
encoded_to_cluster = dict(zip(label_map["encoded_value"], label_map["cluster_name"]))

print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")


Train shape: (3895, 26), Val shape: (383, 26), Test shape: (479, 26)


In [4]:
def build_pipeline(trial: optuna.Trial) -> Pipeline:
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Use LinearSVC for better performance with a linear kernel.
    # dual=False is recommended when n_samples > n_features.
    svc = LinearSVC(
        C=C,
        class_weight=class_weight,
        dual=False,
        max_iter=3000  # Increased for convergence
    )

    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", svc),
    ])


def objective(trial: optuna.Trial) -> float:
    pipeline = build_pipeline(trial)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    score = f1_score(y_val, preds, average="macro")
    trial.set_user_attr("accuracy", accuracy_score(y_val, preds))
    return score


In [5]:
study = optuna.create_study(direction="maximize", study_name="svm_linear_macro_f1")
study.optimize(objective, n_trials=35, timeout=2000)

print(f"Best macro F1: {study.best_value:.4f}")
print("Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")
print(f"Validation accuracy: {study.best_trial.user_attrs['accuracy']:.4f}")


[I 2025-11-27 15:22:30,952] A new study created in memory with name: svm_linear_macro_f1
[I 2025-11-27 15:22:31,036] Trial 0 finished with value: 0.42853408974251284 and parameters: {'C': 0.08324746011441632, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.42853408974251284.
[I 2025-11-27 15:22:31,096] Trial 1 finished with value: 0.42492134596999287 and parameters: {'C': 0.021070472298467764, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.42853408974251284.
[I 2025-11-27 15:22:31,169] Trial 2 finished with value: 0.4252696130155097 and parameters: {'C': 49.55001881986014, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.42853408974251284.
[I 2025-11-27 15:22:31,212] Trial 3 finished with value: 0.4069425714950012 and parameters: {'C': 0.001961380166245467, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.42853408974251284.
[I 2025-11-27 15:22:31,252] Trial 4 finished with value: 0.40871330237586545 and parameters: {'C': 0.0011158717513973418,

Best macro F1: 0.4352
Best params:
  C: 0.20785419085056786
  class_weight: None
Validation accuracy: 0.6136


In [6]:
fixed_trial = optuna.trial.FixedTrial(study.best_params)
best_pipeline = build_pipeline(fixed_trial)
best_pipeline.fit(pd.concat([X_train, X_val], axis=0), pd.concat([y_train, y_val], axis=0))

val_preds = best_pipeline.predict(X_val)
print(classification_report(y_val, val_preds))

submission = pd.DataFrame(
    {
        "participant_id": test_ids.squeeze(),
        "personality_cluster": [
            encoded_to_cluster[int(label)] for label in best_pipeline.predict(X_test)
        ],
    }
)

svm_linear_pred_path = OUTPUT_DIR / "svm_linear_submission.csv"
submission.to_csv(svm_linear_pred_path, index=False)
print(f"Saved submission to {svm_linear_pred_path}")

submission.head()


              precision    recall  f1-score   support

           0       0.20      0.53      0.29        17
           1       0.31      0.32      0.31        44
           2       0.33      0.21      0.26        61
           3       0.56      0.50      0.53        66
           4       0.91      0.90      0.90       195

    accuracy                           0.64       383
   macro avg       0.46      0.49      0.46       383
weighted avg       0.65      0.64      0.64       383

Saved submission to /Users/aaryan/Desktop/ML_multi_class/svm_linear_submission.csv


Unnamed: 0,participant_id,personality_cluster
0,1005,Cluster_E
1,197,Cluster_C
2,2343,Cluster_E
3,1709,Cluster_B
4,436,Cluster_E
