# RBF SVM with Optuna

This notebook tunes a kernel SVM (RBF) on the LR/SVM feature space using Optuna to maximize macro F1 on the validation set.


In [1]:
import optuna
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_DIR = Path("/Users/aaryan/Desktop/ML_multi_class/preprocessed_csv")
OUTPUT_DIR = Path("/Users/aaryan/Desktop/ML_multi_class")

X_train = pd.read_csv(DATA_DIR / "X_train_lr_svm_smote.csv")
y_train = pd.read_csv(DATA_DIR / "y_train_smote.csv").squeeze()

X_val = pd.read_csv(DATA_DIR / "X_val_lr_svm.csv")
y_val = pd.read_csv(DATA_DIR / "y_val.csv").squeeze()

X_test = pd.read_csv(DATA_DIR / "X_test_lr_svm.csv")
test_ids = pd.read_csv(DATA_DIR / "test_ids.csv")

label_map = pd.read_csv(DATA_DIR / "label_encoder_mapping.csv")
encoded_to_cluster = dict(zip(label_map["encoded_value"], label_map["cluster_name"]))

print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")


Train shape: (3895, 26), Val shape: (383, 26), Test shape: (479, 26)


In [3]:
def build_pipeline(trial: optuna.Trial) -> Pipeline:
    C = trial.suggest_float("C", 1e-2, 1e3, log=True)
    gamma = trial.suggest_float("gamma", 1e-4, 1.0, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    svc = SVC(
        kernel="rbf",
        C=C,
        gamma=gamma,
        probability=True,
        class_weight=class_weight,
    )

    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", svc),
    ])


def objective(trial: optuna.Trial) -> float:
    pipeline = build_pipeline(trial)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    score = f1_score(y_val, preds, average="macro")
    trial.set_user_attr("accuracy", accuracy_score(y_val, preds))
    return score


In [4]:
study = optuna.create_study(direction="maximize", study_name="svm_rbf_macro_f1")
study.optimize(objective, n_trials=40, timeout=2400)

print(f"Best macro F1: {study.best_value:.4f}")
print("Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")
print(f"Validation accuracy: {study.best_trial.user_attrs['accuracy']:.4f}")


[I 2025-11-27 14:32:01,565] A new study created in memory with name: svm_rbf_macro_f1
[I 2025-11-27 14:32:04,068] Trial 0 finished with value: 0.5163293851208872 and parameters: {'C': 174.46410072859095, 'gamma': 0.000216773437364244, 'class_weight': None}. Best is trial 0 with value: 0.5163293851208872.
[I 2025-11-27 14:32:09,989] Trial 1 finished with value: 0.3569780869903467 and parameters: {'C': 0.031946729450414595, 'gamma': 0.010387135257988017, 'class_weight': None}. Best is trial 0 with value: 0.5163293851208872.
[I 2025-11-27 14:32:12,419] Trial 2 finished with value: 0.4440320174229086 and parameters: {'C': 498.9597813942119, 'gamma': 0.052464376550094796, 'class_weight': None}. Best is trial 0 with value: 0.5163293851208872.
[I 2025-11-27 14:32:20,590] Trial 3 finished with value: 0.32067468947851724 and parameters: {'C': 0.07288821483730164, 'gamma': 0.0001071468492585642, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5163293851208872.
[I 2025-11-27 14:32:25,6

Best macro F1: 0.5478
Best params:
  C: 7.936983166476877
  gamma: 0.0010206730072738741
  class_weight: balanced
Validation accuracy: 0.6867


In [5]:
fixed_trial = optuna.trial.FixedTrial(study.best_params)
best_pipeline = build_pipeline(fixed_trial)
best_pipeline.fit(pd.concat([X_train, X_val], axis=0), pd.concat([y_train, y_val], axis=0))

val_preds = best_pipeline.predict(X_val)
print(classification_report(y_val, val_preds))

submission = pd.DataFrame(
    {
        "participant_id": test_ids.squeeze(),
        "personality_cluster": [
            encoded_to_cluster[int(label)] for label in best_pipeline.predict(X_test)
        ],
    }
)

svm_rbf_pred_path = OUTPUT_DIR / "svm_rbf_submission.csv"
submission.to_csv(svm_rbf_pred_path, index=False)
print(f"Saved submission to {svm_rbf_pred_path}")

submission.head()


              precision    recall  f1-score   support

           0       0.24      0.47      0.32        17
           1       0.54      0.45      0.49        44
           2       0.57      0.39      0.47        61
           3       0.52      0.76      0.61        66
           4       0.94      0.84      0.89       195

    accuracy                           0.69       383
   macro avg       0.56      0.58      0.56       383
weighted avg       0.73      0.69      0.70       383

Saved submission to /Users/aaryan/Desktop/ML_multi_class/svm_rbf_submission.csv


Unnamed: 0,participant_id,personality_cluster
0,1005,Cluster_E
1,197,Cluster_C
2,2343,Cluster_E
3,1709,Cluster_B
4,436,Cluster_E
