# RBF SVM with Custom Data Split

This notebook implements a custom data split strategy:
- Uses only 20% of the original training data
- From that 20%, uses 80% for training and validation
- The remaining 20% is used for evaluation
- Final predictions are made on test.csv


In [1]:
import optuna
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Load and Split Data

In [2]:
# Define paths
DATA_DIR = Path("../dataset")
OUTPUT_DIR = Path(".")

# Load full training data
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

print(f"Full training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Separate features and target
X_full = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y_full = train_df['personality_cluster']
X_test = test_df.drop(['participant_id'], axis=1)
test_ids = test_df['participant_id']

# Encode labels
label_encoder = LabelEncoder()
y_full_encoded = label_encoder.fit_transform(y_full)

print(f"\nLabel classes: {label_encoder.classes_}")
print(f"Encoded values: {np.unique(y_full_encoded)}")

Full training data shape: (1913, 14)
Test data shape: (479, 13)

Label classes: ['Cluster_A' 'Cluster_B' 'Cluster_C' 'Cluster_D' 'Cluster_E']
Encoded values: [0 1 2 3 4]


In [3]:
# Step 1: Take only 20% of the training data
X_20pct, _, y_20pct, _ = train_test_split(
    X_full, y_full_encoded, 
    test_size=0.8, 
    random_state=42, 
    stratify=y_full_encoded
)

print(f"\n20% of training data shape: {X_20pct.shape}")
print(f"Class distribution in 20% data:")
print(pd.Series(y_20pct).value_counts().sort_index())


20% of training data shape: (382, 12)
Class distribution in 20% data:
0     17
1     44
2     61
3     66
4    194
Name: count, dtype: int64


In [4]:
# Step 2: From the 20%, split into 80% train/val and 20% evaluation
X_train_val, X_eval, y_train_val, y_eval = train_test_split(
    X_20pct, y_20pct,
    test_size=0.2,
    random_state=42,
    stratify=y_20pct
)

print(f"\nTrain/Val set shape: {X_train_val.shape} (80% of 20%)")
print(f"Evaluation set shape: {X_eval.shape} (20% of 20%)")
print(f"\nClass distribution in train/val:")
print(pd.Series(y_train_val).value_counts().sort_index())
print(f"\nClass distribution in evaluation:")
print(pd.Series(y_eval).value_counts().sort_index())


Train/Val set shape: (305, 12) (80% of 20%)
Evaluation set shape: (77, 12) (20% of 20%)

Class distribution in train/val:
0     13
1     35
2     49
3     53
4    155
Name: count, dtype: int64

Class distribution in evaluation:
0     4
1     9
2    12
3    13
4    39
Name: count, dtype: int64


In [5]:
# Step 3: Further split train/val into train and validation for hyperparameter tuning
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2,
    random_state=42,
    stratify=y_train_val
)

print(f"\nFinal data split:")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Evaluation set: {X_eval.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTotal samples used for training: {X_train.shape[0] + X_val.shape[0]} ({(X_train.shape[0] + X_val.shape[0])/len(X_full)*100:.1f}% of original)")
print(f"Total samples used for evaluation: {X_eval.shape[0]} ({X_eval.shape[0]/len(X_full)*100:.1f}% of original)")


Final data split:
Training set: (244, 12)
Validation set: (61, 12)
Evaluation set: (77, 12)
Test set: (479, 12)

Total samples used for training: 305 (15.9% of original)
Total samples used for evaluation: 77 (4.0% of original)


## Hyperparameter Tuning with Optuna

In [6]:
def build_pipeline(trial: optuna.Trial) -> Pipeline:
    C = trial.suggest_float("C", 1e-2, 1e3, log=True)
    gamma = trial.suggest_float("gamma", 1e-4, 1.0, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    svc = SVC(
        kernel="rbf",
        C=C,
        gamma=gamma,
        probability=True,
        class_weight=class_weight,
        random_state=42
    )

    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", svc),
    ])


def objective(trial: optuna.Trial) -> float:
    pipeline = build_pipeline(trial)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    score = f1_score(y_val, preds, average="macro")
    trial.set_user_attr("accuracy", accuracy_score(y_val, preds))
    return score

In [7]:
# Run hyperparameter optimization
study = optuna.create_study(direction="maximize", study_name="svm_rbf_20pct_macro_f1")
study.optimize(objective, n_trials=40, timeout=2400)

print(f"Best macro F1 (validation): {study.best_value:.4f}")
print("Best params:")
for k, v in study.best_trial.params.items():
    print(f"  {k}: {v}")
print(f"Validation accuracy: {study.best_trial.user_attrs['accuracy']:.4f}")

[I 2025-12-02 22:29:56,017] A new study created in memory with name: svm_rbf_20pct_macro_f1


[I 2025-12-02 22:29:56,048] Trial 0 finished with value: 0.33039321104786 and parameters: {'C': 5.378814077771832, 'gamma': 0.0008636880377313632, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.33039321104786.


[I 2025-12-02 22:29:56,076] Trial 1 finished with value: 0.17262737262737265 and parameters: {'C': 0.62589385608747, 'gamma': 0.38660378277370366, 'class_weight': None}. Best is trial 0 with value: 0.33039321104786.


[I 2025-12-02 22:29:56,102] Trial 2 finished with value: 0.21714285714285714 and parameters: {'C': 19.851020512832942, 'gamma': 0.00012518919438983184, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.33039321104786.


[I 2025-12-02 22:29:56,128] Trial 3 finished with value: 0.33805250305250306 and parameters: {'C': 0.3353060081249283, 'gamma': 0.02143930118458674, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,157] Trial 4 finished with value: 0.21009739614390774 and parameters: {'C': 1.8425972993402562, 'gamma': 0.4633726199594021, 'class_weight': None}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,184] Trial 5 finished with value: 0.2282051282051282 and parameters: {'C': 0.03193507839086434, 'gamma': 0.018880947049703188, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,201] Trial 6 finished with value: 0.13478260869565217 and parameters: {'C': 0.10273007638930978, 'gamma': 0.002856865181157683, 'class_weight': None}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,218] Trial 7 finished with value: 0.13478260869565217 and parameters: {'C': 0.512431357105251, 'gamma': 0.0004550508171669966, 'class_weight': None}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,249] Trial 8 finished with value: 0.21969696969696967 and parameters: {'C': 348.5700620269606, 'gamma': 0.9377025479736626, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,275] Trial 9 finished with value: 0.30728160728160725 and parameters: {'C': 4.8250436638373815, 'gamma': 0.0009068612626050924, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,306] Trial 10 finished with value: 0.041176470588235294 and parameters: {'C': 0.019140191214420278, 'gamma': 0.032436105930671165, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.33805250305250306.


[I 2025-12-02 22:29:56,330] Trial 11 finished with value: 0.437274830880969 and parameters: {'C': 36.656564492085, 'gamma': 0.0054338952490279304, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.437274830880969.


[I 2025-12-02 22:29:56,359] Trial 12 finished with value: 0.432053872053872 and parameters: {'C': 93.31412929607848, 'gamma': 0.006193386455360446, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.437274830880969.


[I 2025-12-02 22:29:56,388] Trial 13 finished with value: 0.432053872053872 and parameters: {'C': 134.5503262128645, 'gamma': 0.004671719677472065, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.437274830880969.


[I 2025-12-02 22:29:56,415] Trial 14 finished with value: 0.44721718088324264 and parameters: {'C': 55.86045350117925, 'gamma': 0.05908587987296902, 'class_weight': 'balanced'}. Best is trial 14 with value: 0.44721718088324264.


[I 2025-12-02 22:29:56,442] Trial 15 finished with value: 0.41159767006448933 and parameters: {'C': 949.270748476329, 'gamma': 0.09489373836920369, 'class_weight': 'balanced'}. Best is trial 14 with value: 0.44721718088324264.


[I 2025-12-02 22:29:56,470] Trial 16 finished with value: 0.46924870897884396 and parameters: {'C': 32.21857253110989, 'gamma': 0.06447124927485022, 'class_weight': 'balanced'}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,499] Trial 17 finished with value: 0.4352226869176022 and parameters: {'C': 26.290050412201637, 'gamma': 0.0916628886174887, 'class_weight': 'balanced'}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,528] Trial 18 finished with value: 0.3386549707602339 and parameters: {'C': 9.722078618743067, 'gamma': 0.1130723207490084, 'class_weight': None}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,556] Trial 19 finished with value: 0.43496918767507003 and parameters: {'C': 124.83559810618215, 'gamma': 0.040908779055191816, 'class_weight': 'balanced'}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,587] Trial 20 finished with value: 0.30357686453576865 and parameters: {'C': 650.6784579234088, 'gamma': 0.2447513368799747, 'class_weight': 'balanced'}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,613] Trial 21 finished with value: 0.45214045214045206 and parameters: {'C': 35.978859602795424, 'gamma': 0.011584580888445812, 'class_weight': 'balanced'}. Best is trial 16 with value: 0.46924870897884396.


[I 2025-12-02 22:29:56,642] Trial 22 finished with value: 0.4692982456140351 and parameters: {'C': 56.73470062643375, 'gamma': 0.01476615094650684, 'class_weight': 'balanced'}. Best is trial 22 with value: 0.4692982456140351.


[I 2025-12-02 22:29:56,677] Trial 23 finished with value: 0.4307513348588863 and parameters: {'C': 219.91997416774467, 'gamma': 0.012062483555457823, 'class_weight': 'balanced'}. Best is trial 22 with value: 0.4692982456140351.


[I 2025-12-02 22:29:56,702] Trial 24 finished with value: 0.3736813186813187 and parameters: {'C': 14.675735110362561, 'gamma': 0.0024045331917238208, 'class_weight': 'balanced'}. Best is trial 22 with value: 0.4692982456140351.


[I 2025-12-02 22:29:56,731] Trial 25 finished with value: 0.4746081504702194 and parameters: {'C': 52.935459256369406, 'gamma': 0.020702889902997283, 'class_weight': 'balanced'}. Best is trial 25 with value: 0.4746081504702194.


[I 2025-12-02 22:29:56,760] Trial 26 finished with value: 0.3326292319427331 and parameters: {'C': 2.4680755390707345, 'gamma': 0.15966085379336875, 'class_weight': None}. Best is trial 25 with value: 0.4746081504702194.


[I 2025-12-02 22:29:56,787] Trial 27 finished with value: 0.42105263157894735 and parameters: {'C': 327.2117333620954, 'gamma': 0.047859047926906095, 'class_weight': 'balanced'}. Best is trial 25 with value: 0.4746081504702194.


[I 2025-12-02 22:29:56,817] Trial 28 finished with value: 0.5301309468723018 and parameters: {'C': 65.01295754045863, 'gamma': 0.02074743006293025, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:56,843] Trial 29 finished with value: 0.4391138273491214 and parameters: {'C': 9.717480137564282, 'gamma': 0.0011980444431947128, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:56,872] Trial 30 finished with value: 0.494790782930711 and parameters: {'C': 69.32532171204588, 'gamma': 0.022102517271227685, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:56,902] Trial 31 finished with value: 0.4916525784963949 and parameters: {'C': 78.23973950869195, 'gamma': 0.02127316584527007, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:56,931] Trial 32 finished with value: 0.49975715365979356 and parameters: {'C': 92.04767102869764, 'gamma': 0.027460344480254018, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:56,960] Trial 33 finished with value: 0.467070838765754 and parameters: {'C': 160.82165190615385, 'gamma': 0.024924533108665256, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,003] Trial 34 finished with value: 0.4256782106782106 and parameters: {'C': 536.9197810497843, 'gamma': 0.008414386836006102, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,027] Trial 35 finished with value: 0.4366666666666667 and parameters: {'C': 83.84174092504381, 'gamma': 0.0028084252450596716, 'class_weight': None}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,052] Trial 36 finished with value: 0.41725926794015394 and parameters: {'C': 1.272466423026552, 'gamma': 0.03363491201035735, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,081] Trial 37 finished with value: 0.31730008984725966 and parameters: {'C': 14.671208949209245, 'gamma': 0.0002695985526767891, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,102] Trial 38 finished with value: 0.43921052631578944 and parameters: {'C': 5.439646212985039, 'gamma': 0.008829867173076615, 'class_weight': None}. Best is trial 28 with value: 0.5301309468723018.


[I 2025-12-02 22:29:57,132] Trial 39 finished with value: 0.041176470588235294 and parameters: {'C': 0.10385708221974951, 'gamma': 0.17560769971808457, 'class_weight': 'balanced'}. Best is trial 28 with value: 0.5301309468723018.


Best macro F1 (validation): 0.5301
Best params:
  C: 65.01295754045863
  gamma: 0.02074743006293025
  class_weight: balanced
Validation accuracy: 0.6393


## Train Final Model and Evaluate

In [8]:
# Build best model
fixed_trial = optuna.trial.FixedTrial(study.best_params)
best_pipeline = build_pipeline(fixed_trial)

# Train on combined train+val data (80% of 20%)
best_pipeline.fit(X_train_val, y_train_val)

# Evaluate on validation set
val_preds = best_pipeline.predict(X_val)
print("Validation Set Performance:")
print(classification_report(y_val, val_preds, target_names=label_encoder.classes_))

# Evaluate on the held-out evaluation set (20% of 20%)
eval_preds = best_pipeline.predict(X_eval)
eval_f1 = f1_score(y_eval, eval_preds, average="macro")
eval_acc = accuracy_score(y_eval, eval_preds)

print("\n" + "="*60)
print("EVALUATION SET PERFORMANCE (20% of 20% - Final Metrics)")
print("="*60)
print(f"Macro F1: {eval_f1:.4f}")
print(f"Accuracy: {eval_acc:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_eval, eval_preds, target_names=label_encoder.classes_))

Validation Set Performance:
              precision    recall  f1-score   support

   Cluster_A       1.00      1.00      1.00         3
   Cluster_B       0.88      1.00      0.93         7
   Cluster_C       1.00      1.00      1.00        10
   Cluster_D       0.83      1.00      0.91        10
   Cluster_E       1.00      0.90      0.95        31

    accuracy                           0.95        61
   macro avg       0.94      0.98      0.96        61
weighted avg       0.96      0.95      0.95        61


EVALUATION SET PERFORMANCE (20% of 20% - Final Metrics)
Macro F1: 0.5830
Accuracy: 0.6364

Detailed Classification Report:
              precision    recall  f1-score   support

   Cluster_A       0.75      0.75      0.75         4
   Cluster_B       0.62      0.56      0.59         9
   Cluster_C       0.28      0.42      0.33        12
   Cluster_D       0.45      0.38      0.42        13
   Cluster_E       0.86      0.79      0.83        39

    accuracy                     

## Generate Predictions on Test Set

In [9]:
# Make predictions on test.csv
test_preds = best_pipeline.predict(X_test)
test_labels = label_encoder.inverse_transform(test_preds)

# Create submission file
submission = pd.DataFrame(
    {
        "participant_id": test_ids,
        "personality_cluster": test_labels,
    }
)

submission_path = OUTPUT_DIR / "svm_rbf_20pct_submission.csv"
submission.to_csv(submission_path, index=False)
print(f"\nSaved submission to {submission_path}")

print("\nSubmission preview:")
print(submission.head(10))
print(f"\nPrediction distribution:")
print(submission['personality_cluster'].value_counts().sort_index())


Saved submission to svm_rbf_20pct_submission.csv

Submission preview:
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_C
2            2343           Cluster_D
3            1709           Cluster_A
4             436           Cluster_E
5             322           Cluster_A
6            1473           Cluster_C
7            1704           Cluster_C
8             901           Cluster_E
9            1058           Cluster_E

Prediction distribution:
personality_cluster
Cluster_A     25
Cluster_B     58
Cluster_C     79
Cluster_D     96
Cluster_E    221
Name: count, dtype: int64


## Summary

This notebook implements the following data split strategy:
1. **20% sampling**: Used only 20% of the original training data
2. **80/20 split**: From the 20%, split into:
   - 80% for training and validation (with further 80/20 split for hyperparameter tuning)
   - 20% for final evaluation metrics
3. **Test predictions**: Generated predictions on test.csv using the best model

The evaluation metrics on the held-out 20% evaluation set provide an unbiased estimate of model performance.