In [38]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import RandomizedSearchCV

# ML imports
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.svm import LinearSVC,SVC
from sklearn.feature_selection import SelectFromModel

# SAMPLERS
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [39]:
# =============================================
# Load Data
# =============================================
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (1913, 14)
Test shape: (479, 13)


In [40]:

# ============================================================
# Preprocessing Setup
# ============================================================
X = train.drop(columns=['participant_id', 'personality_cluster'])
y = train['personality_cluster']

X_test_final = test.drop(columns=['participant_id'])

numerical_cols = ['focus_intensity', 'consistency_score']

nominal_cols = [
    'identity_code', 'cultural_background', 'upbringing_influence', 
    'external_guidance_usage', 'support_environment_score',
    'hobby_engagement_level', 'physical_activity_index',
    'creative_expression_index', 'altruism_score'
]

ordinal_cols = ['age_group']

# Pipelines
numerical_pipeline = Pipeline([('scaler', StandardScaler())])
nominal_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
ordinal_pipeline = Pipeline([('ordinal', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('nom', nominal_pipeline, nominal_cols),
        ('ord', ordinal_pipeline, ordinal_cols),
    ],
    remainder='drop'
)

# Fit-transform
X_processed = preprocessor.fit_transform(X)
X_test_final_processed = preprocessor.transform(X_test_final)

# Encode multi-class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train validation split
X_train_processed, X_test_val_processed, y_train, y_test_val = train_test_split(
    X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Target classes mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


Target classes mapping: {'Cluster_A': np.int64(0), 'Cluster_B': np.int64(1), 'Cluster_C': np.int64(2), 'Cluster_D': np.int64(3), 'Cluster_E': np.int64(4)}


In [41]:
# ============================================================
# Sampling Strategies
# ============================================================

samplers = {
    'No_Sampling': None,
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42)
}


In [42]:
# ============================
# Feature Selection + SVM
# ============================
svm_pipeline_core = Pipeline([
    ("feature_select", SelectFromModel(
        LinearSVC(C=0.1, penalty="l1", dual=False, random_state=42, max_iter=5000)
    )),
    ("svm", SVC(kernel="linear", probability=True, random_state=42))
])

# Randomized Search Param Grid
param_dist = {
    "svm__C": [0.01, 0.1, 1, 10, 50]
}

best_models = {}
results = {}
test_predictions = {}

print("\n================= SVM MULTI-CLASS TRAINING =================\n")

for name, sampler in samplers.items():
    print(f"\n--- Training with {name} ---")

    if sampler:
        model_pipeline = ImbPipeline(steps=[
            ("sampler", sampler),
            ("svm_block", svm_pipeline_core)
        ])
        param_search = {"svm_block__svm__C": param_dist["svm__C"]}
    else:
        model_pipeline = svm_pipeline_core
        param_search = param_dist

    # ======================
    # Randomized Search
    # ======================
    search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=param_search,
        n_iter=5,           # keep small for speed
        scoring="roc_auc_ovr",
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    # Fit the model
    search.fit(X_train_processed, y_train)

    best_model = search.best_estimator_
    best_models[name] = best_model

    print("Best Params:", search.best_params_)

    # ======================
    # Validation predictions
    # ======================
    val_proba = best_model.predict_proba(X_test_val_processed)
    y_val_pred = np.argmax(val_proba, axis=1)

    acc = accuracy_score(y_test_val, y_val_pred)
    auc = roc_auc_score(y_test_val, val_proba, multi_class="ovr")

    print(f"Validation Accuracy: {acc:.4f}")
    print(f"Validation ROC-AUC: {auc:.4f}")

    results[name] = {"Val_Accuracy": acc, "Val_ROC_AUC": auc}

    # ======================
    # TEST predictions
    # ======================
    test_proba = best_model.predict_proba(X_test_final_processed)
    test_pred_labels = label_encoder.inverse_transform(np.argmax(test_proba, axis=1))

    # Save separate CSV for each sampler
    submission = pd.DataFrame({
        "participant_id": test["participant_id"],
        "personality_cluster": test_pred_labels
    })

    file_name = f"svm_multiclass_submission_{name}.csv"
    submission.to_csv(file_name, index=False)
    print(f"Saved → {file_name}")
    test_predictions[name] = test_pred_labels

# ======================
# Summary
# ======================
summary_df = pd.DataFrame(results).T.sort_values(by="Val_ROC_AUC", ascending=False)
print("\n===== SUMMARY OF RESULTS =====")
print(summary_df)

best_strategy = summary_df.index[0]
final_model = best_models[best_strategy]
print(f"\nBest Model Selected → {best_strategy}")




--- Training with No_Sampling ---
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params: {'svm__C': 1}
Validation Accuracy: 0.7285
Validation ROC-AUC: 0.8874
Saved → svm_multiclass_submission_No_Sampling.csv

--- Training with ROS ---
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params: {'svm_block__svm__C': 0.1}
Validation Accuracy: 0.6684
Validation ROC-AUC: 0.8524
Saved → svm_multiclass_submission_ROS.csv

--- Training with SMOTE ---
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params: {'svm_block__svm__C': 0.1}
Validation Accuracy: 0.6815
Validation ROC-AUC: 0.8568
Saved → svm_multiclass_submission_SMOTE.csv

--- Training with ADASYN ---
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Params: {'svm_block__svm__C': 0.1}
Validation Accuracy: 0.6762
Validation ROC-AUC: 0.8561
Saved → svm_multiclass_submission_ADASYN.csv

===== SUMMARY OF RESULTS =====
             Val_Accuracy  Val_ROC_AUC
No_Sampling      0