In [5]:
# Core
import os
import importlib
import numpy as np
import pandas as pd
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Model / Feature selection / Scaling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, LinearSVC

# Hyperparameter search distributions
from scipy.stats import loguniform

# Metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score
)

# Saving Model
import joblib


In [6]:
# Load the pre-split feature tables; they stay as DataFrames.
X_train, X_test = (
    pd.read_csv(csv_path) for csv_path in ('X_train.csv', 'X_test.csv')
)

# Load the label files. squeeze("columns") collapses a single-column frame
# into a Series, which is the 1-D target shape the estimators are fitted on.
y_train, y_test = (
    pd.read_csv(csv_path).squeeze("columns")
    for csv_path in ('Y_train.csv', 'Y_test.csv')
)

# Sanity check: class balance of the training labels (NaN counted as its own
# bucket) followed by the label dtype.
print(y_train.value_counts(dropna=False), y_train.dtype, sep="\n")


Label
LumA      230
LumB      102
Normal     70
Basal      70
Her2       24
Name: count, dtype: int64
object


In [7]:
# Two-step imbalanced-learn pipeline: SMOTE oversampling followed by an SVC.
# The step names ('smote', 'svm') are the prefixes used to address nested
# hyperparameters during the search, e.g. `svm__C`.
smote_step = ('smote', SMOTE(random_state=42))

# The kernel given here is only the starting value; the search overrides it.
# NOTE(review): probability=True makes every fit also calibrate probability
# estimates, which adds to the cost of each fit in the search - confirm that
# predict_proba is actually needed downstream.
svm_step = ('svm', SVC(kernel='rbf', probability=True, random_state=42))

pipelineSmote = ImbPipeline(steps=[smote_step, svm_step])

In [9]:

# Randomized hyperparameter search over the SMOTE + SVC pipeline.
#
# The search space is split per kernel. `gamma` is ignored by the linear
# kernel, so sampling it for linear candidates would spend part of the
# n_iter budget on settings that differ only in an unused parameter.
# For each candidate one of the dicts is picked, then its entries are sampled.
param_dist = [
    {
        'svm__kernel': ['linear'],
        'svm__C': loguniform(1e-3, 1e3),
    },
    {
        'svm__kernel': ['rbf'],
        'svm__C': loguniform(1e-3, 1e3),
        'svm__gamma': loguniform(1e-6, 1e-1),
    },
]

rand = RandomizedSearchCV(
    estimator=pipelineSmote,
    param_distributions=param_dist,
    n_iter=50,                     # search budget: number of sampled candidates
    scoring='f1_weighted',
    cv=3,
    n_jobs=12,
    verbose=3,
    random_state=42                # makes the sampled candidates reproducible
)

# This is the expensive cell (n_iter * cv pipeline fits plus the final refit).
# `best_estimator_` only exists once fit() has run to completion, so later
# cells that use `best_model2` require this cell to finish uninterrupted.
rand.fit(X_train, y_train)
print(rand.best_params_)
best_model2 = rand.best_estimator_



Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

In [None]:
# --- Search summary -------------------------------------------------------
# One row per sampled candidate; rank by mean cross-validated score and keep
# the ten best. Candidates that did not sample a parameter (e.g. gamma for a
# linear kernel, if the search space is split per kernel) show it as missing.
results_df = pd.DataFrame(rand.cv_results_)
results_df = results_df.sort_values(by='mean_test_score', ascending=False)

top10 = results_df.head(10)

cols_to_show = ['mean_test_score', 'param_svm__C', 'param_svm__gamma', 'param_svm__kernel']
print(top10[cols_to_show])

# --- Held-out evaluation --------------------------------------------------
# `best_model2` is the pipeline refit on the full training set by the search.
y_pred = best_model2.predict(X_test)
print("\nTest Accuracy (best model):", accuracy_score(y_test, y_pred))
print("\nModel Evaluation on Test Data:")
print(classification_report(y_test, y_pred))

# --- Confusion matrix -----------------------------------------------------
# Use one explicit, sorted label set for both the matrix and the tick labels.
# Taking the ticks from y_test alone mislabels the axes whenever a class is
# predicted that does not occur in y_test, because the matrix is built over
# the union of true and predicted labels.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_labels,
            yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (SVM Model)')
plt.tight_layout()
plt.show()
