# SVM testing

In [1]:
from sklearn.svm import SVC 
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd 
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import shap
import matplotlib.pyplot  as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
jee = pd.read_csv("../01_Data/03_final/JEE_Dropout_Final.csv", delimiter=',')
jee.head()

Unnamed: 0,jee_main_score,school_board,class_12_percent,attempt_count,coaching_institute,daily_study_hours,family_income,parent_education,location_type,peer_pressure_level,mental_health_issues,admission_taken,dropout,Income vs Admission,PSxIA,peer_focused_mh,parental_support
0,78.95,1,70.09,1,0,5.4,0,0,2,0,0,0,1,0,0.0,0,1.62
1,70.06,0,78.0,1,0,5.5,1,0,2,0,1,0,0,1,1.65,1,1.65
2,81.07,2,64.36,1,0,7.0,0,3,1,1,1,0,1,0,0.0,3,2.7
3,93.32,2,73.21,1,0,2.1,0,1,1,1,1,1,0,3,2.49,3,0.83
4,68.72,1,89.02,1,0,6.3,1,2,1,2,0,1,0,4,9.16,4,2.29


In [3]:
X = jee.drop(["admission_taken","Income vs Admission","PSxIA","dropout"], axis=1)
Y = jee["dropout"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [22]:
from sklearn.metrics import f1_score
pipeline = Pipeline([
    ('smote',SMOTE()),
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf',C=2,probability=True))
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy: ", f1_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

pipeline = Pipeline([
    ('smote',SMOTE()),
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear',C=2,probability=True))
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy: ", f1_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer

scorer = make_scorer(f1_score,average='macro')
pipeline = Pipeline([
    ('smote',SMOTE()),
    ('scaler', StandardScaler()),
    ('rfc', SVC(probability=True,random_state=42))
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy: ", f1_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

cv= StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline,X, Y, scoring=scorer, cv=cv)

print(scores)
print()
print(scores.mean())

Accuracy:  0.6424474187380497
              precision    recall  f1-score   support

           0       0.94      0.81      0.87       793
           1       0.53      0.81      0.64       207

    accuracy                           0.81      1000
   macro avg       0.74      0.81      0.76      1000
weighted avg       0.86      0.81      0.83      1000

Accuracy:  0.6678700361010831
              precision    recall  f1-score   support

           0       0.97      0.80      0.87       793
           1       0.53      0.89      0.67       207

    accuracy                           0.82      1000
   macro avg       0.75      0.84      0.77      1000
weighted avg       0.88      0.82      0.83      1000

Accuracy:  0.6832740213523132
              precision    recall  f1-score   support

           0       0.98      0.79      0.88       793
           1       0.54      0.93      0.68       207

    accuracy                           0.82      1000
   macro avg       0.76      0.86     

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer

scorer = make_scorer(f1_score,average='macro')
pipeline = Pipeline([
    ('smote',SMOTE()),
    ('scaler', StandardScaler()),
    ('rfc', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy: ", f1_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

cv= StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline,X, Y, scoring=scorer, cv=cv)

print(scores)
print()
print(scores.mean())


Accuracy:  0.6290322580645161
              precision    recall  f1-score   support

           0       0.93      0.83      0.88       793
           1       0.54      0.75      0.63       207

    accuracy                           0.82      1000
   macro avg       0.73      0.79      0.75      1000
weighted avg       0.85      0.82      0.83      1000

[0.77640482 0.78466242 0.77204418 0.78890038 0.79462398]

0.7833271574016921


In [8]:
import shap
import matplotlib.pyplot as plt

# 1. Transform your training data through SMOTE + scaling
X_res, y_res = pipeline.named_steps['smote'].fit_resample(X_train, y_train)
X_res_scaled = pipeline.named_steps['scaler'].fit_transform(X_res)

# 2. Get the trained SVM model
svm_model = pipeline.named_steps['svm']

# 3. Select a small background dataset for SHAP speed
background = shap.sample(X_res_scaled, 50, random_state=42)

# 4. Create the SHAP explainer using predict_proba
explainer = shap.KernelExplainer(svm_model.predict_proba, background)

# 5. Explain a subset of the test set (scaled in the same way)
X_test_scaled = pipeline.named_steps['scaler'].transform(X_test)
shap_values = explainer.shap_values(X_test_scaled[:100])

# 6. Plot summary for positive class (class index 1)
shap.summary_plot(shap_values[1], X_test_scaled[:100], feature_names=X_train.columns)
plt.show()


  1%|          | 1/100 [00:19<31:50, 19.29s/it]


KeyboardInterrupt: 

In [9]:
import shap
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Example: Assuming you already have df with target column 'target'
# Train-test split


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train SVM
model = SVC(kernel="rbf", probability=True, random_state=42)
model.fit(X_train_res, y_train_res)

# Evaluate
print(classification_report(y_test, model.predict(X_test)))

# ---- SHAP PART ----
# Take a small sample to speed things up
X_sample = X_train_res.sample(200, random_state=42)

# Use the universal fast explainer
explainer = shap.Explainer(model, X_sample)
shap_values = explainer(X_sample)

# Summary plot
shap.summary_plot(shap_values, X_sample, feature_names=X.columns)
plt.show()


              precision    recall  f1-score   support

           0       0.97      0.70      0.81       804
           1       0.42      0.90      0.57       196

    accuracy                           0.74      1000
   macro avg       0.69      0.80      0.69      1000
weighted avg       0.86      0.74      0.76      1000



TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: SVC(probability=True, random_state=42)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)