In [47]:
import joblib
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_csv('diabetes.csv')

In [5]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=42, stratify=y)

In [36]:
# SVM Pipline
svm_pipeline = Pipeline([
    ('skew_fix', PowerTransformer(method='yeo-johnson')), # skew fix
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=1, gamma=0.01, probability=True,
                random_state=42))
])
# Decision Tree Pipline
dt_pipeline = Pipeline([
    ('dt', DecisionTreeClassifier(max_depth=3, min_samples_leaf=5,
                                  random_state=42))
])  # Tree algorithms doesn't need feature scaling
# These Pipelines were rebuilt to on the GridSearch best_params

In [29]:
# SVM GridSearchCV
svm_best_params = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [1, 0.1, 0.01, 0.001],
}
svm_grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_best_params,
    cv=5,
    n_jobs=1,
)

svm_grid.fit(x_train, y_train)

In [35]:
tree_params = {
    "dt__max_depth": [3, 5, 10, None],
    "dt__min_samples_split": [2, 5, 10],
    "dt__min_samples_leaf": [1, 2, 5]
}
tree_grid = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=tree_params,
    cv=5,
    n_jobs=1,
)

tree_grid.fit(x_train, y_train)


In [41]:
print(f'SVM Best Params: {svm_grid.best_params_}')
print(f"Decision Tree Best Params: {tree_grid.best_params_}")

SVM Best Params: {'svm__C': 1, 'svm__gamma': 0.01}
Decision Tree Best Params: {'dt__max_depth': 3, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 2}


In [37]:
svm_pipeline.fit(x_train, y_train)


In [38]:
dt_pipeline.fit(x_train, y_train)

In [48]:
# SVM Score
svm_score = svm_pipeline.score(x_test, y_test)
# Decision Tree Score
dt_score = dt_pipeline.score(x_test, y_test)
print(f'SVM Score: {svm_score}')
print(f'Decision Tree Score: {dt_score}')

SVM Score: 0.7402597402597403
Decision Tree Score: 0.7012987012987013


In [24]:
y_pred_svm = svm_pipeline.predict(x_test)
y_pred_dt = dt_pipeline.predict(x_test)

In [44]:
# SVM Classification Report
svm_report = pd.DataFrame(classification_report(y_test, y_pred_svm,
                                            output_dict=True)).T
display(svm_report.round(3))  # for readability

Unnamed: 0,precision,recall,f1-score,support
0,0.769,0.83,0.798,100.0
1,0.63,0.537,0.58,54.0
accuracy,0.727,0.727,0.727,0.727
macro avg,0.699,0.684,0.689,154.0
weighted avg,0.72,0.727,0.722,154.0


In [46]:
# Decision Tree Classification Report
dt_report = pd.DataFrame(classification_report(y_test, y_pred_dt,
                                             output_dict=True)).T
display(dt_report.round(3))

Unnamed: 0,precision,recall,f1-score,support
0,0.759,0.85,0.802,100.0
1,0.643,0.5,0.562,54.0
accuracy,0.727,0.727,0.727,0.727
macro avg,0.701,0.675,0.682,154.0
weighted avg,0.718,0.727,0.718,154.0


In [49]:
# Saving the SVM Model for future use
joblib.dump(svm_pipeline, 'svm_diabetes_prediction_model.pkl')

['svm_diabetes_prediction_model.pkl']