In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the dataset and separate the 'diabetes' target column from the features.
df = pd.read_csv('data/diabetes_prediction_dataset.csv')
y = df['diabetes']
X = df.drop(columns=['diabetes'])

# Split the feature columns by dtype: object columns are treated as
# categorical, every other column as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

# handle_unknown='ignore': this preprocessor is refit on every training split
# (including each cross-validation fold). With the default
# handle_unknown='error', a category that is absent from the rows used for
# fitting but present in the rows being transformed raises an error.
# With 'ignore' such a category is encoded as an all-zero row instead.
onehot = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# One-hot encode the categorical columns and z-score the numerical ones.
preprocessor = ColumnTransformer([
    ('onehot', onehot, categorical_columns),
    ('zscore', scaler, numerical_columns)
])

# Part 1: SVC Model

In [4]:
# Draw a seeded 10,000-row random sample of the dataset and split it into
# target and features.
df_10000 = df.sample(n=10000, random_state=38)
y_10000 = df_10000['diabetes']
X_10000 = df_10000.drop(columns=['diabetes'])

# Preprocessing followed by a support vector classifier; the grid search
# below tunes the 'classifier' step.
pipe_SVC = Pipeline([
    ('make_futures', preprocessor),
    ('classifier', SVC())
])

# One sub-grid per kernel so that each kernel is only combined with the
# hyperparameters listed for it:
#   linear: 11 values of C                       -> 11 candidates
#   poly:   5 gamma x 3 degree (2, 3, 4) x 5 C   -> 75 candidates
#   rbf:    9 gamma x 9 C                        -> 81 candidates
search_space_SVC = [{'classifier__kernel': ['linear'],
                 'classifier__C': np.logspace(-3, 2, 11)
                },
                {'classifier__kernel': ['poly'],
                 'classifier__gamma': np.logspace(-3, 2, 5),
                 'classifier__degree': range(2,5),
                 'classifier__C': np.logspace(-3, 2, 5)
                },
                {'classifier__kernel': ['rbf'],
                 'classifier__gamma': np.logspace(-3, 2, 9),
                 'classifier__C': np.logspace(-3, 2, 9)
                },
                ]

# n_jobs=-1: the candidate/fold fits are independent of each other, so run
# them on all available cores. This does not change the selected model.
best_model_SVC = GridSearchCV(pipe_SVC, search_space_SVC, cv=5, verbose=1,
                              n_jobs=-1)

# stratify=y_10000 keeps the class proportions of the target equal in the
# train and test parts, consistent with the stratified folds that
# GridSearchCV uses for classifiers when cv is an integer.
X_train, X_test, y_train, y_test = train_test_split(X_10000, y_10000,
                                                    test_size=0.25, random_state=38,
                                                    stratify=y_10000)

In [5]:
# Baseline: the same preprocessing + SVC pipeline with the classifier left at
# its default hyperparameters, fitted on the training split.
base_model_SVC = Pipeline(steps=[
    ('make_futures', preprocessor),
    ('classifier', SVC()),
]).fit(X_train, y_train)

# Predictions of the baseline on the held-out split.
y_pred = base_model_SVC.predict(X_test)

# Collect the true labels next to the baseline predictions; the frame keeps
# the index of y_test.
df_SVC = pd.DataFrame({'label': y_test}).assign(predict_base=y_pred)

In [6]:
%time best_model_SVC.fit(X_train, y_train)

y_pred = best_model_SVC.predict(X_test)
df_SVC['predict_best'] = y_pred

Fitting 5 folds for each of 167 candidates, totalling 835 fits
