In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics

In [2]:
# Suppress future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Set seed for reproducibility
seed = 2019
np.random.seed(seed)

In [6]:
# Read the data in
df = pd.read_csv("diabetes.csv")
X = df.iloc[:, 0:8]  # independent variables
y = df['Outcome'].values  # dependent variables

In [7]:
# Normalize
X = StandardScaler().fit_transform(X)

In [9]:
# Evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

In [10]:
num_trees = 10
verbose = True  # to print the progress

In [11]:
# Define base classifiers
clfs = [
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=num_trees, random_state=seed),
    GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
]

In [12]:
# Creating train and test sets for blending
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

In [13]:
print('5-fold cross validation:\n')
for i, clf in enumerate(clfs):
    scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print(f"##### Base Model {i} #####")
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    
    clf.fit(X_train, y_train)
    print("Train Accuracy: %0.2f" % (metrics.accuracy_score(clf.predict(X_train), y_train)))
    
    dataset_blend_train[:, i] = clf.predict_proba(X_train)[:, 1]
    dataset_blend_test[:, i] = clf.predict_proba(X_test)[:, 1]
    print("Test Accuracy: %0.2f" % (metrics.accuracy_score(clf.predict(X_test), y_test)))

5-fold cross validation:

##### Base Model 0 #####
Train CV Accuracy: 0.73 (+/- 0.01)
Train Accuracy: 0.83
Test Accuracy: 0.75
##### Base Model 1 #####
Train CV Accuracy: 0.74 (+/- 0.02)
Train Accuracy: 0.98
Test Accuracy: 0.79
##### Base Model 2 #####
Train CV Accuracy: 0.74 (+/- 0.01)
Train Accuracy: 0.80
Test Accuracy: 0.80


In [14]:
print("##### Meta Model #####")
meta_clf = LogisticRegression()
scores = cross_val_score(meta_clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
meta_clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f" % (metrics.accuracy_score(meta_clf.predict(dataset_blend_train), y_train)))
print("Test Accuracy: %0.2f" % (metrics.accuracy_score(meta_clf.predict(dataset_blend_test), y_test)))

##### Meta Model #####
Train CV Accuracy: 0.98 (+/- 0.01)
Train Accuracy: 0.98
Test Accuracy: 0.79
