In [61]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import math

import keras
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, Conv1D, LSTM, SimpleRNN
from keras.layers import GlobalMaxPooling1D

In [62]:
def NN(X_train, y_train, validation, test):
    print("NN")
    model = Sequential()
    model.add(Dense(256, activation="relu", input_dim=8))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(256, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    fit_model = model.fit(X_train, y_train, epochs=1000, batch_size=128, verbose=0)
    valid_res = np.argmax(model.predict(validation), axis=-1)
    test_res = np.argmax(model.predict(test), axis=-1)
    return (valid_res, test_res)

In [63]:
def KNN(X_train, y_train, validation, test):
    print("KNN")
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, y_train)
    valid_res = neigh.predict(validation)
    test_res = neigh.predict(test)
    return (valid_res, test_res)

In [64]:
def LogisticR(X_train, y_train, validation, test):
    print("LogisticR")
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    valid_res = logreg.predict(validation)
    test_res = logreg.predict(test)
    return (valid_res, test_res)
    

In [65]:
# Import dataset
data = pd.read_csv("data/diabetes.csv")
y = data["Outcome"].to_numpy()
X = data.drop(["Outcome"], axis=1).to_numpy()
print(X.shape)

(768, 8)


In [66]:
# Split dataset into training (0.6) and test (0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [67]:
# Split test into validation (0.5) and Prediction (0.5)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [68]:
# Try each model using training, validation and prediction. These will return the validation and prediction sets
valid_knn, test_knn = KNN(X_train, y_train, X_valid, X_test)
valid_lr, test_lr = LogisticR(X_train, y_train, X_valid, X_test)
valid_nn, test_nn = NN(X_train, y_train, X_valid, X_test)
knn_acc = accuracy_score(y_valid, valid_knn)
lr_acc = accuracy_score(y_valid, valid_lr)
nn_acc = accuracy_score(y_valid, valid_nn)
print("KNN: {}%\nLR: {}%\nNN: {}%".format(knn_acc*100, lr_acc*100, nn_acc*100))

KNN
LogisticR
NN


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNN: 77.92207792207793%
LR: 76.62337662337663%
NN: 67.53246753246754%


In [87]:
def calc_error(baseline, validation):
    error = 0
    for (b, p) in zip(baseline, validation):
        if b != p:
            error += 1
    return 1/error
        

In [89]:
knn_error = calc_error(y_valid, valid_knn)
lr_error = calc_error(y_valid, valid_lr)
nn_error = calc_error(y_valid, valid_nn)
total = knn_error + lr_error + nn_error
knn_weight = math.floor((knn_error/total)*100)
lr_weight = math.floor((lr_error/total)*100)
nn_weight = math.floor((nn_error/total)*100)
print("Weights: {}, {}, {}".format(knn_weight, lr_weight, nn_weight))

Weights: 38, 35, 25


In [90]:
def ensemble(pred1, w1, pred2, w2, pred3, w3):
    predict = []
    for (p1, p2, p3) in zip(pred1, pred2, pred3):
        tally = [0, 0]
        tally[p1] += w1
        tally[p2] += w2
        tally[p3] += w3
        if tally[0] > tally[1]:
            predict.append(0)
        else:
            predict.append(1)
    return predict

In [91]:
predictions = ensemble(test_knn, knn_weight, test_nn, nn_weight, test_lr, lr_weight)

In [92]:
print("KNN: {0:.3f}%\nLR: {1:.3f}%\nNN: {2:.3f}%".format(float(accuracy_score(y_test, test_knn)*100), float(accuracy_score(y_test, test_lr)*100), float(accuracy_score(y_test, test_nn)*100)))

average = (accuracy_score(y_test, test_knn)*100 + accuracy_score(y_test, test_lr)*100 + accuracy_score(y_test, test_nn)*100)/3
print("Average of models: {0:.3f}%".format(float(average)))
print("Ensemble: {0:.3f}%".format(float(accuracy_score(y_test, predictions)*100)))



KNN: 61.688%
LR: 76.623%
NN: 66.234%
Average of models: 68.182%
Ensemble: 71.429%
