In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, precision_score, recall_score

import neural_network_methods as nn

In [2]:
def split_train_test(data, num_train):
    """Split ``data`` into a leading training part and a trailing test part.

    No shuffling is done: the split is purely positional.

    Parameters
    ----------
    data : sliceable sequence
        Anything that supports slice indexing (list, str, NumPy array, ...).
    num_train : int
        Number of leading items that go into the training part.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` is ``data[:num_train]`` and
        ``test`` is everything from index ``num_train`` onwards.
    """
    return data[:num_train], data[num_train:]

In [3]:
def construct_X(data):
    """Build the feature matrix from comma-separated text rows.

    The number of features is taken to be the number of commas in the first
    row, so the last comma-separated field of every row is left out.

    Parameters
    ----------
    data : sequence of str
        One comma-separated record per element. Must be non-empty, because
        the first row is inspected to count the features.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_features, len(data))`` -- note that it is
        transposed, i.e. one *column* per input row.
    """
    # Each comma separates two fields, so the comma count is "fields - 1":
    # every field except the last one.
    num_features = data[0].count(",")

    X = np.zeros((len(data), num_features))
    for row_idx, line in enumerate(data):
        fields = line.split(",")
        for col_idx in range(num_features):
            # NumPy converts the text field to float on assignment.
            X[row_idx, col_idx] = fields[col_idx]

    return X.T
    

def construct_Y(data):
    """Extract the label (last field) of every row as a float vector.

    For text rows the label is the whole last comma-separated field, so
    multi-character labels such as ``"10"`` or ``"1.0"`` are parsed
    correctly instead of being reduced to their final character. Rows that
    are not strings keep the previous behaviour of using their last element.

    Parameters
    ----------
    data : sequence
        One record per element; normally comma-separated strings.

    Returns
    -------
    numpy.ndarray
        1-D float array with ``len(data)`` entries.

    Raises
    ------
    ValueError
        If a label cannot be converted to ``float``.
    """
    labels = []
    for row in data:
        # rsplit with maxsplit=1 isolates the last field without splitting
        # the rest of the row; a row without commas is its own last field.
        last_field = row.rsplit(",", 1)[-1] if isinstance(row, str) else row[-1]
        labels.append(float(last_field))

    return np.array(labels, dtype=float)

def accuracy(pred, Y):
    """Return the fraction of positions where ``pred`` matches ``Y``.

    Parameters
    ----------
    pred : indexable of numbers
        Predicted labels; must have at least ``len(Y)`` entries.
    Y : indexable of numbers
        Reference labels. Its length is the denominator.

    Returns
    -------
    float
        Number of indices ``i`` with ``pred[i] - Y[i] == 0`` divided by
        ``len(Y)``.

    Raises
    ------
    ZeroDivisionError
        If ``Y`` is empty.
    """
    total = len(Y)
    hits = sum(1 for idx in range(total) if pred[idx] - Y[idx] == 0)
    return hits / total

In [4]:
# Load the file as text. No delimiter is passed, so NumPy does not split on
# commas here; construct_X / construct_Y parse the comma-separated rows.
data_total = np.loadtxt('diabetes_dataset_total.csv', dtype=str)

In [5]:
# Positional split: the first 700 rows are used for training, the rest for testing.
train, test = split_train_test(data_total, num_train=700)

In [6]:
# The three classifiers that are compared below. No random_state is passed
# to the tree-based models.
dt = DecisionTreeClassifier(criterion='log_loss', max_depth=100)
rf = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors=5)

In [7]:
# construct_X returns a (features, samples) array; transposing it again gives
# one row per sample.
X_train, Y_train = construct_X(train).T, construct_Y(train)
X_test, Y_test = construct_X(test).T, construct_Y(test)

# Results

In [15]:
iterations = 100

# One slot per repetition for every (metric, model) pair.
accuracy_array_dt  = np.zeros(iterations)
accuracy_array_rf  = np.zeros(iterations)
accuracy_array_knn = np.zeros(iterations)

f1_array_dt  = np.zeros(iterations)
f1_array_rf  = np.zeros(iterations)
f1_array_knn = np.zeros(iterations)

precision_array_dt  = np.zeros(iterations)
precision_array_rf  = np.zeros(iterations)
precision_array_knn = np.zeros(iterations)

recall_array_dt  = np.zeros(iterations)
recall_array_rf  = np.zeros(iterations)
recall_array_knn = np.zeros(iterations)

# The train/test matrices are identical in every repetition, so they are
# built once here instead of being re-parsed inside the loop.
X_train = construct_X(train).T
Y_train = construct_Y(train)

X_test = construct_X(test).T
Y_test = construct_Y(test)

for i in range(iterations):
    # Refit every model on the same training data and predict the test set.
    dt = dt.fit(X_train, Y_train)
    Y_predict_dt = dt.predict(X_test)

    rf = rf.fit(X_train, Y_train)
    Y_predict_rf = rf.predict(X_test)

    knn = knn.fit(X_train, Y_train)
    Y_predict_knn = knn.predict(X_test)

    accuracy_array_dt[i]  = accuracy(Y_predict_dt, Y_test)
    accuracy_array_rf[i]  = accuracy(Y_predict_rf, Y_test)
    accuracy_array_knn[i] = accuracy(Y_predict_knn, Y_test)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall; F1 is symmetric and is unaffected.
    f1_array_dt[i]  = f1_score(Y_test, Y_predict_dt)
    f1_array_rf[i]  = f1_score(Y_test, Y_predict_rf)
    f1_array_knn[i] = f1_score(Y_test, Y_predict_knn)

    precision_array_dt[i]  = precision_score(Y_test, Y_predict_dt)
    precision_array_rf[i]  = precision_score(Y_test, Y_predict_rf)
    precision_array_knn[i] = precision_score(Y_test, Y_predict_knn)

    recall_array_dt[i]  = recall_score(Y_test, Y_predict_dt)
    recall_array_rf[i]  = recall_score(Y_test, Y_predict_rf)
    recall_array_knn[i] = recall_score(Y_test, Y_predict_knn)

    print(i, f1_array_dt[i], f1_array_rf[i], f1_array_knn[i])

0 0.6545454545454545 0.6382978723404256 0.6538461538461539
1 0.6428571428571429 0.6666666666666667 0.6538461538461539
2 0.6071428571428571 0.6666666666666666 0.6538461538461539
3 0.5818181818181818 0.6808510638297872 0.6538461538461539
4 0.5818181818181818 0.6938775510204083 0.6538461538461539
5 0.6071428571428571 0.68 0.6538461538461539
6 0.6296296296296297 0.7083333333333334 0.6538461538461539
7 0.5714285714285714 0.6530612244897959 0.6538461538461539
8 0.6296296296296297 0.6382978723404256 0.6538461538461539
9 0.5660377358490566 0.6521739130434783 0.6538461538461539
10 0.6296296296296297 0.6808510638297872 0.6538461538461539
11 0.5862068965517241 0.6538461538461539 0.6538461538461539
12 0.5818181818181818 0.7058823529411765 0.6538461538461539
13 0.6206896551724138 0.723404255319149 0.6538461538461539
14 0.6153846153846153 0.68 0.6538461538461539
15 0.6415094339622641 0.6956521739130435 0.6538461538461539
16 0.5614035087719299 0.7058823529411765 0.6538461538461539
17 0.58181818181818

In [19]:
# Best F1 seen across all repetitions, per model.
# NOTE(review): a maximum over repeated fits scored on the same test split is
# an optimistic figure; consider also reporting the mean.
print('Max F1 score for decision trees:', f1_array_dt.max())
print('Max F1 score for random forest:', f1_array_rf.max())
print('Max F1 score for K-nearest neighbors:', f1_array_knn.max())

Max F1 score for decision trees: 0.6545454545454545
Max F1 score for random forest: 0.7659574468085106
Max F1 score for K-nearest neighbors: 0.6538461538461539
