In [1]:
import os
import subprocess
import sys

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

In [2]:
input_file = "CKC_dataset2.csv"
data = pd.read_csv(input_file, header=0)

sys.path.insert(0, os.path.abspath('../'))
import knnmodule

In [3]:
#give K a proper value
k=5

data.head()
data = data.replace('?', np.nan)
data = data[data.columns.drop(['id'])]
data

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,normal,notpresent,notpresent,121,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55,80,1.02,0,0,normal,normal,notpresent,notpresent,140,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42,70,1.025,0,0,normal,normal,notpresent,notpresent,75,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12,80,1.02,0,0,normal,normal,notpresent,notpresent,100,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17,60,1.025,0,0,normal,normal,notpresent,notpresent,114,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [4]:
def isNaN(string):
    return string != string
def encode(column):
    values = []
    encodedColumn = []
    for element in column:
        if not isNaN(element):
            if element not in values:
               values.append(element)
            encodedColumn.append(values.index(element))
        else:
            encodedColumn.append(element)
    return encodedColumn

In [5]:
columnsToBeEncoded = ['rbc','pc','pcc','ba', 'htn', 'cad','dm','appet','pe','ane','class']
dataEncoded = data
for column in columnsToBeEncoded:
    dataEncoded[column] = encode(dataEncoded[column])
dataEncoded

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,,0.0,0.0,0.0,121,...,44,7800,5.2,0.0,0.0,0.0,0.0,0.0,0.0,0
1,7,50,1.02,4,0,,0.0,0.0,0.0,,...,38,6000,,1.0,1.0,0.0,0.0,0.0,0.0,0
2,62,80,1.01,2,3,0.0,0.0,0.0,0.0,423,...,31,7500,,1.0,0.0,0.0,1.0,0.0,1.0,0
3,48,70,1.005,4,0,0.0,1.0,1.0,0.0,117,...,32,6700,3.9,0.0,1.0,0.0,1.0,1.0,1.0,0
4,51,80,1.01,2,0,0.0,0.0,0.0,0.0,106,...,35,7300,4.6,1.0,1.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55,80,1.02,0,0,0.0,0.0,0.0,0.0,140,...,47,6700,4.9,1.0,1.0,0.0,0.0,0.0,0.0,1
396,42,70,1.025,0,0,0.0,0.0,0.0,0.0,75,...,54,7800,6.2,1.0,1.0,0.0,0.0,0.0,0.0,1
397,12,80,1.02,0,0,0.0,0.0,0.0,0.0,100,...,49,6600,5.4,1.0,1.0,0.0,0.0,0.0,0.0,1
398,17,60,1.025,0,0,0.0,0.0,0.0,0.0,114,...,51,7200,5.9,1.0,1.0,0.0,0.0,0.0,0.0,1


In [6]:
imputer = KNNImputer(n_neighbors=k)
dataComplete = imputer.fit_transform(dataEncoded)
X = dataComplete[:,:24]
y = dataComplete[:,24]

In [7]:
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X, y)
vals = [clf.feature_importances_]

featureImportance = pd.DataFrame( vals, columns = data.columns.drop(['class']))
featureImportance.sort_values(by=0, ascending=False, axis=1)


Unnamed: 0,sg,rbc,dm,htn,hemo,al,pcv,rbcc,pe,pc,...,bgr,bu,ane,bp,pcc,wbcc,pot,age,ba,cad
0,0.146071,0.14562,0.107564,0.102332,0.098297,0.089399,0.063321,0.039187,0.030507,0.030228,...,0.012618,0.011825,0.010812,0.009889,0.00682,0.005887,0.005503,0.005162,0.004338,0.001815


In [8]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

(400, 7)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test_rf, y_train, y_test_rf = train_test_split(X, y, test_size=0.33)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train,y_train)

y_predicted_rf = rfc.predict(X_test_rf)
accuracy_score(y_test_rf, y_predicted_rf)

0.9772727272727273

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33)

lrc = LogisticRegression(max_iter=600)
lrc.fit(X_train, y_train)

y_predicted_log = lrc.predict(X_test)
accuracy_score(y_test, y_predicted_log)


0.9696969696969697

In [11]:
from sklearn import svm
clf = svm.SVC(kernel = 'rbf')
clf.fit(X_train, y_train)

y_predicted_svm = clf.predict(X_test)
accuracy_score(y_test, y_predicted_svm)

0.9090909090909091

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from math import sqrt

acc_score = []
predicted_bulk = []
sqrt_k = int(sqrt(np.size(data,0)))
for i in range(1,sqrt_k,2):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    predicted_bulk.append(y_predicted)
    acc = accuracy_score(y_test, y_predicted)
    acc_score.append(acc) 
    print('Accuracy value for k = ' , i , 'is:', acc)

Accuracy value for k =  1 is: 0.9621212121212122
Accuracy value for k =  3 is: 0.9545454545454546
Accuracy value for k =  5 is: 0.9545454545454546
Accuracy value for k =  7 is: 0.9621212121212122
Accuracy value for k =  9 is: 0.9545454545454546
Accuracy value for k =  11 is: 0.9545454545454546
Accuracy value for k =  13 is: 0.946969696969697
Accuracy value for k =  15 is: 0.946969696969697
Accuracy value for k =  17 is: 0.9393939393939394
Accuracy value for k =  19 is: 0.946969696969697


In [13]:
array = np.asarray(acc_score)
min_val = np.ndarray.max(array)
val = np.where(array==min_val)[0][0]
chosenK = val*2+1
y_predicted_knn = predicted_bulk[val]
chosenK

1

In [14]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1)
clf.fit(X_train, y_train)

y_predicted_nb = clf.predict(X_test)
accuracy_score(y_test, y_predicted_nb)

0.9166666666666666

In [15]:
from sklearn.neural_network import MLPClassifier
acc_score = []
predicted_bulk = []

sqrt_k = int(sqrt(np.size(data,0)))
for i in range(1,31):
    model = MLPClassifier(hidden_layer_sizes = i, max_iter=2500)
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    acc = accuracy_score(y_test, y_predicted)
    acc_score.append(acc) 
    print('Accuracy value for k = ' , i , 'is:', acc)

Accuracy value for k =  1 is: 0.9696969696969697
Accuracy value for k =  2 is: 0.9545454545454546
Accuracy value for k =  3 is: 0.9545454545454546
Accuracy value for k =  4 is: 0.9696969696969697
Accuracy value for k =  5 is: 0.9545454545454546
Accuracy value for k =  6 is: 0.9545454545454546
Accuracy value for k =  7 is: 0.9545454545454546
Accuracy value for k =  8 is: 0.9545454545454546
Accuracy value for k =  9 is: 0.9696969696969697
Accuracy value for k =  10 is: 0.9696969696969697
Accuracy value for k =  11 is: 0.9696969696969697
Accuracy value for k =  12 is: 0.9696969696969697
Accuracy value for k =  13 is: 0.9621212121212122
Accuracy value for k =  14 is: 0.9696969696969697
Accuracy value for k =  15 is: 0.9545454545454546
Accuracy value for k =  16 is: 0.9545454545454546
Accuracy value for k =  17 is: 0.9545454545454546
Accuracy value for k =  18 is: 0.9545454545454546
Accuracy value for k =  19 is: 0.9696969696969697
Accuracy value for k =  20 is: 0.9545454545454546
Accuracy 

In [16]:
fnn_array = np.asarray(acc_score)
max_val = np.ndarray.max(fnn_array)
chosenK = np.where(fnn_array==max_val)[0][0]+1
chosenK

1

In [17]:
#Final model 

In [18]:
def get_notckc_probability(array):
    probab = []
    for x in array:
        probab.append(x[1])
    return probab
def get_2D_array(l1,l2):
    array = []
    for i in range(len(l1)):
        array.append([l1[i],l2[i]])
    return array
def activationFunction(prob1,prob2,w):
    return w[0]+prob1*w[1]+prob2*w[2]
def prediction(trainingSet, w):
    y = activationFunction(trainingSet[0],trainingSet[1], w)
    if y > 0:
        activation = 1
    else:
        activation = 0            
    return activation
def train(trainingSet, labels, weights, learning_rate):
    for inputs, label in zip(trainingSet, labels):
            predictionValue = prediction(inputs, weights)
            weights[1] += learning_rate * (label - predictionValue) * inputs[0]
            weights[2] += learning_rate * (label - predictionValue) * inputs[1]
            weights[0] += learning_rate * (label - predictionValue) #bias
    return weights
def accuracy(trainingSet, labels, weights):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for inputs, label in zip(trainingSet, labels):
        predictedValue = prediction(inputs, weights)
        if predictedValue == 0 and label == 0:
            TP+=1
        elif predictedValue == 1 and label == 0:
            FN+=1
        elif predictedValue == 0 and label == 1:
            FP+=1
        elif predictedValue == 1and label == 1:
            TN+=1
    return (TP+TN)/(TP+FN+FP+TN)
    

In [19]:
notckc_prob_rf = rfc.predict_proba(X_test_rf)
notckc_prob_log = lrc.predict_proba(X_test)

l1 = get_notckc_probability(notckc_prob_rf)
l2 =get_notckc_probability(notckc_prob_log)

#new training data
probs = get_2D_array(l1,l2)
weights = np.zeros(3)
weights = train(probs, y_test,weights,0.1)
accuracy(probs, y_test, weights)

0.9848484848484849

In [20]:
def metric (y_predicted, y_true):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for predictedValue, label in zip(y_predicted, y_true):
        if predictedValue == 0 and label == 0:
            TP+=1
        elif predictedValue == 1 and label == 0:
            FN+=1
        elif predictedValue == 0 and label == 1:
            FP+=1
        elif predictedValue == 1and label == 1:
            TN+=1
    acc = (TP+TN)/(TP+FN+FP+TN)
    sensitivity = TP/(TP+FN)
    specificity = TN/(FP+TN)
    precision = TP/(TP+FP)
    F1 = 2 * (precision*sensitivity)/(precision+sensitivity)
    return acc, sensitivity, specificity, F1     

In [22]:
Acc, SENS, SPEC, F1 = metric(y_predicted_rf, y_test_rf)
print("RF: Acc", Acc,"Sensitivity", SENS,"Specificity", SPEC," F1", F1)
Acc, SENS, SPEC, F1 = metric(y_predicted_log, y_test)
print("LOG: Acc", Acc,"Sensitivity", SENS,"Specificity", SPEC," F1", F1)
Acc, SENS, SPEC, F1 = metric(y_predicted_knn, y_test)
print("KNN: Acc", Acc,"Sensitivity", SENS,"Specificity", SPEC," F1", F1)
Acc, SENS, SPEC, F1 = metric(y_predicted_nb, y_test)
print("NB: Acc", Acc,"Sensitivity", SENS,"Specificity", SPEC," F1", F1)
Acc, SENS, SPEC, F1 = metric(y_predicted_svm, y_test)
print("SVM: Acc", Acc,"Sensitivity", SENS,"Specificity", SPEC," F1", F1)

RF: Acc 0.9772727272727273 Sensitivity 1.0 Specificity 0.9375  F1 0.9824561403508771
LOG: Acc 0.9848484848484849 Sensitivity 0.9753086419753086 Specificity 1.0  F1 0.9875
KNN: Acc 0.9621212121212122 Sensitivity 0.9506172839506173 Specificity 0.9803921568627451  F1 0.9685534591194969
NB: Acc 0.9166666666666666 Sensitivity 0.8641975308641975 Specificity 1.0  F1 0.9271523178807948
SVM: Acc 0.9090909090909091 Sensitivity 0.9012345679012346 Specificity 0.9215686274509803  F1 0.9240506329113924
