Prediction of original data
===============================

## Imports

Importing needed packages:

In [141]:
import pandas 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

# turn of warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [142]:
data= pandas.read_csv('../original_data/diabetes.csv', index_col=0);

display(data.head())


Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1


## Data preparation


In [143]:
target=data['Outcome']
data=data.drop(['Outcome'],axis=1)
## One-Hot Encoding
enc2 = np.array(['Glucose','SkinThickness','Insulin','DiabetesPedigreeFunction'])

##naive bayes data -> encoding all values because of the use of Bernouli NB
nb_data=pandas.get_dummies(data,columns=list(data.columns))

## saving the labels
labels= list(data.columns)




## Splitting the data in train and test data

In [144]:
# splitting data for rf
train_pred, test_pred, train_target, test_target = train_test_split(data, target, test_size=0.33,random_state=42)

#splitting data for nb
nb_train_pred, nb_test_pred, nb_train_target, nb_test_target = train_test_split(nb_data, target, test_size=0.33,random_state=42)

display(list(set(nb_test_pred.columns)-set(nb_train_pred.columns)))
## Transform everythin to np.array
train_pred=np.array(train_pred)
train_target=np.array(train_target)

test_pred=np.array(test_pred)
test_target=np.array(test_target)


nb_test_pred=np.array(nb_test_pred)
nb_train_pred=np.array(nb_train_pred)


[]

## Random Forest

Train & predict with model

In [145]:
classifier = RandomForestClassifier(random_state=42)
classifier = classifier.fit(train_pred,train_target)

rf_rediction = classifier.predict(test_pred)

difference = rf_rediction - test_target

Displaying feature importance:

In [146]:
feature_importances = pandas.DataFrame(classifier.feature_importances_,
                                   index = labels,
                                    columns=['importance']).sort_values('importance',ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Glucose,0.289367
BMI,0.183435
Age,0.163101
DiabetesPedigreeFunction,0.125864
BloodPressure,0.09475
SkinThickness,0.074484
Insulin,0.068998


#### Performance meassure

In [147]:
print('Total number of predictions:' + str(len(rf_rediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(difference))) 
print('Accuracy: '+ str(((len(rf_rediction) - np.count_nonzero(difference))/len(rf_rediction))*100)+' %')


Total number of predictions:254
Total number of wrong predictions:67
Accuracy: 73.62204724409449 %


In [148]:
# Creating the confusion matrix
cm = metrics.confusion_matrix(rf_rediction, test_target)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,134,33
Actual Positive,34,53


In [149]:
print(classification_report( test_target,rf_rediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80       168
           1       0.61      0.62      0.61        86

    accuracy                           0.74       254
   macro avg       0.71      0.71      0.71       254
weighted avg       0.74      0.74      0.74       254



## Naive Bayes
Train & predict with model

In [150]:

model = BernoulliNB()
model = model.fit(nb_train_pred,nb_train_target)

naive_prediction= model.predict(nb_test_pred)
naive_difference = naive_prediction - nb_test_target

#### Performance meassure

In [151]:
print('Total number of predictions:' + str(len(naive_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(naive_difference))) 
print('Accuracy: '+ str(((len(naive_prediction) - np.count_nonzero(naive_difference))/len(naive_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(nb_test_target, naive_prediction))


Total number of predictions:254
Total number of wrong predictions:82
Accuracy: 67.71653543307087 %
Accuracy: 0.6771653543307087


In [152]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,naive_prediction)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,154,14
Actual Positive,68,18


In [153]:
print(classification_report( test_target,naive_prediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.69      0.92      0.79       168
           1       0.56      0.21      0.31        86

    accuracy                           0.68       254
   macro avg       0.63      0.56      0.55       254
weighted avg       0.65      0.68      0.63       254



## KNN 
Train & predict with model

In [154]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(train_pred, train_target)

knn_prediction=knn_model.predict(test_pred)
knn_difference = knn_prediction - test_target

#### Performance meassure

In [155]:
print('Total number of predictions:' + str(len(knn_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(knn_difference))) 
print('Accuracy: '+ str(((len(knn_prediction) - np.count_nonzero(knn_difference))/len(knn_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, knn_prediction))


Total number of predictions:254
Total number of wrong predictions:79
Accuracy: 68.89763779527559 %
Accuracy: 0.6889763779527559


In [156]:
# Creating the confusion matrix
cm = metrics.confusion_matrix(test_target,knn_prediction)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,126,42
Actual Positive,37,49


In [157]:
#metricy
print(classification_report( test_target,knn_prediction,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       168
           1       0.54      0.57      0.55        86

    accuracy                           0.69       254
   macro avg       0.66      0.66      0.66       254
weighted avg       0.69      0.69      0.69       254



## Voting Classifier
Calculating voting:

In [158]:
voting_pred=[]
i=0
while i < len(prediction):
    tmp=rf_rediction[i]+naive_prediction[i]+knn_prediction[i]
    if tmp>1:
        voting_pred.append(1)
    else:
        voting_pred.append(0)
    i += 1
voting_diff=voting_pred-test_target

#### Performance meassure

In [159]:
print('Total number of predictions:' + str(len(voting_pred)))
print('Total number of wrong predictions:' + str(np.count_nonzero(voting_diff))) 
print('Accuracy: '+ str(((len(knn_prediction) - np.count_nonzero(voting_diff))/len(voting_pred))*100)+' %')
print("Accuracy:",metrics.accuracy_score(test_target, voting_pred))


Total number of predictions:254
Total number of wrong predictions:71
Accuracy: 72.04724409448819 %
Accuracy: 0.7204724409448819


In [160]:
# Creating the confusion matrix
cm = metrics.confusion_matrix( test_target,voting_pred,)
# Assigning columns names
cm_df = pandas.DataFrame(cm, 
            columns = ['Predicted Negative', 'Predicted Positive'],
            index = ['Actual Negative', 'Actual Positive'])
# Showing the confusion matrix
cm_df


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,137,31
Actual Positive,40,46


In [161]:
#metrics
print(classification_report( test_target,voting_pred,target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       168
           1       0.60      0.53      0.56        86

    accuracy                           0.72       254
   macro avg       0.69      0.68      0.68       254
weighted avg       0.71      0.72      0.72       254

