Prediction of original data
===============================

## Imports

Importing needed packages:

In [1]:
import pandas 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

# turn of warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
data= pandas.read_csv('../original_data/diabetes.csv', index_col=0);

display(data.head())


Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1


## Data preparation


In [3]:
target=data['Outcome']
data=data.drop(['Outcome'],axis=1)
## One-Hot Encoding
#enc=np.array(['Pregnancies','BloodPressure','BMI','Age'])
enc2 = np.array(['Glucose','SkinThickness','Insulin','DiabetesPedigreeFunction'])

##naive bayes data -> encoding all values because of the use of Bernouli NB
nb_data=pandas.get_dummies(data,columns=list(data.columns))

#data for rf
#data= pandas.get_dummies(data,columns=enc)

## saving the labels
labels= list(data.columns)




## Splitting the data in train and test data

In [4]:
# splitting data for rf
train_pred, test_pred, train_target, test_target = train_test_split(data, target, test_size=0.33,random_state=42)
display(train_pred.head())
display(test_pred.head())

#splitting data for nb
nb_train_pred, nb_test_pred, nb_train_target, nb_test_target = train_test_split(nb_data, target, test_size=0.33,random_state=42)

display(list(set(nb_test_pred.columns)-set(nb_train_pred.columns)))
## Transform everythin to np.array
train_pred=np.array(train_pred)
train_target=np.array(train_target)

test_pred=np.array(test_pred)
test_target=np.array(test_target)


nb_test_pred=np.array(nb_test_pred)
nb_train_pred=np.array(nb_train_pred)


Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,115,98,0,0,24.0,1.022,34
7,142,60,33,190,28.8,0.687,61
4,116,72,12,87,22.1,0.463,37
1,126,60,0,0,30.1,0.349,47
3,78,70,0,0,32.5,0.27,39


Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,98,58,33,190,34.0,0.43,43
2,112,75,32,0,35.7,0.148,21
2,108,64,0,0,30.8,0.158,21
8,107,80,0,0,24.6,0.856,34
7,136,90,0,0,29.9,0.21,50


[]

## Random Forest

In [5]:
classifier = RandomForestClassifier(random_state=42)
classifier = classifier.fit(train_pred,train_target)

prediction = classifier.predict(test_pred)

difference = prediction - test_target

In [6]:
print('Total number of predictions:' + str(len(prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(difference))) 
print('Accuracy: '+ str(((len(prediction) - np.count_nonzero(difference))/len(prediction))*100)+' %')


Total number of predictions:254
Total number of wrong predictions:67
Accuracy: 73.62204724409449 %


In [7]:
feature_importances = pandas.DataFrame(classifier.feature_importances_,
                                   index = labels,
                                    columns=['importance']).sort_values('importance',ascending=False)
display(feature_importances.head(20))

Unnamed: 0,importance
Glucose,0.289367
BMI,0.183435
Age,0.163101
DiabetesPedigreeFunction,0.125864
BloodPressure,0.09475
SkinThickness,0.074484
Insulin,0.068998


## Naive Bayes

In [8]:

model = BernoulliNB()
model = model.fit(nb_train_pred,nb_train_target)

naive_prediction= model.predict(nb_test_pred)
naive_difference = naive_prediction - nb_test_target

In [9]:
print('Total number of predictions:' + str(len(naive_prediction)))
print('Total number of wrong predictions:' + str(np.count_nonzero(naive_difference))) 
print('Accuracy: '+ str(((len(naive_prediction) - np.count_nonzero(naive_difference))/len(naive_prediction))*100)+' %')
print("Accuracy:",metrics.accuracy_score(nb_test_target, naive_prediction))


Total number of predictions:254
Total number of wrong predictions:82
Accuracy: 67.71653543307087 %
Accuracy: 0.6771653543307087
