Prediction of anonymized data
===============================

## Imports

Importing needed packages:

In [44]:
import pandas 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

# turn off all warnings
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [45]:
# Read the anonymized diabetes CSV; its first column is used as the row index.
data = pandas.read_csv('diabetis_anonymized01.csv', index_col=0)

# Preview the first rows of the loaded frame.
display(data.head())


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,"[0, 5[",84,"[0, 20[",0,0,"[0, 5[",0.304,"[21, 40[",0
1,"[0, 5[",74,"[0, 20[",0,0,"[0, 5[",0.102,"[21, 40[",0
2,"[0, 5[",94,"[0, 20[",0,0,"[0, 5[",0.256,"[21, 40[",0
3,"[0, 5[",80,"[0, 20[",0,0,"[0, 5[",0.174,"[21, 40[",0
4,"[0, 5[",116,"[0, 20[",0,0,"[20, 25[",0.187,"[21, 40[",0


## Data preparation


In [46]:
# Column groups. `enc` lists the columns that are one-hot encoded for the
# random forest; `enc2` lists the remaining predictor columns (it is not
# referenced again inside this cell).
enc = np.array(['Pregnancies', 'BloodPressure', 'BMI', 'Age'])
enc2 = np.array(['Glucose', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction'])

# Separate the label column from the predictors.
target = data['Outcome']
data = data.drop(columns=['Outcome'])

# Naive Bayes input: every predictor column is one-hot encoded, because the
# model used later is Bernoulli NB.
nb_data = pandas.get_dummies(data, columns=data.columns.tolist())

# Random forest input: only the columns in `enc` are one-hot encoded.
# NOTE: `data` is rebound here, so this cell cannot simply be re-run without
# reloading the CSV first.
data = pandas.get_dummies(data, columns=enc)

# Column names of the random-forest frame, saved for labelling results later.
labels = data.columns.tolist()




## Splitting the data into train and test data

In [47]:
# Partition the random-forest features and the naive-Bayes features with the
# same test fraction and the same seed.
train_pred, test_pred, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=42,
)
nb_train_pred, nb_test_pred, nb_train_target, nb_test_target = train_test_split(
    nb_data,
    target,
    test_size=0.33,
    random_state=42,
)

# Preview the random-forest train and test partitions.
display(train_pred.head(), test_pred.head())

# List any column that appears in the NB test frame but not in the NB train frame.
display(list(set(nb_test_pred.columns).difference(nb_train_pred.columns)))

# Convert the partitions to NumPy arrays.
# nb_train_target / nb_test_target are not converted and remain pandas Series.
train_pred, train_target = np.array(train_pred), np.array(train_target)
test_pred, test_target = np.array(test_pred), np.array(test_target)
nb_test_pred, nb_train_pred = np.array(nb_test_pred), np.array(nb_train_pred)


Unnamed: 0,Glucose,SkinThickness,Insulin,DiabetesPedigreeFunction,"Pregnancies_[0, 5[","Pregnancies_[10, 15[","Pregnancies_[5, 10[","BloodPressure_[0, 20[","BloodPressure_[40, 60[","BloodPressure_[60, 80[",...,"BMI_[15, 20[","BMI_[20, 25[","BMI_[25, 30[","BMI_[30, 35[","BMI_[35, 40[","BMI_[40, 45[","BMI_[45, 50[","Age_[21, 40[","Age_[40, 60[","Age_[60, 80["
203,171,33,135,0.199,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
196,115,30,96,0.529,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
286,107,30,74,0.757,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
93,104,0,0,0.582,1,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
586,61,28,0,0.243,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


Unnamed: 0,Glucose,SkinThickness,Insulin,DiabetesPedigreeFunction,"Pregnancies_[0, 5[","Pregnancies_[10, 15[","Pregnancies_[5, 10[","BloodPressure_[0, 20[","BloodPressure_[40, 60[","BloodPressure_[60, 80[",...,"BMI_[15, 20[","BMI_[20, 25[","BMI_[25, 30[","BMI_[30, 35[","BMI_[35, 40[","BMI_[40, 45[","BMI_[45, 50[","Age_[21, 40[","Age_[40, 60[","Age_[60, 80["
627,107,0,0,0.727,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
271,158,13,387,0.295,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
290,112,32,0,0.148,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
63,108,26,63,0.318,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
302,84,23,115,0.471,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


[]

## Random Forest

In [48]:
# Build the random forest with a fixed seed and keep the estimator that
# fit() returns.
classifier = RandomForestClassifier(random_state=42).fit(train_pred, train_target)

# Predict the held-out rows; a non-zero entry in `difference` marks a
# prediction that does not equal the true label.
prediction = classifier.predict(test_pred)
difference = prediction - test_target

In [49]:
# Report the random-forest results: total predictions, misclassified count,
# and the share of correct predictions as a percentage.
rf_total = len(prediction)
rf_wrong = np.count_nonzero(difference)

print('Total number of predictions:' + str(rf_total))
print('Total number of wrong predictions:' + str(rf_wrong))
print('Accuracy: ' + str(((rf_total - rf_wrong) / rf_total) * 100) + ' %')


Total number of predictions:217
Total number of wrong predictions:58
Accuracy: 73.27188940092167 %


In [50]:
# Pair every importance score of the fitted forest with its column label and
# order the table from most to least important.
feature_importances = pandas.DataFrame(
    {'importance': classifier.feature_importances_},
    index=labels,
).sort_values(by='importance', ascending=False)

# Show the first 20 rows of the sorted table.
display(feature_importances.head(20))

Unnamed: 0,importance
Glucose,0.297246
DiabetesPedigreeFunction,0.18613
Insulin,0.109843
SkinThickness,0.106933
"BMI_[30, 35[",0.041823
"Age_[40, 60[",0.031182
"Pregnancies_[0, 5[",0.025379
"BloodPressure_[80, 100[",0.024577
"BMI_[25, 30[",0.023469
"Pregnancies_[5, 10[",0.02325


## Naive Bayes

In [51]:

# Fit a Bernoulli naive Bayes model on the fully one-hot encoded training data
# and keep the estimator that fit() returns.
model = BernoulliNB().fit(nb_train_pred, nb_train_target)

# Predict the held-out rows; a non-zero entry in `naive_difference` marks a
# prediction that does not equal the true label.
naive_prediction = model.predict(nb_test_pred)
naive_difference = naive_prediction - nb_test_target

In [52]:
# Report the naive-Bayes results: total predictions, misclassified count, and
# accuracy computed by hand (as a percentage).
nb_total = len(naive_prediction)
nb_wrong = np.count_nonzero(naive_difference)

print('Total number of predictions:' + str(nb_total))
print('Total number of wrong predictions:' + str(nb_wrong))
print('Accuracy: ' + str(((nb_total - nb_wrong) / nb_total) * 100) + ' %')

# Same accuracy as a fraction, computed by scikit-learn.
print("Accuracy:", metrics.accuracy_score(nb_test_target, naive_prediction))


Total number of predictions:217
Total number of wrong predictions:68
Accuracy: 68.66359447004609 %
Accuracy: 0.6866359447004609
