# 1- Import libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

--------
# 2- Import dataset.

In [2]:
data = pd.read_csv('D://Diabetes Prediction/Data/diabetes.csv')

---------
# 3- Display first 5 rows of dataset.

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


--------
# 4- Display last 5 rows of dataset.

In [4]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


--------
# 5- Find shape of our dataset.

In [5]:
print('Number of rows',data.shape[0])
print('Number of columns',data.shape[1])

Number of rows 768
Number of columns 9


-----------
# 6- Get more informations about our dataset.

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


---------
# 7- Get overall statistics of our dataset.

In [7]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


###### - We can see values that equal zeros we will deal with it.

---------
# 8- Check null values in the dataset.

In [8]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

###### - No null values in our dataset.

----------
# 9- Handling values that equal zero.

#### 1- Get copy of dataset.

In [9]:
copy_data = data.copy(deep=True)

#### 2- Replace zero values with null.

In [12]:
copy_data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI']]=copy_data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI']].replace(0,np.nan)

#### 3- Check null values in copy data.

In [13]:
copy_data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

###### - Now we see null values and we will handle it .

#### 4- Replace zero values with the mean in the real dataset.

In [14]:
data['Glucose'] = data['Glucose'].replace(0,data['Glucose'].mean())
data['BloodPressure'] = data['BloodPressure'].replace(0,data['BloodPressure'].mean())
data['SkinThickness'] = data['SkinThickness'].replace(0,data['SkinThickness'].mean())
data['Insulin'] = data['Insulin'].replace(0,data['Insulin'].mean())
data['BMI'] = data['BMI'].replace(0,data['BMI'].mean())

-----------
# 5- Store feature matrix in X and response(target) in victor y.

In [15]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

----------
# 6- Splitting data into training set and testing set.

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

---------
# 7- Import models.

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

----------
# 8- Build the pipeline.

In [31]:
pipeline_lr = Pipeline([('scaler1',StandardScaler()),('lr_classifier',LogisticRegression())])
pipeline_kn = Pipeline([('scaler2',StandardScaler()),('kn_classifier',KNeighborsClassifier())])
pipeline_svc = Pipeline([('scaler3',StandardScaler()),('svc_classifier',SVC())])
pipeline_dt = Pipeline([('scaler4',StandardScaler()),('dt_classifier',DecisionTreeClassifier())])
pipeline_rf = Pipeline([('scaler5',StandardScaler()),('rf_classifier',RandomForestClassifier(max_depth=3))])
pipeline_gb = Pipeline([('scaler5',StandardScaler()),('gb_classifier',GradientBoostingClassifier())])

-----------
# 9- Creat list of pipelines.

In [32]:
pipelines = [pipeline_lr,pipeline_kn,pipeline_svc,pipeline_dt,pipeline_rf,pipeline_gb]

------------
# 10- Fitting on training data.

In [33]:
for pipe in pipelines:
    pipe.fit(X_train,y_train)

-----
# 11- Prediction and accuracy.

In [34]:
pipe_dict = {0:'LR',1:'KN',2:'SVC',3:'DT',4:'RF',5:'GB'}

In [35]:
for i,model in enumerate(pipelines):
    print('{} Test Accuracy {} '.format(pipe_dict[i],model.score(X_test,y_test)))

LR Test Accuracy 0.7662337662337663 
KN Test Accuracy 0.7662337662337663 
SVC Test Accuracy 0.7337662337662337 
DT Test Accuracy 0.7272727272727273 
RF Test Accuracy 0.7922077922077922 
GB Test Accuracy 0.7597402597402597 


###### - Random Forest Classifier is the best model.

--------------
# 12- Build the final model.

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [40]:
RF = RandomForestClassifier(max_depth=3)

In [43]:
RF.fit(X,y)

In [49]:
test_df  = pd.DataFrame({'Pregnancies':6,'Glucose':148.0,'BloodPressure':72.0,'SkinThickness':35.0,
                         'Insulin':79.799479,'BMI':33.6,'DiabetesPedigreeFunction':0.627,'Age':50},index=[0])

In [50]:
RF.predict(test_df)

array([1], dtype=int64)

-----------
# 13- Saving the model.

In [51]:
import joblib

In [52]:
joblib.dump(RF,'Diabetes_model')

['Diabetes_model']

In [53]:
model = joblib.load('Diabetes_model')

In [54]:
model.predict(test_df)

array([1], dtype=int64)

----------
# 14- GUI.

In [55]:
from tkinter import *

In [68]:
def show_entry():
    p1 = int(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = int(e8.get())
    model = joblib.load('Diabetes_model')
    df  = pd.DataFrame({'Pregnancies':p1,'Glucose':p2,'BloodPressure':p3,'SkinThickness':p4,
                         'Insulin':p5,'BMI':p6,'DiabetesPedigreeFunction':p7,'Age':p8},index=[0])
    result = model.predict(df)
    if result == 1:
        Label(master,text='Diabetic').grid(row=10)
    else:
        Label(master,text='Not Diabetic').grid(row=10)

master = Tk()
master.title('Diabetes Prediction')
label = Label(master,text='Diavetes Prediction',bg='red',fg='white').grid(row=0,columnspan=2)
Label(master,text='Pregnancies').grid(row=1)
Label(master,text='Glucose').grid(row=2)
Label(master,text='BloodPressure').grid(row=3)
Label(master,text='SkinThickness').grid(row=4)
Label(master,text='Insulin').grid(row=5)
Label(master,text='BMI').grid(row=6)
Label(master,text='DiabetesPedigreeFunction').grid(row=7)
Label(master,text='Age').grid(row=8)

e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)

e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)

Button(master,text='Predict',command=show_entry).grid()
mainloop()