# MACHINE LEARNING MODELS FOR DIABETES PREDICTION

In [1]:
import numpy as np
import pandas as pd

In [2]:
# machine learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# import xgboost as xgb

Read __`diabetesdata.csv`__ file into a pandas dataframe. 
About the data:

1. __TimesPregnant__: Number of times pregnant 
2. __glucoseLevel__: Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. __BP__: Diastolic blood pressure (mm Hg)  
5. __insulin__: 2-Hour serum insulin (mu U/ml) 
6. __BMI__: Body mass index (weight in kg/(height in m)^2) 
7. __pedigree__: Diabetes pedigree function 
8. __Age__: Age (years) 
9. __IsDiabetic__: 0 if not diabetic or 1 if diabetic) 








In [3]:
data = pd.read_csv("diabetesdata.csv")
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
0,6,148.0,72,0,33.6,0.627,50.0,1
1,1,,66,0,26.6,0.351,31.0,0
2,8,183.0,64,0,23.3,0.672,,1
3,1,,66,94,28.1,0.167,21.0,0
4,0,137.0,40,168,43.1,2.288,33.0,1


In [4]:
data.tail()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
763,10,101.0,76,180,32.9,0.171,63.0,0
764,2,122.0,70,0,36.8,0.34,27.0,0
765,5,121.0,72,112,26.2,0.245,30.0,0
766,1,126.0,60,0,30.1,0.349,47.0,1
767,1,93.0,70,0,30.4,0.315,23.0,0


In [5]:
data.shape

(768, 8)

In [7]:
data.describe()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
count,768.0,734.0,768.0,768.0,768.0,768.0,735.0,768.0
mean,3.845052,121.016349,69.105469,79.799479,31.992578,0.471876,33.353741,0.348958
std,3.369578,31.66024,19.355807,115.244002,7.88416,0.331329,11.772944,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,141.0,80.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,846.0,67.1,2.42,81.0,1.0


In [8]:
data.isna().sum()

TimesPregnant     0
glucoseLevel     34
BP                0
insulin           0
BMI               0
Pedigree          0
Age              33
IsDiabetic        0
dtype: int64

In [9]:
def NullsPerColumnCount(series):
    num = series.isna().sum()
    den = len(series)
    percentage = round(num / den, 4)
    return percentage

In [10]:
NullsPerColumnCount(data)

TimesPregnant    0.0000
glucoseLevel     0.0443
BP               0.0000
insulin          0.0000
BMI              0.0000
Pedigree         0.0000
Age              0.0430
IsDiabetic       0.0000
dtype: float64

In [12]:
a = data.isnull().sum(axis=1).tolist()

In [13]:
list(set(a))

[0, 1, 2]

In [14]:
a.count(1) + a.count(2)

64

In [15]:
a.count(0)

704

In [16]:
len(a)

768

In [17]:
PercentNull = (len(a) - a.count(0)) * 100 / len(a)
print(PercentNull)

8.333333333333334


In [18]:
# split values
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, test_size=0.15, random_state=100)
print("Number of samples in training data:", len(train_df))
print("Number of samples in test data:", len(test_df))

Number of samples in training data: 652
Number of samples in test data: 116


**Replace the Nan values in  __`train_df`__ and __`test_df`__  with the mean of EACH feature.**

In [20]:
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

In [21]:
train_df.isna().any()

TimesPregnant    False
glucoseLevel     False
BP               False
insulin          False
BMI              False
Pedigree         False
Age              False
IsDiabetic       False
dtype: bool

In [22]:
test_df.isna().any()

TimesPregnant    False
glucoseLevel     False
BP               False
insulin          False
BMI              False
Pedigree         False
Age              False
IsDiabetic       False
dtype: bool

In [24]:
X_train = train_df.drop("IsDiabetic", 1)
Y_train = train_df["IsDiabetic"]
X_test = test_df.drop("IsDiabetic", 1)
Y_test = test_df["IsDiabetic"]

Use this dataset to train perceptron, logistic regression and random forest models using 15% test split.

In [26]:
# Logistic Regression

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

print("Training a Logistic Regression Model..")
logreg = linear_model.LogisticRegression()
logreg.fit(X_train, Y_train)

logreg_train_acc = logreg.score(X_train, Y_train)
logreg_test_acc = logreg.score(X_test, Y_test)
print("logreg training acuracy= ", logreg_train_acc)
print("logreg test accuracy= ", logreg_test_acc)

Training a Logistic Regression Model..
logreg training acuracy=  0.7745398773006135
logreg test accuracy=  0.75


In [27]:
# Perceptron

from sklearn.linear_model import Perceptron

print("Training a Perceptron Model..")
perceptron = Perceptron(
    max_iter=100, verbose=0, random_state=None, fit_intercept=True, eta0=0.002
)
perceptron.fit(X_train, Y_train)

perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print("perceptron training acuracy= ", perceptron_train_acc)
print("perceptron test accuracy= ", perceptron_test_acc)

Training a Perceptron Model..
perceptron training acuracy=  0.6503067484662577
perceptron test accuracy=  0.646551724137931


In [28]:
# Adaboost

from sklearn.ensemble import AdaBoostClassifier

print("Training an Adaboost Classifier..")

adaboost = AdaBoostClassifier(n_estimators=500)
adaboost.fit(X_train, Y_train)
adaboost_train_acc = adaboost.score(X_train, Y_train)
adaboost_test_acc = adaboost.score(X_test, Y_test)
print("adaboost training acuracy= ", adaboost_train_acc)
print("adaboost test accuracy= ", adaboost_test_acc)

Training an Adaboost Classifier..
adaboost training acuracy=  0.9217791411042945
adaboost test accuracy=  0.6724137931034483


In [29]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=0)
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_test, Y_test)
print("random_forest training acuracy= ", random_forest_train_acc)
print("random_forest test accuracy= ", random_forest_test_acc)

random_forest training acuracy=  0.7638036809815951
random_forest test accuracy=  0.7413793103448276


**Comment**

Mean imputation is not the best type of imputation to use. Because it has several problems such as: 
- Mean imputation reduces the variance of the imputed variables.
- Mean imputation shrinks standard errors, which invalidates most hypothesis tests and the calculation of confidence interval.
- Mean imputation does not preserve relationships between variables such as correlations.

Alternative methods to impute data: Paul Allison (2009) suggests either maximum likelihood estimation or multiple imputation methods, both of which try to preserve relationships between variables and the inherent variability of the data. Stochastic regression imputation: The predicted value from a regression plus a random residual value.This has all the advantages of regression imputation but adds in the advantages of the random component. Most multiple imputation is based off of some form of stochastic regression imputation.

# Using Guess Matrix


__Add columns __`BMI_band`__ & __`Pedigree_band`__  to  __`Data`__  by cutting __`BMI`__ & __`Pedigree`__ into 3 intervals.


In [30]:
data["BMI_band"] = pd.cut(data["BMI"], 3)
data["Pedigree_band"] = pd.cut(data["Pedigree"], 3)

In [31]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,72,0,33.6,0.627,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,66,0,26.6,0.351,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,64,0,23.3,0.672,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,66,94,28.1,0.167,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,40,168,43.1,2.288,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"


In [32]:
print("BMI_Band_Interval: ")
data["BMI_band"].unique()

BMI_Band_Interval: 


[(22.367, 44.733], (-0.0671, 22.367], (44.733, 67.1]]
Categories (3, interval[float64]): [(-0.0671, 22.367] < (22.367, 44.733] < (44.733, 67.1]]

In [33]:
print("Pedigree_Band_Interval: ")
data["Pedigree_band"].unique()

Pedigree_Band_Interval: 


[(0.0757, 0.859], (1.639, 2.42], (0.859, 1.639]]
Categories (3, interval[float64]): [(0.0757, 0.859] < (0.859, 1.639] < (1.639, 2.42]]

Group __`data`__ by __`Pedigree_band`__ & determine ratio of diabetic in each band.

In [34]:
pedigree_DiabeticRatio = (
    data[["Pedigree_band", "IsDiabetic"]]
    .groupby(["Pedigree_band"], as_index=False)
    .mean()
    .sort_values(by="Pedigree_band", ascending=True)
)
pedigree_DiabeticRatio

Unnamed: 0,Pedigree_band,IsDiabetic
0,"(0.0757, 0.859]",0.327007
1,"(0.859, 1.639]",0.540541
2,"(1.639, 2.42]",0.444444


In [35]:
pedigree_DiabeticRatio["IsDiabetic"][1]

0.5405405405405406

Group  __`data`__ by __`BMI_band`__ & determine ratio of diabetic in each band.

In [36]:
BMI_DiabeticRatio = (
    data[["BMI_band", "IsDiabetic"]]
    .groupby(["BMI_band"], as_index=False)
    .mean()
    .sort_values(by="BMI_band", ascending=True)
)
BMI_DiabeticRatio

Unnamed: 0,BMI_band,IsDiabetic
0,"(-0.0671, 22.367]",0.039216
1,"(22.367, 44.733]",0.358297
2,"(44.733, 67.1]",0.611111


In [37]:
BMI_DiabeticRatio["IsDiabetic"][1]

0.35829662261380324

Convert features - 'BP','insulin','BMI' and 'Pedigree'   into categorical values by mapping different bands of values of these features to integers 0,1,2. 

In [39]:
columns_to_segment = ["BP", "insulin", "BMI", "Pedigree"]
for column in columns_to_segment:
    data[column] = pd.cut(data[column], 3, labels=[0, 1, 2])

In [40]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,1,0,1,0,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,1,0,1,0,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,1,0,1,0,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,1,0,1,0,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,0,0,1,2,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"



Consider the original dataset, instead of generalizing the NAN values with the mean of the feature, I will try assigning values to NANs based on some hypothesis. For example for age I assume that the relation between BMI and BP of people is a reflection of the age group. I can have 9 types of BMI and BP relations and my aim is to find the median age of each of that group:

The Age guess matrix will look like this:  

| BMI | 0       | 1      | 2  |
|-----|-------------|------------- |----- |
| BP  |             |              |      |
| 0   | a00         | a01          | a02  |
| 1   | a10         | a11          | a12  |
| 2   | a20         | a21          |  a22 |


Now I will create a guess_matrix  for NaN values of *'Age'* ( using 'BMI' and 'BP')  and  *'glucoseLevel'*  (using 'BP' and 'Pedigree') for the given dataset and assign values accordingly to the NaNs in 'Age' or *'glucoseLevel'* .


In [42]:
guess_age = np.zeros((3, 3), dtype=int)
guess_age

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [43]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,1,0,1,0,50.0,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,,1,0,1,0,31.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,1,0,1,0,,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,,1,0,1,0,21.0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,0,0,1,2,33.0,1,"(22.367, 44.733]","(1.639, 2.42]"


In [44]:
# Display guess Age Matrix

for i in range(0, 3):
    for j in range(0, 3):
        df1 = data[data["BP"] == i]
        df2 = df1[df1["BMI"] == j]["Age"].dropna()
        age_guess = df2.median()
        guess_age[i, j] = int(age_guess)

print("Guess Matrix for Age:\n", guess_age)

Guess Matrix for Age:
 [[24 29 33]
 [25 29 32]
 [55 37 31]]


In [45]:
# Assigning guess Age values to NaN Age values in the dataset

for i in range(0, 3):
    for j in range(0, 3):
        row_mask = (data.Age.isnull()) & (data.BP == i) & (data.BMI == j)
        data.loc[row_mask, "Age"] = guess_age[i, j]

data["Age"] = data["Age"].apply(np.int64)

In [46]:
data["Age"].isna().any()

False

In [47]:
guess_glucoseLevel = np.zeros((3, 3), dtype=int)
guess_glucoseLevel

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [48]:
# Display guess glucose Level Matrix

for i in range(0, 3):
    for j in range(0, 3):
        df1 = data[data["BP"] == i]
        df2 = df1[df1["Pedigree"] == j]["glucoseLevel"].dropna()
        glucoseLevel_guess = df2.median()
        guess_glucoseLevel[i, j] = int(glucoseLevel_guess)

print("Guess Matrix for Glucose Level:\n", guess_glucoseLevel)

Guess Matrix for Glucose Level:
 [[115 127 137]
 [112 115 149]
 [133 129 159]]


In [49]:
# Assigning guess Age values to NaN Age values in the dataset

for i in range(0, 3):
    for j in range(0, 3):
        row_mask = (data.glucoseLevel.isnull()) & (data.BP == i) & (data.Pedigree == j)
        data.loc[row_mask, "glucoseLevel"] = guess_glucoseLevel[i, j]

In [50]:
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,148.0,1,0,1,0,50,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,112.0,1,0,1,0,31,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,183.0,1,0,1,0,29,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,112.0,1,0,1,0,21,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,137.0,0,0,1,2,33,1,"(22.367, 44.733]","(1.639, 2.42]"


In [51]:
data["glucoseLevel"].isna().any()

False

In [52]:
data["glucoseLevel"] = pd.cut(data["glucoseLevel"], 4, labels=[0, 1, 2, 3])
data["Age"] = pd.cut(data["Age"], 4, labels=[0, 1, 2, 3])
data.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic,BMI_band,Pedigree_band
0,6,2,1,0,1,0,1,1,"(22.367, 44.733]","(0.0757, 0.859]"
1,1,2,1,0,1,0,0,0,"(22.367, 44.733]","(0.0757, 0.859]"
2,8,3,1,0,1,0,0,1,"(22.367, 44.733]","(0.0757, 0.859]"
3,1,2,1,0,1,0,0,0,"(22.367, 44.733]","(0.0757, 0.859]"
4,0,2,0,0,1,2,0,1,"(22.367, 44.733]","(1.639, 2.42]"


Use this dataset (with all features in categorical form) to train perceptron, logistic regression and random forest models using 15% test split.


In [53]:
train_df, test_df = train_test_split(data, test_size=0.15, random_state=100)
X_train = train_df.drop(train_df.columns[[-1, -2, -3]], axis=1)
Y_train = train_df["IsDiabetic"]
X_test = test_df.drop(test_df.columns[[-1, -2, -3]], axis=1)
Y_test = test_df["IsDiabetic"]

print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(652, 7) (652,) (116, 7) (116,)


In [54]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

In [55]:
# Logistic Regression

logreg = linear_model.LogisticRegression()
logreg.fit(X_train, Y_train)

logreg_train_acc = logreg.score(X_train, Y_train)
logreg_test_acc = logreg.score(X_test, Y_test)
print("logreg training acuracy= ", logreg_train_acc)
print("logreg test accuracy= ", logreg_test_acc)

logreg training acuracy=  0.754601226993865
logreg test accuracy=  0.7155172413793104


In [56]:
# Perceptron

perceptron = Perceptron(
    max_iter=100, verbose=0, random_state=None, fit_intercept=True, eta0=0.002
)
perceptron.fit(X_train, Y_train)

perceptron_train_acc = perceptron.score(X_train, Y_train)
perceptron_test_acc = perceptron.score(X_test, Y_test)
print("perceptron training acuracy= ", perceptron_train_acc)
print("perceptron test accuracy= ", perceptron_test_acc)

perceptron training acuracy=  0.6641104294478528
perceptron test accuracy=  0.6637931034482759


In [57]:
# Random Forest

random_forest = RandomForestClassifier()
random_forest.fit(X_train, Y_train)
random_forest_train_acc = random_forest.score(X_train, Y_train)
random_forest_test_acc = random_forest.score(X_test, Y_test)
print("random_forest training acuracy= ", random_forest_train_acc)
print("random_forest test accuracy= ", random_forest_test_acc)

random_forest training acuracy=  0.8742331288343558
random_forest test accuracy=  0.6551724137931034
