In [15]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB

In [16]:
df = pd.read_csv('diabetes.csv', names=("Pregnancy Rate", "Glucose", "Blood Pressure", "Skin", "Insulin", "BMI", "Diabetes.Inheritence", "Age", "Diabetes"),
                 na_values={"Glucose": 0, "Blood Pressure": 0, "Skin": 0, "Insulin": 0, "BMI": 0, "Diabetes-Inheritence": 0, "Age": 0})

df.dropna(inplace=True)

In [17]:
# Dataset
df.head(n=20)

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
14,5,166.0,72.0,19.0,175.0,25.8,0.587,51,1
16,0,118.0,84.0,47.0,230.0,45.8,0.551,31,1
18,1,103.0,30.0,38.0,83.0,43.3,0.183,33,0
19,1,115.0,70.0,30.0,96.0,34.6,0.529,32,1
20,3,126.0,88.0,41.0,235.0,39.3,0.704,27,0


In [18]:
X = df.iloc[:, 0:-1]
y = df["Diabetes"]
scaler = MinMaxScaler() # MinMaxScaler used

Xs = scaler.fit_transform(X)

In [19]:
quartiles = df.quantile([0.25, 0.50, 0.75]) # Quantiles for CategoricalNB
quartiles

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes
0.25,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
0.5,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
0.75,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0


In [20]:
dfC = df.copy() 

# Function to properly value things for CategoricalNB
def convert_to_categorical(value, column): 
    if value <= quartiles.iloc[0][column]:
        return 0
    if value <= quartiles.iloc[1][column]:
        return 1
    if value <= quartiles.iloc[2][column]:
        return 2
    else:
        return 3


for column in df.drop('Diabetes', axis=1):
    dfC[column] = df[column].apply(lambda x: convert_to_categorical(x, column))
dfC.head(n=20)

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes
3,0,0,1,1,1,0,0,0,0
4,0,2,0,2,2,3,3,2,1
6,2,0,0,2,1,1,0,1,1
8,1,3,1,3,3,1,0,3,1
13,0,3,0,1,3,1,1,3,1
14,2,3,2,0,2,0,2,3,1
16,0,1,3,3,3,3,2,2,1
18,0,1,0,3,1,3,0,2,0
19,0,1,1,2,1,2,2,2,1
20,2,2,3,3,3,3,3,1,0


In [21]:
df.head(n=1)

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0


In [22]:
# Train test split for K-nearest and Logistical Regression
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.20, random_state=3, stratify=y)

In [23]:
# K-nearest neighbhour
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
print(knn.predict(X_test))

0.8227848101265823
[0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


In [24]:
# CategoricalNB
Xc = dfC.iloc[:, 0:-1]

# Another split for this model
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(Xc, y, test_size=0.20, random_state=3, stratify=y)
cnb = CategoricalNB()
cnb.fit(X_train_c, y_train_c)
print(cnb.score(X_test_c, y_test_c))
print(cnb.predict(X_test_c))

0.7721518987341772
[0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0
 1 0 1 0 0]


In [25]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print(log_reg.score(X_test, y_test))
print(log_reg.predict(X_test))

0.8481012658227848
[0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]
