In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB

In [14]:
df = pd.read_csv('diabetes.csv',
                 names=("Pregnancy Rate", "Glucose", "Blood Pressure", "Skin", "Insulin", "BMI", "Diabetes.Inheritence", "Age","Diabetes"),
                 na_values={"Glucose": 0, "Blood Pressure": 0, "Skin": 0, "Insulin": 0, "BMI": 0, "Diabetes-Inheritence": 0, "Age": 0})

df.dropna(inplace=True)

In [15]:
X = df.iloc[:, 0:-1]
y = df["Diabetes"]
scaler = MinMaxScaler()

Xs = scaler.fit_transform(X)

In [16]:
quartiles = df.quantile([0.25, 0.50, 0.75])
quartiles

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes
0.25,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
0.5,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
0.75,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0


In [17]:
dfC = df

def convert_to_categorical(value, column):
    if value <= quartiles.iloc[0, column]:
        return 'low'
    elif value <= quartiles.iloc[1, column]:
        return 'quite low'
    elif value <= quartiles.iloc[2, column]:
        return 'quite high'
    else:
        return 'high'

for column in range(8):
    dfC[f'{column}'] = dfC.iloc[:, column].apply(lambda x: convert_to_categorical(x, column))
dfC

Unnamed: 0,Pregnancy Rate,Glucose,Blood Pressure,Skin,Insulin,BMI,Diabetes.Inheritence,Age,Diabetes,0,1,2,3,4,5,6,7
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,low,low,quite low,quite low,quite low,low,low,low
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,low,quite high,low,quite high,quite high,high,high,quite high
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1,quite high,low,low,quite high,quite low,quite low,low,quite low
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1,quite low,high,quite low,high,high,quite low,low,high
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1,low,high,low,quite low,high,quite low,quite low,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,0,181.0,88.0,44.0,510.0,43.3,0.222,26,1,low,high,high,high,high,high,low,quite low
755,1,128.0,88.0,39.0,110.0,36.5,1.057,37,1,low,quite high,high,high,quite low,quite high,high,high
760,2,88.0,58.0,26.0,16.0,28.4,0.766,22,0,quite low,low,low,quite low,low,low,high,low
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0,high,quite low,quite high,high,quite high,quite low,low,high


In [18]:
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.20, random_state=3, stratify=y)

In [19]:
X_testS = scaler.fit_transform(X_test)

In [20]:
# K-nearest neighbhour
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
print(knn.predict(X_test))

0.8227848101265823
[0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


In [21]:
# CategoricalNB
cnb = CategoricalNB()

In [22]:
# Logistic Regression
log_reg = LogisticRegression(random_state=16)
log_reg.fit(X_train, y_train)
print(log_reg.score(X_test, y_test))
print(log_reg.predict(X_test))

0.8481012658227848
[0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]
