In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

import warnings
# Suppress every Python warning for the rest of the session; library warnings
# raised by later cells will therefore not be displayed.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from 'diabetes.csv', resolved relative to the current
# working directory.
df = pd.read_csv('diabetes.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# List the column names available in the frame.
df.columns


In [None]:
# Print per-column dtype and non-null counts.
df.info()

In [None]:
# Count fully duplicated rows; the count is only displayed, nothing is dropped.
df.duplicated().sum()

In [None]:
def outliers(x):
    """Show one box plot per int/float column of the DataFrame ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect. Only columns selected by
        ``select_dtypes(include=[int, float])`` are plotted.

    Returns
    -------
    None
        Each column is drawn in its own figure, titled with the column name.
    """
    numeric = x.select_dtypes(include=[int, float])
    for col in numeric:
        # Plot from the frame that was passed in, not from the global `df`,
        # so the function works for any DataFrame.
        sns.boxplot(x=numeric[col])
        plt.title(col)
        plt.show()

In [None]:
# Draw a box plot for each numeric column to look for outliers.
outliers(df)

In [None]:
def cal_uf_lf(q1, q3):
    """Return the 1.5 * IQR outlier fences for the given quartiles.

    Parameters
    ----------
    q1 : number
        First quartile (25th percentile).
    q3 : number
        Third quartile (75th percentile).

    Returns
    -------
    list
        ``[lf, uf]`` where ``lf = q1 - 1.5 * (q3 - q1)`` and
        ``uf = q3 + 1.5 * (q3 - q1)``. Both fences are also printed.
    """
    whisker = (q3 - q1) * 1.5
    lower_fence, upper_fence = q1 - whisker, q3 + whisker
    print("lf-->", lower_fence, "uf-->", upper_fence)
    return [lower_fence, upper_fence]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# 1.5*IQR fences for 'Pregnancies'. The quartiles are computed from the column
# itself instead of being typed in by hand, so the fences stay correct if the
# CSV changes.
[lf, uf] = cal_uf_lf(df['Pregnancies'].quantile(0.25), df['Pregnancies'].quantile(0.75))

In [None]:
# Cap 'Pregnancies' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['Pregnancies'] = np.where(df['Pregnancies']>uf, uf, df['Pregnancies'])
df['Pregnancies'] = np.where(df['Pregnancies']<lf, lf, df['Pregnancies'])

In [None]:
# Box plot of 'Pregnancies' to check the result of the capping.
plt.boxplot(df['Pregnancies'])
plt.show()

In [None]:
# 1.5*IQR fences for 'Glucose', computed from the column's own quartiles.
# The fences are applied right here: the following cell reassigns lf/uf for
# 'BloodPressure', so without this capping the Glucose fences would be
# discarded unused and 'Glucose' would be the only feature left uncapped.
# NOTE(review): assumes capping Glucose was the intent — confirm.
[lf, uf] = cal_uf_lf(df['Glucose'].quantile(0.25), df['Glucose'].quantile(0.75))
df['Glucose'] = np.where(df['Glucose']>uf, uf, df['Glucose'])
df['Glucose'] = np.where(df['Glucose']<lf, lf, df['Glucose'])

In [None]:
# 1.5*IQR fences for 'BloodPressure', computed from the column's own quartiles
# rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['BloodPressure'].quantile(0.25), df['BloodPressure'].quantile(0.75))

In [None]:
# Cap 'BloodPressure' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['BloodPressure'] = np.where(df['BloodPressure']>uf, uf, df['BloodPressure'])
df['BloodPressure'] = np.where(df['BloodPressure']<lf, lf, df['BloodPressure'])

In [None]:
# Box plot of 'BloodPressure' to check the result of the capping.
# plt.show() renders the figure and stops the dict of artists returned by
# plt.boxplot from being echoed as the cell output.
plt.boxplot(df['BloodPressure'])
plt.show()

In [None]:
# 1.5*IQR fences for 'SkinThickness', computed from the column's own quartiles
# rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['SkinThickness'].quantile(0.25), df['SkinThickness'].quantile(0.75))

In [None]:
# Cap 'SkinThickness' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['SkinThickness'] = np.where(df['SkinThickness']>uf, uf, df['SkinThickness'])
df['SkinThickness'] = np.where(df['SkinThickness']<lf, lf, df['SkinThickness'])

In [None]:
# Box plot of 'SkinThickness' to check the result of the capping.
# plt.show() renders the figure and stops the dict of artists returned by
# plt.boxplot from being echoed as the cell output.
plt.boxplot(df['SkinThickness'])
plt.show()

In [None]:
# 1.5*IQR fences for 'Insulin', computed from the column's own quartiles
# rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['Insulin'].quantile(0.25), df['Insulin'].quantile(0.75))

In [None]:
# Cap 'Insulin' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['Insulin'] = np.where(df['Insulin']>uf, uf, df['Insulin'])
df['Insulin'] = np.where(df['Insulin']<lf, lf, df['Insulin'])

In [None]:
# Box plot of 'Insulin' to check the result of the capping.
# plt.show() renders the figure and stops the dict of artists returned by
# plt.boxplot from being echoed as the cell output.
plt.boxplot(df['Insulin'])
plt.show()

In [None]:
# 1.5*IQR fences for 'BMI', computed from the column's own quartiles
# rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['BMI'].quantile(0.25), df['BMI'].quantile(0.75))

In [None]:
# Cap 'BMI' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['BMI'] = np.where(df['BMI']>uf, uf, df['BMI'])
df['BMI'] = np.where(df['BMI']<lf, lf, df['BMI'])

In [None]:
# Box plot of 'BMI' to check the result of the capping.
plt.boxplot(df['BMI'])
plt.show()

In [None]:
# Summary statistics again, now reflecting the columns capped so far.
df.describe()

In [None]:
# 1.5*IQR fences for 'DiabetesPedigreeFunction', computed from the column's own
# quartiles rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['DiabetesPedigreeFunction'].quantile(0.25), df['DiabetesPedigreeFunction'].quantile(0.75))

In [None]:
# Cap 'DiabetesPedigreeFunction' to the current fences: values above `uf`
# become `uf`, then values below `lf` become `lf`. Overwrites the column.
df['DiabetesPedigreeFunction'] = np.where(df['DiabetesPedigreeFunction']>uf, uf, df['DiabetesPedigreeFunction'])
df['DiabetesPedigreeFunction'] = np.where(df['DiabetesPedigreeFunction']<lf, lf, df['DiabetesPedigreeFunction'])

In [None]:
# Box plot of 'DiabetesPedigreeFunction' to check the result of the capping.
# plt.show() renders the figure and stops the dict of artists returned by
# plt.boxplot from being echoed as the cell output.
plt.boxplot(df['DiabetesPedigreeFunction'])
plt.show()

In [None]:
# 1.5*IQR fences for 'Age', computed from the column's own quartiles
# rather than hand-typed constants.
[lf, uf] = cal_uf_lf(df['Age'].quantile(0.25), df['Age'].quantile(0.75))

In [None]:
# Cap 'Age' to the current fences: values above `uf` become `uf`,
# then values below `lf` become `lf`. The column is overwritten in place.
df['Age'] = np.where(df['Age']>uf, uf, df['Age'])
df['Age'] = np.where(df['Age']<lf, lf, df['Age'])

In [None]:
# Box plot of 'Age' to check the result of the capping.
# plt.show() renders the figure and stops the dict of artists returned by
# plt.boxplot from being echoed as the cell output.
plt.boxplot(df['Age'])
plt.show()

In [None]:
# Fences for quartiles (0, 1): lf = -1.5 and uf = 2.5.
[lf, uf] = cal_uf_lf(0.000000, 1.000000)

In [None]:
# 'Outcome' is the column the models predict, not a measurement, so it is left
# untouched rather than capped. Fences built from quartiles (0, 1) are
# (-1.5, 2.5); a 0/1 label (presumably what this column holds — verify) can
# never cross them, and passing the column through np.where with float fences
# would only turn integer labels into floats. Show the class balance instead.
df['Outcome'].value_counts()

In [None]:
# Box plot of the 'Outcome' column.
plt.boxplot(df['Outcome'])
plt.show()

In [None]:
# Target vector: the 'Outcome' column. Feature matrix: every other column.
y  = df['Outcome']
x = df.drop(columns='Outcome')

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=88)

In [None]:
# Raise the solver's iteration cap above scikit-learn's default of 100. The
# features are fed in unscaled, so the default may stop before convergence,
# and this notebook silences all warnings, which would hide the resulting
# ConvergenceWarning. A model that already converges is unaffected.
log_model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression on the training split.
log_model.fit(x_train, y_train)

In [None]:
# Mean accuracy on each split; a large train/test gap would indicate overfitting.
print("Train accuracy:", log_model.score(x_train, y_train))
print("Test accuracy:", log_model.score(x_test, y_test))

In [None]:
# Baseline k-nearest-neighbours classifier with 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(x_train, y_train)

In [None]:
# Mean accuracy of the current `knn` model on each split.
print("Train accuracy:", knn.score(x_train, y_train))
print("Test accuracy:", knn.score(x_test, y_test))

In [None]:
# Sweep odd neighbour counts 3, 5, ..., 19 and record the accuracy on both
# splits for each one.
k = []   # neighbour counts tried
tr = []  # train accuracy per k, rounded to 2 decimals
te = []  # test accuracy per k, rounded to 2 decimals

for i in range(3, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    print("KNeighborsClassifier:", i)
    k.append(i)

    # Score each split once and reuse the value for both the printout and the
    # stored list, instead of running the prediction twice per split.
    train_acc = knn.score(x_train, y_train)
    test_acc = knn.score(x_test, y_test)

    print("Train accuracy:", train_acc)
    tr.append(round(train_acc, 2))
    print("Test accuracy:", test_acc)
    te.append(round(test_acc, 2))

    print("**********************************")

# NOTE: after the loop `knn` refers to the last model fitted (n_neighbors=19),
# which is the model any later cell using `knn` will see.

In [None]:
# Show the rounded train accuracies, the rounded test accuracies, and the
# neighbour counts they correspond to (same order in all three lists).
print(tr)
print(te)
print(k)






In [None]:
# Train vs. test accuracy as a function of the number of neighbours.
plt.figure(figsize=(15, 5))

# One plot call per curve, carrying both the style and the label, so each
# legend entry has the same colour and marker as the line actually drawn.
plt.plot(k, tr, "go--", label="train")
plt.plot(k, te, "bo--", label="test")

# Annotate every point with its accuracy. The loop variables are deliberately
# not called `x` / `y`: those names hold the feature matrix and target, and
# rebinding them here would break any re-run of the train/test split cell.
for k_val, acc in zip(k, tr):
    plt.text(k_val, acc, acc)

for k_val, acc in zip(k, te):
    plt.text(k_val, acc, acc)

plt.xticks(k)
plt.xlabel("n_neighbors")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
# Per-class precision/recall/F1 on the training split.
# `knn` here is the last model fitted by the sweep loop above (n_neighbors=19),
# not the n_neighbors=3 baseline.
print(metrics.classification_report(y_train, knn.predict(x_train)))

In [None]:
# Per-class precision/recall/F1 on the held-out test split, using the same
# `knn` model (the last one fitted by the sweep loop, n_neighbors=19).
print(metrics.classification_report(y_test, knn.predict(x_test)))

In [None]:
# Two hand-written sample rows, one list entry per row, keyed by feature name
# (every column of `df` except 'Outcome').
f = {'Pregnancies':[6, 1], 'Glucose':[148, 85], 'BloodPressure':[72, 66], 'SkinThickness':[35, 29], 'Insulin':[0, 0], 'BMI':[33.6, 26.6], 'DiabetesPedigreeFunction':[.627, .351], 'Age':[50, 31]}

In [None]:
# Turn the sample dict into a DataFrame so it can be passed to predict().
ff = pd.DataFrame(f)

In [None]:
# Predict the class of the two sample rows with the current `knn` model
# (the last one fitted by the sweep loop, n_neighbors=19).
# NOTE(review): the sample values are passed as typed, without the capping
# applied to the training columns — confirm this is intended.
knn.predict(ff)

In [None]:
# Display the DataFrame as it stands after the in-place capping above.
df