In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
## Read the diabetes CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    "../input/pima-indians-diabetes-database/diabetes.csv"
)

In [None]:
## Class balance: bar chart of how many rows fall in each Outcome value
df['Outcome'].value_counts().plot.bar()

In [None]:
## Correlation heatmap (lower triangle only)
corr = df.corr()

# np.triu keeps the upper triangle including the diagonal; those cells are
# masked out so each pairwise correlation is drawn exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

In [None]:
# Build a plotting copy whose Outcome column carries readable labels
# (1 -> "Diabetic", anything else -> "Non-Diabetic"); df itself is untouched.
df_temp = df.assign(
    Outcome=np.where(df['Outcome'] == 1, "Diabetic", "Non-Diabetic")
)
sns.pairplot(df_temp, hue="Outcome")

In [None]:
## Per-column quartiles and interquartile range, used for outlier detection
q1, q3 = df.quantile(0.25), df.quantile(0.75)

IQR = q3 - q1
IQR

In [None]:
## Drop every row that has at least one value outside the 1.5 * IQR fences
outlier_mask = (
    df.lt(q1 - 1.5 * IQR) | df.gt(q3 + 1.5 * IQR)
).any(axis=1)
df_updated = df[~outlier_mask]

# Row/column counts before and after filtering
df.shape, df_updated.shape

In [None]:
# Feature matrix (every column except the label) and label vector as NumPy arrays.
# NOTE(review): these are taken from the full `df`, not the outlier-filtered
# `df_updated` computed earlier -- confirm that is intended.
X = df.drop(columns=['Outcome']).to_numpy()
Y = df['Outcome'].to_numpy()

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=711,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

### kNN Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Sweep k = 1..10 and report the hold-out accuracy for each neighbourhood size
for i in range(1, 11):
    model_knn = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train)
    y_pred_knn = model_knn.predict(x_test)

    knn_accuracy = metrics.accuracy_score(y_test, y_pred_knn)
    print("Accuracy: {} for {} nearest neighbors".format(knn_accuracy, i))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Sweep the forest size (1, 11, ..., 91 trees) and report hold-out accuracy and ROC AUC.
for i in range(1, 100, 10):
    # A fixed random_state makes bootstrap sampling and feature selection
    # repeatable, so the reported metrics are stable across Restart & Run All.
    model_rf = RandomForestClassifier(
        n_estimators=i,
        bootstrap=True,
        max_features='sqrt',
        random_state=711,
    )
    model_rf.fit(x_train, y_train)

    pred_rf = model_rf.predict(x_test)
    # Column 1 of predict_proba is the probability of the positive class (label 1)
    prob_rf = model_rf.predict_proba(x_test)[:, 1]

    roc_value = roc_auc_score(y_test, prob_rf)
    score_rf = accuracy_score(y_test, pred_rf)
    print("Accuracy: {} and ROC value: {} for {} estimators".format(score_rf, roc_value, i))

### ANN Classification - PyTorch

In [None]:
# Convert the NumPy splits to tensors: float32 features for the linear layers,
# int64 class indices for the targets.
x_train_pt = torch.tensor(x_train, dtype=torch.float32)
x_test_pt = torch.tensor(x_test, dtype=torch.float32)
y_train_pt = torch.tensor(y_train, dtype=torch.long)
y_test_pt = torch.tensor(y_test, dtype=torch.long)

In [None]:
## Building a model
class NN_model(nn.Module):
    """Fully connected classifier with three sigmoid-activated hidden layers.

    Args:
        inp_features: Number of input features per sample.
        hn1: Width of the first hidden layer.
        hn2: Width of the second hidden layer.
        hn3: Width of the third hidden layer.
        out_features: Number of output classes.
    """

    def __init__(self, inp_features = 8, hn1 = 20, hn2 = 30, hn3 = 20, out_features = 2):
        super().__init__()
        self.fc1 = nn.Linear(inp_features, hn1)
        self.fc2 = nn.Linear(hn1, hn2)
        self.fc3 = nn.Linear(hn2, hn3)
        self.out = nn.Linear(hn3, out_features)

    def forward(self, x):
        """Return raw class scores (logits) for ``x``.

        Args:
            x: Float tensor whose last dimension is ``inp_features``.

        Returns:
            Tensor of unnormalized scores whose last dimension is ``out_features``.
            No softmax is applied here: ``nn.CrossEntropyLoss`` applies
            log-softmax internally, so normalizing first would squash the
            gradients. ``argmax`` over the logits gives the same class as
            ``argmax`` over the softmax probabilities; call
            ``torch.softmax(logits, dim=-1)`` when probabilities are needed.
        """
        # torch.sigmoid replaces the deprecated F.sigmoid
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return self.out(x)

In [None]:
# Seed the RNG before construction so the initial weights are reproducible
torch.manual_seed(711)
model_pt = NN_model()
# Display the module itself to show its layers; the bare `.parameters`
# attribute only rendered a bound-method repr.
model_pt

In [None]:
## Params
loss_fn = nn.CrossEntropyLoss()
# Optimize the network built above (`model_pt`); the name `model` was never defined.
optimizer = torch.optim.Adam(model_pt.parameters(), lr = 0.005)
epochs = 1000

## Training Loop (full-batch: every epoch uses the entire training set)
final_losses = []
for i in range(epochs):
    # Call the module rather than .forward() so registered hooks are honoured
    y_pred = model_pt(x_train_pt)
    loss = loss_fn(y_pred, y_train_pt)
    # Store a plain float: keeping the tensor would retain every epoch's
    # autograd graph and cannot be plotted directly by matplotlib.
    final_losses.append(loss.item())
    if i % 10 == 1:
        print("Epoch number: {} and the loss : {}".format(i, loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Training-loss curve: epochs on the x-axis, loss on the y-axis.
# float() accepts both plain numbers and scalar tensors, so the plot works
# regardless of how the losses were recorded.
loss_values = [float(loss_value) for loss_value in final_losses]
plt.plot(range(len(loss_values)), loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss');

In [None]:
# Predict the class of every test sample in a single batched forward pass.
# Uses `model_pt` (the trained network); the name `model` was never defined.
model_pt.eval()
with torch.no_grad():
    # argmax over the class dimension picks the highest-scoring class per row
    predictions = model_pt(x_test_pt).argmax(dim=1).tolist()

# Preview a few predictions instead of printing one line per sample
predictions[:10]

In [None]:
# confusion_matrix(y_true, y_pred): rows are the actual classes and columns
# are the predicted classes.
cm = confusion_matrix(y_test_pt, predictions)

plt.figure(figsize=(10, 6))
# fmt='d' prints the integer counts exactly instead of in '.2g' notation
sns.heatmap(cm, annot=True, fmt='d')
# Heatmap columns map to the x-axis (predicted), rows to the y-axis (actual)
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values');

In [None]:
# Fraction of hold-out samples whose predicted class matches the true label
score = accuracy_score(y_true=y_test_pt, y_pred=predictions)
score