In [170]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset,DataLoader
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [171]:
device=  'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [172]:
data_df = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')
data_df.head(2)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0


In [173]:
print(data_df.columns)
print(data_df.isnull().sum())
data_df.dropna(inplace=True)
data_df.shape

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


(253680, 22)

In [174]:
X_train,X_temp , Y_train,Y_temp = train_test_split(data_df.drop('Diabetes_binary',axis=1),data_df.Diabetes_binary,test_size=.3,random_state=69,stratify=data_df.Diabetes_binary)
X_val,X_test,Y_val,Y_test = train_test_split(X_temp,Y_temp,test_size=.5,random_state=69,stratify=Y_temp)

In [175]:
print(X_train.shape,X_val.shape,X_test.shape)

(177576, 21) (38052, 21) (38052, 21)


In [176]:
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()

In [177]:
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [178]:
class dataset(Dataset):
    def __init__(self,x,y):
        self.x = torch.tensor(x,dtype=torch.float32).to(device)
        self.y = torch.tensor(y,dtype=torch.float32).to(device)
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self,index):
        return self.x[index] , self.y[index]
    

In [179]:
y_train_long = torch.tensor(Y_train.values).to(torch.long)
class_count =torch.bincount(y_train_long)
class_weight = 1./class_count.float()
sample_weights=class_weight[y_train_long]


In [180]:
from torch.utils.data import WeightedRandomSampler

sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

In [181]:
train_dataset = dataset(X_train_scaled,Y_train.values)
validation_dataset = dataset(X_val_scaled,Y_val.values)
test_dataset = dataset(X_test_scaled,Y_test.values)

In [None]:
train_dataloader = DataLoader(train_dataset,batch_size=32,sampler=sampler)
val_dataloader = DataLoader(validation_dataset,batch_size=32,shuffle=False)
test_dataloader = DataLoader(test_dataset,batch_size=32,shuffle=False)

In [205]:
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(data_df.drop("Diabetes_binary",axis=1).shape[1],32)
        self.linear = nn.Linear(32,8)
        self.bn1 = nn.BatchNorm1d(32)
        self.output =nn.Linear(8,1)
        self.bn2 = nn.BatchNorm1d(8)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4=nn.BatchNorm1d(32)
        #self.linear2 = nn.Linear(64,32)
        self.linear1 = nn.Linear(128,64)
        self.relu=nn.LeakyReLU(0.1)
        self.dropout=nn.Dropout(.1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x):
        x = self.relu(self.bn1(self.input_layer(x)))
        #x=self.dropout(x)
        x=self.relu(self.bn2(self.linear(x)))
        #x=self.dropout(x)
        #x=self.relu(self.bn3(self.linear1(x)))
        #x=self.dropout(x)
        #x=self.relu(self.bn4(self.linear2(x)))
        x=self.output(x)
        
        return x


In [206]:
model = MyModel().to(device)

In [207]:
summary(model,(data_df.drop('Diabetes_binary',axis=1).shape[1],))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             704
       BatchNorm1d-2                   [-1, 32]              64
         LeakyReLU-3                   [-1, 32]               0
            Linear-4                    [-1, 8]             264
       BatchNorm1d-5                    [-1, 8]              16
         LeakyReLU-6                    [-1, 8]               0
            Linear-7                    [-1, 1]               9
Total params: 1,057
Trainable params: 1,057
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01
----------------------------------------------------------------


In [208]:
data_df.Diabetes_binary.value_counts()

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

In [209]:
labels = data_df.Diabetes_binary
n_pos= (labels == 1).sum()
n_neg = (labels ==0).sum()
pos_weight = n_neg/n_pos

In [210]:
#criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight],device=device))
criterion = nn.BCEWithLogitsLoss()
optimazer =Adam(model.parameters(),lr=1e-3,weight_decay = 1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimazer,factor=.5,patience=2)

In [211]:
total_loss_train_plot = []
total_loss_validation_plot = []
total_acc_train_plot = []
total_acc_validation_plot = []

EPOCHS = 10

for epoch in range(EPOCHS):
    total_acc_train = 0
    total_loss_train = 0 
    total_acc_val = 0
    total_loss_val = 0
    model.train()
    for input,label in train_dataloader:
        input =input.to(device)
        label=label.to(device)
        optimazer.zero_grad()
        
        pred = model(input).squeeze(1)
        batch_loss = criterion(pred,label)
        total_loss_train += batch_loss.item()
        batch_loss.backward()
        
        preds = torch.sigmoid(pred)
        acc = (preds.round()==label).sum().item()
        total_acc_train +=acc
        
        optimazer.step()
        
        
    with torch.no_grad():
        model.eval()
        for input,label in val_dataloader:
            input=input.to(device)
            label=label.to(device)
            pred = model(input).squeeze(1)
            batch_loss = criterion(pred,label)
            total_loss_val += batch_loss.item()
            
            preds = torch.sigmoid(pred)
            acc = (preds.round()==label).sum().item()
            total_acc_val += acc
            
    total_loss_train_plot.append(round(total_loss_train/len(train_dataloader),4))
    total_loss_validation_plot.append(round(total_loss_val/len(val_dataloader),4))
    
    total_acc_train_plot.append(round(total_acc_train/len(train_dataset)*100,4))
    total_acc_validation_plot.append(round(total_acc_val/len(validation_dataset)*100,4))
    
    scheduler.step(total_loss_validation_plot[epoch])
    print(f'Epoch:{epoch+1} Train Loss : {total_loss_train_plot[epoch]}, Train Acc : {total_acc_train_plot[epoch]}%')
    print(f'Epoch:{epoch+1} Val Loss : {total_loss_validation_plot[epoch]}, Val Acc : {total_acc_validation_plot[epoch]}%')
    print('='*55)
    

Epoch:1 Train Loss : 0.5193, Train Acc : 74.1744%
Epoch:1 Val Loss : 0.487, Val Acc : 73.9698%
Epoch:2 Train Loss : 0.5134, Train Acc : 74.7258%
Epoch:2 Val Loss : 0.5175, Val Acc : 70.5456%
Epoch:3 Train Loss : 0.5141, Train Acc : 74.4571%
Epoch:3 Val Loss : 0.5139, Val Acc : 71.1211%
Epoch:4 Train Loss : 0.5119, Train Acc : 74.7331%
Epoch:4 Val Loss : 0.5154, Val Acc : 70.9608%
Epoch:5 Train Loss : 0.5101, Train Acc : 74.7973%
Epoch:5 Val Loss : 0.5034, Val Acc : 72.1775%
Epoch:6 Train Loss : 0.5085, Train Acc : 74.9561%
Epoch:6 Val Loss : 0.5215, Val Acc : 71.3235%
Epoch:7 Train Loss : 0.5093, Train Acc : 74.9009%
Epoch:7 Val Loss : 0.5029, Val Acc : 72.1486%
Epoch:8 Train Loss : 0.5082, Train Acc : 74.9054%
Epoch:8 Val Loss : 0.4992, Val Acc : 72.1197%
Epoch:9 Train Loss : 0.5058, Train Acc : 75.1335%
Epoch:9 Val Loss : 0.531, Val Acc : 70.472%
Epoch:10 Train Loss : 0.5079, Train Acc : 74.7736%
Epoch:10 Val Loss : 0.5028, Val Acc : 71.9069%


In [212]:
all_preds = []
all_labels=[]
with torch.no_grad():
    model.eval()
    total_loss_test =0
    total_acc_test = 0
    for input,label in test_dataloader:
        input,label = input.to(device),label.to(device)
        pred = model(input).squeeze(1)
        probs = torch.sigmoid(pred)
        preds = probs.round()
        
        all_preds.append(preds.cpu())
        all_labels.append(label.cpu())
        
        
all_preds = torch.cat(all_preds).numpy()
all_labels = torch.cat(all_labels).numpy()



In [213]:
from sklearn.metrics import classification_report
cm=classification_report(all_labels,all_preds,target_names=['No Diabetes','Diabetes'])
print(cm)

              precision    recall  f1-score   support

 No Diabetes       0.96      0.71      0.81     32750
    Diabetes       0.30      0.80      0.44      5302

    accuracy                           0.72     38052
   macro avg       0.63      0.75      0.63     38052
weighted avg       0.87      0.72      0.76     38052



In [214]:
small_X = next(iter(train_dataloader))[0]
small_Y = next(iter(train_dataloader))[1]

for i in range(50):
    optimazer.zero_grad()
    output = model(small_X)
    loss = criterion(output.squeeze(1),small_Y)
    loss.backward()
    optimazer.step()
    print(f'epoch:{i} loss: {loss.item():.4f}')

epoch:0 loss: 1.0349
epoch:1 loss: 1.0312
epoch:2 loss: 1.0253
epoch:3 loss: 1.0178
epoch:4 loss: 1.0093
epoch:5 loss: 0.9998
epoch:6 loss: 0.9900
epoch:7 loss: 0.9795
epoch:8 loss: 0.9688
epoch:9 loss: 0.9583
epoch:10 loss: 0.9476
epoch:11 loss: 0.9370
epoch:12 loss: 0.9267
epoch:13 loss: 0.9165
epoch:14 loss: 0.9065
epoch:15 loss: 0.8964
epoch:16 loss: 0.8864
epoch:17 loss: 0.8766
epoch:18 loss: 0.8670
epoch:19 loss: 0.8574
epoch:20 loss: 0.8482
epoch:21 loss: 0.8395
epoch:22 loss: 0.8307
epoch:23 loss: 0.8223
epoch:24 loss: 0.8143
epoch:25 loss: 0.8067
epoch:26 loss: 0.7993
epoch:27 loss: 0.7926
epoch:28 loss: 0.7865
epoch:29 loss: 0.7810
epoch:30 loss: 0.7760
epoch:31 loss: 0.7711
epoch:32 loss: 0.7664
epoch:33 loss: 0.7620
epoch:34 loss: 0.7579
epoch:35 loss: 0.7539
epoch:36 loss: 0.7501
epoch:37 loss: 0.7464
epoch:38 loss: 0.7429
epoch:39 loss: 0.7394
epoch:40 loss: 0.7360
epoch:41 loss: 0.7328
epoch:42 loss: 0.7297
epoch:43 loss: 0.7268
epoch:44 loss: 0.7239
epoch:45 loss: 0.721