In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset

In [2]:
data = pd.read_csv('./.data/diabetes.csv')

In [3]:
x = data.iloc[:,0:-1].values #.values convert to numpy array
yString= list(data.iloc[:,-1])

In [4]:
x, yString[:5] #checking the data

(array([[  6. , 148. ,  72. , ...,   0. ,  33.6,  50. ],
        [  1. ,  85. ,  66. , ...,   0. ,  26.6,  31. ],
        [  8. , 183. ,  64. , ...,   0. ,  23.3,  32. ],
        ...,
        [  5. , 121. ,  72. , ..., 112. ,  26.2,  30. ],
        [  1. , 126. ,  60. , ...,   0. ,  30.1,  47. ],
        [  1. ,  93. ,  70. , ...,   0. ,  30.4,  23. ]]),
 ['positive', 'negative', 'positive', 'negative', 'positive'])

In [5]:
yInt = []
for string in yString:
    if string == 'positive':
        yInt.append(1)
    else:
        yInt.append(0)

In [6]:
y = np.array(yInt, dtype = 'float64') # to np array
y[:5]

array([1., 0., 1., 0., 1.])

In [7]:
sc = StandardScaler()
x = sc.fit_transform(x) 
#normalizing the data, it calculates the mean and sd and 
#then transform normalizing the data ((value - mean) / sd)
x

array([[ 0.63994726,  0.84832379,  0.14964075, ..., -0.69289057,
         0.20401277,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.69289057,
        -0.68442195, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -0.69289057,
        -1.10325546, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ...,  0.27959377,
        -0.73518964, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.69289057,
        -0.24020459,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.69289057,
        -0.20212881, -0.87137393]])

In [8]:
x = torch.tensor(x)
print('before unsqueeze =>', y.shape)
y = torch.tensor(y).unsqueeze(1) #addinf aditional dimension to the first index
print('after unsqueeze =>', y.shape)

before unsqueeze => (768,)
after unsqueeze => torch.Size([768, 1])


In [9]:
#checking that we have the same structure
print(x.shape, x.dtype)
print(y.shape, y.dtype)

torch.Size([768, 7]) torch.float64
torch.Size([768, 1]) torch.float64


In [10]:
#Because we are using our own ds, we need to create that class to tell Pytorch about it
class Dataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return len(self.x)

In [11]:
dataset = Dataset(x, y)

In [12]:
print(dataset.x.shape)
print(dataset.y.shape)

torch.Size([768, 7])
torch.Size([768, 1])


In [13]:
trainLoader = torch.utils.data.DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [14]:
len(trainLoader) #24 * 32 = 768

24

In [15]:
print("There is {} batches in the dataset".format(len(trainLoader)))
shown = 0
for (x,y) in trainLoader:
    if shown == 1:
        break
    print("For one iteration (batch), there is:")
    print("Data:    {} , Type: {} ".format(x.shape, x.dtype))
    print("Labels:  {} , Type: {}" .format(y.shape, y.dtype))
    shown += 1

There is 24 batches in the dataset
For one iteration (batch), there is:
Data:    torch.Size([32, 7]) , Type: torch.float64 
Labels:  torch.Size([32, 1]) , Type: torch.float64


In [16]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = torch.nn.Linear(x.shape[1], 5)
        self.fc2 = torch.nn.Linear(5, 3)
        self.fc3 = torch.nn.Linear(3, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.sigmoid(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        out = self.fc3(out)
        out = self.sigmoid(out)
        return out

In [38]:
net = Model()
#In Cross Entropy Loss: the Input should be of shape (N,C) and the labels should be of shape (N)
#In Binary Cross Entropy: the input and output should have the same shape 
#reduction='mean' --> the losses are averaged over observations for each minibatch (size_average = True - before) 
criterion = torch.nn.BCELoss(reduction='mean')   
optimizer = torch.optim.SGD(net.parameters(), lr=0.021, weight_decay=1e-5)

In [27]:
net.parameters

<bound method Module.parameters of Model(
  (fc1): Linear(in_features=7, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=3, bias=True)
  (fc3): Linear(in_features=3, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)>

In [40]:
epochs = 100
for epoch in range(epochs):
    for inputs, labels in trainLoader:
        inputs = Variable(inputs.float())
        labels = Variable(labels.float())
        output = net(inputs)
        optimizer.zero_grad()
        #loss =  -(labels * torch.log(output) + (1 - labels) * torch.log(1 - output)).mean()
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    #Accuracy
    #After we have looped through all our dataset, we can evaluate on the last iteration of each epoch. 
    #'output' is the network's output of the last mini-batch of the epoch, and labels is the corresponding batch labels
    output = (output>0.5).float()
    correct = (output == labels).float().sum()
    print("Epoch {}/{}, Loss: {:.3f}, Accuracy: {:.3f}".format(epoch+1,epochs, loss.item(), correct/output.shape[0]))

Epoch 1/100, Loss: 0.335, Accuracy: 0.875
Epoch 2/100, Loss: 0.460, Accuracy: 0.750
Epoch 3/100, Loss: 0.444, Accuracy: 0.750
Epoch 4/100, Loss: 0.521, Accuracy: 0.719
Epoch 5/100, Loss: 0.433, Accuracy: 0.781
Epoch 6/100, Loss: 0.426, Accuracy: 0.812
Epoch 7/100, Loss: 0.362, Accuracy: 0.844
Epoch 8/100, Loss: 0.363, Accuracy: 0.844
Epoch 9/100, Loss: 0.351, Accuracy: 0.812
Epoch 10/100, Loss: 0.502, Accuracy: 0.781
Epoch 11/100, Loss: 0.362, Accuracy: 0.812
Epoch 12/100, Loss: 0.662, Accuracy: 0.625
Epoch 13/100, Loss: 0.441, Accuracy: 0.688
Epoch 14/100, Loss: 0.455, Accuracy: 0.719
Epoch 15/100, Loss: 0.386, Accuracy: 0.906
Epoch 16/100, Loss: 0.546, Accuracy: 0.719
Epoch 17/100, Loss: 0.422, Accuracy: 0.812
Epoch 18/100, Loss: 0.384, Accuracy: 0.750
Epoch 19/100, Loss: 0.660, Accuracy: 0.594
Epoch 20/100, Loss: 0.437, Accuracy: 0.812
Epoch 21/100, Loss: 0.409, Accuracy: 0.781
Epoch 22/100, Loss: 0.507, Accuracy: 0.750
Epoch 23/100, Loss: 0.437, Accuracy: 0.844
Epoch 24/100, Loss: 