In [1]:
import torch
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Data Generators

In [2]:
# download mnist dataset in local directory
train = datasets.MNIST("", train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor()]))
# we specify all transforms we have to apply in transforms.Compose([])
test = datasets.MNIST("", train=False, download=True,
                      transform=transforms.Compose([transforms.ToTensor()]))

trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=True)

In [3]:
for data in trainset:
    # print one batch of dataset
    print(data)
    break

[tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0

In [4]:
x, y = data[0][0], data[1][0]
print(data[0][0].shape)
print(y)
# shape of object is (1, 28, 28) but we need (28, 28) for displaying

torch.Size([1, 28, 28])
tensor(9)


# Data Cleaning

Checking if the dataset is balanced because if we have 70% of our dataset only as one number then the neural network will always keep predicting that number and we won't be able to get out of this hole no matter how much we train because the model will figure out the quickest way to reduce loss is by predicting that number as most of the dataset predicts that number.

In [5]:
total = 0
counter_dict = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
for data in trainset:
    Xs, ys = data
    for y in ys:
        counter_dict[int(y)] += 1
        total += 1
for i in counter_dict:
    print(f"{i}: {counter_dict[i]/total*100}")

0: 9.871666666666666
1: 11.236666666666666
2: 9.93
3: 10.218333333333334
4: 9.736666666666666
5: 9.035
6: 9.863333333333333
7: 10.441666666666666
8: 9.751666666666667
9: 9.915000000000001


# Model

In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("running on the gpu")
else:
    device = torch.device("cpu")
    print("running on the cpu")

running on the gpu


In [7]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # initializes nn.Module class
        self.fc1 = nn.Linear(784, 64)
        # 784 after flattening image - 28x28(flatten because it is an ann not cnn)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, X):
        # we can insert logic in this forward function using if statements etc.
        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = F.relu(self.fc3(X))
        X = self.fc4(X)
        return F.log_softmax(X, dim=1)
        # perform softmax on output layer (dim = 1 is axes)

net = Net()
print(net)

Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)


In [8]:
# try out model
X = torch.rand((28, 28))
X = X.view(-1, 28*28)
# -1 specifies this input will be of unknown shape
output = net(X)
print(output)

tensor([[-2.2340, -2.2474, -2.3792, -2.4804, -2.3623, -2.3531, -2.2776, -2.1942,
         -2.3303, -2.2038]], grad_fn=<LogSoftmaxBackward>)


# Training

In [9]:
import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=0.001)
epochs = 5

In [10]:
net.to(device)

for epoch in range(epochs):
    for data in trainset:
        # data is a batch of features and labels
        X, y = data
        # X contains 10 images and y contains 10 integers in those images
        X = X.to(device)
        y = y.to(device)
        
        net.zero_grad()
        # clears old gradients from the last step
        output = net(X.view(-1, 28*28))
        loss = F.nll_loss(output, y)
        loss.backward()
        # backpropogate loss and get direction in which weights will move by computing derivative(gradient)
        optimizer.step()
        # this will adjust the weights
    print("epoch: " + str(epoch+1) + " loss: " + str(loss))

epoch: 1 loss: tensor(0.5315, device='cuda:0', grad_fn=<NllLossBackward>)
epoch: 2 loss: tensor(0.6937, device='cuda:0', grad_fn=<NllLossBackward>)
epoch: 3 loss: tensor(0.0424, device='cuda:0', grad_fn=<NllLossBackward>)
epoch: 4 loss: tensor(0.1731, device='cuda:0', grad_fn=<NllLossBackward>)
epoch: 5 loss: tensor(0.0086, device='cuda:0', grad_fn=<NllLossBackward>)


# Testing

In [11]:
correct = 0
total = 0
with torch.no_grad():
    #  set all the requires_grad flag to false
    for data in testset:
        X, y = data
        X = X.to(device)
        y = y.to(device)
        output = net(X.view(-1, 784))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1
print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.976


In [13]:
# getting predictions

output = torch.argmax(net(X[0].view(-1, 784))[0]).cpu()
output = output.detach().numpy()
print(output)

1
