#Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

#First CNN

In [2]:
# Load the MNIST dataset and apply data preparations using transform method within torchvision
# Normalization constants: the commonly quoted mean / std of the MNIST training
# pixels after ToTensor() scaling (one value because the images have one channel).
MNIST_MEAN, MNIST_STD = (0.1307,), (0.3081,)

# One preprocessing pipeline shared by both splits, so train and test inputs
# are guaranteed to be transformed identically.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),                        # image -> float tensor
    transforms.Normalize(MNIST_MEAN, MNIST_STD),  # (x - mean) / std
])

train_dataset = datasets.MNIST('mnist_data/', train=True, download=True,
                               transform=mnist_transform)

# download=True here as well, so this statement does not rely on the training
# split having fetched the files first.
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True,
                              transform=mnist_transform)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to mnist_data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting mnist_data/MNIST/raw/train-images-idx3-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz to mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to mnist_data/MNIST/raw



In [3]:
# Define the model as a class
class ConvNet(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images with 10 classes.

    Layer order: conv(1->32, 5x5) -> max-pool(2) -> ReLU -> conv(32->64, 5x5)
    -> max-pool(2) -> ReLU -> flatten -> dense(hidden_units) -> ReLU
    -> dense(10) -> log-softmax.

    Args:
        hidden_units: Number of neurons in the hidden dense layer (the output
            size of ``fc1`` and the input size of ``fc2``). Defaults to 1000.
    """

    def __init__(self, hidden_units=1000):
        super(ConvNet, self).__init__()
        # 1 input channel (grayscale), 32 learned filters. With a 5x5 kernel,
        # padding=2 adds a 2-pixel border of zeros on every side so the output
        # feature map keeps the input's height and width.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        # The input channel count must equal conv1's 32 output channels; the
        # number of filters is doubled to 64.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        # Each of the two 2x2 poolings halves the spatial size (28 -> 14 -> 7),
        # so a 28x28 input reaches the dense layer as 64 maps of 7x7.
        self.fc1 = nn.Linear(7 * 7 * 64, hidden_units)
        # Output layer: one score per digit class.
        self.fc2 = nn.Linear(hidden_units, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10).

        Args:
            x: Input batch; the dense layer size assumes shape (batch, 1, 28, 28).
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 7 * 7 * 64), this never folds extra pixels into the batch
        # dimension: a wrongly sized input fails in fc1 instead of silently
        # producing more rows than there are samples.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # NOTE: nn.CrossEntropyLoss applies log-softmax itself. Feeding it these
        # log-probabilities gives the same loss (log-softmax of a log-softmax is
        # unchanged), but nn.NLLLoss is the criterion that matches this output.
        return F.log_softmax(x, dim=1)

In [4]:
# Initialize the model, loss function, and optimizer
# Seed PyTorch's global random generator first so that the weight
# initialization below (and later draws from the same generator, such as
# DataLoader shuffling) repeat on every Restart & Run All.
torch.manual_seed(0)

model = ConvNet()  # Instance of the network class defined above.
n_epochs = 10  # Number of full passes over the training set.
learning_rate = 0.01  # Step size for SGD.
# Multiclass classification (10 digit classes), hence cross-entropy.
criterion = nn.CrossEntropyLoss()
# Stochastic gradient descent with momentum.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

In [5]:
# Defining function to train the model
def train(model, train_loader, criterion, optimizer, epoch):
    """Run one pass over `train_loader`, updating `model` after every batch.

    Args:
        model: Network to optimize; switched to training mode.
        train_loader: Iterable of (inputs, labels) batches that also exposes
            `len()` and a `.dataset` attribute (used for the progress line).
        criterion: Callable mapping (outputs, labels) to a scalar loss tensor.
        optimizer: Optimizer holding the parameters of `model`.
        epoch: Epoch number; only used in the printed progress line.

    Prints a progress line for batch 0 and every 100th batch after it.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch

        # Gradients accumulate by default, so clear them before each backward pass.
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        if step % 100:
            continue  # Only report on every 100th batch.

        samples_seen = step * len(inputs)
        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))

In [6]:
# Defining function to test the model
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [7]:
# Create dataloaders for the train and test datasets
# Training batches: 64 samples each, drawn in a new random order every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)
# Evaluation batches: 1000 samples each (no gradients are kept during testing,
# so larger batches are affordable).
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=1000,
    shuffle=True,
)

In [8]:
# Train the model for n_epochs epochs, evaluating on the test set after each one
for epoch in range(n_epochs):
    train(
        model=model,
        train_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        epoch=epoch,
    )
    test(model=model, test_loader=test_loader, criterion=criterion)


Test set: Average loss: 0.0001, Accuracy: 9727/10000 (97%)


Test set: Average loss: 0.0000, Accuracy: 9838/10000 (98%)


Test set: Average loss: 0.0001, Accuracy: 9802/10000 (98%)


Test set: Average loss: 0.0000, Accuracy: 9870/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9891/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9908/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9885/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9885/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9904/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9914/10000 (99%)



The following cells show how to save the model parameters (the model's `state_dict`, a dictionary of its weights) to a file and how to load them again. They have been commented out to avoid creating unnecessary extra files. To use saved parameters, you need to create an instance of the model with an identical architecture and then load the saved parameters into it.

In [None]:
# Save the model
#torch.save(model.state_dict(), 'model1.pt')

In [None]:
# Load the saved model
#model.load_state_dict(torch.load('model1.pt'))

#Second CNN

In [9]:
# Load the MNIST dataset and apply data preparations using transform method within torchvision
# Normalization constants: the commonly quoted mean / std of the MNIST training
# pixels after ToTensor() scaling (one value because the images have one channel).
MNIST_MEAN, MNIST_STD = (0.1307,), (0.3081,)

# One preprocessing pipeline shared by both splits, so train and test inputs
# are guaranteed to be transformed identically.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),                        # image -> float tensor
    transforms.Normalize(MNIST_MEAN, MNIST_STD),  # (x - mean) / std
])

train_dataset = datasets.MNIST('mnist_data/', train=True, download=True,
                               transform=mnist_transform)

# download=True here as well, so this statement does not rely on the training
# split having fetched the files first.
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True,
                              transform=mnist_transform)

The number of neurons in the first dense layer (and therefore the input size of the second dense layer) was changed to 100 rather than 1000. This is mainly to see what the difference is in terms of performance between the two.

In [10]:
# Define the model as a class
class ConvNet(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images with 10 classes.

    Layer order: conv(1->32, 5x5) -> max-pool(2) -> ReLU -> conv(32->64, 5x5)
    -> max-pool(2) -> ReLU -> flatten -> dense(hidden_units) -> ReLU
    -> dense(10) -> log-softmax.

    Args:
        hidden_units: Number of neurons in the hidden dense layer (the output
            size of ``fc1`` and the input size of ``fc2``). Defaults to 100.
    """

    def __init__(self, hidden_units=100):
        super(ConvNet, self).__init__()
        # 1 input channel (grayscale), 32 learned filters. With a 5x5 kernel,
        # padding=2 adds a 2-pixel border of zeros on every side so the output
        # feature map keeps the input's height and width.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        # The input channel count must equal conv1's 32 output channels; the
        # number of filters is doubled to 64.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, padding=2)
        # Each of the two 2x2 poolings halves the spatial size (28 -> 14 -> 7),
        # so a 28x28 input reaches the dense layer as 64 maps of 7x7.
        self.fc1 = nn.Linear(7 * 7 * 64, hidden_units)
        # Output layer: one score per digit class.
        self.fc2 = nn.Linear(hidden_units, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10).

        Args:
            x: Input batch; the dense layer size assumes shape (batch, 1, 28, 28).
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 7 * 7 * 64), this never folds extra pixels into the batch
        # dimension: a wrongly sized input fails in fc1 instead of silently
        # producing more rows than there are samples.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # NOTE: nn.CrossEntropyLoss applies log-softmax itself. Feeding it these
        # log-probabilities gives the same loss (log-softmax of a log-softmax is
        # unchanged), but nn.NLLLoss is the criterion that matches this output.
        return F.log_softmax(x, dim=1)

We chose to keep the number of epochs at 10, since we drastically reduced the number of neurons within the dense layers. With 1000 neurons the model was reaching around 99% accuracy, which is quite impressive, so it would be interesting to see what it can achieve with a tenth of that. We also chose to increase the learning rate from 0.01 to 0.05.

In [11]:
# Initialize the model, loss function, and optimizer
# Seed PyTorch's global random generator first so that the weight
# initialization below (and later draws from the same generator, such as
# DataLoader shuffling) repeat on every Restart & Run All.
torch.manual_seed(0)

model = ConvNet()  # Instance of the network class defined above.
n_epochs = 10  # Number of full passes over the training set (same as the first CNN).
learning_rate = 0.05  # Learning rate has been changed since first CNN (was 0.01).
# Multiclass classification (10 digit classes), hence cross-entropy.
criterion = nn.CrossEntropyLoss()
# Stochastic gradient descent with momentum.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

In [12]:
# Defining function to train the model
def train(model, train_loader, criterion, optimizer, epoch):
    """Run one pass over `train_loader`, updating `model` after every batch.

    Args:
        model: Network to optimize; switched to training mode.
        train_loader: Iterable of (inputs, labels) batches that also exposes
            `len()` and a `.dataset` attribute (used for the progress line).
        criterion: Callable mapping (outputs, labels) to a scalar loss tensor.
        optimizer: Optimizer holding the parameters of `model`.
        epoch: Epoch number; only used in the printed progress line.

    Prints a progress line for batch 0 and every 100th batch after it.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch

        # Gradients accumulate by default, so clear them before each backward pass.
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        if step % 100:
            continue  # Only report on every 100th batch.

        samples_seen = step * len(inputs)
        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))

In [13]:
# Defining function to test the model
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [14]:
# Create dataloaders for the train and test datasets
# Training batches: 64 samples each, drawn in a new random order every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)
# Evaluation batches: 1000 samples each (no gradients are kept during testing,
# so larger batches are affordable).
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=1000,
    shuffle=True,
)

In [15]:
# Train the model for n_epochs epochs, evaluating on the test set after each one
for epoch in range(n_epochs):
    # One optimization pass over the training data ...
    train(
        model=model,
        train_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        epoch=epoch,
    )
    # ... followed by a loss/accuracy report on the held-out test data.
    test(model=model, test_loader=test_loader, criterion=criterion)


Test set: Average loss: 0.0001, Accuracy: 9839/10000 (98%)


Test set: Average loss: 0.0000, Accuracy: 9845/10000 (98%)


Test set: Average loss: 0.0000, Accuracy: 9897/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9903/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9890/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9918/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9913/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9906/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9912/10000 (99%)


Test set: Average loss: 0.0000, Accuracy: 9926/10000 (99%)



In terms of performance, there wasn't a big difference between the two parameter setups. The lower number of neurons within the dense layers actually gave what you could call a marginal improvement over the significantly larger number of neurons (9926/10000 = 99.26% versus 9914/10000 = 99.14% test accuracy after 10 epochs). This difference is so small that it is reasonable in this case to conclude that the number of neurons within the dense layers doesn't make a big difference beyond 100. Note, however, that the learning rate was changed at the same time (from 0.01 to 0.05), so the effects of the two changes cannot be separated from these two runs alone, and a difference this small may well be within the variation between repeated runs. The next step would be to reduce the layer size even further and figure out where the performance begins to drop, for example by reducing it by 50%, so that only half the neurons are present within these layers. The increased learning_rate, combined with the reduced number of neurons within the model, also did not seem to make a significant difference.

In [None]:
# Save the model
#torch.save(model.state_dict(), 'model2.pt')

In [None]:
# Load the saved model
#model.load_state_dict(torch.load('model2.pt'))