In [6]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms


# Preprocessing pipeline: convert each image to a tensor, then normalize its
# single channel with mean 0.5 and standard deviation 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Training split of MNIST (fetched if not already on disk), served in
# shuffled mini-batches of 64
trainset = datasets.MNIST(
    root='~/.pytorch/MNIST_data/',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(dataset=trainset, batch_size=64, shuffle=True)

In [7]:
# Three fully connected layers (784 -> 128 -> 64 -> 10) with a ReLU after
# each of the first two; the last layer's raw outputs are used as logits
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
)

# Cross-entropy loss, applied directly to the logits
criterion = nn.CrossEntropyLoss()

# Pull a single batch from the loader
images, labels = next(iter(trainloader))
# Collapse every image into one row, keeping the batch dimension
images = images.view(images.size(0), -1)

# Run the batch through the network, then score the logits against the labels
logits = model(images)
loss = criterion(logits, labels)

print(loss)

tensor(2.2935, grad_fn=<NllLossBackward>)


In [8]:
# Inspect the first layer's weight gradient on either side of the backward pass
first_layer_weight = model[0].weight
print('Before backward pass: \n', first_layer_weight.grad)

# Backpropagate the loss so that gradients are populated on the parameters
loss.backward()

print('After backward pass: \n', first_layer_weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[-0.0017, -0.0017, -0.0017,  ..., -0.0017, -0.0017, -0.0017],
        [ 0.0030,  0.0030,  0.0030,  ...,  0.0030,  0.0030,  0.0030],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0009,  0.0009,  0.0009,  ...,  0.0009,  0.0009,  0.0009],
        [ 0.0020,  0.0020,  0.0020,  ...,  0.0020,  0.0020,  0.0020],
        [-0.0021, -0.0021, -0.0021,  ..., -0.0021, -0.0021, -0.0021]])


In [9]:
from torch import optim

# An optimizer is built from the parameters it should update plus a learning rate
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [10]:
print('Initial weights - ', model[0].weight)

images, labels = next(iter(trainloader))
# Flatten each image into a 784-long row. The batch dimension is read from
# the tensor itself rather than hard-coded, so a batch that is not exactly
# 64 samples (e.g. the final, smaller batch of an epoch) is still shaped
# correctly, and the tensor is not resized in place.
images = images.view(images.shape[0], -1)

# Clear the gradients, do this because gradients are accumulated across
# backward passes and would otherwise be added to the ones computed below
optimizer.zero_grad()

# Forward pass, then backward pass; the weights themselves are only changed
# once optimizer.step() is called.
# Call the module (not .forward directly) so any registered hooks also run.
output = model(images)
loss = criterion(output, labels)
loss.backward()
print('Gradient -', model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[-0.0281,  0.0273,  0.0008,  ...,  0.0127,  0.0270, -0.0021],
        [-0.0282,  0.0223, -0.0280,  ..., -0.0143, -0.0341,  0.0210],
        [ 0.0097,  0.0252,  0.0005,  ...,  0.0253, -0.0277, -0.0288],
        ...,
        [-0.0089,  0.0053, -0.0079,  ..., -0.0097,  0.0328, -0.0314],
        [-0.0013,  0.0112,  0.0347,  ..., -0.0005, -0.0145,  0.0110],
        [-0.0115, -0.0110,  0.0051,  ...,  0.0161,  0.0134,  0.0059]],
       requires_grad=True)
Gradient - tensor([[-0.0022, -0.0022, -0.0022,  ..., -0.0022, -0.0022, -0.0022],
        [ 0.0006,  0.0006,  0.0006,  ...,  0.0006,  0.0006,  0.0006],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0008, -0.0008, -0.0008,  ..., -0.0008, -0.0008, -0.0008],
        [ 0.0017,  0.0017,  0.0017,  ...,  0.0017,  0.0017,  0.0017],
        [-0.0025, -0.0025, -0.0025,  ..., -0.0025, -0.0025, -0.0025]])


In [11]:
# Take an update step and view the new weights
optimizer.step()
updated_weight = model[0].weight
print('Updated weights - ', updated_weight)

Updated weights -  Parameter containing:
tensor([[-0.0280,  0.0273,  0.0008,  ...,  0.0127,  0.0270, -0.0021],
        [-0.0282,  0.0223, -0.0280,  ..., -0.0143, -0.0341,  0.0210],
        [ 0.0097,  0.0252,  0.0005,  ...,  0.0253, -0.0277, -0.0288],
        ...,
        [-0.0089,  0.0053, -0.0079,  ..., -0.0097,  0.0329, -0.0314],
        [-0.0014,  0.0112,  0.0347,  ..., -0.0005, -0.0146,  0.0110],
        [-0.0115, -0.0110,  0.0051,  ...,  0.0161,  0.0134,  0.0060]],
       requires_grad=True)


In [12]:
### With Epochs ###
# Fresh network: same layer sizes as before, with a final LogSoftmax so the
# model emits log-probabilities, which is the input NLLLoss works on.
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.LogSoftmax(dim=1))

criterion = nn.NLLLoss()
# The optimizer must be rebuilt so that it tracks the new model's parameters
optimizer = optim.SGD(model.parameters(), lr=0.003)

epochs = 5
for e in range(epochs):
    # Sum of per-batch losses, averaged over the batch count after the epoch
    running_loss = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)

        # Reset gradients left over from the previous batch
        optimizer.zero_grad()

        # Call the module (not .forward directly) so registered hooks also run
        output = model(images)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python number from the loss tensor
        running_loss += loss.item()

    # Report the mean batch loss once the epoch's batches are exhausted
    print(f"Training loss: {running_loss/len(trainloader)}")

Training loss: 1.9718727765561166
Training loss: 0.9333064187246599
Training loss: 0.538826524051649
Training loss: 0.4324731629476873
Training loss: 0.38631370824092487
