# PyTorch tutorial

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torchvision.transforms import functional as transforms
from torchvision import models as pretrained_models
from PIL import Image

## Overview of tutorial
- Primitive data structures in PyTorch
- Defining models in code
- Loading an image file and passing it through your model
- Using pretrained models (from PyTorch or from third parties)
- Inspecting layer/channel/unit activations

## Tensors, primitives in PyTorch
### What does a layer of activations or an image input look like?
![alt text](assets/alexnet.png "Title")

In [3]:
# A tensor generalises a matrix to any number of dimensions: a matrix has 2 (rows X columns), a tensor can have more
a = torch.tensor(
    [
        [[1, 1, 1, 1], [2, 2, 2, 2]],
        [[3, 3, 3, 3], [4, 4, 4, 4]],
        [[5, 5, 5, 5], [6, 6, 6, 5]],
    ],
    dtype=torch.float32,
)
print(a.shape, a, sep='\n')    # The shape first, then the values themselves

torch.Size([3, 2, 4])
tensor([[[1., 1., 1., 1.],
         [2., 2., 2., 2.]],

        [[3., 3., 3., 3.],
         [4., 4., 4., 4.]],

        [[5., 5., 5., 5.],
         [6., 6., 6., 5.]]])


In [4]:
# Arithmetic on tensors is element-wise, just like with ordinary numbers or matrices
b = torch.ones_like(a)              # All-ones tensor with the same shape and dtype as `a`
c = torch.add(a, b)
print('Tensor c is:\n{}'.format(c))

# A scalar operand is applied to every element of the tensor
d = a.div(2).add(5)
print('Tensor d is:\n{}'.format(d))

# Indexing is done per dimension: everything along the first dimension, the second entry of the
# second dimension, and the first three entries of the third dimension
e = a[:, 1, :3]
print('Tensor e is:\n{}'.format(e))

Tensor c is:
tensor([[[2., 2., 2., 2.],
         [3., 3., 3., 3.]],

        [[4., 4., 4., 4.],
         [5., 5., 5., 5.]],

        [[6., 6., 6., 6.],
         [7., 7., 7., 6.]]])
Tensor d is:
tensor([[[5.5000, 5.5000, 5.5000, 5.5000],
         [6.0000, 6.0000, 6.0000, 6.0000]],

        [[6.5000, 6.5000, 6.5000, 6.5000],
         [7.0000, 7.0000, 7.0000, 7.0000]],

        [[7.5000, 7.5000, 7.5000, 7.5000],
         [8.0000, 8.0000, 8.0000, 7.5000]]])
Tensor e is:
tensor([[2., 2., 2.],
        [4., 4., 4.],
        [6., 6., 6.]])


In [5]:
# An RGB image is stored as a [channels X height X width] tensor (conv layer activations use the same layout).
# This one stands in for an image of height=128 and width=256 whose pixels are random values in the range [0-1]
image_tensor = torch.rand(3, 128, 256)

## Defining models in code

### 3 steps:
1. Create a subclass of `nn.Module`
2. Define layers/parameters in `__init__()`
3. Define connections between layers mapping inputs to outputs in `forward()`

![alt text](assets/alexnet.png "Title")

In [6]:
# Model definition: a small convolutional classifier
class TinyCNN(nn.Module):
    """Two conv/ReLU/max-pool stages, a global max-pool, then a two-layer linear classifier.

    In training mode `forward` returns the raw class scores; otherwise it returns
    softmax probabilities over the classes (each row sums to 1).
    """

    def __init__(self, input_channels, n_classes):    # The constructor can take whatever arguments you need
        super().__init__()                            # nn.Module must be initialised before any layer is assigned

        # Every layer comes from the nn module and is stored as an attribute of the model
        self.conv_1 = nn.Conv2d(in_channels=input_channels, out_channels=96,
                                kernel_size=11, stride=4)                      # Convolution (parameters initialised for us)
        self.relu_1 = nn.ReLU(inplace=True)                                    # Nonlinearity
        self.maxpool_1 = nn.MaxPool2d(kernel_size=3, stride=2)                 # Lowers the spatial resolution
        self.conv_2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5)
        self.relu_2 = nn.ReLU(inplace=True)
        self.maxpool_2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.adaptive_maxpool = nn.AdaptiveMaxPool2d(output_size=1)            # Reduces height/width down to 1 X 1
        self.fc_1 = nn.Linear(in_features=256, out_features=100)               # 256 units -> 100 units
        self.fc_relu = nn.ReLU(inplace=True)
        self.fc_2 = nn.Linear(in_features=100, out_features=n_classes)         # 100 units -> one score per class

    def forward(self, input):               # Describes how an input flows through the layers to the output
        """Map a [batches X channels X height X width] input to a [batches X n_classes] output."""
        # Convolutional part: run the layers one after another, ending with the global max-pool
        # that turns [batches X channels X height X width] into [batches X channels X 1 X 1]
        convolutional_layers = (
            self.conv_1, self.relu_1, self.maxpool_1,
            self.conv_2, self.relu_2, self.maxpool_2,
            self.adaptive_maxpool,
        )
        features = input
        for layer in convolutional_layers:
            features = layer(features)

        # Drop the two singleton spatial dimensions: [batches X channels X 1 X 1] -> [batches X channels]
        features = features.view(features.size(0), -1)

        # Classifier part: linear -> relu -> linear
        scores = self.fc_2(self.fc_relu(self.fc_1(features)))

        # `forward` is ordinary Python, so the computation may branch. While training we hand back
        # the raw scores; otherwise we turn them into a probability distribution over the classes.
        if self.training:
            return scores
        return F.softmax(scores, dim=1)

In [7]:
# Build the model and switch it to evaluation mode (the right mode whenever we're not training)
model = TinyCNN(input_channels=3, n_classes=10).eval()
print(model)

# When a GPU is available, move the model onto it
if torch.cuda.is_available():
    model = model.cuda()

TinyCNN(
  (conv_1): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
  (relu_1): ReLU(inplace=True)
  (maxpool_1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_2): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1))
  (relu_2): ReLU(inplace=True)
  (maxpool_2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (adaptive_maxpool): AdaptiveMaxPool2d(output_size=1)
  (fc_1): Linear(in_features=256, out_features=100, bias=True)
  (fc_relu): ReLU(inplace=True)
  (fc_2): Linear(in_features=100, out_features=10, bias=True)
)


## Loading an image file and passing it through your model

### 5 steps:
1. Open the image using the Pillow (PIL) python package
2. Preprocess the image to a format your model expects (resolution, centre crop, etc.)
3. Convert the image to a PyTorch tensor
4. Normalize the tensor's range of values to be the same as during training ([0 - 1], [-1 - 1], etc.)
5. Pass the tensor to your model

In [8]:
# Open the image file with Pillow
dog_image_path = 'assets/dog.jpg'
image = Image.open(dog_image_path)
print(image)

<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=800x572 at 0x7FD91B719310>


In [9]:
# Preprocess image
# The square crop is derived from the image's own shorter side rather than hard-coding this
# particular picture's height (572). That way the cell works for any input image, and re-running
# it on the already-preprocessed 224x224 image does not crop it to a size larger than itself.
short_side = min(image.size)                                       # PIL's `size` is (width, height)
image = transforms.center_crop(image, (short_side, short_side))    # Use a square aspect ratio for this model
image = transforms.resize(image, (224, 224))                       # Lower the resolution (faster computation)
print(image)

<PIL.Image.Image image mode=RGB size=224x224 at 0x7FD91CB20450>


In [10]:
# Turn the PIL image into a tensor (pixel values in [0 - 1]) and then normalize every channel
# with the ImageNet mean and standard deviation
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std = (0.229, 0.224, 0.225)
image_tensor = transforms.normalize(transforms.to_tensor(image), mean=imagenet_mean, std=imagenet_std)
print(image_tensor.shape)

# When a GPU is available, move the input tensor onto it
if torch.cuda.is_available():
    image_tensor = image_tensor.cuda()

torch.Size([3, 224, 224])


In [11]:
# The image is now in a format the model can consume, so we can ask the model for its output
image_tensor_batch = image_tensor[None]                     # Add a leading batch dimension: [C X H X W] -> [1 X C X H X W]
class_probabilities_batch = model(image_tensor_batch)       # [1 X n_classes]
class_probabilities = class_probabilities_batch[0]          # Take the only item of the batch: [n_classes]
print(class_probabilities.shape, class_probabilities.sum(), sep='\n')

torch.Size([10])
tensor(1.0000, grad_fn=<SumBackward0>)


## Using pretrained models

### The Torchvision module provides many commonly used pretrained models, and people online often make their model code + trained parameters available

In [12]:
# Fetch torchvision's pretrained AlexNet, put it in evaluation mode, and look at its layers
model = pretrained_models.alexnet(pretrained=True).eval()
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [13]:
# We can try this pretrained AlexNet on the exact same image that we had before.
# The image is of a dog, so hopefully the class with the highest probability should be one too.
#
# `image_tensor` was moved to the GPU earlier if one is available, whereas this freshly loaded
# AlexNet still lives on the CPU. Feeding a GPU tensor to a CPU model raises an error, so the
# input is first put on whatever device the model's parameters are on.
model_device = next(model.parameters()).device
with torch.no_grad():                                               # Inference only: no autograd graph is needed
    class_scores = model(image_tensor.unsqueeze(0).to(model_device)).squeeze(0)
    class_probabilities = F.softmax(class_scores, dim=0)            # PyTorch AlexNet skipped this layer
max_prob, max_class_idx = class_probabilities.max(dim=0)
max_prob, max_class_idx = max_prob.item(), max_class_idx.item()     # Convert 0d tensors to Python primitives
print('Most probable class is {} with "probability" {}'.format(max_class_idx, max_prob))

Most probable class is 217 with "probability" 0.3000657558441162


#### ImageNet class 217 corresponds to "English springer, English springer spaniel", but with a "probability" of only about 0.30 the model is not very confident... Let's see if the model does better when we show it an image from the ImageNet dataset

In [14]:
# Open the ImageNet picture, bring it to the model's input resolution, and convert it to a tensor
# normalized with the same mean / standard deviation as before
image = transforms.resize(Image.open('assets/imagenet_dog.jpg'), (224, 224))
image_tensor = transforms.normalize(
    transforms.to_tensor(image),
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
)

# Run the model on a batch of one, then pick out the most probable class
class_probabilities = F.softmax(model(image_tensor[None])[0], dim=0)    # PyTorch AlexNet skipped this layer
max_prob, max_class_idx = (
    result.item() for result in class_probabilities.max(dim=0)         # 0d tensors -> Python primitives
)
print('Most probable class is {} with "probability" {}'.format(max_class_idx, max_prob))

Most probable class is 247 with "probability" 0.982213020324707


#### Class 247 is "Saint Bernard, St Bernard". Makes a little more sense

## Inspecting layer/channel/unit activations

### The way I do it is I write a model class, give it the pre-trained model's weights, and just return the layer/channel/unit I need. There may be other ways.

In [15]:
class AlexNetConv3(nn.Module):
    """Pretrained AlexNet cut off after its first 8 `features` layers (up to conv3).

    `forward` returns the whole layer, one channel of it, and one unit of that channel,
    to demonstrate how each can be extracted.
    """

    def __init__(self):
        super().__init__()

        # Keep only the first 8 layers of the pretrained feature extractor (conv3)
        pretrained_features = pretrained_models.alexnet(pretrained=True).features
        self.features = pretrained_features[:8]

    def forward(self, input):
        """Return (layer, channel, unit) activations for a batch of inputs."""
        layer = self.features(input)        # [batch_size X 384 X height X width]
        channel = layer[:, 3, :, :]         # 4th channel: [batch_size X height X width]
        unit = layer[:, 3, 4, 1]            # Unit at row 5, column 2 of that channel: [batch_size]
        return layer, channel, unit         # All 3 are returned to show how to get each

In [16]:
# Build the truncated AlexNet and put it in evaluation mode
model = AlexNetConv3().eval()

# Run our dog image through it as a batch of one, then drop the batch dimension from every output
conv3, conv3_channel4, conv3_channel4_unith5w2 = (
    output.squeeze(dim=0) for output in model(image_tensor.unsqueeze(dim=0))
)

# Layer, channel and unit shapes, in that order
for activation in (conv3, conv3_channel4, conv3_channel4_unith5w2):
    print(activation.shape)

torch.Size([384, 13, 13])
torch.Size([13, 13])
torch.Size([])


In [17]:
# Writing the model's parameters to disk (usually done after training) takes two calls
checkpoint_path = 'assets/saved_alexnet_pretrained.pth'
model_serialized_params = model.state_dict()
torch.save(model_serialized_params, checkpoint_path)

# Reading them back works the other way round: we need an instance of the model class first and
# then load the .pth parameters into it. Code found online will always come with the model class,
# and sometimes with the trained parameters too.
# map_location='cpu' loads the parameters onto the CPU regardless of if they were saved on CPU or GPU
model_serialized_params = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(model_serialized_params)

<All keys matched successfully>

## Conclusion

I hope this has been helpful. There is obviously much more to PyTorch than what we've covered here. In the rest
of the repository, you'll find code showing how to:
- Train a custom model for classification
- Fine-tune a pretrained model
- Evaluate a model
- Make a dataset class for your training/evaluation pipeline
- Log training curves to the console and plot them using TensorboardX

Of course, I can't show everything, but rest assured that PyTorch has plenty of easy-to-use code, and we've only scratched the surface.
- Tensor operations (can do much more than just +-*/ tensors together)
- Neural network layers (very rare that you'll ever have to write your own)
- Loss functions
- Stochastic Gradient Descent-based optimizers

And the best thing about PyTorch is that there is plenty of code online, all of which is relatively modular and easy to understand compared to other machine learning frameworks.