*Classifying an Image Using the Pretrained AlexNet Model from PyTorch*

Import Models from TorchVision

In [1]:
from torchvision import models
import torch
 
# List every name defined in the torchvision.models namespace (model classes,
# lowercase constructor functions such as `alexnet`, submodules and dunder
# attributes) to see which architectures are available.
dir(models)

['AlexNet',
 'DenseNet',
 'GoogLeNet',
 'Inception3',
 'MNASNet',
 'MobileNetV2',
 'ResNet',
 'ShuffleNetV2',
 'SqueezeNet',
 'VGG',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_utils',
 'alexnet',
 'densenet',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'detection',
 'googlenet',
 'inception',
 'inception_v3',
 'mnasnet',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet',
 'mobilenet_v2',
 'resnet',
 'resnet101',
 'resnet152',
 'resnet18',
 'resnet34',
 'resnet50',
 'resnext101_32x8d',
 'resnext50_32x4d',
 'segmentation',
 'shufflenet_v2_x0_5',
 'shufflenet_v2_x1_0',
 'shufflenet_v2_x1_5',
 'shufflenet_v2_x2_0',
 'shufflenetv2',
 'squeezenet',
 'squeezenet1_0',
 'squeezenet1_1',
 'utils',
 'vgg',
 'vgg11',
 'vgg11_bn',
 'vgg13',
 'vgg13_bn',
 'vgg16',
 'vgg16_bn',
 'vgg19',
 'vgg19_bn',
 'video',
 'wide_resnet101_2',
 'wide_resnet50_2']

Load AlexNet

In [2]:
# Instantiate AlexNet and ask torchvision for its pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# a `weights=` argument — confirm against the installed version before changing.
alexnet = models.alexnet(pretrained=True)

In [3]:
# Show the layer-by-layer structure of the network (features, avgpool, classifier).
print(alexnet)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

Pre-process the input image with the help of the transforms available in the TorchVision module

In [4]:
from torchvision import transforms
# Preprocessing pipeline; the steps run in the order listed.
transform = transforms.Compose(
    [
        # Scale the image so its shorter side is 256 px (an int size keeps
        # the aspect ratio; it does not force a 256x256 square).
        transforms.Resize(256),
        # Cut out the central 224x224 patch.
        transforms.CenterCrop(224),
        # Convert the PIL image to a PyTorch tensor.
        transforms.ToTensor(),
        # Normalize each of the three channels as (value - mean) / std.
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

Load the input image (it is pre-processed in the next step)

In [5]:
# Import Pillow
from PIL import Image
# Force three RGB channels: the preprocessing pipeline normalizes with three
# per-channel means/stds, so a grayscale, palette, CMYK or RGBA file would
# otherwise fail or be normalized incorrectly. For an image that is already
# RGB this only produces an equivalent copy.
img = Image.open("dog.jpg").convert("RGB")

Pre-process the image and prepare a batch to be passed through the network.

In [6]:
# Run the preprocessing pipeline on the loaded image.
img_t = transform(img)
# Insert a new axis at position 0 so the single image becomes a batch of one.
batch_t = img_t.unsqueeze(0)

Put the model in evaluation mode, so that training-only behaviour such as dropout is disabled during inference

In [7]:
# Switch the network to evaluation mode (the classifier contains Dropout layers,
# which must be inactive for inference). `eval()` returns the module itself,
# which is why the model structure is echoed as this cell's output.
alexnet.eval()

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

Carry out the inference

In [8]:
# Inference only: disable autograd so no computation graph is recorded for the
# forward pass. This saves memory and time; the resulting `out` does not
# require grad, which is all the later softmax/sort/max steps need.
with torch.no_grad():
    out = alexnet(batch_t)

# One row per image in the batch, one column per class score.
print(out.shape)

torch.Size([1, 1000])


This is all good, but what do we do with this output vector `out` of 1000 elements? We still haven’t got the class (or label) of the image. For this, we first read and store the labels from a text file that lists all 1000 labels. Note that the line number specifies the class number, so it’s very important not to change that order.

In [10]:
# Read the label file: entry i of `classes` is the (whitespace-stripped)
# text of line i, so list position doubles as the class index.
with open('imagenet_classes.txt') as f:
    classes = [line.strip() for line in f]

Now we need to find the index at which the maximum score in the output vector `out` occurs. We will use this index to look up the predicted label, and apply a softmax to the scores so that the confidence can be reported as a percentage.

In [12]:
# Softmax over the class dimension turns the raw scores into probabilities;
# take the only row of the batch and scale it to percent.
percentage = torch.nn.functional.softmax(out, dim=1)[0] * 100

# torch.max along dim 1 returns (max values, their positions); only the
# positions are needed here.
_, index = torch.max(out, 1)

# Label and confidence of the highest-scoring class.
best = index[0]
print(classes[best], percentage[best].item())

208: 'Labrador retriever', 41.585166931152344


Let’s see what other classes the model thought the image belonged to.

In [13]:
# Order the class indices by score, best first, then keep the first five
# entries of the single batch row.
_, indices = torch.sort(out, descending=True)
top_five = indices[0][:5]

# (label, confidence in percent) for each of the five best classes.
[(classes[idx], percentage[idx].item()) for idx in top_five]

[("208: 'Labrador retriever',", 41.585166931152344),
 ("207: 'golden retriever',", 16.59166145324707),
 ("176: 'Saluki, gazelle hound',", 16.286880493164062),
 ("172: 'whippet',", 2.8539133071899414),
 ("173: 'Ibizan hound, Ibizan Podenco',", 2.3924720287323)]

Reference: https://www.learnopencv.com/pytorch-for-beginners-image-classification-using-pre-trained-models/