
**Install requirements**

In [None]:
!pip3 install torch
!pip3 install torchvision
!pip3 install Pillow-SIMD
!pip3 install tqdm

**Import libraries**

In [None]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import alexnet

from PIL import Image
from tqdm import tqdm

**Set Arguments**

In [36]:
DEVICE = 'cpu'

NUM_CLASSES = 102

BATCH_SIZE = 256     

LR = 1e-3            
MOMENTUM = 0.9       
WEIGHT_DECAY = 5e-5  

NUM_EPOCHS = 40      
STEP_SIZE = 20       
GAMMA = 0.1          

LOG_FREQUENCY = 10

**Define Data Preprocessing**

In [8]:
# Define transforms for training phase
train_transform = transforms.Compose([transforms.Resize(256),      # Resizes short size of the PIL image to 256
                                      transforms.CenterCrop(224),  # Crops a central square patch of the image
                                                                   # 224 because torchvision's AlexNet needs a 224x224 input!
                                                                   # Remember this when applying different transformations, otherwise you get an error
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # Normalizes tensor with mean and standard deviation
])
# Define transforms for the evaluation phase
eval_transform = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))                                    
])

**Prepare Dataset**

In [20]:
from torchvision.datasets import VisionDataset

from PIL import Image


def pil_loader(path):
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


class Caltech1(VisionDataset):
    def __init__(self, root, split='train', transform=None, target_transform=None):
        super(Caltech1, self).__init__(root, transform=transform, target_transform=target_transform)
        

        self.split = split # This defines the split you are going to use
                           # (split files are called 'train.txt' and 'test.txt')

        #path = "/content/Caltech101/" + self.split + ".txt"
        path = "./Caltech101/" + self.split + ".txt"

        self.data = list()
        self.labels = set()
        
        with open(path, 'r') as f:
            for line in f:
                #image_path = "/content/" + root + '/' + line.strip()
                image_path = "" + root + '/' + line.strip()

                label = line.strip().split('/')[0]
                image = pil_loader(image_path)
                if "BACKGROUND_Google" not in label:
                    self.labels.add(label)
                    self.data.append((image, list(self.labels).index(label)))

    def __getitem__(self, index):
        '''
        __getitem__ should access an element through its index
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        '''

        image, label = self.data[index]

        # Applies preprocessing when accessing the image
        if self.transform is not None:
            image = self.transform(image)

        return image, label

    def __len__(self):
        '''
        The __len__ method returns the length of the dataset
        It is mandatory, as this is used by several other components
        '''
        return len(self.data)


In [21]:
# Clone github repository with data
if not os.path.isdir('./Caltech101'):
  !git clone https://github.com/CarachinoAlessio/test_Caltech101.git
  #!mv 'test_Caltech101' 'Caltech101'

DATA_DIR = 'Caltech101/101_ObjectCategories'

from Caltech101.caltech_dataset import Caltech
from sklearn.model_selection import train_test_split

# Prepare Pytorch train/test Datasets
train_ds = Caltech1(DATA_DIR, split='train',  transform=train_transform)
test_dataset = Caltech1(DATA_DIR, split='test', transform=eval_transform)

ids = [x for x in range(len(train_ds))]
labels_indexes = [y for (x,y) in train_ds]

train_indexes, val_indexes = train_test_split(ids, test_size=0.5, shuffle=True, stratify=labels_indexes)

train_dataset = Subset(train_ds, train_indexes)
val_dataset = Subset(train_ds, val_indexes)

# Check dataset sizes
print('Train Dataset: {}'.format(len(train_dataset)))
print('Valid Dataset: {}'.format(len(val_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

Train Dataset: 2892
Valid Dataset: 2892
Test Dataset: 2893


**Prepare Dataloaders**

In [22]:
# Dataloaders iterate over pytorch datasets and transparently provide useful functions (e.g. parallelization and shuffling)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

**Prepare Network**

In [37]:
net = alexnet(pretrained=True) # Loading AlexNet model

# AlexNet has 1000 output neurons, corresponding to the 1000 ImageNet's classes
# We need 101 outputs for Caltech-101
net.classifier[6] = nn.Linear(4096, NUM_CLASSES) # nn.Linear in pytorch is a fully connected layer
                                                 # The convolutional layer is nn.Conv2d

# We just changed the last layer of AlexNet with a new fully connected layer with 101 outputs
# It is strongly suggested to study torchvision.models.alexnet source code

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to C:\Users\alcarachin/.cache\torch\hub\checkpoints\alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:45<00:00, 5.38MB/s] 


**Prepare Training**

In [38]:
# Define loss function
criterion = nn.CrossEntropyLoss()
parameters_to_optimize = net.parameters()
optimizer = optim.SGD(parameters_to_optimize, lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

**Train**

In [39]:
net = net.to(DEVICE)
cudnn.benchmark

current_step = 0
for epoch in range(NUM_EPOCHS):
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

  for images, labels in train_dataloader:
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    net.train()
    optimizer.zero_grad()

    
    outputs = net(images)
    loss = criterion(outputs, labels)

    if current_step % LOG_FREQUENCY == 0:
      print('Step {}, Loss {}'.format(current_step, loss.item()))

    loss.backward()
    optimizer.step()

    current_step += 1

  # Step the scheduler
  scheduler.step() 

Starting epoch 1/40, LR = [0.001]
Step 0, Loss 4.808793067932129
Step 10, Loss 3.0609891414642334
Starting epoch 2/40, LR = [0.001]
Step 20, Loss 2.1703829765319824
Starting epoch 3/40, LR = [0.001]
Step 30, Loss 1.5911250114440918
Starting epoch 4/40, LR = [0.001]
Step 40, Loss 1.3508399724960327
Starting epoch 5/40, LR = [0.001]
Step 50, Loss 0.8977063298225403
Starting epoch 6/40, LR = [0.001]
Step 60, Loss 0.7443479299545288
Starting epoch 7/40, LR = [0.001]
Step 70, Loss 0.6106410026550293
Starting epoch 8/40, LR = [0.001]
Step 80, Loss 0.40423792600631714
Starting epoch 9/40, LR = [0.001]
Step 90, Loss 0.4432658851146698
Starting epoch 10/40, LR = [0.001]
Step 100, Loss 0.32229936122894287
Starting epoch 11/40, LR = [0.001]
Step 110, Loss 0.2469196319580078
Step 120, Loss 0.31955334544181824
Starting epoch 12/40, LR = [0.001]
Step 130, Loss 0.24901512265205383
Starting epoch 13/40, LR = [0.001]
Step 140, Loss 0.21305115520954132
Starting epoch 14/40, LR = [0.001]
Step 150, Loss 0

**Validation**

In [40]:
net = net.to(DEVICE)
net.train(False)

running_corrects = 0
for images, labels in tqdm(val_dataloader):
  images = images.to(DEVICE)
  labels = labels.to(DEVICE)

  outputs = net(images)
  _, preds = torch.max(outputs.data, 1)

  running_corrects += torch.sum(preds == labels.data).data.item()

accuracy = running_corrects / float(len(val_dataset))

print('Validation Accuracy: {}'.format(accuracy))

100%|██████████| 12/12 [00:22<00:00,  1.88s/it]

Validation Accuracy: 0.8153526970954357





**Test**

In [41]:
net = net.to(DEVICE)
net.train(False)

running_corrects = 0
for images, labels in tqdm(test_dataloader):
  images = images.to(DEVICE)
  labels = labels.to(DEVICE)

  outputs = net(images)
  _, preds = torch.max(outputs.data, 1)

  running_corrects += torch.sum(preds == labels.data).data.item()

accuracy = running_corrects / float(len(test_dataset))

print('Test Accuracy: {}'.format(accuracy))

100%|██████████| 12/12 [00:23<00:00,  1.95s/it]

Test Accuracy: 0.8067749740753543





In [43]:
torch.save(net, 'baseline.pth')