
**Install requirements**

In [None]:
!pip3 install 'torch'
!pip3 install 'torchvision'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'
!pip3 install 'matplotlib'

**Import libraries**

In [None]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import alexnet

import sklearn.model_selection

from PIL import Image
from tqdm import tqdm

**Set Arguments**

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

NUM_CLASSES = 100

BATCH_SIZE = 128     # Higher batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                     # the batch size, the learning rate should change by the same factor to have comparable results

LR = 1e-1            # The initial learning rate
MOMENTUM = 0.9       # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 1e-4  # Regularization, you can keep this at the default

NUM_EPOCHS = 150     # Total number of training epochs (iterations over the dataset)
STEP_SIZE = 10       # How many epochs before decreasing the learning rate (if using a step-down policy)
GAMMA = 0.1          # Multiplicative factor for the learning rate step-down

LOG_FREQUENCY = 1000     # Print the training loss every LOG_FREQUENCY optimization steps
SCHEDULER_TYPE = 'LR'    # 'WarmRestarts', 'LR' (cosine annealing) or None for a constant learning rate
NORM_TYPE = 'GROUP'      # Forwarded to ResNet20 as `norm_type`; also part of the checkpoint file name


**Define Data Preprocessing**

In [None]:
# CIFAR-100 has 100 classes containing 600 images each. There are 500 training images and 100 testing images per
# class. The 100 classes are grouped into 20 superclasses. Each image comes with a "fine" label (the class to which
# it belongs) and a "coarse" label (the superclass to which it belongs).

# Per-channel statistics shared by the training and evaluation pipelines.
# NOTE(review): these look like the usual ImageNet statistics rather than values computed on CIFAR-100
# (the original code carried a "TODO: Check" here) - confirm before changing them, because checkpoints
# trained with these values expect the same normalization at evaluation time.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training: random crop with padding and random horizontal flip as augmentation, then normalization
train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                      transforms.RandomHorizontalFlip(0.5), # 0.5 probability
                                      transforms.ToTensor(), # Turn PIL Image to torch.Tensor
                                      transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

# Evaluation: no random augmentation, so that validation/test metrics are deterministic and
# measured on the unmodified images
eval_transform = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

**Prepare Dataset**

In [None]:
# Clone github repository with data
if not os.path.isdir('./CentralizedResNet'):
  !git clone https://github.com/AML-SergioMejia/CentralizedResNet.git

root_dir = "CIFAR100"
""""
root: str,
train: bool = True, 
transform: Optional[Callable] = None, 
target_transform: Optional[Callable] = None, 
download: bool = False
"""

# Prepare Pytorch train/test Datasets
train_dataset = torchvision.datasets.CIFAR100( root_dir, transform=train_transform, download=True)#Caltech(DATA_DIR, split='train',  transform=train_transform)
test_dataset = torchvision.datasets.CIFAR100( root_dir, train=False, transform=train_transform, download=True)

stratified_sampling = sklearn.model_selection.train_test_split([i for i in range(len(train_dataset))], test_size=0.2, stratify=train_dataset.targets)
train_indexes = stratified_sampling[0] # split the indices for your train split
val_indexes = stratified_sampling[1] # split the indices for your val split

val_dataset = Subset(train_dataset, val_indexes)
train_dataset = Subset(train_dataset, train_indexes)

# Check dataset sizes
print('Train Dataset: {}'.format(len(train_dataset)))
print('Valid Dataset: {}'.format(len(val_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

**Prepare Dataloaders**

In [None]:
# DataLoaders wrap the datasets and take care of batching, shuffling and loading with worker processes.
# Settings common to the three loaders are collected once.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=4)

# Training batches are reshuffled every epoch and the last incomplete batch is dropped
train_dataloader = DataLoader(train_dataset, shuffle=True, drop_last=True, **_loader_kwargs)

# Validation and test data are read in a fixed order and no sample is dropped
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

**Prepare Network**

In [None]:
# This import has to stay in this cell: the package only exists after the repository
# has been cloned in the "Prepare Dataset" cell.
from CentralizedResNet.resnet_cifar import ResNet20

# One output per class
net = ResNet20(num_blocks=3, num_classes=NUM_CLASSES, option='B', norm_type=NORM_TYPE)

# Checkpoint file name: trained_ResNet20[_<SCHEDULER_TYPE>]_<NORM_TYPE>.pt
ckpt_path = 'trained_ResNet20'
if SCHEDULER_TYPE is not None:
  ckpt_path += f'_{SCHEDULER_TYPE}'
ckpt_path += f'_{NORM_TYPE}'
ckpt_path += '.pt'

# Resume from an existing checkpoint when there is one
if os.path.exists(ckpt_path):
  # map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine
  net.load_state_dict(torch.load(ckpt_path, map_location=DEVICE))
  print("LOADED CHECKPOINT")
print(f"Number of model parameters = {sum(p.numel() for p in net.parameters())}")

**Prepare Training**

In [None]:
# Loss: cross entropy, the standard choice for multi-class classification
criterion = nn.CrossEntropyLoss()

# Parameters handed to the optimizer: every parameter of the network.
# (A subset could be selected instead by calling .parameters() on a submodule of `net`.)
parameters_to_optimize = net.parameters()

# Optimizer: SGD with momentum and weight decay; it updates the weights from the gradients of the loss
optimizer = optim.SGD(
    parameters_to_optimize,
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Scheduler: changes the learning rate during training.
# Any SCHEDULER_TYPE other than the two handled below leaves the learning rate constant.
scheduler = None
if SCHEDULER_TYPE == 'LR':
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=0.00001)
elif SCHEDULER_TYPE == 'WarmRestarts':
  scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

**Train**

In [None]:
# Plain training loop (no validation): trains `net` for NUM_EPOCHS epochs on `train_dataloader`.

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# The flag has to be assigned; merely reading `cudnn.benchmark` has no effect
cudnn.benchmark = True

# Only the warm-restart scheduler documents being stepped with a fractional epoch after every batch;
# any other scheduler is stepped once per epoch.
step_every_batch = isinstance(scheduler, CosineAnnealingWarmRestarts)
num_batches = len(train_dataloader)

current_step = 0
# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  # Read the learning rate from the optimizer so this also works when no scheduler is configured
  current_lrs = [group['lr'] for group in optimizer.param_groups]
  print('Starting epoch {}/{} [LR={}]'.format(epoch+1, NUM_EPOCHS, current_lrs))

  net.train() # Sets module in training mode

  # Iterate over the dataset
  for i, (images, labels) in enumerate(train_dataloader):
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # PyTorch, by default, accumulates gradients after each backward pass
    # We need to manually set the gradients to zero before starting a new iteration
    optimizer.zero_grad()

    # Forward pass to the network
    outputs = net(images)

    # Compute loss based on output and ground truth
    loss = criterion(outputs, labels)

    # Log loss
    if current_step % LOG_FREQUENCY == 0:
      print('Step {}, Loss {}'.format(current_step, loss.item()))

    # Compute gradients for each layer and update weights
    loss.backward()  # backward pass: computes gradients
    optimizer.step() # update weights based on accumulated gradients

    if step_every_batch:
      scheduler.step(epoch + i / num_batches)

    current_step += 1
  # end for dataset iteration

  if scheduler is not None and not step_every_batch:
    scheduler.step()

**Validation on last state of network**

In [None]:
def validate_input(network, dataset):
  """Evaluate `network` on `dataset` and return `(accuracy, loss)`.

  The network is moved to DEVICE and left in evaluation mode; callers that
  keep training afterwards must switch it back with `network.train()`.

  Args:
    network: the model to evaluate.
    dataset: a dataset yielding `(image, label)` pairs; it is wrapped in a
      non-shuffling DataLoader with batch size BATCH_SIZE.

  Returns:
    A tuple `(accuracy, loss)`: the fraction of correctly classified samples
    and the average loss per sample, computed with the global `criterion`.
  """
  network = network.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
  network.eval() # Set Network to evaluation mode
  loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

  running_corrects = 0
  running_loss = 0.0
  # No gradients are needed for evaluation; skipping them saves memory and time
  with torch.no_grad():
    for images, labels in tqdm(loader):
      images = images.to(DEVICE)
      labels = labels.to(DEVICE)

      # Forward Pass
      outputs = network(images)

      # `criterion` is created with the default mean reduction in the "Prepare Training" cell, so the
      # batch value is scaled by the batch size to weight the (possibly shorter) last batch correctly
      loss = criterion(outputs, labels)
      running_loss += loss.item() * labels.size(0)

      # Get predictions
      preds = outputs.argmax(dim=1)

      # Update Corrects
      running_corrects += (preds == labels).sum().item()

  # Calculate accuracy and the per-sample average loss
  num_samples = len(dataset)
  accuracy = running_corrects / float(num_samples)
  loss = running_loss / num_samples
  return accuracy, loss
#accuracy, loss = validate_input(net, val_dataset)
#print('Validation Accuracy: {}'.format(accuracy))

**Training + validation per epoch**

In [None]:
# !! Call "Prepare training" cell again before executing this module !!
# (so that the optimizer and the scheduler start from a fresh state)

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# The flag has to be assigned; merely reading `cudnn.benchmark` has no effect
cudnn.benchmark = True

# Checkpoint file name, built with the same rule the "Prepare Network" cell uses when it looks
# for a checkpoint to load
path = 'trained_ResNet20'
if SCHEDULER_TYPE is not None:
  path += f'_{SCHEDULER_TYPE}'
path += f'_{NORM_TYPE}'

current_step = 0

# One [train, val] pair per epoch
loss_values = []
acc_values = []

# Best validation accuracy seen so far among the epochs eligible for checkpointing
last_val = 0

# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  # Read the learning rate from the optimizer so this also works when no scheduler is configured
  current_lrs = [group['lr'] for group in optimizer.param_groups]
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, current_lrs))

  # validate_input() leaves the network in evaluation mode, so switch back every epoch
  net.train()

  running_loss = 0.0
  running_correct = 0
  samples_seen = 0
  # Iterate over the dataset
  for images, labels in train_dataloader:
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # PyTorch, by default, accumulates gradients after each backward pass
    # We need to manually set the gradients to zero before starting a new iteration
    optimizer.zero_grad()

    # Forward pass to the network
    outputs = net(images)

    # Compute loss based on output and ground truth
    loss = criterion(outputs, labels)

    # `criterion` averages over the batch, so scale by the batch size to accumulate a per-sample sum
    batch_size = labels.size(0)
    running_loss += loss.item() * batch_size
    preds = outputs.argmax(dim=1)
    running_correct += (preds == labels).sum().item()
    samples_seen += batch_size

    # Log loss
    if current_step % LOG_FREQUENCY == 0:
      print('Step {}, Loss {}'.format(current_step, loss.item()))

    # Compute gradients for each layer and update weights
    loss.backward()  # backward pass: computes gradients
    optimizer.step() # update weights based on accumulated gradients

    current_step += 1
  # end for dataset iteration

  accuracy_val, loss_val = validate_input(net, val_dataset)

  # Save best model (checkpoints are only written once `epoch` exceeds 50)
  if epoch > 50 and accuracy_val > last_val:
    last_val = accuracy_val
    torch.save(net.state_dict(), f'{path}.pt')

  # Average over the samples actually processed: the training loader drops the last incomplete
  # batch, so this can be fewer than len(train_dataloader.dataset)
  loss_train = running_loss / samples_seen
  accuracy_train = running_correct / samples_seen

  loss_values.append([loss_train, loss_val])
  acc_values.append([accuracy_train, accuracy_val])
  print('Validation Accuracy: {}'.format(accuracy_val))

  # Step the scheduler
  if scheduler is not None:
    scheduler.step()

In [None]:
from matplotlib import pyplot as plt

# Each entry of loss_values / acc_values is a [train, val] pair recorded at the end of an epoch.
# The x axis follows the number of recorded epochs, so the curves can also be drawn
# when training was stopped before NUM_EPOCHS.
plt.plot(range(len(loss_values)), loss_values)
plt.legend(["Train", "Val"])
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

plt.plot(range(len(acc_values)), acc_values)
plt.legend(["Train", "Val"])
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.show()


**Test**

In [None]:
# Accuracy of the current state of `net` on the test set
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
net.eval() # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation; skipping them saves memory and time
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass
    outputs = net(images)

    # Get predictions
    preds = outputs.argmax(dim=1)

    # Update Corrects
    running_corrects += (preds == labels).sum().item()

# Calculate Accuracy
accuracy = running_corrects / float(len(test_dataset))

print('Test Accuracy: {}'.format(accuracy))

**Save model**

In [None]:
# Save the current weights of `net`.
# The file name follows the same rule as the "Prepare Network" cell (suffix decided by SCHEDULER_TYPE),
# so the file written here is the one that cell looks for on the next run.
# NOTE(review): the training + validation loop writes its best-validation checkpoint to this same
# file, so running this cell replaces it with the current weights - confirm that this is intended.
path = 'trained_ResNet20'
if SCHEDULER_TYPE is not None:
  path += f'_{SCHEDULER_TYPE}'
path += f'_{NORM_TYPE}'
torch.save(net.state_dict(), f'{path}.pt')