In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split

In [33]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


## **Download CIFAR-10 Dataset**

In [35]:
print('==> Preparing data..')

# Fixed seed for the dataset splits so that the same images are held out for
# validation every time the notebook is run (e.g. when resuming from a
# checkpoint after a kernel restart).
SPLIT_SEED = 0

# Training pipeline: random crop + horizontal flip augmentation, then
# tensor conversion and per-channel normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation pipeline: no augmentation, same normalisation as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)

# Second view of the same training images that applies the evaluation
# transform. The validation subset is taken from this view so that validation
# metrics are not perturbed by the random crop / flip used for training.
trainset_eval = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=False, transform=transform_test)

# Define the split ratio
train_size = int(0.8 * len(trainset))  # 80% for training
val_size = len(trainset) - train_size  # Remaining 20% for validation

# Randomly (but reproducibly) split the training data into training and
# validation indices, then point the validation indices at the
# un-augmented view of the data.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_set, val_split = random_split(
    trainset, [train_size, val_size], generator=split_generator)
val_set = torch.utils.data.Subset(trainset_eval, val_split.indices)

# Small (2%) subset for quick debugging runs. It is drawn from the whole
# training set, so it overlaps both train_set and val_set.
debug_size = int(0.02 * len(trainset))
_, debug_set = random_split(
    trainset, [len(trainset) - debug_size, debug_size], generator=split_generator)

debug_loader = torch.utils.data.DataLoader(
    debug_set, batch_size=128, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)

trainloader = torch.utils.data.DataLoader(
    train_set, batch_size=128, shuffle=True, num_workers=2)

valloader = torch.utils.data.DataLoader(
    val_set, batch_size=128, shuffle=False, num_workers=2)

testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


## **ResNet Model**

In [36]:

class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch norm.

    The main path is conv -> BN -> ReLU -> conv -> BN. Its result is added to
    the shortcut (the input itself, or ``downsample(input)`` when a
    ``downsample`` module was supplied) and the sum goes through a final ReLU.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by both convolutions.
        stride: Stride of the first convolution; the second always uses 1.
        downsample: Optional module applied to the input to build the shortcut.
    """

    # Channel multiplier; this block outputs ``out_channels * expansion`` channels.
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # Settings shared by both 3x3 convolutions.
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        # Without a downsample module the input is used unchanged as shortcut.
        shortcut = x if self.downsample is None else self.downsample(x)
        return F.relu(main + shortcut)

class ResNet_CIFAR10(nn.Module):
    """ResNet with a 3x3 stem and no max-pool, followed by four residual stages.

    Args:
        block: Residual block class. It must expose an ``expansion`` attribute
            and be constructible as
            ``block(in_channels, out_channels, stride, downsample)``.
        layers: Sequence whose first four entries give the number of blocks in
            stages 1 to 4.
        num_classes: Output size of the final fully connected layer.
    """

    def __init__(self, block, layers, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; advanced by _make_layer.
        self.in_channels = 64

        # Stem: a 3x3, stride-1 convolution. No max-pool follows it, because
        # CIFAR-10 inputs are already small.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # Four stages; every stage after the first halves the spatial size
        # (stride 2) and doubles the width.
        self.layer1 = self._make_layer(block, out_channels=64, blocks=layers[0], stride=1)
        self.layer2 = self._make_layer(block, out_channels=128, blocks=layers[1], stride=2)
        self.layer3 = self._make_layer(block, out_channels=256, blocks=layers[2], stride=2)
        self.layer4 = self._make_layer(block, out_channels=512, blocks=layers[3], stride=2)

        # Classifier head: global average pool, then a linear layer.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 conv + batch-norm projection shortcut.
        ``self.in_channels`` is updated to the stage's output width.
        """
        stage_width = out_channels * block.expansion

        # A projection is needed whenever the identity shortcut would not
        # match the main path in spatial size or channel count.
        shortcut = None
        if not (stride == 1 and self.in_channels == stage_width):
            shortcut = nn.Sequential(
                nn.Conv2d(self.in_channels, stage_width, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(stage_width),
            )

        stage = [block(self.in_channels, out_channels, stride, shortcut)]
        self.in_channels = stage_width
        # Remaining blocks keep resolution and width, so they use defaults.
        stage.extend(block(self.in_channels, out_channels) for _ in range(blocks - 1))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` scores per image."""
        x = self.relu(self.bn1(self.conv1(x)))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = self.avgpool(x)
        return self.fc(torch.flatten(pooled, 1))



## **The number of layers in a ResNet is calculated as follows**

- 1 FC layer at the end
- 1 Conv layer at the beginning
- Each BasicBlock contains 2 Conv layers
- Each Layer (stage) can contain multiple BasicBlocks
- The 1x1 convolutions in the downsample shortcuts are not counted

Example: ResNet_CIFAR10(BasicBlock, [2, 1, 1, 1])
1 Conv layer at the beginning + 1 FC layer at the end +
Layer1: 2 BasicBlocks => 4 Conv layers
Layer2: 1 BasicBlock => 2 Conv layers
Layer3: 1 BasicBlock => 2 Conv layers
Layer4: 1 BasicBlock => 2 Conv layers

Total layers = 1 + 1 + 4 + 2 + 2 + 2 = 12
So this is ResNet12, with Layer1 containing 2 BasicBlocks.

In [37]:
# ResNet-10: one BasicBlock per stage (1 stem conv + 4 blocks * 2 convs + 1 FC).
resnet_10 = ResNet_CIFAR10(block=BasicBlock, layers=[1, 1, 1, 1])

In [38]:
# ResNet-12: two BasicBlocks in stage 1, one in each later stage (1 + 5 * 2 + 1).
resnet_12 = ResNet_CIFAR10(block=BasicBlock, layers=[2, 1, 1, 1])

In [39]:
# ResNet-14: two BasicBlocks in stages 1 and 2, one in stages 3 and 4 (1 + 6 * 2 + 1).
resnet_14 = ResNet_CIFAR10(block=BasicBlock, layers=[2, 2, 1, 1])

In [40]:
# ResNet-18: two BasicBlocks in every stage (1 + 8 * 2 + 1).
resnet_18 = ResNet_CIFAR10(block=BasicBlock, layers=[2, 2, 2, 2])

## **Baseline Training for all Models**



In [41]:
# Training and testing utilities
def train(model, device, train_loader, optimizer, criterion, epoch, train_losses, model_name=None):
    """Run one training epoch and record its mean batch loss.

    Args:
        model: Network to optimise; switched to training mode here.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that also exposes
            ``__len__`` and a ``dataset`` attribute (used for progress output).
        optimizer: Optimiser stepped once per batch.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        epoch: Integer index of the current epoch; used in the progress output
            and in the checkpoint file name.
        train_losses: List to which this epoch's mean batch loss is appended.
        model_name: Prefix of the checkpoint file name. Defaults to the
            model's class name when omitted.

    Side effects:
        Prints the loss every 100 batches. When ``epoch`` is a multiple of 25
        (including 0), writes ``"<model_name>_<epoch>.pth"`` containing the
        model state, optimiser state, epoch index and mean loss.
    """
    model.train()
    running_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}] Loss: {loss.item():.6f}')

    average_loss = running_loss / len(train_loader)
    train_losses.append(average_loss)

    if epoch % 25 == 0:
        if model_name is None:
            model_name = type(model).__name__

        # `epoch` is an int, so it must be formatted rather than concatenated
        # (str + int raises TypeError).
        save_path = f"{model_name}_{epoch}.pth"

        # Save model and optimizer state
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epoch': epoch,
            'loss': average_loss
        }, save_path)


def test(model, device, test_loader, criterion, test_losses):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    average_loss = test_loss / len(test_loader)
    test_losses.append(average_loss)
    print(f'Test set: Average loss: {average_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)')


In [42]:
def training_process(model, device, model_name=None, num_epochs=None, checkpoint=None, iters=100):
    """Train ``model`` on ``trainloader`` and validate it on ``valloader``.

    Uses cross-entropy loss and Adam (lr=0.001). Each epoch calls ``train``
    and then ``test``; both loaders are read from module scope.

    Args:
        model: Network to train; moved to ``device`` in place.
        device: Device to train on.
        model_name: Name passed to ``train`` for checkpoint files and used as
            the key written into ``num_epochs``.
        num_epochs: Optional mapping; on completion ``num_epochs[model_name]``
            is set to the index one past the last epoch run. Skipped when None.
        checkpoint: Optional path to a checkpoint written by ``train``. When
            given, model and optimiser state are restored and training
            continues with the epoch after the one stored in the checkpoint.
        iters: Number of epochs to run in this call.
    """
    criterion = nn.CrossEntropyLoss()       # Cross-entropy loss for classification

    # Move the model before touching the optimiser: restored optimiser state
    # is placed on the device the parameters live on at load time, so loading
    # first and moving afterwards would leave that state on the wrong device.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer
    start_epoch = 0

    if checkpoint is not None:
        # map_location lets a checkpoint saved on a GPU be resumed on a CPU.
        state = torch.load(checkpoint, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        # The checkpoint is written after its epoch has finished training,
        # so resume with the following epoch instead of repeating it.
        start_epoch = state['epoch'] + 1

    # Per-epoch loss histories; they are local to this call.
    train_losses = []
    validation_losses = []

    end_epoch = start_epoch + iters
    for epoch in range(start_epoch, end_epoch):
        # For a quick smoke run, pass debug_loader instead of trainloader.
        train(model, device, trainloader, optimizer, criterion, epoch, train_losses, model_name)
        test(model, device, valloader, criterion, validation_losses)

    if num_epochs is not None:
        num_epochs[model_name] = end_epoch




In [43]:
# Names and instances are listed in the same order so they can be paired by position.
model_names = ["resnet_10", "resnet_12", "resnet_14", "resnet_18"]
models = [resnet_10, resnet_12, resnet_14, resnet_18]

# Number of epochs completed per model name; every model starts at zero.
num_epochs = {name: 0 for name in model_names}


In [45]:
# training_process(resnet_10, device, model_name = "resnet_10", num_epochs = num_epochs)

In [46]:
# criterion = nn.CrossEntropyLoss()
# test(resnet_18, device, debug_loader, criterion, [])

## **Parallel Training**

In [47]:
# Function to handle parallel training
def parallel_training(models, model_names, device):
    """Train every model in its own process and wait for all of them.

    Args:
        models: Models to train, one process per model.
        model_names: Names paired with ``models`` by position.
        device: Device passed through to ``training_process``.

    Side effects:
        Updates the module-level ``num_epochs`` dict with the values written
        by the child processes.

    Raises:
        ValueError: If there are fewer names than models.
        RuntimeError: If any child process exits with a non-zero exit code.
    """
    if len(model_names) < len(models):
        raise ValueError(
            f"got {len(models)} models but only {len(model_names)} model names")

    # NOTE(review): with the 'spawn' start method the child has to be able to
    # import `training_process`; functions defined only inside a notebook
    # kernel generally cannot be. Consider moving the training code into a
    # .py module - confirm in the target environment.
    with mp.Manager() as manager:
        # A plain dict handed to a child process is a private copy, so writes
        # made there never reach the parent. A manager dict is shared.
        shared_epochs = manager.dict(num_epochs)

        processes = []
        for model, name in zip(models, model_names):
            process = mp.Process(
                target=training_process,
                args=(model, device, name, shared_epochs, None),
            )
            process.start()
            processes.append(process)

        # Wait for all processes to complete
        for process in processes:
            process.join()

        # Copy the results out before the manager shuts down.
        num_epochs.update(shared_epochs.copy())

    failed = [name for name, process in zip(model_names, processes) if process.exitcode != 0]
    if failed:
        raise RuntimeError(f"training failed for: {', '.join(failed)}")


In [50]:
# Ensure multiprocessing works well in Jupyter/interactive environments
def set_mp_start_method():
    """Ensure the multiprocessing start method is 'spawn'.

    Does nothing when 'spawn' is already selected; otherwise switches to it,
    replacing any start method that was chosen earlier.
    """
    if mp.get_start_method(allow_none=True) != 'spawn':
        # force=True is what allows replacing an already-fixed start method
        # (e.g. when this is called again in the same kernel); without it
        # set_start_method raises RuntimeError once a method has been set.
        mp.set_start_method('spawn', force=True)

def start_training():
    """Configure the multiprocessing start method, then train all models in parallel.

    Reads ``models``, ``model_names`` and ``device`` from module scope.
    """
    set_mp_start_method()
    parallel_training(models=models, model_names=model_names, device=device)

In [51]:
# Launch only when run as the main program. Under the 'spawn' start method a
# child process may import this code again, and an unguarded call would then
# try to start training recursively.
if __name__ == "__main__":
    start_training()