# Theoretic Questions

## A

In normal CNNs, the grid sampling is fixed.
The receptive field of a convolutional layer is defined by a regular grid of points.

Deformable convolutional layers allows the sampling grid to be adjusted according to the learned offsets.
They have an additional offset (Which is laearnable) for each sampling point in the kernel.
This allows the layer to handle spatial transformations, such as object deformations, rotations, scalings, ....

## B

It is done with the help of learnable offsets in the layer. during the training phase, these parameters are learned and adjusted so that the transformations in the input are taken care of.

## C

Normal convolutional layers have fixed receptive fields which means they can't adapt to variations in object sizes, shapes, or rotations. So, they might fail to capture the entire object when it's deformed or rotated significantly.

## D

Deformable convolution layers include additional convolutional layers dedicated to predicting the offsets. These modules take the feature maps from the previous layers as input.

# Practical questions

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import random_split
from torch.utils.data.dataloader import DataLoader
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

from matplotlib import pyplot as plt

In [None]:
def generate_dataset() -> (list, list):
    """
        This function returns the trainset and testset of the CIFAR10 dataset
    """

    generator1 = torch.Generator().manual_seed(42)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset_all = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
    testset_all = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

    # Filter the dataset to include only 'plane' and 'car' classes
    classes_of_interest = [0, 1] # plane, automobile
    train_dataset = [item for i, item in enumerate(trainset_all) if trainset_all.targets[i] in classes_of_interest]
    test_dataset = [item for i, item in enumerate(testset_all) if testset_all.targets[i] in classes_of_interest]

    return train_dataset, test_dataset

def separate_train_val(trainset: list, val_ratio = 0.2) -> (list, list):
    """
        This function separates the train and validation sets with the given ratio
    """

    validation_size = int(val_ratio*len(trainset))
    train_size = len(trainset) - validation_size
    train_dataset, val_dataset = random_split(trainset, [train_size, validation_size])
    return train_dataset, val_dataset

def create_dataloaders(train_dataset, val_dataset, test_dataset):
    """
        This function creates the dataloaders if the input is not None
    """
    if train_dataset is not None:
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    else:
        train_loader = None
    if val_dataset is not None:
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
    else:
        val_loader = None
    if test_dataset is not None:
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    else:
        test_loader = None

    return train_loader, val_loader, test_loader

def train(model, train_loader, val_loader, epochs, optimizer, loss_function) -> (list, list, list, list):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    train_loss_values = []
    train_accuracy_values = []
    val_loss_values = []
    val_accuracy_values = []

    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        model.train()
        for i, data in enumerate(train_loader):

            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            outputs = model(inputs)
            # print(outputs)
            loss = loss_function(outputs, labels)
            # print(outputs)
            loss.backward()
            optimizer.step()

            # Calculate training loss
            running_loss += loss.item()

            # Calculate training accuracy
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_accuracy = correct / total

        train_loss_values.append(epoch_train_loss)
        train_accuracy_values.append(epoch_train_accuracy)

        print(f"Epoch [{epoch + 1}/{epochs}], "
            f"Train Loss: {epoch_train_loss:.4f}, "
            f"Train Accuracy: {100 * epoch_train_accuracy:.2f}%")

        if val_loader is not None:

            model.eval()
            val_running_loss = 0.0
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for val_data in val_loader:
                    val_inputs, val_labels = val_data[0].to(device), val_data[1].to(device)
                    val_outputs = model(val_inputs)
                    val_loss = loss_function(val_outputs, val_labels)
                    val_running_loss += val_loss.item()
                    _, val_predicted = val_outputs.max(1)
                    val_total += val_labels.size(0)
                    val_correct += val_predicted.eq(val_labels).sum().item()

            epoch_val_loss = val_running_loss / len(val_loader)
            epoch_val_accuracy = val_correct / val_total

            val_loss_values.append(epoch_val_loss)
            val_accuracy_values.append(epoch_val_accuracy)

            print(f"Validation Loss: {epoch_val_loss:.4f}, "
                f"Validation Accuracy: {100 * epoch_val_accuracy:.2f}%")

    print("Finished Training")

    return train_loss_values, train_accuracy_values, val_loss_values, val_accuracy_values


def test(model, test_loader) -> float:

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for test_data in test_loader:
            test_inputs, test_labels = test_data[0].to(device), test_data[1].to(device)
            test_outputs = model(test_inputs)
            _, test_predicted = test_outputs.max(1)
            test_total += test_labels.size(0)
            test_correct += test_predicted.eq(test_labels).sum().item()

    test_accuracy = test_correct / test_total
    print(f"Test Accuracy: {100 * test_accuracy:.2f}%")
    return test_accuracy


def plot_results(train_loss_values, train_accuracy_values, val_loss_values, val_accuracy_values):

    epochs = 60
    # Plotting accuracy and loss per epoch
    plt.figure(figsize=(10, 5))

    # Plotting training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(range(1, epochs + 1), train_loss_values, label='Training Loss')
    if val_loss_values is not None:
        plt.plot(range(1, epochs + 1), val_loss_values, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss per Epoch')
    plt.legend()

    # Plotting training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(range(1, epochs + 1), train_accuracy_values, label='Training Accuracy')
    if val_accuracy_values is not None:
        plt.plot(range(1, epochs + 1), val_accuracy_values, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Training and Validation Accuracy per Epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

from google.colab import drive

def save_model(model, filename):
    drive.mount('/content/drive')
    torch.save(model.state_dict(), f'/content/drive/MyDrive/DeepHW/ProblemSet3/{filename}')

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DeformableConv2D(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(DeformableConv2D, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        
        # Regular convolutional layer weights
        self.conv_weights = nn.Parameter(torch.Tensor(out_channels, in_channels, kernel_size, kernel_size))
        nn.init.kaiming_normal_(self.conv_weights, mode='fan_out', nonlinearity='relu')
        
        # Offset prediction layer
        self.offset_conv = nn.Conv2d(in_channels, 2 * kernel_size * kernel_size, 
                                     kernel_size=3, 
                                     stride=stride, 
                                     padding=padding)
        
        nn.init.constant_(self.offset_conv.weight, 0)
        nn.init.constant_(self.offset_conv.bias, 0)

    def forward(self, x):
        batch_size, _, height, width = x.size()
        
        # Predict offsets
        offsets = self.offset_conv(x)
        
        # Reshape offsets to have separate dimensions for x and y offsets
        offsets = offsets.view(batch_size, 2, self.kernel_size * self.kernel_size, height, width)
        offsets_x, offsets_y = torch.split(offsets, 1, dim=1)
        # Adjust the offset handling and grid generation part
        offsets_x = offsets_x.squeeze(1).view(batch_size, self.kernel_size * self.kernel_size, height, width)
        offsets_y = offsets_y.squeeze(1).view(batch_size, self.kernel_size * self.kernel_size, height, width)

        # Generate sampling grid
        grid_y, grid_x = torch.meshgrid(torch.arange(height), torch.arange(width))
        grid_x = grid_x.type_as(x).view(1, 1, height, width).expand(batch_size, -1, -1, -1)
        grid_y = grid_y.type_as(x).view(1, 1, height, width).expand(batch_size, -1, -1, -1)

        # Add offsets to the grid
        grid_x = grid_x + offsets_x
        grid_y = grid_y + offsets_y

        # Normalize grid to [-1, 1]
        grid_x = 2 * grid_x / max(width - 1, 1) - 1
        grid_y = 2 * grid_y / max(height - 1, 1) - 1

        # Reshape grid
        # Adjust the grid reshaping part
        # Adjust the grid creation
        grid = torch.stack((grid_x, grid_y), dim=-1).permute(0, 3, 1, 2).contiguous()

        # Perform sampling using grid_sample
        sampled_features = F.grid_sample(x, grid, align_corners=True)


        
        # Perform convolution using the sampled features and convolutional weights
        output = F.conv2d(sampled_features, self.conv_weights, stride=self.stride, padding=self.padding)
        
        return output

# Example usage:
input_channels = 3
output_channels = 64
kernel_size = 3
padding = 1

# Create an instance of DeformableConv2D
deformable_conv = DeformableConv2D(input_channels, output_channels, kernel_size, padding=padding)

# Dummy input
dummy_input = torch.randn(1, input_channels, 32, 32)

# Forward pass
output = deformable_conv(dummy_input)
print("Output shape:", output.shape)


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 5 is not equal to len(dims) = 4