In [None]:
!pip install torch torchvision
!pip install pycocotools

In [None]:
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip train2017.zip

!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip annotations_trainval2017.zip

In [6]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader, Subset
from torchvision.datasets import CocoDetection
from torchvision import transforms
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms.functional import resize, to_tensor
import torch.optim as optim
from torchvision.transforms import ToTensor, RandomResizedCrop
import time
import torch.nn.functional as F
import numpy as np

# Define the Deformable Convolutional Layer
class DeformableConv2dLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(DeformableConv2dLayer, self).__init__()
        self.conv_offset = nn.Conv2d(in_channels, 2 * kernel_size * kernel_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.deform_conv = torchvision.ops.DeformConv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x):
        offset = self.conv_offset(x)
        x = self.deform_conv(x, offset)
        return x

# Define the Deformable Convolutional Neural Network (DCNN) model
class DeformableCNNModel(nn.Module):
    def __init__(self, num_classes):
        super(DeformableCNNModel, self).__init__()
        self.features = nn.Sequential(
            DeformableConv2dLayer(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            DeformableConv2dLayer(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.roi_pooling = nn.AdaptiveAvgPool2d((7, 7))
        self.fc = nn.Linear(128 * 7 * 7, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.roi_pooling(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Load COCO dataset
root = '/content/train2017'
annFile = '/content/annotations/instances_train2017.json'
transform = transforms.Compose([transforms.ToTensor()])

dataset = CocoDetection(root, annFile, transform=transform)

# Randomly sample 0.1% of the indices
indices1 = np.random.choice(len(dataset), int(0.001 * len(dataset)), replace=False)

# Use only the sampled indices for the dataset
subset_dataset = Subset(dataset, indices1)

# Use a custom collate function to handle COCO annotations
def collate_fn(batch):
    images, targets = zip(*batch)
    images = [resize(img, (480, 640)) for img in images]
    images = torch.stack(images)

    # Handle COCO targets format
    new_targets = []
    for target in targets:
        if len(target) > 0 and 'category_id' in target[0]:  # Check if it's not empty and already in the desired format
            new_targets.extend(target)
        else:  # Convert from COCO format to the desired format
            new_targets.extend([{"category_id": t["category_id"]} for t in target])

    return images, new_targets

data_loader = DataLoader(subset_dataset, batch_size=2, collate_fn=collate_fn)

# Instantiate the model
num_classes = len(dataset.coco.getCatIds())
model = DeformableCNNModel(num_classes=num_classes)
num_epochs = 3

# Define optimizer and loss function
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
criterion = nn.CrossEntropyLoss()

start_time = time.time()
for epoch in range(num_epochs):
    for images, targets in data_loader:
        # Assuming targets is a list of dictionaries with 'category_id' key
        target_indices = [target['category_id'] for target in targets]

        # Dynamically adjust the number of classes based on the max class index in the batch
        num_classes = max(target_indices) + 1

        # Ensure the batch size of targets matches the model output
        # Adjust the number of output units in the model's final layer
        model.fc = nn.Linear(128 * 7 * 7, num_classes)
        outputs = model(images)
        if outputs.size(0) != len(target_indices):
            continue  # Skip batch if sizes do not match

        optimizer.zero_grad()
        loss = criterion(outputs, torch.tensor(target_indices))
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

end_time = time.time()
training_time = end_time - start_time
print(f"Total Training Time: {training_time:.2f} seconds")

loading annotations into memory...
Done (t=14.39s)
creating index...
index created!
Epoch 1/3, Loss: 2.627627372741699
Epoch 2/3, Loss: 2.7098464965820312
Epoch 3/3, Loss: 2.6445679664611816
Total Training Time: 695.14 seconds
