In [1]:
#imports 
import os
import numpy as np
import math
import json
import random
from collections import Counter
from PIL import Image
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.nn.init as init

import torchvision
#from pycocotools.coco import COCO
import torch.optim as optim
import torchvision.transforms as T

from torchvision.models import resnet152
from torchvision.models import resnet101
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Define model
def get_model(num_classes):
    # Create the Resnet-152 backbone
    backbone = resnet152(weights="DEFAULT")

    # Remove the final fully connected layer (because Faster R-CNN does not need it)
    backbone = torch.nn.Sequential(*list(backbone.children())[:-2])

    # Define the number of output channels for the backbone
    backbone.out_channels = 2048

    # Create an anchor generator for the RPN
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    # Define the RoI Pooler
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=["0"],
        output_size=7,
        sampling_ratio=2
    )

    for param in backbone.parameters():
        param.requires_grad = False

    # Create Faster R-CNN with the Resnet-152 backbone
    model = FasterRCNN(
        backbone,
        num_classes=num_classes,  # Adjust this to your custom dataset's number of classes
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )
   
    return model

In [52]:
# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data_path,transform):
        # Load annotations from JSON
        with open(data_path) as f:
            data = json.load(f)
        self.images = data['images']
        self.annotations = data['annotations']
        self.categories = {cat['id']: cat['name'] for cat in data['categories']}
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Load image and annotations
        img_info = self.images[idx]
        mg_location = os.path.join('D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Dataset\\VG\\VG_100K',img_info['file_name'])
        image = Image.open(mg_location).convert("RGB")

        #apply the transfromation to the image
        if self.transform is not None:
            image = self.transform(image)

        target = {
            "boxes": torch.empty((0, 4), dtype=torch.float32),  # Empty tensor with shape [0, 4]
            "labels": torch.empty((0,), dtype=torch.int64),  # Empty tensor with shape [0]
            "image_id": torch.tensor([img_info['id']]),
            "area": torch.empty((0,), dtype=torch.float32),  # Empty tensor with shape [0]
            "iscrowd": torch.empty((0,), dtype=torch.int64)  # Empty tensor with shape [0]
        }

        for ann in self.annotations:
            if ann['image_id'] == img_info['id']:
                # Convert bbox values to integers
                bbox = [int(round(val)) for val in ann["bbox"]]
                # Transform (x, y, w, h) to (x1, y1, x2, y2)
                x, y, w, h = bbox
                x1, y1 = x, y
                x2, y2 = x + w, y + h
                target["boxes"] = torch.vstack((target["boxes"], torch.tensor([x1, y1, x2, y2])))
                target["labels"] = torch.hstack((target["labels"], torch.tensor(ann["category_id"])))
                target["area"] = torch.hstack((target["area"], torch.tensor(ann["area"])))
                target["iscrowd"] = torch.hstack((target["iscrowd"], torch.tensor(ann["iscrowd"])))

        # Convert lists to tensors
        for key in target:
            if key != "image_id":
                target[key] = torch.tensor(target[key])

        return image, target


In [53]:
def custom_collate_fn(batch):
    """
    Custom collate function to handle varying sizes of the targets within the batch.
    """
    images = []
    targets = []
    
    for item in batch:
        images.append(item[0])
        targets.append(item[1])
    
    return images, targets


In [54]:
transform = T.Compose([T.Resize((800, 800)), T.ToTensor(), T.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))])
dataset = CustomDataset('D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\instances_vg3k_cocoaligned_train.json',transform=transform)

In [55]:
from torchvision.datasets import CocoDetection

# Check if GPU is available and set device accordingly
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#transform = T.Compose([T.Resize((800, 800)), T.ToTensor(), T.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225))])
# Load the dataset
#dataset = CocoDetection(root='D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Dataset\\VG\\VG_100K', annFile='D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\instances_vg3k_cocoaligned_train.json', transform=transform)
#dataset_test = CocoDetection(root='D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Dataset\\VG\\VG_100K', annFile='D:\\WORK\\M.SC\\MSC_Project\\GitHub\\Image2Description\\instances_vg3k_cocoaligned_val.json', transform=transform)

In [56]:

# Define data loaders
data_loader = DataLoader(dataset, batch_size=8, shuffle=True,collate_fn=custom_collate_fn)
#data_loader_test = DataLoader(dataset_test, batch_size=8, shuffle=False)
model=get_model(3000)
model = model.to(device)  # Move the model to GPU

# Set up optimizer and learning rate scheduler
# Define your optimizer and scheduler
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

loss_fn = torch.nn.CrossEntropyLoss()

# Train your model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, targets in data_loader:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        losses.backward()
        optimizer.step()

    # Update learning rate
    lr_scheduler.step()

    # Print loss
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {losses.item()}")

# Save your trained model
torch.save(model.state_dict(), 'trained_model.pth')

# Evaluation
model.eval()
results = []

'''for images, targets in data_loader_test:
    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    outputs = model(images)
    results.extend(outputs)'''

# Process results as needed

  target[key] = torch.tensor(target[key])


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Move model to GPU
print(model)
total_params = sum(p.numel() for p in model.parameters())
print("Total parameters:", total_params)
for name, param in model.named_parameters():
    print(name, param.shape)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (