In [1]:
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# Data Loader


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import os

class VHRDataset(Dataset):
    def __init__(self, positive_img_dir, negative_img_dir, gt_dir, transform=None):
        """
        Args:
            positive_img_dir (string): Directory with all the positive images.
            negative_img_dir (string): Directory with all the negative images.
            gt_dir (string): Directory with all the ground truth files for positive images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.positive_img_dir = positive_img_dir
        self.negative_img_dir = negative_img_dir
        self.gt_dir = gt_dir
        self.transform = transform
        
        # List of all images and their types (positive/negative)
        self.imgs = [(os.path.join(positive_img_dir, f), 'positive') for f in os.listdir(positive_img_dir) if f.endswith('.jpg')]
        self.imgs += [(os.path.join(negative_img_dir, f), 'negative') for f in os.listdir(negative_img_dir) if f.endswith('.jpg')]

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img_path, img_type = self.imgs[idx]
        image = Image.open(img_path).convert('RGB')
        
        boxes = []
        labels = []
        
        if img_type == 'positive':
            gt_path = os.path.join(self.gt_dir, os.path.basename(img_path).replace('.jpg', '.txt'))
            with open(gt_path, 'r') as file:
                for line in file:
                    if line == '\n':
                        continue
                    x1, y1, x2, y2, a = line.strip().split(',')
                    x1, x2 = x1.strip('('), x2.strip('(')
                    y1, y2 = y1.strip(')').strip(), y2.strip(')').strip()
                    boxes.append([int(x1), int(y1), int(x2), int(y2)])
                    labels.append(int(a))

        sample = {'image': image, 'boxes': boxes, 'labels': labels}
        if self.transform:
            sample = self.transform(sample)

        return sample

class MyTransform:
    """Apply transformations to the image and adjust bounding boxes."""
    def __init__(self, output_size):
        self.output_size = output_size

    def __call__(self, sample):
        image, boxes, labels = sample['image'], sample['boxes'], sample['labels']
        w, h = image.size
        new_h, new_w = self.output_size if isinstance(self.output_size, tuple) else (self.output_size, self.output_size)

        # Resize the image
        image = image.resize((new_w, new_h), Image.BILINEAR)

        # Scale the bounding boxes
        new_boxes = []
        for box in boxes:
            scaled_box = [
                box[0] * new_w / w, box[1] * new_h / h,
                box[2] * new_w / w, box[3] * new_h / h
            ]
            new_boxes.append(scaled_box)

        # Convert image eto tensor and normalize
        image = TF.to_tensor(image)
        image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        return {'image': image, 'boxes': torch.tensor(new_boxes, dtype=torch.float32), 'labels': torch.tensor(labels, dtype=torch.int8)}

# Used to combine several samples into a single batched tensor to be processed by the model.
def collate_fn(batch):
    images = [item['image'] for item in batch]
    boxes = [item['boxes'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad boxes and labels to the maximum length so that they are of uniform size
    max_boxes_len = max(len(b) for b in boxes) # 25
    max_boxes = 67
    assert max_boxes_len <= max_boxes, f"Oh shit, {max_boxes_len} labels"
    padded_boxes = []
    padded_labels = []
    for b, l in zip(boxes, labels):
        if b.ndim == 2:
            b = torch.nn.functional.pad(b, (0, 0, 0, max_boxes - b.size(0)))
            l = torch.nn.functional.pad(l, (0, max_boxes - l.size(0)))
        else:
            b = torch.zeros((max_boxes, 4))
            l = torch.zeros((max_boxes,  ))
    
        padded_boxes.append(b)
        padded_labels.append(l)
    
    # Convert padded lists to tensors
    padded_boxes = torch.stack(padded_boxes)
    padded_labels = torch.stack(padded_labels)

    return {'image': torch.stack(images), 'boxes': padded_boxes, 'labels': padded_labels}

dataset = VHRDataset(
    positive_img_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/positive image set',
    negative_img_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/negative image set',
    gt_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/ground truth',
    transform = MyTransform((512, 384))
)

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

"""
This model is a "Single Shot Detector" which means it tries to predict both the bounding
boxes and the class labels at the time time. Earlier architectures would first try to
predict the regions that contained objects, and then those regions would be classified.
This was very complicated and not as fast as a single shot approach.
"""

class ObjectDetectionCNN(nn.Module):
    def __init__(self, num_classes):
        super(ObjectDetectionCNN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Fully connected layers for bounding box regression
        self.fc1_bbox = nn.Linear(128 * 48 * 64, 500)
        self.fc2_bbox = nn.Linear(500, 4 * 67)

        # Fully connected layers for class prediction
        self.fc1_cls = nn.Linear(128 * 48 * 64, 500)
        self.fc2_cls = nn.Linear(500, num_classes * 67)

    def forward(self, x):
        # Apply convolutional layers
        x = self.pool(F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x))
        x = self.pool(F.relu(self.conv3(x))

        # Flatten the feature map
        x = x.view(x.size(0), -1)

        # Bounding box regression
        bbox = F.relu(self.fc1_bbox(x))
        bbox = self.FC2_bbox(bbox)
        bbox = bbox.view(-1, 67, 4)

        # Class prediction
        cls = F.relu(self.fc1_cls(x))
        cls = self.fc2_cls(x)
        cls = cls.view(-1, 67, num_classes)

        return bbox, cls

In [5]:
for batch in dataloader:
    images = batch['image']
    boxes = batch['boxes']
    labels = batch['labels']
    # Process images, boxes, and labels
    # Example: forward pass in your model

    print(images.shape, boxes.shape, labels.shape)


torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([4, 67])
torch.Size([4, 3, 512, 384]) torch.Size([4, 67, 4]) torch.Size([

In [6]:
def visualize_image_with_annotations(image, annotations):
    to_pil = ToPILImage()
    # Convert tensor image to PIL image
    image_pil = to_pil(image)

    # Create figure and axes
    fig, ax = plt.subplots(1)

    # Display the image
    ax.imshow(image_pil)

    # Plot annotations
    for annotation in annotations:
        category_id = annotation['category_id']
        bbox = annotation['poly']

        # Extracting x and y coordinates separately
        x_coords = [bbox[i] for i in range(0, len(bbox), 2)]
        y_coords = [bbox[i] for i in range(1, len(bbox), 2)]

        # Convert coordinates to list explicitly
        x_coords = list(x_coords)
        y_coords = list(y_coords)

        # Create a Polygon patch
        polygon = patches.Polygon(list(zip(x_coords, y_coords)), linewidth=1, edgecolor='r', facecolor='none')

        # Add the patch to the Axes
        ax.add_patch(polygon)

        # Add category label
        ax.text(min(x_coords), min(y_coords), str(category_id), fontsize=10, color='w', verticalalignment='bottom')

    # Show plot
    plt.show()

transform = transforms.Compose([
    #transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

root_dir = '/content/drive/My Drive/Dataset'
train_dataset = CustomDataset(root_dir, split='train', transform=transform)

x = train_dataset.__getitem__(1)
image_tensor = x[0]
#print(image_tensor)
annotations = x[1]
#print(annotations)

# Visualize the image with bounding box annotations
visualize_image_with_annotations(image_tensor, annotations['annotations'])

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Dataset/Annotations/train'

# Data Preprocess

In [None]:
class CustomDatasetV2(Dataset):
    def __init__(self, root_dir, split='train', transform=None):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform

        # Load annotation file names for the specified split
        annotation_dir = os.path.join(root_dir, 'Annotations', split)
        self.annotation_files = sorted(os.listdir(annotation_dir))

    def __len__(self):
        return len(self.annotation_files)

    def __getitem__(self, idx):
        annotation_name = self.annotation_files[idx]
        img_name = annotation_name.replace('.json', '.jpg')

        # Load image
        img_path = os.path.join(self.root_dir, 'Images', img_name)
        image = Image.open(img_path).convert("RGB")

        # Load JSON annotations
        annotation_path = os.path.join(self.root_dir, 'Annotations', self.split, annotation_name)
        with open(annotation_path, 'r') as f:
            annotations = json.load(f)

        # Extract bounding box coordinates and class labels
        bbox_coords = []
        class_labels = []
        for annotation in annotations['annotations']:
            bbox = annotation['poly']
            bbox_coords.append(bbox)
            class_labels.append(annotation['category_id'])

        target = {
            'boxes': bbox_coords,
            'labels': class_labels
        }

        if self.transform:
            target = self.adjust_bbox(target, image.size)
            image = self.transform(image)

        return image, target
    def adjust_bbox(self, target, new_size):
        size = 224
        new_width, new_height = new_size

        # Calculate scaling factors
        width_ratio = size / new_width  # Adjust accordingly if resizing
        height_ratio = size / new_height  # Adjust accordingly if resizing

        adjusted_bboxes = []
        for bbox in target['boxes']:
            adjusted_bbox = []
            for i in range(0, len(bbox), 2):  # Iterate over x and y coordinates separately
                x_coord = bbox[i] * width_ratio
                y_coord = bbox[i + 1] * height_ratio
                adjusted_bbox.extend([x_coord, y_coord])  # Add adjusted coordinates
            adjusted_bboxes.append(adjusted_bbox)

        return {'boxes': adjusted_bboxes, 'labels': target['labels']}



In [None]:
root_dir = '/content/drive/My Drive/Dataset'

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjust size as needed
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = CustomDatasetV2(root_dir, split='train', transform=transform)
val_dataset = CustomDatasetV2(root_dir, split='val', transform=transform)
test_dataset = CustomDatasetV2(root_dir, split='test', transform=transform)
print(len(train_dataset.__getitem__(6)[1]['labels']))
print((train_dataset.__getitem__(2)[0]))

294
tensor([[[-1.1932, -1.1418, -1.1589,  ..., -0.7993, -0.6281, -0.6281],
         [-1.1760, -1.1418, -1.1589,  ..., -0.8849, -0.7137, -0.6452],
         [-1.1589, -1.1075, -1.1418,  ..., -0.9534, -0.7308, -0.3541],
         ...,
         [ 0.0398,  0.1083,  0.1254,  ..., -1.0904, -1.1247, -1.1760],
         [ 0.0912,  0.1083,  0.1083,  ..., -1.0904, -1.1247, -1.1247],
         [ 0.0741,  0.0912,  0.0912,  ..., -1.0904, -1.1075, -1.1075]],

        [[-0.7577, -0.7052, -0.7227,  ..., -0.5476, -0.3375, -0.4076],
         [-0.7402, -0.7052, -0.7052,  ..., -0.5826, -0.4426, -0.4076],
         [-0.7052, -0.6877, -0.7052,  ..., -0.6702, -0.6001, -0.3200],
         ...,
         [-0.0224,  0.0476,  0.0476,  ..., -0.7927, -0.8102, -0.8452],
         [-0.0049,  0.0301,  0.0126,  ..., -0.7927, -0.8102, -0.8277],
         [-0.0049,  0.0126, -0.0049,  ..., -0.8102, -0.8102, -0.7927]],

        [[-1.0201, -0.9853, -1.0201,  ..., -0.5495, -0.2707, -0.2532],
         [-1.0201, -1.0027, -0.9853,  ...

In [None]:
batch_size=32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True)

# Basic CNN


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BBoxClassificationCNN(nn.Module):
    def __init__(self, num_classes):
        super(BBoxClassificationCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.relu(F.max_pool2d(self.conv3(x), 2))
        x = x.view(-1, 64 * 28 * 28)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [None]:
# Define model, loss function, and optimizer
model = BBoxClassificationCNN(10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
        optimizer.zero_grad()
        outputs = model(images)
        num_bboxes = len(targets[1]['labels'])  # Get the number of bounding boxes
        num_outputs = outputs.size(0)  # Get the batch size of the model's output

        # Ensure that the number of bounding boxes matches the batch size of the model's output
        if num_bboxes != num_outputs:
            continue  # Skip this batch if the number of bounding boxes doesn't match

        labels = torch.tensor(targets[1]['labels'], dtype=torch.int64)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Training Loss: {epoch_loss:.4f}")





# Evaluation loop
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, targets in tqdm(val_loader, desc="Evaluation", leave=False):
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += len(targets[1]['labels'])  # Update total using the number of labels per image
        correct += (predicted == torch.tensor(targets['labels'])).sum().item()

accuracy = correct / total
print(f"Accuracy on evaluation set: {accuracy:.2%}")




Training Loss: 0.0000




Training Loss: 0.0684




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




Training Loss: 0.0000




TypeError: list indices must be integers or slices, not str