In [1]:
# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# Data Loader


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as TF
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from PIL import Image
import os

class VHRDataset(Dataset):
    def __init__(self, positive_img_dir, negative_img_dir, gt_dir, transform=None):
        """
        Args:
            positive_img_dir (string): Directory with all the positive images.
            negative_img_dir (string): Directory with all the negative images.
            gt_dir (string): Directory with all the ground truth files for positive images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.positive_img_dir = positive_img_dir
        self.negative_img_dir = negative_img_dir
        self.gt_dir = gt_dir
        self.transform = transform
        
        # List of all images and their types (positive/negative)
        self.imgs = [(os.path.join(positive_img_dir, f), 'positive') for f in os.listdir(positive_img_dir) if f.endswith('.jpg')]
        self.imgs += [(os.path.join(negative_img_dir, f), 'negative') for f in os.listdir(negative_img_dir) if f.endswith('.jpg')]

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img_path, img_type = self.imgs[idx]
        image = Image.open(img_path).convert('RGB')
        
        boxes = []
        labels = []
        
        if img_type == 'positive':
            gt_path = os.path.join(self.gt_dir, os.path.basename(img_path).replace('.jpg', '.txt'))
            with open(gt_path, 'r') as file:
                for line in file:
                    if line == '\n':
                        continue
                    x1, y1, x2, y2, a = line.strip().split(',')
                    x1, x2 = x1.strip('('), x2.strip('(')
                    y1, y2 = y1.strip(')').strip(), y2.strip(')').strip()
                    boxes.append([int(x1), int(y1), int(x2), int(y2)])
                    labels.append(int(a))

        sample = {'image': image, 'boxes': boxes, 'labels': labels}
        if self.transform:
            sample = self.transform(sample)

        return sample

class MyTransform:
    """Apply transformations to the image and adjust bounding boxes."""
    def __init__(self, output_size):
        self.output_size = output_size

    def __call__(self, sample):
        image, boxes, labels = sample['image'], sample['boxes'], sample['labels']
        w, h = image.size
        new_h, new_w = self.output_size if isinstance(self.output_size, tuple) else (self.output_size, self.output_size)

        # Resize the image
        image = image.resize((new_w, new_h), Image.BILINEAR)

        # Scale the bounding boxes
        new_boxes = []
        for box in boxes:
            scaled_box = [
                box[0] * new_w / w, box[1] * new_h / h,
                box[2] * new_w / w, box[3] * new_h / h
            ]
            new_boxes.append(scaled_box)

        # Convert image eto tensor and normalize
        image = TF.to_tensor(image)
        image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        return {'image': image, 'boxes': torch.tensor(new_boxes, dtype=torch.float32), 'labels': torch.tensor(labels, dtype=torch.int8)}

# Used to combine several samples into a single batched tensor to be processed by the model.
def collate_fn(batch):
    images = [item['image'] for item in batch]
    boxes = [item['boxes'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad boxes and labels to the maximum length so that they are of uniform size
    max_boxes_len = max(len(b) for b in boxes) # 25
    max_boxes = 67
    assert max_boxes_len <= max_boxes, f"Oh shit, {max_boxes_len} labels"
    padded_boxes = []
    padded_labels = []
    for b, l in zip(boxes, labels):
        if b.ndim == 2:
            b = torch.nn.functional.pad(b, (0, 0, 0, max_boxes - b.size(0)))
            l = torch.nn.functional.pad(l, (0, max_boxes - l.size(0)))
        else:
            b = torch.zeros((max_boxes, 4))
            l = torch.zeros((max_boxes,  ))
    
        padded_boxes.append(b)
        padded_labels.append(l)
    
    # Convert padded lists to tensors
    padded_boxes = torch.stack(padded_boxes)
    padded_labels = torch.stack(padded_labels)

    return {'image': torch.stack(images), 'boxes': padded_boxes, 'labels': padded_labels}

dataset = VHRDataset(
    positive_img_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/positive image set',
    negative_img_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/negative image set',
    gt_dir='/home/dan/projects/cse4830/RI-CNN/NWPU VHR-10 dataset/ground truth',
    transform = MyTransform((512, 384))
)

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from RI_CNN import *

"""
This model is a "Single Shot Detector" which means it tries to predict both the bounding
boxes and the class labels at the time time. Earlier architectures would first try to
predict the regions that contained objects, and then those regions would be classified.
This was very complicated and not as fast as a single shot approach.
"""

# Standard model with conventional convolutional layers
class MrEngineerMan(nn.Module):
    def __init__(self, img_height=384, img_width=512, num_boxes=67, num_classes=11):
        super(MrEngineerMan, self).__init__()
        self.num_boxes = num_boxes
        self.num_classes = num_classes
        self.img_height = img_height
        self.img_width = img_width

        # Standard Convolutional Layers
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # This will reduce the dimension by half each time it's applied

        # For better gradients
        self.batch_norm1 = nn.BatchNorm2d(16)
        self.batch_norm2 = nn.BatchNorm2d(32)
        self.batch_norm3 = nn.BatchNorm2d(64)

        # Calculate the output size after convolutions and pooling
        out_size = self.calculate_conv_output_size()

        # Dense Layers
        self.fc1_bbox = nn.Linear(out_size, 256)
        self.fc2_bbox = nn.Linear(256, num_boxes * 4)
        self.fc1_class = nn.Linear(out_size, 256)
        self.fc2_class = nn.Linear(256, num_boxes * num_classes)
        self.fc1_conf = nn.Linear(out_size, 256)
        self.fc2_conf = nn.Linear(256, num_boxes)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.pool(x))  # Apply pooling and activation
        x = self.batch_norm1(x)
        x = self.conv2(x)
        x = F.relu(self.pool(x))  # Apply second pooling and activation
        x = self.batch_norm2(x)
        x = self.conv3(x)
        x = F.relu(self.pool(x))  # Third pooling and activation
        x = self.batch_norm3(x)

        x = x.view(x.size(0), -1)  # Flatten the output for dense layers

        bbox = F.relu(self.fc1_bbox(x))
        cls = F.relu(self.fc1_class(x))
        conf = F.relu(self.fc1_conf(x))
        
        bbox = F.relu(self.fc2_bbox(bbox)).view(8, self.num_boxes, -1)
        cls = F.relu(self.fc2_class(cls)).view(8, self.num_boxes, -1)
        conf = F.relu(self.fc2_conf(conf)).unsqueeze(-1)

        cls = F.softmax(cls)
        conf = F.sigmoid(conf)
        
        return torch.cat((bbox, conf, cls), dim=-1)

    def calculate_conv_output_size(self):
        size = (self.img_height // 2 // 2 // 2, self.img_width // 2 // 2 // 2)
        return size[0] * size[1] * 64
    
# Special model with Vector based convolutional layers
class SpecialEngineerMan(nn.Module):
    def __init__(self, img_height=384, img_width=512, num_boxes=67, num_classes=10):
        super(SpecialEngineerMan, self).__init__()
        self.num_boxes = num_boxes
        self.num_classes = num_classes
        self.img_height = img_height
        self.img_width = img_width

        # Standard Convolutional Layers
        self.conv1 = VectorTransformConv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = VectorConv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = VectorConv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = VectorMaxPool2d(2, 2)  # This will reduce the dimension by half each time it's applied

        # For better gradients
        self.batch_norm1 = VectorBatchNorm2d(16)
        self.batch_norm2 = VectorBatchNorm2d(32)
        self.batch_norm3 = nn.BatchNorm2d(64)

        # Calculate the output size after convolutions and pooling
        out_size = self.calculate_conv_output_size()

        # Dense Layers
        self.fc1 = nn.Linear(out_size, 512)
        self.fc2 = nn.Linear(512, num_boxes * (4 + 1 + num_classes))  # 4 for bbox, 1 for confidence, num_classes for class probs

    def forward(self, x):
        x = self.conv1(x)
        x = vector_relu(self.pool(x))  # Apply pooling and activation
        x = self.batch_norm1(x)
        x = self.conv2(x)
        x = vector_relu(self.pool(x))  # Apply second pooling and activation
        x = self.batch_norm2(x)
        x = self.conv3(x)
        x = vector_relu(self.pool(x))  # Third pooling and activation

        # Drop the phase and normalize
        x = x[..., 0]
        x = self.batch_norm3(x)

        # The decision head
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x.view(-1, self.num_boxes, 5 + self.num_classes)  # Reshape for output format

    def calculate_conv_output_size(self):
        size = (self.img_height // 2 // 2 // 2, self.img_width // 2 // 2 // 2)
        return size[0] * size[1] * 64

In [6]:
import torch.optim as optim
from loss import DetectionLoss
from dataloader import MrDataHead

# Hyperparameters
num_epochs = 20
learning_rate = 0.0001

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model
model = MrEngineerMan().to(device)
#model = SpecialEngineerMan().to(device)

# Loss and Optimizer
criterion = DetectionLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Load data
train_loader = MrDataHead(batch_size=8)

# Training loop
def train_the_engineer():
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            if (i+1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():,.4f}')
                
        print(f"Ending Epoch {epoch+1} with Loss {loss.item():,.4f}")

# Save the model checkpoint
def save_the_wisdom():
    torch.save(model.state_dict(), 'mr_engineerman.ckpt')
    print("Wisdom stored safely in 'mr_engineerman.ckpt'")

if __name__ == '__main__':
    train_the_engineer()
    save_the_wisdom()

  padded_boxes[i, :num_boxes] = torch.tensor(boxes[i], dtype=torch.float32)
  padded_labels[i, :num_boxes] = torch.tensor(labels[i], dtype=torch.int64)
  padded_confidences[i, :num_boxes] = torch.tensor(confidences[i], dtype=torch.float32)
  cls = F.softmax(cls)


0.3000604212284088 1285.395263671875 5.8289594650268555


  torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=1.0)


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 7.75 GiB of which 1.56 MiB is free. Process 6608 has 5.63 GiB memory in use. Including non-PyTorch memory, this process has 2.10 GiB memory in use. Of the allocated memory 1.91 GiB is allocated by PyTorch, and 5.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)