# LEGO Detection - Faster R-CNN

### Libraries

In [None]:
# import libraries

import os
import cv2
import torch
import torchvision

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from google.colab import drive
from xml.etree import ElementTree as ET
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.ops.boxes as box_ops
from tqdm import tqdm
from torchvision.transforms import functional as F

### Data preparation

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
# Later cells read the dataset archive and the train/test split CSV from
# /content/drive/MyDrive.

drive.mount('/content/drive')

In [None]:
# Extract the tagged dataset archive from Drive into /content (notebook shell
# escape). Presumably this creates /content/photos and /content/renders, the
# directories configured in the next cell - confirm against the archive.

!unzip "/content/drive/MyDrive/02 - tagged1.zip" -d "/content"

In [None]:
# Root directories of the dataset.
# photos_dir is walked by load_data(); renders_dir is not used by any cell
# visible in this notebook.

photos_dir = '/content/photos'
renders_dir = '/content/renders'

# Relative paths to use instead when the notebook is run outside Colab:
# photos_dir = 'content/photos' # running locally
# renders_dir = 'content/renders' # running locally

In [None]:
def load_data(data_dir):
    '''
    Collect every ``.jpg`` image under ``data_dir`` together with its LEGO count.

    The count of an image is the integer name of the directory that directly
    contains it (e.g. ``photos/3/img.jpg`` -> 3).

    Args:
        data_dir: root directory, walked recursively.

    Returns:
        A tuple ``(image_paths, num_legos)``: a NumPy array of image paths
        sorted by path and an int64 tensor with the count of each image, in
        the same order. Both are empty when no image is found.

    Raises:
        ValueError: if an image sits in a directory whose name is not an
            integer.
    '''
    samples = []
    for subdir, _, files in os.walk(data_dir):
        jpg_files = [name for name in files if name.endswith('.jpg')]
        if not jpg_files:
            continue
        # the directory name is only parsed when it actually holds images,
        # so unrelated folders without images never raise
        count = int(os.path.basename(subdir))
        samples.extend((os.path.join(subdir, name), count) for name in jpg_files)

    # zip(*[]) cannot be unpacked into two names, so handle "no images" explicitly
    if not samples:
        return np.asarray([], dtype=str), torch.empty(0, dtype=torch.int64)

    samples.sort()
    image_paths, num_legos = zip(*samples)
    return np.asarray(image_paths), torch.tensor(num_legos, dtype=torch.int64)

In [None]:
def parse_xml(xml_file):
    '''
    Extract the bounding boxes stored in one annotation file.

    Every ``object`` element directly under the root contributes one box,
    read from its ``bndbox`` child.

    Args:
        xml_file: path (or file object) of the XML annotation.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax]`` integer lists, one per object,
        in document order.
    '''
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    annotation = ET.parse(xml_file).getroot()
    return [
        [int(obj.find('bndbox').find(tag).text) for tag in corner_tags]
        for obj in annotation.findall('object')
    ]

In [None]:
def parse_all_xml(image_paths):
    '''
    Parse the XML annotation that accompanies each image.

    The annotation of an image is the file with the same path and the
    extension replaced by ``.xml``.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        A list with one entry per image, each being the list of bounding
        boxes returned by ``parse_xml`` for that image.
    '''
    # Swap only the file extension: str.replace('.jpg', '.xml') would also
    # rewrite a '.jpg' that happens to appear in a directory name.
    return [
        parse_xml(os.path.splitext(img_path)[0] + '.xml')
        for img_path in image_paths
    ]

In [None]:
# Load the sorted image paths and the LEGO count of each image
# (the count comes from the name of the folder containing the image).

image_paths, num_legos = load_data(photos_dir)

In [None]:
# Parse the bounding boxes of every image; bounding_boxes[i] holds the boxes
# of image_paths[i].

bounding_boxes = parse_all_xml(image_paths)

In [None]:
# class distribution in overall data

# One bin per LEGO count. With align='left' the bar of bin [k, k+1) is drawn
# at k, so the edges must run up to max + 1; stopping at max - 1 dropped the
# images with the largest count and merged the second largest into its
# neighbour.
plt.hist(num_legos, bins=range(1, int(max(num_legos)) + 2), align='left', rwidth=0.8)
plt.xlabel('Number of LEGOs')
plt.ylabel('Frequency')
plt.title('LEGO Distribution')
plt.show()

In [None]:
# work with defined train test split

train_test_split = np.genfromtxt(
    '/content/drive/MyDrive/train_test_split.csv',
    delimiter=',',
    dtype=None,
    encoding=None,
)
# train_test_split = np.genfromtxt('content/train_test_split.csv', delimiter=',', dtype=None, encoding=None) # running locally

# The second column flags the split: '0' -> train, '1' -> test. Rows with any
# other value are ignored. The stored id is the row position minus one
# (presumably to skip a header row - confirm against the CSV).
split_by_flag = {'0': 'train', '1': 'test'}
train_test_ids = {'train': [], 'test': []}
for row_number, row in enumerate(train_test_split):
    split_name = split_by_flag.get(row[1])
    if split_name is not None:
        train_test_ids[split_name].append(row_number - 1)

len(train_test_ids['train']), len(train_test_ids['test'])

In [None]:
# class distribution in training data

num_legos_train = num_legos[train_test_ids['train']]
# Bin edges run up to max + 1 so that, with align='left', every count
# (including the largest one) gets its own bar.
plt.hist(num_legos_train, bins=range(1, int(max(num_legos_train)) + 2), align='left', rwidth=0.8)
plt.xlabel('Number of LEGOs')
plt.ylabel('Frequency')
plt.title('LEGO Training Distribution')
plt.show()

In [None]:
# undersampling of larger classes in training data

# Keep at most max_per_class randomly chosen training images for the
# 1-LEGO and 2-LEGO classes; every other class is left untouched.
max_per_class = 100

indices1 = [i for i in train_test_ids['train'] if num_legos[i] == 1]
indices2 = [i for i in train_test_ids['train'] if num_legos[i] == 2]

np.random.shuffle(indices1)
np.random.shuffle(indices2)

leftovers1 = indices1[max_per_class:]
leftovers2 = indices2[max_per_class:]

# Filter once against a set instead of calling list.remove() per index
# (each remove is a linear scan). Slice assignment keeps the same list object.
dropped = set(leftovers1) | set(leftovers2)
train_test_ids['train'][:] = [i for i in train_test_ids['train'] if i not in dropped]

num_legos_train = num_legos[train_test_ids['train']]
# Bin edges run up to max + 1 so that, with align='left', every count
# (including the largest one) gets its own bar.
plt.hist(num_legos_train, bins=range(1, int(max(num_legos_train)) + 2), align='left', rwidth=0.8)
plt.xlabel('Number of LEGOs')
plt.ylabel('Frequency')
plt.title('LEGO Training Distribution (Undersampling)')
plt.show()

In [None]:
# validation set

# Shuffle the held-out ids in place, then keep the first 40% as the test set
# and use the remaining ones for validation.
indices = train_test_ids['test']
np.random.shuffle(indices)

test_size = 0.4 * len(indices)
split = int(np.floor(test_size))
train_test_ids.update(valid=indices[split:], test=indices[:split])

len(train_test_ids['train']), len(train_test_ids['valid']), len(train_test_ids['test'])

In [None]:
# bounding boxes for train, valid, and test sets

# One list of per-image boxes for each subset, in the order of that subset's ids.
train_boxes, valid_boxes, test_boxes = (
    [bounding_boxes[i] for i in train_test_ids[subset]]
    for subset in ('train', 'valid', 'test')
)

In [None]:
class LegosDataset(Dataset):
    '''
    Detection dataset of LEGO images.

    Each item is ``(image, target)``. ``image`` is whatever the selected
    transform returns. ``target`` is a dict with ``boxes``, a float32 tensor
    of shape (N, 4) in ``[xmin, ymin, xmax, ymax]`` order rescaled to
    ``image_size``, and ``labels``, an int64 tensor of N ones (one foreground
    class).

    Args:
        images_filenames: sequence of image file paths.
        num_legos: per-image LEGO counts, stored as ``self.labels``.
        bounding_boxes: per-image lists of ``[xmin, ymin, xmax, ymax]`` boxes
            in original-image pixel coordinates.
        transforms: list of callables applied to the RGB image array.
        augmented: per-image index into ``transforms`` selecting which
            callable is applied to that image.
        image_size: ``(height, width)`` the transforms resize images to; the
            boxes are rescaled to this size, so it must agree with the
            transforms.
    '''
    def __init__(self, images_filenames, num_legos, bounding_boxes, transforms=None, augmented=None,
                 image_size=(224, 224)):
        self.images_filenames = images_filenames
        self.labels = num_legos
        self.bounding_boxes = bounding_boxes
        # None instead of a literal [] default: a default list is created once
        # and would be shared (and mutated) by every dataset instance
        self.transforms = [] if transforms is None else transforms
        self.augmented = [] if augmented is None else augmented
        self.image_size = image_size

    def __len__(self):
        return len(self.images_filenames)

    def __getitem__(self, id):
        image_filename = self.images_filenames[id]
        image = cv2.imread(image_filename)
        # cv2.imread signals failure by returning None rather than raising
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_filename}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        original_height, original_width = image.shape[:2]
        transformation = self.transforms[self.augmented[id]]
        image = transformation(image)

        target_height, target_width = self.image_size
        scale_w = target_width / original_width
        scale_h = target_height / original_height
        scale = torch.tensor([scale_w, scale_h, scale_w, scale_h], dtype=torch.float32)
        # reshape(-1, 4) keeps the (N, 4) layout even for an image without
        # boxes; coordinates stay floating point so that a thin box is not
        # truncated into a zero-width / zero-height one
        boxes = torch.tensor(self.bounding_boxes[id], dtype=torch.float32).reshape(-1, 4) * scale
        target = {
            'boxes': boxes,
            # one label per box, so boxes and labels always have the same
            # length even if the folder count and the annotation disagree
            'labels': torch.ones((boxes.shape[0],), dtype=torch.int64)
        }
        return image, target

In [None]:
# train, valid and test sets

# torchvision detection models expect images in the 0-1 range and apply the
# ImageNet mean/std normalisation themselves, so the pipelines below stop at
# ToTensor(); normalising here as well would normalise every image twice.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# same geometry as `transform` (boxes stay valid) plus photometric noise
augment = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))], p=0.5),
    transforms.RandomGrayscale(p=0.1),
    transforms.ToTensor(),
])

# augmented[i] selects the entry of `transforms` used for image i; every
# original image starts with index 0 (the plain transform)
train_dataset = LegosDataset(image_paths[train_test_ids['train']], num_legos[train_test_ids['train']], train_boxes,
                             transforms=[transform, augment], augmented=[0]*len(image_paths[train_test_ids['train']]))

valid_dataset = LegosDataset(image_paths[train_test_ids['valid']], num_legos[train_test_ids['valid']], valid_boxes,
                             transforms=[transform], augmented=[0]*len(image_paths[train_test_ids['valid']]))

test_dataset = LegosDataset(image_paths[train_test_ids['test']], num_legos[train_test_ids['test']], test_boxes,
                            transforms=[transform], augmented=[0]*len(image_paths[train_test_ids['test']]))

In [None]:
def generate_data(image_paths, num_legos, bounding_boxes, copies=5, min_legos=6):
    '''
    Build the extra samples used to oversample images with many LEGOs.

    Every image whose count is at least ``min_legos`` is repeated ``copies``
    times; images below the threshold contribute nothing.

    Args:
        image_paths: sequence of image paths.
        num_legos: LEGO count of each image, aligned with ``image_paths``.
        bounding_boxes: boxes of each image, aligned with ``image_paths``.
        copies: number of duplicates emitted per selected image.
        min_legos: smallest count (inclusive) that gets duplicated.

    Returns:
        A tuple ``(new_image_paths, new_num_legos, new_bounding_boxes)`` of
        lists holding only the duplicated entries (the originals are not
        included).
    '''
    new_image_paths = []
    new_num_legos = []
    new_bounding_boxes = []
    for path, count, boxes in zip(image_paths, num_legos, bounding_boxes):
        if count >= min_legos:
            new_image_paths.extend([path] * copies)
            new_num_legos.extend([count] * copies)
            new_bounding_boxes.extend([boxes] * copies)
    return new_image_paths, new_num_legos, new_bounding_boxes

In [None]:
# oversampling of smaller classes in training data - augmentation

new_image_paths, new_num_legos, new_bounding_boxes = generate_data(
    image_paths[train_test_ids['train']],
    num_legos[train_test_ids['train']],
    train_boxes
)

# Append all duplicates in one step; growing the array and the tensor one
# element at a time re-copies them on every iteration.
if new_image_paths:
    train_dataset.images_filenames = np.concatenate(
        (train_dataset.images_filenames, np.asarray(new_image_paths))
    )
    train_dataset.labels = torch.cat((
        train_dataset.labels,
        torch.tensor([int(n) for n in new_num_legos], dtype=torch.int64),
    ))
    train_dataset.bounding_boxes.extend(new_bounding_boxes)
    # index 1 selects the second transform of the dataset for the duplicates
    train_dataset.augmented.extend([1] * len(new_image_paths))

num_legos_train = train_dataset.labels
# Bin edges run up to max + 1 so that, with align='left', every count
# (including the largest one) gets its own bar.
plt.hist(num_legos_train, bins=range(1, int(max(num_legos_train)) + 2), align='left', rwidth=0.8)
plt.xlabel('Number of LEGOs')
plt.ylabel('Frequency')
plt.title('LEGO Training Distribution (Oversampling)')
plt.show()

In [None]:
# dataloaders

batch_size = 16
num_workers = 2


def collate_batch(batch):
    '''
    Regroup a list of (image, target) samples into an (images, targets) pair
    of tuples. The samples are not stacked into one tensor because each
    target holds its own number of boxes.
    '''
    return tuple(zip(*batch))


# A named function is used instead of a lambda so the collate function can be
# pickled when the worker processes are started with the "spawn" method.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_batch)

len(train_dataset), len(valid_dataset), len(test_dataset)

### Model definition

In [None]:
# Select the device used by the model and by every batch: CUDA when
# available, otherwise the CPU.

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

In [None]:
# faster r-cnn model

# Start from the pretrained Faster R-CNN (ResNet-50 + FPN) detector.
# NOTE(review): newer torchvision releases replace `pretrained=` with
# `weights=` - confirm against the installed version.
faster_rcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Replace the box predictor with a fresh one sized for this task; it reuses
# the input width of the existing classification layer.
in_features = faster_rcnn.roi_heads.box_predictor.cls_score.in_features
# presumably background + one LEGO class (the dataset labels every box as 1)
num_classes = 2
faster_rcnn.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [None]:
# Move the detector to the selected device; `model` is the name used by the
# training and evaluation cells.

model = faster_rcnn.to(device)

# bare expression: the notebook displays the module structure
model

### Model training

In [None]:
def compute_map(predictions, targets, iou_threshold=0.5):
    '''
    Compute the average precision (AP) of a batch of detections.

    Predictions are matched to ground-truth boxes image by image: within one
    image the predictions are visited by decreasing score and each one claims
    the still-unmatched ground-truth box with the highest IoU, counting as a
    true positive when that IoU reaches ``iou_threshold``. The matched flags
    of all images are then pooled, ranked by score, and AP is the area under
    the resulting precision/recall curve (precision made monotonically
    non-increasing). Class labels are ignored, so for a single foreground
    class this AP is also the mAP.

    Args:
        predictions: list of dicts with ``boxes`` (N, 4) and ``scores`` (N,),
            one per image.
        targets: list of dicts with ``boxes`` (M, 4), aligned with
            ``predictions``.
        iou_threshold: minimum IoU for a prediction to count as a match.

    Returns:
        The AP as a Python float in [0, 1]. Returns 0.0 when the batch has no
        ground-truth boxes or no predictions.
    '''
    score_chunks = []
    hit_chunks = []
    num_gt = 0
    for pred, target in zip(predictions, targets):
        gt_boxes = target['boxes'].cpu()
        pred_boxes = pred['boxes'].cpu()
        pred_scores = pred['scores'].cpu()
        num_gt += gt_boxes.shape[0]
        if pred_boxes.shape[0] == 0:
            continue

        order = torch.argsort(pred_scores, descending=True)
        pred_boxes, pred_scores = pred_boxes[order], pred_scores[order]
        hits = torch.zeros(pred_boxes.shape[0], dtype=torch.bool)
        if gt_boxes.shape[0] > 0:
            # IoU is only meaningful between boxes of the same image
            iou_matrix = box_ops.box_iou(pred_boxes, gt_boxes)
            taken = torch.zeros(gt_boxes.shape[0], dtype=torch.bool)
            for i in range(pred_boxes.shape[0]):
                candidates = iou_matrix[i].clone()
                # a ground-truth box can validate only one prediction
                candidates[taken] = -1.0
                best = int(torch.argmax(candidates))
                if candidates[best] >= iou_threshold:
                    taken[best] = True
                    hits[i] = True
        score_chunks.append(pred_scores)
        hit_chunks.append(hits)

    if num_gt == 0 or not score_chunks:
        return 0.0

    scores = torch.cat(score_chunks)
    hits = torch.cat(hit_chunks)[torch.argsort(scores, descending=True)].to(torch.float32)
    true_positives = torch.cumsum(hits, dim=0)
    false_positives = torch.cumsum(1.0 - hits, dim=0)
    recall = true_positives / num_gt
    precision = true_positives / (true_positives + false_positives)
    # precision envelope: each point takes the best precision reachable at
    # the same or a higher recall
    precision = torch.cummax(precision.flip(0), dim=0).values.flip(0)
    recall_steps = recall - torch.cat((torch.zeros(1), recall[:-1]))
    return float(torch.sum(recall_steps * precision))

In [None]:
def epoch_iter(dataloader, model, optimizer=None, is_train=True):
    '''
    Run the model once over every batch of ``dataloader``.

    In training mode the detection losses returned by the model are summed,
    back-propagated and used for one optimizer step per batch. In evaluation
    mode the model predictions of each batch are scored with ``compute_map``.

    Args:
        dataloader: yields ``(images, targets)`` batches.
        model: detection model; called as ``model(images, targets)`` when
            training and ``model(images)`` when evaluating.
        optimizer: required when ``is_train`` is True.
        is_train: selects training (True) or evaluation (False).

    Returns:
        ``(avg_loss, None)`` when training and ``(None, avg_map)`` when
        evaluating; both averages are taken over the number of batches.
    '''
    if is_train:
        assert optimizer is not None, "When training, please provide an optimizer"
    num_batches = len(dataloader)

    if is_train:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    map_sum = 0.0
    with torch.set_grad_enabled(is_train):
        for images, targets in tqdm(dataloader):
            images = [image.to(device) for image in images]
            targets = [
                {key: value.to(device) for key, value in target.items()}
                for target in targets
            ]

            if not is_train:
                with torch.no_grad():
                    batch_predictions = model(images)
                    map_sum += compute_map(batch_predictions, targets)
                continue

            # in training mode the model returns a dict of loss terms
            loss_terms = model(images, targets)
            batch_loss = sum(loss_terms.values())
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()

    if is_train:
        return loss_sum / num_batches, None
    return None, map_sum / num_batches

In [None]:
def train(model, model_name, num_epochs, train_dataloader, validation_dataloader, optimizer, train_history=None, val_history=None):
    '''
    Train the model for ``num_epochs`` epochs, validating after each one.

    After every epoch a checkpoint (model state, optimizer state, epoch
    index) is written to ``<model_name>_latest_model.pth``; it is also written
    to ``<model_name>_best_model.pth`` whenever the validation mAP improves on
    the best value seen so far.

    Args:
        model: detection model to optimise.
        model_name: prefix of the checkpoint file names.
        num_epochs: number of epochs to run in this call.
        train_dataloader: batches used for training.
        validation_dataloader: batches used for validation.
        optimizer: optimizer stepping the trainable parameters.
        train_history: dict with a ``'loss'`` list to extend; created when None.
        val_history: dict with a ``'map'`` list to extend; created when None.
            Scores already present count as "seen so far", so calling this
            function again with the same history (phased training) does not
            overwrite the best checkpoint with a worse model.

    Returns:
        The updated ``(train_history, val_history)`` dicts.
    '''
    if train_history is None:
        train_history = {'loss': []}
    if val_history is None:
        val_history = {'map': []}
    # resume from the best score of previous calls; NaN entries are skipped
    # (m == m is False only for NaN) because NaN would break max()
    best_val_map = max((m for m in val_history['map'] if m == m), default=-np.inf)
    print("Start training...")
    for t in range(num_epochs):
        print(f"Epoch {t+1}/{num_epochs}")
        train_loss, _ = epoch_iter(train_dataloader, model, optimizer, is_train=True)
        print(f"Train loss: {train_loss:.3f}")
        _, val_map = epoch_iter(validation_dataloader, model, is_train=False)
        print(f"Validation mAP: {val_map:.3f}")
        save_dict = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': t
        }
        if val_map > best_val_map:
            best_val_map = val_map
            torch.save(save_dict, model_name + '_best_model.pth')
        torch.save(save_dict, model_name + '_latest_model.pth')
        train_history['loss'].append(train_loss)
        val_history['map'].append(val_map)
    print("Finished")
    return train_history, val_history

In [None]:
# Histories shared by all training phases below: each call to train()
# appends one training loss and one validation mAP per epoch.

train_history = {'loss': []}
val_history = {'map': []}

In [None]:
# model layers

# Freeze the whole network first; the training phases below re-enable
# groups of parameters selected by name.
for param in model.parameters():
    param.requires_grad_(False)

# Name fragments (head) and full parameter names (RPN, backbone) used to
# select what each phase unfreezes.
head_layers = ['roi_heads.box_predictor']
rpn_layers = [name for name, _ in model.named_parameters() if 'rpn' in name]
backbone_layers = [name for name, _ in model.named_parameters() if 'backbone' in name]

In [None]:
def unfreeze_layers(model, layers_to_unfreeze):
    '''
    Make the selected parameters of the model trainable.

    A parameter is selected when any entry of ``layers_to_unfreeze`` occurs
    as a substring of its name; all other parameters are left as they are.

    Args:
        model: module whose named parameters are inspected.
        layers_to_unfreeze: collection of name fragments to look for.
    '''
    for param_name, parameter in model.named_parameters():
        for fragment in layers_to_unfreeze:
            if fragment in param_name:
                parameter.requires_grad = True
                break

In [None]:
# train head

# Phase 1: only the new box predictor is trainable.
unfreeze_layers(model, head_layers)

optimizer = torch.optim.Adam(model.roi_heads.box_predictor.parameters(), lr=1e-4)
num_epochs = 5

train_history, val_history = train(
    model=model,
    model_name='lego_detector',
    num_epochs=num_epochs,
    train_dataloader=train_dataloader,
    validation_dataloader=valid_dataloader,
    optimizer=optimizer,
    train_history=train_history,
    val_history=val_history,
)

In [None]:
# train head + RPN

# Phase 2: resume from the latest checkpoint and also train the RPN.
model = faster_rcnn.to(device)
# map_location keeps the load working when the checkpoint was written on a
# different device than the current one (e.g. saved on GPU, resumed on CPU)
checkpoint = torch.load('lego_detector_latest_model.pth', map_location=device)
model.load_state_dict(checkpoint['model'])

unfreeze_layers(model, head_layers)
unfreeze_layers(model, rpn_layers)

# the optimizer only receives the parameters that are currently trainable
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)
num_epochs = 5

train_history, val_history = train(
                              model,
                              'lego_detector',
                              num_epochs,
                              train_dataloader,
                              valid_dataloader,
                              optimizer,
                              train_history,
                              val_history
                            )

In [None]:
# train head + RPN + backbone

# Phase 3: resume from the latest checkpoint and fine-tune the whole network.
model = faster_rcnn.to(device)
# map_location keeps the load working when the checkpoint was written on a
# different device than the current one (e.g. saved on GPU, resumed on CPU)
checkpoint = torch.load('lego_detector_latest_model.pth', map_location=device)
model.load_state_dict(checkpoint['model'])

# Every parameter becomes trainable in this phase, including any that the
# head / RPN / backbone name lists do not match, so unfreezing the groups
# one by one first is unnecessary.
for param in model.parameters():
    param.requires_grad = True

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 10

train_history, val_history = train(
                              model,
                              'lego_detector',
                              num_epochs,
                              train_dataloader,
                              valid_dataloader,
                              optimizer,
                              train_history,
                              val_history
                            )

### Training evolution analysis

In [None]:
def plotTrainingHistory(train_history, val_history):
    '''
    Plot the training history of the model.

    Draws two side-by-side curves with the epoch number on the x axis: the
    training loss and the validation mAP.

    Args:
        train_history: dict whose ``'loss'`` entry lists one training loss
            per epoch.
        val_history: dict whose ``'map'`` entry lists one validation mAP per
            epoch.
    '''
    train_loss = train_history.get('loss', [])
    val_map = val_history.get('map', [])

    _, (loss_ax, map_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # epochs are numbered from 1, matching the messages printed by train()
    loss_ax.plot(range(1, len(train_loss) + 1), train_loss, marker='o')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training loss')

    map_ax.plot(range(1, len(val_map) + 1), val_map, marker='o')
    map_ax.set_xlabel('Epoch')
    map_ax.set_ylabel('mAP')
    map_ax.set_title('Validation mAP')

    plt.tight_layout()
    plt.show()

In [None]:
# Visualize the loss / mAP histories accumulated over all training phases.

plotTrainingHistory(train_history, val_history)

### Model testing

In [None]:
# load best model

model = faster_rcnn.to(device)
# map_location keeps the load working when the checkpoint was written on a
# different device than the current one (e.g. trained on GPU, tested on CPU)
checkpoint = torch.load('lego_detector_best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model'])

In [None]:
# Evaluate the loaded model on the test set. In evaluation mode epoch_iter
# returns (None, average mAP over the batches), so the loss slot is ignored.

_, test_map = epoch_iter(test_dataloader, model, is_train=False)
print(f"Test mAP: {test_map:.3f}")

In [None]:
def show_predictions(model, dataloader):
    '''
    Display images along with their true and predicted bounding boxes.

    Not implemented yet: the body is a placeholder, so calling this function
    currently does nothing and returns None.

    Args:
        model: detection model that would produce the predicted boxes.
        dataloader: source of the images and ground-truth targets to display.
    '''
    pass
    # TODO: draw each image with its ground-truth and predicted boxes

In [None]:
# View predictions on the test set (currently a no-op: show_predictions is
# still a placeholder).

show_predictions(model, test_dataloader)