# Object Detection - Faster RCNN (PyTorch)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

*Much of the implementation here is organized into pre-defined functions. In my opinion, this is a much cleaner way of structuring the whole process.*

In [None]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import  FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator


import os
import numpy as np
import cv2
import glob
import albumentations as A
import pandas as pd
from torch.utils.data import Dataset
from albumentations.pytorch.transforms import ToTensorV2
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [None]:
# directory holding the training images (used further down as DIR_INPUT)
ROOT_PATH = '../input/train-test-imgs-orig/Train_Images/Train_Images'
# directory holding the images predictions are run on (used further down as DIR_TEST)
TEST_PATH = '../input/train-test-imgs-orig/Test_Images (1) (1)/Test_Images'
# detections scoring below this value are dropped at prediction time
PREDICTION_THRES = 0.5
# number of passes over the training data
EPOCHS = 20
# forwarded as `min_size` to the Faster R-CNN constructor
MIN_SIZE = 800
# batch size of the training DataLoader
BATCH_SIZE = 4
# when True, `visualize()` is called once before training starts
DEBUG = False

In [None]:
def model():
    """
    Build the detector: a pre-trained Faster R-CNN (ResNet-50 FPN) whose
    box-predictor head is replaced by a freshly initialised one.

    `MIN_SIZE` is forwarded as `min_size`. Keeping it at 800 favours
    training speed; a larger size would likely give more accurate results
    at the cost of longer training.

    Returns:
        The torchvision Faster R-CNN module with the new head attached.
    """
    # number of outputs of the new head (the background class is included)
    num_classes = 4

    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=True,
        min_size=MIN_SIZE,
    )

    # the new head must accept the same feature width the old classifier did
    head_in_features = detector.roi_heads.box_predictor.cls_score.in_features

    # swap the pre-trained head for one that classifies our own classes
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_in_features,
                                                         num_classes)
    return detector

*Loading images when working with PyTorch can be a hassle. If the images are not organized into folders, creating a custom Dataset class for loading them is the way to go. Understandably, if there are only a few images, organizing them in folders and using torchvision's ImageFolder class would be a much simpler solution with less boilerplate.*

In [None]:
class FruitDataset(Dataset):
    """
    Detection dataset pairing every image with its annotated boxes.

    Args:
        dataframe: annotation table with one row per box. It must provide
            the columns `Image_ID`, `xmin`, `ymin`, `width` and `height`.
        image_dir: directory that contains the `<Image_ID>.jpg` files.
        transforms: optional callable invoked as
            `transforms(image=..., bboxes=..., labels=...)` that returns a
            dict with at least the keys `image` and `bboxes`.
    """

    def __init__(self, dataframe, image_dir, transforms=None):
        super().__init__()

        self.image_ids = dataframe['Image_ID'].unique()
        self.df = dataframe
        self.image_dir = image_dir
        self.transforms = transforms

    def __getitem__(self, index: int):
        """
        Load one image together with its detection target.

        Returns:
            A tuple `(image, target, image_id)`. `image` is the RGB image
            scaled to [0, 1] (whatever the transform returns when one is
            set, otherwise a float32 NumPy array). `target` is a dict with
            the tensors `boxes` (N x 4, x_min/y_min/x_max/y_max), `labels`,
            `image_ID`, `area` and `iscrowd`.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        image_id = self.image_ids[index]
        records = self.df[self.df['Image_ID'] == image_id]
        num_boxes = records.shape[0]

        image_path = f"{self.image_dir}/{image_id}.jpg"
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # convert the boxes into x_min, y_min, x_max, y_max format;
        # float32 so integer CSV columns do not propagate an int dtype
        boxes = records[['xmin', 'ymin', 'width', 'height']].values.astype(np.float32)
        boxes[:, 2] += boxes[:, 0]  # x_max = x_min + width
        boxes[:, 3] += boxes[:, 1]  # y_max = y_min + height

        # area of every bounding box, h * w
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        area = torch.as_tensor(area, dtype=torch.float32)

        # every box gets class 1; one label per box (row), not per column
        labels = torch.ones((num_boxes,), dtype=torch.int64)

        # supposing that all instances are not crowd
        iscrowd = torch.zeros((num_boxes,), dtype=torch.int64)

        target = {}
        # always a tensor, so `.to(device)` works with or without transforms
        target['boxes'] = torch.as_tensor(boxes, dtype=torch.float32)
        target['labels'] = labels
        target['image_ID'] = torch.tensor([index])
        target['area'] = area
        target['iscrowd'] = iscrowd

        # apply the image transforms
        if self.transforms:
            sample = self.transforms(
                image=image,
                bboxes=boxes,
                labels=labels.tolist(),
            )
            image = sample['image']

            # reshape keeps the (N, 4) layout even when no box is returned
            transformed = np.asarray(sample['bboxes'], dtype=np.float32)
            target['boxes'] = torch.as_tensor(transformed.reshape(-1, 4))

        return image, target, image_id

    def __len__(self):
        """Return the number of distinct images in the annotation table."""
        return self.image_ids.shape[0]


In [None]:
def collate_fn(batch):
    """
    Regroup a list of per-sample tuples into one tuple per field.

    Samples can carry a different number of object instances, so the
    fields are kept as plain tuples rather than being stacked.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [None]:
# function for the image transforms
def train_transform():
    """
    Build the augmentation pipeline applied to every training sample.

    Boxes are expected in `pascal_voc` (x_min, y_min, x_max, y_max) format
    and are transformed together with the image; the class ids travel in
    the `labels` field.

    Returns:
        The composed albumentations pipeline.
    """
    return A.Compose([
        # probabilities are passed by keyword: given positionally, the value
        # can bind to `always_apply` instead of `p` and the transform then
        # runs on every sample
        A.Flip(p=0.5),
        A.RandomRotate90(p=0.5),
#         A.MotionBlur(p=0.2),
#         A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
        ToTensorV2(p=1.0)
    ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})

*As for the image transforms, you will almost always want to include at least the A.Flip used here.*

In [None]:
# path to the input root directory
DIR_INPUT = ROOT_PATH
# read the annotation CSV file
train_df = pd.read_csv("../input/csv-files/Train (6).csv")
print(train_df.head())
print(f"Total number of image IDs (objects) in dataframe: {len(train_df)}")

# get all the image paths as list
image_paths = glob.glob(f"{DIR_INPUT}/*.jpg")
# strip directory and extension only; splitext keeps any dots that are
# part of the image ID itself
image_names = [
    os.path.splitext(os.path.basename(image_path))[0]
    for image_path in image_paths
]
print(f"Total number of training images in folder: {len(image_names)}")
image_ids = train_df['Image_ID'].unique()
print(f"Total number of unique train images IDs in dataframe: {len(image_ids)}")
# number of images that we want to train out of all the unique images
train_ids = image_names[:] # use all the images for training
# keep only the annotations whose image is actually present in the folder
train_df = train_df[train_df['Image_ID'].isin(train_ids)]
print(f"Number of image IDs (objects) training on: {len(train_df)}")

In [None]:
# training set: annotations + image folder + augmentation pipeline
train_dataset = FruitDataset(train_df, DIR_INPUT, train_transform())
train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    # reshuffle every epoch so SGD does not see the images in the same
    # (annotation-file) order each time
    shuffle=True,
    # samples carry a varying number of boxes, so they cannot be stacked
    collate_fn=collate_fn
)

*The model generally has a good runtime, so using the stochastic gradient descent optimizer won't hurt.*

In [None]:
# the computation device: the GPU when one is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# NOTE: from here on `model` names the network instance, no longer the
# factory function defined earlier
model = model().to(device)

# SGD with momentum and weight decay over all parameters of the network
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=0.0005,
)

In [None]:
def train(train_dataloader):
    """
    Run one training epoch over `train_dataloader`.

    Uses the module-level `model`, `optimizer` and `device`. Each batch is
    expected to hold the images first and the target dicts second; every
    value of a target dict must support `.to(device)`.

    Args:
        train_dataloader: iterable yielding the training batches.

    Returns:
        The mean of the per-batch total losses, or 0.0 when the loader
        yields no batch.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for i, data in enumerate(train_dataloader):

        optimizer.zero_grad()
        images, targets = data[0], data[1]
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        # in training mode the model returns a dict of loss terms
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())
        loss_value = loss.item()
        running_loss += loss_value
        loss.backward()
        optimizer.step()
        num_batches += 1
        if i % 25 == 0:
            print(f"Iteration #{i} loss: {loss_value}")
    if num_batches == 0:
        return 0.0
    # one loss value is accumulated per batch, so average over batches
    # rather than over the number of samples
    train_loss = running_loss / num_batches
    return train_loss

In [None]:
def save_model():
    """Store the weights of the module-level `model` in /kaggle/working."""
    weights = model.state_dict()
    torch.save(weights, '/kaggle/working/fasterrcnn_resnet50_fpn.pth')

In [None]:
# for the lovers of visualization, like me
def visualize():
    """
    Plot the first sample of the first training batch with its boxes.

    Intended as a sanity check of the data pipeline; the notebook calls it
    only when `DEBUG` is `True`. Reads from the module-level
    `train_data_loader` and expects image tensors in channel-first layout.
    """
    images, targets, _ = next(iter(train_data_loader))
    boxes = targets[0]['boxes'].cpu().numpy().astype(np.int32)
    # permute() only changes strides; OpenCV needs a contiguous buffer to
    # draw into the array in place
    sample = np.ascontiguousarray(images[0].permute(1, 2, 0).cpu().numpy())
    fig, ax = plt.subplots(1, 1, figsize=(15, 12))
    # tolist() turns the coordinates into plain Python ints for OpenCV
    for x_min, y_min, x_max, y_max in boxes.tolist():
        # pixel values are floats in [0, 1], so full red is 1.0
        cv2.rectangle(sample,
                      (x_min, y_min),
                      (x_max, y_max),
                      (1.0, 0.0, 0.0), 3)
    ax.set_axis_off()
    plt.imshow(sample)
    plt.show()

In [None]:
# such wonderful code right here...
import time

# optional sanity check of the input pipeline before training starts
if DEBUG:
    visualize()

num_epochs = EPOCHS
for epoch in range(num_epochs):
    start = time.time()
    train_loss = train(train_data_loader)
    print(f"Epoch #{epoch} loss: {train_loss}")
    end = time.time()
    # wall-clock duration of this epoch, in minutes
    elapsed_minutes = (end - start) / 60
    print(f"Took {elapsed_minutes} minutes for epoch {epoch}")

In [None]:
# persist the trained weights to /kaggle/working so they are kept as output
save_model()

In [None]:
# validation time: run the trained model over the test images, collect the
# detections above the threshold and write annotated copies of the images

from tqdm import tqdm
# set the computation device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# map_location lets weights saved from a GPU run be restored on a CPU-only machine
model.load_state_dict(torch.load('/kaggle/working/fasterrcnn_resnet50_fpn.pth',
                                 map_location=device))
model.to(device)

DIR_TEST = TEST_PATH
test_images = os.listdir(DIR_TEST)
print(f"Validation instances: {len(test_images)}")

# one entry per image that produced detections; the three lists are aligned
boxes_list = []
scores_list = []
image_paths = []

detection_threshold = PREDICTION_THRES
cpu_device = torch.device("cpu")
model.eval()
with torch.no_grad():
    for image_name in tqdm(test_images, total=len(test_images)):
        orig_image = cv2.imread(f"{DIR_TEST}/{image_name}", cv2.IMREAD_COLOR)
        if orig_image is None:
            # cv2.imread returns None for entries it cannot decode
            continue
        image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB).astype(np.float32)
        # make the pixel range between 0 and 1
        image /= 255.0
        # HWC -> CHW, the layout the model expects
        image = np.transpose(image, (2, 0, 1))
        image = torch.tensor(image, dtype=torch.float).to(device)
        image = torch.unsqueeze(image, 0)
        outputs = model(image)

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        if len(outputs[0]['boxes']) == 0:
            continue

        boxes = outputs[0]['boxes'].detach().numpy()
        scores = outputs[0]['scores'].detach().numpy()
        labels = outputs[0]['labels'].detach().numpy()

        # keep only the detections at or above the confidence threshold
        keep = scores >= detection_threshold
        boxes = boxes[keep].astype(np.float32)
        scores = scores[keep].astype(np.float32)
        # no class-name table is defined here, so the predicted label id
        # itself is used as the caption
        pred_classes = [str(label) for label in labels[keep]]

        image_paths.append(f"{image_name}")
        boxes_list.append(boxes)
        scores_list.append(scores)

        for box, pred_class in zip(boxes, pred_classes):
            cv2.rectangle(orig_image,
                          (int(box[0]), int(box[1])),
                          (int(box[2]), int(box[3])),
                          (0, 0, 255), 3)
            cv2.putText(orig_image, pred_class,
                        (int(box[0]), int(box[1] - 5)),
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255),
                        2, lineType=cv2.LINE_AA)
        cv2.imwrite(f"/kaggle/working/{image_name}", orig_image)
print('TEST PREDICTIONS COMPLETE')

*Test images, annotated with the predictions, are written to the specified path. Bounding boxes are all appended to the list `boxes_list` and their confidence scores to the list `scores_list`. All predictions take into account the pre-set threshold of 0.5. This value should generally be as high as your data allows, in order to maximize precision.*

*Notably, however, if the metric of choice is mAP, as it is for most Object Detection tasks, keeping more than just the model's most confident predictions (that is, using a lower threshold) is the better choice. Despite that, the Faster RCNN's most outstanding feature, compared to its counterparts and alternatives, is its fast computational speed.*