# Homework 03: Object detection

* **REMEMBER TO COPY THIS FILE TO YOUR OWN DRIVE**
* You can only modify the non maximal suppression function
* What you need to do in this homework
  * Understand YOLO and one-stage object detection models.
  * Finish the Non-maximum Suppression function.
  * Inference on sample images and finish the report.
  * [HW3 Report](https://docs.google.com/document/d/1BN-69IoHkGvBuOFyc0jxJvqsGpfmeurGrZCBHWaX_fI/edit)

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

%cd /content/gdrive/MyDrive/

Mounted at /content/gdrive
/content/gdrive/MyDrive


In [1]:
import os
if not os.path.isdir("AI-hw3"):
    os.makedirs("AI-hw3")

%cd AI-hw3

/home/edwardleemacau/proj/ntu_ml/ai-sp23/AI-hw3


# Download data and checkpoint

In [2]:
# download the pretrain weight of YOLOv7
if not os.path.isfile("yolov7.pt"):
    !wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt

# download the inference data
if not os.path.isdir("data"):
  ! gdown --folder https://drive.google.com/drive/folders/1RhCVmkRD_6sF4gsgFzgUKDUrQlXJB04X?usp=share_link -O data

# Import packages

In [3]:
import torch
import random
import cv2
import numpy as np
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset
import glob

from PIL import Image, ImageDraw, ImageFont

# Utility function you may need (not necessary)

In [4]:
def xyxy2xywh(x):
    # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
    y[:, 2] = x[:, 2] - x[:, 0]  # width
    y[:, 3] = x[:, 3] - x[:, 1]  # height
    return y


def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y


def box_iou(box1, box2):
    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
    """
    Return intersection-over-union (Jaccard index) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Arguments:
        box1 (Tensor[N, 4])
        box2 (Tensor[M, 4])
    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """

    def box_area(box):
        # box = 4xn
        return (box[2] - box[0]) * (box[3] - box[1])

    area1 = box_area(box1.T)
    area2 = box_area(box2.T)

    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    inter = (torch.min(box1[:, None, 2:], box2[:, 2:]) - torch.max(box1[:, None, :2], box2[:, :2])).clamp(0).prod(2)
    return inter / (area1[:, None] + area2 - inter)  # iou = inter / (area1 + area2 - inter)

# Non-Maximum Suppression(NMS)

In [5]:
def non_max_suppression(prediction, confidence_threshold=0.4, iou_thres=0.6):
    """Runs Non-Maximum Suppression (NMS) on inference results
    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    # Start of your code

    candidates_mask = prediction[..., 4] > confidence_threshold

    outputs = list()

    for image_number, current_prediction in enumerate(prediction):

        current_prediction = current_prediction[candidates_mask[image_number]]  # confidence

        current_prediction[:, 5:] *= current_prediction[:, 4:5]  # conf = obj_conf * cls_conf

        boxes = xywh2xyxy(current_prediction[:, :4])

        conf, nonzero_confidence_j = current_prediction[:, 5:].max(1, keepdim=True)
          
        current_prediction = torch.cat((boxes, conf, nonzero_confidence_j.float()), 1)[conf.view(-1) > confidence_threshold]

        # using non-maximum suppression to remove the redundant bbox
        conf = current_prediction[:, 4]
        
        ret = list()
        while True:
            # Check if there is no remain bbox, terminate the loop
            val, idx = conf.max(0)
            if val == 0:
                break

            # pop candidate bbox to retval
            ret.append(current_prediction[idx].tolist())

            # calculate the IoU between the candidate bbox and the remain bboxes
            box1 = current_prediction[idx, :4].unsqueeze(0)
            box2 = current_prediction[:, :4]
            
            # iou.shape: (n, )
            iou = box_iou(box1, box2).squeeze(0)
            iou_mask = iou > iou_thres

            # suppress the bbox confidence to 0 if the IoU is higher than the threshold
            current_prediction[iou_mask, 4] = 0
            current_prediction[idx, 4] = 0

        outputs.append(ret)

    return outputs

    # End of your code

# Datasets

In [6]:
# build our own dataloader
class MyDataset(torch.utils.data.Dataset):
  def __init__(self, root, transform=None):
    self.data = glob.glob(os.path.join(root,'*'))
    print(self.data)
    self.transform = transform
    self.len = len(self.data)

  def __len__(self):
    return self.len

  def __getitem__(self, index):
    image = Image.open(self.data[index])

    return transform(image), self.data[index]


yolo_img_size = 640

transform = transforms.Compose([
    transforms.Resize((yolo_img_size, yolo_img_size)),
    transforms.ToTensor()
])

# Load model and data

In [7]:
# download the yolov7 pretrain weight
model = torch.hub.load('WongKinYiu/yolov7', 'custom', 'yolov7.pt',
                        force_reload=True, trust_repo=True)


dataset = MyDataset("data", transform=transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

Downloading: "https://github.com/WongKinYiu/yolov7/zipball/main" to /home/edwardleemacau/.cache/torch/hub/main.zip

                 from  n    params  module                                  arguments                     
  0                -1  1       928  models.common.Conv                      [3, 32, 3, 1]                 
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]                
  2                -1  1     36992  models.common.Conv                      [64, 64, 3, 1]                
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]               
  4                -1  1      8320  models.common.Conv                      [128, 64, 1, 1]               
  5                -2  1      8320  models.common.Conv                      [128, 64, 1, 1]               
  6                -1  1     36992  models.common.Conv                      [64, 64, 3, 1]                
  7                -1  1    

Adding autoShape... 
['data/1.jpg', 'data/4.jpg', 'data/3.jpg', 'data/2.jpg']


# Plot the result

In [8]:
class_labels = model.module.names if hasattr(model, 'module') else model.names

# set color for class labels
colors = [[random.randint(0, 255) for _ in range(3)] for _ in class_labels]


# draw the bounding box detect by YOLO on image
def plot_bbox(name, pred):
  img = Image.open(name)
  width, height = img.size
  for p in pred:
    label = class_labels[int(p[5])]
    box = [p[0]/yolo_img_size*width,
           p[1]/yolo_img_size*height,
           p[2]/yolo_img_size*width,
           p[3]/yolo_img_size*height]
    draw = ImageDraw.Draw(img)
    draw.rectangle(box, outline=tuple(colors[int(p[5])]), width=3)

    fontsize = max(round(max(img.size) / 40), 12)
    font = ImageFont.load_default()
    # font = ImageFont.truetype("arial.ttf", fontsize)
    txt_width, txt_height = font.getsize(label)
    draw.rectangle([box[0], box[1] - txt_height + 4, box[0] + txt_width, box[1]], fill=tuple(colors[int(p[5])]))
    draw.text((box[0], box[1] - txt_height + 1), label, fill=(255, 255, 255), font=font)
    del draw

  dor_position = name.find(".")
  save_path = name[:dor_position]+"_pred"+name[dor_position:]
  
  if not os.path.isdir("prediction"):
    os.makedirs("prediction")

  print("the result of image:", save_path, "is save to", save_path.replace("data", "prediction"))
  img.save(save_path.replace("data", "prediction"))

# Inference

In [9]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the YOLO model
model.to(device)

# Inference
model.eval()
with torch.no_grad():
  for batch_idx, (images, filename) in enumerate(dataloader):
    images = images.to(device)
    outputs = model(images)[0]
    
    # Process the outputs
    preds = non_max_suppression(outputs, confidence_threshold=0.4, iou_thres=0.6)

    for idx, pred in enumerate(preds):
      plot_bbox(filename[idx], pred)

the result of image: data/4_pred.jpg is save to prediction/4_pred.jpg
the result of image: data/2_pred.jpg is save to prediction/2_pred.jpg
the result of image: data/1_pred.jpg is save to prediction/1_pred.jpg
the result of image: data/3_pred.jpg is save to prediction/3_pred.jpg


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  txt_width, txt_height = font.getsize(label)
