## **Project2 - Neural networks for object detection**

---



**Name:** Dana Haham

**ID:** 209278407

In [None]:
# Imports
import cv2
import numpy as np
import os
import sys
import shutil
import torch

import torchvision
import torchvision.transforms as transforms
import torchvision.models.detection.mask_rcnn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

import albumentations as A
from albumentations.pytorch import ToTensorV2

from google.colab import drive

!pip install pytube
from pytube import YouTube

In [None]:
# Report whether PyTorch can see a CUDA device, then select the device that
# every model and tensor in this notebook is moved to (GPU if present, else CPU)
print(f" is cuda available: {torch.cuda.is_available()}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [41]:
# Label id -> category name for the trained dataset.
# The ids are the positions in this tuple (0, 1, 2, 3).
class_names = dict(enumerate((
    'drone-bird-aircraft',
    'aircraft',
    'bird',
    'drone',
)))

# Image pre-processing applied to every frame before inference:
# resize to 225x225, normalize each channel with mean 0.5 / std 0.5,
# then convert the array to a torch tensor
img_transform = A.Compose(
    [
        A.Resize(height=225, width=225),
        A.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3),
        ToTensorV2(),
    ]
)

In [42]:
def detect_objects(image, detected_frame, model, confidence_threshold=0.85):
    """Run the detector on one frame and draw its confident detections.

    Args:
        image: Frame to analyse; its first two dimensions are read as
            (height, width). It is passed through ``img_transform``.
        detected_frame: Image the boxes and labels are drawn on. It is
            modified in place and also returned.
        model: Detection model that returns, for each input image, a dict
            with 'boxes', 'labels' and 'scores' tensors.
        confidence_threshold: Detections whose score is not strictly above
            this value are discarded. Defaults to 0.85.

    Returns:
        ``detected_frame`` with a rectangle and a class-name label drawn for
        every kept detection.
    """
    # Set the model to evaluation mode
    model.eval()

    # Pre-process, add a batch dimension (BxCxHxW) and move the tensor to the
    # same device as the model
    batch = img_transform(image=image)['image'].unsqueeze(0).to(device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        predictions = model(batch)

    # Bring the predictions for the single image in the batch back to numpy
    pred_boxes = predictions[0]['boxes'].detach().cpu().numpy()
    pred_labels = predictions[0]['labels'].detach().cpu().numpy()
    pred_scores = predictions[0]['scores'].detach().cpu().numpy()

    # Filter out predictions with low confidence
    keep = pred_scores > confidence_threshold
    pred_boxes = pred_boxes[keep]
    pred_labels = pred_labels[keep]

    # Scale factors from the resized network input back to the original frame.
    # The input size is read from the tensor itself so it cannot drift away
    # from whatever img_transform resizes to.
    height, width = image.shape[:2]
    net_height, net_width = batch.shape[-2:]
    scale_x = width / net_width
    scale_y = height / net_height

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    font_thickness = 2

    for box, label in zip(pred_boxes, pred_labels):

        # Calculate bounding box coordinates in the original frame
        x_min = int(box[0] * scale_x)
        y_min = int(box[1] * scale_y)
        x_max = int(box[2] * scale_x)
        y_max = int(box[3] * scale_y)

        # Draw the bounding box
        cv2.rectangle(detected_frame, (x_min, y_min), (x_max, y_max), (255, 0, 0), 4)

        # Unknown label ids are shown as their number instead of raising
        class_name = class_names.get(int(label), str(int(label)))

        # Put the label just above the box; when the box touches the top of
        # the frame the text would be cut off, so draw it inside the box
        (_, text_height), _ = cv2.getTextSize(class_name, font, font_scale, font_thickness)
        text_y = y_min - 10
        if text_y - text_height < 0:
            text_y = y_min + text_height + 10

        cv2.putText(detected_frame, class_name, (x_min, text_y), font, font_scale, (0, 255, 0), font_thickness)

    return detected_frame

In [43]:
def create_model():
    """Build the Mask R-CNN architecture used for fine-tuning and inference.

    Loads torchvision's pretrained ``maskrcnn_resnet50_fpn``, freezes the
    backbone, and replaces the box and mask predictors with new heads sized
    for ``num_classes`` outputs.

    Returns:
        The model, moved to the module-level ``device``.
    """
    # Bird, drone, aircraft and background
    num_classes = 4

    # Transfer Learning on MASK RCNN with Fine-Tuning

    # Load Pretrained Mask R-CNN
    # NOTE(review): newer torchvision releases prefer the `weights=` argument
    # over `pretrained=True` - confirm against the installed version.
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # Freeze the backbone only; every other parameter keeps requires_grad as
    # loaded
    for param in model.backbone.parameters():
        param.requires_grad = False

    # Replace the box predictor with a new head for our classes. No explicit
    # "unfreeze" is needed: the heads are new modules, so toggling
    # requires_grad on the old heads before replacing them had no effect.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Replace the mask predictor the same way
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, num_classes)

    model.to(device)

    return model

In [44]:
def open_video(video_path):
    """Open the video at the given path.

    Args:
        video_path: Path of the video file to open.

    Returns:
        The opened ``cv2.VideoCapture``.

    Raises:
        IOError: If the video could not be opened. An exception is raised
            instead of calling ``exit()`` so a failure does not shut down
            the notebook kernel and the caller can decide what to do.
    """
    # Open the video file
    cap = cv2.VideoCapture(video_path)

    # Check if the video was successfully loaded
    if not cap.isOpened():
        # Free the handle before reporting the failure
        cap.release()
        raise IOError(f"Error: Could not open video. Path: {video_path}")

    return cap

def create_video(output_path, fps, frame_width, frame_height):
    """Create a new video file at the given path with the given properties.

    Args:
        output_path: Path of the video file to write.
        fps: Frame rate of the output video.
        frame_width: Width of each frame.
        frame_height: Height of each frame.

    Returns:
        A ``cv2.VideoWriter`` using the 'mp4v' codec.
    """
    codec = cv2.VideoWriter_fourcc(*'mp4v')
    frame_size = (frame_width, frame_height)
    return cv2.VideoWriter(output_path, codec, fps, frame_size)

def release_videos(video_input, video_output):
    """Release the given videos, the input first and then the output."""
    for video in (video_input, video_output):
        video.release()

def download_youtube_video(url, download_path='/content', filename='input.mp4'):
    """Download the video at the given YouTube url.

    Args:
        url: Address of the YouTube video.
        download_path: Folder the file is saved in. Defaults to Colab's
            working directory.
        filename: Name of the saved file. Defaults to "input.mp4".

    Raises:
        ValueError: If the video offers no progressive mp4 stream.
    """
    yt = YouTube(url)

    # Progressive mp4 streams, ordered by resolution from highest to lowest
    candidates = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc()

    # first() returns None when nothing matched; fail with a clear message
    # instead of an AttributeError on None
    best_stream = candidates.first()
    if best_stream is None:
        raise ValueError(f"No progressive mp4 stream is available for {url}")

    best_stream.download(output_path=download_path, filename=filename)

def download_output():
    """Move the generated output video from Colab into Google Drive.

    Mounts the drive, makes sure the Project2 folder exists, and moves
    /content/output.mp4 into it.
    """
    drive.mount('/content/drive', force_remount=True)

    # Ensure the target folder exists before moving the file into it
    target_dir = '/content/drive/MyDrive/Project2'
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(target_dir, "output.mp4")
    shutil.move("/content/output.mp4", destination)

def load_model(model_path):
    """Load the trained model from the given checkpoint path.

    Mounts Google Drive, builds the architecture with ``create_model`` and
    loads the saved state dict into it.

    Args:
        model_path: Path of the saved state dict.

    Returns:
        The model with the trained weights, on the module-level ``device``.
    """
    drive.mount('/content/drive', force_remount=True)

    # Create model architecture
    model = create_model()

    # Load trained weights. map_location remaps the stored tensors onto the
    # selected device, so a checkpoint saved on a GPU also loads on a
    # CPU-only runtime.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    # Use the selected device rather than an unconditional .cuda(), which
    # fails when no GPU is available
    model.to(device)

    return model

In [45]:
def handle_video(video_url='https://www.youtube.com/watch?v=hoHAC2b1K0Q',
                 model_path='/content/drive/MyDrive/Project2/model.pth'):
    """Run the detection algorithm on a YouTube video and save the result.

    Downloads the video into Colab, runs the detector on every frame, writes
    the annotated frames to /content/output.mp4 and moves that file to the
    drive.

    Args:
        video_url: Address of the YouTube video to process.
        model_path: Path of the trained model's state dict.
    """
    # Download the video from youtube to colab
    download_youtube_video(video_url)

    # Open the video file
    og_video = open_video('/content/input.mp4')
    detected_video = None

    try:
        # Load the model
        model = load_model(model_path)

        # Define the output video writer with the input's fps and frame size
        fps = og_video.get(cv2.CAP_PROP_FPS)
        frame_width = int(og_video.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(og_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        detected_video = create_video('/content/output.mp4', fps, frame_width, frame_height)

        # Capture frame by frame until the video is over
        ret, frame = og_video.read()
        while ret:

            # Create a new black image for the detected objects in the frame
            detected_frame = np.zeros_like(frame)

            # Detect the objects in the frame
            detected_frame = detect_objects(frame, detected_frame, model)

            # Blend the detected objects into the (slightly dimmed) frame
            res_image = cv2.addWeighted(frame, 0.8, detected_frame, 1, 0)

            # Add the blended frame to the output video
            detected_video.write(res_image)

            # Continue to the next frame
            ret, frame = og_video.read()
    finally:
        # Always release the videos, even when a frame fails, so the file
        # handles are not leaked
        og_video.release()
        if detected_video is not None:
            detected_video.release()

    # Move the finished video to the drive (only reached on success)
    download_output()

In [None]:
# Entry point
if __name__ == "__main__":
    # Run the whole pipeline: download the video, detect objects in each
    # frame and save the annotated result
    handle_video()