In [21]:
# Install the runtime dependencies this notebook imports further down.
# NOTE(review): only imageio is version-pinned; the other installs float to
# whatever is current, which makes re-runs less reproducible.
!pip install torch
# Requirements file of the YOLOv5 repository (-q quiet, -r read from file/URL).
!pip install -qr https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt
# NOTE(review): imageio is pinned to 2.4.1, presumably for moviepy compatibility; confirm.
!pip install imageio==2.4.1
!pip install moviepy
!pip install opencv-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
import os, subprocess
from random import randint
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from tqdm.auto import tqdm

import librosa
import moviepy.editor as mp
import cv2

import torch
import torch.nn as nn
import torchvision.transforms as transforms

In [23]:
# Mount Google Drive so the project folder under /content/drive is reachable.
# This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# NOTE(review): `image_size` is not referenced by any other visible cell; the
# transforms cell defines and uses its own `input_shape` with the same value.
image_size = (299, 299)
# Label names; the annotation cell indexes this list with the predicted class index.
classes = ["carrying", "normal", "threat"]

In [25]:
# --- Project locations on the mounted Drive -------------------------------
root_path = "/content/drive/MyDrive/NUS/CS4243/CS4243_mini_project"
model_root_path = os.path.join(root_path, "models")
ensemble_demo_path = os.path.join(root_path, "ensemble_demo")

# --- Checkpoint file names (resolved against model_root_path) -------------
image_model_name = "inception_ensemble_image_classifier_lr3_e20_elr7"
spec_model_name = "inception_ensemble_spectrogram_classifier_lr3_e20_elr9"

# --- Compute device: first CUDA GPU when available, otherwise the CPU -----
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# --- Person detector: YOLOv5-small fetched through torch.hub ---------------
yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-11-10 Python-3.7.15 torch-1.12.1+cu113 CUDA:0 (Tesla T4, 15110MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [26]:
def create_model(num_classes: int = 3, model_path = None):
    """Build an Inception-v3 classifier and optionally load fine-tuned weights.

    The network is fetched through ``torch.hub`` and both of its classification
    heads (the auxiliary head and the main ``fc`` layer) are replaced by fresh
    linear layers with ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes for both heads.
        model_path: Path to a saved ``state_dict``. When falsy (``None`` or an
            empty string) no checkpoint is loaded and the hub-pretrained
            backbone weights are requested instead.

    Returns:
        The model, moved to the notebook-level ``device``.
    """
    # Use one condition for both decisions so the model can never end up with
    # neither pretrained weights nor a checkpoint.
    load_checkpoint = bool(model_path)

    model = torch.hub.load('pytorch/vision:v0.10.0', 'inception_v3', pretrained=not load_checkpoint)
    model.AuxLogits.fc = nn.Linear(768, num_classes)
    model.fc = nn.Linear(2048, num_classes)

    if load_checkpoint:
        # map_location lets a checkpoint saved on a GPU be loaded on a
        # CPU-only runtime (and vice versa) instead of raising.
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)

    return model.to(device)

In [27]:
# Build both ensemble members from their checkpoints and switch them to
# evaluation mode straight away (nn.Module.eval() returns the module itself).
image_model = create_model(
    model_path=os.path.join(model_root_path, image_model_name)
).eval()
spec_model = create_model(
    model_path=os.path.join(model_root_path, spec_model_name)
).eval()
print("Models loaded!")

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "



Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Models loaded!


In [28]:
input_shape = (299, 299)


def _build_transform():
    """Return a fresh resize -> tensor -> normalize pipeline for `input_shape`."""
    steps = [
        transforms.Resize(input_shape),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


# Image transformations: frames and spectrograms use the same steps, but each
# gets its own Compose instance.
img_transform = _build_transform()
spec_transform = _build_transform()

In [29]:
# crop by expanding bounding box by 10% of image size
crop_expansion_factor = 0.1

def get_person_bounding_boxes(img):
    """Return padded, image-clamped boxes for every person detected in `img`.

    `img` is passed to the module-level `yolo` detector and must expose a
    `size` attribute of the form `(width, height)`. Each detection labelled
    "person" is grown on every side by `crop_expansion_factor` times the
    corresponding image dimension and clipped to the image bounds.

    Returns:
        A list of dicts with integer 'xmin', 'xmax', 'ymin', 'ymax' entries.
        When no person is detected the list holds a single box spanning the
        whole image.
    """
    detections = yolo(img).pandas().xyxy[0]
    img_w, img_h = img.size

    pad_x = img_w * crop_expansion_factor
    pad_y = img_h * crop_expansion_factor

    people = detections[detections.name == "person"]
    boxes = [
        {
            'xmin': max(0, int(person.xmin - pad_x)),
            'xmax': min(img_w, int(person.xmax + pad_x)),
            'ymin': max(0, int(person.ymin - pad_y)),
            'ymax': min(img_h, int(person.ymax + pad_y)),
        }
        for _, person in people.iterrows()
    ]

    # Fall back to the full frame so callers always get at least one crop.
    return boxes or [{'xmin': 0, 'xmax': img_w, 'ymin': 0, 'ymax': img_h}]

In [30]:
def has_audio(file_path):
    """Return True when `file_path` contains at least one audio stream.

    ffprobe is asked to list only the audio streams of the file; any non-empty
    listing means audio is present. Counting all streams, as the earlier
    `nb_streams > 1` check did, misreports files that carry several non-audio
    streams (for example video plus subtitles).

    Args:
        file_path: Path of the media file to probe.

    Returns:
        True if ffprobe reports an audio stream, False if it reports none or
        exits with an error (for example an unreadable file).

    Raises:
        OSError: If the `ffprobe` executable cannot be started.
    """
    result = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "a",
            "-show_entries",
            "stream=index",
            "-of",
            "csv=p=0",
            file_path,
        ],
        stdout=subprocess.PIPE,
        # Keep diagnostics out of stdout so they are never mistaken for a stream listing.
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        return False
    return bool(result.stdout.strip())

def extract_audio(video_name, video_path):
    """Write the audio track of a video to an MP3 file in `ensemble_demo_path`.

    Args:
        video_name: File name of the video; its stem names the output file
            `<stem>_audio.mp3`.
        video_path: Path of the video file to open.

    Returns:
        The path of the written MP3 file, or None when the clip has no audio
        or extraction fails. Failures are reported with a printed warning
        instead of being swallowed silently.
    """
    video_data = None
    try:
        video_data = mp.VideoFileClip(video_path)
        if not video_data.audio:
            return None

        # splitext keeps the whole name when there is no extension, so a
        # dotless name no longer collapses to "_audio.mp3".
        file_name, _ = os.path.splitext(video_name)
        audio_path = os.path.join(ensemble_demo_path, f"{file_name}_audio.mp3")
        video_data.audio.write_audiofile(audio_path)
        return audio_path
    except Exception as e:
        print(f"Warning: failed to extract audio from {video_path}: {e}")
        return None
    finally:
        # Release the clip's underlying reader; guarded because the clip may
        # never have been created, or may not provide close().
        close = getattr(video_data, "close", None)
        if callable(close):
            close()


def get_melspectrogram_db(file_path, sr = None, n_fft = 2048, hop_length = 512, n_mels = 128, fmin = 20, fmax = 8300, top_db = 80, duration = 5):
    """Load an audio file and return its mel spectrogram in decibels.

    The signal is first brought to a window of `duration` seconds: shorter
    signals are reflect-padded on both sides by half of the missing length
    (rounded up), longer ones are cut to the first `duration` seconds.

    Args:
        file_path: Audio file to load with librosa.
        sr: Target sampling rate passed to `librosa.load` (None keeps the
            file's own rate).
        n_fft, hop_length, n_mels, fmin, fmax: Passed through to
            `librosa.feature.melspectrogram`.
        top_db: Passed through to `librosa.power_to_db`.
        duration: Window length in seconds (default 5, the value that was
            previously hard-coded).

    Returns:
        The array returned by `librosa.power_to_db`.
    """
    wav, sr = librosa.load(file_path, sr=sr)
    target_len = int(duration * sr)
    if wav.shape[0] < target_len:
        wav = np.pad(wav, int(np.ceil((target_len - wav.shape[0]) / 2)), mode="reflect")
    else:
        wav = wav[:target_len]
    spec = librosa.feature.melspectrogram(
        # The signal must be passed by keyword: recent librosa releases
        # reject it as a positional argument.
        y=wav,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax
    )
    spec_db = librosa.power_to_db(spec, top_db=top_db)
    return spec_db


def get_spectrogram_image(file_path):
    """Turn an audio file into an 8-bit grayscale spectrogram array.

    The dB mel spectrogram from `get_melspectrogram_db` is standardised
    (zero mean, unit variance, with a small epsilon) and then min-max scaled
    to the range 0..255.

    Args:
        file_path: Audio file to convert.

    Returns:
        A `uint8` array with the same shape as the spectrogram. A constant
        spectrogram (for example pure silence) yields an all-zero array rather
        than dividing by zero and casting NaN to uint8.
    """
    spec = get_melspectrogram_db(file_path)
    spec_norm = (spec - spec.mean()) / (spec.std() + 1e-6)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()

    value_range = spec_max - spec_min
    if value_range == 0:
        # No contrast to stretch; avoid the 0/0 division below.
        return np.zeros(spec_norm.shape, dtype=np.uint8)

    spec_scaled = 255 * (spec_norm - spec_min) / value_range
    return spec_scaled.astype(np.uint8)


def convert_spectrograms(audio_path):
    """Return the spectrogram image for `audio_path` (delegates to `get_spectrogram_image`)."""
    return get_spectrogram_image(audio_path)


def get_spectrogram(video_name, video_path):
    """Return the spectrogram image for a video, or zeros when it has no usable audio.

    Args:
        video_name: File name of the video (used to name the extracted audio).
        video_path: Path of the video file.

    Returns:
        The array produced by `convert_spectrograms` for the extracted audio,
        or `np.zeros(input_shape)` when the video has no audio stream or the
        audio could not be extracted.
    """
    if not has_audio(video_path):
        print("Video doesn't have audio")
        return np.zeros(input_shape)

    print("Processing Audio...")
    audio_path = extract_audio(video_name, video_path)
    if audio_path is None:
        # extract_audio returns None when there is no audio track or on
        # failure; fall back to the placeholder instead of crashing while
        # trying to load `None` as an audio file.
        print("Audio could not be extracted; continuing without audio")
        return np.zeros(input_shape)

    print("Processing Spectrograms...")
    return convert_spectrograms(audio_path)

In [31]:
def extract_random_frame(video_path, max_attempts = 100):
    """Read one randomly chosen frame from a video.

    Args:
        video_path: Path of the video to open with OpenCV.
        max_attempts: How many random positions to try before giving up.
            This bounds the retry loop, which previously never terminated
            when no frame could be decoded.

    Returns:
        The frame as returned by `cv2.VideoCapture.read`.

    Raises:
        ValueError: If the video reports no frames (e.g. it cannot be opened).
        RuntimeError: If no frame could be read within `max_attempts` tries.
    """
    vidcap = cv2.VideoCapture(video_path)
    try:
        num_of_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_of_frames <= 0:
            raise ValueError(f"No frames found in video: {video_path}")

        for _ in range(max_attempts):
            frame_to_read = randint(0, num_of_frames - 1)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_to_read)
            res, frame = vidcap.read()
            if res:
                print(f"=================\nRandomly chose frame {frame_to_read} from {num_of_frames} frames from the video")
                return frame

        raise RuntimeError(f"Could not read a frame from {video_path} after {max_attempts} attempts")
    finally:
        # Always free the capture handle, including on the error paths.
        vidcap.release()

In [32]:
# Relative weights of the two ensemble members when their logits are combined.
image_weight = 0.93
spec_weight = 0.57

def get_single_classification(img, spec):
    """Classify one image crop, optionally combined with an audio spectrogram.

    Both inputs go through their transform and model; the logits are combined
    as a weighted average using `image_weight` and `spec_weight`. A
    spectrogram that is entirely zero (the placeholder `get_spectrogram`
    returns for videos without audio) contributes nothing to the sum.

    Args:
        img: Image accepted by `img_transform`.
        spec: Spectrogram image accepted by `spec_transform`.

    Returns:
        The index of the highest combined logit as a Python int.
    """
    transformed_img = img_transform(img).unsqueeze(0).to(device)
    transformed_spec = spec_transform(spec).unsqueeze(0).to(device)

    spec_present = bool(np.array(spec).any())

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logit = image_model(transformed_img) * image_weight
        if spec_present:
            # The spectrogram model is only run when its output is used.
            logit = logit + spec_model(transformed_spec) * spec_weight

    logit = logit / (image_weight + spec_weight)

    return torch.argmax(logit, dim=1).item()

In [33]:
# Demo input selection: uncomment exactly one `video_name` line.
# video_name = "A0222462N_20220831_threat_0007.mp4" # threat, with sound, person not moving much
# video_name = "A0261367B_20220902_threat_4024.mp4" # threat, no sound
video_name = "0217442_3182022_threat_1283.mp4" # threat with sound and moving frames

# The demo videos are read from the ensemble_demo folder.
video_path = os.path.join(ensemble_demo_path, video_name)

# Spectrogram of the video's audio track (an all-zero array when there is no audio).
extracted_spectrogram = get_spectrogram(video_name, video_path)
# NOTE(review): `extracted_frame` is not used by any later visible cell.
extracted_frame = extract_random_frame(video_path)

Processing Audio...
[MoviePy] Writing audio in /content/drive/MyDrive/NUS/CS4243/CS4243_mini_project/ensemble_demo/0217442_3182022_threat_1283_audio.mp3


100%|██████████| 224/224 [00:00<00:00, 807.35it/s]

[MoviePy] Done.






Processing Spectrograms...
Randomly chose frame 76 from 152 frames from the video


In [34]:
BATCH_SIZE = 16

def extract_frame_batches(video_path):
    """Read all frames of a video as PIL images, grouped into batches.

    Frames are decoded sequentially from the start of the video. The channel
    order of each frame is reversed before it is wrapped in a PIL image and
    converted to 'RGB'.

    Reading stops at the first frame that cannot be decoded. The frame count
    reported by the container can be larger than the number of decodable
    frames, and retrying such a read forever (as before) never terminates.

    Args:
        video_path: Path of the video to open with OpenCV.

    Returns:
        A list of batches, each a list of at most `BATCH_SIZE` PIL images.
    """
    print("Extracting frame batches")
    vidcap = cv2.VideoCapture(video_path)
    frame_batches = []

    try:
        num_of_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

        for start_frame_idx in tqdm(range(0, num_of_frames, BATCH_SIZE)):
            expected = min(BATCH_SIZE, num_of_frames - start_frame_idx)
            batch = []
            for _ in range(expected):
                # Sequential read: no per-frame seek is needed when walking
                # the video from the first frame.
                res, frame = vidcap.read()
                if not res:
                    break
                frame_pil = Image.fromarray(frame[:, :, ::-1]).convert('RGB')
                batch.append(frame_pil)

            if batch:
                frame_batches.append(batch)
            if len(batch) < expected:
                # The stream ended earlier than the reported frame count.
                break
    finally:
        # Free the capture handle even if decoding raised.
        vidcap.release()

    return frame_batches

In [35]:
# Decode the selected video into batches of PIL frames for the annotation cell below.
frame_batches = extract_frame_batches(video_path)

Extracting frame batches


  0%|          | 0/10 [00:00<?, ?it/s]

In [36]:
# Wrap the spectrogram array in a PIL image, then convert it to 3-channel RGB.
# Both steps report the failure and re-raise so the cell still stops on error.
try:
  spec_img = Image.fromarray(extracted_spectrogram)
except Exception as e:
  print('Warning: Failed to parse image')
  print('Error: %s' %e)
  raise

try:
  spec_pil = spec_img.convert('RGB')
except Exception as e:
  # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit are
  # not reported as an image-format failure; the message now includes the
  # error, matching the handler above.
  print('Warning: Failed to format image')
  print('Error: %s' %e)
  raise

In [37]:
print('Getting classification and annotation for each frame\n')
video_frames = []
annotating_color = 'red'

# For every frame: detect people, classify each (padded) person crop together
# with the video-level spectrogram, and draw the box and label on a copy.
for batch in tqdm(frame_batches):
    for frame in batch:
        bounding_boxes = get_person_bounding_boxes(frame)

        img_to_annotate = frame.copy()
        draw = ImageDraw.Draw(img_to_annotate)

        for bounding_box in bounding_boxes:
            xmin, xmax, ymin, ymax = (
                bounding_box[side] for side in ('xmin', 'xmax', 'ymin', 'ymax')
            )

            person_crop = frame.crop((xmin, ymin, xmax, ymax))
            pred = get_single_classification(person_crop, spec_pil)

            top_left = (xmin, ymin)
            bottom_right = (xmax - 1, ymax - 1)
            draw.rectangle([top_left, bottom_right], outline=annotating_color)
            draw.text((xmin + 1, ymin + 1), classes[pred], fill=annotating_color)

        video_frames.append(img_to_annotate)

Getting classification and annotation for each frame



  0%|          | 0/10 [00:00<?, ?it/s]

In [38]:
# Output file: same stem as the input video, written next to it as MJPG .avi.
output_video_path = f'{ensemble_demo_path}/{video_name.rpartition(".")[0]}_output_classification.avi'

print(f'Writing result to video {output_video_path}\n')

# Every annotated frame is resized to this fixed size before being written.
frame_size = (1280, 720)
output_fps = 15
fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter(output_video_path, fourcc, output_fps, frame_size)

# A writer that failed to open drops every frame silently; fail loudly here
# instead of producing a missing or empty file for the display cell.
if not out.isOpened():
    out.release()
    raise IOError(f'Could not open video writer for {output_video_path}')

try:
    for frame in tqdm(video_frames):
        # PIL frames are RGB; reverse the channel axis for OpenCV before resizing.
        out_frame = cv2.resize(np.array(frame)[:, :, ::-1], frame_size)
        out.write(out_frame)
finally:
    # Finalise the file even if a frame fails to convert or write.
    out.release()

Writing result to video /content/drive/MyDrive/NUS/CS4243/CS4243_mini_project/ensemble_demo/0217442_3182022_threat_1283_output_classification.avi



  0%|          | 0/152 [00:00<?, ?it/s]

In [39]:
# NOTE(review): wildcard import placed mid-notebook. `moviepy.editor` is
# already imported as `mp` in the imports cell, so `mp.VideoFileClip` would
# avoid pulling every public moviepy name into the notebook namespace.
from moviepy.editor import *

# Re-open the annotated video written by the previous cell and show it via
# moviepy's `ipython_display`, using the output frame width.
clip=VideoFileClip(output_video_path)
clip.ipython_display(width=frame_size[0])

100%|██████████| 152/152 [00:01<00:00, 78.23it/s]
