In [1]:
# Mount Google Drive at /content/drive so the project files under it are
# reachable as ordinary paths. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import glob
import torch
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count

# Load Data + YOLO

In [8]:
# load yolo
def load_model():
  """Return the pretrained YOLOv5s model, loading it at most once per process.

  The first call fetches/builds the model through ``torch.hub``; later calls
  in the same process return that same object. This matters because the
  batch-processing code calls ``load_model()`` once per batch, and rebuilding
  the network for every batch is pure overhead.

  Returns:
    The object returned by ``torch.hub.load('ultralytics/yolov5', 'yolov5s',
    pretrained=True)``.
  """
  # The cache lives on the function object so this cell stays self-contained;
  # re-running the cell redefines the function and therefore resets the cache.
  cached = getattr(load_model, '_model', None)
  if cached is None:
    cached = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
    load_model._model = cached
  return cached

In [47]:
# Project root on the mounted Drive and the folder holding the raw input images.
project_path = '/content/drive/MyDrive/Grad/DSAN6600/proj/'
raw_imgs_path = os.path.join(project_path, 'data/raw')
# glob returns matches in arbitrary order; sort so batches are reproducible.
all_imgs = sorted(glob.glob(os.path.join(raw_imgs_path, '*.jpg')))

### TESTING ON SUBSET ###
#all_imgs = all_imgs[:200]

# Each batch holds roughly 10% of the images. Clamp to at least 1: with fewer
# than 10 images the plain int(...) is 0, which is not a usable batch size.
batch_size = max(1, int(len(all_imgs) * 0.1))

In [43]:
def load_and_process_imgs(img_paths):
  """Run YOLOv5 on one batch of images and keep confident class-0 detections.

  Args:
    img_paths: Iterable of image file paths making up the batch.

  Returns:
    A list with one pandas DataFrame per input image, in the same order as
    ``img_paths``. Each frame keeps only the rows whose ``class`` is 0 and
    whose ``confidence`` is at least 0.5. An empty batch returns ``[]``
    without loading the model.
  """
  img_paths = list(img_paths)
  if not img_paths:
    return []

  model = load_model()
  imgs = []
  try:
    for img_path in img_paths:
      imgs.append(Image.open(img_path))
    results = model(imgs)
  finally:
    # Image.open keeps the underlying file open; release every handle once
    # inference is done so large batches do not exhaust file descriptors.
    for img in imgs:
      img.close()

  dets = results.pandas().xyxy
  people_det = [det[(det['class'] == 0) & (det['confidence'] >= 0.5)] for det in dets]
  return people_det

def worker(img_paths):
  """Pool entry point: process one batch of image paths.

  Delegates to ``load_and_process_imgs`` and returns its result unchanged.
  """
  batch_detections = load_and_process_imgs(img_paths)
  return batch_detections

def create_batches(img_paths, batch_size):
  """Split ``img_paths`` into consecutive chunks of at most ``batch_size`` items.

  Args:
    img_paths: Sliceable sequence (e.g. a list of paths).
    batch_size: Maximum number of items per chunk; must be >= 1.

  Returns:
    A list of slices of ``img_paths``. The last chunk may be shorter. An
    empty input yields an empty list.

  Raises:
    ValueError: If ``batch_size`` is less than 1. A zero step would make
      ``range`` fail with an unrelated message, and a negative step would
      silently produce no batches at all.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
  return [img_paths[start:start + batch_size]
          for start in range(0, len(img_paths), batch_size)]

In [44]:
def crop_and_save(img_paths, detections_list, verbose=False):
  """Crop every detected box out of its image and save the crops as JPEGs.

  Crops are written to ``<project_path>/data/subjectbox`` (created if
  missing) as ``<original name>_subject_box_<xmin>_<ymin>_<xmax>_<ymax>.jpg``.

  Args:
    img_paths: Image file paths, aligned one-to-one with ``detections_list``.
    detections_list: One pandas DataFrame per image with ``xmin``, ``ymin``,
      ``xmax`` and ``ymax`` columns (pixel coordinates).
    verbose: When True, report images without detections and show a figure
      with the original image next to each crop.
  """
  from PIL import ImageOps  # PIL is already a dependency of this notebook

  out_path = os.path.join(project_path, 'data/subjectbox')
  os.makedirs(out_path, exist_ok=True)

  for img_path, detections in zip(img_paths, detections_list):
    # Check for detections first so images with nothing to crop are never opened.
    if detections.empty:
      if verbose:
        print(f"No detections for {img_path}")
      continue

    img_name_no_ext = os.path.splitext(os.path.basename(img_path))[0]
    # Plain tuples avoid itertuples' renaming of the keyword column 'class'.
    boxes = detections[['xmin', 'ymin', 'xmax', 'ymax']].itertuples(index=False, name=None)

    # The context manager closes the file handle even if a save fails.
    with Image.open(img_path) as raw_img:
      # YOLOv5's AutoShape wrapper applies the EXIF orientation to PIL inputs
      # before inference, so box coordinates refer to the upright image.
      # Apply the same transpose here or crops from rotated photos miss.
      img = ImageOps.exif_transpose(raw_img)

      fig = None
      if verbose:
        n_crops = len(detections)
        fig, axes = plt.subplots(
            1, n_crops + 1, figsize=(5 * max(1, n_crops), 6), squeeze=False)
        axes = axes[0]
        axes[0].imshow(img)
        axes[0].set_title('Original Image')
        axes[0].axis('off')

      for crop_idx, box in enumerate(boxes, start=1):
        xmin, ymin, xmax, ymax = (int(v) for v in box)
        cropped_img = img.crop((xmin, ymin, xmax, ymax))
        new_img_name = f"{img_name_no_ext}_subject_box_{xmin}_{ymin}_{xmax}_{ymax}.jpg"
        save_path = os.path.join(out_path, new_img_name)
        cropped_img.save(save_path)

        if verbose:
          ax = axes[crop_idx]
          ax.imshow(cropped_img)
          ax.set_title(f'Cropped {crop_idx}')
          ax.axis('off')

      if fig is not None:
        # Show and then close each figure; otherwise one open figure per
        # image accumulates in memory over the whole loop.
        plt.show()
        plt.close(fig)

In [48]:
batch_data = create_batches(all_imgs, batch_size)
# NOTE: this name shadows multiprocessing.cpu_count imported at the top of the
# notebook; it is kept as-is because other cells may rely on it.
cpu_count = 2 # UPDATE ACCORDINGLY

with Pool(processes=cpu_count) as pool:
  # Use the ordered imap, not imap_unordered: results are zipped with
  # batch_data, so they must come back in submission order. With unordered
  # results a batch's detections could be paired with another batch's paths
  # and the crops would be cut from the wrong images.
  for result, batch_paths in zip(pool.imap(worker, batch_data), batch_data):
    if not result:
      print("No detections in this batch:", batch_paths)
      continue
    crop_and_save(batch_paths, result, verbose=False)

  self.pid = os.fork()
  self.pid = os.fork()
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-4-16 Python-3.10.12 torch-2.2.1+cu121 CPU

YOLOv5 🚀 2024-4-16 Python-3.10.12 torch-2.2.1+cu121 CPU

Fusing layers... 
Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Adding AutoShape... 
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-4-16 Python-3.10.12 torch-2.2.1+cu121 CPU

YOLOv5 🚀 2024-4-16 Python-3.10.12 torch-2.2.1+cu121 CPU

Fusing layers... 
Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShap