# Script for saving annotated frames

In [1]:
import os
import time
import uuid

import torch
import numpy as np
import cv2
import keyboard
import win32gui
from PIL import ImageGrab
import pandas as pd

# Report the GPU setup so the reader knows which device the model will run on.
cuda_available = torch.cuda.is_available()
print("Is CUDA available: %s" % cuda_available)
print("GPUs: %s" % torch.cuda.device_count())
# get_device_name(0) raises when there is no CUDA device, so only query it when one exists
print("GPU name: %s" % (torch.cuda.get_device_name(0) if cuda_available else "none (running on CPU)"))

Is CUDA available: True
GPUs: 1
GPU name: NVIDIA GeForce GTX 1660 Ti


In [3]:
# Detection model used by the annotation cells below.
# Alternatives that were used at other stages of the project:
#   - pre-trained YOLOv5-XL version:
#       torch.hub.load('ultralytics/yolov5', 'yolov5x', pretrained=True)
#   - weights after the second training:
#       path='weights/final_weights.pt'
WEIGHTS_PATH = 'weights/first_weights.pt' # weights after the first training
model = torch.hub.load(
    'ultralytics/yolov5',
    'custom',
    path=WEIGHTS_PATH,
    force_reload=True,
)

Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to C:\Users\julen/.cache\torch\hub\master.zip
YOLOv5  2023-4-17 Python-3.9.13 torch-1.8.2+cu111 CUDA:0 (NVIDIA GeForce GTX 1660 Ti, 6144MiB)



[31m[1mrequirements:[0m C:\Users\julen\.cache\torch\hub\requirements.txt not found, check failed.


Fusing layers... 
Model summary: 157 layers, 7015519 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


- Saving images from a video every x seconds

In [2]:
IMAGES_PATH = 'data/images/' # path for saving the images
SEC = .5 # seconds between two saved frames (.5 -> two frames every second)
FPS = 60 # how many times per second the loop checks the clock and the keyboard
WINDOW_TITLE = "Cuphead" # title of the game's window

count = 0 # number of frames saved so far

# cv2.imwrite does not raise when the target folder is missing, it just returns False
os.makedirs(IMAGES_PATH, exist_ok=True)

keyboard.wait('s') # wait for the 's' key to be pressed to start
print("Saving images")

# The cadence is driven by a monotonic clock instead of an iteration counter, so the
# time between two saved frames is SEC regardless of how long a screenshot takes.
next_capture = time.monotonic() # the first frame is saved right away
while True:
    now = time.monotonic()
    if now >= next_capture:
        hwnd = win32gui.FindWindow(None, WINDOW_TITLE) # find the game's window
        if not hwnd:
            print("Window '%s' not found, exiting" % WINDOW_TITLE)
            break

        # last element of the placement tuple, used as the capture bounding box
        rect = win32gui.GetWindowPlacement(hwnd)[-1]
        image = ImageGrab.grab(rect)
        file_name = uuid.uuid1().int # unique identifier (as an integer) for the file name

        # the PIL screenshot is RGB while cv2.imwrite expects BGR, hence the channel swap
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        target = os.path.join(IMAGES_PATH, "%d.jpg" % file_name)
        if cv2.imwrite(target, frame): # save frame as JPG file
            count += 1
        else:
            print("Could not save %s" % target)

        next_capture = now + SEC

    if keyboard.is_pressed("q"):
        print("Q pressed, exiting")
        break

    time.sleep(1 / FPS) # avoid spinning a CPU core between two captures

Saving images
Q pressed, exiting


- Second filter for getting annotated images based on the first training

In [4]:
"""
Given the YOLO output results converts and returns those results in the YOLO input format:
class_num: class value
x: x coordinate of the detection b-box center
y: y coordinate of the detection b-box center
w: width of the detection b-box
h: height of the detection b-box

Detections will be filtered to remove the ones with a low area

Except for the class value, all other values will be normalized
"""
def filter2(results):
    """
    Convert YOLO detections into YOLO label (input) format, keeping only large boxes.

    Steps:
      1. keep detections of the classes in CLASSES with a confidence above MIN_CONFIDENCE
      2. among near-duplicate boxes keep only the strongest one (see below)
      3. normalize the boxes by the frame size
      4. drop the boxes whose normalized area is not above MIN_AREA

    Two boxes are near-duplicates when each of their four corner coordinates
    differs by more than 0 and by at most MAX_OFFSET pixels. A box is dropped
    when a near-duplicate has a higher confidence; on equal confidence the box
    that comes first in the results is kept.

    Args:
        results: YOLOv5 detection results; only `results.pandas().xyxy[0]` is read,
            which must provide the columns xmin, ymin, xmax, ymax, confidence and class.

    Returns:
        Tuple of five lists of equal length `(class_num, x, y, w, h)`:
        class value, x and y of the box center, box width and box height.
        All values except the class are normalized, assuming 1920x1080 frames.
    """
    CLASSES = [0, 1] # classes that are kept
    MIN_CONFIDENCE = 0.29 # detections at or below this confidence are ignored
    MAX_OFFSET = 100 # max per-coordinate distance (pixels) between near-duplicate boxes
    MIN_AREA = 0.007 # min normalized area (w*h) of a box, so that only big enough detections are kept
    IMG_WIDTH, IMG_HEIGHT = 1920, 1080 # frame size used for the normalization

    # convert to dataframe
    res_fil = results.pandas().xyxy[0]

    # keep the confident detections of the wanted classes
    res_fil = res_fil.loc[(res_fil['confidence'] > MIN_CONFIDENCE) & (res_fil['class'].isin(CLASSES))]

    # pairwise comparison of all boxes: offsets[i, j, k] is the distance between
    # coordinate k of box i and coordinate k of box j
    coords = res_fil[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy(dtype=float)
    conf = res_fil['confidence'].to_numpy(dtype=float)
    order = np.arange(len(res_fil))

    offsets = np.abs(coords[:, None, :] - coords[None, :, :])
    # the "> 0" part also excludes the comparison of a box with itself
    near = ((offsets > 0) & (offsets <= MAX_OFFSET)).all(axis=2)
    # beaten[i, j]: box j wins over box i (higher confidence, or earlier on a tie)
    beaten = (conf[None, :] > conf[:, None]) | ((conf[None, :] == conf[:, None]) & (order[None, :] < order[:, None]))

    # remove the overlapping bounding boxes that lost against a neighbour
    res = res_fil.loc[~(near & beaten).any(axis=1)]

    # compute the input-format values
    xmax = res['xmax'].astype(float)
    xmin = res['xmin'].astype(float)
    ymax = res['ymax'].astype(float)
    ymin = res['ymin'].astype(float)

    x = ((xmax + xmin) / 2) / IMG_WIDTH
    y = ((ymax + ymin) / 2) / IMG_HEIGHT
    w = (xmax - xmin) / IMG_WIDTH
    h = (ymax - ymin) / IMG_HEIGHT

    # keep only the detection bounding boxes with a normalized area over MIN_AREA
    big_enough = (w * h) > MIN_AREA

    class_num_res = res['class'].astype(int)[big_enough].tolist()
    x_res = x[big_enough].tolist()
    y_res = y[big_enough].tolist()
    w_res = w[big_enough].tolist()
    h_res = h[big_enough].tolist()

    return class_num_res, x_res, y_res, w_res, h_res

- Function to convert detection results to the YOLO input format for annotating goopy and cuphead images using the YOLOv5-XL model

In [5]:
"""
Given the YOLO output results converts and returns those results in the YOLO input format:
class_num: class value
x: x coordinate of the detection b-box center
y: y coordinate of the detection b-box center
w: width of the detection b-box
h: height of the detection b-box

Detections will be filtered to remove the irrelevant classes and duplicated detections

Except for the class value, all other values will be normalized
"""
def filter(results, level):
    """
    Convert YOLO detections into YOLO label (input) format.

    With `level != 0` the work is delegated to `filter2(results)`.
    With `level == 0` the detections of the pre-trained model are used:
      1. keep detections of the classes listed below with a confidence above MIN_CONFIDENCE
      2. among near-duplicate boxes keep only the strongest one
      3. map the original classes to 0 (CUPHEAD_CLASSES) or 1 (any other kept class)
      4. normalize the boxes by the frame size

    Two boxes are near-duplicates when each of their four corner coordinates
    differs by more than 0 and by at most MAX_OFFSET pixels. A box is dropped
    when a near-duplicate has a higher confidence; on equal confidence the box
    that comes first in the results is kept.

    Note: this function shadows Python's built-in `filter` in the notebook namespace.

    Args:
        results: YOLOv5 detection results; only `results.pandas().xyxy[0]` is read,
            which must provide the columns xmin, ymin, xmax, ymax, confidence and class.
        level: 0 to use the filter described above, any other value to use `filter2`.

    Returns:
        Tuple of five lists of equal length `(class_num, x, y, w, h)`:
        class value, x and y of the box center, box width and box height.
        All values except the class are normalized, assuming 1920x1080 frames.
    """
    if level != 0:
        return filter2(results)

    # most common classes detected by YOLO for each object
    # [0: person, 1: bicycle, 3: motorcycle, 13: bench]
    CUPHEAD_CLASSES = [0, 1, 3, 13]

    # [14: bird, 29: frisbee, 32: sports ball, 37: surfboard, 45: bowl,
    #  55: cake, 70: toaster, 74: clock, 75: vase]
    # NOTE(review): 33 is also accepted but was never named here; it is
    # presumably "kite" in the COCO class list - confirm.
    GOOPY_CLASSES = [14, 29, 32, 33, 37, 45, 55, 70, 74, 75]

    MIN_CONFIDENCE = 0.39 # detections at or below this confidence are ignored
    MAX_OFFSET = 100 # max per-coordinate distance (pixels) between near-duplicate boxes
    IMG_WIDTH, IMG_HEIGHT = 1920, 1080 # frame size used for the normalization

    # convert to dataframe
    res_fil = results.pandas().xyxy[0]

    # keep the confident detections of the wanted classes
    res_fil = res_fil.loc[(res_fil['confidence'] > MIN_CONFIDENCE) & (res_fil['class'].isin(CUPHEAD_CLASSES + GOOPY_CLASSES))]

    # pairwise comparison of all boxes: offsets[i, j, k] is the distance between
    # coordinate k of box i and coordinate k of box j
    coords = res_fil[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy(dtype=float)
    conf = res_fil['confidence'].to_numpy(dtype=float)
    order = np.arange(len(res_fil))

    offsets = np.abs(coords[:, None, :] - coords[None, :, :])
    # the "> 0" part also excludes the comparison of a box with itself
    near = ((offsets > 0) & (offsets <= MAX_OFFSET)).all(axis=2)
    # beaten[i, j]: box j wins over box i (higher confidence, or earlier on a tie)
    beaten = (conf[None, :] > conf[:, None]) | ((conf[None, :] == conf[:, None]) & (order[None, :] < order[:, None]))

    # remove the overlapping bounding boxes that lost against a neighbour
    res = res_fil.loc[~(near & beaten).any(axis=1)]

    # compute the input-format values
    xmax = res['xmax'].astype(float)
    xmin = res['xmin'].astype(float)
    ymax = res['ymax'].astype(float)
    ymin = res['ymin'].astype(float)

    class_num = [0 if elem in CUPHEAD_CLASSES else 1 for elem in res['class'].astype(int).tolist()]

    x = (((xmax + xmin) / 2) / IMG_WIDTH).tolist()
    y = (((ymax + ymin) / 2) / IMG_HEIGHT).tolist()
    w = ((xmax - xmin) / IMG_WIDTH).tolist()
    h = ((ymax - ymin) / IMG_HEIGHT).tolist()

    return class_num, x, y, w, h

- Make detections on images and save labels

In [10]:
IMAGES_PATH = 'data/images/' # frames to annotate
DETECTIONS_PATH = 'data/detections/' # frames with the detections drawn on them
LABELS_PATH = 'data/labels/' # label files in YOLO input format
FILTER = 1 # passed to filter() as level: 0 uses the first filter, any other value uses filter2
MIN_DETECTIONS = 2 # frames with fewer detections than this are deleted

# cv2.imwrite fails silently and open() raises when the target folder is missing
os.makedirs(DETECTIONS_PATH, exist_ok=True)
os.makedirs(LABELS_PATH, exist_ok=True)

for image in sorted(os.listdir(IMAGES_PATH)):
    img = os.path.join(IMAGES_PATH, image)
    if not os.path.isfile(img):
        continue # skip sub-folders

    # make detections on each image
    results = model(img)

    try:
        class_num, x, y, w, h = filter(results, FILTER)
    except Exception as e:
        # Skip the frame (and keep it on disk): going on would reuse the labels
        # of the previous frame, or fail on the very first one.
        print(e)
        print(str(img))
        continue

    if len(class_num) >= MIN_DETECTIONS:
        stem = os.path.splitext(image)[0]

        # NOTE(review): YOLOv5 renders images loaded from a path in RGB order while
        # cv2.imwrite expects BGR, hence the channel swap - confirm against the YOLOv5 version in use.
        rendered = cv2.cvtColor(results.render()[0], cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(DETECTIONS_PATH, "%s.jpg" % stem), rendered) # save detection frame as JPG file

        with open(os.path.join(LABELS_PATH, "%s.txt" % stem), 'w') as f: # create text file and write
            for cls, cx, cy, bw, bh in zip(class_num, x, y, w, h):
                f.write(str(cls)+" {:.6f} {:.6f} {:.6f} {:.6f} \n".format(cx, cy, bw, bh))
    else:
        os.remove(img) # remove the frames without enough detections