In [1]:
import cv2
import os
import shutil
from numpy import random
from random import shuffle
from utils.video_scraping import get_savant_video_urls, download_mp4_from_savant

# Detectron Training Preparation
## Background
Here we randomly sample 600 images from downloaded videos of 6 pitchers (100 images per pitcher) and use labelImg to annotate them. The annotated images are then used to train a Detectron2 model on Google Colab.

## Download Videos
Download 10 random videos per player

In [None]:
def download_videos(player_id, season, runners_on_base, dest_path, random_sample, n_sample=10):
    """Download a pitcher's Savant videos for one season into ``dest_path``.

    Args:
        player_id: Player ID passed through to ``get_savant_video_urls``.
        season: Season passed through to ``get_savant_video_urls``.
        runners_on_base: Filter flag passed through to ``get_savant_video_urls``.
        dest_path: Destination passed to ``download_mp4_from_savant`` as ``path``.
        random_sample: When true, download a random subset instead of every video.
        n_sample: Size of the random subset; capped at the number of videos available.

    Downloads are best-effort: a video that fails to download is reported and
    skipped so that one bad URL does not abort the whole batch.
    """
    # The notebook-level name `random` is numpy.random, whose `sample` only takes
    # a size argument. Sampling from a list needs the standard-library module.
    import random as stdlib_random

    # Get player videos for full season
    print(f"Player ID: {player_id}")
    video_urls = list(get_savant_video_urls(player_id=player_id, season=season, runners_on_base=runners_on_base))
    print(f"{len(video_urls)} videos available for this season...")

    # Select random sample of videos if random_sample parameter is set to true
    if random_sample:
        # Never ask for more videos than exist (random.sample would raise ValueError)
        sample_size = min(n_sample, len(video_urls))
        video_urls = stdlib_random.sample(video_urls, sample_size)
        print(f"But sampling {sample_size} videos...")

    # Download videos
    for video_url in video_urls:
        try:
            download_mp4_from_savant(savant_url=video_url, path=dest_path)
        except Exception as error:
            # Keep going, but say which video was skipped and why
            print(f"Skipping {video_url}: {error}")

    # New line
    print("")

In [None]:
# Pitchers to pull videos for, keyed by name; each entry holds the player ID
# and the season handed to download_videos.
player_info = {
    "Clayton Kershaw": {"player_id": "477132", "season": 2019},
    "Walker Buehler": {"player_id": "621111", "season": 2021},
    "Tony Gonsolin": {"player_id": "664062", "season": 2022},
    "Tyler Anderson": {"player_id": "542881", "season": 2022},
    "Mitch White": {"player_id": "669952", "season": 2022},
    # Note: Tyler Glasnow always pitches from the stretch
    "Tyler Glasnow": {"player_id": "607192", "season": 2019},
}

# Download a random sample of videos for every pitcher listed above
for player, info in player_info.items():
    player_id, season = info["player_id"], info["season"]
    print(f"Getting videos for {player}'s {season} season")
    download_videos(
        player_id,
        season,
        runners_on_base=True,
        dest_path="../data/detectron-training/videos",
        random_sample=True,
    )
    

## Convert Videos to Images
Generating 10 images randomly from each video, which equates to 100 random images per pitcher and 600 random images total. These images will be annotated and used to train a Detectron 2 pitcher detector model.

In [3]:
def mp4_to_image(input_dir, output_dir, n_frames=10):
    """Save ``n_frames`` randomly chosen frames from every .mp4 in ``input_dir`` as JPEGs.

    Images are written to ``output_dir`` as ``<video name>-<k>.jpg`` with ``k``
    running from 1 to ``n_frames``. Frame numbers are drawn with replacement
    from a generator re-seeded with 0 for each video, so repeated runs pick the
    same frames for the same video.

    Args:
        input_dir: Directory containing the .mp4 files.
        output_dir: Directory the JPEGs are written to; created if missing.
        n_frames: Number of frames to extract per video.
    """
    # Get list of files to extract images from (only names that end in .mp4,
    # so partial downloads such as "x.mp4.part" are ignored)
    mp4s = [f"{input_dir}/{file}" for file in os.listdir(input_dir) if file.endswith(".mp4")]

    # Make sure the images have somewhere to go
    os.makedirs(output_dir, exist_ok=True)

    # Output n_frames random jpgs per video
    for video_number, mp4 in enumerate(mp4s, 1):

        # Message
        print(f"\nVideo {video_number}")

        vidcap = cv2.VideoCapture(mp4)
        try:
            # The frame count comes back as a float; randint needs an integer bound
            total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                # Unreadable or empty video: randint would raise on a bound of 0
                print(f"Skipping {mp4}: no frames could be read")
                continue

            # Get random n frames from each video (seed reset per video for reproducibility)
            random.seed(seed=0)
            random_frames = random.randint(total_frames, size=(n_frames))
            video_name = os.path.splitext(os.path.basename(mp4))[0]

            # Output each frame as jpg
            for image_number, frame_number in enumerate(random_frames, 1):

                # Message
                print(f"Outputting image {image_number}/{n_frames}...")

                # Set video to specified frame number
                vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_number))
                success, image = vidcap.read()
                if success:
                    output_file = f"{output_dir}/{video_name}-{image_number}.jpg"
                    cv2.imwrite(output_file, image)
        finally:
            # Always free the video handle, even if a frame fails to decode
            vidcap.release()

#mp4_to_image(input_dir="../data/detectron-training/videos", output_dir="../data/detectron-training/images")

## Annotate Images with LabelImg
Annotating images with labelImg (https://github.com/heartexlabs/labelImg). Once installed, calling 'labelImg' from the command line opens up an interface to annotate images (in this case, labeling the pitcher in a given frame). I originally saved the annotations as YOLO txt files. These may need to be converted to XML files and then to a COCO JSON file.

In [None]:
#!pip3 install labelImg

## Converting Annotation File Types
txt --> xml --> coco

### txt --> xml
The script used here is the txt_to_xml.py file from https://github.com/sowmyakavali/Data-convertion.git. I found some bugs in the script that needed to be fixed. The edits that were made were saved in a new file called txt_to_xml-2.py. The original file is preserved and can be referenced.

In [20]:
#!git clone https://github.com/sowmyakavali/Data-convertion.git

Cloning into 'Data-convertion'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 21 (delta 4), reused 1 (delta 0), pack-reused 0[K
Receiving objects: 100% (21/21), 11.14 KiB | 11.14 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [None]:
# Move Data-convertion directory
#shutil.move('./Data-convertion', '../data/detectron-training/helper_packages/Data-convertion')

In [14]:
#!python ../data/detectron-training/helper_packages/Data-convertion/txt_to_xml-2.py -i ../data/detectron-training/annotations_txt/ -img ../data/detectron-training/images/ -o ../data/detectron-training/annotations_xml/

c1db6892-b2c4-4706-8b08-45eeede02543-5.txt-->start!
6136fe4f-e136-4ba1-bbc0-ef5308d6e592-1.txt-->start!
1a9aad4c-88eb-4133-8ad1-fd41b702b2d7-6.txt-->start!
2c3e4ffe-1b80-4d1b-907b-acaba78875cb-7.txt-->start!
14e67b7b-aea2-445f-872e-f69ca0197b2f-2.txt-->start!
be3c2e55-7a4a-44cd-9473-ab81f576193e-8.txt-->start!
ce1ab1e6-1b54-4730-84d3-deabfd90d25a-2.txt-->start!
ecc21930-dc56-4677-9475-8408a1230702-6.txt-->start!
2c119be3-18d0-4381-961f-7fc4b75d0d78-10.txt-->start!
2712fdd9-98a3-4ac0-8c90-f94b93caa264-2.txt-->start!
3d9e316c-9cc5-49e3-93c4-ac8d662d300b-7.txt-->start!
7fa54d27-4e39-47ca-88ee-5be698a9f294-3.txt-->start!
5ec4a912-b95f-4c13-a963-b7d7cf7859bd-3.txt-->start!
4e1c34af-4fba-4c0a-b38b-6748e694b496-1.txt-->start!
d45e0c51-5f72-4d92-abfa-f6cec08d4918-8.txt-->start!
66bfb6b6-f36a-4ae8-b182-4fd410153678-8.txt-->start!
3ea6a4c1-9962-401e-a90c-95ba49d4f45a-6.txt-->start!
b320529f-842e-4b27-94c2-833461c4475e-2.txt-->start!
66919ef7-0420-42dd-8e08-d2193e078968-3.txt-->start!
29a7131e-78

2c119be3-18d0-4381-961f-7fc4b75d0d78-1.txt-->start!
dae02688-1cdc-41ac-838d-73240bff93b7-1.txt-->start!
12a6cc92-c8fe-4e16-9f99-16b697aca1d8-1.txt-->start!
0653d294-b000-4fce-a321-702cb92fad9b-8.txt-->start!
1bc10f0b-fd61-4adc-bcdf-0e3626264b61-7.txt-->start!
d7f29851-0c17-4e92-a409-8eaab9ebbfa5-10.txt-->start!
788ce2fd-9217-447c-aa4f-3861126c97dc-6.txt-->start!
be9961a3-6208-4d02-81c4-9beeb8fdff42-2.txt-->start!
131789d8-85cd-47d9-8cbd-44af657b4863-9.txt-->start!
924a02b7-f10b-4554-bd74-b8a618c50562-2.txt-->start!
3d9e316c-9cc5-49e3-93c4-ac8d662d300b-10.txt-->start!
29bd0a4e-d144-42f3-9f3d-e60207c39777-8.txt-->start!
d6756d26-7ee5-4f74-98d6-dd8c9d142e6c-8.txt-->start!
f8a22549-1aba-438f-a2dd-435000c1c806-10.txt-->start!
cc36b9a2-9f60-415b-a84b-62a0ee003a39-9.txt-->start!
788ce2fd-9217-447c-aa4f-3861126c97dc-4.txt-->start!
dffb5a04-88b0-4dfc-ada7-ed80b334e2bf-9.txt-->start!
1bc10f0b-fd61-4adc-bcdf-0e3626264b61-5.txt-->start!
12a6cc92-c8fe-4e16-9f99-16b697aca1d8-3.txt-->start!
dae02688-

3a63bc20-d0ce-499f-8253-50e5be82789b-1.txt-->start!
131789d8-85cd-47d9-8cbd-44af657b4863-6.txt-->start!
788ce2fd-9217-447c-aa4f-3861126c97dc-9.txt-->start!
cc36b9a2-9f60-415b-a84b-62a0ee003a39-4.txt-->start!
e592d7d9-9b7f-465a-88de-49690976e52a-2.txt-->start!
0a4b7d48-3ab6-435f-bbd7-7716f114a062-2.txt-->start!
29bd0a4e-d144-42f3-9f3d-e60207c39777-7.txt-->start!
d6756d26-7ee5-4f74-98d6-dd8c9d142e6c-7.txt-->start!
d6756d26-7ee5-4f74-98d6-dd8c9d142e6c-5.txt-->start!
29bd0a4e-d144-42f3-9f3d-e60207c39777-5.txt-->start!
cc36b9a2-9f60-415b-a84b-62a0ee003a39-6.txt-->start!
be3c2e55-7a4a-44cd-9473-ab81f576193e-10.txt-->start!
3a63bc20-d0ce-499f-8253-50e5be82789b-3.txt-->start!
131789d8-85cd-47d9-8cbd-44af657b4863-4.txt-->start!
1199091a-eb82-4a1a-a5a9-bc82e218fd1a-2.txt-->start!
dffb5a04-88b0-4dfc-ada7-ed80b334e2bf-6.txt-->start!
ebdc5532-3b83-44f5-bb53-ab7b579b138c-1.txt-->start!
d7f29851-0c17-4e92-a409-8eaab9ebbfa5-1.txt-->start!
0a4b7d48-3ab6-435f-bbd7-7716f114a062-10.txt-->start!
1f0395db-8

### Split into Training and Validation Sets

In [42]:
# Split the xml annotations into training and validation sets.
# NOTE(review): these paths start at 'data/', while the download and conversion
# cells use '../data/' -- confirm which working directory this cell expects.
xml_dir = 'data/detectron-training/annotations_xml'
xml_split_dirs = {
    'training': f'{xml_dir}/training',
    'validation': f'{xml_dir}/validation',
}
n_train = 500  # files assigned to training; the remainder goes to validation

# Randomize xml files. os.listdir returns names in arbitrary order, so sort
# first; otherwise the seeded shuffle would not give the same split everywhere.
xml_files = sorted(file for file in os.listdir(xml_dir) if file.endswith('.xml'))
random.seed(0)
random.shuffle(xml_files)

if xml_files:

    # Partition into training and validation sets
    xml_splits = {
        'training': xml_files[:n_train],
        'validation': xml_files[n_train:],
    }

    # Move files to new destinations, creating the folders when needed
    for split_name, split_files in xml_splits.items():
        destination_folder = xml_split_dirs[split_name]
        os.makedirs(destination_folder, exist_ok=True)
        for file in split_files:
            shutil.move(f'{xml_dir}/{file}', f'{destination_folder}/{file}')

else:

    # Nothing left to split (for example when this cell is re-run after the
    # files were already moved): recover the split from the existing folders
    # so the cells below still have xml_train / xml_val to work with.
    xml_splits = {
        split_name: (
            sorted(file for file in os.listdir(split_dir) if file.endswith('.xml'))
            if os.path.isdir(split_dir)
            else []
        )
        for split_name, split_dir in xml_split_dirs.items()
    }

xml_train = xml_splits['training']
xml_val = xml_splits['validation']

In [50]:
# Now doing this to images: each image follows its annotation file into the
# matching training / validation folder.
images_dir = 'data/detectron-training/images'
img_train = [f'{file.split(".xml")[0]}.jpg' for file in xml_train]
img_val = [f'{file.split(".xml")[0]}.jpg' for file in xml_val]

# Move files to new destinations, creating the folders when needed
for split_name, split_images in (('training', img_train), ('validation', img_val)):
    destination_folder = f'{images_dir}/{split_name}'
    os.makedirs(destination_folder, exist_ok=True)

    for file in split_images:
        source_path = f'{images_dir}/{file}'
        destination_path = f'{destination_folder}/{file}'

        # Already moved by an earlier run of this cell: nothing to do
        if os.path.exists(destination_path) and not os.path.exists(source_path):
            continue

        # Still raises if the image is missing altogether, since that means an
        # annotation has no picture to go with it
        shutil.move(source_path, destination_path)
    

### xml --> coco
The script used here is the voc2coco.py file from https://github.com/Tony607/voc2coco.git. The original script requires image file names to be numbers, so I made a slight edit to remove that requirement. The changes were saved in voc2coco/voc2coco-2.py, which is used below.

In [16]:
#!git clone https://github.com/Tony607/voc2coco.git

Cloning into 'voc2coco'...
remote: Enumerating objects: 50, done.[K
remote: Total 50 (delta 0), reused 0 (delta 0), pack-reused 50[K
Receiving objects: 100% (50/50), 4.37 MiB | 5.30 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [None]:
# Move voc2coco directory
#shutil.move('./voc2coco', '../data/detectron-training/helper_packages/voc2coco')

In [43]:
#!python ../data/detectron-training/helper_packages/voc2coco/voc2coco-2.py ../data/detectron-training/annotations_xml/training/ ../data/detectron-training/annotations_coco/training/pitcher_annotations.json

Number of xml files: 500
Success: ./data/detectron-training/annotations_coco/training/pitcher_annotations.json


In [44]:
#!python ../data/detectron-training/helper_packages/voc2coco/voc2coco-2.py ../data/detectron-training/annotations_xml/validation/ ../data/detectron-training/annotations_coco/validation/pitcher_annotations.json

Number of xml files: 100
Success: ./data/detectron-training/annotations_coco/validation/pitcher_annotations.json


## Detectron Training
Detectron training was performed in a Google Colab notebook to make use of a GPU. The notebook can be found at https://colab.research.google.com/drive/1bl9DoAHkTPCcU2vy4WM4vkuqxFLBp2n_?authuser=1#scrollTo=xvemhFnpFI1x.