In [None]:
import os
from utils.video_scraping import get_savant_video_urls, get_video_link
import json

# Background

In this notebook we web-scrape pitching video URLs from Baseball Savant. URLs are scraped for five pitchers (listed below) and stored in a dictionary that is written out as a JSON file. These URLs can then be used to download the corresponding videos. The plan is to transfer the output JSON file to Google Colab, where we can use the Detectron2/OpenCV/OpenPose pipeline to extract useful data from the videos, one video at a time.

In [None]:
# Pitchers to scrape, one row each:
# (name, Baseball Savant player id, season, runners_on_base flag)
_pitcher_roster = [
    ("Clayton Kershaw", "477132", 2019, True),
    ("Walker Buehler", "621111", 2021, True),
    ("Tony Gonsolin", "664062", 2022, True),
    ("Mitch White", "669952", 2022, True),
    ("Tyler Glasnow", "607192", 2019, False),
]

# Per-pitcher record keyed by name. "play_ids" and "video_urls" start empty
# and hold the scraped results; when re-running, entries whose "video_urls"
# are already filled in do not need to be scraped again.
player_info = {
    name: {
        "player_id": savant_id,
        "season": year,
        "runners_on_base": on_base,
        "play_ids": [],
        "video_urls": [],
    }
    for name, savant_id, year, on_base in _pitcher_roster
}


In [None]:
# Fill in play IDs and downloadable video URLs for every pitcher that has none yet
for player, info in player_info.items():

    # Pitchers whose video URLs are already populated are skipped, so a
    # re-run only scrapes the pitchers that are still missing
    if info["video_urls"]:
        continue

    # Query parameters for this pitcher
    player_id = info["player_id"]
    season = info["season"]
    runners_on_base = info["runners_on_base"]
    print(f"Getting video urls for {player}'s {season} season...")

    # Savant page URLs, one per pitch video
    savant_urls = get_savant_video_urls(
        player_id=player_id,
        season=season,
        runners_on_base=runners_on_base,
    )
    print(f"{len(savant_urls)} found...")

    # The play id is whatever follows the last "playId=" in each savant URL
    play_ids = [savant_url.split("playId=")[-1] for savant_url in savant_urls]
    info["play_ids"] = play_ids

    # Resolve each savant URL to its downloadable link, reporting progress
    # after every tenth URL
    downloadable_urls = []
    for i, savant_url in enumerate(savant_urls, start=1):
        downloadable_urls.append(get_video_link(savant_url))
        if i % 10 == 0:
            print(f'URL {i}/{len(savant_urls)} extracted...')

    # Store the results only once the whole list has been collected
    info["video_urls"] = downloadable_urls
        

In [None]:
# Output json file to store dictionary
with open('../data/video-url-json/NNextPitchURLs.json', 'w') as f:
    json.dump(player_info, f)