In [1]:
import os
import pandas as pd
import json
from utils.tabular_scraping import get_savant_tabular_data

# Background

In this notebook we scrape the tabular data associated with each video that will be used in this project. Tabular data, in contrast to video data, refers to the situational data for each pitch: for example, the inning, the number of outs, and the number of runners on base. We don't strictly *need* to scrape these files, since they are stored as CSVs and are easily downloadable from Baseball Savant. However, this script also assigns the appropriate play ID to each pitch, which we can use to link the tabular data to the video data.

In [2]:
# Load the saved player information file into a dict keyed by player.
# The context manager closes the file handle as soon as parsing finishes
# (or fails), instead of leaving it open for the life of the kernel.
with open('../data/video-url-json/NNextPitchURLs.json') as f:
    player_info_dict = json.load(f)

In [3]:
# Directory handed to the scraper as the destination for its output
output_dir = f"{os.getcwd()}/../data/tabular-data"

# Fetch the Baseball Savant tabular data for every pitcher in the player info file
for player, player_params in player_info_dict.items():

    # Each entry supplies the parameters for that pitcher's tabular download
    get_savant_tabular_data(
        player_params["player_id"],
        player_params["season"],
        player_params["runners_on_base"],
        output_dir,
    )
    

Opening webpage for player ID 477132...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 16:56:32,747 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 16:56:32,748 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 16:56:32,889 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...
Opening webpage for player ID 621111...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 16:57:26,889 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 16:57:26,890 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 16:57:26,997 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...
Opening webpage for player ID 664062...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 16:58:12,652 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 16:58:12,653 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 16:58:12,780 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...
Opening webpage for player ID 542881...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 16:58:51,952 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 16:58:51,953 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 16:58:52,448 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...
Opening webpage for player ID 669952...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 16:59:32,632 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 16:59:32,633 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 16:59:32,819 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...
Opening webpage for player ID 607192...







[WDM] - Current google-chrome version is 110.0.5481


2023-03-08 17:00:12,595 INFO Current google-chrome version is 110.0.5481


[WDM] - Get LATEST chromedriver version for 110.0.5481 google-chrome


2023-03-08 17:00:12,596 INFO Get LATEST chromedriver version for 110.0.5481 google-chrome


[WDM] - Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


2023-03-08 17:00:12,825 INFO Driver [/Users/bgraham/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache
Downloading tabular file...
Getting play IDs...
Successfully added play IDs. Outputting updated CSV file...


In [4]:
# Merge the per-pitcher tabular files into one combined CSV
file_dir = f"{os.getcwd()}/../data/tabular-data"
combined_filename = "tabular-data.concat.csv"

# Select the input files:
#   - match on the ".csv" extension rather than a substring, so names such as
#     "foo.csv.bak" are not picked up;
#   - skip the combined output of a previous run, which lives in the same
#     directory and would otherwise be re-read and duplicate every row;
#   - sort, because os.listdir returns entries in arbitrary order and the row
#     order of the merged file should be reproducible.
tabular_files = sorted(
    f"{file_dir}/{file}"
    for file in os.listdir(file_dir)
    if file.endswith(".csv") and file != combined_filename
)

# pd.concat raises an opaque ValueError on an empty list; fail with a clearer one
if not tabular_files:
    raise ValueError(f"No tabular CSV files found in {file_dir}")

combined_df = pd.concat([pd.read_csv(path, index_col=False) for path in tabular_files])
combined_df.to_csv(f"{file_dir}/{combined_filename}", index=False)