In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from urllib.error import HTTPError
# from tqdm.notebook import tqdm
from time import sleep
import pickle as pkl
import seaborn as sns

In [2]:
# file_name = 'pitchers_since_2018.txt'
# file_name = 'pitchers_no_TJ.txt'
# raw_file = pd.read_csv(file_name, sep='\t')
# raw_file[2].apply(str.strip)


In [3]:
# raw_file.columns = ['First Name', 'Last Name', 'Position', 'Handedness', 'Month', 'Year', 'Year2', 'Note']
def preprocess(raw_f):
    f = raw_f.copy()
    f = f[f['Position'] == 'P']
    f['Date'] = f['Date'].apply(lambda x : datetime.strptime(x, '%m/%d/%Y'))
    f['Season'] = f['Date'].apply(lambda x: x.year)
    return f

def gen_seasons(last_season, years_back):
    s = str(last_season)
    for i in range(1, years_back):
        s += f'|{last_season - i}'
    return s

# This scraper based off of: https://github.com/alanrkessler/savantscraper/blob/master/savantscraper.py
def get_pitches(Id, name, num_tries=5, pause_time=30):
    # Id = entry['Id']
    # season = entry['Season']
    # name = entry['Player'].replace(" ", "")
    seasons = gen_seasons(2022, 2022-2016)
    
    url = ("https://baseballsavant.mlb.com/statcast_search/csv?all=true"
           "&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R|&h"
           f"fC=&hfSea={seasons}%7C&hfSit=&player_type=pitcher&hfOuts=&opponent"
           "=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt="
           f"&hfInfield=&team=&position=&hfOutfield=&hfRO="
           f"&home_road=&hfFlag=&hfPull=&pitchers_lookup[]={Id}&metric_1=&hfInn="
           "&min_pitches=0&min_results=0&group_by=name&sort_col=pitches"
           "&player_event_sort=pitch_number_thisgame&sort_order=desc"
           "&min_pas=0&type=details&")
    
    for i in range(num_tries):
        try:
            pitch_data = pd.read_csv(url, low_memory=False)
        except HTTPError as connect_error:
            if connect_error:
                if retry >= num_tries - 1:
                    raise HTTPError
                else:
                    sleep(pause_time)
                    pause_time *= 2
                    continue
            else:
                break
    
    # pitch_data.to_csv(f"scraped_data/{name}.csv")
    return pitch_data

In [4]:
with open('ids_by_name.pkl', 'rb') as f:
    ids_by_name = pkl.load(f)
    
with open('inj_by_name.pkl', 'rb') as f:
    inj_by_name = pkl.load(f)

In [5]:
for name, inj_list in inj_by_name.items():
    df = get_pitches(ids_by_name[name], name)
    if len(inj_list) == 0:
        continue
    inj_list = np.vectorize(np.datetime64)(inj_list)
    df['game_date'] = pd.to_datetime(df['game_date'], unit='ns')
    dates = df['game_date'].unique()
    dates.sort()
    df['injury_day'] = 0
    for injury_date in inj_list:
        previous_dates = dates[dates < injury_date]
        try:
            outing_date = max(previous_dates)
            df.loc[df['game_date'] == outing_date, 'injury_day'] = 1
        except:
            print(f'No outing prior to {injury_date}')
    df.to_csv(f"scraped_data/{name}.csv", index=False)

No outing prior to 2015-04-26T00:00:00.000000
No outing prior to 2011-03-25T00:00:00.000000
No outing prior to 2015-04-08T00:00:00.000000
No outing prior to 2013-08-06T00:00:00.000000
No outing prior to 2016-08-02T00:00:00.000000
No outing prior to 2016-07-30T00:00:00.000000
No outing prior to 2016-06-09T00:00:00.000000
No outing prior to 2015-06-08T00:00:00.000000
No outing prior to 2014-07-26T00:00:00.000000
No outing prior to 2012-06-10T00:00:00.000000
No outing prior to 2012-04-04T00:00:00.000000
No outing prior to 2011-06-03T00:00:00.000000
No outing prior to 2009-07-29T00:00:00.000000
No outing prior to 2016-04-24T00:00:00.000000
No outing prior to 2015-04-05T00:00:00.000000
No outing prior to 2014-09-24T00:00:00.000000
No outing prior to 2014-08-17T00:00:00.000000
No outing prior to 2013-03-31T00:00:00.000000
No outing prior to 2012-06-01T00:00:00.000000
No outing prior to 2012-04-04T00:00:00.000000
No outing prior to 2016-09-30T00:00:00.000000
No outing prior to 2016-04-25T00:0

ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set

In [117]:
test = pd.read_csv(f'scraped_data/{name}.csv')

In [118]:
test.head()

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,injury_day
0,0,FC,2022-06-22,85.8,-1.06,6.26,"Wainwright, Adam",457705,425794,double,...,3,4,4,3,Infield shift,Standard,153.0,0.163,1.721,0
1,1,FC,2022-06-22,84.4,-1.08,6.33,"Wainwright, Adam",457705,425794,,...,3,3,3,3,Infield shift,Standard,142.0,0.0,-0.095,0
2,2,FC,2022-06-22,85.8,-1.06,6.31,"Wainwright, Adam",457705,425794,,...,3,3,3,3,Infield shift,Standard,160.0,0.0,-0.06,0
3,3,FF,2022-06-22,88.1,-1.25,6.18,"Wainwright, Adam",642133,425794,walk,...,3,3,3,3,Strategic,Standard,193.0,0.011,0.045,0
4,4,CU,2022-06-22,72.8,-1.23,6.33,"Wainwright, Adam",642133,425794,,...,3,3,3,3,Strategic,Standard,43.0,0.0,0.058,0


In [11]:
tj_data = preprocess(raw_file)

In [134]:
example = get_pitches(tj_data.iloc[0])

In [12]:
for i in tqdm(range(len(tj_data))):
    example = get_pitches(tj_data.iloc[i])

  0%|          | 0/110 [00:00<?, ?it/s]

In [13]:
example

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,FF,2017-09-28,94.7,2.12,5.81,"Turley, Nik",467793,543867,field_out,hit_into_play,...,0,0,5,5,0,Standard,Standard,164,0.000,-0.121
1,CU,2017-09-28,78.4,2.30,5.85,"Turley, Nik",467793,543867,,ball,...,0,0,5,5,0,Standard,Standard,319,0.000,0.020
2,CU,2017-09-28,81.2,2.30,5.59,"Turley, Nik",467793,543867,,called_strike,...,0,0,5,5,0,Standard,Standard,303,0.000,-0.022
3,CU,2017-09-28,82.4,2.47,5.65,"Turley, Nik",467793,543867,,ball,...,0,0,5,5,0,Standard,Standard,296,0.000,0.018
4,FF,2017-09-28,94.3,2.16,5.57,"Turley, Nik",457803,543867,home_run,hit_into_play,...,0,0,5,5,0,Infield shift,Strategic,148,0.007,0.982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,FF,2017-06-11,92.9,1.88,6.14,"Turley, Nik",605509,543867,single,hit_into_play,...,0,0,0,0,0,Standard,Standard,143,0.036,0.229
319,FF,2017-06-11,92.0,1.96,6.20,"Turley, Nik",605509,543867,,ball,...,0,0,0,0,0,Standard,Standard,151,0.000,0.117
320,FF,2017-06-11,94.1,1.79,6.21,"Turley, Nik",605509,543867,,ball,...,0,0,0,0,0,Standard,Standard,145,0.000,0.050
321,FF,2017-06-11,92.4,1.85,6.14,"Turley, Nik",605509,543867,,called_strike,...,0,0,0,0,0,Standard,Standard,144,0.000,-0.046
