In [326]:
import pybaseball
from pybaseball import statcast
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import timezone
from bs4 import BeautifulSoup
import io
import requests
import unicodedata
import scipy.stats as stats
pybaseball.cache.enable()

from RosterScraper import RosterScraper

## Loads in StatCast ID so batter names show in the Statcast data and loads in a scraped DF with every 40 man roster

In [327]:
url = 'https://docs.google.com/spreadsheets/d/1JgczhD5VDQ1EiXqVG-blttZcVwbZd5_Ne_mefUGwJnk/pub?output=csv'
res = requests.get(url)
ID = pd.read_csv(io.BytesIO(res.content), sep=',')
ID.dropna(subset=['MLBID'], inplace=True)
ID['MLBID'] = ID['MLBID'].astype(int)

Rosters = RosterScraper()
BID = Rosters[Rosters["Position"] == "Batter"]
PID = Rosters[Rosters["Position"] == "Pitcher"]

## Creating functions for data manipulation so they can match when joining separate datasets

In [328]:
def convert_name(name):
    if name == 'Rockies':
        return 'COL'
    elif name == 'Reds':
        return 'CIN'
    elif name == 'Mariners':
        return 'SEA'
    elif name == 'Nationals':
        return 'WSH'
    elif name == 'Yankees':
        return 'NYY'
    elif name == 'Astros':
        return 'HOU'
    elif name == 'Red Sox':
        return 'BOS'
    elif name == 'Athletics':
        return 'OAK'
    elif name == 'Mets':
        return 'NYM'
    elif name == 'Braves':
        return 'ATL'
    elif name == 'Giants':
        return 'SF'
    elif name == 'Brewers':
        return 'MIL'
    elif name == 'Rays':
        return 'TB'
    elif name == 'Royals':
        return 'KC'
    elif name == 'White Sox':
        return 'CWS'
    elif name == 'Cubs':
        return 'CHC'
    elif name == 'Angels':
        return 'LAA'
    elif name == 'Tigers':
        return 'DET'
    elif name == 'Diamondbacks':
        return 'ARI'
    elif name == 'Guardians':
        return 'CLE'
    elif name == 'Orioles':
        return 'BAL'
    elif name == 'Twins':
        return 'MIN'
    elif name == 'Marlins':
        return 'MIA'
    elif name == 'Phillies':
        return 'PHI'
    elif name == 'Rangers':
        return 'TEX'
    elif name == 'Dodgers':
        return 'LAD'
    elif name == 'Padres':
        return 'SD'
    elif name == 'Pirates':
        return 'PIT'
    elif name == 'Blue Jays':
        return 'TOR'
    elif name == 'Cardinals':
        return 'STL'
    else:
        return np.nan
    
def flip_names(name):
    first_name, last_name = name.split(", ")
    return f"{last_name} {first_name}"

def replace_special_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

def append_suffix_to_duplicates(df, column):
        seen = {}
        for idx, value in enumerate(df[column]):
            if value in seen:
                seen[value] += 1
                df.at[idx, column] = f"{value}2"
            else:
                seen[value] = 1

## Scraping the RotoGrinders website for daily pitchers and lineups

In [329]:
def getDKData2024():
    eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
    todaysdate = eastern_time.strftime("%m-%d-%Y")
    url = 'https://rotogrinders.com/lineups/mlb?site=draftkings'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    gamelist = []
    gamecards = soup.findAll("div", {"class": "game-card-teams"})
    for x in gamecards:
        twoteams = x.findAll("span", {"class": "team-nameplate-mascot"})
        roadteam = convert_name(twoteams[0].text)
        hometeam = convert_name(twoteams[1].text)
        gamekey = "{}@{}".format(roadteam,hometeam)
        gamelist.append(gamekey)

    matchupsdf = pd.DataFrame()
    for game in gamelist:
        roadteam = game.split("@")[0]
        hometeam = game.split("@")[1]
        thisdf1 = pd.DataFrame({"Team": roadteam, "Opp": hometeam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        thisdf2 = pd.DataFrame({"Team": hometeam, "Opp": roadteam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        matchupsdf = pd.concat([matchupsdf,thisdf1,thisdf2])
        
    oppdict = dict(zip(matchupsdf.Team,matchupsdf.Opp))
    hometeamdict = dict(zip(matchupsdf.Team,matchupsdf.HomeTeam))
    roadteamdict = dict(zip(matchupsdf.Team,matchupsdf.RoadTeam))

    disabled_span_list = []
    for span in soup.findAll("span", {"class": "player-nameplate disabled"}):
        for a in span.findAll("a"):
            disabled_span_list.append(a.text)

    spdata = pd.DataFrame()
    for div in soup.findAll("span", {"class": "player-nameplate", "data-position": "SP"}):
        if "TBD" in str(div):
            playername = "TBD"
            pos = "SP"
            sal = 0
        else:
            for a in div.findAll('a', {'class': 'player-nameplate-name'}):
                playername = a.text.strip()

            strdiv = str(div)
            pos = strdiv[strdiv.find("data-position")+15:strdiv.find("data-salary")-2]
            sal = strdiv[strdiv.find("data-salary")+13:strdiv.find("<div class = 'player-nameplate-info'>")-3]
        try:
            ownership = strdiv[strdiv.find('<span class="small muted" data-auth="502">') + 42:strdiv.find('%')]
            ownership = ownership.replace("</span>", "")
            ownership = ownership.replace("</span", "")
            ownership = ownership.replace("</div>", "")
            ownership = ownership.replace(" ", "")
        except:
            ownership = np.nan

        thisspdata = pd.DataFrame([[playername, sal, ownership]], columns = ["Player", "Salary", "Ownership"])
        spdata = pd.concat([spdata, thisspdata])

    spdata['Player'] = spdata['Player'].replace('Luis Ortiz', 'Luis L. Ortiz')
    spdata['Player'] = spdata['Player'].replace('Mike King', 'Michael King')
    spdata['Player'] = spdata['Player'].replace('Robert Zastryzny', 'Rob Zastryzny')

    spdata2 = pd.merge(spdata, PID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "PitcherTeam"})
    spdata3 = pd.merge(spdata2, matchupsdf[["Team", "Opp"]], left_on = ["PitcherTeam"], right_on = ["Team"], how = "left").drop(columns = ["Team"])

    append_suffix_to_duplicates(spdata3, 'PitcherTeam')
    append_suffix_to_duplicates(spdata3, 'Opp')

    opp_spname_dict = dict(zip(spdata3.Opp, spdata3.Player))
    opp_spsal_dict = dict(zip(spdata3.Opp, spdata.Salary))
    opp_spown_dict = dict(zip(spdata3.Opp, spdata3.Ownership))

    ludf = pd.DataFrame()
    
    for li in soup.findAll("li", {"class": "lineup-card-player"}):
        for a in li.findAll("a", {"class": ["player-nameplate-name", "player-nameplate disabled"]}):
            playername = a.text

        listring = str(li)
        for span in li.find("span", {"class": "small"}):
            luspot = span.text
            luspot = luspot.replace("\n", "")
            luspot = luspot.strip()
            luspot = int(luspot)
        pos = listring[listring.find("data-position")+15:listring.find("data-salary")-2]
        sal = listring[listring.find("data-salary")+13:listring.find("<span class='small'>")-3]
        ownership = ownership.replace("</span>", "")
        ownership = ownership.replace("</span", "")
        ownership = ownership.replace("</li", "")
        ownership = ownership.replace("</div>", "")
        ownership = ownership.replace(" ", "")

        try:
            sal = int(sal)
        except:
            sal = 0
        thisludf = pd.DataFrame([[playername, luspot, sal, ownership]], columns = ["Player", "Spot", "Sal", "Ownership"])
        ludf = pd.concat([ludf, thisludf])

    ludf2 = pd.merge(ludf, BID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "BatterTeam"})
    ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
    ludf2['HomeTeam'] = ludf2['BatterTeam'].map(hometeamdict)
    ludf2['RoadTeam'] = ludf2['BatterTeam'].map(roadteamdict)

    ludf2_teamlist = list(ludf2["BatterTeam"])

    dhteams = []
    for x in ludf2_teamlist:
        if ludf2_teamlist.count(x) > 11:
            if x in dhteams:
                pass
            else:
                dhteams.append(x)

    extract_dh = ludf2[ludf2["BatterTeam"].isin(dhteams)]
    new_ludf2 = ludf2[~ludf2["BatterTeam"].isin(dhteams)]

    new_team_list = []
    new_home_list = []
    new_road_list = []
    runcounter = 0

    for x, home, road in zip(extract_dh["BatterTeam"].astype(str), 
                         extract_dh["HomeTeam"].astype(str), 
                         extract_dh["RoadTeam"].astype(str)):
        if runcounter < 18:
            new_team_list.append(x)
            new_home_list.append(home)
            new_road_list.append(road)
            runcounter += 1
        else:
            new_team_list.append(x + "2")
            new_home_list.append(home + "2")
            new_road_list.append(road + "2")
            runcounter += 1

    extract_dh["BatterTeam"] = new_team_list
    extract_dh["HomeTeam"] = new_home_list
    extract_dh["RoadTeam"] = new_road_list

    ludf2 = pd.concat([extract_dh, new_ludf2])
    ludf2["Opp"] = ludf2["BatterTeam"].map(oppdict)
    ludf2['SP'] = ludf2['BatterTeam'].map(opp_spname_dict)
    ludf2['SPSal'] = ludf2['BatterTeam'].map(opp_spsal_dict)
    ludf2['SPOwnership'] = ludf2['BatterTeam'].map(opp_spown_dict)
    ludf2['Date'] = todaysdate
    ludf2['Time'] = np.nan

    ludf3 = ludf2[['BatterTeam','RoadTeam','HomeTeam','Time','Spot','Player','Sal','Ownership','Date', "SP"]]

    dkdata = ludf3.copy()

    try:
        checknan = dkdata[["BatterTeam", "SP"]]
        getnans = checknan[["SP"].isna()]
        if len(getnans) == 0:
            nonans = 1
            nanmapdict = {}
        else:
            nonans = 0
            getnans["SP"] = disabled_span_list
            nanmapdict = dict(zip(getnans.Team, getnans.SP))
    except:
        pass

    try:
        dkdata["SP"] = np.where(dkdata["SP"].isna(), dkdata["BatterTeam"].map(nanmapdict), dkdata["SP"])
    except:
        pass
    
    for i in range(1, len(dkdata) - 1):
        if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i-1, 'BatterTeam']:
            if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i+1, 'BatterTeam']:
                dkdata.loc[i, 'BatterTeam'] = np.nan
                dkdata.loc[i, 'HomeTeam'] = np.nan
                dkdata.loc[i, 'RoadTeam'] = np.nan
                dkdata.loc[i, 'SP'] = np.nan

    
    dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')
    dkdata = dkdata.drop_duplicates(subset = ["BatterTeam", "SP"], keep = "first")
    dkdata = dkdata.drop(columns = ["Time", "Sal", "Ownership"])

    dkdata['BatterTeam'] = dkdata['BatterTeam'].replace('ARI', 'AZ')
    dkdata['RoadTeam'] = dkdata['RoadTeam'].replace('ARI', 'AZ')
    dkdata['HomeTeam'] = dkdata['HomeTeam'].replace('ARI', 'AZ')

    dkdata['Date'] = pd.to_datetime(dkdata['Date'])
    dkdata['Date'] = dkdata['Date'].dt.strftime('%Y-%m-%d')
    dkdata = dkdata.set_index("Date")
    dkdata = dkdata[["BatterTeam", "RoadTeam", "HomeTeam", "SP"]]

    return(dkdata)

## Loads regular season data from 2022-23 to train on

In [330]:
#statcast(start_dt = "2022-04-07", end_dt = "2022-10-05")
#statcast(start_dt = "2023-03-30", end_dt = "2023-10-01")
savant2022 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2022.csv")
savant2023 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2023.csv")

In [331]:
#pd.set_option('display.max_columns', None)
combined1 = pd.concat([savant2022, savant2023])
combined1['game_date'] = pd.to_datetime(combined1['game_date'])
combined1['game_date'] = pd.to_datetime(combined1['game_date'].dt.strftime('%Y-%m-%d'))
combined1['BatterTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['away_team'], combined1['home_team'])
combined1['PitcherTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['home_team'], combined1['away_team'])
combined1['AwayRunsScored'] = combined1['post_away_score'] - combined1['away_score']
combined1['HomeRunsScored'] = combined1['post_home_score'] - combined1['home_score']
combined1["player_name"] = combined1["player_name"].apply(flip_names)
combined1["player_name"] = combined1["player_name"].apply(replace_special_chars)

## Creating a DF where it only loads in the stats of starting pitchers

In [332]:
# Group by game and team identifiers
groupby_cols = ['game_date', 'BatterTeam', 'away_team', 'home_team']

# Function to keep only the starter's data
def keep_starter(group):
    starter_name = group['player_name'].iloc[0]
    return group[group['player_name'] == starter_name]

def count_outs(x):
    single_outs = ['other_out', 'strikeout', 'field_out', "force_out", 'fielders_choice', 'fielders_choice_out', "sac_fly", "sac_bunt", "caught_stealing_2b", "caught_stealing_3b", "caught_stealing_home", "pickoff_caught_stealing_2b",  "pickoff_caught_stealing_3b",  "pickoff_caught_stealing_home"]
    double_outs = ['double_play', 'strikeout_double_play', 'grounded_into_double_play', "sac_fly_double_play"]
    triple_outs = ['triple_play']
    
    outs = (x.isin(single_outs)).sum() + 2 * (x.isin(double_outs)).sum() + 3 * (x.isin(triple_outs)).sum()
    return outs

# Apply the function to each group
combined2 = combined1[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "BatterTeam", "MLBNAME", "events", "description", "bb_type", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_run_exp", "AwayRunsScored", "HomeRunsScored"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
df_starters_only = combined2.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)

  df_starters_only = combined2.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)


## Grouping on a pitch level for pitchers

In [333]:
Train1 = df_starters_only.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    Hit = ('events', lambda x: (x.isin(['single', 'double', 'triple', 'home_run'])).sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

## Grouping on an at bat level

In [334]:
Train2 = Train1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    Hits = ("Hit", "sum"),
    FB = ("FB", "sum"),
    HR = ("HR", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

Train2['RA'] = np.where((Train2['HomeRunsScored'] > 0) & (Train2['BatterTeam'] == Train2['home_team']), Train2["HomeRunsScored"],
                        np.where((Train2['AwayRunsScored'] > 0) & (Train2['BatterTeam'] == Train2['away_team']), Train2["AwayRunsScored"], 0))
Train2["RA/9"] = (27 * Train2["RA"] / Train2["Outs"])
Train2 = Train2.drop(columns = ["AwayRunsScored", "HomeRunsScored"])

## Calculating loads of pitcher rate stats

In [335]:
lgHR = len(combined1[combined1["events"] == "home_run"])
lgFB = len(combined1[combined1["bb_type"] == "fly_ball"])

Train2['FIP'] = (13 * Train2['HR'] + 3 * (Train2['BB'] + Train2['HBP']) - 2 * Train2['K']) / (Train2['Outs'] / 3) + 3.137
Train2['xFIP'] = (13 * (Train2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (Train2['BB'] + Train2['HBP']) - 2 * Train2['K']) / (Train2['Outs'] / 3) + 3.137

Train2['K%'] = round((Train2['K'] / Train2['AB']) * 100, 2)
Train2['BB%'] = round((Train2['BB'] / Train2['AB']) * 100, 2)
Train2['K-BB%'] = Train2["K%"] - Train2["BB%"]
Train2['Ball%'] = round((Train2['Balls'] / Train2['Pitches']) * 100, 2)
Train2['Strike%'] = round((Train2['Strikes'] / Train2['Pitches']) * 100, 2)
Train2['CS%'] = round((Train2['CS'] / Train2['Pitches']) * 100, 2)
Train2['Whiff%'] = round((Train2['Whiff'] / Train2['Pitches']) * 100, 2)
Train2["CSW"] = Train2["CS"] + Train2["Whiff"]
Train2['CSW%'] = round((Train2['CSW'] / Train2['Pitches']) * 100, 2)
Train2 = Train2.drop(columns = ["game_date", "CSW"])

## Adding rolling averages for the past 5 and 10 games

In [336]:
window_size5 = 5
window_size10 = 10
window_size20 = 20

# Rolling 5 game pitch averages
Train2['Pitches5'] = Train2.groupby('player_name')['Pitches'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2 = Train2.drop(Train2[Train2['Pitches5'] < 40].index)
# Rolling 5 and 10 game outs averages
Train2['Outs5'] = Train2.groupby('player_name')['Outs'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['Outs10'] = Train2.groupby('player_name')['Outs'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected batting averages
Train2['xBA5'] = Train2.groupby('player_name')['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xBA10'] = Train2.groupby('player_name')['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected wOBA averages
Train2['xwOBA5'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xwOBA10'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game wOBA averages
Train2['wOBA5'] = Train2.groupby('player_name')['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['wOBA10'] = Train2.groupby('player_name')['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game RA averages
Train2['RA5'] = Train2.groupby('player_name')['RA/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['RA10'] = Train2.groupby('player_name')['RA/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game FIP averages
Train2['FIP5'] = Train2.groupby('player_name')['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['FIP10'] = Train2.groupby('player_name')['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game xFIP averages
Train2['xFIP5'] = Train2.groupby('player_name')['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xFIP10'] = Train2.groupby('player_name')['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K% averages
Train2['K%5'] = Train2.groupby('player_name')['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['K%10'] = Train2.groupby('player_name')['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game BB% averages
Train2['BB%5'] = Train2.groupby('player_name')['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['BB%10'] = Train2.groupby('player_name')['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K-BB% averages
Train2['K-BB%5'] = Train2.groupby('player_name')['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['K-BB%10'] = Train2.groupby('player_name')['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Ball% averages
Train2['Ball%5'] = Train2.groupby('player_name')['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Strike% averages
Train2['Strike%5'] = Train2.groupby('player_name')['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike% averages
Train2['CS%5'] = Train2.groupby('player_name')['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['CS%10'] = Train2.groupby('player_name')['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Whiff% averages
Train2['Whiff%5'] = Train2.groupby('player_name')['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['Whiff%10'] = Train2.groupby('player_name')['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Train2['CSW%5'] = Train2.groupby('player_name')['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['CSW%10'] = Train2.groupby('player_name')['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)

Train3 = Train2.drop(columns = ["FB", "Balls", "HBP", "CS", "Whiff", "Strikes", 'Ball%', 'Strike%', 'CS%', 'Whiff%', 'CSW%', "RA/9"])
Train3 = Train3.rename(columns={'away_team': 'RoadTeam', 'home_team': 'HomeTeam', "player_name": "SP"})

## Grouping on a pitch level for batters

In [337]:
BatterTrain1 = combined2.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

## Grouping on an at bat level

In [338]:
BatterTrain2 = BatterTrain1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    FB = ("FB", "sum"),
    HR = ("HR", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

BatterTrain2['RA'] = np.where((BatterTrain2['HomeRunsScored'] > 0) & (BatterTrain2['BatterTeam'] == BatterTrain2['home_team']), BatterTrain2["HomeRunsScored"],
                        np.where((BatterTrain2['AwayRunsScored'] > 0) & (BatterTrain2['BatterTeam'] == BatterTrain2['away_team']), BatterTrain2["AwayRunsScored"], 0))
BatterTrain2["R/9"] = (27 * BatterTrain2["RA"] / BatterTrain2["Outs"])
BatterTrain2 = BatterTrain2.drop(columns = ["AwayRunsScored", "HomeRunsScored", "RA"])

In [339]:
BatterTrain2['FIP'] = (13 * BatterTrain2['HR'] + 3 * (BatterTrain2['BB'] + BatterTrain2['HBP']) - 2 * BatterTrain2['K']) / (BatterTrain2['Outs'] / 3) + 3.137
BatterTrain2['xFIP'] = (13 * (BatterTrain2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (BatterTrain2['BB'] + BatterTrain2['HBP']) - 2 * BatterTrain2['K']) / (BatterTrain2['Outs'] / 3) + 3.137

BatterTrain2['K%'] = round((BatterTrain2['K'] / BatterTrain2['AB']) * 100, 2)
BatterTrain2['BB%'] = round((BatterTrain2['BB'] / BatterTrain2['AB']) * 100, 2)
BatterTrain2['K-BB%'] = BatterTrain2["K%"] - BatterTrain2["BB%"]
BatterTrain2['Ball%'] = round((BatterTrain2['Balls'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['Strike%'] = round((BatterTrain2['Strikes'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['CS%'] = round((BatterTrain2['CS'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['Whiff%'] = round((BatterTrain2['Whiff'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2["CSW"] = BatterTrain2["CS"] + BatterTrain2["Whiff"]
BatterTrain2['CSW%'] = round((BatterTrain2['CSW'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2 = BatterTrain2.drop(columns = ["CSW"])

In [340]:
# Rolling 5 and 10 game expected batting averages
Train3['bxBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game expected wOBA averages
Train3['bxwOBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxwOBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game wOBA averages
Train3['bwOBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bwOBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game RA averages
Train3['bRS5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bRS10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game FIP averages
Train3['bFIP5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bFIP10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game xFIP averages
Train3['bxFIP5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxFIP10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K% averages
Train3['bK%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bK%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game BB% averages
Train3['bBB%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bBB%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K-BB% averages
Train3['bK-BB%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bK-BB%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Ball% averages
Train3['bBall%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Strike% averages
Train3['bStrike%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike% averages
Train3['bCS%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bCS%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Whiff% averages
Train3['bWhiff%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bWhiff%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Train3['bCSW%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bCSW%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)

Train3 = Train3.replace([float('inf'), -float('inf')], 5)

## Loads in today's data

In [341]:
TodaysData = getDKData2024()
TodaysData

  ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract_dh["BatterTeam"] = new_team_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract_dh["HomeTeam"] = new_home_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract_dh["RoadTeam"] = new_road_list
  dkdata[["BatterTe

Unnamed: 0_level_0,BatterTeam,RoadTeam,HomeTeam,SP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-08-09,CLE,CLE,MIN,Bailey Ober
2024-08-09,MIN,CLE,MIN,Alex Cobb
2024-08-09,CLE2,CLE2,MIN2,Louie Varland
2024-08-09,MIN2,CLE2,MIN2,Alex Cobb
2024-08-09,LAA,LAA,WSH,Mitchell Parker
2024-08-09,WSH,LAA,WSH,Jose Soriano
2024-08-09,BAL,BAL,TB,Zack Littell
2024-08-09,TB,BAL,TB,Zach Eflin
2024-08-09,TEX,TEX,NYY,Carlos Rodon
2024-08-09,NYY,TEX,NYY,Cody Bradford


In [342]:
eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
savant2024 = statcast(start_dt = "2024-03-28", end_dt = eastern_time.strftime("%Y-%m-%d"))
savant2024['game_date'] = pd.to_datetime(savant2024['game_date'])
savant2024['game_date'] = savant2024['game_date'].dt.strftime('%Y-%m-%d')
savant2024['BatterTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['away_team'], savant2024['home_team'])
savant2024['PitcherTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['home_team'], savant2024['away_team'])
savant2024 = pd.merge(savant2024, ID[["MLBID", "MLBNAME"]], left_on = 'batter', right_on = 'MLBID', how = 'left')
savant2024.dropna(subset=['MLBNAME'], inplace=True)
savant2024 = savant2024.drop_duplicates(subset = ["pitch_type", "game_date", "release_speed", "release_pos_x", "release_pos_z", "player_name"], keep='first')
savant2024["player_name"] = savant2024["player_name"].apply(flip_names)
savant2024['AwayRunsScored'] = savant2024['post_away_score'] - savant2024['away_score']
savant2024['HomeRunsScored'] = savant2024['post_home_score'] - savant2024['home_score']
savant2024 = savant2024[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "pitch_type", "BatterTeam", "MLBNAME", "balls", "strikes", "outs_when_up", "events", "description", "bb_type", "hit_distance_sc", "launch_speed", "launch_angle", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score", "AwayRunsScored", "HomeRunsScored"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
savant2024 = savant2024.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)
savant2024["player_name"] = savant2024["player_name"].apply(replace_special_chars)

This is a large query, it may take a moment to complete


100%|██████████| 135/135 [01:26<00:00,  1.56it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)
  savant2024 = savant2024.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)


## Does the same grouping as the training data at the various levels

In [343]:
Season1 = savant2024.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    Hit = ('events', lambda x: (x.isin(['single', 'double', 'triple', 'home_run'])).sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

In [344]:
Season2 = Season1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    Hits = ("Hit", "sum"),
    FB = ("FB", "sum"),
    HR = ("HR", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

Season2["IP"] = Season2["Outs"] / 3
Season2['RA'] = np.where((Season2['HomeRunsScored'] > 0) & (Season2['BatterTeam'] == Season2['home_team']), Season2["HomeRunsScored"],
                        np.where((Season2['AwayRunsScored'] > 0) & (Season2['BatterTeam'] == Season2['away_team']),Season2["AwayRunsScored"], 0))
Season2["RA/9"] = (27 * Season2["RA"] / Season2["Outs"])
Season2 = Season2.drop(columns = ["AwayRunsScored", "HomeRunsScored"])

In [345]:
Season2['FIP'] = (13 * Season2['HR'] + 3 * (Season2['BB'] + Season2['HBP']) - 2 * Season2['K']) / (Season2['IP']) + 3.137
Season2['xFIP'] = (13 * (Season2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (Season2['BB'] + Season2['HBP']) - 2 * Season2['K']) / (Season2['IP']) + 3.137

Season2['K%'] = round((Season2['K'] / Season2['AB']) * 100, 2)
Season2['BB%'] = round((Season2['BB'] / Season2['AB']) * 100, 2)
Season2['K-BB%'] = Season2["K%"] - Season2["BB%"]
Season2['Ball%'] = round((Season2['Balls'] / Season2['Pitches']) * 100, 2)
Season2['Strike%'] = round((Season2['Strikes'] / Season2['Pitches']) * 100, 2)
Season2['CS%'] = round((Season2['CS'] / Season2['Pitches']) * 100, 2)
Season2['Whiff%'] = round((Season2['Whiff'] / Season2['Pitches']) * 100, 2)
Season2["CSW"] = Season2["CS"] + Season2["Whiff"]
Season2['CSW%'] = round((Season2['CSW'] / Season2['Pitches']) * 100, 2)
Season2 = Season2.drop(columns=["CSW"])

In [346]:
# Rolling 5 game pitch averages
Season2['Pitches5'] = Season2.groupby('player_name')['Pitches'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2 = Season2.drop(Season2[Season2['Pitches5'] < 40].index)
# Rolling 5 and 10 game outs averages
Season2['Outs5'] = Season2.groupby('player_name')['Outs'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['Outs10'] = Season2.groupby('player_name')['Outs'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected batting averages
Season2['xBA5'] = Season2.groupby('player_name')['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['xBA10'] = Season2.groupby('player_name')['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected wOBA averages
Season2['xwOBA5'] = Season2.groupby('player_name')['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['xwOBA10'] = Season2.groupby('player_name')['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game wOBA averages
Season2['wOBA5'] = Season2.groupby('player_name')['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['wOBA10'] = Season2.groupby('player_name')['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game RA averages
Season2['RA5'] = Season2.groupby('player_name')['RA/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['RA10'] = Season2.groupby('player_name')['RA/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game FIP averages
Season2['FIP5'] = Season2.groupby('player_name')['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['FIP10'] = Season2.groupby('player_name')['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game xFIP averages
Season2['xFIP5'] = Season2.groupby('player_name')['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['xFIP10'] = Season2.groupby('player_name')['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K% averages
Season2['K%5'] = Season2.groupby('player_name')['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['K%10'] = Season2.groupby('player_name')['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game BB% averages
Season2['BB%5'] = Season2.groupby('player_name')['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['BB%10'] = Season2.groupby('player_name')['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K-BB% averages
Season2['K-BB%5'] = Season2.groupby('player_name')['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['K-BB%10'] = Season2.groupby('player_name')['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Ball% averages
Season2['Ball%5'] = Season2.groupby('player_name')['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Strike% averages
Season2['Strike%5'] = Season2.groupby('player_name')['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike% averages
Season2['CS%5'] = Season2.groupby('player_name')['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['CS%10'] = Season2.groupby('player_name')['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Whiff% averages
Season2['Whiff%5'] = Season2.groupby('player_name')['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['Whiff%10'] = Season2.groupby('player_name')['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Season2['CSW%5'] = Season2.groupby('player_name')['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season2['CSW%10'] = Season2.groupby('player_name')['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)

Season3 = Season2.drop(columns = ["FB", "Balls", "HBP", "CS", "Whiff", "Strikes", 'Ball%', 'Strike%', 'CS%', 'Whiff%', 'CSW%', "RA/9"])
Season3 = Season3.rename(columns={'away_team': 'RoadTeam', 'home_team': 'HomeTeam', "player_name": "SP"})

## Grouping everything to get season and rolling averages for pitchers

In [354]:
Season4 = Season3.groupby(["SP", "p_throws"]).agg(
    Starts = ("IP", "size"),
    Pitches = ("Pitches", "mean"),
    AB = ("AB", "mean"),
    PA = ("PA", "mean"),
    Outs = ("Outs", "mean"),
    Hits = ("Hits", "mean"),
    HR = ("HR", "mean"),
    BB = ("BB", "mean"),
    K = ("K", "mean"),
    RA = ("RA", "mean"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    FIP = ("FIP", "mean"),
    xFIP = ("xFIP", "mean"),
    Kpercent = ("K%", "mean"),
    BBpercent = ("BB%", "mean"),
    KminusBBpercent = ("K-BB%", "mean"),
    Pitches5 =  ("Pitches5", "last"),
    Outs5 =  ("Outs5", "last"),
    Outs10 = ("Outs10", "last"),
    xBA5 =  ("xBA5", "last"),
    xBA10 = ("xBA10", "last"),
    xwOBA5 =  ("xwOBA5", "last"),
    xwOBA10 = ("xwOBA10", "last"),
    wOBA5 =  ("wOBA5", "last"),
    wOBA10 = ("wOBA10", "last"),
    RA5 = ("RA5", "last"),
    RA10 = ("RA10", "last"),
    FIP5 = ("FIP5", "last"),
    FIP10 = ("FIP10", "last"),
    xFIP5 = ("xFIP5", "last"),
    xFIP10 = ("xFIP10", "last"),
    Kpercent5 = ("K%5", "last"),
    Kpercent10 = ("K%10", "last"),
    BBpercent5 = ("BB%5", "last"),
    BBpercent10 = ("BB%10", "last"),
    KminusBBpercent5 = ("K-BB%5", "last"),
    KminusBBpercent10 = ("K-BB%10", "last"),
    Ballpercent5 = ("Ball%5", "last"),
    Strikepercent5 = ("Strike%5", "last"),
    CSpercent5 = ("CS%5", "last"),
    CSpercent10 = ("CS%10", "last"),
    Whiffpercent5 = ("Whiff%5", "last"),
    Whiffpercent10 = ("Whiff%10", "last"),
    CSWpercent5 = ("CSW%5", "last"),
    CSWpercent10 = ("CSW%10", "last")).reset_index().fillna(0)

Season4.rename(columns={col: col.replace('percent', '%') for col in Season4.columns if 'percent' in col}, inplace=True)
Season4.rename(columns={col: col.replace('minus', '-') for col in Season4.columns if 'minus' in col}, inplace=True)

In [355]:
BatterSeason1 = savant2024.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

In [356]:
BatterSeason2 = BatterSeason1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "p_throws"]).agg(
    Pitches=("Pitches", "sum"),
    AB=("at_bat_number", "size"),
    PA=("PA", "sum"),
    Outs=("Outs", "sum"),
    FB=("FB", "sum"),
    HR=("HR", "sum"),
    HBP=("HBP", "sum"),
    Balls=("Balls", "sum"),
    BB=("BB", "sum"),
    CS=("CS", "sum"),
    Whiff=("Whiff", "sum"),
    Strikes=("Strikes", "sum"),
    K=("K", "sum"),
    xBA=("xBA", "mean"),
    xwOBA=("xwOBA", "mean"),
    wOBA=("wOBA", "mean"),
    RunExp=("RunExp", "mean"),
    AwayRunsScored=("AwayRunsScored", "sum"),
    HomeRunsScored=("HomeRunsScored", "sum")).reset_index().fillna(0)

BatterSeason2['RA'] = np.where((BatterSeason2['HomeRunsScored'] > 0) & (BatterSeason2['BatterTeam'] == BatterSeason2['home_team']),
    BatterSeason2["HomeRunsScored"], np.where((BatterSeason2['AwayRunsScored'] > 0) & (BatterSeason2['BatterTeam'] == BatterSeason2['away_team']), BatterSeason2["AwayRunsScored"], 0))
BatterSeason2["R/9"] = (27 * BatterSeason2["RA"] / BatterSeason2["Outs"])
BatterSeason2 = BatterSeason2.drop(columns=["AwayRunsScored", "HomeRunsScored", "RA"])

In [357]:
BatterSeason2['FIP'] = (13 * BatterSeason2['HR'] + 3 * (BatterSeason2['BB'] + BatterSeason2['HBP']) - 2 * BatterSeason2['K']) / (BatterSeason2['Outs'] / 3) + 3.137
BatterSeason2['xFIP'] = (13 * (BatterSeason2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (BatterSeason2['BB'] + BatterSeason2['HBP']) - 2 * BatterSeason2['K']) / (BatterSeason2['Outs'] / 3) + 3.137

BatterSeason2['K%'] = round((BatterSeason2['K'] / BatterSeason2['AB']) * 100, 2)
BatterSeason2['BB%'] = round((BatterSeason2['BB'] / BatterSeason2['AB']) * 100, 2)
BatterSeason2['K-BB%'] = BatterSeason2["K%"] - BatterSeason2["BB%"]
BatterSeason2['Ball%'] = round((BatterSeason2['Balls'] / BatterSeason2['Pitches']) * 100, 2)
BatterSeason2['Strike%'] = round((BatterSeason2['Strikes'] / BatterSeason2['Pitches']) * 100, 2)
BatterSeason2['CS%'] = round((BatterSeason2['CS'] / BatterSeason2['Pitches']) * 100, 2)
BatterSeason2['Whiff%'] = round((BatterSeason2['Whiff'] / BatterSeason2['Pitches']) * 100, 2)
BatterSeason2["CSW"] = BatterSeason2["CS"] + BatterSeason2["Whiff"]
BatterSeason2['CSW%'] = round((BatterSeason2['CSW'] / BatterSeason2['Pitches']) * 100, 2)
BatterSeason2 = BatterSeason2.drop(columns = ["CSW"])

In [358]:
# Rolling 5 and 10 game expected batting averages
Season4['bxBA5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bxBA10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game expected wOBA averages
Season4['bxwOBA5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bxwOBA10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game wOBA averages
Season4['bwOBA5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bwOBA10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game RA averages
Season4['bRS5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bRS10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game FIP averages
Season4['bFIP5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bFIP10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game xFIP averages
Season4['bxFIP5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bxFIP10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K% averages
Season4['bK%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bK%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game BB% averages
Season4['bBB%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bBB%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K-BB% averages
Season4['bK-BB%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bK-BB%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Ball% averages
Season4['bBall%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Strike% averages
Season4['bStrike%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike% averages
Season4['bCS%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bCS%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Whiff% averages
Season4['bWhiff%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bWhiff%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Season4['bCSW%5'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season4['bCSW%10'] = BatterSeason2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)

## Joining both the pitcher averages and the batter rolling averages

In [359]:
TodaysData.dropna(subset=['SP'], inplace=True)
TodaysData1 = pd.merge(TodaysData, Season4[['SP', 'p_throws', 'Starts', 'Pitches', 'AB', 'PA', 'Outs', "Hits", 'HR', 'BB',
       'K', 'RA', 'xBA', 'xwOBA', 'wOBA', 'RunExp', 'FIP', 'xFIP', 'K%', 'BB%',
       'K-BB%', 'Pitches5', 'Outs5', 'Outs10', 'xBA5', 'xBA10', 'xwOBA5',
       'xwOBA10', 'wOBA5', 'wOBA10', 'RA5', 'RA10', 'FIP5', 'FIP10', 'xFIP5',
       'xFIP10', 'K%5', 'K%10', 'BB%5', 'BB%10', 'K-BB%5', 'K-BB%10', 'Ball%5',
       'Strike%5', 'CS%5', 'CS%10', 'Whiff%5', 'Whiff%10', 'CSW%5', 'CSW%10',
       'bxBA5', 'bxBA10', 'bxwOBA5', 'bxwOBA10', 'bwOBA5', 'bwOBA10', 'bRS5',
       'bRS10', 'bFIP5', 'bFIP10', 'bxFIP5', 'bxFIP10', 'bK%5', 'bK%10',
       'bBB%5', 'bBB%10', 'bK-BB%5', 'bK-BB%10', 'bBall%5', 'bStrike%5',
       'bCS%5', 'bCS%10', 'bWhiff%5', 'bWhiff%10', 'bCSW%5', 'bCSW%10']], left_on = ['SP'], right_on = ['SP'], how = 'left')

# If no 2024 savant data exists then gives them the league averages from 2022-23
TrainMeans = Train3.drop(['BatterTeam', 'RoadTeam', "HomeTeam", "SP", "p_throws"], axis=1).mean()
TodaysData1 = TodaysData1.fillna(TrainMeans)
TodaysData1 = TodaysData1.replace([float('inf'), -float('inf')], 5)

## Encodes the teams and players allowing to be fed into the algorithms

In [360]:
# Ensure Train5 and TodaysData2 are copies of Train4 and TodaysData1 respectively
Train5 = Train3.copy()
TodaysData2 = TodaysData1.copy()

# Dictionary to store the label encoders
label_encoders = {}

# Encode non-numeric columns in Train4
non_numeric_columns_train = Train5.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_train:
    label_encoder = LabelEncoder()
    Train5[col] = label_encoder.fit_transform(Train5[col])
    label_encoders[col] = label_encoder

# Ensure all non-numeric columns in Train4 are in TodaysData1
for col in non_numeric_columns_train:
    if col not in TodaysData2.columns:
        print(f"Warning: Column {col} from training data is not present in today's data.")
        # Adding the missing column with a default value
        TodaysData2[col] = 536

# Encode non-numeric columns in TodaysData1 using the same encoders
non_numeric_columns_today = TodaysData2.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        unique_values = set(label_encoder.classes_)
        encoded_values = []
        for item in TodaysData2[col]:
            if item in unique_values:
                encoded_values.append(label_encoder.transform([item])[0])
            else:
                encoded_values.append(536)  # Using 536 as a placeholder for unknown categories
        TodaysData2[col] = encoded_values
    else:
        print(f"Warning: Column {col} is not present in the training data.")
        # Fit a new label encoder for columns not present in Train4, but be cautious with this
        label_encoder = LabelEncoder()
        TodaysData2[col] = label_encoder.fit_transform(TodaysData2[col])
        label_encoders[col] = label_encoder

## K model

In [361]:
TrainFeatures = Train5.drop(columns = ["K"]).values.reshape(-1, 77)
TrainLabel = Train5["K"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["Starts", "K"]).values.reshape(-1, 77)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 4)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["xK"] = RFpred

  return fit_method(estimator, *args, **kwargs)


## BB Model

In [362]:
TrainFeatures = Train5.drop(columns = ["BB"]).values.reshape(-1, 77)
TrainLabel = Train5["BB"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["Starts", "BB", "xK"]).values.reshape(-1, 77)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 3)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["xBB"] = RFpred

  return fit_method(estimator, *args, **kwargs)


## Hits Model

In [364]:
TrainFeatures = Train5.drop(columns = ["Hits"]).values.reshape(-1, 77)
TrainLabel = Train5["Hits"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["Starts", "BB", "xK", "xBB"]).values.reshape(-1, 77)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 3)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["xHA"] = RFpred

  return fit_method(estimator, *args, **kwargs)


## RA Model

In [365]:
TrainFeatures = Train5.drop(columns = ["RA"]).values.reshape(-1, 77)
TrainLabel = Train5["RA"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["Starts", "BB", "xK", "xBB", "xHA"]).values.reshape(-1, 77)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 3)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["xRA"] = RFpred

  return fit_method(estimator, *args, **kwargs)


## IP model

In [366]:
TrainFeatures = Train5.drop(columns = ["Outs"]).values.reshape(-1, 77)
TrainLabel = Train5["Outs"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["Starts", "Outs", "xK", "xBB", "xHA", "xRA"]).values.reshape(-1, 77)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 3)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["xOuts"] = RFpred

  return fit_method(estimator, *args, **kwargs)


## Reverse encodes today's data so it can be understood

In [367]:
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        # Handling default value of 536
        TodaysData2[col] = TodaysData2[col].apply(lambda x: label_encoder.inverse_transform([x])[0] if x != 536 else np.nan)

TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)


In [368]:
def add_2_to_duplicates(df):
    mask = df.duplicated(subset=['BatterTeam', 'RoadTeam', 'HomeTeam'], keep='first')
    
    df.loc[mask, 'BatterTeam'] += '2'
    df.loc[mask, 'RoadTeam'] += '2'
    df.loc[mask, 'HomeTeam'] += '2'
    
    return df

TodaysData2 = add_2_to_duplicates(TodaysData2)

## Creates simple dataset to see the expected stats of all the predicted metrics

In [379]:
# Performs a two "sample" z test to see the likelihood of a quality start occurring
def row_z_score(row):
    x_bar1 = row["xRA"]
    x_bar2 = row["xOuts"]
    mu1 = 3
    mu2 = 18
    sigma1 = Train2["RA"].std()
    sigma2 = Train2["Outs"].std()
    n1 = len(TodaysData2)
    n2 = n1
    
    # Calculate z-scores
    z_score_xRA = (x_bar1 - mu1) / (sigma1 / np.sqrt(n1))
    z_score_xOuts = (x_bar2 - mu2) / (sigma2 / np.sqrt(n2))
    
    # Calculate probabilities, want less than
    prob_xRA_less_than_mu1 = 1 - stats.norm.cdf(z_score_xRA)
    prob_xOuts_greater_than_mu2 = stats.norm.cdf(z_score_xOuts)

    # Combine probabilities by multiplying it
    combined_prob = prob_xRA_less_than_mu1 * prob_xOuts_greater_than_mu2
    return combined_prob

# Calculate the chance of a quality start occurring and finds the xQS points
percent = TodaysData2.apply(row_z_score, axis=1)
xQS = percent * 5

# Adds the xQS to the xFS
TodaysData2.loc[:, "xFS"] = (xQS + TodaysData2["xK"] * 3 + TodaysData2["xOuts"] - TodaysData2["xRA"] * 3)
TodaysData3 = TodaysData2[["BatterTeam", "SP", "Starts", "xK", "xBB", "xHA", "xRA", "xOuts", "xFS"]].round(2)
TodaysData3.sort_values("xFS")

Unnamed: 0,BatterTeam,SP,Starts,xK,xBB,xHA,xRA,xOuts,xFS
2,,Louie Varland,5.0,4.0,2.55,5.39,3.7,12.78,13.69
22,ATL,Tanner Gordon,4.0,4.0,1.0,4.73,3.76,15.16,15.87
13,MIA,Martin Perez,17.0,4.0,2.0,5.9,3.06,15.2,18.02
24,PHI,Ryne Nelson,19.0,4.0,1.0,5.77,3.07,16.32,19.14
27,SEA,Jose Quintana,21.0,4.01,2.0,5.4,2.5,14.93,19.45
4,LAA,Mitchell Parker,20.0,4.01,2.0,5.72,2.55,15.47,19.83
17,KC,Miles Mikolas,23.0,3.95,1.0,5.78,2.87,16.55,19.88
16,STL,Michael Lorenzen,18.0,4.0,3.0,4.37,2.36,15.38,20.29
11,TOR,Mitch Spence,14.0,4.02,2.0,5.06,2.61,16.88,21.42
1,MIN,Alex Cobb,,4.77,2.0,3.95,2.39,14.58,21.72


## Using Prizepicks API to load in data to see lines to find edge

In [370]:
def call_endpoint(url, max_level=3, include_new_player_attributes=False):
    '''
    takes: 
        - url (str): the API endpoint to call
        - max_level (int): level of json normalizing to apply
        - include_player_attributes (bool): whether to include player object attributes in the returned dataframe
    returns:
        - df (pd.DataFrame): a dataframe of the call response content
    '''
    resp = requests.get(url).json()
    data = pd.json_normalize(resp['data'], max_level=max_level)
    included = pd.json_normalize(resp['included'], max_level=max_level)
    if include_new_player_attributes:
        inc_cop = included[included['type'] == 'new_player'].copy().dropna(axis=1)
        data = pd.merge(data
                        , inc_cop
                        , how='left'
                        , left_on=['relationships.new_player.data.id'
                                   ,'relationships.new_player.data.type']
                        , right_on=['id', 'type']
                        , suffixes=('', '_new_player'))
    return data

url = 'https://partner-api.prizepicks.com/projections?league_id=2&per_page=1000'
df = call_endpoint(url, include_new_player_attributes=True)

prizepicks1 = df[["attributes.description", "attributes.line_score", "attributes.odds_type", "attributes.stat_type", "attributes.name", "attributes.position", "attributes.team"]]
prizepicks1 = prizepicks1[(prizepicks1["attributes.position"] == "P") & ((prizepicks1["attributes.odds_type"] == "standard") | (prizepicks1["attributes.odds_type"] == "goblin") | (prizepicks1["attributes.odds_type"] == "demon"))].drop(columns = "attributes.position")
prizepicks2 = prizepicks1.rename(columns={'attributes.name': 'SP', 'attributes.description': 'BatterTeam', 'attributes.team': 'PitcherTeam',  "attributes.line_score": "Line", "attributes.stat_type": "Stat", "attributes.odds_type": "Type"})

## Strikeouts

In [371]:
StrikeoutLine = prizepicks2[(prizepicks2["Stat"] == "Pitcher Strikeouts") & (prizepicks2["Type"] == "standard")]
Strikeouts1 = TodaysData3[["BatterTeam", "SP", "Starts", "xK"]]
Strikeouts2 = pd.merge(Strikeouts1, StrikeoutLine[["SP", "Line", "Stat", "Type"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
Strikeouts2["Edge"] = Strikeouts2["xK"] - Strikeouts2["Line"]
Strikeouts2["Edge%"] = abs(Strikeouts2["Edge"] / Strikeouts2["Line"]) * 100
Strikeouts2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xK,Line,Stat,Type,Edge,Edge%
4,LAA,Mitchell Parker,20.0,4.01,4.0,Pitcher Strikeouts,standard,0.01,0.25
11,TOR,Mitch Spence,14.0,4.02,4.0,Pitcher Strikeouts,standard,0.02,0.5
18,CIN,Aaron Civale,22.0,4.96,5.0,Pitcher Strikeouts,standard,-0.04,0.8
15,BOS,Ronel Blanco,21.0,5.07,5.0,Pitcher Strikeouts,standard,0.07,1.4
28,PIT,Jack Flaherty,18.0,6.72,7.0,Pitcher Strikeouts,standard,-0.28,4.0
21,CWS,Jameson Taillon,19.0,4.22,4.0,Pitcher Strikeouts,standard,0.22,5.5
6,BAL,Zack Littell,22.0,4.81,4.5,Pitcher Strikeouts,standard,0.31,6.89
26,NYM,Bryce Miller,22.0,4.99,5.5,Pitcher Strikeouts,standard,-0.51,9.27
19,MIL,Carson Spiers,7.0,4.96,4.5,Pitcher Strikeouts,standard,0.46,10.22
16,STL,Michael Lorenzen,18.0,4.0,3.5,Pitcher Strikeouts,standard,0.5,14.29


# Walks

In [372]:
WalkLine = prizepicks2[(prizepicks2["Stat"] == "Walks Allowed") & (prizepicks2["Type"] == "standard")]
Walks1 = TodaysData3[["BatterTeam", "SP", "Starts", "xBB"]]
Walks2 = pd.merge(Walks1, WalkLine[["SP", "Line", "Stat", "Type"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
Walks2["Edge"] = Walks2["xBB"] - Walks2["Line"]
Walks2["Edge%"] = abs(Walks2["Edge"] / Walks2["Line"]) * 100
Walks2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xBB,Line,Stat,Type,Edge,Edge%
29,LAD,Mitch Keller,22.0,1.01,1.5,Walks Allowed,standard,-0.49,32.67
19,MIL,Carson Spiers,7.0,1.0,1.5,Walks Allowed,standard,-0.5,33.33
30,DET,Robbie Ray,3.0,2.0,1.5,Walks Allowed,standard,0.5,33.33


## Hits Allowed

In [383]:
HitLine = prizepicks2[(prizepicks2["Stat"] == "Hits Allowed") & ((prizepicks2["Type"] == "standard") | (prizepicks2["Type"] == "demon"))]
Hits1 = TodaysData3[["BatterTeam", "SP", "Starts", "xHA"]]
Hits2 = pd.merge(Hits1, HitLine[["SP", "Line", "Stat", "Type"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
Hits2["Edge"] = Hits2["xHA"] - Hits2["Line"]
Hits2 = Hits2[~((Hits2["Type"] == "demon") | (Hits2["Type"] == "goblin") & (Hits2["Edge"] < 0))]
Hits2["Edge%"] = abs(Hits2["Edge"] / Hits2["Line"]) * 100
Hits2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xHA,Line,Stat,Type,Edge,Edge%
6,BAL,Zack Littell,22.0,5.57,5.5,Hits Allowed,standard,0.07,1.27
28,PIT,Jack Flaherty,18.0,4.62,4.5,Hits Allowed,standard,0.12,2.67
30,DET,Robbie Ray,3.0,3.15,4.5,Hits Allowed,standard,-1.35,30.0


## Runs Allowed

In [384]:
RunLine = prizepicks2[(prizepicks2["Stat"] == "Earned Runs Allowed") & ((prizepicks2["Type"] == "standard") | (prizepicks2["Type"] == "demon"))]
Runs1 = TodaysData3[["BatterTeam", "SP", "Starts", "xRA"]]
Runs2 = pd.merge(Runs1, RunLine[["SP", "Line", "Stat", "Type"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
Runs2["Edge"] = Runs2["xRA"] - Runs2["Line"]
Runs2 = Runs2[~((Runs2["Type"] == "demon") | (Runs2["Type"] == "goblin") & (Runs2["Edge"] < 0))]
Runs2["Edge%"] = abs(Runs2["Edge"] / Runs2["Line"]) * 100
Runs2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xRA,Line,Stat,Type,Edge,Edge%
19,MIL,Carson Spiers,7.0,2.63,2.5,Earned Runs Allowed,standard,0.13,5.2
14,HOU,Tanner Houck,22.0,2.11,2.5,Earned Runs Allowed,standard,-0.39,15.6


# Pitching Outs

In [385]:
OutLine = prizepicks2[prizepicks2["Stat"] == "Pitching Outs"]
Outs1 = TodaysData3[["BatterTeam", "SP", "Starts", "xOuts"]]
Outs2 = pd.merge(Outs1, OutLine[["SP", "Line", "Stat", "Type"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
Outs2["Edge"] = Outs2["xOuts"] - Outs2["Line"]
Outs2 = Outs2[~((Outs2["Type"] == "demon") | (Outs2["Type"] == "goblin") & (Outs2["Edge"] < 0))]
Outs2["Edge%"] = abs(Outs2["Edge"] / Outs2["Line"]) * 100
Outs2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xOuts,Line,Stat,Type,Edge,Edge%
21,CWS,Jameson Taillon,19.0,17.76,17.5,Pitching Outs,goblin,0.26,1.49
6,BAL,Zack Littell,22.0,16.61,17.0,Pitching Outs,standard,-0.39,2.29
11,TOR,Mitch Spence,14.0,16.88,16.5,Pitching Outs,standard,0.38,2.3
18,CIN,Aaron Civale,22.0,15.93,15.5,Pitching Outs,standard,0.43,2.77
26,NYM,Bryce Miller,22.0,16.75,17.5,Pitching Outs,standard,-0.75,4.29
17,KC,Miles Mikolas,23.0,16.55,17.5,Pitching Outs,standard,-0.95,5.43
19,MIL,Carson Spiers,7.0,16.63,15.5,Pitching Outs,standard,1.13,7.29


## Fantasy Score

In [386]:
FSLine = prizepicks2[prizepicks2["Stat"] == "Pitcher Fantasy Score"]
FS1 = TodaysData3[["BatterTeam", "SP", "Starts", "xFS"]]
FS2 = pd.merge(FS1, FSLine[["SP", "Line", "Stat"]], left_on = ['SP'], right_on = ['SP'], how = 'left').dropna()
FS2["Edge"] = FS2["xFS"] - FS2["Line"]
FS2["Edge%"] = abs(FS2["Edge"] / FS2["Line"]) * 100
FS2.sort_values("Edge%").round(2)

Unnamed: 0,BatterTeam,SP,Starts,xFS,Line,Stat,Edge,Edge%
25,AZ,Zack Wheeler,22.0,36.33,33.5,Pitcher Fantasy Score,2.83,8.45
21,CWS,Jameson Taillon,19.0,26.7,30.0,Pitcher Fantasy Score,-3.3,11.0
26,NYM,Bryce Miller,22.0,25.92,30.5,Pitcher Fantasy Score,-4.58,15.02
7,TB,Zach Eflin,20.0,25.46,32.5,Pitcher Fantasy Score,-7.04,21.66
30,DET,Robbie Ray,3.0,28.11,36.5,Pitcher Fantasy Score,-8.39,22.99
28,PIT,Jack Flaherty,18.0,30.18,41.0,Pitcher Fantasy Score,-10.82,26.39


In [390]:
Props1 = pd.concat([Strikeouts2, Hits2, Runs2, Outs2], ignore_index = True)
Props2 = Props1[(Props1["Edge%"] > 10) & (Props1["Starts"] > 3)]
Props2 = Props2[["BatterTeam", "SP", "Starts", "Stat", "Type", "Line", "Edge", "Edge%"]].sort_values("Edge%").round(2)
Props2

Unnamed: 0,BatterTeam,SP,Starts,Stat,Type,Line,Edge,Edge%
10,MIL,Carson Spiers,7.0,Pitcher Strikeouts,standard,4.5,0.46,10.22
7,STL,Michael Lorenzen,18.0,Pitcher Strikeouts,standard,3.5,0.5,14.29
2,TB,Zach Eflin,20.0,Pitcher Strikeouts,standard,5.0,-0.78,15.6
22,HOU,Tanner Houck,22.0,Earned Runs Allowed,standard,2.5,-0.39,15.6
17,LAD,Mitch Keller,22.0,Pitcher Strikeouts,standard,5.0,0.81,16.2
11,CHC,Garrett Crochet,23.0,Pitcher Strikeouts,standard,6.0,0.99,16.5
14,AZ,Zack Wheeler,22.0,Pitcher Strikeouts,standard,6.0,0.99,16.5
8,KC,Miles Mikolas,23.0,Pitcher Strikeouts,standard,3.0,0.95,31.67
4,SD,Edward Cabrera,11.0,Pitcher Strikeouts,standard,4.5,1.5,33.33
5,HOU,Tanner Houck,22.0,Pitcher Strikeouts,standard,4.5,1.5,33.33


In [None]:
excel_file = 'Pitcher-Tracker.xlsx'

# If deleting the mode and engine it rewrites the whole file
with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl') as writer:
    Props2.reset_index().to_excel(writer, index=False, sheet_name = eastern_time.strftime("%m-%d-%y"))