In [142]:
import pybaseball
from pybaseball import statcast
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import timezone
from bs4 import BeautifulSoup
import io
import requests
import unicodedata

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
pybaseball.cache.enable()

from RosterScraper import RosterScraper

In [143]:
url = 'https://docs.google.com/spreadsheets/d/1JgczhD5VDQ1EiXqVG-blttZcVwbZd5_Ne_mefUGwJnk/pub?output=csv'
res = requests.get(url)
ID = pd.read_csv(io.BytesIO(res.content), sep=',')
ID.dropna(subset=['MLBID'], inplace=True)
ID['MLBID'] = ID['MLBID'].astype(int)

Rosters = RosterScraper()
BID = Rosters[Rosters["Position"] == "Batter"]
PID = Rosters[Rosters["Position"] == "Pitcher"]

In [144]:
def convert_name(name):
    if name == 'Rockies':
        return 'COL'
    elif name == 'Reds':
        return 'CIN'
    elif name == 'Mariners':
        return 'SEA'
    elif name == 'Nationals':
        return 'WSH'
    elif name == 'Yankees':
        return 'NYY'
    elif name == 'Astros':
        return 'HOU'
    elif name == 'Red Sox':
        return 'BOS'
    elif name == 'Athletics':
        return 'OAK'
    elif name == 'Mets':
        return 'NYM'
    elif name == 'Braves':
        return 'ATL'
    elif name == 'Giants':
        return 'SF'
    elif name == 'Brewers':
        return 'MIL'
    elif name == 'Rays':
        return 'TB'
    elif name == 'Royals':
        return 'KC'
    elif name == 'White Sox':
        return 'CWS'
    elif name == 'Cubs':
        return 'CHC'
    elif name == 'Angels':
        return 'LAA'
    elif name == 'Tigers':
        return 'DET'
    elif name == 'Diamondbacks':
        return 'ARI'
    elif name == 'Guardians':
        return 'CLE'
    elif name == 'Orioles':
        return 'BAL'
    elif name == 'Twins':
        return 'MIN'
    elif name == 'Marlins':
        return 'MIA'
    elif name == 'Phillies':
        return 'PHI'
    elif name == 'Rangers':
        return 'TEX'
    elif name == 'Dodgers':
        return 'LAD'
    elif name == 'Padres':
        return 'SD'
    elif name == 'Pirates':
        return 'PIT'
    elif name == 'Blue Jays':
        return 'TOR'
    elif name == 'Cardinals':
        return 'STL'
    else:
        return np.nan
    
def flip_names(name):
    first_name, last_name = name.split(", ")
    return f"{last_name} {first_name}"

def replace_special_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

def append_suffix_to_duplicates(df, column):
        seen = {}
        for idx, value in enumerate(df[column]):
            if value in seen:
                seen[value] += 1
                df.at[idx, column] = f"{value}2"
            else:
                seen[value] = 1

In [145]:
def getDKData2024():
    eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
    todaysdate = eastern_time.strftime("%m-%d-%Y")
    url = 'https://rotogrinders.com/lineups/mlb?site=draftkings'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    gamelist = []
    gamecards = soup.findAll("div", {"class": "game-card-teams"})
    for x in gamecards:
        twoteams = x.findAll("span", {"class": "team-nameplate-mascot"})
        roadteam = convert_name(twoteams[0].text)
        hometeam = convert_name(twoteams[1].text)
        gamekey = "{}@{}".format(roadteam,hometeam)
        gamelist.append(gamekey)

    matchupsdf = pd.DataFrame()
    for game in gamelist:
        roadteam = game.split("@")[0]
        hometeam = game.split("@")[1]
        thisdf1 = pd.DataFrame({"Team": roadteam, "Opp": hometeam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        thisdf2 = pd.DataFrame({"Team": hometeam, "Opp": roadteam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        matchupsdf = pd.concat([matchupsdf,thisdf1,thisdf2])
        
    oppdict = dict(zip(matchupsdf.Team,matchupsdf.Opp))
    hometeamdict = dict(zip(matchupsdf.Team,matchupsdf.HomeTeam))
    roadteamdict = dict(zip(matchupsdf.Team,matchupsdf.RoadTeam))

    disabled_span_list = []
    for span in soup.findAll("span", {"class": "player-nameplate disabled"}):
        for a in span.findAll("a"):
            disabled_span_list.append(a.text)

    spdata = pd.DataFrame()
    for div in soup.findAll("span", {"class": "player-nameplate", "data-position": "SP"}):
        if "TBD" in str(div):
            playername = "TBD"
            pos = "SP"
            sal = 0
        else:
            for a in div.findAll('a', {'class': 'player-nameplate-name'}):
                playername = a.text.strip()

            strdiv = str(div)
            pos = strdiv[strdiv.find("data-position")+15:strdiv.find("data-salary")-2]
            sal = strdiv[strdiv.find("data-salary")+13:strdiv.find("<div class = 'player-nameplate-info'>")-3]
        try:
            ownership = strdiv[strdiv.find('<span class="small muted" data-auth="502">') + 42:strdiv.find('%')]
            ownership = ownership.replace("</span>", "")
            ownership = ownership.replace("</span", "")
            ownership = ownership.replace("</div>", "")
            ownership = ownership.replace(" ", "")
        except:
            ownership = np.nan

        thisspdata = pd.DataFrame([[playername, sal, ownership]], columns = ["Player", "Salary", "Ownership"])
        spdata = pd.concat([spdata, thisspdata])

    spdata['Player'] = spdata['Player'].replace('Luis Ortiz', 'Luis L. Ortiz')
    spdata['Player'] = spdata['Player'].replace('Mike King', 'Michael King')
    spdata['Player'] = spdata['Player'].replace('Robert Zastryzny', 'Rob Zastryzny')

    spdata2 = pd.merge(spdata, PID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "PitcherTeam"})
    spdata3 = pd.merge(spdata2, matchupsdf[["Team", "Opp"]], left_on = ["PitcherTeam"], right_on = ["Team"], how = "left").drop(columns = ["Team"])

    append_suffix_to_duplicates(spdata3, 'PitcherTeam')
    append_suffix_to_duplicates(spdata3, 'Opp')

    opp_spname_dict = dict(zip(spdata3.Opp, spdata3.Player))
    opp_spsal_dict = dict(zip(spdata3.Opp, spdata.Salary))
    opp_spown_dict = dict(zip(spdata3.Opp, spdata3.Ownership))

    ludf = pd.DataFrame()
    
    for li in soup.findAll("li", {"class": "lineup-card-player"}):
        for a in li.findAll("a", {"class": ["player-nameplate-name", "player-nameplate disabled"]}):
            playername = a.text

        listring = str(li)
        for span in li.find("span", {"class": "small"}):
            luspot = span.text
            luspot = luspot.replace("\n", "")
            luspot = luspot.strip()
            luspot = int(luspot)
        pos = listring[listring.find("data-position")+15:listring.find("data-salary")-2]
        sal = listring[listring.find("data-salary")+13:listring.find("<span class='small'>")-3]
        ownership = ownership.replace("</span>", "")
        ownership = ownership.replace("</span", "")
        ownership = ownership.replace("</li", "")
        ownership = ownership.replace("</div>", "")
        ownership = ownership.replace(" ", "")

        try:
            sal = int(sal)
        except:
            sal = 0
        thisludf = pd.DataFrame([[playername, luspot, sal, ownership]], columns = ["Player", "Spot", "Sal", "Ownership"])
        ludf = pd.concat([ludf, thisludf])

    ludf2 = pd.merge(ludf, BID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "BatterTeam"})
    ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
    ludf2['HomeTeam'] = ludf2['BatterTeam'].map(hometeamdict)
    ludf2['RoadTeam'] = ludf2['BatterTeam'].map(roadteamdict)

    ludf2_teamlist = list(ludf2["BatterTeam"])

    dhteams = []
    for x in ludf2_teamlist:
        if ludf2_teamlist.count(x) > 11:
            if x in dhteams:
                pass
            else:
                dhteams.append(x)

    extract_dh = ludf2[ludf2["BatterTeam"].isin(dhteams)]
    new_ludf2 = ludf2[~ludf2["BatterTeam"].isin(dhteams)]

    new_team_list = []
    new_home_list = []
    new_road_list = []
    runcounter = 0

    for x, home, road in zip(extract_dh["BatterTeam"].astype(str), 
                         extract_dh["HomeTeam"].astype(str), 
                         extract_dh["RoadTeam"].astype(str)):
        if runcounter < 18:
            new_team_list.append(x)
            new_home_list.append(home)
            new_road_list.append(road)
            runcounter += 1
        else:
            new_team_list.append(x + "2")
            new_home_list.append(home + "2")
            new_road_list.append(road + "2")
            runcounter += 1

    extract_dh["BatterTeam"] = new_team_list
    extract_dh["HomeTeam"] = new_home_list
    extract_dh["RoadTeam"] = new_road_list

    ludf2 = pd.concat([extract_dh, new_ludf2])
    ludf2["Opp"] = ludf2["BatterTeam"].map(oppdict)
    ludf2['SP'] = ludf2['BatterTeam'].map(opp_spname_dict)
    ludf2['SPSal'] = ludf2['BatterTeam'].map(opp_spsal_dict)
    ludf2['SPOwnership'] = ludf2['BatterTeam'].map(opp_spown_dict)
    ludf2['Date'] = todaysdate
    ludf2['Time'] = np.nan

    ludf3 = ludf2[['BatterTeam','RoadTeam','HomeTeam','Time','Spot','Player','Sal','Ownership','Date', "SP"]]

    dkdata = ludf3.copy()

    try:
        checknan = dkdata[["BatterTeam", "SP"]]
        getnans = checknan[["SP"].isna()]
        if len(getnans) == 0:
            nonans = 1
            nanmapdict = {}
        else:
            nonans = 0
            getnans["SP"] = disabled_span_list
            nanmapdict = dict(zip(getnans.Team, getnans.SP))
    except:
        pass

    try:
        dkdata["SP"] = np.where(dkdata["SP"].isna(), dkdata["BatterTeam"].map(nanmapdict), dkdata["SP"])
    except:
        pass
    
    for i in range(1, len(dkdata) - 1):
        if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i-1, 'BatterTeam']:
            if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i+1, 'BatterTeam']:
                dkdata.loc[i, 'BatterTeam'] = np.nan
                dkdata.loc[i, 'HomeTeam'] = np.nan
                dkdata.loc[i, 'RoadTeam'] = np.nan
                dkdata.loc[i, 'SP'] = np.nan

    
    dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')
    dkdata = dkdata.drop_duplicates(subset = ["BatterTeam", "SP"], keep = "first")
    dkdata = dkdata.drop(columns = ["Time", "Sal", "Ownership"])

    dkdata['BatterTeam'] = dkdata['BatterTeam'].replace('ARI', 'AZ')
    dkdata['RoadTeam'] = dkdata['RoadTeam'].replace('ARI', 'AZ')
    dkdata['HomeTeam'] = dkdata['HomeTeam'].replace('ARI', 'AZ')

    dkdata['Date'] = pd.to_datetime(dkdata['Date'])
    dkdata['Date'] = dkdata['Date'].dt.strftime('%Y-%m-%d')
    dkdata = dkdata.set_index("Date")
    dkdata = dkdata[["BatterTeam", "RoadTeam", "HomeTeam", "SP"]]

    return(dkdata)

In [146]:
#statcast(start_dt = "2022-04-07", end_dt = "2022-10-05")
#statcast(start_dt = "2023-03-30", end_dt = "2023-10-01")
savant2022 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2022.csv")
savant2023 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2023.csv")

In [437]:
#pd.set_option('display.max_columns', None)
combined1 = pd.concat([savant2022, savant2023])
combined1['game_date'] = pd.to_datetime(combined1['game_date'])
combined1['game_date'] = pd.to_datetime(combined1['game_date'].dt.strftime('%Y-%m-%d'))
combined1['BatterTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['away_team'], combined1['home_team'])
combined1['PitcherTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['home_team'], combined1['away_team'])
combined1['AwayRunsScored'] = combined1['post_away_score'] - combined1['away_score']
combined1['HomeRunsScored'] = combined1['post_home_score'] - combined1['home_score']
combined1["player_name"] = combined1["player_name"].apply(flip_names)
combined1["player_name"] = combined1["player_name"].apply(replace_special_chars)

In [445]:
# Group by game and team identifiers
groupby_cols = ['game_date', 'BatterTeam', 'away_team', 'home_team']

# Function to keep only the starter's data
def keep_starter(group):
    starter_name = group['player_name'].iloc[0]
    return group[group['player_name'] == starter_name]

def count_outs(x):
    single_outs = ['other_out', 'strikeout', 'field_out', "force_out", 'fielders_choice', 'fielders_choice_out', "sac_fly", "sac_bunt", "caught_stealing_2b", "caught_stealing_3b", "caught_stealing_home", "pickoff_caught_stealing_2b",  "pickoff_caught_stealing_3b",  "pickoff_caught_stealing_home"]
    double_outs = ['double_play', 'strikeout_double_play', 'grounded_into_double_play', "sac_fly_double_play"]
    triple_outs = ['triple_play']
    
    outs = (x.isin(single_outs)).sum() + 2 * (x.isin(double_outs)).sum() + 3 * (x.isin(triple_outs)).sum()
    return outs

# Apply the function to each group
combined2 = combined1[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "BatterTeam", "MLBNAME", "events", "description", "bb_type", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_run_exp", "AwayRunsScored", "HomeRunsScored"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
df_starters_only = combined2.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)

  df_starters_only = combined2.groupby(groupby_cols).apply(keep_starter, include_groups = True).reset_index(drop = True)


## Pitcher group by

In [447]:
Train1 = df_starters_only.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

In [468]:
Train2 = Train1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    FB = ("FB", "sum"),
    HR = ("HR", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

Train2['RA'] = np.where((Train2['HomeRunsScored'] > 0) & (Train2['BatterTeam'] == Train2['home_team']), Train2["HomeRunsScored"],
                        np.where((Train2['AwayRunsScored'] > 0) & (Train2['BatterTeam'] == Train2['away_team']), Train2["AwayRunsScored"], 0))
Train2["RA/9"] = (27 * Train2["RA"] / Train2["Outs"])
Train2 = Train2.drop(columns = ["AwayRunsScored", "HomeRunsScored", "RA"])

In [469]:
lgHR = len(combined1[combined1["events"] == "home_run"])
lgFB = len(combined1[combined1["bb_type"] == "fly_ball"])

Train2['FIP'] = (13 * Train2['HR'] + 3 * (Train2['BB'] + Train2['HBP']) - 2 * Train2['K']) / (Train2['Outs'] / 3) + 3.137
Train2['xFIP'] = (13 * (Train2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (Train2['BB'] + Train2['HBP']) - 2 * Train2['K']) / (Train2['Outs'] / 3) + 3.137

Train2['K%'] = round((Train2['K'] / Train2['AB']) * 100, 2)
Train2['BB%'] = round((Train2['BB'] / Train3['AB']) * 100, 2)
Train2['K-BB%'] = Train2["K%"] - Train2["BB%"]
Train2['Ball%'] = round((Train2['Balls'] / Train2['Pitches']) * 100, 2)
Train2['Strike%'] = round((Train2['Strikes'] / Train2['Pitches']) * 100, 2)
Train2['CS%'] = round((Train2['CS'] / Train2['Pitches']) * 100, 2)
Train2['Whiff%'] = round((Train2['Whiff'] / Train2['Pitches']) * 100, 2)
Train2["CSW"] = Train2["CS"] + Train2["Whiff"]
Train2['CSW%'] = round((Train2['CSW'] / Train2['Pitches']) * 100, 2)
Train2 = Train2.drop(columns = ["CSW"])

In [471]:
window_size5 = 5
window_size10 = 10
window_size20 = 20

# Rolling 5 game pitch averages
Train2['Pitches5'] = Train2.groupby('player_name')['Pitches'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2 = Train2.drop(Train2[Train2['Pitches5'] < 40].index)
# Rolling 5 and 10 game outs averages
Train2['Outs5'] = Train2.groupby('player_name')['Outs'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['Outs10'] = Train2.groupby('player_name')['Outs'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected batting averages
Train2['xBA5'] = Train2.groupby('player_name')['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xBA10'] = Train2.groupby('player_name')['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game expected wOBA averages
Train2['xwOBA5'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xwOBA10'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game wOBA averages
Train2['wOBA5'] = Train2.groupby('player_name')['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['wOBA10'] = Train2.groupby('player_name')['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game RA averages
Train2['RA5'] = Train2.groupby('player_name')['RA/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['RA10'] = Train2.groupby('player_name')['RA/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game FIP averages
Train2['FIP5'] = Train2.groupby('player_name')['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['FIP10'] = Train2.groupby('player_name')['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game xFIP averages
Train2['xFIP5'] = Train2.groupby('player_name')['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['xFIP10'] = Train2.groupby('player_name')['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K% averages
Train2['K%5'] = Train2.groupby('player_name')['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['K%10'] = Train2.groupby('player_name')['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game BB% averages
Train2['BB%5'] = Train2.groupby('player_name')['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['BB%10'] = Train2.groupby('player_name')['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game K-BB% averages
Train2['K-BB%5'] = Train2.groupby('player_name')['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['K-BB%10'] = Train2.groupby('player_name')['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Ball% averages
Train2['Ball%5'] = Train2.groupby('player_name')['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game Strike% averages
Train2['Strike%5'] = Train2.groupby('player_name')['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike% averages
Train2['CS%5'] = Train2.groupby('player_name')['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['CS%10'] = Train2.groupby('player_name')['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Whiff% averages
Train2['Whiff%5'] = Train2.groupby('player_name')['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['Whiff%10'] = Train2.groupby('player_name')['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Train2['CSW%5'] = Train2.groupby('player_name')['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['CSW%10'] = Train2.groupby('player_name')['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)

Train3 = Train2.drop(columns = ["FB", "Balls", "HBP", "BB", "CS", "Whiff", "Strikes", "K", 'Ball%', 'Strike%', 'CS%', 'Whiff%', 'CSW%', "RA/9"])

## Batter

In [481]:
BatterTrain1 = combined2.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    HR = ('events', lambda x: (x == 'home_run').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

In [482]:
BatterTrain2 = BatterTrain1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    FB = ("FB", "sum"),
    HR = ("HR", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayRunsScored = ("AwayRunsScored", "sum"),
    HomeRunsScored = ("HomeRunsScored", "sum")).reset_index().fillna(0)

BatterTrain2['RA'] = np.where((BatterTrain2['HomeRunsScored'] > 0) & (BatterTrain2['BatterTeam'] == BatterTrain2['home_team']), BatterTrain2["HomeRunsScored"],
                        np.where((BatterTrain2['AwayRunsScored'] > 0) & (BatterTrain2['BatterTeam'] == BatterTrain2['away_team']), BatterTrain2["AwayRunsScored"], 0))
BatterTrain2["R/9"] = (27 * BatterTrain2["RA"] / BatterTrain2["Outs"])
BatterTrain2 = BatterTrain2.drop(columns = ["AwayRunsScored", "HomeRunsScored", "RA"])

In [484]:
BatterTrain2['FIP'] = (13 * BatterTrain2['HR'] + 3 * (BatterTrain2['BB'] + BatterTrain2['HBP']) - 2 * BatterTrain2['K']) / (BatterTrain2['Outs'] / 3) + 3.137
BatterTrain2['xFIP'] = (13 * (BatterTrain2['FB'] * (lgHR/lgFB * 0.58)) + 3 * (BatterTrain2['BB'] + BatterTrain2['HBP']) - 2 * BatterTrain2['K']) / (BatterTrain2['Outs'] / 3) + 3.137

BatterTrain2['K%'] = round((BatterTrain2['K'] / BatterTrain2['AB']) * 100, 2)
BatterTrain2['BB%'] = round((BatterTrain2['BB'] / BatterTrain2['AB']) * 100, 2)
BatterTrain2['K-BB%'] = BatterTrain2["K%"] - BatterTrain2["BB%"]
BatterTrain2['Ball%'] = round((BatterTrain2['Balls'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['Strike%'] = round((BatterTrain2['Strikes'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['CS%'] = round((BatterTrain2['CS'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2['Whiff%'] = round((BatterTrain2['Whiff'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2["CSW"] = BatterTrain2["CS"] + BatterTrain2["Whiff"]
BatterTrain2['CSW%'] = round((BatterTrain2['CSW'] / BatterTrain2['Pitches']) * 100, 2)
BatterTrain2 = BatterTrain2.drop(columns = ["CSW"])

## Adding Rolling Averages

In [490]:
# Rolling 5 and 10 game expected batting averages
Train3['bxBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game expected wOBA averages
Train3['bxwOBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxwOBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game wOBA averages
Train3['bwOBA5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bwOBA10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['wOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game RA averages
Train3['bRS5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bRS10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['R/9'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game FIP averages
Train3['bFIP5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bFIP10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['FIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game xFIP averages
Train3['bxFIP5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bxFIP10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['xFIP'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K% averages
Train3['bK%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bK%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game BB% averages
Train3['bBB%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bBB%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game K-BB% averages
Train3['bK-BB%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bK-BB%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['K-BB%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Ball% averages
Train3['bBall%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Ball%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game Strike% averages
Train3['bStrike%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Strike%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike% averages
Train3['bCS%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bCS%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CS%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Whiff% averages
Train3['bWhiff%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bWhiff%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['Whiff%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 and 10 game Called Strike plus Whiff% averages
Train3['bCSW%5'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train3['bCSW%10'] = BatterTrain2.groupby(['BatterTeam', "p_throws"])['CSW%'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)

## Loads in today's data

In [155]:
TodaysData = getDKData2024()
TodaysData

  ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
  dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')


Unnamed: 0_level_0,BatterTeam,RoadTeam,HomeTeam,SP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-07-22,TB,TB,NYY,Carlos Rodon
2024-07-22,NYY,TB,NYY,Zack Littell
2024-07-22,DET,DET,CLE,Carlos Carrasco
2024-07-22,CLE,DET,CLE,Tarik Skubal
2024-07-22,NYM,NYM,MIA,Yonny Chirinos
2024-07-22,MIA,NYM,MIA,David Peterson
2024-07-22,STL,STL,PIT,Mitch Keller
2024-07-22,PIT,STL,PIT,Andre Pallante
2024-07-22,CIN,CIN,ATL,Reynaldo Lopez
2024-07-22,ATL,CIN,ATL,Hunter Greene


In [156]:
eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
savant2024 = statcast(start_dt = "2024-03-28", end_dt = eastern_time.strftime("%Y-%m-%d"))
savant2024['game_date'] = pd.to_datetime(savant2024['game_date'])
savant2024['game_date'] = savant2024['game_date'].dt.strftime('%Y-%m-%d')
savant2024['BatterTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['away_team'], savant2024['home_team'])
savant2024['PitcherTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['home_team'], savant2024['away_team'])
savant2024 = pd.merge(savant2024, ID[["MLBID", "MLBNAME"]], left_on = 'batter', right_on = 'MLBID', how = 'left')
savant2024.dropna(subset=['MLBNAME'], inplace=True)
savant2024 = savant2024.drop_duplicates(subset = ["pitch_type", "game_date", "release_speed", "release_pos_x", "release_pos_z", "player_name"], keep='first')
savant2024["player_name"] = savant2024["player_name"].apply(flip_names)
savant2024 = savant2024[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "pitch_type", "BatterTeam", "MLBNAME", "balls", "strikes", "outs_when_up", "events", "description", "bb_type", "hit_distance_sc", "launch_speed", "launch_angle", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
savant2024 = savant2024.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)
savant2024["player_name"] = savant2024["player_name"].apply(replace_special_chars)

This is a large query, it may take a moment to complete


100%|██████████| 117/117 [04:29<00:00,  2.30s/it]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)
  savant2024 = savant2024.groupby(groupby_cols).apply(keep_starter).reset_index(drop = True)


In [157]:
Season1 = savant2024.groupby(["game_date", "BatterTeam", "away_team", "home_team", "inning", "at_bat_number", "MLBNAME", "player_name", "p_throws"]).agg(
    Pitches = ("pitch_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    Outs = ('events', count_outs),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    Balls = ('description', lambda x: (x.isin(["ball", "hit_by_pitch", "blocked_ball"])).sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    CS = ('description', lambda x: (x == 'called_strike').sum()),
    Whiff = ('description', lambda x: (x.isin(["swinging_strike", "swinging_strike_blocked", "foul_tip"])).sum()),
    Strikes = ('description', lambda x: (x.isin(["called_strike", "swinging_strike", "foul", "swinging_strike_blocked", "foul_tip"])).sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean")).reset_index().fillna(0)

In [158]:
Season2 = Season1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    Pitches = ("Pitches", "sum"),
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean")).reset_index().fillna(0)

Season2

Unnamed: 0,game_date,BatterTeam,away_team,home_team,player_name,p_throws,Pitches,AB,PA,Outs,...,Balls,BB,CS,Whiff,Strikes,K,xBA,xwOBA,wOBA,RunExp
0,2024-03-28,AZ,COL,AZ,Kyle Freeland,L,49,17,16,7,...,15,1,8,6,20,2,0.442235,0.538678,0.676471,0.271077
1,2024-03-28,BAL,LAA,BAL,Patrick Sandoval,L,60,14,10,3,...,24,2,10,6,26,2,0.274357,0.359147,0.575,0.033632
2,2024-03-28,BOS,BOS,SEA,Luis Castillo,R,91,24,19,13,...,33,2,17,6,42,5,0.24875,0.337378,0.395833,0.038714
3,2024-03-28,CHC,CHC,TEX,Nathan Eovaldi,R,88,22,19,17,...,33,1,11,6,38,3,0.236909,0.339829,0.290909,0.012676
4,2024-03-28,CIN,WSH,CIN,Josiah Gray,R,80,22,20,12,...,28,2,14,15,37,6,0.279273,0.401048,0.538636,0.024406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2941,2024-07-21,STL,STL,ATL,Spencer Schwellenbach,R,98,25,25,18,...,26,0,11,21,55,8,0.2672,0.36644,0.384,0.025928
2942,2024-07-21,TB,TB,NYY,Marcus Stroman,R,87,21,20,15,...,25,0,14,12,47,4,0.304333,0.365238,0.378571,0.003875
2943,2024-07-21,TEX,BAL,TEX,Dean Kremer,R,89,24,20,15,...,39,3,15,5,31,2,0.261333,0.38815,0.320833,0.082927
2944,2024-07-21,TOR,DET,TOR,Keider Montero,R,63,21,21,14,...,18,0,12,4,27,3,0.358333,0.426238,0.464286,0.03724


In [159]:
Season3 = Season2.groupby(["player_name", "p_throws"]).agg(
    Games = ("Pitches", "size"),
    Pitches = ("Pitches", "sum"),
    AB = ("AB", "sum"),
    PA = ("PA", "sum"),
    Outs = ("Outs", "sum"),
    HBP = ("HBP", "sum"),
    Balls = ("Balls", "sum"),
    BB = ("BB", "sum"),
    CS = ("CS", "sum"),
    Whiff = ("Whiff", "sum"),
    Strikes = ("Strikes", "sum"),
    K = ("K", "sum"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean")).reset_index().fillna(0)

Season3["P/AB"] = round(Season3["Pitches"] / Season3["AB"], 2)
Season3['K%'] = round((Season3['K'] / Season3['AB']) * 100, 2)
Season3['BB%'] = round((Season3['BB'] / Season3['AB']) * 100, 2)
Season3['K-BB%'] = Season3["K%"] - Season3["BB%"]
Season3['Ball%'] = round((Season3['Balls'] / Season3['Pitches']) * 100, 2)
Season3['Strike%'] = round((Season3['Strikes'] / Season3['Pitches']) * 100, 2)
Season3['CS%'] = round((Season3['CS'] / Season3['Pitches']) * 100, 2)
Season3['Whiff%'] = round((Season3['Whiff'] / Season3['Pitches']) * 100, 2)
Season3["CSW"] = Season3["CS"] + Season3["Whiff"]
Season3['CSW%'] = round((Season3['CSW'] / Season3['Pitches']) * 100, 2)
Season3 = Season3.drop(columns = ["CSW"])

In [160]:
# List of columns to keep unchanged
columns_to_keep = ["P/AB", "xBA", "wOBA", "xwOBA", "RunExp", "K%", "BB%", "K-BB%", "Ball%", "Strike%", "CS%", "Whiff%", "CSW%"]

# List of columns to divide
columns_to_divide = ["Pitches", "AB", "PA", "Outs", "HBP", "Balls", "BB", "CS", "Whiff", "Strikes", "K"]
divide_by_column = 'Games'

# Create the new DataFrame
Season4 = Season3.copy()

for col in columns_to_keep:
    Season4[col] = Season3[col]

for col in columns_to_divide:
    Season4[col] = Season3[col] / Season3[divide_by_column]

In [178]:
TodaysData.dropna(subset=['SP'], inplace=True)
TodaysData1 = pd.merge(TodaysData, Season4[['player_name', "p_throws", 'Pitches', 'AB', 'PA', 'Outs', 'HBP', 'Balls', 'BB', 'CS', 'Whiff', 'Strikes', 'K', "xBA", 'xwOBA', 'wOBA', 'RunExp', 'P/AB', 'K%', 'BB%', 'K-BB%', 'Ball%', 'Strike%', 'CS%', 'Whiff%', 'CSW%']], left_on = ['SP'], right_on = ['player_name'], how = 'left').drop(columns = ["player_name"])

# If no 2024 savant data exists then gives them the league averages from 2022-23
TrainMeans = Train3.drop(['BatterTeam', 'RoadTeam', "HomeTeam", "SP", "p_throws"], axis=1).mean()
TodaysData1 = TodaysData1.fillna(TrainMeans)

In [179]:
TodaysData1

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,p_throws,Pitches,AB,PA,Outs,HBP,...,RunExp,P/AB,K%,BB%,K-BB%,Ball%,Strike%,CS%,Whiff%,CSW%
0,TB,TB,NYY,Carlos Rodon,L,94.8,22.8,20.2,15.3,0.45,...,0.009839,4.16,24.12,7.24,16.88,36.66,47.26,14.4,14.03,28.43
1,NYY,TB,NYY,Zack Littell,R,84.947368,22.842105,21.263158,15.473684,0.210526,...,0.010746,3.72,20.97,4.38,16.59,31.78,48.27,15.92,11.9,27.82
2,DET,DET,CLE,Carlos Carrasco,R,78.5625,21.625,19.25,14.25,0.3125,...,0.035439,3.63,18.79,6.07,12.72,34.92,44.71,15.35,11.06,26.41
3,CLE,DET,CLE,Tarik Skubal,L,89.555556,23.666667,21.555556,17.111111,0.333333,...,-0.001274,3.78,30.99,4.69,26.3,30.52,52.73,17.74,16.94,34.68
4,NYM,NYM,MIA,Yonny Chirinos,R,81.2,22.2,19.6,12.8,0.4,...,0.0626,3.66,18.02,8.11,9.91,33.74,46.55,18.72,10.1,28.82
5,MIA,NYM,MIA,David Peterson,L,91.0,23.75,20.375,15.75,0.625,...,-0.008275,3.83,16.32,10.0,6.32,34.75,46.15,16.21,10.44,26.65
6,STL,STL,PIT,Mitch Keller,R,92.684211,25.526316,22.368421,16.894737,0.631579,...,-0.000618,3.63,21.65,6.19,15.46,32.88,47.93,19.31,10.62,29.93
7,PIT,STL,PIT,Andre Pallante,R,71.857143,18.714286,16.428571,13.428571,0.285714,...,0.034776,3.84,19.08,9.92,9.16,39.36,42.35,16.3,8.35,24.65
8,CIN,CIN,ATL,Reynaldo Lopez,R,86.0,22.4,19.8,15.866667,0.0,...,-0.025958,3.84,25.3,8.33,16.97,35.43,47.44,16.67,12.71,29.38
9,ATL,CIN,ATL,Hunter Greene,R,101.157895,23.894737,20.052632,16.368421,0.684211,...,0.006455,4.23,27.75,9.91,17.84,36.37,49.22,13.48,15.14,28.62


In [180]:
BatterTrain3.head()
TodaysData2 = pd.merge(TodaysData1, BatterTrain3[["BatterTeam", "p_throws", "hxBA", "hxwOBA", "hwOBA", "hRunExp", "hP/AB", "hK%", "hBB%", "hK-BB%", "hBall%", "hStrike%", "hCS%", "hWhiff%", "hCSW%"]], left_on = ["BatterTeam", "p_throws"], right_on = ["BatterTeam", "p_throws"], how = "left")
TodaysData2.head()

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,p_throws,Pitches,AB,PA,Outs,HBP,...,hRunExp,hP/AB,hK%,hBB%,hK-BB%,hBall%,hStrike%,hCS%,hWhiff%,hCSW%
0,TB,TB,NYY,Carlos Rodon,L,94.8,22.8,20.2,15.3,0.45,...,0.011099,3.83,21.57,7.84,13.73,35.13,46.61,16.31,12.06,28.37
1,NYY,TB,NYY,Zack Littell,R,84.947368,22.842105,21.263158,15.473684,0.210526,...,0.007137,3.97,22.92,9.29,13.63,36.72,46.47,18.11,11.86,29.97
2,DET,DET,CLE,Carlos Carrasco,R,78.5625,21.625,19.25,14.25,0.3125,...,-0.001935,3.87,24.41,7.22,17.19,34.81,47.73,15.72,13.42,29.13
3,CLE,DET,CLE,Tarik Skubal,L,89.555556,23.666667,21.555556,17.111111,0.333333,...,-0.000361,3.85,18.01,6.56,11.45,34.06,46.48,16.97,10.18,27.15
4,NYM,NYM,MIA,Yonny Chirinos,R,81.2,22.2,19.6,12.8,0.4,...,0.010102,3.91,19.91,8.4,11.51,36.41,45.72,16.62,10.93,27.55


In [163]:
# Ensure Train5 and TodaysData2 are copies of Train4 and TodaysData1 respectively
Train5 = Train3.copy()
TodaysData2 = TodaysData2.copy()

# Dictionary to store the label encoders
label_encoders = {}

# Encode non-numeric columns in Train4
non_numeric_columns_train = Train5.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_train:
    label_encoder = LabelEncoder()
    Train5[col] = label_encoder.fit_transform(Train5[col])
    label_encoders[col] = label_encoder

# Ensure all non-numeric columns in Train4 are in TodaysData1
for col in non_numeric_columns_train:
    if col not in TodaysData2.columns:
        print(f"Warning: Column {col} from training data is not present in today's data.")
        # Adding the missing column with a default value
        TodaysData2[col] = 536

# Encode non-numeric columns in TodaysData1 using the same encoders
non_numeric_columns_today = TodaysData2.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        unique_values = set(label_encoder.classes_)
        encoded_values = []
        for item in TodaysData2[col]:
            if item in unique_values:
                encoded_values.append(label_encoder.transform([item])[0])
            else:
                encoded_values.append(536)  # Using 536 as a placeholder for unknown categories
        TodaysData2[col] = encoded_values
    else:
        print(f"Warning: Column {col} is not present in the training data.")
        # Fit a new label encoder for columns not present in Train4, but be cautious with this
        label_encoder = LabelEncoder()
        TodaysData2[col] = label_encoder.fit_transform(TodaysData2[col])
        label_encoders[col] = label_encoder

In [164]:
TrainFeatures = Train5.drop(columns = ["K"]).values.reshape(-1, 28)
TrainLabel = Train5["K"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["K"]).values.reshape(-1, 28)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 4)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["RFPred"] = RFpred

ValueError: cannot reshape array of size 209308 into shape (28)

In [None]:
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        # Handling default value of 536
        TodaysData2[col] = TodaysData2[col].apply(lambda x: label_encoder.inverse_transform([x])[0] if x != 536 else np.nan)

TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)


In [None]:
TodaysData2.sort_values("RFPred", ascending = True)

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,p_throws,Pitches,AB,PA,Outs,HBP,...,P/AB,K%,BB%,K-BB%,Ball%,Strike%,CS%,Whiff%,CSW%,RFPred
6,NYY,NYY,BAL,Cade Povich,L,84.5,20.833333,17.333333,12.833333,0.333333,...,4.06,14.4,10.4,4.0,36.69,45.17,17.16,7.89,25.05,3.0
23,LAA,SEA,LAA,Bryan Woo,R,69.875,18.625,17.75,14.75,0.125,...,3.75,18.12,2.01,16.11,26.83,52.24,16.99,11.27,28.26,3.085526
26,TOR,TOR,AZ,Ryne Nelson,R,81.285714,21.714286,19.857143,14.357143,0.214286,...,3.74,15.13,4.61,10.52,32.51,46.31,15.2,9.49,24.69,3.585526
24,ATL,ATL,SD,Randy Vasquez,R,83.363636,20.727273,18.545455,12.909091,0.181818,...,4.02,15.35,6.14,9.21,35.99,44.82,14.18,9.81,23.99,3.914474
1,DET,LAD,DET,James Paxton,L,84.3125,21.375,18.1875,14.125,0.0625,...,3.94,16.08,11.4,4.68,38.84,42.7,13.71,9.86,23.57,3.940789
19,CWS,PIT,CWS,Marco Gonzales,L,81.333333,23.0,21.0,16.666667,0.0,...,3.54,15.94,7.25,8.69,31.15,47.13,18.44,9.02,27.46,3.953947
3,PHI,OAK,PHI,Hogan Harris,L,90.714286,22.0,19.285714,14.571429,0.142857,...,4.12,17.53,9.09,8.44,37.64,44.72,17.48,10.08,27.56,3.986842
27,AZ,TOR,AZ,Yariel Rodriguez,R,73.571429,17.571429,15.285714,12.428571,0.142857,...,4.19,23.58,11.38,12.2,37.28,47.18,15.92,11.65,27.57,4.0
5,TB,CLE,TB,Carlos Carrasco,R,78.333333,21.6,19.133333,14.2,0.333333,...,3.63,19.14,6.17,12.97,34.55,45.19,15.74,10.98,26.72,4.0
21,STL,CHC,STL,Kyle Hendricks,R,74.727273,20.181818,17.636364,11.818182,0.181818,...,3.7,18.02,7.21,10.81,33.94,46.23,18.73,9.37,28.1,4.0
