In [310]:
import pybaseball
from pybaseball import statcast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import timezone
from bs4 import BeautifulSoup
import io
import requests
import plotnine
import unicodedata
from plotnine import ggplot, geom_point, aes 
pybaseball.cache.enable()

In [311]:
url = 'https://docs.google.com/spreadsheets/d/1JgczhD5VDQ1EiXqVG-blttZcVwbZd5_Ne_mefUGwJnk/pub?output=csv'
res = requests.get(url)
ID = pd.read_csv(io.BytesIO(res.content), sep=',')
ID.dropna(subset=['MLBID'], inplace=True)
ID['MLBID'] = ID['MLBID'].astype(int)
BID = pd.read_csv("mlb-player-stats-Batters.csv")
PID = pd.read_csv("mlb-player-stats-P.csv")
PID['Player'] = PID['Player'].replace('Michael King', 'Mike King')

In [312]:
def convert_name(name):
    if name == 'Rockies':
        return 'COL'
    elif name == 'Reds':
        return 'CIN'
    elif name == 'Mariners':
        return 'SEA'
    elif name == 'Nationals':
        return 'WSH'
    elif name == 'Yankees':
        return 'NYY'
    elif name == 'Astros':
        return 'HOU'
    elif name == 'Red Sox':
        return 'BOS'
    elif name == 'Athletics':
        return 'OAK'
    elif name == 'Mets':
        return 'NYM'
    elif name == 'Braves':
        return 'ATL'
    elif name == 'Giants':
        return 'SF'
    elif name == 'Brewers':
        return 'MIL'
    elif name == 'Rays':
        return 'TB'
    elif name == 'Royals':
        return 'KC'
    elif name == 'White Sox':
        return 'CWS'
    elif name == 'Cubs':
        return 'CHC'
    elif name == 'Angels':
        return 'LAA'
    elif name == 'Tigers':
        return 'DET'
    elif name == 'Diamondbacks':
        return 'ARI'
    elif name == 'Guardians':
        return 'CLE'
    elif name == 'Orioles':
        return 'BAL'
    elif name == 'Twins':
        return 'MIN'
    elif name == 'Marlins':
        return 'MIA'
    elif name == 'Phillies':
        return 'PHI'
    elif name == 'Rangers':
        return 'TEX'
    elif name == 'Dodgers':
        return 'LAD'
    elif name == 'Padres':
        return 'SD'
    elif name == 'Pirates':
        return 'PIT'
    elif name == 'Blue Jays':
        return 'TOR'
    elif name == 'Cardinals':
        return 'STL'
    else:
        return np.nan
    
def flip_names(name):
    first_name, last_name = name.split(", ")
    return f"{last_name} {first_name}"

In [313]:
def getDKData2024():
    eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
    todaysdate = eastern_time.strftime("%m-%d-%Y")
    url = 'https://rotogrinders.com/lineups/mlb?site=draftkings'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    gamelist = []
    gamecards = soup.findAll("div", {"class": "game-card-teams"})
    for x in gamecards:
        twoteams = x.findAll("span", {"class": "team-nameplate-mascot"})
        roadteam = convert_name(twoteams[0].text)
        hometeam = convert_name(twoteams[1].text)
        gamekey = "{}@{}".format(roadteam,hometeam)
        gamelist.append(gamekey)

    matchupsdf = pd.DataFrame()
    for game in gamelist:
        roadteam = game.split("@")[0]
        hometeam = game.split("@")[1]
        thisdf1 = pd.DataFrame({"Team": roadteam, "Opp": hometeam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        thisdf2 = pd.DataFrame({"Team": hometeam, "Opp": roadteam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        matchupsdf = pd.concat([matchupsdf,thisdf1,thisdf2])
        
    oppdict = dict(zip(matchupsdf.Team,matchupsdf.Opp))
    hometeamdict = dict(zip(matchupsdf.Team,matchupsdf.HomeTeam))
    roadteamdict = dict(zip(matchupsdf.Team,matchupsdf.RoadTeam))

    disabled_span_list = []
    for span in soup.findAll("span", {"class": "player-nameplate disabled"}):
        for a in span.findAll("a"):
            disabled_span_list.append(a.text)

    spdata = pd.DataFrame()
    for div in soup.findAll("span", {"class": "player-nameplate", "data-position": "SP"}):
        if "TBD" in str(div):
            playername = "TBD"
            pos = "SP"
            sal = 0
        else:
            for a in div.findAll('a', {'class': 'player-nameplate-name'}):
                playername = a.text.strip()

            strdiv = str(div)
            pos = strdiv[strdiv.find("data-position")+15:strdiv.find("data-salary")-2]
            sal = strdiv[strdiv.find("data-salary")+13:strdiv.find("<div class = 'player-nameplate-info'>")-3]
        try:
            ownership = strdiv[strdiv.find('<span class="small muted" data-auth="502">') + 42:strdiv.find('%')]
            ownership = ownership.replace("</span>", "")
            ownership = ownership.replace("</span", "")
            ownership = ownership.replace("</div>", "")
            ownership = ownership.replace(" ", "")
        except:
            ownership = np.nan

        thisspdata = pd.DataFrame([[playername, sal, ownership]], columns = ["Player", "Salary", "Ownership"])
        spdata = pd.concat([spdata, thisspdata])

    spdata2 = pd.merge(spdata, PID[["Player", "Team"]], left_on = ["Player"], right_on = ["Player"], how = "left").rename(columns = {"Team": "PitcherTeam"})
    spdata3 = pd.merge(spdata2, matchupsdf[["Team", "Opp"]], left_on = ["PitcherTeam"], right_on = ["Team"], how = "left").drop(columns = ["Team"])

    opp_spname_dict = dict(zip(spdata3.Opp, spdata3.Player))
    opp_spsal_dict = dict(zip(spdata3.Opp, spdata.Salary))
    opp_spown_dict = dict(zip(spdata3.Opp, spdata3.Ownership))

    ludf = pd.DataFrame()
    
    for li in soup.findAll("li", {"class": "lineup-card-player"}):
        for a in li.findAll("a", {"class": ["player-nameplate-name", "player-nameplate disabled"]}):
            playername = a.text

        listring = str(li)
        for span in li.find("span", {"class": "small"}):
            luspot = span.text
            luspot = luspot.replace("\n", "")
            luspot = luspot.strip()
            luspot = int(luspot)
        pos = listring[listring.find("data-position")+15:listring.find("data-salary")-2]
        sal = listring[listring.find("data-salary")+13:listring.find("<span class='small'>")-3]
        ownership = ownership.replace("</span>", "")
        ownership = ownership.replace("</span", "")
        ownership = ownership.replace("</li", "")
        ownership = ownership.replace("</div>", "")
        ownership = ownership.replace(" ", "")

        try:
            sal = int(sal)
        except:
            sal = 0
        thisludf = pd.DataFrame([[playername, luspot, sal, ownership]], columns = ["Player", "Spot", "Sal", "Ownership"])
        ludf = pd.concat([ludf, thisludf])

    ludf2 = pd.merge(ludf, BID[["Player", "Team"]], left_on = ["Player"], right_on = ["Player"], how = "left").rename(columns = {"Team": "BatterTeam"})
    ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')

    ludf2_teamlist = list(ludf2["BatterTeam"])

    dhteams = []
    for x in ludf2_teamlist:
        if ludf2_teamlist.count(x) > 11:
            if x in dhteams:
                pass
            else:
                dhteams.append(x)

    extract_dh = ludf2[ludf2["BatterTeam"].isin(dhteams)]
    new_ludf2 = ludf2[~ludf2["BatterTeam"].isin(dhteams)]

    new_team_list = []
    runcounter = 0

    for x in list(extract_dh["BatterTeam"].astype(str)):
        if runcounter < 18:
            new_team_list.append(x)
            runcounter += 1
        else:
            new_team_list.append(x+"2")
            runcounter += 1

    extract_dh["BatterTeam"] = new_team_list

    ludf2 = pd.concat([extract_dh, new_ludf2])
    ludf2["Opp"] = ludf2["BatterTeam"].map(oppdict)
    ludf2["HomeTeam"] = ludf2["BatterTeam"].map(hometeamdict)
    ludf2["RoadTeam"] = ludf2["BatterTeam"].map(roadteamdict)
    ludf2['SP'] = ludf2['BatterTeam'].map(opp_spname_dict)
    ludf2['SPSal'] = ludf2['BatterTeam'].map(opp_spsal_dict)
    ludf2['SPOwnership'] = ludf2['BatterTeam'].map(opp_spown_dict)
    ludf2['Date'] = todaysdate
    ludf2['Time'] = np.nan

    ludf3 = ludf2[['BatterTeam','RoadTeam','HomeTeam','Time','Spot','Player','Sal','Ownership','Date', "SP"]]

    dkdata = ludf3.copy()

    try:
        checknan = dkdata[["BatterTeam", "SP"]]
        getnans = checknan[["SP"].isna()]
        if len(getnans) == 0:
            nonans = 1
            nanmapdict = {}
        else:
            nonans = 0
            getnans["SP"] = disabled_span_list
            nanmapdict = dict(zip(getnans.Team, getnans.SP))
    except:
        pass

    try:
        dkdata["SP"] = np.where(dkdata["SP"].isna(), dkdata["BatterTeam"].map(nanmapdict), dkdata["SP"])
    except:
        pass
    
    for i in range(1, len(dkdata) - 1):
        if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i-1, 'BatterTeam']:
            if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i+1, 'BatterTeam']:
                dkdata.loc[i, 'BatterTeam'] = np.nan
                dkdata.loc[i, 'HomeTeam'] = np.nan
                dkdata.loc[i, 'RoadTeam'] = np.nan
                dkdata.loc[i, 'SP'] = np.nan

    
    dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')
    dkdata = dkdata.drop_duplicates(subset = ["BatterTeam", "SP"], keep = "first")
    dkdata = dkdata.drop(columns = ["Time", "Sal", "Ownership"])

    dkdata['BatterTeam'] = dkdata['BatterTeam'].replace('ARI', 'AZ')
    dkdata['RoadTeam'] = dkdata['RoadTeam'].replace('ARI', 'AZ')
    dkdata['HomeTeam'] = dkdata['HomeTeam'].replace('ARI', 'AZ')

    dkdata['Date'] = pd.to_datetime(dkdata['Date'])
    dkdata['Date'] = dkdata['Date'].dt.strftime('%Y-%m-%d')
    dkdata = dkdata.set_index("Date")
    dkdata = dkdata[["BatterTeam", "RoadTeam", "HomeTeam", "SP"]]

    return(dkdata)


In [314]:
#statcast(start_dt = "2022-04-07", end_dt = "2022-10-05")
#statcast(start_dt = "2023-03-30", end_dt = "2023-10-01")
savant2022 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2022.csv")
savant2023 = pd.read_csv("~/Desktop/Random-Projects/MLB/savant2023.csv")

In [315]:
#pd.set_option('display.max_columns', None)
combined1 = pd.concat([savant2022, savant2023])
combined1 = combined1[(combined1['inning'] == 1)]
combined1['game_date'] = pd.to_datetime(combined1['game_date'])
combined1['game_date'] = combined1['game_date'].dt.strftime('%Y-%m-%d')
combined1['BatterTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['away_team'], combined1['home_team'])
combined1['PitcherTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['home_team'], combined1['away_team'])
combined1["player_name"] = combined1["player_name"].apply(flip_names)
combined1['player_name'] = combined1['player_name'].replace('Michael King', 'Mike King')

In [316]:
combined2 = combined1[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "BatterTeam", "MLBNAME", "balls", "strikes", "outs_when_up", "events", "description", "bb_type", "hit_distance_sc", "launch_speed", "launch_angle", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
combined2 = combined2.set_index("game_date").sort_index(ascending = True)

In [317]:
Train1 = combined2.groupby(["game_date", "BatterTeam", "away_team", "home_team", "at_bat_number", "player_name"]).agg(
    Pitches = ("at_bat_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    GB = ('bb_type', lambda x: (x == 'ground_ball').sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    PU = ('bb_type', lambda x: (x == 'popup').sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    HR = ("events", lambda x: (x == 'home_run').sum()),
    Distance = ("hit_distance_sc", "mean"),
    EV = ("launch_speed", "mean"),
    LA = ("launch_angle", "mean"),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayScore = ("away_score", "sum"),
    HomeScore = ("home_score", "sum")).reset_index().fillna(0)

In [318]:
Train2 = Train1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name"]).agg(
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    GB = ('GB', "sum"),
    FB = ('FB', "sum"),
    PU = ('FB', "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    HR = ("HR", "sum"),
    Distance = ("Distance", "mean"),
    EV = ("EV", "mean"),
    LA = ("LA", "mean"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayScore = ("AwayScore", "sum"),
    HomeScore = ("HomeScore", "sum")).reset_index().fillna(0)

Train2['NRFI'] = np.where((Train2['away_team'] == Train2['BatterTeam']) & (Train2['AwayScore'] == 0), 1, 0)
Train2['NRFI'] = np.where((Train2['home_team'] == Train2['BatterTeam']) & (Train2['HomeScore'] == 0), 1, Train2['NRFI'])
Train2 = Train2.drop(['AwayScore', 'HomeScore'], axis=1)

In [319]:
Train3 = Train2.groupby(["BatterTeam", "away_team", "home_team", "player_name"]).agg(
    IP = ("AB", "size"),
    AB = ("AB", "sum"),
    PA = ("PA", "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    GB = ('GB', "sum"),
    FB = ('FB', "sum"),
    PU = ('FB', "sum"),
    HR = ("HR", "sum"),
    Distance = ("Distance", "mean"),
    EV = ("EV", "mean"),
    LA = ("LA", "mean"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    NRFI = ("NRFI", "mean")).reset_index().fillna(0)

Train3['K%'] = round((Train3['K'] / Train3['AB']) * 100, 2)
Train3['BB%'] = round((Train3['BB'] / Train3['AB']) * 100, 2)
Train3["K-BB%"] = Train3['K%'] - Train3['BB%']

In [320]:
def FIP(HR, BB, HBP, K, IP):
    fip = (((HR * 13) + (3 * (BB + HBP)) - (2 * K)) / IP + 3.137)
    return round(fip, 3)

def xFIP(FB, BB, HBP, K, IP):
    lgHR = len(combined1[combined1["events"] == "home_run"])
    lgFB = len(combined1[combined1["bb_type"] == "fly_ball"])

    xfip = (13 * (FB * (lgHR/lgFB * 0.58)) + 3 * (BB + HBP) - 2 * K) / IP + 3.137
    return round(xfip, 3)

def SIERA(K, BB, GB, FB, PU, PA):
    if PA == 0:
        return 0

    so_pa = K/PA
    bb_pa = BB/PA
    gb_pa = GB/PA
    fb_pa = FB/PA
    pu_pa = PU/PA

    SIERA = 6.145 - 16.986 * (so_pa / 100) + 11.434 * (bb_pa / 100) - 1.858 * ((gb_pa - fb_pa - pu_pa) / 100) + 7.653 * (so_pa / 100) ** 2 - 6.664 * ((gb_pa - fb_pa - pu_pa) / 100) ** 2 + 10.130 * (so_pa / 100) * ((gb_pa - fb_pa - pu_pa) / 100) - 5.195 * (bb_pa / 100) * ((gb_pa - fb_pa - pu_pa) / 100)
    return round(SIERA, 3)

In [321]:
Train3['FIP'] = Train3.apply(lambda row: FIP(row['HR'], row['BB'], row['HBP'], row['K'], row['IP']), axis=1)
Train3['xFIP'] = Train3.apply(lambda row: xFIP(row['FB'], row['BB'], row['HBP'], row['K'], row['IP']), axis=1)
Train3['SIERA'] = Train3.apply(lambda row: SIERA(row['K'], row['BB'], row['GB'], row['FB'], row['PU'], row['PA']), axis=1)
Train3["AB"] = Train3["AB"] / Train3["IP"]

In [410]:
Train4 = Train3[["BatterTeam", "away_team", "home_team", "player_name", "AB", "K-BB%", "Distance", "EV", "LA", "xBA", "xwOBA", "wOBA", "RunExp", "FIP", "xFIP", "SIERA", "NRFI"]].round(3)
Train4 = Train4.rename(columns={'away_team': 'RoadTeam', 'home_team': 'HomeTeam', "player_name": "SP"})
Train4

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,AB,K-BB%,Distance,EV,LA,xBA,xwOBA,wOBA,RunExp,FIP,xFIP,SIERA,NRFI
0,ATL,ATL,AZ,Humberto Castellanos,3.0,66.67,185.000,82.933,15.667,0.151,0.201,0.000,-0.038,-0.863,-0.863,6.032,1.0
1,ATL,ATL,AZ,Madison Bumgarner,6.0,16.66,105.333,78.433,3.000,0.233,0.339,0.475,0.043,2.137,2.137,6.096,0.0
2,ATL,ATL,AZ,Merrill Kelly,3.0,0.00,34.333,77.100,-27.333,0.137,0.125,0.300,0.108,3.137,3.137,6.132,1.0
3,ATL,ATL,AZ,Ryne Nelson,3.0,0.00,194.889,92.122,10.333,0.236,0.279,0.000,-0.063,3.137,4.536,6.151,1.0
4,ATL,ATL,AZ,Zac Gallen,5.0,30.00,203.750,82.675,24.000,0.363,0.356,0.360,-0.029,0.137,1.536,6.100,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7565,WSH,WSH,TEX,Dane Dunning,4.0,0.00,326.500,91.525,32.750,0.248,0.399,0.400,-0.020,3.137,7.333,6.171,1.0
7566,WSH,WSH,TEX,Glenn Otto,7.0,-14.29,150.286,81.971,13.357,0.332,0.519,0.536,0.117,6.137,7.536,6.168,0.0
7567,WSH,WSH,TOR,Chris Bassitt,3.0,0.00,143.917,74.850,32.417,0.090,0.148,0.000,-0.039,3.137,5.935,6.169,1.0
7568,WSH,WSH,TOR,José Berríos,2.0,0.00,233.250,84.775,22.500,0.380,0.352,0.000,-0.057,3.137,3.137,6.136,1.0


In [323]:
TodaysData = getDKData2024()
TodaysData



Unnamed: 0_level_0,BatterTeam,RoadTeam,HomeTeam,SP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-06-06,BAL,BAL,TOR,Yusei Kikuchi
2024-06-06,TOR,BAL,TOR,
2024-06-06,KC,KC,CLE,Tanner Bibee
2024-06-06,CLE,KC,CLE,Brady Singer
2024-06-06,SEA,SEA,OAK,JP Sears
2024-06-06,SEA,SEA,OAK,
2024-06-06,OAK,SEA,OAK,Bryan Woo
2024-06-06,LAD,LAD,PIT,Bailey Falter
2024-06-06,PIT,LAD,PIT,Walker Buehler
2024-06-06,ATL,ATL,WSH,Mitchell Parker


In [324]:
eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
savant2024 = statcast(start_dt = "2024-03-28", end_dt = eastern_time.strftime("%Y-%m-%d"))
savant2024 = savant2024[(savant2024['inning'] == 1)]
savant2024['game_date'] = pd.to_datetime(savant2024['game_date'])
savant2024['game_date'] = savant2024['game_date'].dt.strftime('%Y-%m-%d')
savant2024['BatterTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['away_team'], savant2024['home_team'])
savant2024['PitcherTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['home_team'], savant2024['away_team'])
savant2024 = pd.merge(savant2024, ID[["MLBID", "MLBNAME"]], left_on = 'batter', right_on = 'MLBID', how = 'left')
savant2024.dropna(subset=['MLBNAME'], inplace=True)
savant2024 = savant2024.drop_duplicates(subset = ["pitch_type", "game_date", "release_speed", "release_pos_x", "release_pos_z", "player_name"], keep='first')
savant2024["player_name"] = savant2024["player_name"].apply(flip_names)
savant2024['player_name'] = savant2024['player_name'].replace('Michael King', 'Mike King')
savant2024 = savant2024[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "BatterTeam", "MLBNAME", "balls", "strikes", "outs_when_up", "events", "description", "bb_type", "hit_distance_sc", "launch_speed", "launch_angle", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
savant2024 = savant2024.set_index("game_date").sort_index(ascending = True)

This is a large query, it may take a moment to complete


100%|██████████| 71/71 [00:25<00:00,  2.76it/s]


In [325]:
def replace_special_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

savant2024["player_name"] = savant2024["player_name"].apply(replace_special_chars)
Season1 = savant2024[savant2024['player_name'].isin(TodaysData['SP'])]

In [326]:
Season2 = Season1.groupby(["BatterTeam", "away_team", "home_team", "at_bat_number", "player_name"]).agg(
    Pitches = ("at_bat_number", "size"),
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    GB = ('bb_type', lambda x: (x == 'ground_ball').sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    PU = ('bb_type', lambda x: (x == 'popup').sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    HR = ("events", lambda x: (x == 'home_run').sum()),
    Distance = ("hit_distance_sc", "mean"),
    EV = ("launch_speed", "mean"),
    LA = ("launch_angle", "mean"),
    xBA = ("estimated_ba_using_speedangle", "mean"),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayScore = ("away_score", "sum"),
    HomeScore = ("home_score", "sum")).reset_index().fillna(0)

In [327]:
Season3 = Season2.groupby(["BatterTeam", "away_team", "home_team", "player_name"]).agg(
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    GB = ('GB', "sum"),
    FB = ('FB', "sum"),
    PU = ('FB', "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    HR = ("HR", "sum"),
    Distance = ("Distance", "mean"),
    EV = ("EV", "mean"),
    LA = ("LA", "mean"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayScore = ("AwayScore", "sum"),
    HomeScore = ("HomeScore", "sum")).reset_index().fillna(0)

Season3['NRFI'] = np.where((Season3 ['away_team'] == Season3 ['BatterTeam']) & (Season3['AwayScore'] == 0), 1, 0)
Season3['NRFI'] = np.where((Season3['home_team'] == Season3['BatterTeam']) & (Season3['HomeScore'] == 0), 1, Season3['NRFI'])
Season3 = Season3.drop(['AwayScore', 'HomeScore'], axis=1)

In [328]:
Season4 = Season3.groupby(["player_name"]).agg(
    IP = ("AB", "size"),
    AB = ("AB", "sum"),
    PA = ("PA", "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    GB = ('GB', "sum"),
    FB = ('FB', "sum"),
    PU = ('FB', "sum"),
    HR = ("HR", "sum"),
    Distance = ("Distance", "mean"),
    EV = ("EV", "mean"),
    LA = ("LA", "mean"),
    xBA = ("xBA", "mean"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    NRFI = ("NRFI", "mean")).reset_index().fillna(0)

Season4['K%'] = round((Season4['K'] / Season4['AB']) * 100, 2)
Season4['BB%'] = round((Season4['BB'] / Season4['AB']) * 100, 2)
Season4["K-BB%"] = Season4['K%'] - Season4['BB%']

In [329]:
Season4['FIP'] = Season4.apply(lambda row: FIP(row['HR'], row['BB'], row['HBP'], row['K'], row['IP']), axis=1)
Season4['xFIP'] = Season4.apply(lambda row: xFIP(row['FB'], row['BB'], row['HBP'], row['K'], row['IP']), axis=1)
Season4['SIERA'] = Season4.apply(lambda row: SIERA(row['K'], row['BB'], row['GB'], row['FB'], row['PU'], row['PA']), axis=1)
Season4["AB"] = Season4["AB"] / Season4["IP"]
Season5 = Season4[["player_name", "AB", "K-BB%", "Distance", "EV", "LA", "xBA", "xwOBA", "wOBA", "RunExp", "FIP", "xFIP", "SIERA", "NRFI"]].round(3)

In [411]:
TodaysData.dropna(subset=['SP'], inplace=True)
TodaysData1 = pd.merge(TodaysData, Season5[["player_name", "AB", "K-BB%", "Distance", "EV", "LA", "xBA", "xwOBA", "wOBA", "RunExp", "FIP", "xFIP", "SIERA", "NRFI"]], left_on = ['SP'], right_on = ['player_name'], how = 'left').drop(columns = ["player_name"])
TodaysData1

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,AB,K-BB%,Distance,EV,LA,xBA,xwOBA,wOBA,RunExp,FIP,xFIP,SIERA,NRFI
0,BAL,BAL,TOR,Yusei Kikuchi,4.5,14.82,143.449,73.579,16.218,0.214,0.315,0.297,-0.004,4.47,3.236,6.106,0.667
1,KC,KC,CLE,Tanner Bibee,3.667,25.0,132.58,72.813,19.678,0.146,0.217,0.137,-0.037,1.804,2.62,6.093,0.917
2,CLE,KC,CLE,Brady Singer,3.636,10.0,118.744,66.892,9.382,0.164,0.242,0.16,-0.03,2.864,3.5,6.114,0.909
3,SEA,SEA,OAK,JP Sears,3.833,6.52,129.502,71.751,14.87,0.193,0.279,0.203,-0.031,3.22,4.269,6.127,0.75
4,OAK,SEA,OAK,Bryan Woo,3.2,25.0,142.992,68.511,22.631,0.093,0.137,0.035,-0.066,1.737,3.136,6.104,1.0
5,LAD,LAD,PIT,Bailey Falter,4.091,13.33,155.538,75.872,16.684,0.221,0.301,0.239,-0.002,5.864,3.718,6.12,0.727
6,PIT,LAD,PIT,Walker Buehler,5.2,11.54,161.284,71.764,10.251,0.232,0.331,0.482,0.056,10.137,3.456,6.12,0.4
7,ATL,ATL,WSH,Mitchell Parker,4.111,10.81,126.767,76.355,13.586,0.185,0.27,0.208,-0.034,2.693,3.47,6.115,0.667
8,WSH,ATL,WSH,Reynaldo Lopez,3.6,33.33,160.339,70.481,20.282,0.165,0.193,0.154,-0.043,0.937,1.916,6.086,1.0
9,MIN,MIN,NYY,Marcus Stroman,4.0,10.41,130.096,70.275,8.831,0.239,0.349,0.243,-0.015,4.887,3.42,6.116,0.833


In [412]:
Train5 = Train4
TodaysData2 = TodaysData1

# Dictionary to store the label encoders
label_encoders = {}

# Encode non-numeric columns in Train4
non_numeric_columns_train = Train5.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_train:
    label_encoder = LabelEncoder()
    Train5[col] = label_encoder.fit_transform(Train5[col])
    label_encoders[col] = label_encoder

# Ensure all non-numeric columns in Train4 are in TodaysData1
for col in non_numeric_columns_train:
    if col not in TodaysData2.columns:
        print(f"Warning: Column {col} from training data is not present in today's data.")
        # Adding the missing column with a default value
        TodaysData2[col] = 1000

# Encode non-numeric columns in TodaysData1 using the same encoders
non_numeric_columns_today = TodaysData2.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        unique_values = set(label_encoder.classes_)
        encoded_values = []
        for item in TodaysData2[col]:
            if item in unique_values:
                encoded_values.append(label_encoder.transform([item])[0])
            else:
                encoded_values.append(1000)
        TodaysData2[col] = encoded_values
    else:
        print(f"Warning: Column {col} is not present in the training data.")
        # Fit a new label encoder for columns not present in Train4, but be cautious with this
        label_encoder = LabelEncoder()
        TodaysData2[col] = label_encoder.fit_transform(TodaysData2[col])
        label_encoders[col] = label_encoder

In [413]:
pbpFeatures = Train5.drop(columns = ["NRFI"]).values.reshape(-1, 16)
pbpLabel = Train5["NRFI"].values.reshape(-1, 1)
TodayFeatures = TodaysData2.drop(columns = ["NRFI"]).values.reshape(-1, 16)

rf_regressor = RandomForestRegressor(n_estimators=175, max_leaf_nodes = 10, max_depth = 100)
rf_regressor.fit(pbpFeatures, pbpLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["RFPred"] = RFpred



In [414]:
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        # Handling default value of 1000
        TodaysData2[col] = TodaysData2[col].apply(lambda x: label_encoder.inverse_transform([x])[0] if x != 1000 else np.nan)

In [415]:
TodaysData3 = TodaysData2.groupby(["RoadTeam", "HomeTeam"])['RFPred'].prod().reset_index().round(3)

TodaysData3['Games'] = TodaysData3.apply(lambda x: tuple(sorted([x['RoadTeam'], x['HomeTeam']])), axis=1)
TodaysData4 = TodaysData3.groupby("Games").agg(
    RFPred=("RFPred", "sum")).sort_values("RFPred", ascending = False)

TodaysData4

Unnamed: 0_level_0,RFPred
Games,Unnamed: 1_level_1
"(CLE, KC)",0.936
"(OAK, SEA)",0.936
"(COL, STL)",0.933
"(BOS, CWS)",0.93
"(ATL, WSH)",0.899
"(MIN, NYY)",0.779
"(CHC, CIN)",0.729
"(BAL, TOR)",0.613
"(AZ, SD)",0.324
"(LAD, PIT)",0.229


In [418]:
excel_file = 'todays_data.xlsx'

# If deleting the mode and engine it rewrites the whole file
with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl') as writer:
    TodaysData4.reset_index().to_excel(writer, index=False, sheet_name='6-7-24')