In [2]:
import datetime
import io
import sys
import os
import unicodedata
from datetime import timezone

import numpy as np
import pandas as pd
import pybaseball
import requests
from bs4 import BeautifulSoup
from pybaseball import statcast
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

pybaseball.cache.enable()

# Load in my own 40 man Roster Scraper
directory = os.path.expanduser('~/Desktop/Desktop - Cameron MacBook Pro/Random-Projects/MLB')
sys.path.append(directory)
from RosterScraper import RosterScraper

## Loads in StatCast ID so batter names show in the Statcast data and loads in a scraped DF with every 40 man roster

In [3]:
url = 'https://docs.google.com/spreadsheets/d/1JgczhD5VDQ1EiXqVG-blttZcVwbZd5_Ne_mefUGwJnk/pub?output=csv'
res = requests.get(url)
ID = pd.read_csv(io.BytesIO(res.content), sep=',')
ID.dropna(subset=['MLBID'], inplace=True)
ID['MLBID'] = ID['MLBID'].astype(int)

Rosters = RosterScraper()
BID = Rosters[Rosters["Position"] == "Batter"]
PID = Rosters[Rosters["Position"] == "Pitcher"]

## Creating functions for data manipulation so they can match when joining separate datasets

In [4]:
def convert_name(name):
    if name == 'Rockies':
        return 'COL'
    elif name == 'Reds':
        return 'CIN'
    elif name == 'Mariners':
        return 'SEA'
    elif name == 'Nationals':
        return 'WSH'
    elif name == 'Yankees':
        return 'NYY'
    elif name == 'Astros':
        return 'HOU'
    elif name == 'Red Sox':
        return 'BOS'
    elif name == 'Athletics':
        return 'OAK'
    elif name == 'Mets':
        return 'NYM'
    elif name == 'Braves':
        return 'ATL'
    elif name == 'Giants':
        return 'SF'
    elif name == 'Brewers':
        return 'MIL'
    elif name == 'Rays':
        return 'TB'
    elif name == 'Royals':
        return 'KC'
    elif name == 'White Sox':
        return 'CWS'
    elif name == 'Cubs':
        return 'CHC'
    elif name == 'Angels':
        return 'LAA'
    elif name == 'Tigers':
        return 'DET'
    elif name == 'Diamondbacks':
        return 'ARI'
    elif name == 'Guardians':
        return 'CLE'
    elif name == 'Orioles':
        return 'BAL'
    elif name == 'Twins':
        return 'MIN'
    elif name == 'Marlins':
        return 'MIA'
    elif name == 'Phillies':
        return 'PHI'
    elif name == 'Rangers':
        return 'TEX'
    elif name == 'Dodgers':
        return 'LAD'
    elif name == 'Padres':
        return 'SD'
    elif name == 'Pirates':
        return 'PIT'
    elif name == 'Blue Jays':
        return 'TOR'
    elif name == 'Cardinals':
        return 'STL'
    else:
        return np.nan
    
def flip_names(name):
    first_name, last_name = name.split(", ")
    return f"{last_name} {first_name}"

def replace_special_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

def append_suffix_to_duplicates(df, column):
        seen = {}
        for idx, value in enumerate(df[column]):
            if value in seen:
                seen[value] += 1
                df.at[idx, column] = f"{value}2"
            else:
                seen[value] = 1

## Scraping the RotoGrinders website for daily pitchers and lineups

In [5]:
def getDKData2024():
    eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
    todaysdate = eastern_time.strftime("%m-%d-%Y")
    url = 'https://rotogrinders.com/lineups/mlb?site=draftkings'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    gamelist = []
    gamecards = soup.findAll("div", {"class": "game-card-teams"})
    for x in gamecards:
        twoteams = x.findAll("span", {"class": "team-nameplate-mascot"})
        roadteam = convert_name(twoteams[0].text)
        hometeam = convert_name(twoteams[1].text)
        gamekey = "{}@{}".format(roadteam,hometeam)
        gamelist.append(gamekey)

    matchupsdf = pd.DataFrame()
    for game in gamelist:
        roadteam = game.split("@")[0]
        hometeam = game.split("@")[1]
        thisdf1 = pd.DataFrame({"Team": roadteam, "Opp": hometeam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        thisdf2 = pd.DataFrame({"Team": hometeam, "Opp": roadteam, "RoadTeam": roadteam, "HomeTeam": hometeam},index=[0])
        matchupsdf = pd.concat([matchupsdf,thisdf1,thisdf2])
        
    oppdict = dict(zip(matchupsdf.Team,matchupsdf.Opp))
    hometeamdict = dict(zip(matchupsdf.Team,matchupsdf.HomeTeam))
    roadteamdict = dict(zip(matchupsdf.Team,matchupsdf.RoadTeam))

    disabled_span_list = []
    for span in soup.findAll("span", {"class": "player-nameplate disabled"}):
        for a in span.findAll("a"):
            disabled_span_list.append(a.text)

    spdata = pd.DataFrame()
    for div in soup.findAll("span", {"class": "player-nameplate", "data-position": "SP"}):
        if "TBD" in str(div):
            playername = "TBD"
            pos = "SP"
            sal = 0
        else:
            for a in div.findAll('a', {'class': 'player-nameplate-name'}):
                playername = a.text.strip()

            strdiv = str(div)
            pos = strdiv[strdiv.find("data-position")+15:strdiv.find("data-salary")-2]
            sal = strdiv[strdiv.find("data-salary")+13:strdiv.find("<div class = 'player-nameplate-info'>")-3]
        try:
            ownership = strdiv[strdiv.find('<span class="small muted" data-auth="502">') + 42:strdiv.find('%')]
            ownership = ownership.replace("</span>", "")
            ownership = ownership.replace("</span", "")
            ownership = ownership.replace("</div>", "")
            ownership = ownership.replace(" ", "")
        except:
            ownership = np.nan

        thisspdata = pd.DataFrame([[playername, sal, ownership]], columns = ["Player", "Salary", "Ownership"])
        spdata = pd.concat([spdata, thisspdata])

    spdata['Player'] = spdata['Player'].replace('Luis Ortiz', 'Luis L. Ortiz')
    spdata['Player'] = spdata['Player'].replace('Mike King', 'Michael King')
    spdata['Player'] = spdata['Player'].replace('Robert Zastryzny', 'Rob Zastryzny')

    spdata2 = pd.merge(spdata, PID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "PitcherTeam"})
    spdata3 = pd.merge(spdata2, matchupsdf[["Team", "Opp"]], left_on = ["PitcherTeam"], right_on = ["Team"], how = "left").drop(columns = ["Team"])

    append_suffix_to_duplicates(spdata3, 'PitcherTeam')
    append_suffix_to_duplicates(spdata3, 'Opp')

    opp_spname_dict = dict(zip(spdata3.Opp, spdata3.Player))
    opp_spsal_dict = dict(zip(spdata3.Opp, spdata.Salary))
    opp_spown_dict = dict(zip(spdata3.Opp, spdata3.Ownership))

    ludf = pd.DataFrame()
    
    for li in soup.findAll("li", {"class": "lineup-card-player"}):
        for a in li.findAll("a", {"class": ["player-nameplate-name", "player-nameplate disabled"]}):
            playername = a.text

        listring = str(li)
        for span in li.find("span", {"class": "small"}):
            luspot = span.text
            luspot = luspot.replace("\n", "")
            luspot = luspot.strip()
            luspot = int(luspot)
        pos = listring[listring.find("data-position")+15:listring.find("data-salary")-2]
        sal = listring[listring.find("data-salary")+13:listring.find("<span class='small'>")-3]
        ownership = ownership.replace("</span>", "")
        ownership = ownership.replace("</span", "")
        ownership = ownership.replace("</li", "")
        ownership = ownership.replace("</div>", "")
        ownership = ownership.replace(" ", "")

        try:
            sal = int(sal)
        except:
            sal = 0
        thisludf = pd.DataFrame([[playername, luspot, sal, ownership]], columns = ["Player", "Spot", "Sal", "Ownership"])
        ludf = pd.concat([ludf, thisludf])

    ludf2 = pd.merge(ludf, BID[["Name", "Team"]], left_on = ["Player"], right_on = ["Name"], how = "left").rename(columns = {"Team": "BatterTeam"})
    ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
    ludf2['HomeTeam'] = ludf2['BatterTeam'].map(hometeamdict)
    ludf2['RoadTeam'] = ludf2['BatterTeam'].map(roadteamdict)

    ludf2_teamlist = list(ludf2["BatterTeam"])

    dhteams = []
    for x in ludf2_teamlist:
        if ludf2_teamlist.count(x) > 11:
            if x in dhteams:
                pass
            else:
                dhteams.append(x)

    extract_dh = ludf2[ludf2["BatterTeam"].isin(dhteams)]
    new_ludf2 = ludf2[~ludf2["BatterTeam"].isin(dhteams)]

    new_team_list = []
    new_home_list = []
    new_road_list = []
    runcounter = 0

    for x, home, road in zip(extract_dh["BatterTeam"].astype(str), 
                         extract_dh["HomeTeam"].astype(str), 
                         extract_dh["RoadTeam"].astype(str)):
        if runcounter < 18:
            new_team_list.append(x)
            new_home_list.append(home)
            new_road_list.append(road)
            runcounter += 1
        else:
            new_team_list.append(x + "2")
            new_home_list.append(home + "2")
            new_road_list.append(road + "2")
            runcounter += 1

    extract_dh["BatterTeam"] = new_team_list
    extract_dh["HomeTeam"] = new_home_list
    extract_dh["RoadTeam"] = new_road_list

    ludf2 = pd.concat([extract_dh, new_ludf2])
    ludf2["Opp"] = ludf2["BatterTeam"].map(oppdict)
    ludf2['SP'] = ludf2['BatterTeam'].map(opp_spname_dict)
    ludf2['SPSal'] = ludf2['BatterTeam'].map(opp_spsal_dict)
    ludf2['SPOwnership'] = ludf2['BatterTeam'].map(opp_spown_dict)
    ludf2['Date'] = todaysdate
    ludf2['Time'] = np.nan

    ludf3 = ludf2[['BatterTeam','RoadTeam','HomeTeam','Time','Spot','Player','Sal','Ownership','Date', "SP"]]

    dkdata = ludf3.copy()

    try:
        checknan = dkdata[["BatterTeam", "SP"]]
        getnans = checknan[["SP"].isna()]
        if len(getnans) == 0:
            nonans = 1
            nanmapdict = {}
        else:
            nonans = 0
            getnans["SP"] = disabled_span_list
            nanmapdict = dict(zip(getnans.Team, getnans.SP))
    except:
        pass

    try:
        dkdata["SP"] = np.where(dkdata["SP"].isna(), dkdata["BatterTeam"].map(nanmapdict), dkdata["SP"])
    except:
        pass
    
    for i in range(1, len(dkdata) - 1):
        if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i-1, 'BatterTeam']:
            if dkdata.loc[i, 'BatterTeam'] != dkdata.loc[i+1, 'BatterTeam']:
                dkdata.loc[i, 'BatterTeam'] = np.nan
                dkdata.loc[i, 'HomeTeam'] = np.nan
                dkdata.loc[i, 'RoadTeam'] = np.nan
                dkdata.loc[i, 'SP'] = np.nan

    
    dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')
    dkdata = dkdata.drop_duplicates(subset = ["BatterTeam", "SP"], keep = "first")
    dkdata = dkdata.drop(columns = ["Time", "Sal", "Ownership"])

    dkdata['BatterTeam'] = dkdata['BatterTeam'].replace('ARI', 'AZ')
    dkdata['RoadTeam'] = dkdata['RoadTeam'].replace('ARI', 'AZ')
    dkdata['HomeTeam'] = dkdata['HomeTeam'].replace('ARI', 'AZ')

    dkdata['Date'] = pd.to_datetime(dkdata['Date'])
    dkdata['Date'] = dkdata['Date'].dt.strftime('%Y-%m-%d')
    dkdata = dkdata.set_index("Date")
    dkdata = dkdata[["BatterTeam", "RoadTeam", "HomeTeam", "SP"]]

    return(dkdata)

## Loads regular season data from 2022-23 to train on

In [7]:
#statcast(start_dt = "2022-04-07", end_dt = "2022-10-05")
#statcast(start_dt = "2023-03-30", end_dt = "2023-10-01")
savant2022 = pd.read_csv("~/Desktop/Desktop - Cameron MacBook Pro/Random-Projects/MLB/Data/savant2022.csv")
savant2023 = pd.read_csv("~/Desktop/Desktop - Cameron MacBook Pro/Random-Projects/MLB/Data/savant2023.csv")

In [8]:
#pd.set_option('display.max_columns', None)
combined1 = pd.concat([savant2022, savant2023])
combined1 = combined1[(combined1['inning'] == 1)]
combined1['game_date'] = pd.to_datetime(combined1['game_date'])
combined1['game_date'] = combined1['game_date'].dt.strftime('%Y-%m-%d')
combined1['BatterTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['away_team'], combined1['home_team'])
combined1['PitcherTeam'] = np.where(combined1['inning_topbot'] == 'Top', combined1['home_team'], combined1['away_team'])
combined1["player_name"] = combined1["player_name"].apply(flip_names)
combined1["player_name"] = combined1["player_name"].apply(replace_special_chars)
combined1['player_name'] = combined1['player_name'].replace('Michael King', 'Mike King')
combined1['player_name'] = combined1['player_name'].replace('Luis L. Ortiz', 'Luis Ortiz')

In [9]:
combined2 = combined1[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "BatterTeam", "MLBNAME", "events", "description", "bb_type", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
combined2 = combined2.set_index("game_date").sort_index(ascending = True)

## Grouping on a pitch level

In [10]:
Train1 = combined2.groupby(["game_date", "BatterTeam", "away_team", "home_team", "at_bat_number", "player_name", "p_throws"]).agg(
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    GB = ('bb_type', lambda x: (x == 'ground_ball').sum()),
    LD = ('bb_type', lambda x: (x == 'line_drive').sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    PU = ('bb_type', lambda x: (x == 'popup').sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    HR = ("events", lambda x: (x == 'home_run').sum()),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayScore = ("away_score", "sum"),
    HomeScore = ("home_score", "sum")).reset_index().fillna(0)

# If there are more than six batters faced it gets rid of the 7th at bat and beyond to prevent skewness
Train1["BF"] = Train1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name"])["at_bat_number"].cumcount()
Train1 = Train1.drop(Train1[Train1["BF"] >= 6].index, axis=0)

## Grouping on an at bat level

In [11]:
Train2 = Train1.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    GB = ('GB', "sum"),
    LD = ('LD', "sum"),
    FB = ('FB', "sum"),
    PU = ('PU', "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    HR = ("HR", "sum"),
    xwOBA = ("wOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayScore = ("AwayScore", "sum"),
    HomeScore = ("HomeScore", "sum")).reset_index().fillna(0)

Train2['BB%'] = round((Train2['BB'] / Train2['AB']) * 100, 2)
Train2['NRFI'] = np.where((Train2['away_team'] == Train2['BatterTeam']) & (Train2['AwayScore'] == 0), 1, 0)
Train2['NRFI'] = np.where((Train2['home_team'] == Train2['BatterTeam']) & (Train2['HomeScore'] == 0), 1, Train2['NRFI'])
Train2 = Train2.drop(['AwayScore', 'HomeScore'], axis=1)

## Adding rolling averages for the past 5 and 10 games

In [12]:
window_size5 = 5
window_size10 = 10

# For pitchers
# Rolling 5 and 10 game NRFI averages
Train2['P5'] = Train2.groupby('player_name')['NRFI'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['P10'] = Train2.groupby('player_name')['NRFI'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling 5 game wOBA and xwOBA averages
Train2['P5xwOBA'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Train2['P10xwOBA'] = Train2.groupby('player_name')['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)

# For batters
# Rolling 5 and 10 game NRFI averages
Train2['B5'] = Train2.groupby(['BatterTeam', "p_throws"])['NRFI'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train2['B10'] = Train2.groupby(['BatterTeam', "p_throws"])['NRFI'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# Rolling 5 game wOBA and xwOBA averages
Train2['B5xwOBA'] = Train2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Train2['B10xwOBA'] = Train2.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)

## Creates functions for various ERA estimators

In [13]:
def FIP(HR, BB, HBP, K, IP):
    fip = (((HR * 13) + (3 * (BB + HBP)) - (2 * K)) / IP + 3.137)
    return round(fip, 3)

def xFIP(FB, BB, HBP, K, IP):
    lgHR = len(combined1[combined1["events"] == "home_run"])
    lgFB = len(combined1[combined1["bb_type"] == "fly_ball"])

    xfip = (13 * (FB * (lgHR/lgFB * 0.58)) + 3 * (BB + HBP) - 2 * K) / IP + 3.137
    return round(xfip, 3)

def SIERA(K, BB, GB, FB, PU, PA):
    if PA == 0:
        return 0

    so_pa = K/PA
    bb_pa = BB/PA
    gb_pa = GB/PA
    fb_pa = FB/PA
    pu_pa = PU/PA

    SIERA = 6.145 - 16.986 * (so_pa / 100) + 11.434 * (bb_pa / 100) - 1.858 * ((gb_pa - fb_pa - pu_pa) / 100) + 7.653 * (so_pa / 100) ** 2 - 6.664 * ((gb_pa - fb_pa - pu_pa) / 100) ** 2 + 10.130 * (so_pa / 100) * ((gb_pa - fb_pa - pu_pa) / 100) - 5.195 * (bb_pa / 100) * ((gb_pa - fb_pa - pu_pa) / 100)
    return round(SIERA, 3)

## Finalizes the cleaned training data

In [14]:
Train4 = Train2[["BatterTeam", "away_team", "home_team", "player_name", "AB", "BB%", "wOBA", "P5", "P10", "P5xwOBA", "P10xwOBA", "B5", "B10", "B5xwOBA", "B10xwOBA", "NRFI"]].round(3)
Train4 = Train4.rename(columns={'away_team': 'RoadTeam', 'home_team': 'HomeTeam', "player_name": "SP"})

In [15]:
TodaysData = getDKData2024()
TodaysData

  ludf2['BatterTeam'] = ludf2['BatterTeam'].fillna(method='ffill')
  dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]] = dkdata[["BatterTeam", "RoadTeam", "HomeTeam"]].fillna(method='ffill')


Unnamed: 0_level_0,BatterTeam,RoadTeam,HomeTeam,SP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-09-24,TB,TB,DET,
2024-09-24,DET,TB,DET,
2024-09-24,CIN,CIN,CLE,Tanner Bibee
2024-09-24,CLE,CIN,CLE,Carson Spiers
2024-09-24,CHC,CHC,PHI,Tanner Banks
2024-09-24,PHI,CHC,PHI,Justin Steele
2024-09-24,MIL,MIL,PIT,Bailey Falter
2024-09-24,PIT,MIL,PIT,Tobias Myers
2024-09-24,KC,KC,WSH,Mitchell Parker
2024-09-24,WSH,KC,WSH,Cole Ragans


In [16]:
eastern_time = datetime.datetime.now(timezone.utc).astimezone(timezone(datetime.timedelta(hours=-5)))
savant2024 = statcast(start_dt = "2024-03-28", end_dt = eastern_time.strftime("%Y-%m-%d"))
savant2024 = savant2024[(savant2024['inning'] == 1)]
savant2024['game_date'] = pd.to_datetime(savant2024['game_date'])
savant2024['game_date'] = savant2024['game_date'].dt.strftime('%Y-%m-%d')
savant2024['BatterTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['away_team'], savant2024['home_team'])
savant2024['PitcherTeam'] = np.where(savant2024['inning_topbot'] == 'Top', savant2024['home_team'], savant2024['away_team'])
savant2024 = pd.merge(savant2024, ID[["MLBID", "MLBNAME"]], left_on = 'batter', right_on = 'MLBID', how = 'left')
savant2024.dropna(subset=['MLBNAME'], inplace=True)
savant2024 = savant2024.drop_duplicates(subset = ["pitch_type", "game_date", "release_speed", "release_pos_x", "release_pos_z", "player_name"], keep='first')
savant2024["player_name"] = savant2024["player_name"].apply(flip_names)
savant2024 = savant2024[["game_date", "home_team", "away_team", "inning", "inning_topbot", "at_bat_number", "pitch_number", "pitch_type", "BatterTeam", "MLBNAME", "balls", "strikes", "outs_when_up", "events", "description", "bb_type", "hit_distance_sc", "launch_speed", "launch_angle", "estimated_ba_using_speedangle", "estimated_woba_using_speedangle", "woba_value", "p_throws", "PitcherTeam", "player_name", "delta_home_win_exp", "delta_run_exp", "away_score", "home_score"]].sort_values(by = ["game_date", "home_team", "away_team", "inning_topbot", "at_bat_number", "pitch_number"], ascending=[True, True, True, False, True, True])
savant2024 = savant2024.set_index("game_date").sort_index(ascending = True)
savant2024["player_name"] = savant2024["player_name"].apply(replace_special_chars)

This is a large query, it may take a moment to complete


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
100%|██████████| 181/181 [02:22<00:00,  1.27it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


## Does the same grouping as the training data at the various levels

In [17]:
Season2 = savant2024.groupby(["game_date", "BatterTeam", "away_team", "home_team", "at_bat_number", "player_name", "p_throws"]).agg(
    PA = ('events', lambda x: (x.isin(['other_out', 'single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 'field_error', 'fielders_choice', 'double_play', 'fielders_choice_out', 'strikeout_double_play', 'triple_play', 'grounded_into_double_play'])).sum()),
    GB = ('bb_type', lambda x: (x == 'ground_ball').sum()),
    LD = ('bb_type', lambda x: (x == 'line_drive').sum()),
    FB = ('bb_type', lambda x: (x == 'fly_ball').sum()),
    PU = ('bb_type', lambda x: (x == 'popup').sum()),
    BB = ('events', lambda x: (x == 'walk').sum()),
    HBP = ('events', lambda x: (x == 'hit_by_pitch').sum()),
    K = ('events', lambda x: (x == 'strikeout').sum()),
    HR = ("events", lambda x: (x == 'home_run').sum()),
    xwOBA = ("estimated_woba_using_speedangle", "mean"),
    wOBA = ("woba_value", "mean"),
    RunExp = ("delta_run_exp", "mean"),
    AwayScore = ("away_score", "sum"),
    HomeScore = ("home_score", "sum")).reset_index().fillna(0)

# If there are more than six batters faced it gets rid of the 7th at bat and beyond to prevent skewness
Season2["BF"] = Season2.groupby(["BatterTeam", "away_team", "home_team", "player_name"])["at_bat_number"].cumcount()
Season2 = Season2.drop(Season2[Season2["BF"] >= 6].index, axis=0)

In [18]:
Season3 = Season2.groupby(["game_date", "BatterTeam", "away_team", "home_team", "player_name", "p_throws"]).agg(
    AB = ("at_bat_number", "size"),
    PA = ("PA", "sum"),
    GB = ('GB', "sum"),
    LD = ('LD', "sum"),
    FB = ('FB', "sum"),
    PU = ('PU', "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    HR = ("HR", "sum"),
    xwOBA = ("xwOBA", "mean"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    AwayScore = ("AwayScore", "sum"),
    HomeScore = ("HomeScore", "sum")).reset_index().fillna(0)

Season3['NRFI'] = np.where((Season3 ['away_team'] == Season3 ['BatterTeam']) & (Season3['AwayScore'] == 0), 1, 0)
Season3['NRFI'] = np.where((Season3['home_team'] == Season3['BatterTeam']) & (Season3['HomeScore'] == 0), 1, Season3['NRFI'])
Season3 = Season3.drop(['AwayScore', 'HomeScore'], axis=1)

In [19]:
Season3['P5'] = Season3.groupby('player_name')['NRFI'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season3['P10'] = Season3.groupby('player_name')['NRFI'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)
Season3['P5xwOBA'] = Season3.groupby('player_name')['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=0, drop=True)
Season3['P10xwOBA'] = Season3.groupby('player_name')['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=0, drop=True)

Season3['B5'] = Season3.groupby(['BatterTeam', "p_throws"])['NRFI'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season3['B10'] = Season3.groupby(['BatterTeam', "p_throws"])['NRFI'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season3['B5xwOBA'] = Season3.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size5, min_periods=1).mean().reset_index(level=[0,1], drop=True)
Season3['B10xwOBA'] = Season3.groupby(['BatterTeam', "p_throws"])['xwOBA'].rolling(window=window_size10, min_periods=1).mean().reset_index(level=[0,1], drop=True)

In [20]:
Season4 = Season3.groupby(["player_name", "p_throws"]).agg(
    IP = ("AB", "size"),
    AB = ("AB", "sum"),
    PA = ("PA", "sum"),
    BB = ("BB", "sum"),
    HBP = ("HBP", "sum"),
    K = ("K", "sum"),
    GB = ('GB', "sum"),
    LD = ('LD', "sum"),
    FB = ('FB', "sum"),
    PU = ('PU', "sum"),
    HR = ("HR", "sum"),
    wOBA = ("wOBA", "mean"),
    RunExp = ("RunExp", "mean"),
    NRFI = ("NRFI", "mean"),
    P5 = ("P5", "last"),
    P10 = ("P10", "last"),
    P5xwOBA = ("P5xwOBA", "last"),
    P10xwOBA = ("P10xwOBA", "last"),
    B5 = ("B5", "last"),
    B10 = ("B10", "last"),
    B5xwOBA = ("B5xwOBA", "last"),
    B10xwOBA = ("B10xwOBA", "last")).reset_index().fillna(0)

Season4['BB%'] = round((Season4['BB'] / Season4['AB']) * 100, 2)
Season4 = Season4[Season4['player_name'].isin(TodaysData['SP'])]

In [21]:
Season4['FIP'] = Season4.apply(lambda row: FIP(row['HR'], row['BB'], row['HBP'], row['K'], row['IP']), axis=1)
Season4["AB"] = Season4["AB"] / Season4["IP"]
Season5 = Season4[["player_name", "IP", "AB", "BB%", "wOBA", "P5", "P10", "P5xwOBA", "P10xwOBA", "B5", "B10", "B5xwOBA", "B10xwOBA", "NRFI"]].round(3)

## Data being predicted on takes season first inning averages and gives league average 2022-23 data if no data exists for that player

In [22]:
TodaysData.dropna(subset=['SP'], inplace=True)
TodaysData1 = pd.merge(TodaysData, Season5[["player_name", "IP", "AB", "BB%", "wOBA", "P5", "P10", "P5xwOBA", "P10xwOBA", "B5", "B10", "B5xwOBA", "B10xwOBA", "NRFI"]], left_on = ['SP'], right_on = ['player_name'], how = 'left').drop(columns = ["player_name"])
TodaysData1['IP'] = TodaysData1['IP'].astype(float)

# If no 2024 savant data exists then gives them the league averages from 2022-23
TrainMeans = Train4.drop(['BatterTeam', 'RoadTeam', "HomeTeam", "SP"], axis=1).mean()
TodaysData1 = TodaysData1.fillna(TrainMeans)

## Encodes the teams and players allowing to be fed into the algorithms

In [23]:
# Ensure Train5 and TodaysData2 are copies of Train4 and TodaysData1 respectively
Train5 = Train4.copy()
TodaysData2 = TodaysData1.copy()

def strip_2_from_columns(df, columns):
    for col in columns:
        df[col] = df[col].astype(str).str.replace('2', '', regex=False)
    return df

# Apply the function to TodaysData2
TodaysData2 = strip_2_from_columns(TodaysData2, ['BatterTeam', 'HomeTeam', 'RoadTeam'])

# Dictionary to store the label encoders
label_encoders = {}

# Encode non-numeric columns in Train4
non_numeric_columns_train = Train5.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_train:
    label_encoder = LabelEncoder()
    Train5[col] = label_encoder.fit_transform(Train5[col])
    label_encoders[col] = label_encoder

# Ensure all non-numeric columns in Train4 are in TodaysData1
for col in non_numeric_columns_train:
    if col not in TodaysData2.columns:
        print(f"Warning: Column {col} from training data is not present in today's data.")
        # Adding the missing column with a default value
        TodaysData2[col] = 536

# Encode non-numeric columns in TodaysData1 using the same encoders
non_numeric_columns_today = TodaysData2.select_dtypes(exclude=['float64', 'int64']).columns
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        unique_values = set(label_encoder.classes_)
        encoded_values = []
        for item in TodaysData2[col]:
            if item in unique_values:
                encoded_values.append(label_encoder.transform([item])[0])
            else:
                encoded_values.append(536)  # Using 536 as a placeholder for unknown categories
        TodaysData2[col] = encoded_values
    else:
        print(f"Warning: Column {col} is not present in the training data.")
        # Fit a new label encoder for columns not present in Train4, but be cautious with this
        label_encoder = LabelEncoder()
        TodaysData2[col] = label_encoder.fit_transform(TodaysData2[col])
        label_encoders[col] = label_encoder

## Runs Random Forest, Gradient Boosting, and Extreme Gradient Boosting models on the data

In [24]:
TrainFeatures = Train5.drop(columns = ["NRFI"]).values.reshape(-1, 15)
TrainLabel = Train5["NRFI"].values.ravel()
TodayFeatures = TodaysData2.drop(columns = ["IP", "NRFI"]).values.reshape(-1, 15)

rf_regressor = RandomForestRegressor(n_estimators = 152, max_depth = 15, min_samples_leaf = 4)
rf_regressor.fit(TrainFeatures, TrainLabel)
RFpred = rf_regressor.predict(TodayFeatures)

TodaysData2["RFPred"] = RFpred

In [25]:
gb_regressor = GradientBoostingRegressor(n_estimators = 106, min_samples_leaf = 4, max_depth=8, max_features='log2', learning_rate=0.1)
gb_regressor.fit(TrainFeatures, TrainLabel)
GBpred = gb_regressor.predict(TodayFeatures)

TodaysData2["GBPred"] = GBpred

In [26]:
xgb_regressor = XGBRegressor(learning_rate = 0.1, n_estimators= 60, scale_pos_weight=1, max_depth = 35, min_child_weight = 20)
xgb_regressor.fit(TrainFeatures, TrainLabel)
XGBpred = xgb_regressor.predict(TodayFeatures)

TodaysData2["XGBPred"] = XGBpred

## Reverse encodes today's data so it can be understood

In [27]:
for col in non_numeric_columns_today:
    if col in label_encoders:
        label_encoder = label_encoders[col]
        # Handling default value of 536
        TodaysData2[col] = TodaysData2[col].apply(lambda x: label_encoder.inverse_transform([x])[0] if x != 536 else np.nan)

TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  TodaysData2["SP"].fillna(TodaysData1["SP"], inplace = True)


In [28]:
def add_2_to_duplicates(df):
    mask = df.duplicated(subset=['BatterTeam', 'RoadTeam', 'HomeTeam'], keep='first')
    
    df.loc[mask, 'BatterTeam'] += '2'
    df.loc[mask, 'RoadTeam'] += '2'
    df.loc[mask, 'HomeTeam'] += '2'
    
    return df

TodaysData2 = add_2_to_duplicates(TodaysData2)
TodaysData2

Unnamed: 0,BatterTeam,RoadTeam,HomeTeam,SP,IP,AB,BB%,wOBA,P5,P10,P5xwOBA,P10xwOBA,B5,B10,B5xwOBA,B10xwOBA,NRFI,RFPred,GBPred,XGBPred
0,CIN,CIN,CLE,Tanner Bibee,30.0,3.633,6.42,0.337,1.0,0.9,0.442,0.318,0.8,0.8,0.221,0.289,0.733,0.998988,0.966281,0.958714
1,CLE,CIN,CLE,Carson Spiers,9.0,4.222,10.53,0.265,0.6,0.667,0.282,0.261,0.4,0.5,0.432,0.405,0.667,0.681597,0.828812,0.633396
2,CHC,CHC,PHI,Tanner Banks,2.0,2.0,0.0,0.0,0.5,0.5,0.008,0.008,0.8,0.6,0.209,0.266,0.5,0.727203,0.573758,0.844071
3,PHI,CHC,PHI,Justin Steele,23.0,3.696,11.76,0.232,1.0,0.9,0.325,0.283,0.8,0.9,0.397,0.378,0.913,1.0,1.009618,0.993854
4,MIL,MIL,PIT,Bailey Falter,26.0,4.077,6.6,0.326,0.6,0.7,0.389,0.354,1.0,1.0,0.357,0.298,0.731,0.974861,1.028363,0.959548
5,PIT,MIL,PIT,Tobias Myers,23.0,3.913,6.67,0.309,1.0,0.8,0.304,0.302,0.6,0.5,0.375,0.453,0.652,0.990372,1.012367,0.953352
6,KC,KC,WSH,Mitchell Parker,28.0,4.071,9.65,0.288,0.6,0.7,0.333,0.36,1.0,0.8,0.257,0.299,0.714,0.992252,1.043077,0.989341
7,WSH,KC,WSH,Cole Ragans,30.0,3.833,7.83,0.201,0.8,0.8,0.19,0.197,0.6,0.6,0.306,0.317,0.767,0.991314,0.976107,0.97756
8,BAL,BAL,NYY,Clarke Schmidt,14.0,4.214,6.78,0.295,0.8,0.7,0.279,0.354,0.8,0.7,0.264,0.286,0.786,0.998355,0.94273,0.998855
9,NYY,BAL,NYY,Dean Kremer,23.0,3.609,4.82,0.171,0.8,0.8,0.155,0.183,0.6,0.6,0.449,0.42,0.87,0.997252,1.000147,1.034673


## Multiplies the predictions of each game together as they are independent events
## Creates a score composite of the various model predictions

In [29]:
TodaysData2[["RFPred", "GBPred", "XGBPred"]] = TodaysData2[["RFPred", "GBPred", "XGBPred"]].sub(0.15)
TodaysData3 = TodaysData2.groupby(["RoadTeam", "HomeTeam"])[["RFPred", "GBPred", "XGBPred"]].prod().reset_index()

TodaysData3['Games'] = TodaysData3.apply(lambda x: tuple(sorted([x['RoadTeam'], x['HomeTeam']])), axis=1)
TodaysData3['Score'] = TodaysData3[['RFPred', 'GBPred', "XGBPred"]].mean(axis=1)
TodaysData4 = TodaysData3.set_index("Games").drop(
    columns = ["RoadTeam", "HomeTeam"]).sort_values(
    "Score", ascending = False).drop(
    columns = ["RFPred", "GBPred", "XGBPred"]).round(3)

TodaysData4

Unnamed: 0_level_0,Score
Games,Unnamed: 1_level_1
"(COL, STL)",0.765
"(ATL, NYM)",0.725
"(AZ, SF)",0.716
"(HOU, SEA)",0.715
"(BAL, NYY)",0.715
"(KC, WSH)",0.714
"(MIL, PIT)",0.7
"(BOS, TOR)",0.698
"(OAK, TEX)",0.673
"(CWS, LAA)",0.657


## Writes this to an excel file creating a sheet so the daily performance can be tracked

In [30]:
excel_file = 'NRFI-Tracker.xlsx'

# If deleting the mode and engine it rewrites the whole file
#with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl') as writer:
    #TodaysData4.reset_index().to_excel(writer, index=False, sheet_name = eastern_time.strftime("%m-%d-%y"))