# Model 1: Basic Regression for Predicting MLB Scores

In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import sys
import datetime
import pickle as pkl
import matplotlib.pyplot as plt
# Resolve the project root: the parent of the directory the notebook runs from.
# Data files are read from `<parentDirectory>/data` further down.
import os
parentDirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Make the project's `src` package importable. The path is derived from the
# working directory rather than a user-specific absolute path, and the guard
# keeps re-running this cell from appending duplicate entries.
# NOTE(review): assumes the notebook sits one level below the project root — confirm.
if parentDirectory not in sys.path:
    sys.path.append(parentDirectory)
from src.data import bbref_scrape

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Preparation

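First, scrape, import, and parse the box-score data from baseball-reference. The scraping and parsing cells below are commented out because their results have been saved to disk, so we don't have to repeat those steps every time we run the notebook; uncomment them to rebuild the data from scratch.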

In [3]:
# start19 = datetime.datetime(2019, 1, 1)
# end19 = datetime.datetime(2019, 12, 31)
# start18 = datetime.datetime(2018, 1, 1)
# end18 = datetime.datetime(2018, 12, 31)
# start17 = datetime.datetime(2017, 1, 1)
# end17 = datetime.datetime(2017, 12, 31)

In [4]:
# links19 = bbref_scrape.get_box_score_links('ALL', start19, end19)
# boxscores19 = bbref_scrape.get_box_scores(links19)
# pkl.dump(boxscores19, open(os.path.join(parentDirectory, 'data', 'boxscores2019.p'), 'wb'))

In [5]:
# links18 = bbref_scrape.get_box_score_links('ALL', start18, end18)
# boxscores18 = bbref_scrape.get_box_scores(links18)
# pkl.dump(boxscores18, open(os.path.join(parentDirectory, 'data', 'boxscores2018.p'), 'wb'))

In [6]:
# links17 = bbref_scrape.get_box_score_links('ALL', start17, end17)
# boxscores17 = bbref_scrape.get_box_scores(links17)
# pkl.dump(boxscores17, open(os.path.join(parentDirectory, 'data', 'boxscores2017.p'), 'wb'))

In [7]:
# scores = pkl.load(open(os.path.join(parentDirectory, 'data', 'boxscores2019.p'), 'rb'))
# parsed_scores = bbref_scrape.parse_box_scores(scores)
# pkl.dump(parsed_scores, open(os.path.join(parentDirectory, 'data', 'parsedData2019.p'), 'wb'))

# scores = pkl.load(open(os.path.join(parentDirectory, 'data', 'boxscores2018.p'), 'rb'))
# parsed_scores = bbref_scrape.parse_box_scores(scores)
# pkl.dump(parsed_scores, open(os.path.join(parentDirectory, 'data', 'parsedData2018.p'), 'wb'))

# scores = pkl.load(open(os.path.join(parentDirectory, 'data', 'boxscores2017.p'), 'rb'))
# parsed_scores = bbref_scrape.parse_box_scores(scores)
# pkl.dump(parsed_scores, open(os.path.join(parentDirectory, 'data', 'parsedData2017.p'), 'wb'))

In [8]:
# Load the cached 2019 parser output. The context manager guarantees the file
# handle is closed once the pickle has been read.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
with open(os.path.join(parentDirectory, 'data', 'parsedData2019.p'), 'rb') as parsed_file:
    parsed19 = pkl.load(parsed_file)

The parser outputs four dataframes. For now we will only concern ourselves with the game and team level data.

In [9]:
# Pull the game-level and team-level tables out of the parser output.
game_level, team_level = (parsed19[table] for table in ('Game', 'Team'))

For this first model we'll only consider the number of runs, the number of hits, and the starter for each game. Let's extract those columns and make sure they have the appropriate data types.

In [10]:
# Columns kept for the first model, each paired with the dtype it is cast to.
team_dtypes = {
    'GameID': 'int64',
    'Team': 'category',
    'GameNum': 'Int32',
    'HomeAway': 'category',
    'Runs': 'Int32',
    'Hits': 'Int32',
    'Starter': 'category',
}
team_data = team_level[list(team_dtypes)].astype(team_dtypes, copy=False)

team_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4858 entries, 0 to 4857
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   GameID    4858 non-null   int64   
 1   Team      4858 non-null   category
 2   GameNum   4858 non-null   Int32   
 3   HomeAway  4858 non-null   category
 4   Runs      4858 non-null   Int32   
 5   Hits      4858 non-null   Int32   
 6   Starter   4858 non-null   category
dtypes: Int32(3), category(3), int64(1)
memory usage: 142.7 KB


Now compute the lagged running average of runs and hits for each team: for every game, the mean over all of that team's earlier games, excluding the game itself.

In [11]:
# Order rows by each team's game number so the expanding window walks the
# season in order. Rebinding (instead of sorting in place) keeps the cell
# free of hidden in-place mutation.
team_data = team_data.sort_values(by='GameNum')

# Running (season-to-date) mean of runs and hits per team. Runs/Hits are
# selected *before* expanding so the aggregation never touches GameID or the
# categorical columns, which recent pandas versions refuse to average.
running_means = (
    team_data.groupby(by='Team')[['Runs', 'Hits']]
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)  # drop the Team level so rows align on the original index
)
team_data['MeanRuns'] = running_means['Runs']
team_data['MeanHits'] = running_means['Hits']

# Lag by one game within each team so a game's features only use earlier
# games; each team's first game therefore has no value.
team_data[['MeanRuns', 'MeanHits']] = team_data.groupby(by='Team')[['MeanRuns', 'MeanHits']].shift(1)

We will need the game odds. Because of how the odds file is formatted, it is easiest to add the odds to team_data first. Below we import the odds dataset and join it to the batting lines. Note that the odds dataset identifies teams by abbreviation, so we need to define those abbreviations and add them to team_data before we can join.

In [12]:
# Full team name (as used in team_data['Team']) -> abbreviation used as the
# join key against the odds dataset.
team_abbrv = {
    'Atlanta Braves': 'ATL',
    'Arizona Diamondbacks': 'ARI',
    'Baltimore Orioles': 'BAL',
    'Boston Red Sox': 'BOS',
    'Chicago Cubs': 'CUB',
    'Chicago White Sox': 'CWS',
    'Cincinnati Reds': 'CIN',
    'Cleveland Indians': 'CLE',
    'Colorado Rockies': 'COL',
    'Detroit Tigers': 'DET',
    'Kansas City Royals': 'KAN',
    'Houston Astros': 'HOU',
    'Los Angeles Angels': 'LAA',
    'Los Angeles Dodgers': 'LAD',
    'Miami Marlins': 'MIA',
    'Florida Marlins': 'FLA',
    'Milwaukee Brewers': 'MIL',
    'Minnesota Twins': 'MIN',
    'New York Mets': 'NYM',
    'New York Yankees': 'NYY',
    'Oakland Athletics': 'OAK',
    'Philadelphia Phillies': 'PHI',
    'Pittsburgh Pirates': 'PIT',
    'San Diego Padres': 'SDG',
    'Seattle Mariners': 'SEA',
    'San Francisco Giants': 'SFO',
    'St. Louis Cardinals': 'STL',
    'Tampa Bay Rays': 'TAM',
    'Texas Rangers': 'TEX',
    'Toronto Blue Jays': 'TOR',
    'Washington Nationals': 'WAS',
}
# Direct dictionary lookup: a team name missing from the table raises KeyError.
team_data['TeamAbr'] = team_data['Team'].apply(team_abbrv.__getitem__)

# game_level['DateTime'] = pd.to_datetime(game_level['DateTime'], utc=True)
# Build a GameID -> Date lookup, where Date encodes the calendar day as
# month * 100 + day (e.g. 807), then attach it to every team row.
game_date = (
    game_level[['GameID', 'DateTime']]
    .astype({'GameID': 'int64'})
    .assign(Date=lambda games: games['DateTime'].map(lambda stamp: stamp.month * 100 + stamp.day))
    .drop(columns='DateTime')
)
team_data = team_data.merge(game_date, on='GameID')


# Read the 2019 odds file and rename its columns to line up with team_data.
odds_path = os.path.join(parentDirectory, 'data', 'mlbodds2019.csv')
odds = pd.read_csv(odds_path).rename(
    columns={'Team': 'TeamAbr', 'Final': 'Runs', 'Unnamed: 18': 'Line Odds'}
)

# Left join so team rows without a matching odds record are kept (with NaN odds).
# A record matches on team abbreviation, date code and final run total.
odds_columns = ['Date', 'TeamAbr', 'Runs', 'Run Line', 'Line Odds', 'Pitcher']
team_data = team_data.merge(odds[odds_columns], how='left', on=['TeamAbr', 'Date', 'Runs'])

Now join the team data back to the game_level data in order to come up with our final dataset.

In [13]:
# team_data.drop(columns=['DateTime', 'HomeAway'], inplace=True)
# One row per game: attach the away team's row, then the home team's row,
# from team_data. Columns shared by both sides get _away / _home suffixes.
game_columns = ['GameID', 'AwayTeam', 'HomeTeam', 'DateTime', 'AwayScore', 'HomeScore']
away_joined = game_level[game_columns].merge(
    team_data, left_on=['GameID', 'AwayTeam'], right_on=['GameID', 'Team']
)
model_data = (
    away_joined
    .merge(
        team_data,
        left_on=['GameID', 'HomeTeam'],
        right_on=['GameID', 'Team'],
        suffixes=('_away', '_home'),
    )
    # Target variable: away score minus home score.
    .assign(ScoreDiff=lambda games: games['AwayScore'] - games['HomeScore'])
    # Team_* duplicate AwayTeam/HomeTeam (they were the join keys); Runs_* are dropped as well.
    .drop(columns=['Team_away', 'Team_home', 'Runs_away', 'Runs_home'])
)

## Data Exploration

Now that we have all of the data we will use for our model, let's explore it a little bit.

In [14]:
# Games whose home and away run lines are not exact opposites. NaN never
# compares equal, so games with missing odds on either side are listed too.
run_line_mismatch = model_data['Run Line_home'].ne(-model_data['Run Line_away'])
model_data[run_line_mismatch]

Unnamed: 0,GameID,AwayTeam,HomeTeam,DateTime,AwayScore,HomeScore,GameNum_away,HomeAway_away,Hits_away,Starter_away,...,Hits_home,Starter_home,MeanRuns_home,MeanHits_home,TeamAbr_home,Date_home,Run Line_home,Line Odds_home,Pitcher_home,ScoreDiff
578,-910491629547739276,Kansas City Royals,Boston Red Sox,2019-08-07 19:10:00-05:00,4,5,116,Away,13,Glenn Sparkman,...,12,Eduardo Rodriguez,5.698276,9.767241,BOS,807,,,,-1
778,7842854319504398228,Chicago White Sox,New York Yankees,2019-04-12 19:05:00-05:00,9,6,12,Away,12,Lucas Giolito,...,7,J.A. Happ,5.083333,8.75,NYY,412,-1.5,-110.0,JHAPP-L,3
809,2568596730165295575,Kansas City Royals,Chicago White Sox,2019-05-27 14:10:00-05:00,1,2,53,Away,7,Homer Bailey,...,7,Ivan Nova,4.192308,8.326923,CWS,527,,,,-1
1311,-2068446951398620634,Oakland Athletics,Detroit Tigers,2019-05-19 13:10:00-05:00,7,3,48,Away,12,Mike Fiers,...,5,Gregory Soto,3.318182,7.295455,DET,519,1.5,116.0,GSOTO-L,4
1510,1915063218856430657,Texas Rangers,Houston Astros,2019-05-10 20:10:00-05:00,0,3,36,Away,2,Lance Lynn,...,6,Justin Verlander,5.052632,9.289474,HOU,510,,,,-3
1517,5327454643872064934,Houston Astros,Seattle Mariners,2019-06-04 22:10:00-05:00,11,5,62,Away,13,Wade Miley,...,5,Andrew Moore,4.936508,8.222222,SEA,604,1.5,-113.0,AMOORE-R,6
2044,9059109643928713529,St. Louis Cardinals,New York Mets,2019-06-13 19:10:00-05:00,5,4,67,Away,10,Jack Flaherty,...,6,Jacob deGrom,4.686567,8.626866,NYM,613,-1.5,149.0,JDEGROM-R,1
2116,1547503049529398916,Tampa Bay Rays,New York Yankees,2019-07-18 15:00:00-05:00,2,6,98,Away,5,Yonny Chirinos,...,10,Domingo German,5.634409,9.021505,NYY,718,,,,-4
2222,-8347995212585020649,Philadelphia Phillies,San Diego Padres,2019-06-04 22:10:00-05:00,9,6,61,Away,13,Jerad Eickhoff,...,10,Chris Paddack,4.0,7.566667,SDG,604,,,,3


In [None]:
# All odds rows for date code 807 (month * 100 + day).
odds.loc[odds['Date'].eq(807)]

In [None]:
# Fifth odds row recorded for ATL.
atl_odds = odds[odds['TeamAbr'].eq('ATL')]
atl_odds.iloc[4]

In [None]:
# Non-null count per column for the LAD rows of team_data.
team_data.loc[team_data['TeamAbr'].eq('LAD')].count()

In [None]:
# Look up a single game by its GameID.
game_level.loc[game_level['GameID'].eq(1096196326745839632)]

In [None]:
# DateTime of the first listed game with Atlanta as the home team.
braves_home_games = game_level[game_level['HomeTeam'].eq('Atlanta Braves')]
braves_home_games['DateTime'].iloc[0]

In [None]:
# Grab one raw box score for inspection.
# NOTE(review): `scores` is only assigned inside commented-out code earlier in
# this notebook, so this raises NameError on a fresh kernel unless one of those
# `scores = pkl.load(...)` lines is restored and run first.
box_score = scores[3]

In [None]:
# Join the box score's date and time strings (with the 'Local' marker removed),
# parse them with dateparser, and convert the result with pd.to_datetime.
# NOTE(review): `dateparser` is not imported anywhere in the visible notebook —
# add the import to the top cell before running this.
pd.to_datetime(dateparser.parse(box_score.date + ' ' + box_score.time.replace('Local', '')))

In [None]:
# Same date/time string as the pd.to_datetime cell, but showing dateparser's raw result.
# NOTE(review): `dateparser` is not imported anywhere in the visible notebook —
# add the import to the top cell before running this.
dateparser.parse(box_score.date + ' ' + box_score.time.replace('Local', ''))