In [27]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
import shutil
import unicodedata
import mlbgame
import glob

In [28]:
def format_date(date):
    """Convert a YYYYMMDD date into an "MM/DD/YYYY" string.

    `date` may be an int or a string; it is stringified first and then
    sliced by position (characters 0-3 year, 4-5 month, 6+ day).
    """
    text = str(date)
    return "/".join((text[4:6], text[6:], text[:4]))

In [29]:
def clean_up_pitcher_name(name):
    """Return the bare pitcher name extracted from the raw scraped text.

    The text is NFKD-normalized, every newline and space is dropped, and then
    the first character and the last three characters are cut off (per the
    original notes: the leading "-" and the trailing "(L)" or "(R)").
    """
    normalized = unicodedata.normalize("NFKD", name)
    compact = "".join(ch for ch in normalized if ch not in "\n ")
    # positional trim: 1 char at the front, 3 at the back
    return compact[1:-3]

In [30]:
def get_team_and_pitcher_names(game):
    """Split the two team entries of a game into team and pitcher names.

    `game` is indexed as game[0] (treated as the away entry) and game[1]
    (treated as the home entry); each entry's `.a.text` is split once on
    whitespace into a team token and the remaining pitcher text, which is
    passed through clean_up_pitcher_name.

    Returns (home_team, away_team, home_pitcher, away_pitcher).
    """
    home_parts = game[1].a.text.split(None, 1)
    away_parts = game[0].a.text.split(None, 1)

    return (
        home_parts[0],
        away_parts[0],
        clean_up_pitcher_name(home_parts[1]),
        clean_up_pitcher_name(away_parts[1]),
    )

In [31]:
def get_scores(scores):
    """Return (home_score, away_score) as the `.text` of scores[1] and scores[0]."""
    # index 1 is treated as the home side, index 0 as the away side
    return scores[1].text, scores[0].text

In [32]:
def get_moneylines(moneyline):
    """Return (home_line, away_line) as the `.text` of moneyline[1] and moneyline[0].

    Only the first two entries are read; per the original notes these are
    the odds by Pinnacle.
    """
    return moneyline[1].text, moneyline[0].text

In [33]:
def scrape_info_for_date(date, year, month):
    """Scrape one day's completed MLB games from sportsbookreview.com into a CSV.

    For every game block with class "event-holder holder-complete" the team
    names, final scores and the first two moneyline values (per the original
    notes, the Pinnacle odds) are collected and written to
    "<YYYY>/<year><month>/<date>.csv", where <YYYY> is the first four
    characters of `date`.

    Parameters
    ----------
    date : int or str
        Day to scrape, formatted YYYYMMDD.
    year : str
        Prefix of the month folder name. NOTE: despite the parameter name, the
        driver loop in this notebook passes the zero-padded month number here
        (e.g. "04"), giving folders such as "2017/04April".
    month : str
        Month name (e.g. "April"), used as the suffix of the folder name.

    Returns
    -------
    0 when the page lists no completed games (no file is written),
    otherwise None.
    """
    #I need to query the webpage first
    query = "https://www.sportsbookreview.com/betting-odds/mlb-baseball/?date=" + str(date)
    # a timeout keeps a stalled connection from hanging the whole scraping loop
    r = requests.get(query, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    #getting all the games played that day
    games = soup.find_all(class_="event-holder holder-complete")
    if len(games) == 0: #no games on this day
        return 0
    date2 = format_date(date)
    rows_list = []

    for game in games:
        #getting team and pitcher names
        #(pitcher names are parsed but are not written to the CSV)
        x = game.find_all(class_="team-name")
        home_team, away_team, home_pitcher, away_pitcher = get_team_and_pitcher_names(x)

        #getting scores
        scores = game.find_all(class_="current-score")
        home_score, away_score = get_scores(scores)

        #getting moneyline
        moneyline = game.find_all('b')
        home_line, away_line = get_moneylines(moneyline)

        rows_list.append({
            'date': date2,
            'homeTeam': home_team,
            'awayTeam': away_team,
            'homeScore': home_score,
            'awayScore': away_score,
            'homeLine': home_line,
            'awayLine': away_line
        })

    f = pd.DataFrame(rows_list)

    # Build the folder from the arguments only: the season comes from the date
    # itself and the prefix from the `year` parameter, so the function no longer
    # depends on a notebook-global `y` or on a hard-coded "2017/".
    folder = str(date)[:4] + "/" + str(year) + month + "/"
    # to_csv does not create missing directories
    if not os.path.isdir(folder):
        os.makedirs(folder)
    f.to_csv(folder + str(date) + ".csv", index=None,
             columns=['date', 'homeTeam', 'awayTeam', 'homeScore', 'awayScore',
                      'homeLine', 'awayLine'])

In [None]:
"""Driver cell: scrape every remaining day of the configured month(s).

The season starts in April. So when scraping for April,
    - the inner loop should start from the date that the season starts
    - the months list with only April should be used
    - y should be i+4
For other months
    - inner loop should start from 1
    - y should be i+5
"""
months = ['April']
days_in_month = [30]

# months = ['May', 'June', 'July', 'August', 'September', 'October', 'November'] #the months of the season
# days_in_month = [31, 30, 31, 31, 30, 31, 30]

year = '2017'

for i, month_name in enumerate(months):
    print(month_name)
    for day in range(3, days_in_month[i] + 1):
        # keep `y` as a module-level name: other cells in this notebook may refer to it
        y = format(i + 4, "02")           # zero-padded month number
        d = int(year + y + format(day, "02"))  # YYYYMMDD as an int
        print(d)
        scrape_info_for_date(d, y, month_name)

## The code above scrapes betting data for the 2017 season.



## The code below combines the per-day CSVs into one CSV per month, and then into a single CSV for the season.

In [21]:
"""Change these two lines for each season
"""
year = '2017'
folders = ['04April', '05May', '06June', '07July', '08August', '09September', '10October', '11November']

#making the csv for each month
for folder in folders:

    p = str(year) + '/' + str(folder)

    # glob returns paths in arbitrary order; the daily files are named
    # YYYYMMDD.csv, so sorting the names puts the merged rows in date order.
    files = sorted(glob.glob(p + '/*.csv')) #Getting a list of all files which I need to concat
    if not files:
        # pd.concat raises ValueError on an empty list, so a month without any
        # scraped days is reported and skipped instead of aborting the cell.
        print("No CSV files found in " + p + "; skipping")
        continue

    merged = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)
    # folder[2:] drops the two-digit month prefix, e.g. '04April' -> 'April'
    merged.to_csv(str(year) + '/' + folder[2:] + '.csv', index=None)

#removing the earlier directories for each day (they are no longer needed as we have a file for each month now)
for folder in folders:
    p = str(year) + '/' + str(folder)
    # a month folder that was never created has nothing to remove
    if os.path.isdir(p):
        shutil.rmtree(p)

In [26]:
#now combining the month CSVs into one CSV for the entire year
#(`year` is the value set in the monthly-combine cell of this notebook)

# glob returns paths in arbitrary order; sorting makes the output reproducible.
# NOTE: this is alphabetical by month file name, not calendar order.
files = sorted(glob.glob(str(year) + "/*.csv"))
merged = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)

# to_csv does not create missing directories, so make sure the target exists
output_dir = 'Betting Data'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
merged.to_csv(output_dir + '/' + str(year) + '.csv', index=None)

#removing the earlier directory for the year (no longer needed as we have a CSV file in the Betting Data directory now)
shutil.rmtree(str(year))