In [1]:
import os
import unicodedata

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def format_date(date):
    """Turn a YYYYMMDD date into an "MM/DD/YYYY" string.

    `date` may be an int or a string; it is converted with str() and then
    sliced by position (first four characters = year, next two = month,
    remainder = day). No validation is performed.
    """
    digits = str(date)
    yyyy, mm, dd = digits[:4], digits[4:6], digits[6:]
    return "/".join((mm, dd, yyyy))

In [3]:
def clean_up_pitcher_name(name):
    """Normalise the pitcher portion of a team-name cell.

    The text is NFKD-normalised (so compatibility characters such as a
    non-breaking space become their plain equivalents), every newline and
    space is removed, and then the first character and the last three
    characters are dropped. Per the original author's note, those are the
    leading "-" and the trailing "(L)" / "(R)" handedness marker.
    """
    normalized = unicodedata.normalize("NFKD", name)
    # Delete newlines and spaces in a single pass.
    compact = normalized.translate({ord("\n"): None, ord(" "): None})
    return compact[1:-3]

In [4]:
def get_team_and_pitcher_names(game):
    """Extract team and pitcher names from a game's two team-name elements.

    `game` is indexed as game[0] (away) and game[1] (home); each element's
    `.a.text` is split once on whitespace into a team token and the
    remaining pitcher text, which is passed to clean_up_pitcher_name.

    Returns (home_team, away_team, home_pitcher, away_pitcher).
    """
    home_parts = game[1].a.text.split(None, 1)
    away_parts = game[0].a.text.split(None, 1)
    return (
        home_parts[0],
        away_parts[0],
        clean_up_pitcher_name(home_parts[1]),
        clean_up_pitcher_name(away_parts[1]),
    )

In [5]:
def get_scores(scores):
    """Return (home_score, away_score) as text.

    The home score is read from scores[1] and the away score from
    scores[0]; both are returned exactly as their `.text` value.
    """
    return scores[1].text, scores[0].text

In [6]:
def get_moneylines(moneyline):
    """Return (home_line, away_line) as text.

    Only the first two entries are used: index 1 for the home side and
    index 0 for the away side. According to the original author's note,
    the first two numbers in this array are the odds by Pinnacle.
    """
    return moneyline[1].text, moneyline[0].text

In [47]:
"""
Given a date (YYYYMMDD), the zero-padded month number, and the month name, this function scrapes sportsbookreview.com,
takes only the moneyline from Pinnacle, and creates and saves a CSV for that day
"""
def scrape_info_for_date(date, y, month):
    """Scrape one day of completed MLB games from sportsbookreview.com into a CSV.

    Parameters
    ----------
    date : int or str
        The day to scrape, in YYYYMMDD form. It is appended to the query URL
        and used as the CSV file name.
    y : str
        First part of the output sub-folder name. The driver loop passes the
        zero-padded month number (e.g. "05").
    month : str
        Second part of the output sub-folder name (e.g. "May").

    Returns
    -------
    int or None
        0 when the page contains no completed games (no file is written);
        None after the CSV has been written.

    Notes
    -----
    The output root is hard-coded to "2017", so the file is written to
    2017/<y><month>/<date>.csv. The folder is created if it does not exist.
    """
    # Query the odds page for the requested day. A timeout keeps a single
    # stalled connection from hanging the whole scraping loop.
    query = "https://www.sportsbookreview.com/betting-odds/mlb-baseball/?date=" + str(date)
    r = requests.get(query, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    # Every completed game played that day.
    games = soup.find_all(class_="event-holder holder-complete")
    if not games:  # no games on this day
        return 0

    date2 = format_date(date)
    rows_list = []

    for game in games:
        # Team and pitcher names. The pitcher names are parsed but are not
        # written to the CSV.
        team_cells = game.find_all(class_="team-name")
        home_team, away_team, home_pitcher, away_pitcher = get_team_and_pitcher_names(team_cells)

        # Final scores.
        scores = game.find_all(class_="current-score")
        home_score, away_score = get_scores(scores)

        # Moneylines: get_moneylines reads only the first two <b> elements.
        moneyline = game.find_all('b')
        home_line, away_line = get_moneylines(moneyline)

        rows_list.append({
            'date': date2,
            'homeTeam': home_team,
            'awayTeam': away_team,
            'homeScore': home_score,
            'awayScore': away_score,
            'homeLine': home_line,
            'awayLine': away_line
        })

    f = pd.DataFrame(rows_list)

    # Make sure the destination folder exists; to_csv does not create it.
    folder = "2017/" + str(y) + month + "/"
    os.makedirs(folder, exist_ok=True)
    f.to_csv(folder + str(date) + ".csv", index=False,
             columns=['date', 'homeTeam', 'awayTeam', 'homeScore', 'awayScore',
                      'homeLine', 'awayLine'])

In [52]:
"""The 2017 season started on April 2. So when scraping for April, 
    - the inner loop should start from 2
    - the months list with only April should be used
    - y should be i+4
For other months
    - inner loop should start from 1
    - y should be i+5
"""
# Alternative configuration for April (see the note above):
# months = ['April']
# days_in_month = [30]

# The months of the season and the number of days in each, in matching order.
months = ['May', 'June', 'July', 'August', 'September', 'October', 'November']
days_in_month = [31, 30, 31, 31, 30, 31, 30]

for i, (month_name, last_day) in enumerate(zip(months, days_in_month)):
    print (month_name)
    # Zero-padded month number; index 0 of `months` corresponds to May (05).
    y = format(i + 5, "02")
    for day in range(1, last_day + 1):
        # Build the YYYYMMDD integer for this day, e.g. 20170501.
        d = int("2017" + y + format(day, "02"))
        print (d)
        scrape_info_for_date(d, y, month_name)

May
20170501
20170502
20170503
20170504
20170505
20170506
20170507
20170508
20170509
20170510
20170511
20170512
20170513
20170514
20170515
20170516
20170517
20170518
20170519
20170520
20170521
20170522
20170523
20170524
20170525
20170526
20170527
20170528
20170529
20170530
20170531
June
20170601
20170602
20170603
20170604
20170605
20170606
20170607
20170608
20170609
20170610
20170611
20170612
20170613
20170614
20170615
20170616
20170617
20170618
20170619
20170620
20170621
20170622
20170623
20170624
20170625
20170626
20170627
20170628
20170629
20170630
July
20170701
20170702
20170703
20170704
20170705
20170706
20170707
20170708
20170709
20170710
20170711
20170712
20170713
20170714
20170715
20170716
20170717
20170718
20170719
20170720
20170721
20170722
20170723
20170724
20170725
20170726
20170727
20170728
20170729
20170730
20170731
August
20170801
20170802
20170803
20170804
20170805
20170806
20170807
20170808
20170809
20170810
20170811
20170812
20170813
20170814
20170815
20170816
2017081