In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import unicodedata

In [4]:
def clean_up_pitcher_name(name):
    """Reduce a raw scraped pitcher label to the bare pitcher name.

    Steps, in order:
      1. Apply NFKD compatibility normalization (this turns, for example, a
         non-breaking space into an ordinary space).
      2. Drop every newline and space character.
      3. Slice off the first character and the last three characters. Per the
         original author's note these are the leading "-" and the trailing
         "(L)" / "(R)" marker; the trim is purely positional.
    """
    normalized = unicodedata.normalize("NFKD", name)
    compact = "".join(ch for ch in normalized if ch not in ("\n", " "))
    return compact[1:-3]

In [28]:
def get_team_and_pitcher_names(game):
    """Pull team and pitcher names out of a game's two team entries.

    ``game`` is indexed positionally: element 1 is treated as the home entry
    and element 0 as the away entry. Each entry's ``.a.text`` is split once on
    whitespace; the first piece is taken as the team and the remainder is
    passed through ``clean_up_pitcher_name``.

    Returns a 4-tuple: (home_team, away_team, home_pitcher, away_pitcher).
    """
    home_parts = game[1].a.text.split(None, 1)
    away_parts = game[0].a.text.split(None, 1)

    teams = (home_parts[0], away_parts[0])
    pitchers = (
        clean_up_pitcher_name(home_parts[1]),
        clean_up_pitcher_name(away_parts[1]),
    )
    return teams + pitchers

In [32]:
def get_scores(scores):
    """Return ``(home_score, away_score)`` as the raw ``.text`` of each entry.

    ``scores`` is indexed positionally: element 1 supplies the home score and
    element 0 the away score. The values are returned unchanged (no numeric
    conversion).
    """
    # Index order (1, 0) yields home first, then away.
    return tuple(scores[position].text for position in (1, 0))

In [33]:
def get_moneylines(moneyline):
    """Return ``(home_line, away_line)`` from the first two moneyline entries.

    Only elements 0 (away) and 1 (home) are read; each contributes its
    ``.text`` unchanged. Per the original author's note, the first two numbers
    in this sequence are the odds from Pinnacle.
    """
    home_entry, away_entry = moneyline[1], moneyline[0]
    return home_entry.text, away_entry.text

In [44]:
def get_win_percentages(consensus):
    """Return ``(home_win_percent, away_win_percent)`` from a consensus element.

    ``consensus.text`` is split on "%" at most twice. The away percentage is
    the first piece and the home percentage the second, so the two are swapped
    on return. The pieces are returned as-is (strings, no "%" sign).
    """
    pieces = consensus.text.split("%", 2)
    home_pct = pieces[1]
    away_pct = pieces[0]
    return home_pct, away_pct

In [47]:
def scrape_info_for_date(date, timeout=30):
    """Scrape the first completed MLB game listed for ``date``.

    Queries the sportsbookreview.com MLB betting-odds page for the given date
    and extracts details of the first element whose class is
    "event-holder holder-complete". Only that first game is returned; nothing
    is written to disk.

    Parameters
    ----------
    date : int or str
        Date in YYYYMMDD form, e.g. ``20171009``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    tuple
        ``(home_team, away_team, home_pitcher, away_pitcher, home_score,
        away_score, home_line, away_line, home_win_percent,
        away_win_percent)``, all as strings taken from the page.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    IndexError
        If the page lists no completed games for ``date``.
    """
    # Let requests build the query string so the date value is URL-encoded.
    response = requests.get(
        "https://www.sportsbookreview.com/betting-odds/mlb-baseball/",
        params={"date": str(date)},
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # All games marked complete for that day.
    games = soup.find_all(class_="event-holder holder-complete")
    if not games:
        # Same exception type that games[0] would raise, but with context.
        raise IndexError("no completed games found for date {}".format(date))
    first_game = games[0]

    # Team and pitcher names.
    team_entries = first_game.find_all(class_="team-name")
    home_team, away_team, home_pitcher, away_pitcher = get_team_and_pitcher_names(team_entries)

    # Final scores.
    scores = first_game.find_all(class_="current-score")
    home_score, away_score = get_scores(scores)

    # Moneylines: get_moneylines reads only the first two <b> elements.
    moneyline = first_game.find_all('b')
    home_line, away_line = get_moneylines(moneyline)

    # Consensus win percentages (kept in case they are needed later).
    consensus = first_game.find(class_="el-div eventLine-consensus")
    home_win_percent, away_win_percent = get_win_percentages(consensus)

    return (
        home_team,
        away_team,
        home_pitcher,
        away_pitcher,
        home_score,
        away_score,
        home_line,
        away_line,
        home_win_percent,
        away_win_percent,
    )

In [48]:
# Example run for 9 October 2017 (date passed as YYYYMMDD).
scrape_info_for_date(20171009)

('BOS',
 'HOU',
 'F.Porcello',
 'C.Morton',
 '4',
 '5',
 '+106',
 '-115',
 '46.3',
 '53.7')