In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [2]:
def get_games(url):
    """Return the absolute URLs of the games linked from a season schedule page.

    Parameters
    ----------
    url : str
        Address of a page that links to every game of a season
        (e.g. the 2014 schedule page).

    Returns
    -------
    list of str
        One URL per matching link, in page order, each prefixed with the
        baseball-reference.com base address.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error page would simply produce an empty list.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Create soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Base URL for concatenation (the hrefs are joined onto it as-is)
    base_url = 'https://www.baseball-reference.com'

    # Anchors nested inside <em> elements carry the game links
    return [base_url + link['href'] for link in soup.select('em a')]

def get_ump(html):
    """Extract the name of the first umpire listed on a game page.

    The function looks for the literal label ``Umpires:`` and reads the first
    comma-separated entry that follows it, expected to look like
    ``</strong> HP - Tim Welke``. The text after the first hyphen of that
    entry is returned with surrounding whitespace removed.

    Parameters
    ----------
    html : str
        Raw HTML of a game page.

    Returns
    -------
    str
        The umpire name from the first entry after the label.

    Raises
    ------
    ValueError
        If the ``Umpires:`` label is missing, or the first entry after it
        contains no hyphen separating the position from the name.
    """
    target_word = "Umpires:"
    label_idx = html.find(target_word)
    if label_idx == -1:
        # str.find returns -1 when absent; continuing would parse an
        # arbitrary slice near the top of the page.
        raise ValueError("no 'Umpires:' label found in page HTML")

    start_idx = label_idx + len(target_word)
    # Only the first 100 characters after the label are inspected.
    first_entry = html[start_idx:start_idx + 100].split(',')[0]

    # Split on the FIRST hyphen only, so hyphenated surnames stay intact.
    _position, separator, name = first_entry.partition('-')
    if not separator:
        raise ValueError("no '<position> - <name>' entry found after 'Umpires:'")

    return name.strip()

In [6]:
# This is meant to be run as a cell. It can be re-run after an interruption:
# the loop starts at index len(df_list), i.e. at the first game that has not
# yet contributed a DataFrame.
# NOTE(review): `games_2014` and `df_list` are not created in any cell shown
# above this one; they must already exist in the kernel.
# NOTE(review): this same loop is repeated for other seasons further down the
# notebook; it is a good candidate for a shared function.
# Set the list of the games to get here
game_list = games_2014

# Single-letter pitch codes, grouped by how they change the count.
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'  # adds a strike only while there are fewer than two
ball_codes = ['B', 'I', 'P', 'V']

# Main loop: builds one DataFrame of pitches per game and appends it to df_list
for i in range(len(df_list), len(game_list)):
    game_url = game_list[i]

    # Load the html from the game page; fail loudly on HTTP errors or a hang
    response = requests.get(game_url, timeout=30)
    response.raise_for_status()
    # Remove comment delimiters so any markup wrapped in HTML comments is
    # parsed too (presumably the play-by-play table is delivered that way).
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    # Retrieve all other game information
    soup = BeautifulSoup(html, 'html.parser')
    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise RuntimeError(f'no play_by_play table found: {game_url}')
    rows = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Collapse each run of consecutive rows for the same batter down to the
    # LAST row of the run. Building a new list (rather than popping while
    # indexing) handles runs of three or more rows correctly.
    events = []
    prev_batter = None
    for row in rows:
        batter_cell = row.find(attrs={"data-stat": "batter"})
        batter_name = batter_cell.get_text() if batter_cell is not None else None
        if events and batter_name is not None and batter_name == prev_batter:
            events[-1] = row
        else:
            events.append(row)
        prev_batter = batter_name

    # Fresh accumulators for every game, so a game with few or no event rows
    # can never reuse the previous game's lists.
    pitch = []
    balls = []
    strikes = []
    count = []
    pitchers = []
    batters = []
    home_pitcher = []
    innings = []
    run_diffs = []
    umps = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            # Keep only the recognised pitch codes
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                continue

            # These values are the same for every pitch of the event
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            # Score text is "<batting team>-<pitching team>"
            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # Inning text is a one-letter half marker followed by the inning
            # number (presumably 't' = top, otherwise bottom).
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                pitch.append(pit)
                balls.append(ball_count)
                strikes.append(strike_count)
                count.append(f'{ball_count}-{strike_count}')
                batters.append(batter)
                pitchers.append(pitcher)
                umps.append(ump_name)
                home_pitcher.append(pitcher_home)
                innings.append(inning_num)
                run_diffs.append(run_diff)

                if pit in strike_codes or (pit == foul_code and strike_count < 2):
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the event that failed and keep going
            print(f'{exception}: {game_url}')

    # Populate DataFrame for this game
    df = pd.DataFrame({
        "pitch": pitch,
        "balls": balls,
        "strikes": strikes,
        "count": count,
        "batter": batters,
        "pitcher": pitchers,
        "umpire": umps,
        "home_pitcher": home_pitcher,
        "inning": innings,
        "run_diff": run_diffs,
    })

    df_list.append(df)

In [9]:
# Stack the per-game DataFrames into a single table of pitches.
# NOTE(review): ignore_index is not passed, so each game's own row index is
# kept and the index values repeat from game to game.
pitches_df = pd.concat(df_list)
pitches_df

Unnamed: 0,pitch,balls,strikes,count,batter,pitcher,umpire,home_pitcher,inning,run_diff
0,C,0,0,0-0,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
1,F,0,1,0-1,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
2,S,0,2,0-2,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
3,B,0,0,0-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
4,B,1,0,1-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
5,F,2,0,2-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
6,S,2,1,2-1,Justin Turner,Wade Miley,Tim Welke,1,1,0
7,C,2,2,2-2,Justin Turner,Wade Miley,Tim Welke,1,1,0
8,S,0,0,0-0,Hanley Ramirez,Wade Miley,Tim Welke,1,1,0
9,B,0,1,0-1,Hanley Ramirez,Wade Miley,Tim Welke,1,1,0


In [7]:
# Number of per-game DataFrames collected so far.
len(df_list)

2462

In [8]:
# Number of game URLs in games_2014; it should equal len(df_list) once every
# game has been scraped.
len(games_2014)

2462

In [10]:
# Write the concatenated pitch table to disk. The DataFrame index is written
# as the first, unnamed CSV column (pandas default).
pitches_df.to_csv("games_2014.csv")

In [11]:
# Collect the game URLs linked from the 2013 season schedule page.
games_2013_url = 'https://www.baseball-reference.com/leagues/MLB/2013-schedule.shtml'
games_2013 = get_games(games_2013_url)

# Start from an empty list: the scraping loop begins at index len(df_list).
df_list = []

In [20]:
# This is meant to be run as a cell. It can be re-run after an interruption:
# the loop starts at index len(df_list), i.e. at the first game that has not
# yet contributed a DataFrame.
# NOTE(review): this same loop is repeated for other seasons in this
# notebook; it is a good candidate for a shared function.
# Set the list of the games to get here
game_list = games_2013

# Single-letter pitch codes, grouped by how they change the count.
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'  # adds a strike only while there are fewer than two
ball_codes = ['B', 'I', 'P', 'V']

# Main loop: builds one DataFrame of pitches per game and appends it to df_list
for i in range(len(df_list), len(game_list)):
    game_url = game_list[i]

    # Load the html from the game page; fail loudly on HTTP errors or a hang
    response = requests.get(game_url, timeout=30)
    response.raise_for_status()
    # Remove comment delimiters so any markup wrapped in HTML comments is
    # parsed too (presumably the play-by-play table is delivered that way).
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    # Retrieve all other game information
    soup = BeautifulSoup(html, 'html.parser')
    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise RuntimeError(f'no play_by_play table found: {game_url}')
    rows = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Collapse each run of consecutive rows for the same batter down to the
    # LAST row of the run. Building a new list (rather than popping while
    # indexing) handles runs of three or more rows correctly.
    events = []
    prev_batter = None
    for row in rows:
        batter_cell = row.find(attrs={"data-stat": "batter"})
        batter_name = batter_cell.get_text() if batter_cell is not None else None
        if events and batter_name is not None and batter_name == prev_batter:
            events[-1] = row
        else:
            events.append(row)
        prev_batter = batter_name

    # Fresh accumulators for every game, so a game with few or no event rows
    # can never reuse the previous game's lists.
    pitch = []
    balls = []
    strikes = []
    count = []
    pitchers = []
    batters = []
    home_pitcher = []
    innings = []
    run_diffs = []
    umps = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            # Keep only the recognised pitch codes
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                continue

            # These values are the same for every pitch of the event
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            # Score text is "<batting team>-<pitching team>"
            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # Inning text is a one-letter half marker followed by the inning
            # number (presumably 't' = top, otherwise bottom).
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                pitch.append(pit)
                balls.append(ball_count)
                strikes.append(strike_count)
                count.append(f'{ball_count}-{strike_count}')
                batters.append(batter)
                pitchers.append(pitcher)
                umps.append(ump_name)
                home_pitcher.append(pitcher_home)
                innings.append(inning_num)
                run_diffs.append(run_diff)

                if pit in strike_codes or (pit == foul_code and strike_count < 2):
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the event that failed and keep going
            print(f'{exception}: {game_url}')

    # Populate DataFrame for this game
    df = pd.DataFrame({
        "pitch": pitch,
        "balls": balls,
        "strikes": strikes,
        "count": count,
        "batter": batters,
        "pitcher": pitchers,
        "umpire": umps,
        "home_pitcher": home_pitcher,
        "inning": innings,
        "run_diff": run_diffs,
    })

    df_list.append(df)

In [21]:
# Number of per-game DataFrames collected for the 2013 game list.
len(df_list)

2469

In [22]:
# Stack the per-game DataFrames into a single table of pitches (this rebinds
# pitches_df). ignore_index is not passed, so index values repeat per game.
pitches_df = pd.concat(df_list)

In [23]:
# Write the concatenated pitch table to disk. The DataFrame index is written
# as the first, unnamed CSV column (pandas default).
pitches_df.to_csv("games_2013.csv")

In [25]:
# Collect the game URLs linked from the 2015 season schedule page.
games_2015_url = 'https://www.baseball-reference.com/leagues/MLB/2015-schedule.shtml'
games_2015 = get_games(games_2015_url)

# Start from an empty list: the scraping loop begins at index len(df_list).
df_list = []

In [38]:
# This is meant to be run as a cell. It can be re-run after an interruption:
# the loop starts at index len(df_list), i.e. at the first game that has not
# yet contributed a DataFrame.
# NOTE(review): this same loop is repeated for other seasons in this
# notebook; it is a good candidate for a shared function.
# Set the list of the games to get here
game_list = games_2015

# Single-letter pitch codes, grouped by how they change the count.
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'  # adds a strike only while there are fewer than two
ball_codes = ['B', 'I', 'P', 'V']

# Main loop: builds one DataFrame of pitches per game and appends it to df_list
for i in range(len(df_list), len(game_list)):
    game_url = game_list[i]

    # Load the html from the game page; fail loudly on HTTP errors or a hang
    response = requests.get(game_url, timeout=30)
    response.raise_for_status()
    # Remove comment delimiters so any markup wrapped in HTML comments is
    # parsed too (presumably the play-by-play table is delivered that way).
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    # Retrieve all other game information
    soup = BeautifulSoup(html, 'html.parser')
    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise RuntimeError(f'no play_by_play table found: {game_url}')
    rows = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Collapse each run of consecutive rows for the same batter down to the
    # LAST row of the run. Building a new list (rather than popping while
    # indexing) handles runs of three or more rows correctly.
    events = []
    prev_batter = None
    for row in rows:
        batter_cell = row.find(attrs={"data-stat": "batter"})
        batter_name = batter_cell.get_text() if batter_cell is not None else None
        if events and batter_name is not None and batter_name == prev_batter:
            events[-1] = row
        else:
            events.append(row)
        prev_batter = batter_name

    # Fresh accumulators for every game, so a game with few or no event rows
    # can never reuse the previous game's lists.
    pitch = []
    balls = []
    strikes = []
    count = []
    pitchers = []
    batters = []
    home_pitcher = []
    innings = []
    run_diffs = []
    umps = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            # Keep only the recognised pitch codes
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                continue

            # These values are the same for every pitch of the event
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            # Score text is "<batting team>-<pitching team>"
            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # Inning text is a one-letter half marker followed by the inning
            # number (presumably 't' = top, otherwise bottom).
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                pitch.append(pit)
                balls.append(ball_count)
                strikes.append(strike_count)
                count.append(f'{ball_count}-{strike_count}')
                batters.append(batter)
                pitchers.append(pitcher)
                umps.append(ump_name)
                home_pitcher.append(pitcher_home)
                innings.append(inning_num)
                run_diffs.append(run_diff)

                if pit in strike_codes or (pit == foul_code and strike_count < 2):
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the event that failed and keep going
            print(f'{exception}: {game_url}')

    # Populate DataFrame for this game
    df = pd.DataFrame({
        "pitch": pitch,
        "balls": balls,
        "strikes": strikes,
        "count": count,
        "batter": batters,
        "pitcher": pitchers,
        "umpire": umps,
        "home_pitcher": home_pitcher,
        "inning": innings,
        "run_diff": run_diffs,
    })

    df_list.append(df)

In [41]:
# Stack the per-game DataFrames into a single table of pitches (this rebinds
# pitches_df). ignore_index is not passed, so index values repeat per game.
pitches_df = pd.concat(df_list)

In [42]:
# Write the concatenated pitch table to disk. The DataFrame index is written
# as the first, unnamed CSV column (pandas default).
pitches_df.to_csv("games_2015.csv")