In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [2]:
## Given a url which has all the links to games from a season, (e.g. 2014)
## Return a list of URLs for the games for scraping purposes
def get_games(url, timeout=30):
    """Collect the game-page URLs linked from a season schedule page.

    Args:
        url: Address of a page that links to every game of a season
            (e.g. the 2014 schedule page).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection fails instead of hanging.

    Returns:
        A list of absolute URLs, one per ``<a>`` found inside an ``<em>``
        element, in document order.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and
    # silently returning an empty list of games
    response.raise_for_status()

    # Create soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Base URL for concatenation (the hrefs are appended to it verbatim)
    base_url = 'https://www.baseball-reference.com'

    # Grab the anchors carrying the game links, skipping any anchor that
    # has no href attribute rather than raising KeyError on it
    return [
        base_url + anchor['href']
        for anchor in soup.select('em a')
        if anchor.has_attr('href')
    ]

## Given the raw HTML from a game page,
## Scrape the name of the umpire and return it
def get_ump(html):
    """Return the name of the first umpire listed after the "Umpires:" label.

    The text right after the label is expected to be a comma-separated list
    of ``position - name`` entries, e.g. ``</strong> HP - Tim Welke, 1B - ...``.
    Only the first entry is read, and only the first 100 characters after the
    label are inspected.

    Args:
        html: Raw HTML of a game page, as a string.

    Returns:
        The name from the first entry, with surrounding whitespace removed.

    Raises:
        ValueError: If the label is not present, or if the first entry has
            no ``-`` separating the position from the name.
    """
    target_word = "Umpires:"
    label_idx = html.find(target_word)
    if label_idx == -1:
        # str.find returns -1 on a miss; without this check an arbitrary
        # slice near the start of the page would be parsed as a name
        raise ValueError('"Umpires:" label not found in html')

    start_idx = label_idx + len(target_word)
    first_entry = html[start_idx:start_idx + 100].split(',')[0]

    # Split on the FIRST hyphen only, so hyphenated names stay intact.
    # Everything before it (closing tag, position code) is discarded.
    _position, separator, ump_name = first_entry.partition('-')
    if not separator:
        raise ValueError('no "position - name" entry found after "Umpires:"')

    return ump_name.strip()

In [3]:
## Here is an outline of how our for loop might work
## In this case, it is grabbing information for all the games from the 2014 season
games_2014_url = 'https://www.baseball-reference.com/leagues/MLB/2014-schedule.shtml'
games_2014 = get_games(games_2014_url)

# One DataFrame per game, appended in the same order as games_2014
df_list = []

# Seconds to wait on a stalled connection before giving up on a request
request_timeout = 30

# Pitch-sequence letters that advance the count
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'
ball_codes = ['B', 'I', 'P', 'V']

# Column order of every per-game DataFrame
column_names = ['pitch', 'balls', 'strikes', 'count', 'batter', 'pitcher',
                'umpire', 'home_pitcher', 'inning', 'run_diff']

for url in games_2014:
    # Load the html from the game page; raise on timeouts and HTTP errors
    # instead of parsing an error page
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    # Drop the HTML comment markers so markup wrapped in comments is parsed too
    # (presumably this is where the play-by-play table lives -- TODO confirm)
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)
    print(f'{ump_name}: {url}')

    soup = BeautifulSoup(html, 'html.parser')

    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise ValueError(f'No play_by_play table found: {url}')

    events = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Batter name of every row (None when a row has no batter cell)
    batter_names = []
    for event in events:
        batter_cell = event.find(attrs={"data-stat": "batter"})
        batter_names.append(None if batter_cell is None else batter_cell.get_text())

    # When consecutive rows belong to the same batter, keep only the last one.
    # Building a new list avoids popping from the list while iterating it,
    # which used to leave duplicates behind for runs of three or more rows.
    events = [
        event
        for idx, event in enumerate(events)
        if idx + 1 == len(batter_names)
        or batter_names[idx] is None
        or batter_names[idx] != batter_names[idx + 1]
    ]

    # Fresh record list for every game, so nothing carries over from the
    # previous game even when this game has no usable rows
    rows = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                # No recognised pitches: nothing to record for this row
                continue

            # These values are the same for every pitch of the row
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # inning is a 't'/'b' prefix followed by the inning number;
            # a 't' prefix is recorded as the home pitcher (1)
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:3])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                rows.append({
                    'pitch': pit,
                    'balls': ball_count,
                    'strikes': strike_count,
                    'count': f'{ball_count}-{strike_count}',
                    'batter': batter,
                    'pitcher': pitcher,
                    'umpire': ump_name,
                    'home_pitcher': pitcher_home,
                    'inning': inning_num,
                    'run_diff': run_diff,
                })

                if pit in strike_codes:
                    strike_count += 1
                elif pit == foul_code and strike_count < 2:
                    # A foul only adds a strike while there are fewer than two
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the row that could not be parsed and move on
            print(f'{exception}: {url}')

    # Build the game's DataFrame (columns are kept even when rows is empty)
    df = pd.DataFrame(rows, columns=column_names)

    df_list.append(df)

Tim Welke: https://www.baseball-reference.com/boxes/ARI/ARI201403220.shtml
Dale Scott: https://www.baseball-reference.com/boxes/ARI/ARI201403230.shtml
Fieldin Culbreth: https://www.baseball-reference.com/boxes/SDN/SDN201403300.shtml
Joe West: https://www.baseball-reference.com/boxes/ANA/ANA201403310.shtml
Tom Hallion: https://www.baseball-reference.com/boxes/ARI/ARI201403310.shtml
Dana DeMuth: https://www.baseball-reference.com/boxes/BAL/BAL201403310.shtml
Dale Scott: https://www.baseball-reference.com/boxes/CHA/CHA201403310.shtml
Gary Cederstrom: https://www.baseball-reference.com/boxes/CIN/CIN201403310.shtml
Jerry Meals: https://www.baseball-reference.com/boxes/DET/DET201403310.shtml
Jerry Layne: https://www.baseball-reference.com/boxes/MIA/MIA201403310.shtml
Ted Barrett: https://www.baseball-reference.com/boxes/MIL/MIL201403310.shtml
Tim Welke: https://www.baseball-reference.com/boxes/NYN/NYN201403310.shtml
Mike Winters: https://www.baseball-reference.com/boxes/OAK/OAK201403310.shtm

Quinn Wolcott: https://www.baseball-reference.com/boxes/KCA/KCA201404080.shtml
Tom Hallion: https://www.baseball-reference.com/boxes/LAN/LAN201404080.shtml
Todd Tichenor: https://www.baseball-reference.com/boxes/NYA/NYA201404080.shtml
Larry Vanover: https://www.baseball-reference.com/boxes/PHI/PHI201404080.shtml
Mike Muchlinski: https://www.baseball-reference.com/boxes/SEA/SEA201404080.shtml
Jim Reynolds: https://www.baseball-reference.com/boxes/SFN/SFN201404080.shtml
Tripp Gibson: https://www.baseball-reference.com/boxes/SLN/SLN201404080.shtml
Mike Estabrook: https://www.baseball-reference.com/boxes/TOR/TOR201404080.shtml
Jeff Kellogg: https://www.baseball-reference.com/boxes/WAS/WAS201404080.shtml
Mark Ripperger: https://www.baseball-reference.com/boxes/ATL/ATL201404090.shtml
Jordan Baker: https://www.baseball-reference.com/boxes/BOS/BOS201404090.shtml
Laz Diaz: https://www.baseball-reference.com/boxes/CHN/CHN201404090.shtml
Brian O'Nora: https://www.baseball-reference.com/boxes/CLE/

Clint Fagan: https://www.baseball-reference.com/boxes/MIA/MIA201404160.shtml
John Tumpane: https://www.baseball-reference.com/boxes/MIL/MIL201404160.shtml
Jim Reynolds: https://www.baseball-reference.com/boxes/NYA/NYA201404161.shtml
Fieldin Culbreth: https://www.baseball-reference.com/boxes/NYA/NYA201404162.shtml
Brian Gorman: https://www.baseball-reference.com/boxes/PHI/PHI201404160.shtml
Marcus Pattillo: https://www.baseball-reference.com/boxes/SDN/SDN201404160.shtml
Andy Fletcher: https://www.baseball-reference.com/boxes/SFN/SFN201404160.shtml
Ted Barrett: https://www.baseball-reference.com/boxes/TEX/TEX201404160.shtml
Jim Joyce: https://www.baseball-reference.com/boxes/CHA/CHA201404170.shtml
Lance Barrett: https://www.baseball-reference.com/boxes/DET/DET201404170.shtml
CB Bucknor: https://www.baseball-reference.com/boxes/HOU/HOU201404170.shtml
Chris Guccione: https://www.baseball-reference.com/boxes/MIN/MIN201404171.shtml
Pat Hoberg: https://www.baseball-reference.com/boxes/MIN/MIN

Mike Muchlinski: https://www.baseball-reference.com/boxes/TOR/TOR201404230.shtml
Paul Emmel: https://www.baseball-reference.com/boxes/WAS/WAS201404230.shtml
Phil Cuzzi: https://www.baseball-reference.com/boxes/BOS/BOS201404240.shtml
Brian O'Nora: https://www.baseball-reference.com/boxes/CHN/CHN201404240.shtml
Greg Gibson: https://www.baseball-reference.com/boxes/CLE/CLE201404240.shtml
Dan Iassogna: https://www.baseball-reference.com/boxes/DET/DET201404240.shtml
Toby Basner: https://www.baseball-reference.com/boxes/HOU/HOU201404240.shtml
Mike DiMuro: https://www.baseball-reference.com/boxes/LAN/LAN201404240.shtml
Alan Porter: https://www.baseball-reference.com/boxes/NYN/NYN201404240.shtml
Mark Ripperger: https://www.baseball-reference.com/boxes/PIT/PIT201404240.shtml
Ted Barrett: https://www.baseball-reference.com/boxes/TBA/TBA201404240.shtml
Mike Winters: https://www.baseball-reference.com/boxes/TOR/TOR201404240.shtml
Cory Blaser: https://www.baseball-reference.com/boxes/WAS/WAS2014042

Tim Welke: https://www.baseball-reference.com/boxes/KCA/KCA201405020.shtml
Dale Scott: https://www.baseball-reference.com/boxes/MIA/MIA201405020.shtml
Scott Barry: https://www.baseball-reference.com/boxes/MIN/MIN201405020.shtml
Brian O'Nora: https://www.baseball-reference.com/boxes/NYA/NYA201405020.shtml
Jim Reynolds: https://www.baseball-reference.com/boxes/PHI/PHI201405020.shtml
Vic Carapazza: https://www.baseball-reference.com/boxes/PIT/PIT201405020.shtml
Brian Gorman: https://www.baseball-reference.com/boxes/SDN/SDN201405020.shtml
Lance Barrett: https://www.baseball-reference.com/boxes/ANA/ANA201405030.shtml
Clint Fagan: https://www.baseball-reference.com/boxes/ATL/ATL201405030.shtml
D.J. Reyburn: https://www.baseball-reference.com/boxes/BOS/BOS201405030.shtml
Larry Vanover: https://www.baseball-reference.com/boxes/CHN/CHN201405030.shtml
Mike DiMuro: https://www.baseball-reference.com/boxes/CIN/CIN201405030.shtml
Chris Guccione: https://www.baseball-reference.com/boxes/CLE/CLE20140

Jeff Kellogg: https://www.baseball-reference.com/boxes/BAL/BAL201405100.shtml
Phil Cuzzi: https://www.baseball-reference.com/boxes/CHA/CHA201405100.shtml
Brian O'Nora: https://www.baseball-reference.com/boxes/CIN/CIN201405100.shtml
Joe West: https://www.baseball-reference.com/boxes/DET/DET201405100.shtml
Mark Carlson: https://www.baseball-reference.com/boxes/LAN/LAN201405100.shtml
Paul Emmel: https://www.baseball-reference.com/boxes/MIL/MIL201405100.shtml
Tim Timmons: https://www.baseball-reference.com/boxes/NYN/NYN201405100.shtml
Kerwin Danley: https://www.baseball-reference.com/boxes/OAK/OAK201405100.shtml
David Rackley: https://www.baseball-reference.com/boxes/PIT/PIT201405100.shtml
Lance Barrett: https://www.baseball-reference.com/boxes/SDN/SDN201405100.shtml
Scott Barry: https://www.baseball-reference.com/boxes/SEA/SEA201405100.shtml
John Tumpane: https://www.baseball-reference.com/boxes/TBA/TBA201405100.shtml
Bill Miller: https://www.baseball-reference.com/boxes/TEX/TEX201405100.

Bill Miller: https://www.baseball-reference.com/boxes/ANA/ANA201405180.shtml
Paul Schrieber: https://www.baseball-reference.com/boxes/ARI/ARI201405180.shtml
Scott Barry: https://www.baseball-reference.com/boxes/BOS/BOS201405180.shtml
David Rackley: https://www.baseball-reference.com/boxes/CHN/CHN201405180.shtml
Tom Woodring: https://www.baseball-reference.com/boxes/CLE/CLE201405180.shtml
Fieldin Culbreth: https://www.baseball-reference.com/boxes/COL/COL201405180.shtml
D.J. Reyburn: https://www.baseball-reference.com/boxes/HOU/HOU201405180.shtml
Cory Blaser: https://www.baseball-reference.com/boxes/KCA/KCA201405180.shtml
Adrian Johnson: https://www.baseball-reference.com/boxes/MIN/MIN201405180.shtml
Paul Nauert: https://www.baseball-reference.com/boxes/NYA/NYA201405181.shtml
James Hoye: https://www.baseball-reference.com/boxes/NYA/NYA201405182.shtml
Jerry Meals: https://www.baseball-reference.com/boxes/PHI/PHI201405180.shtml
Kerwin Danley: https://www.baseball-reference.com/boxes/SFN/SF

Paul Emmel: https://www.baseball-reference.com/boxes/SFN/SFN201405250.shtml
Angel Hernandez: https://www.baseball-reference.com/boxes/TBA/TBA201405250.shtml
Mark Carlson: https://www.baseball-reference.com/boxes/TOR/TOR201405250.shtml
Paul Nauert: https://www.baseball-reference.com/boxes/ARI/ARI201405260.shtml
John Tumpane: https://www.baseball-reference.com/boxes/ATL/ATL201405260.shtml
Ron Kulpa: https://www.baseball-reference.com/boxes/CHA/CHA201405260.shtml
Gary Cederstrom: https://www.baseball-reference.com/boxes/KCA/KCA201405260.shtml
Quinn Wolcott: https://www.baseball-reference.com/boxes/LAN/LAN201405260.shtml
Tom Woodring: https://www.baseball-reference.com/boxes/MIL/MIL201405260.shtml
Mike DiMuro: https://www.baseball-reference.com/boxes/MIN/MIN201405260.shtml
Laz Diaz: https://www.baseball-reference.com/boxes/NYN/NYN201405260.shtml
Jordan Baker: https://www.baseball-reference.com/boxes/OAK/OAK201405260.shtml
Jim Wolf: https://www.baseball-reference.com/boxes/PHI/PHI201405260.

David Rackley: https://www.baseball-reference.com/boxes/NYA/NYA201406020.shtml
Pat Hoberg: https://www.baseball-reference.com/boxes/PHI/PHI201406020.shtml
Seth Buckminster: https://www.baseball-reference.com/boxes/SDN/SDN201406020.shtml
Dan Iassogna: https://www.baseball-reference.com/boxes/SLN/SLN201406020.shtml
Paul Schrieber: https://www.baseball-reference.com/boxes/ATL/ATL201406030.shtml
Tim Welke: https://www.baseball-reference.com/boxes/CHN/CHN201406030.shtml
Angel Hernandez: https://www.baseball-reference.com/boxes/CIN/CIN201406030.shtml
Gerry Davis: https://www.baseball-reference.com/boxes/CLE/CLE201406030.shtml
Jerry Layne: https://www.baseball-reference.com/boxes/COL/COL201406030.shtml
Tony Randazzo: https://www.baseball-reference.com/boxes/DET/DET201406030.shtml
Paul Emmel: https://www.baseball-reference.com/boxes/HOU/HOU201406030.shtml
Gary Cederstrom: https://www.baseball-reference.com/boxes/LAN/LAN201406030.shtml
Vic Carapazza: https://www.baseball-reference.com/boxes/MIA

ConnectionError: HTTPSConnectionPool(host='www.baseball-reference.com', port=443): Max retries exceeded with url: /boxes/SDN/SDN201406060.shtml (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11dc34d68>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [6]:
# This is meant to be run as a cell
# Set the list of the games to get here
game_list = games_2014

# Seconds to wait on a stalled connection before giving up on a request
request_timeout = 30

# Pitch-sequence letters that advance the count
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'
ball_codes = ['B', 'I', 'P', 'V']

# Column order of every per-game DataFrame
column_names = ['pitch', 'balls', 'strikes', 'count', 'batter', 'pitcher',
                'umpire', 'home_pitcher', 'inning', 'run_diff']

# Here is the big for loop which populates df_list with one DataFrame per game.
# It starts at len(df_list), so re-running the cell after a failure resumes
# with the first game that has not been stored yet.
for i in range(len(df_list), len(game_list)):
    # Name the current game explicitly; the error message below needs it
    url = game_list[i]

    # Load the html from the game page; raise on timeouts and HTTP errors
    # instead of parsing an error page
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    # Drop the HTML comment markers so markup wrapped in comments is parsed too
    # (presumably this is where the play-by-play table lives -- TODO confirm)
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    soup = BeautifulSoup(html, 'html.parser')

    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise ValueError(f'No play_by_play table found: {url}')

    events = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Batter name of every row (None when a row has no batter cell)
    batter_names = []
    for event in events:
        batter_cell = event.find(attrs={"data-stat": "batter"})
        batter_names.append(None if batter_cell is None else batter_cell.get_text())

    # When consecutive rows belong to the same batter, keep only the last one.
    # Building a new list avoids popping from the list while iterating it,
    # which used to leave duplicates behind for runs of three or more rows.
    events = [
        event
        for idx, event in enumerate(events)
        if idx + 1 == len(batter_names)
        or batter_names[idx] is None
        or batter_names[idx] != batter_names[idx + 1]
    ]

    # Fresh record list for every game, so nothing carries over from the
    # previous game even when this game has no usable rows
    rows = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                # No recognised pitches: nothing to record for this row
                continue

            # These values are the same for every pitch of the row
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # inning is a 't'/'b' prefix followed by the inning number;
            # a 't' prefix is recorded as the home pitcher (1)
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:3])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                rows.append({
                    'pitch': pit,
                    'balls': ball_count,
                    'strikes': strike_count,
                    'count': f'{ball_count}-{strike_count}',
                    'batter': batter,
                    'pitcher': pitcher,
                    'umpire': ump_name,
                    'home_pitcher': pitcher_home,
                    'inning': inning_num,
                    'run_diff': run_diff,
                })

                if pit in strike_codes:
                    strike_count += 1
                elif pit == foul_code and strike_count < 2:
                    # A foul only adds a strike while there are fewer than two
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the row that could not be parsed and move on
            print(f'{exception}: {url}')

    # Build the game's DataFrame (columns are kept even when rows is empty)
    df = pd.DataFrame(rows, columns=column_names)

    df_list.append(df)

In [9]:
# Stack the per-game frames into one frame for the whole season.
# Each game's own row index is kept, so index labels repeat across games.
pitches_df = pd.concat(df_list)
pitches_df

Unnamed: 0,pitch,balls,strikes,count,batter,pitcher,umpire,home_pitcher,inning,run_diff
0,C,0,0,0-0,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
1,F,0,1,0-1,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
2,S,0,2,0-2,Yasiel Puig,Wade Miley,Tim Welke,1,1,0
3,B,0,0,0-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
4,B,1,0,1-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
5,F,2,0,2-0,Justin Turner,Wade Miley,Tim Welke,1,1,0
6,S,2,1,2-1,Justin Turner,Wade Miley,Tim Welke,1,1,0
7,C,2,2,2-2,Justin Turner,Wade Miley,Tim Welke,1,1,0
8,S,0,0,0-0,Hanley Ramirez,Wade Miley,Tim Welke,1,1,0
9,B,0,1,0-1,Hanley Ramirez,Wade Miley,Tim Welke,1,1,0


In [7]:
# Number of per-game frames collected so far (the loop appends one per game)
len(df_list)

2462

In [8]:
# Number of game URLs found for 2014; the scrape is complete when len(df_list) matches it
len(games_2014)

2462

In [10]:
# Save the 2014 pitches; the (repeating) row index is written as the first, unnamed column
pitches_df.to_csv("games_2014.csv")

In [11]:
# Collect the game URLs for the 2013 season
games_2013_url = 'https://www.baseball-reference.com/leagues/MLB/2013-schedule.shtml'
games_2013 = get_games(games_2013_url)

# Start from an empty list: the scraping loop resumes at len(df_list),
# so frames left over from another season must not remain in it
df_list = []

In [20]:
# This is meant to be run as a cell
# Set the list of the games to get here
game_list = games_2013

# Seconds to wait on a stalled connection before giving up on a request
request_timeout = 30

# Pitch-sequence letters that advance the count
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'
ball_codes = ['B', 'I', 'P', 'V']

# Column order of every per-game DataFrame
column_names = ['pitch', 'balls', 'strikes', 'count', 'batter', 'pitcher',
                'umpire', 'home_pitcher', 'inning', 'run_diff']

# Here is the big for loop which populates df_list with one DataFrame per game.
# It starts at len(df_list), so re-running the cell after a failure resumes
# with the first game that has not been stored yet.
for i in range(len(df_list), len(game_list)):
    # Name the current game explicitly; the error message below needs it
    url = game_list[i]

    # Load the html from the game page; raise on timeouts and HTTP errors
    # instead of parsing an error page
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    # Drop the HTML comment markers so markup wrapped in comments is parsed too
    # (presumably this is where the play-by-play table lives -- TODO confirm)
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    soup = BeautifulSoup(html, 'html.parser')

    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise ValueError(f'No play_by_play table found: {url}')

    events = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Batter name of every row (None when a row has no batter cell)
    batter_names = []
    for event in events:
        batter_cell = event.find(attrs={"data-stat": "batter"})
        batter_names.append(None if batter_cell is None else batter_cell.get_text())

    # When consecutive rows belong to the same batter, keep only the last one.
    # Building a new list avoids popping from the list while iterating it,
    # which used to leave duplicates behind for runs of three or more rows.
    events = [
        event
        for idx, event in enumerate(events)
        if idx + 1 == len(batter_names)
        or batter_names[idx] is None
        or batter_names[idx] != batter_names[idx + 1]
    ]

    # Fresh record list for every game, so nothing carries over from the
    # previous game even when this game has no usable rows
    rows = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                # No recognised pitches: nothing to record for this row
                continue

            # These values are the same for every pitch of the row
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # inning is a 't'/'b' prefix followed by the inning number;
            # a 't' prefix is recorded as the home pitcher (1)
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:3])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                rows.append({
                    'pitch': pit,
                    'balls': ball_count,
                    'strikes': strike_count,
                    'count': f'{ball_count}-{strike_count}',
                    'batter': batter,
                    'pitcher': pitcher,
                    'umpire': ump_name,
                    'home_pitcher': pitcher_home,
                    'inning': inning_num,
                    'run_diff': run_diff,
                })

                if pit in strike_codes:
                    strike_count += 1
                elif pit == foul_code and strike_count < 2:
                    # A foul only adds a strike while there are fewer than two
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the row that could not be parsed and move on
            print(f'{exception}: {url}')

    # Build the game's DataFrame (columns are kept even when rows is empty)
    df = pd.DataFrame(rows, columns=column_names)

    df_list.append(df)

In [21]:
# Number of per-game frames collected so far for 2013 (the loop appends one per game)
len(df_list)

2469

In [22]:
# Stack the 2013 per-game frames into one frame (this rebinds pitches_df);
# each game's own row index is kept, so index labels repeat across games
pitches_df = pd.concat(df_list)

In [23]:
# Save the 2013 pitches; the (repeating) row index is written as the first, unnamed column
pitches_df.to_csv("games_2013.csv")

In [25]:
# Collect the game URLs for the 2015 season
games_2015_url = 'https://www.baseball-reference.com/leagues/MLB/2015-schedule.shtml'
games_2015 = get_games(games_2015_url)

# Start from an empty list: the scraping loop resumes at len(df_list),
# so frames left over from another season must not remain in it
df_list = []

In [38]:
# This is meant to be run as a cell
# Set the list of the games to get here
game_list = games_2015

# Seconds to wait on a stalled connection before giving up on a request
request_timeout = 30

# Pitch-sequence letters that advance the count
strike_codes = ['C', 'S', 'T', 'K', 'L', 'M', 'O', 'Q', 'R']
foul_code = 'F'
ball_codes = ['B', 'I', 'P', 'V']

# Column order of every per-game DataFrame
column_names = ['pitch', 'balls', 'strikes', 'count', 'batter', 'pitcher',
                'umpire', 'home_pitcher', 'inning', 'run_diff']

# Here is the big for loop which populates df_list with one DataFrame per game.
# It starts at len(df_list), so re-running the cell after a failure resumes
# with the first game that has not been stored yet.
for i in range(len(df_list), len(game_list)):
    # Name the current game explicitly; the error message below needs it
    url = game_list[i]

    # Load the html from the game page; raise on timeouts and HTTP errors
    # instead of parsing an error page
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    # Drop the HTML comment markers so markup wrapped in comments is parsed too
    # (presumably this is where the play-by-play table lives -- TODO confirm)
    html = response.text.replace('<!--', '').replace('-->', '')

    # Find the umpire name for the game
    ump_name = get_ump(html)

    soup = BeautifulSoup(html, 'html.parser')

    pbp_table = soup.find('table', id='play_by_play')
    if pbp_table is None:
        raise ValueError(f'No play_by_play table found: {url}')

    events = pbp_table.find_all('tr', {'class': ['top_inning', 'bottom_inning']})

    # Batter name of every row (None when a row has no batter cell)
    batter_names = []
    for event in events:
        batter_cell = event.find(attrs={"data-stat": "batter"})
        batter_names.append(None if batter_cell is None else batter_cell.get_text())

    # When consecutive rows belong to the same batter, keep only the last one.
    # Building a new list avoids popping from the list while iterating it,
    # which used to leave duplicates behind for runs of three or more rows.
    events = [
        event
        for idx, event in enumerate(events)
        if idx + 1 == len(batter_names)
        or batter_names[idx] is None
        or batter_names[idx] != batter_names[idx + 1]
    ]

    # Fresh record list for every game, so nothing carries over from the
    # previous game even when this game has no usable rows
    rows = []

    for e in events:
        try:
            pit_seq = e.find('span', 'pitch_sequence').get_text()
            pit_seq = re.sub('[^CSFBXTKIHLMOPQRVY]', '', pit_seq)
            if not pit_seq:
                # No recognised pitches: nothing to record for this row
                continue

            # These values are the same for every pitch of the row
            batter = e.find(attrs={"data-stat": "batter"}).get_text()
            pitcher = e.find(attrs={"data-stat": "pitcher"}).get_text()
            inning = e.find(attrs={"data-stat": "inning"}).get_text()
            score = e.find(attrs={"data-stat": "score_batting_team"}).get_text()

            bat_score, pit_score = score.split('-', 1)
            run_diff = int(pit_score) - int(bat_score)

            # inning is a 't'/'b' prefix followed by the inning number;
            # a 't' prefix is recorded as the home pitcher (1)
            pitcher_home = 1 if inning[0] == 't' else 0
            inning_num = int(inning[1:3])

            ball_count = 0
            strike_count = 0

            for pit in pit_seq:
                # Record the count as it stood BEFORE this pitch
                rows.append({
                    'pitch': pit,
                    'balls': ball_count,
                    'strikes': strike_count,
                    'count': f'{ball_count}-{strike_count}',
                    'batter': batter,
                    'pitcher': pitcher,
                    'umpire': ump_name,
                    'home_pitcher': pitcher_home,
                    'inning': inning_num,
                    'run_diff': run_diff,
                })

                if pit in strike_codes:
                    strike_count += 1
                elif pit == foul_code and strike_count < 2:
                    # A foul only adds a strike while there are fewer than two
                    strike_count += 1
                elif pit in ball_codes:
                    ball_count += 1

        except Exception as exception:
            # Best effort: report the row that could not be parsed and move on
            print(f'{exception}: {url}')

    # Build the game's DataFrame (columns are kept even when rows is empty)
    df = pd.DataFrame(rows, columns=column_names)

    df_list.append(df)

In [41]:
# Stack the 2015 per-game frames into one frame (this rebinds pitches_df);
# each game's own row index is kept, so index labels repeat across games
pitches_df = pd.concat(df_list)

In [42]:
# Save the 2015 pitches; the (repeating) row index is written as the first, unnamed column
pitches_df.to_csv("games_2015.csv")