# Scraping NFL Play-by-Play Data from ESPN

This notebook scrapes every play from every NFL game played since 2003 and packages the plays into one CSV file per season.

Written by Connor Cozad<br>
March 6, 2022

### Imports

In [19]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import datetime
import random
pd.set_option('display.max_rows', None)

### Get List of All Teams

We need a list of all teams so that we can find their schedules from each season. We can scrape all of the Game IDs from every team's schedule for every season. The Game IDs will then be used to access the webpages with the play-by-play from those games.

In [2]:
# Download and parse one team's schedule page (team 'wsh', season 2021).
# The parsed page (`soup`) is what the next cell reads the team list from.
url = 'https://www.espn.com/nfl/team/schedule/_/name/wsh/season/2021'
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

In [3]:
# Read the team abbreviations from the 'data-param-value' attribute of the
# options in the page's first <select>, skipping the first option.
# NOTE(review): the printed list has 31 entries and no 'wsh' -- presumably the
# skipped first option is the team whose page was fetched. Its games should
# still appear on its opponents' schedules; confirm.
teams = [
    option['data-param-value']
    for option in soup.find('select').find_all('option')[1:]
]
print(teams)

['ari', 'atl', 'bal', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gb', 'hou', 'ind', 'jax', 'kc', 'lv', 'lac', 'lar', 'mia', 'min', 'ne', 'no', 'nyg', 'nyj', 'phi', 'pit', 'sf', 'sea', 'tb', 'ten']


### Get List of All Game IDs

We use the list of all of the game IDs to access the webpages containing the play-by-play data from those games.

In [4]:
def get_game_ids(year):
    """Collect the ESPN game IDs found on every team's schedule page for a season.

    For each abbreviation in the module-level `teams` list, the team's
    schedule page for `year` is downloaded and every ``<span class="ml4">``
    that contains a link is inspected; the last path segment of the link's
    ``href`` is taken as the game ID.

    Parameters
    ----------
    year : int or str
        Season, interpolated into the schedule URL.

    Returns
    -------
    list of str
        The unique game IDs, sorted so that the order is the same on every
        run (the order of a plain ``set`` of strings can change between
        interpreter sessions).
    """
    # A set removes duplicates: each game appears on both teams' schedules.
    game_ids = set()
    for team in teams:
        url = f'https://www.espn.com/nfl/team/schedule/_/name/{team}/season/{year}'
        req = requests.get(url)
        soup = BeautifulSoup(req.text, "html.parser")

        for game in soup.find_all('span', class_='ml4'):
            link = game.find('a')
            # Skip spans that carry no usable link instead of failing on None['href'].
            if link is None or not link.get('href'):
                continue
            game_ids.add(link['href'].split('/')[-1])

    return sorted(game_ids)

### Get the Play-by-Play Data

We get the play-by-play data from every game in a particular season and export it as a CSV file.

In [5]:
def append_play(dictionary, game_id=np.nan, is_postseason=np.nan, home_team=np.nan, home_team_score=np.nan,
                away_team=np.nan, away_team_score=np.nan, quarter=np.nan, game_clock=np.nan, possession=np.nan,
                down=np.nan, ball_placement=np.nan, play_description=np.nan):
    """Append one play to a column-oriented dictionary of lists.

    `dictionary` must already contain a list under each of the twelve column
    names used below. Every argument that is not supplied is recorded as NaN.
    The dictionary is modified in place and also returned.
    """
    # Column name -> value for this play, in the order the columns are filled.
    row = {
        'Game ID': game_id,
        'Postseason': is_postseason,
        'Home Team': home_team,
        'Home Team Score': home_team_score,
        'Away Team': away_team,
        'Away Team Score': away_team_score,
        'Quarter': quarter,
        'Clock': game_clock,
        'Possession': possession,
        'Down': down,
        'Ball Placement': ball_placement,
        'Description': play_description,
    }
    for column, value in row.items():
        dictionary[column].append(value)

    return dictionary

In [6]:
def scrape_game(game_id):
    """Scrape the ESPN play-by-play page of one game into a DataFrame.

    Parameters
    ----------
    game_id : str
        ESPN game ID, interpolated into the play-by-play URL.

    Returns
    -------
    pandas.DataFrame
        One row per parsed play with the columns 'Game ID', 'Postseason',
        'Home Team', 'Home Team Score', 'Away Team', 'Away Team Score',
        'Quarter', 'Clock', 'Possession', 'Down', 'Ball Placement' and
        'Description'. The score columns of a play hold the score read from
        the header of the previous drive (0 before the first drive). A last
        row with the description 'Final' carries the final score.

    Raises
    ------
    AttributeError
        If an expected element (team header, score, play text, ...) is
        missing from the page. The scraping loop in this notebook interprets
        that as rate limiting.
    """
    url = f"https://www.espn.com/nfl/playbyplay/_/gameId/{game_id}"
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")

    home_header = soup.find('div', class_='team home')
    away_header = soup.find('div', class_='team away')
    home_team = home_header.find('span', class_='abbrev').text
    home_final_score = home_header.find('div', class_='score').text
    away_team = away_header.find('span', class_='abbrev').text
    away_final_score = away_header.find('div', class_='score').text
    # True when the page contains a div with class 'game-details header'.
    is_postseason = bool(soup.find('div', class_='game-details header'))

    parsed_plays = {
        'Game ID':[], 'Postseason':[], 'Home Team':[], 'Home Team Score':[], 'Away Team':[], 'Away Team Score':[],
        'Quarter':[], 'Clock':[], 'Possession':[], 'Down':[], 'Ball Placement':[], 'Description':[]
    }

    prev_drive_home_score = 0
    prev_drive_away_score = 0

    drives = soup.find_all('li', class_='accordion-item')

    for drive in drives:
        if 'half-time' in drive['class']:
            continue

        # The possessing team is taken from the file name of the drive's logo image.
        try:
            logo_src = drive.find('img', class_='team-logo')['src']
        except (TypeError, KeyError):
            # No logo image (find returned None) or no 'src' attribute: skip this item.
            continue
        possession = logo_src.split('/')[-1].split('.')[0].upper()

        for play in drive.find_all('li'):
            if 'end-quarter' in play['class'] or 'half-time' in play['class']:
                continue

            # Heading format: "<down & distance> at <ball placement>".
            try:
                down_and_yrds_to_go, ball_placement = play.find('h3').text.split(' at ')
            except ValueError:
                down_and_yrds_to_go, ball_placement = np.nan, np.nan

            # Play text format: "(<clock> - <quarter>) <description>". The
            # description may itself contain ') ', so only the first piece is
            # the timestamp and the rest is glued back together.
            timestamp, *description_parts = play.find('span').text.strip().split(') ')
            play_description = ') '.join(description_parts).strip()

            # Skip entries whose leading piece is not "<clock> - <quarter>"
            # rather than letting one odd entry abort the whole game.
            try:
                game_clock, quarter = timestamp.split(' - ')
            except ValueError:
                continue
            game_clock = game_clock[1:]  # drop the opening parenthesis
            quarter = quarter[0]         # keep only the first character of the quarter label

            parsed_plays = append_play(parsed_plays, game_id, is_postseason, home_team, prev_drive_home_score,
                                       away_team, prev_drive_away_score, quarter, game_clock, possession,
                                       down_and_yrds_to_go, ball_placement, play_description)

        # NOTE(review): the home score is read from the span with class 'away'
        # and the away score from the span with class 'home'. Kept exactly as
        # in the original; confirm against the page markup that this swap is
        # intentional.
        prev_drive_home_score = drive.find('span', class_='away').find('span', class_='team-score').text
        prev_drive_away_score = drive.find('span', class_='home').find('span', class_='team-score').text

    # Closing row with the final score; possession, down and ball placement stay NaN.
    parsed_plays = append_play(parsed_plays, game_id, is_postseason, home_team, home_final_score, away_team,
                               away_final_score, quarter='4', game_clock='0:00', play_description='Final')

    return pd.DataFrame(parsed_plays)

In [7]:
# Spot-check the scraper on a single game and display its last rows.
scrape_game('401326634').tail()

Unnamed: 0,Game ID,Postseason,Home Team,Home Team Score,Away Team,Away Team Score,Quarter,Clock,Possession,Down,Ball Placement,Description
179,401326634,True,TB,27,LAR,27,4,0:35,LAR,2nd & 11,LAR 24,(Shotgun) M.Stafford pass deep left to C.Kupp ...
180,401326634,True,TB,27,LAR,27,4,0:27,LAR,1st & 10,LAR 44,(Shotgun) M.Stafford pass deep middle to C.Kup...
181,401326634,True,TB,27,LAR,27,4,0:06,LAR,1st & 10,TB 12,(No Huddle) M.Stafford spiked the ball to stop...
182,401326634,True,TB,27,LAR,27,4,0:00,LAR,2nd & 10,TB 12,Matt Gay 30 Yd Field Goal
183,401326634,True,TB,27,LAR,30,4,0:00,,,,Final


In [8]:
def print_progress(progress, total):
    """Build a 30-character text progress bar.

    Despite its name, this function does not print; it returns the string.

    Parameters
    ----------
    progress : int
        Number of items completed.
    total : int
        Number of items overall.

    Returns
    -------
    str
        ``'[' + bar + '] progress/total'`` where the bar is always exactly 30
        characters: '=' for the completed share, spaces for the rest. A
        non-positive `total` yields an empty bar instead of a division
        error, and the filled part is clamped to the bar width.
    """
    bar_width = 30
    if total > 0:
        filled = int((progress / total) * bar_width)
        filled = max(0, min(bar_width, filled))
    else:
        filled = 0
    return '[' + '=' * filled + ' ' * (bar_width - filled) + f'] {progress}/{total}'

In [16]:
def random_request():
    """Occasionally request an unrelated ESPN team page; responses are discarded.

    One team is picked at random from the module-level `teams` list. Two
    independent draws from 0..100 then decide, each when the draw is below 5,
    whether that team's schedule page and/or stats page is requested.
    """
    team_index = random.randint(0, len(teams)-1)
    team = teams[team_index]
    if random.randint(0, 100) < 5:
        requests.get(f'https://www.espn.com/nfl/team/schedule/_/name/{team}/')
    if random.randint(0, 100) < 5:
        requests.get(f'https://www.espn.com/nfl/team/stats/_/name/{team}')

In [25]:
# Seasons to scrape. Left empty, as in the original cell, so that running the
# notebook does not start scraping; fill in the years you want, e.g. [2021].
SEASONS_TO_SCRAPE = []
# How often one game is attempted before it is given up on. Bounded so that a
# page that is permanently missing elements cannot stall the run forever.
MAX_ATTEMPTS_PER_GAME = 5

for year in SEASONS_TO_SCRAPE:
    game_ids = get_game_ids(year)
    total = len(game_ids)
    all_plays = []
    for game_number, game_id in enumerate(game_ids, start=1):
        # Retry the SAME game after a suspected rate limit. Decrementing the
        # index of a `for ... in range(...)` loop does not do that, because
        # the loop variable is simply reassigned on the next iteration.
        for attempt in range(1, MAX_ATTEMPTS_PER_GAME + 1):
            try:
                all_plays.append(scrape_game(game_id))
                break
            except AttributeError:
                # Expected page elements were missing, which this notebook
                # attributes to rate limiting.
                if attempt == MAX_ATTEMPTS_PER_GAME:
                    print(f'Error with {game_id}. Game skipped.')
                    break
                print('Error due to rate limiting. Waiting to resume.')
                for _ in range(20):
                    time.sleep(random.randint(0, 20))
                    random_request()
            except Exception:
                # Any other failure: skip this game, but let KeyboardInterrupt
                # through so the run can still be stopped by hand.
                print(f'Error with {game_id}. Game skipped.')
                break
        print(f'{year} {print_progress(game_number, total)}', end='\r')
    if all_plays:
        all_plays = pd.concat(all_plays)
        all_plays.to_csv(f'Final Project/Play by Play CSVs/{year}.csv', index=False)
    else:
        # pd.concat raises on an empty list, so report instead of crashing.
        print(f'No plays scraped for {year}; no CSV written.')
    print()


