In [29]:
# Import pandas for dataframe work, requests and BeatifulSoup for scraping the data, and time to space out requests
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [30]:
# Creating initial data frame to hold team_name, whether they made the playoffs, their bye week, and the year
afc_teams = pd.DataFrame(columns=['team_name', 'playoffs', 'bye_week', 'year'])

# Looping through 2016 up to 2021
for year in range(2016, 2025, 1):

  # Printing the year
  print(f'Getting data for {year}')

  # Grabbing the AFC table
  url = f'https://www.pro-football-reference.com/years/{year}/'
  data = requests.get(url)
  soup = BeautifulSoup(data.text)
  table = soup.select('table#AFC')[0]
  # Taking a 4 second break
  time.sleep(4)

  # Finding each team_link and creating links to get team data
  links = [l.get("href") for l in table.find_all('a')]
  team_urls = [f"https://www.pro-football-reference.com{l}" for l in links]

  # Looping through each team in team_urls
  for team in team_urls:

    # Getting the games table for the team
    df = pd.read_html(team, attrs={'id':'games'})[0]

    # Extracting the bye week value for the team
    bye = df[df.iloc[:,9] == 'Bye Week']
    bye_week = bye.iloc[:,0].values

    # Finding out whether the team made the playoffs
    play = df[df.iloc[:,2] == 'Playoffs']
    playoffs = len(play) == 1

    # Extracting the team_name from the team link
    team_name = team.split('/')[-2].split('.')[0]

    # Updating the afc_teams data frame
    afc_teams.loc[len(afc_teams.index)] = [team_name, playoffs, bye_week, year]
    # Taking a 4 second break
    time.sleep(4)

  # Print that the data for year has been retrieved.
  print(f'Data for {year} retrieved')
  time.sleep(5)
  print()

Getting data for 2016
Data for 2016 retrieved

Getting data for 2017
Data for 2017 retrieved

Getting data for 2018
Data for 2018 retrieved

Getting data for 2019
Data for 2019 retrieved

Getting data for 2020
Data for 2020 retrieved

Getting data for 2021
Data for 2021 retrieved

Getting data for 2022
Data for 2022 retrieved

Getting data for 2023
Data for 2023 retrieved

Getting data for 2024
Data for 2024 retrieved



In [31]:
# Viewing the first 5 rows of afc_teams
afc_teams.head()

Unnamed: 0,team_name,playoffs,bye_week,year
0,nwe,True,[9],2016
1,mia,True,[8],2016
2,buf,False,[10],2016
3,nyj,False,[11],2016
4,pit,True,[8],2016


In [32]:
# Below code removes the brackets from each bye_week value
# - [9] becomes 9, [3] becomes 3, etc
afc_teams['bye_week'] = afc_teams['bye_week'].apply(lambda x: x[0] if len(x) > 0 else None)

# Verifying the code worked properly
afc_teams.head()

Unnamed: 0,team_name,playoffs,bye_week,year
0,nwe,True,9,2016
1,mia,True,8,2016
2,buf,False,10,2016
3,nyj,False,11,2016
4,pit,True,8,2016


In [33]:
# Creating nfc_teams data frame to hold the team name, whether they made the playoffs, their bye week, and the year
nfc_teams = pd.DataFrame(columns=['team_name', 'playoffs', 'bye_week', 'year'])

# Looping through 2016 up to 2021
for year in range(2016, 2025, 1):

  # Printing the year
  print(f'Getting data for {year}')

  # Grabbing the nfc table
  url = f'https://www.pro-football-reference.com/years/{year}/'
  data = requests.get(url)
  soup = BeautifulSoup(data.text)
  table = soup.select('table#NFC')[0]
  # Taking a 4 second break
  time.sleep(4)

  # Finding each team_link and creating links to get team data
  links = [l.get("href") for l in table.find_all('a')]
  team_urls = [f"https://www.pro-football-reference.com{l}" for l in links]

  # Looping through each team in team_urls
  for team in team_urls:

    # Getting the games table for the team
    df = pd.read_html(team, attrs={'id':'games'})[0]

    # Extracting the bye week value for the team
    bye = df[df.iloc[:,9] == 'Bye Week']
    bye_week = bye.iloc[:,0].values

    # Finding out whether the team made the playoffs
    play = df[df.iloc[:,2] == 'Playoffs']
    playoffs = len(play) == 1

    # Extracting the team_name from the team link
    team_name = team.split('/')[-2].split('.')[0]
    nfc_teams.loc[len(nfc_teams.index)] = [team_name, playoffs, bye_week, year]
    # Taking a 4 second break
    time.sleep(4)

  # Print that the data for year has been retrieved
  print(f'Data for {year} complete')
  time.sleep(5)
  print()

Getting data for 2016
Data for 2016 complete

Getting data for 2017
Data for 2017 complete

Getting data for 2018
Data for 2018 complete

Getting data for 2019
Data for 2019 complete

Getting data for 2020
Data for 2020 complete

Getting data for 2021
Data for 2021 complete

Getting data for 2022
Data for 2022 complete

Getting data for 2023
Data for 2023 complete

Getting data for 2024
Data for 2024 complete



In [34]:
# Below code removes the brackets from each bye_week value
nfc_teams['bye_week'] = nfc_teams['bye_week'].apply(lambda x: x[0] if len(x) > 0 else None)
# Verifying the code worked properly
nfc_teams.head()

Unnamed: 0,team_name,playoffs,bye_week,year
0,dal,True,7,2016
1,nyg,True,8,2016
2,was,False,9,2016
3,phi,False,4,2016
4,gnb,True,4,2016


In [37]:
# Concating afc_teams and nfc_teams
all_data = pd.concat([afc_teams, nfc_teams])
all_data

Unnamed: 0,team_name,playoffs,bye_week,year
0,nwe,True,9,2016
1,mia,True,8,2016
2,buf,False,10,2016
3,nyj,False,11,2016
4,pit,True,8,2016
...,...,...,...,...
139,nor,False,12,2024
140,ram,True,6,2024
141,sea,False,10,2024
142,crd,False,11,2024


In [39]:
# Saving data with no index column
all_data.to_csv('all_data.csv', index=False)