In [10]:
import requests
from bs4 import BeautifulSoup, Comment

In [11]:
# Maps the raw position strings found in the roster table to the coarser
# position groups used for aggregation. Each raw key appears exactly once
# (a dict literal silently keeps only the last value of a repeated key).
POSITION_MAP = {
    # Quarterbacks
    'QB': 'QB',
    # Running backs (fullbacks are counted with them)
    'RB': 'RB', 'FB': 'RB',
    # Receivers and tight ends
    'WR': 'WR', 'TE': 'TE',
    # Offensive line
    'OT': 'OL', 'OG': 'OL', 'C': 'OL', 'OL': 'OL', 'G': 'OL', 'T': 'OL',
    'LT': 'OL', 'RT': 'OL', 'RG': 'OL', 'LG': 'OL',
    # Defensive ends / edge
    'DE': 'DE', 'RDE': 'DE', 'LDE': 'DE', 'EDGE': 'DE',
    # Interior defensive line
    'DL': 'DL', 'DT': 'DL', 'NT': 'DL', 'RDT': 'DL', 'LDT': 'DL',
    # Linebackers
    'OLB': 'LB', 'ILB': 'LB', 'LB': 'LB', 'MLB': 'LB', 'RILB': 'LB', 'LILB': 'LB',
    'ROLB': 'LB', 'LOLB': 'LB', 'RLB': 'LB', 'LLB': 'LB',
    # Defensive backs
    'CB': 'DB', 'S': 'DB', 'FS': 'DB', 'SS': 'DB', 'DB': 'DB', 'LCB': 'DB',
    'RCB': 'DB', 'SAF': 'DB',
    # Kicker, punter and long snapper are grouped as 'ST'
    'K': 'ST', 'P': 'ST', 'LS': 'ST',
}

In [48]:
def get_all_team_codes():
    """Return the team codes linked from the ``teams_active`` table.

    Downloads the Pro-Football-Reference ``/teams/`` index page and, for every
    row whose ``team_name`` header cell contains a link, collects the third
    path segment of that link (``"/teams/kan/"`` -> ``"kan"``).

    Returns:
        list[str]: team codes in the order the rows appear in the table.

    Raises:
        requests.HTTPError: if the site answers with an error status
            (e.g. 429 Too Many Requests).
        requests.RequestException: on connection problems or timeout.
        RuntimeError: if the page contains no ``teams_active`` table.
    """
    url = 'https://www.pro-football-reference.com/teams/'
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', {'id': 'teams_active'})
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"No 'teams_active' table found at {url}")

    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = table.tbody if table.tbody is not None else table

    team_codes = []
    for row in body.find_all('tr'):
        team_cell = row.find('th', {'data-stat': 'team_name'})
        link = team_cell.a if team_cell else None
        if link is None:
            continue
        # e.g. "/teams/kan/" -> ['', 'teams', 'kan', '']
        parts = link.get('href', '').split('/')
        if len(parts) > 2 and parts[2]:
            team_codes.append(parts[2])
    return team_codes

# Run the scraper once (this performs a live HTTP request) and show the result.
teams = get_all_team_codes()
print(f"Found teams: {teams}")

HTTPError: 429 Client Error: Too Many Requests for url: https://www.pro-football-reference.com/teams/

In [52]:
import time

# Request headers sent with every roster-page download: a desktop Chrome
# User-Agent string and an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

def _find_roster_table(soup):
    """Return the ``<table id="roster">`` element in *soup*, or None if absent."""
    table = soup.find('table', {'id': 'roster'})
    if table:
        return table
    # The table may be wrapped in an HTML comment; parse matching comments
    # as HTML of their own and look again.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'table' in comment and 'id="roster"' in comment:
            table = BeautifulSoup(comment, 'html.parser').find('table', {'id': 'roster'})
            if table:
                return table
    return None


def scrape_team_av(team_code, year):
    """Sum the ``av`` column of a team's roster page per position group.

    Downloads ``/teams/{team_code}/{year}_roster.htm``, reads the ``pos`` and
    ``av`` cells of every data row in the ``roster`` table, normalises the
    position through ``POSITION_MAP`` and adds up the AV values per group.
    (AV is presumably Pro-Football-Reference's "Approximate Value" -- confirm.)

    Args:
        team_code: team code used in the URL, e.g. ``'kan'``.
        year: season year used in the URL, e.g. ``2024``.

    Returns:
        dict[str, int] | None: position group -> summed AV, or ``None`` when
        the page could not be fetched or contains no roster table. Rows whose
        position is not in ``POSITION_MAP`` are skipped; a non-integer AV cell
        counts as 0.
    """
    url = f'https://www.pro-football-reference.com/teams/{team_code}/{year}_roster.htm'
    try:
        # The timeout keeps a stalled connection from hanging the kernel.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # Network failures are reported the same way as bad status codes:
        # a message plus None, so a long scraping loop is not aborted.
        print(f"Failed to fetch data for {team_code} {year}: {exc}")
        return None
    finally:
        # Pause after every request attempt to stay under the site's rate limit.
        time.sleep(5)

    if response.status_code != 200:
        print(f"Failed to fetch data for {team_code} {year}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = _find_roster_table(soup)
    if not table:
        print(f"No roster table found for {team_code} {year}")
        return None

    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = table.tbody if table.tbody is not None else table

    av_by_pos = {}
    for row in body.find_all('tr'):
        # Skip header rows repeated inside the body
        if 'thead' in (row.get('class') or []):
            continue

        pos_cell = row.find('td', {'data-stat': 'pos'})
        av_cell = row.find('td', {'data-stat': 'av'})
        if not pos_cell or not av_cell:
            continue

        norm_pos = POSITION_MAP.get(pos_cell.text.strip())
        if norm_pos is None:
            # Position string not covered by POSITION_MAP
            continue

        try:
            av = int(av_cell.text.strip())
        except ValueError:
            # Empty or non-numeric AV cell
            av = 0

        av_by_pos[norm_pos] = av_by_pos.get(norm_pos, 0) + av

    return av_by_pos

def net_av_by_team(team_code, base_year=2023, target_year=2024):
    """Return the per-position change in summed AV between two seasons.

    Args:
        team_code: team code passed to ``scrape_team_av``, e.g. ``'kan'``.
        base_year: season subtracted from the result (default 2023).
        target_year: season the change is measured at (default 2024).

    Returns:
        dict[str, int]: position group -> ``AV(target_year) - AV(base_year)``.
        A position missing from one season counts as 0 for that season.
        If a season could not be scraped at all, a notice is printed and that
        season contributes 0 for every position, so the values are then raw
        totals rather than true differences.
    """
    av_by_year = {}
    for year in (base_year, target_year):
        scraped = scrape_team_av(team_code, year)
        if not scraped:
            # Make the fallback visible; otherwise the result looks like a
            # genuine swing in AV.
            print(f"Warning: no AV data for {team_code} {year}; treating every position as 0")
        av_by_year[year] = scraped or {}

    av_base = av_by_year[base_year]
    av_target = av_by_year[target_year]

    all_positions = set(av_base) | set(av_target)
    return {pos: av_target.get(pos, 0) - av_base.get(pos, 0) for pos in all_positions}

# Example usage: print the net AV per position group for a few teams.
# Every team triggers two roster-page downloads, each followed by a pause
# inside scrape_team_av, so this loop is slow by design.
team_codes = ['kan', 'nwe', 'sfo']  # Extend this list with any other team codes

for team in team_codes:
    net = net_av_by_team(team)
    print(f"Net AV for {team.upper()} (2024 - 2023):")
    # One indented line per position group
    for pos, val in net.items():
        print(f"  {pos}: {val}")
    # Blank line between teams
    print()

Net AV for KAN (2024 - 2023):
  DB: -5
  TE: 1
  QB: -2
  OL: 2
  LB: 0
  DE: -4
  ST: 0
  DL: -2
  RB: -1
  WR: 0

Net AV for NWE (2024 - 2023):
  DB: -2
  TE: 4
  QB: 6
  OL: 5
  LB: -12
  DE: 5
  ST: 3
  DL: -13
  RB: 1
  WR: 0

Net AV for SFO (2024 - 2023):
  DB: -13
  TE: -1
  QB: -4
  OL: -13
  LB: 0
  DE: 14
  ST: -2
  DL: -24
  RB: -9
  WR: -11



In [55]:
import pandas as pd
import time

# The 32 team codes to scrape, in the order the rows should appear.
team_codes = [
    'crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle',
    'dal', 'den', 'det', 'gnb', 'htx', 'clt', 'jax', 'kan',
    'rai', 'sdg', 'ram', 'mia', 'min', 'nwe', 'nor', 'nyg',
    'nyj', 'phi', 'pit', 'sfo', 'sea', 'tam', 'oti', 'was',
]

# Upper-cased team code -> {position group: net AV}
team_net_av_data = {team.upper(): net_av_by_team(team) for team in team_codes}

# One row per team, one column per position group. A position absent for a
# team becomes NaN during alignment, so fill with 0 before casting to int.
offseason_2024 = pd.DataFrame.from_dict(team_net_av_data, orient='index')
offseason_2024 = offseason_2024.fillna(0)
offseason_2024 = offseason_2024.astype(int)

print(offseason_2024)

     DB  TE  QB  OL  LB  DE  ST  DL  RB  WR
CRD   6   0   1   1   3   3   0   4   3  -3
ATL -10  -6   0   6  -7  13  -2 -19   1  12
RAV  -2   2   1   0 -13  -3  -1  -7   3  -1
BUF  -4   0   3   9  -3  -4   0  -1   3   0
CAR  -3   2   2  14  -6   5   1 -16   3   3
CHI   3  -4  -4 -10  -2  16   2 -13  -3   1
CIN  -3   3   1   6   0   3  -1  -1   0   1
CLE  -9  -4  -8 -13  -5  -8  -3  -6  -4  -7
DAL  -7  -3  -9 -31   2  -8   1  -3  -5 -16
DEN   7   1  -1   1  17   1   4  13  -2   2
DET  11  -1   2   8   1   9   2  -3   8   1
GNB   9  -1   0  -1  -9  16   3   8   3  -3
HTX   4  -2  -4  -6  -4   3   3   0   1  -5
CLT   2  -3  -2  -3   4  -3   3   0  -2   2
JAX  -5  -2  -6 -11 -21  15   4  -8   0  -2
KAN  -5   1  -2   2   0  -4   0  -2  -1   0
RAI   0   6   0  -6  -1  -8   1   5  -2  -7
SDG  10   2  -1   8  10   0   1   7   1   0
RAM   3  -2  -2   2   6   7   2 -15  -3  -2
MIA   3   5  -2 -13   6   8   1  -2  -3 -16
MIN   6  -3   1   5   9  14   1 -13   5   6
NWE  -2   4   6   5 -12   5   3 

In [59]:
# Hand-entered 'Better' / 'Worse' label per team code (the criterion behind
# the labels is not stated in this notebook).
manual_labels = {
    'CRD': 'Better', 'ATL': 'Better', 'RAV': 'Worse', 'BUF': 'Better',
    'CAR': 'Better', 'CHI': 'Worse', 'CIN': 'Better', 'CLE': 'Worse',
    'DAL': 'Worse', 'DEN': 'Better', 'DET': 'Better', 'GNB': 'Better',
    'HTX': 'Worse', 'CLT': 'Worse', 'JAX': 'Worse', 'KAN': 'Better',
    'RAI': 'Worse', 'SDG': 'Better', 'RAM': 'Better', 'MIA': 'Worse',
    'MIN': 'Better', 'NWE': 'Worse', 'NOR': 'Worse', 'NYG': 'Worse',
    'NYJ': 'Worse', 'PHI': 'Better', 'PIT': 'Better', 'SFO': 'Worse',
    'SEA': 'Better', 'TAM': 'Better', 'OTI': 'Worse', 'WAS': 'Better',
}

# Look each index entry (team code) up in the mapping and store the result
# as a new column.
offseason_2024['Improvement?'] = offseason_2024.index.map(manual_labels)

In [62]:
# Hand-entered 0/1 flag per team code, stored in the 'New Coach' column.
manual_newcoach = {
    'CRD': 1, 'ATL': 1, 'RAV': 0, 'BUF': 0,
    'CAR': 1, 'CHI': 1, 'CIN': 0, 'CLE': 0,
    'DAL': 0, 'DEN': 0, 'DET': 0, 'GNB': 0,
    'HTX': 0, 'CLT': 0, 'JAX': 0, 'KAN': 0,
    'RAI': 1, 'SDG': 1, 'RAM': 0, 'MIA': 0,
    'MIN': 0, 'NWE': 1, 'NOR': 1, 'NYG': 0,
    'NYJ': 1, 'PHI': 0, 'PIT': 0, 'SFO': 0,
    'SEA': 1, 'TAM': 0, 'OTI': 1, 'WAS': 1,
}

# Look each index entry (team code) up in the mapping and store the result
# as a new column.
offseason_2024['New Coach'] = offseason_2024.index.map(manual_newcoach)

In [64]:
# Hand-entered 0/1 flag per team code, stored in the 'New QB' column.
manual_newQB = {
    'CRD': 0, 'ATL': 1, 'RAV': 0, 'BUF': 0,
    'CAR': 0, 'CHI': 1, 'CIN': 0, 'CLE': 1,
    'DAL': 1, 'DEN': 1, 'DET': 0, 'GNB': 0,
    'HTX': 0, 'CLT': 1, 'JAX': 0, 'KAN': 0,
    'RAI': 1, 'SDG': 0, 'RAM': 0, 'MIA': 0,
    'MIN': 1, 'NWE': 1, 'NOR': 0, 'NYG': 0,
    'NYJ': 1, 'PHI': 0, 'PIT': 1, 'SFO': 0,
    'SEA': 0, 'TAM': 0, 'OTI': 0, 'WAS': 1,
}

# Look each index entry (team code) up in the mapping and store the result
# as a new column.
offseason_2024['New QB'] = offseason_2024.index.map(manual_newQB)

In [65]:
# Display the frame: net AV per position group plus the three manual columns.
offseason_2024

Unnamed: 0,DB,TE,QB,OL,LB,DE,ST,DL,RB,WR,Improvement?,New Coach,New QB
CRD,6,0,1,1,3,3,0,4,3,-3,Better,1,0
ATL,-10,-6,0,6,-7,13,-2,-19,1,12,Better,1,1
RAV,-2,2,1,0,-13,-3,-1,-7,3,-1,Worse,0,0
BUF,-4,0,3,9,-3,-4,0,-1,3,0,Better,0,0
CAR,-3,2,2,14,-6,5,1,-16,3,3,Better,1,0
CHI,3,-4,-4,-10,-2,16,2,-13,-3,1,Worse,1,1
CIN,-3,3,1,6,0,3,-1,-1,0,1,Better,0,0
CLE,-9,-4,-8,-13,-5,-8,-3,-6,-4,-7,Worse,0,1
DAL,-7,-3,-9,-31,2,-8,1,-3,-5,-16,Worse,0,1
DEN,7,1,-1,1,17,1,4,13,-2,2,Better,0,1


In [1]:
# Requires `offseason_2024` from the earlier cells; on a fresh kernel those
# cells must be re-run first, otherwise this line raises NameError.
# The team codes exist only in the DataFrame index, so the index is written
# out as a named column. With index=False the exported rows could not be
# traced back to a team.
offseason_2024.to_csv("offseason_2024_labeled.csv", index=True, index_label="Team")

NameError: name 'offseason_2024' is not defined