In [1]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/raw")

# yearly offense and defense
yp_off = pd.read_csv(DATA_DIR / "yearly_player_stats_offense.csv")
yp_def = pd.read_csv(DATA_DIR / "yearly_player_stats_defense.csv")

# weekly offense and defense (optional for later)
wp_off = pd.read_csv(DATA_DIR / "weekly_player_stats_offense.csv")
wp_def = pd.read_csv(DATA_DIR / "weekly_player_stats_defense.csv")

# salary tables 2014–2020
salary = pd.read_csv(DATA_DIR / "2014-thru-2020-cap-tables.csv").rename(str.lower, axis=1)
salary.head(10)

Unnamed: 0,name,pos,cap_hit,cap_percent,season,team
0,Russell Wilson,QB,31000000,15.53,2020,seattle-seahawks
1,Aaron Rodgers,QB,29672000,15.06,2019,green-bay-packers
2,Kirk Cousins,QB,29000000,15.12,2019,minnesota-vikings
3,Jared Goff,QB,28842682,14.15,2020,los-angeles-rams
4,Khalil Mack,OLB,26600000,12.52,2020,chicago-bears
5,Matthew Stafford,QB,26500000,14.61,2018,detroit-lions
6,Russell Wilson,QB,26286668,13.36,2019,seattle-seahawks
7,Von Miller,OLB,25125000,12.89,2019,denver-broncos
8,Tom Brady,QB,25000000,12.25,2020,tampa-bay-buccaneers
9,Philip Rivers,QB,25000000,10.42,2020,indianapolis-colts


In [2]:
# Standardize salary column names (ensure lowercase + consistent names)
salary = salary.rename(columns={
    "name": "player_name",
    "pos": "position",
    "team": "team",
    "season": "season",
    "cap_hit": "cap_hit",
    "cap_percent": "cap_percent"
})


In [3]:
# === CLEAN SALARY DATA (minimal because file is already clean) ===

# cap_hit is already numeric, but enforce float type just in case
salary["cap_hit"] = salary["cap_hit"].astype(float)

# season already looks good, ensure int
salary["season"] = salary["season"].astype(int)


# === STANDARDIZED NAME CLEANING FUNCTION ===
def clean_name(name):
    """Standardized name cleaner for both salary + player stats."""
    if pd.isna(name):
        return ""
    return (
        str(name)
        .upper()
        .replace(".", "")
        .replace("'", "")
        .replace("-", " ")
        .strip()
    )

# Apply to all datasets
yp_off["player_name_clean"] = yp_off["player_name"].apply(clean_name)
yp_def["player_name_clean"] = yp_def["player_name"].apply(clean_name)
salary["player_name_clean"] = salary["player_name"].apply(clean_name)

salary.head()


Unnamed: 0,player_name,position,cap_hit,cap_percent,season,team,player_name_clean
0,Russell Wilson,QB,31000000.0,15.53,2020,seattle-seahawks,RUSSELL WILSON
1,Aaron Rodgers,QB,29672000.0,15.06,2019,green-bay-packers,AARON RODGERS
2,Kirk Cousins,QB,29000000.0,15.12,2019,minnesota-vikings,KIRK COUSINS
3,Jared Goff,QB,28842682.0,14.15,2020,los-angeles-rams,JARED GOFF
4,Khalil Mack,OLB,26600000.0,12.52,2020,chicago-bears,KHALIL MACK


In [4]:
# translate team abbreviations into readable names (for display/reporting ONLY)
team_abbr_to_name = {
    "ARI": "Cardinals", "ATL": "Falcons", "BAL": "Ravens", "BUF": "Bills",
    "CAR": "Panthers", "CHI": "Bears", "CIN": "Bengals", "CLE": "Browns",
    "DAL": "Cowboys", "DEN": "Broncos", "DET": "Lions", "GB":  "Packers",
    "HOU": "Texans", "IND": "Colts", "JAX": "Jaguars", "KC":  "Chiefs",
    "LAC": "Chargers", "LAR": "Rams", "LV": "Raiders", "OAK": "Raiders",
    "MIA": "Dolphins", "MIN": "Vikings", "NE":  "Patriots", "NO": "Saints",
    "NYG": "Giants", "NYJ": "Jets", "PHI": "Eagles", "PIT": "Steelers",
    "SEA": "Seahawks", "SF":  "49ers", "TB": "Buccaneers", "TEN": "Titans",
    "WAS": "Commanders", "WSH": "Commanders"
}

# DO NOT overwrite 'team' (SEA, GB, etc.) — it’s needed for merging
yp_off["team_name_readable"] = yp_off["team"].map(team_abbr_to_name)
yp_def["team_name_readable"] = yp_def["team"].map(team_abbr_to_name)


In [5]:
# ================================================================
# MAP SALARY TEAM NAMES (e.g., "seattle-seahawks") TO NFL ABBREVIATIONS
# ================================================================

salary_team_to_abbr = {
    "arizona-cardinals": "ARI",
    "atlanta-falcons": "ATL",
    "baltimore-ravens": "BAL",
    "buffalo-bills": "BUF",
    "carolina-panthers": "CAR",
    "chicago-bears": "CHI",
    "cincinnati-bengals": "CIN",
    "cleveland-browns": "CLE",
    "dallas-cowboys": "DAL",
    "denver-broncos": "DEN",
    "detroit-lions": "DET",
    "green-bay-packers": "GB",
    "houston-texans": "HOU",
    "indianapolis-colts": "IND",
    "jacksonville-jaguars": "JAX",
    "kansas-city-chiefs": "KC",
    "las-vegas-raiders": "LV",
    "oakland-raiders": "OAK",   # old team name still appears in older seasons
    "los-angeles-chargers": "LAC",
    "san-diego-chargers": "SD", # may appear in early 2010s
    "los-angeles-rams": "LAR",
    "st-louis-rams": "STL",     # older seasons
    "miami-dolphins": "MIA",
    "minnesota-vikings": "MIN",
    "new-england-patriots": "NE",
    "new-orleans-saints": "NO",
    "new-york-giants": "NYG",
    "new-york-jets": "NYJ",
    "philadelphia-eagles": "PHI",
    "pittsburgh-steelers": "PIT",
    "san-francisco-49ers": "SF",
    "seattle-seahawks": "SEA",
    "tampa-bay-buccaneers": "TB",
    "tennessee-titans": "TEN",
    "washington-football-team": "WAS",
    "washington-commanders": "WAS",
    "washington-redskins": "WAS",   # appears in early years
}

# Standardize formatting before mapping
salary["team_abbr"] = (
    salary["team"]
    .str.lower()
    .str.replace(" ", "-", regex=False)
    .map(salary_team_to_abbr)
)

print("Unique salary team abbreviations after mapping:")
print(salary["team_abbr"].unique())


Unique salary team abbreviations after mapping:
['SEA' 'GB' 'MIN' 'LAR' 'CHI' 'DET' 'DEN' 'TB' 'IND' 'OAK' 'BAL' 'NYG'
 'NO' 'PIT' 'WAS' 'ATL' 'LAC' 'TEN' 'KC' 'NE' 'LV' 'CAR' 'SD' 'DAL' 'SF'
 'BUF' 'MIA' 'PHI' 'ARI' 'CIN' 'NYJ' 'HOU' 'JAX' 'CLE' nan]


In [6]:
# === MERGE OFFENSE PLAYERS WITH SALARY DATA ===
offense_merged = yp_off.merge(
    salary,
    left_on=["player_name", "team", "season", "position"],
    right_on=["player_name", "team_abbr", "season", "position"],
    how="left"
)

print("Offense merged shape:", offense_merged.shape)
print("Non-null salary %:", offense_merged["cap_hit"].notna().mean())


Offense merged shape: (7133, 667)
Non-null salary %: 0.4058600869199495


In [7]:
#merge defense 
defense_merged = yp_def.merge(
    salary,
    left_on=["player_name", "team", "season", "position"],
    right_on=["player_name", "team_abbr", "season", "position"],
    how="left"
)


In [8]:
# === MERGE DEFENSE PLAYERS WITH SALARY DATA ===
defense_merged = yp_def.merge(
    salary,
    left_on=["player_name_clean", "team", "season"],
    right_on=["player_name_clean", "team_abbr", "season"],
    how="left"
)

print("Defense merged shape:", defense_merged.shape)
print("Non-null salary %:", defense_merged["cap_hit"].notna().mean())


Defense merged shape: (16150, 157)
Non-null salary %: 0.29015479876160993


In [9]:
# ============================================================
# CLEAN SALARY MERGE RESULTS
# Create two datasets:
#   1. Players WITH salary (clean for modeling)
#   2. ALL players (for descriptive stats)
# ============================================================

# Version 1: players WITH salary
offense_with_salary = offense_merged.dropna(subset=["cap_hit"])
defense_with_salary = defense_merged.dropna(subset=["cap_hit"])

print("Offense with salary:", offense_with_salary.shape)
print("Defense with salary:", defense_with_salary.shape)

# Show heads (verify cap_hit values are present)
print("\n=== Offense (salary only) HEAD ===")
display(offense_with_salary.head())

print("\n=== Defense (salary only) HEAD ===")
display(defense_with_salary.head())

# Save salary-only datasets
offense_with_salary.to_csv("../data/processed/offense_players_salary_only.csv", index=False)
defense_with_salary.to_csv("../data/processed/defense_players_salary_only.csv", index=False)


# Version 2: ALL players (including missing salary)
print("\nOffense FULL dataset:", offense_merged.shape)
print("Defense FULL dataset:", defense_merged.shape)

print("\n=== Offense FULL HEAD ===")
display(offense_merged.head())

print("\n=== Defense FULL HEAD ===")
display(defense_merged.head())

# Save full datasets
offense_merged.to_csv("../data/processed/offense_players_all.csv", index=False)
defense_merged.to_csv("../data/processed/defense_players_all.csv", index=False)

print("\nSaved salary-only and full datasets.")


Offense with salary: (2895, 667)
Defense with salary: (4686, 157)

=== Offense (salary only) HEAD ===


Unnamed: 0,player_id,player_name,position,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_ypc,delta_td_pct,delta_yptouch,player_name_clean_x,team_name_readable,cap_hit,cap_percent,team_y,player_name_clean_y,team_abbr
832,00-0007091,Matt Hasselbeck,QB,1975,1998.0,6.0,34.0,187.0,76.0,235.0,...,-1.354756,0.0,0.116,MATT HASSELBECK,Colts,3750000.0,2.81,indianapolis-colts,MATT HASSELBECK,IND
833,00-0010346,Peyton Manning,QB,1976,1998.0,1.0,1.0,1.0,77.0,230.0,...,0.0,0.0,-0.181818,PEYTON MANNING,Broncos,17500000.0,12.96,denver-broncos,PEYTON MANNING,DEN
834,00-0010346,Peyton Manning,QB,1976,1998.0,1.0,1.0,1.0,77.0,230.0,...,-1.226244,2.0,0.690632,PEYTON MANNING,Broncos,17500000.0,12.96,denver-broncos,PEYTON MANNING,DEN
835,00-0019596,Tom Brady,QB,1977,2000.0,6.0,33.0,199.0,76.0,225.0,...,-0.718266,3.0,0.550827,TOM BRADY,Patriots,14800000.0,10.64,new-england-patriots,TOM BRADY,NE
836,00-0019596,Tom Brady,QB,1977,2000.0,6.0,33.0,199.0,76.0,225.0,...,0.736264,-0.375,0.042683,TOM BRADY,Patriots,14800000.0,10.64,new-england-patriots,TOM BRADY,NE



=== Defense (salary only) HEAD ===


Unnamed: 0,player_id,player_name_x,position_x,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_season_average_team_defense_snaps,delta_career_average_team_defense_snaps,player_name_clean,team_name_readable,player_name_y,position_y,cap_hit,cap_percent,team_y,team_abbr
2401,00-0020515,Ryan Pickett,DT,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.184206,0.0,RYAN PICKETT,Texans,Ryan Pickett,DT,469412.0,0.35,houston-texans,HOU
2402,00-0020535,Justin Smith,DE,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022571,0.0,JUSTIN SMITH,49ers,Justin Smith,DE,6936666.0,5.12,san-francisco-49ers,SF
2404,00-0020712,James Harrison,OLB,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.478523,0.0,JAMES HARRISON,Steelers,James Harrison,OLB,534412.0,0.4,pittsburgh-steelers,PIT
2405,00-0020840,Ryan Clark,FS,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,RYAN CLARK,Commanders,Ryan Clark,S,635000.0,0.48,washington-redskins,WAS
2406,00-0021140,Julius Peppers,DE,1980,2002.0,1.0,2.0,2.0,79.0,295.0,...,13.616307,-0.803068,JULIUS PEPPERS,Packers,Julius Peppers,DE,3500000.0,2.47,green-bay-packers,GB



Offense FULL dataset: (7133, 667)
Defense FULL dataset: (16150, 157)

=== Offense FULL HEAD ===


Unnamed: 0,player_id,player_name,position,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_ypc,delta_td_pct,delta_yptouch,player_name_clean_x,team_name_readable,cap_hit,cap_percent,team_y,player_name_clean_y,team_abbr
0,00-0000865,Charlie Batch,QB,1974,1998.0,2.0,30.0,60.0,74.0,216.0,...,0.0,-0.909091,-0.388795,CHARLIE BATCH,Steelers,,,,,
1,00-0004541,Donald Driver,WR,1975,1999.0,7.0,7.0,213.0,72.0,194.0,...,0.0,49.0,1.226852,DONALD DRIVER,Packers,,,,,
2,00-0006101,Tony Gonzalez,TE,1976,1997.0,1.0,13.0,13.0,77.0,247.0,...,0.0,3.833333,0.236242,TONY GONZALEZ,Falcons,,,,,
3,00-0006101,Tony Gonzalez,TE,1976,1997.0,1.0,13.0,13.0,77.0,247.0,...,0.0,-0.413793,0.085776,TONY GONZALEZ,Falcons,,,,,
4,00-0007091,Matt Hasselbeck,QB,1975,1998.0,6.0,34.0,187.0,76.0,235.0,...,-0.337868,0.5,0.250509,MATT HASSELBECK,Titans,,,,,



=== Defense FULL HEAD ===


Unnamed: 0,player_id,player_name_x,position_x,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_season_average_team_defense_snaps,delta_career_average_team_defense_snaps,player_name_clean,team_name_readable,player_name_y,position_y,cap_hit,cap_percent,team_y,team_abbr
0,00-0000551,Brendon Ayanbadejo,ILB,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.504375,0.0,BRENDON AYANBADEJO,Ravens,,,,,,
1,00-0000585,Champ Bailey,CB,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.028099,0.0,CHAMP BAILEY,Broncos,,,,,,
2,00-0000741,Ronde Barber,FS,0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.000631,0.0,RONDE BARBER,Buccaneers,,,,,,
3,00-0001263,,,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,,Colts,,,,,,
4,00-0001263,,,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,Eagles,,,,,,



Saved salary-only and full datasets.
