In [1]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/raw")

# Season-level player stats: offense first, then defense.
yp_off, yp_def = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "yearly_player_stats_offense.csv",
        "yearly_player_stats_defense.csv",
    )
)

# Week-level player stats, same ordering (loaded for optional later use).
wp_off, wp_def = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "weekly_player_stats_offense.csv",
        "weekly_player_stats_defense.csv",
    )
)

# Salary-cap tables covering the 2014–2020 seasons.
salary = pd.read_csv(DATA_DIR / "2014-thru-2020-cap-tables.csv")


In [2]:
# Standardize the salary table's column names: "name" becomes "player_name";
# the remaining entries map a column to itself and are listed to make the
# expected columns explicit.
salary_column_names = {
    "name": "player_name",
    "team": "team",
    "season": "season",
    "cap_hit": "cap_hit",
}
salary = salary.rename(columns=salary_column_names)


In [3]:
# Turn cap-hit text into numbers, e.g. "$5,200,000" -> 5200000.0.
# Both replacements are literal (regex=False): first the dollar sign, then
# the thousands separators.
cap_hit_text = salary["cap_hit"].astype(str)
cap_hit_text = cap_hit_text.str.replace("$", "", regex=False)
cap_hit_text = cap_hit_text.str.replace(",", "", regex=False)
salary["cap_hit"] = cap_hit_text.astype(float)

# Store the season as an integer.
salary["season"] = salary["season"].astype(int)

salary.head()

def clean_name(s):
    """Normalize a Series of player names into a comparable join key.

    Steps, in order:
      1. upper-case the text;
      2. apply Unicode NFKD decomposition so an accented letter keeps its
         base letter (the combining mark is dropped in step 3);
      3. remove every character that is not A-Z or a space;
      4. collapse runs of spaces into a single space;
      5. strip leading/trailing spaces.

    Stripping happens last because removing punctuation can itself leave
    trailing or doubled spaces (e.g. "John Smith *", "A.J.  Green").

    Parameters
    ----------
    s : pandas.Series
        Names as strings. Missing values pass through as missing.

    Returns
    -------
    pandas.Series
        The normalized names, aligned with the index of ``s``.
    """
    return (
        s.str.upper()
         .str.normalize("NFKD")
         .str.replace(r"[^A-Z ]", "", regex=True)
         .str.replace(r" +", " ", regex=True)
         .str.strip()
    )

# Attach the normalized name key to both stats tables and the salary table.
for frame in (yp_off, yp_def, salary):
    frame["player_name_clean"] = clean_name(frame["player_name"])



In [4]:
# Map team abbreviations to team nicknames.
# Several franchises appear under more than one code: relocated teams keep
# their old abbreviation in earlier seasons (OAK/LV, SD/LAC, STL/LA/LAR) and
# Washington is listed as both WAS and WSH.
# NOTE(review): Washington maps to "Commanders", a name adopted after the
# 2014–2020 salary window — confirm this matches the naming used by whatever
# table this nickname is compared against.
team_abbr_to_name = {
    "ARI": "Cardinals", "ATL": "Falcons", "BAL": "Ravens", "BUF": "Bills",
    "CAR": "Panthers", "CHI": "Bears", "CIN": "Bengals", "CLE": "Browns",
    "DAL": "Cowboys", "DEN": "Broncos", "DET": "Lions", "GB":  "Packers",
    "HOU": "Texans", "IND": "Colts", "JAX": "Jaguars", "KC":  "Chiefs",
    "LAC": "Chargers", "SD": "Chargers",
    "LAR": "Rams", "LA": "Rams", "STL": "Rams",
    "LV": "Raiders", "OAK": "Raiders",
    "MIA": "Dolphins", "MIN": "Vikings", "NE":  "Patriots", "NO": "Saints",
    "NYG": "Giants", "NYJ": "Jets", "PHI": "Eagles", "PIT": "Steelers",
    "SEA": "Seahawks", "SF":  "49ers", "TB": "Buccaneers", "TEN": "Titans",
    "WAS": "Commanders", "WSH": "Commanders"
}

# Add a "team_name" column to each yearly stats table by looking up its
# "team" abbreviation; codes missing from the mapping become NaN.
for stats in (yp_off, yp_def):
    stats["team_name"] = stats["team"].map(team_abbr_to_name)


In [5]:
# Merge offense stats with salary rows (left join: every stats row is kept).
#
# The stats table's "team" column holds abbreviations (it is the input to the
# abbreviation -> nickname lookup that produces "team_name"), so the team key
# of the join is the nickname in "team_name", not "team". The salary table's
# "team" column is renamed on the fly so both sides share that key and the
# merged frame keeps the same columns as before.
# NOTE(review): assumes salary["team"] holds nicknames spelled like the values
# of team_abbr_to_name — confirm against the raw cap tables.
offense_merged = yp_off.merge(
    salary.rename(columns={"team": "team_name"}),
    on=["player_name_clean", "team_name", "season"],
    how="left",
)


In [6]:
# Merge defense stats with salary rows (left join: every stats row is kept).
#
# The stats table's "team" column holds abbreviations (it is the input to the
# abbreviation -> nickname lookup that produces "team_name"), so the team key
# of the join is the nickname in "team_name", not "team". The salary table's
# "team" column is renamed on the fly so both sides share that key and the
# merged frame keeps the same columns as before.
# NOTE(review): assumes salary["team"] holds nicknames spelled like the values
# of team_abbr_to_name — confirm against the raw cap tables.
defense_merged = yp_def.merge(
    salary.rename(columns={"team": "team_name"}),
    on=["player_name_clean", "team_name", "season"],
    how="left",
)


In [7]:
# Stats rows with no matching salary row come out of the left join with a
# missing cap_hit; replace those with 0 in both merged tables.
for merged in (offense_merged, defense_merged):
    merged["cap_hit"] = merged["cap_hit"].fillna(0)

In [8]:
# Order each merged table by player, then season, and preview it.
sort_keys = ["player_name_clean", "season"]

offense_merged = offense_merged.sort_values(by=sort_keys)
offense_merged.head()

defense_merged = defense_merged.sort_values(by=sort_keys)
defense_merged.head()

Unnamed: 0,player_id,player_name_x,position,birth_year,draft_year,draft_round,draft_pick,draft_ovr,height,weight,...,delta_season_average_defense_snaps,delta_career_average_defense_snaps,delta_season_average_team_defense_snaps,delta_career_average_team_defense_snaps,player_name_clean,team_name,player_name_y,pos,cap_hit,cap_percent
3545,00-0031259,Aaron Colvin,CB,1991,2014.0,4.0,14.0,114.0,72.0,191.0,...,-0.041765,0.0,-0.466296,0.0,AARON COLVIN,Jaguars,,,0.0,
4618,00-0031259,Aaron Colvin,CB,1991,2014.0,4.0,14.0,114.0,72.0,191.0,...,12.8322,2.808696,2.458311,0.615548,AARON COLVIN,Jaguars,,,0.0,
5649,00-0031259,Aaron Colvin,CB,1991,2014.0,4.0,14.0,114.0,72.0,191.0,...,-0.126947,1.293003,-0.11864,0.197182,AARON COLVIN,Jaguars,,,0.0,
6690,00-0031259,Aaron Colvin,CB,1991,2014.0,4.0,14.0,114.0,72.0,191.0,...,-0.17332,0.873178,0.257372,0.345008,AARON COLVIN,Jaguars,,,0.0,
7799,00-0031259,Aaron Colvin,CB,1991,2014.0,4.0,14.0,114.0,72.0,191.0,...,-0.45927,0.393889,-0.434182,0.083101,AARON COLVIN,Texans,,,0.0,


In [9]:
# Save both merged tables as CSV (without the index).
processed_dir = Path("../data/processed")
# Create the output folder if needed so the cell also works on a fresh checkout.
processed_dir.mkdir(parents=True, exist_ok=True)

offense_path = processed_dir / "offense_players_with_salary.csv"
defense_path = processed_dir / "defense_players_with_salary.csv"

offense_merged.to_csv(offense_path, index=False)
defense_merged.to_csv(defense_path, index=False)

# Report the files actually written.
print(f"Saved: {offense_path.name}, {defense_path.name}")


Saved: player_season_merged.csv
