In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# the NFL python library seems not to work on Tuesdays, probably due to data updates (not confirmed)

In [3]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the number of weeks if the NFL adds regular season games to the schedule
# Update the season start date each year

In [None]:
## REQUIRED ACTIONS - Include in a README doc ## 
# ensure the output directory exists ./csv_files for the csv file function

In [4]:
## Required installations
!pip install nfl_data_py
# Ensure all required packages are installed within the notebook
# !pip install --quiet nfl_data_py
!pip install --quiet rapidfuzz




In [5]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime, timedelta
import nfl_data_py as nfl
import os
import re
import time
from random import sample, uniform, seed
import io
from rapidfuzz import fuzz, process
import numpy as np
import hashlib
import shutil

In [6]:
# Show every column on one (wide) line instead of wrapping/truncating frames
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [7]:
## Begin: time calculators ##

In [8]:
# Season configuration - review both values every year.
# REG_WEEKS: number of regular-season weeks (change if the NFL adds games to the schedule).
REG_WEEKS = 18
# season_start_date: date used as day 0 of week 1 by get_current_week().
season_start_date = datetime(year=2025, month=9, day=4)

In [9]:
def get_current_week(today=None, season_start=None):
    """Return the NFL week number that `today` falls in.

    Week 1 starts on the season start date and every 7 days begins a new week.
    Any date before the season start yields 0 (preseason).

    Parameters
    ----------
    today : datetime.datetime | datetime.date | None
        Day to evaluate; defaults to the current local time.
    season_start : datetime.datetime | datetime.date | None
        First day of week 1; defaults to the module-level `season_start_date`.

    Returns
    -------
    int
        0 for preseason, otherwise the 1-based week count (not capped, so
        values above REG_WEEKS are possible).
    """
    def _as_date(value):
        # datetime is a subclass of date, so only datetimes need converting
        return value.date() if isinstance(value, datetime) else value

    if today is None:
        today = datetime.now()
    if season_start is None:
        season_start = season_start_date

    delta_days = (_as_date(today) - _as_date(season_start)).days
    week_num = (delta_days // 7) + 1
    return max(0, week_num)  # clamp to 0 for preseason

In [10]:
# 1=preseason, 2=regular, 3=playoffs
def get_season_type(current_week, reg_weeks=REG_WEEKS):
    """Map a week number to a season-type code.

    Parameters
    ----------
    current_week : int
        Week number as produced by get_current_week() (0 = preseason).
    reg_weeks : int
        Number of regular-season weeks.

    Returns
    -------
    int
        1 = preseason, 2 = regular season, 3 = playoffs.
    """
    # Anything at or below zero is before week 1; a negative value must not be
    # classified as regular season.
    if current_week <= 0:
        return 1
    if current_week <= reg_weeks:
        return 2
    return 3

# Resolve where "now" falls in the NFL calendar, then echo the three values
current_year = season_start_date.year
current_week = get_current_week()
season_type = get_season_type(current_week=current_week, reg_weeks=REG_WEEKS)

print("current_year:", current_year)
print("current_week:", current_week)
print("season_type:", season_type)

current_year: 2025
current_week: 0
season_type: 1


In [11]:
# Returns a list of years to pull.
def get_year_range(current_year, current_week, start_year=2017, reg_weeks=18):
    """List the seasons to pull, from `start_year` upward.

    The current season is included only once it has started
    (`current_week` != 0). `reg_weeks` is accepted but not used.
    """
    stop = current_year if current_week == 0 else current_year + 1
    return list(range(start_year, stop))

In [12]:
# Builds (year, week) pairs for scraping.
# - 2017–2020: weeks 1–17
# - 2021+: weeks 1–18
def generate_year_week_combinations(start_year, end_year, current_year=None, current_week=None):
    """Build the (year, week) pairs to scrape for start_year..end_year inclusive.

    Seasons up to 2020 contribute weeks 1-17, later seasons weeks 1-18.
    When `current_year` is given, that season is capped at `current_week`,
    and skipped entirely if `current_week` is None or 0 (preseason).
    """
    pairs = []
    for season in range(start_year, end_year + 1):
        last_week = 18 if season > 2020 else 17

        is_current_season = current_year is not None and season == current_year
        if is_current_season:
            if current_week is None or current_week == 0:
                # preseason: nothing has been played yet for this season
                continue
            last_week = min(last_week, int(current_week))

        pairs += [(season, week) for week in range(1, last_week + 1)]
    return pairs

In [13]:
# Resolve the current year / week / season type from the season start date
current_year = season_start_date.year
current_week = get_current_week()
season_type = get_season_type(current_week, reg_weeks=REG_WEEKS)

# Seasons to pull, and the matching (year, week) pairs for the scrapers.
# When `years` is empty the bounds fall back to 2017 and last season.
years = get_year_range(current_year, current_week, start_year=2017)
year_week_pairs = generate_year_week_combinations(
    years[0] if years else 2017,
    years[-1] if years else current_year - 1,
    current_year=current_year,
    current_week=current_week,
)


In [14]:
# test years and weeks to pull

# --- years: the current season must be absent while current_week == 0 ---
print("years:", years)
print("contains current_year?", current_year in years)

# --- pairs: which seasons are covered and what the list looks like ---
yrs_in_pairs = sorted({pair[0] for pair in year_week_pairs})
print("years in pairs:", yrs_in_pairs)
print("pairs count:", len(year_week_pairs))
print("first 5:", year_week_pairs[:5])
print("last 5:", year_week_pairs[-5:])

# --- week caps: at most 17 weeks through 2020, at most 18 afterwards ---
violations = []
for y in yrs_in_pairs:
    max_reg = 18 if y > 2020 else 17
    max_week = max(w for (yy, w) in year_week_pairs if yy == y)
    if max_week > max_reg:
        violations.append((y, max_week, max_reg))
print("week-cap violations:", violations)   # expect []

# --- preseason: the current season must not appear in any pair ---
has_current_year = current_year in yrs_in_pairs
print("current year present in pairs?", has_current_year)  # expect False in preseason


years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
contains current_year? False
years in pairs: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
pairs count: 140
first 5: [(2017, 1), (2017, 2), (2017, 3), (2017, 4), (2017, 5)]
last 5: [(2024, 14), (2024, 15), (2024, 16), (2024, 17), (2024, 18)]
week-cap violations: []
current year present in pairs? False


In [15]:
## End: time calculators ##

In [16]:
# returns a dataframe summarizing missing values for a given dataFrame.
def check_nulls(df, name=None):
    """Summarise missing values per column of `df`.

    Returns a DataFrame indexed by column name with 'Missing Count' and
    'Missing %' (the missing fraction, rounded to 4 decimals), restricted to
    columns that have at least one missing value and sorted by 'Missing %'
    descending. When `name` is given, a header line is printed first.
    """
    missing = df.isnull().sum()
    report = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': (missing / len(df)).round(4),
    })
    has_missing = report['Missing Count'] > 0
    report = report[has_missing].sort_values(by='Missing %', ascending=False)

    if name:
        print(f"\n📊 Missing Value Summary for: {name}")
    return report


In [25]:
# save a dataFrame to the ./csv_files directory with the given filename

# Ensure the output directory exists
os.makedirs("./csv_files", exist_ok=True)

def save_csv(df, filename, index=False, float_format=None, output_dir="./csv_files"):
    """Write `df` as a CSV file inside `output_dir` and print the saved path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    filename : str
        Target file name; '.csv' is appended when missing.
    index : bool
        Forwarded to DataFrame.to_csv.
    float_format : str | None
        Forwarded to DataFrame.to_csv.
    output_dir : str
        Directory to write into (default ./csv_files). It is created on
        demand, so the function also works if the directory was removed or
        the working directory changed after this cell ran.
    """
    if not filename.endswith('.csv'):
        filename += '.csv'
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, filename)
    df.to_csv(path, index=index, float_format=float_format)
    print(f"Saved: {path}")


In [17]:
### Begin: Python NFL Library Dataframe ###

In [18]:
# Validate years to pull from the nfl library
print("years:", years)
# get_year_range() only leaves the current season out while current_week == 0,
# so this check applies in preseason only; asserting it unconditionally would
# stop the notebook on every in-season run.
if current_week == 0:
    assert current_year not in years, "preseason: the current season must not be requested"

wr_weekly = nfl.import_weekly_data(years=years, downcast=True)
print(wr_weekly[['season','week']].agg(['min','max']))
print("unique seasons:", sorted(wr_weekly['season'].unique()))

# sanity: no week beyond league cap per year
violations = []
for y, g in wr_weekly.groupby('season'):
    # Regular season cap: 17 weeks (<=2020) or 18 weeks (>=2021)
    # Postseason cap: up to week 22 (including Super Bowl)
    max_allowed = 22
    max_week = int(g['week'].max())
    if max_week > max_allowed:
        violations.append((y, max_week, f"> {max_allowed} not allowed"))
print("week-cap violations:", violations)  # expect []



years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Downcasting floats.
     season  week
min    2017     1
max    2024    22
unique seasons: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
week-cap violations: []


In [19]:
# Display all columns available in the nfl python API for weekly stats
# (reference list for the column selections made in the following cells)
nfl.see_weekly_cols()

Index(['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr'], dtype='object')

In [20]:
# Base columns requested from the weekly data: game keys, player identity, fantasy points
base_columns = [
    # game keys
    'season',
    'season_type',
    'week',
    # player identity
    'player_id',
    'player_name',
    'position',
    'position_group',
    'recent_team',
    # fantasy scoring
    'fantasy_points',
    'fantasy_points_ppr',
]

In [21]:
# Columns of the id table that are not needed downstream
# (missing names are tolerated via errors='ignore')
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year',
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username',
    'height', 'weight', 'college', 'db_season'
]

# Pull the full player-id table (no parameters) and trim it in one step
ids_data = nfl.import_ids().drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [22]:
# Weekly data restricted to base_columns, via nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(years=years, columns=base_columns)

# display(weekly_data)

Downcasting floats.


In [23]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# The id table keys players by 'gsis_id'; rename it so both frames share 'player_id'
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})

# Inner join: keep only weekly rows whose player also appears in the id table
id_dataframe = weekly_data.merge(ids_data, how='inner', on='player_id')

# Expose the merged frame under its descriptive name
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [24]:
## Output: a dataframe of NFL WR info and ids since 2017
# Keep only the rows whose position is 'WR'
wide_receiver_ids = all_players_id_data.loc[all_players_id_data['position'].eq('WR')]

# Report the size of the WR-only frame
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17384, 31)


In [None]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# Receiving-related columns to attach to each WR row
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Fetch the merge keys plus the WR-specific columns from the weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns,
)

# Attach the stats to the WR id rows, matching on player + season + week
wr_ids_weekly_stats_df = wide_receiver_ids.merge(
    wr_stats,
    how='inner',
    on=['player_id', 'season', 'week'],
)

# Size of the result, and whether the merge kept the WR row count unchanged
print(f"Shape of merged dataframe: {wr_ids_weekly_stats_df.shape}")
print(
    f"Row count matches: {wr_ids_weekly_stats_df.shape[0] == wide_receiver_ids.shape[0]}"
)

display(wr_ids_weekly_stats_df)

In [None]:
# csv file
# save_csv(wr_ids_weekly_stats_df, "wr_ids_weekly_stats_df")

In [27]:
# Null summary for the WR weekly frame, leaving out the rows for '_id' columns
null_summary_wr_ids_weekly = (
    check_nulls(wr_ids_weekly_stats_df, name="WR Weekly Stats")
    .loc[lambda summary: ~summary.index.str.contains('_id')]
)

display(null_summary_wr_ids_weekly)


📊 Missing Value Summary for: WR Weekly Stats


Unnamed: 0,Missing Count,Missing %
racr,327,0.0188
receiving_epa,285,0.0164
air_yards_share,285,0.0164
target_share,285,0.0164
wopr,285,0.0164


In [28]:
# Output: NFL next-generation stats (NGS) for wide receivers, from nfl.import_ngs_data()
# note: NGS week 0 rows hold previous-season totals - not needed, so they are dropped

wr_ngs_df = (
    nfl.import_ngs_data('receiving', years)
    # keep real game weeks only, and wide receivers only
    .loc[lambda ngs: ngs['week'].ne(0) & ngs['player_position'].eq('WR')]
    # jersey number is not needed (tolerate its absence)
    .drop(columns=['player_jersey_number'], errors='ignore')
)

# Size and contents of the filtered frame
print(f"Shape of NGS WR DataFrame after dropping columns: {wr_ngs_df.shape}")
display(wr_ngs_df)

Shape of NGS WR DataFrame after dropping columns: (8249, 22)


Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,REG,1,Ryan Grant,WR,WAS,9.936667,2.894592,4.410000,7.154639,4,6,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,REG,1,Martavis Bryant,WR,PIT,8.300000,4.122054,12.688333,33.327496,2,6,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,REG,1,Jamison Crowder,WR,WAS,7.655000,3.177793,10.540000,19.949707,3,7,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,REG,1,Nelson Agholor,WR,PHI,7.423750,2.462620,10.463750,20.274656,6,8,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,REG,1,John Brown,WR,ARI,7.360000,2.751526,13.422222,28.208481,4,9,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13323,2024,POST,23,Xavier Worthy,WR,KC,8.160000,4.959113,14.276250,44.737358,8,8,100.000000,157.0,2,6.250000,6.154624,0.095376,00-0039894,Xavier,Worthy,X.Worthy
13324,2024,POST,23,DeAndre Hopkins,WR,KC,7.676000,3.446231,11.974000,23.451761,2,5,40.000000,18.0,1,0.565000,0.798474,-0.233474,00-0030564,DeAndre,Hopkins,D.Hopkins
13325,2024,POST,23,DeVonta Smith,WR,PHI,7.470000,2.221577,14.752000,40.028219,4,5,80.000000,69.0,1,0.340000,0.600076,-0.260076,00-0036912,DeVonta,Smith,D.Smith
13327,2024,POST,23,Marquise Brown,WR,KC,4.943333,3.302615,6.356667,14.939872,2,6,33.333333,15.0,0,2.450000,3.533891,-1.083891,00-0035662,Marquise,Brown,M.Brown


In [29]:
# csv file: persist the filtered NGS WR frame through the save_csv helper
save_csv(wr_ngs_df, "wr_ngs_df")

Saved: ./csv_files/wr_ngs_df.csv


In [30]:
# List the NGS column names for reference
print(wr_ngs_df.columns.tolist())


['season', 'season_type', 'week', 'player_display_name', 'player_position', 'team_abbr', 'avg_cushion', 'avg_separation', 'avg_intended_air_yards', 'percent_share_of_intended_air_yards', 'receptions', 'targets', 'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation', 'player_gsis_id', 'player_first_name', 'player_last_name', 'player_short_name']


In [31]:
# Null analysis of the NGS WR frame using the check_nulls helper;
# the summary is kept in a variable and also displayed.
wr_ngs_null_summary_df = check_nulls(wr_ngs_df, name="NGS WR Stats")
display(wr_ngs_null_summary_df)



📊 Missing Value Summary for: NGS WR Stats


Unnamed: 0,Missing Count,Missing %
avg_expected_yac,42,0.0051
avg_yac_above_expectation,42,0.0051
avg_yac,33,0.004
yards,28,0.0034
avg_cushion,2,0.0002


In [None]:
### End: Python NFL Library Dataframe ###

In [None]:
### Begin:fantasypros webscraping ###

In [None]:
# a scraper function for a single (year, week) to test parsing logic
def test_scraper_sample(scraper_func, year_week_pair=(2024, 1), **kwargs):
    # Wrap the pair in a list so it matches the scraper signature
    year_week_pairs = [year_week_pair]
    
    sample_df, sample_errors = scraper_func(
        year_week_pairs=year_week_pairs,
        **kwargs
    )
    
    print("Sample shape:", sample_df.shape)
    print("team_abbr unique values:", sample_df["team_abbr"].unique()[:15])
    display(sample_df.head(10))
    
    return sample_df, sample_errors

In [None]:
# scrape FantasyPros weekly WR basic stats 
def wr_scrape_fp_basic_stats(
    year_week_pairs,
    save_csv_path=None,
    sleep_range=(0.35, 0.85),
    timeout=20
):
    """
    Scrape FantasyPros weekly WR *basic* stats for all (year, week) pairs provided.
    Expects year_week_pairs from generate_year_week_combinations(...) so preseason is skipped
    and in-season weeks are capped at current_week.

    Parameters
    ----------
    year_week_pairs : iterable of (year, week)
        Pages to request, one HTTP GET per pair.
    save_csv_path : str or None
        When truthy, the resulting frame is written there with DataFrame.to_csv.
    sleep_range : (float, float)
        Bounds of the random pause taken after every request (success or failure).
    timeout : number
        Per-request timeout passed to requests.

    Returns
    -------
    df : pandas.DataFrame
        One row per table row: season, season_type (always "REG"), week,
        fantasypros_id, player_name, team_abbr (None when no team was found),
        plus one column of raw text per table header.
    errors : list[dict]
        One dict (year, week, url, error) per page that had no table or raised.
    """

    # BASIC stats page
    url_tpl = "https://www.fantasypros.com/nfl/stats/wr.php?year={y}&week={w}&range=week"
    player_link_re = re.compile(r"\bfp-player-link\b")
    team_paren_re = re.compile(r"\(([A-Z]{2,4})\)")

    rows, errors = [], []

    def _extract_fp_id(a_tag):
        """Player id from an 'fp-id-<n>' class, a data-player-id attribute, or the href."""
        if not a_tag: return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m: return m.group(1)
        if a_tag.has_attr("data-player-id"): return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m: return m.group(1)
        return None

    def _normalize_team(t):
        """Upper-case and alias a team code; None for missing or implausible values."""
        t = (t or "").upper().strip()
        alias = {"JAX":"JAC", "WSH":"WAS", "LAR":"LA", "STL":"LA", "OAK":"LV", "SD":"LAC"}
        if t in {"FANTASYPROS", "FANTASY PROS", "FANTASY-PROS", "FP", ""}:
            return None
        if t != "FA" and not (2 <= len(t) <= 4 and t.isalpha()):
            return None
        return alias.get(t, t)

    def _find_player_cell(tr, tds, header_idx):
        """Cell holding the player link; falls back to the whole row when none is found."""
        if header_idx is not None and header_idx < len(tds):
            return tds[header_idx]
        for td in tds:
            if td.find("a", class_=player_link_re):
                return td
        return tr

    # The context manager closes the session's pooled connections when done
    with requests.Session() as sess:
        sess.headers.update({
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        })

        for (year, week) in year_week_pairs:
            url = url_tpl.format(y=year, w=week)
            try:
                resp = sess.get(url, timeout=timeout)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                table = soup.find("table")
                if table is None:
                    errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                    continue

                thead = table.find("thead")
                headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]
                # The header position of the player column is the same for every row of the page
                header_player_idx = next(
                    (i for i, h in enumerate(headers) if h.lower() == "player"), None
                )

                tbody = table.find("tbody")
                for tr in (tbody.find_all("tr") if tbody else []):
                    tds = tr.find_all(["td", "th"])
                    if not tds:
                        continue

                    player_td = _find_player_cell(tr, tds, header_player_idx)
                    a = player_td.find("a", class_=player_link_re)
                    fantasypros_id = _extract_fp_id(a)
                    player_name = a.get_text(strip=True) if a else None

                    # Team comes from player-cell text such as "Jayden Reed(GB)"
                    m = team_paren_re.search(player_td.get_text(" ", strip=True))
                    team_abbr = _normalize_team(m.group(1) if m else None)

                    rec = {
                        "season": year,
                        "season_type": "REG",
                        "week": week,
                        "fantasypros_id": fantasypros_id,
                        "player_name": player_name,
                        "team_abbr": team_abbr,
                    }
                    # NOTE(review): if two headers share a label, the later cell
                    # overwrites the earlier one here - confirm against the live table.
                    for col, val in zip(headers, (td.get_text(strip=True) for td in tds)):
                        rec[col] = val
                    rows.append(rec)

            except Exception as e:
                # Best effort: record the failure and move on to the next page
                errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            finally:
                # Pause once after every request, whatever its outcome
                time.sleep(uniform(*sleep_range))

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"]   = pd.to_numeric(df["week"],   errors="coerce").astype("Int64")
        # Drop repeats of the same player-week. Rows without an id are kept:
        # missing ids compare equal to each other, so de-duplicating on them
        # would collapse unrelated rows into one.
        is_repeat = df["fantasypros_id"].notna() & df.duplicated(
            subset=["season", "week", "fantasypros_id"], keep="first"
        )
        df = df[~is_repeat]

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [None]:
# Visual spot-check: scrape one week of basic stats (save_csv_path=None, so nothing is written)
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_basic_stats, (2024, 1), save_csv_path=None
)


In [None]:
# output: a dataframe of WR basic stats
wr_fp_basic_stats_df, fp_basic_errors = wr_scrape_fp_basic_stats(
    year_week_pairs,
    save_csv_path="wr_fp_basic_stats.csv"
)

# Surface pages that failed so gaps in the scrape are not silently ignored
print(f"Pages with scrape errors: {len(fp_basic_errors)}")
if fp_basic_errors:
    display(pd.DataFrame(fp_basic_errors))

print(f"Shape: {wr_fp_basic_stats_df.shape}")
display(wr_fp_basic_stats_df)

display(check_nulls(wr_fp_basic_stats_df, name="FantasyPros WR Basic Stats"))

In [None]:
# Scrape FantasyPros weekly WR advanced stats
def wr_scrape_fp_adv_stats(
    year_week_pairs,
    save_csv_path=None,
    sleep_range=(0.35, 0.85),
    timeout=20
):
    """
    Scrape FantasyPros weekly WR *advanced* stats for all (year, week) pairs provided.

    Parameters
    ----------
    year_week_pairs : iterable of (year, week)
        Pages to request, one HTTP GET per pair.
    save_csv_path : str or None
        When truthy, the resulting frame is written there with DataFrame.to_csv.
    sleep_range : (float, float)
        Bounds of the random pause taken after every request (success or failure).
    timeout : number
        Per-request timeout passed to requests.

    Returns
    -------
    df : pandas.DataFrame
        One row per table row: season, season_type (always "REG"), week,
        fantasypros_id, player_name, team_abbr (None when no plausible team
        code was found), plus one column of raw text per table header.
    errors : list[dict]
        One dict (year, week, url, error) per page that had no table or raised.
    """
    url_tpl = "https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={y}&week={w}&range=week&type=reg&mode=pergame"
    player_link_re = re.compile(r"\bfp-player-link\b")
    team_paren_re = re.compile(r"\(([A-Z]{2,4})\)")

    rows, errors = [], []

    def _extract_fp_id(a_tag):
        """Player id from an 'fp-id-<n>' class, a data-player-id attribute, or the href."""
        if not a_tag: return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m: return m.group(1)
        if a_tag.has_attr("data-player-id"): return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m: return m.group(1)
        return None

    def _normalize_team(t):
        """Upper-case and alias a team code; None for missing or implausible values."""
        t = (t or "").upper().strip()
        alias = {"JAX": "JAC", "WSH": "WAS", "LAR": "LA", "STL": "LA", "OAK": "LV", "SD": "LAC"}
        if t in {"FANTASYPROS", "FANTASY PROS", "FANTASY-PROS", "FP", ""}:
            return None
        if t != "FA" and not (2 <= len(t) <= 4 and t.isalpha()):
            return None
        return alias.get(t, t)

    def _find_player_cell(tr, tds, header_idx):
        """Cell holding the player link; falls back to the whole row when none is found."""
        if header_idx is not None and header_idx < len(tds):
            return tds[header_idx]
        for td in tds:
            if td.find("a", class_=player_link_re):
                return td
        return tr

    # The context manager closes the session's pooled connections when done
    with requests.Session() as sess:
        sess.headers.update({
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        })

        for (year, week) in year_week_pairs:
            url = url_tpl.format(y=year, w=week)
            try:
                resp = sess.get(url, timeout=timeout)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                table = soup.find("table")
                if table is None:
                    errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                    continue

                thead = table.find("thead")
                headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]
                # Column positions are the same for every row of the page
                header_player_idx = next(
                    (i for i, h in enumerate(headers) if h.lower() == "player"), None
                )
                team_col_idx = headers.index("Team") if "Team" in headers else None

                tbody = table.find("tbody")
                for tr in (tbody.find_all("tr") if tbody else []):
                    tds = tr.find_all(["td", "th"])
                    if not tds:
                        continue

                    # Locate the player cell (by header name or anchor class)
                    player_td = _find_player_cell(tr, tds, header_player_idx)
                    a = player_td.find("a", class_=player_link_re)
                    fantasypros_id = _extract_fp_id(a)
                    player_name = a.get_text(strip=True) if a else None

                    # --- TEAM EXTRACTION (priority order) ---
                    team_abbr = None
                    # 1) "Team" column if present (and the row is long enough)
                    if team_col_idx is not None and team_col_idx < len(tds):
                        team_abbr = tds[team_col_idx].get_text(strip=True)
                    # 2) small/span near player name like "(MIA)"; the team-specific
                    #    span is tried before the generic one so it can actually match
                    if not team_abbr:
                        tag = (player_td.select_one("small")
                               or player_td.select_one('span[class*="team"]')
                               or player_td.select_one("span"))
                        if tag:
                            txt = tag.get_text(strip=True)
                            m = team_paren_re.search(txt)
                            team_abbr = m.group(1) if m else txt
                    # 3) logo alt/title
                    # NOTE(review): find_next is not restricted to the player cell - confirm
                    if not team_abbr and a:
                        img = a.find_next("img")
                        if img:
                            team_abbr = img.get("alt") or img.get("title")
                    # 4) regex fallback on full player cell text
                    if not team_abbr:
                        m = team_paren_re.search(player_td.get_text(" ", strip=True))
                        if m:
                            team_abbr = m.group(1)

                    team_abbr = _normalize_team(team_abbr)

                    # Build row dict
                    rec = {
                        "season": year,
                        "season_type": "REG",
                        "week": week,
                        "fantasypros_id": fantasypros_id,
                        "player_name": player_name,
                        "team_abbr": team_abbr,
                    }
                    # NOTE(review): if two headers share a label, the later cell
                    # overwrites the earlier one here - confirm against the live table.
                    for col, val in zip(headers, (td.get_text(strip=True) for td in tds)):
                        rec[col] = val
                    rows.append(rec)

            except Exception as e:
                # Best effort: record the failure and move on to the next page
                errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            finally:
                # Pause once after every request, whatever its outcome
                time.sleep(uniform(*sleep_range))

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
        # Drop repeats of the same player-week. Rows without an id are kept:
        # missing ids compare equal to each other, so de-duplicating on them
        # would collapse unrelated rows into one.
        is_repeat = df["fantasypros_id"].notna() & df.duplicated(
            subset=["season", "week", "fantasypros_id"], keep="first"
        )
        df = df[~is_repeat]

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [None]:
# Visual spot-check: scrape one week of advanced stats (save_csv_path=None, so nothing is written)
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_adv_stats, (2024, 1), save_csv_path=None
)


In [None]:
# output: a dataframe of WR fantasypros advanced stats
wr_fp_advanced_stats_df, fp_errors = wr_scrape_fp_adv_stats(
    year_week_pairs,
    save_csv_path="wr_fp_advanced_stats.csv"
)

# Surface pages that failed so gaps in the scrape are not silently ignored
print(f"Pages with scrape errors: {len(fp_errors)}")
if fp_errors:
    display(pd.DataFrame(fp_errors))

# ✅ Updated null analysis using helper function
print(f"Shape of FantasyPros WR Advanced Stats DataFrame: {wr_fp_advanced_stats_df.shape}")

display(wr_fp_advanced_stats_df.head(25))
display(check_nulls(wr_fp_advanced_stats_df, name="FantasyPros WR Advanced Stats"))

In [None]:
# scrape WR fantasypros redzone stats 
def wr_scrape_fp_rz_stats(
    year_week_pairs,
    save_csv_path=None,
    sleep_range=(0.35, 0.85),
    timeout=20
):
    """
    Scrape FantasyPros weekly WR *red zone* stats for all (year, week) pairs provided.

    Parameters
    ----------
    year_week_pairs : iterable of (year, week)
        Pages to request, one HTTP GET per pair.
    save_csv_path : str or None
        When truthy, the resulting frame is written there with DataFrame.to_csv.
    sleep_range : (float, float)
        Bounds of the random pause taken after every request (success or failure).
    timeout : number
        Per-request timeout passed to requests.

    Returns
    -------
    df : pandas.DataFrame
        One row per table row: season, season_type (always "REG"), week,
        fantasypros_id, player_name, team_abbr (None when no plausible team
        code was found), plus one column of raw text per table header.
    errors : list[dict]
        One dict (year, week, url, error) per page that had no table or raised.
    """
    url_tpl = "https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={y}&week={w}&range=week"
    player_link_re = re.compile(r"\bfp-player-link\b")
    team_paren_re = re.compile(r"\(([A-Z]{2,4})\)")

    rows, errors = [], []

    def _extract_fp_id(a_tag):
        """Player id from an 'fp-id-<n>' class, a data-player-id attribute, or the href."""
        if not a_tag:
            return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m:
                return m.group(1)
        if a_tag.has_attr("data-player-id"):
            return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m:
                return m.group(1)
        return None

    def _normalize_team(t):
        """Upper-case and alias a team code; None for missing or implausible values."""
        t = (t or "").upper().strip()
        alias = {"JAX":"JAC", "WSH":"WAS", "LAR":"LA", "STL":"LA", "OAK":"LV", "SD":"LAC"}
        if t in {"FANTASYPROS", "FANTASY PROS", "FANTASY-PROS", "FP", ""}:
            return None
        if t != "FA" and not (2 <= len(t) <= 4 and t.isalpha()):
            return None
        return alias.get(t, t)

    def _find_player_cell(tr, tds, header_idx):
        """Cell holding the player link; falls back to the whole row when none is found."""
        if header_idx is not None and header_idx < len(tds):
            return tds[header_idx]
        for td in tds:
            if td.find("a", class_=player_link_re):
                return td
        return tr

    # The context manager closes the session's pooled connections when done
    with requests.Session() as sess:
        sess.headers.update({
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        })

        for (year, week) in year_week_pairs:
            url = url_tpl.format(y=year, w=week)
            try:
                resp = sess.get(url, timeout=timeout)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                table = soup.find("table")
                if table is None:
                    errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                    continue

                thead = table.find("thead")
                headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]
                # Column positions are the same for every row of the page
                header_player_idx = next(
                    (i for i, h in enumerate(headers) if h.lower() == "player"), None
                )
                team_col_idx = headers.index("Team") if "Team" in headers else None

                tbody = table.find("tbody")
                for tr in (tbody.find_all("tr") if tbody else []):
                    tds = tr.find_all(["td", "th"])
                    if not tds:
                        continue

                    # locate player cell
                    player_td = _find_player_cell(tr, tds, header_player_idx)
                    a = player_td.find("a", class_=player_link_re)
                    fantasypros_id = _extract_fp_id(a)
                    player_name = a.get_text(strip=True) if a else None

                    # --- TEAM extraction (priority order) ---
                    team_abbr = None

                    # 1) Regex directly from full player cell text
                    m = team_paren_re.search(player_td.get_text(" ", strip=True))
                    if m:
                        team_abbr = m.group(1)

                    # 2) 'Team' column if present (and the row is long enough)
                    if not team_abbr and team_col_idx is not None and team_col_idx < len(tds):
                        team_abbr = tds[team_col_idx].get_text(strip=True)

                    # 3) small/span near player name; the team-specific span is
                    #    tried before the generic one so it can actually match
                    if not team_abbr:
                        tag = (player_td.select_one("small")
                               or player_td.select_one('span[class*="team"]')
                               or player_td.select_one("span"))
                        if tag:
                            txt = tag.get_text(strip=True)
                            m = team_paren_re.search(txt)
                            team_abbr = m.group(1) if m else txt

                    # 4) logo alt/title
                    # NOTE(review): find_next is not restricted to the player cell - confirm
                    if not team_abbr and a:
                        img = a.find_next("img")
                        if img:
                            team_abbr = img.get("alt") or img.get("title")

                    team_abbr = _normalize_team(team_abbr)

                    # record
                    rec = {
                        "season": year,
                        "season_type": "REG",
                        "week": week,
                        "fantasypros_id": fantasypros_id,
                        "player_name": player_name,
                        "team_abbr": team_abbr,
                    }
                    # NOTE(review): if two headers share a label, the later cell
                    # overwrites the earlier one here - confirm against the live table.
                    for col, val in zip(headers, (td.get_text(strip=True) for td in tds)):
                        rec[col] = val
                    rows.append(rec)

            except Exception as e:
                # Best effort: record the failure and move on to the next page
                errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            finally:
                # Pause once after every request, whatever its outcome
                time.sleep(uniform(*sleep_range))

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"]   = pd.to_numeric(df["week"],   errors="coerce").astype("Int64")
        # Drop repeats of the same player-week. Rows without an id are kept:
        # missing ids compare equal to each other, so de-duplicating on them
        # would collapse unrelated rows into one.
        is_repeat = df["fantasypros_id"].notna() & df.duplicated(
            subset=["season", "week", "fantasypros_id"], keep="first"
        )
        df = df[~is_repeat]

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [None]:
# Visual spot-check: scrape one week of red zone stats (save_csv_path=None, so nothing is written)
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_rz_stats, (2024, 1), save_csv_path=None
)


In [None]:
# output: a dataframe of WR fantasypros red zone stats
wr_fp_rz_stats_df, fp_rz_errors = wr_scrape_fp_rz_stats(
    year_week_pairs,
    save_csv_path="wr_fp_rz_stats.csv"
)

# Surface pages that failed so gaps in the scrape are not silently ignored
print(f"Pages with scrape errors: {len(fp_rz_errors)}")
if fp_rz_errors:
    display(pd.DataFrame(fp_rz_errors))

# 📊 Display shape
print(f"Shape of FantasyPros WR Red Zone Stats DataFrame: {wr_fp_rz_stats_df.shape}")

# 👀 Display first few rows
display(wr_fp_rz_stats_df.head(25))

# 🔍 Display missing value summary
display(check_nulls(wr_fp_rz_stats_df, name="FantasyPros WR Red Zone Stats"))

In [None]:
# Column names of the three FantasyPros frames
basic_stats_cols = list(wr_fp_basic_stats_df.columns)
advanced_stats_cols = list(wr_fp_advanced_stats_df.columns)
redzone_stats_cols = list(wr_fp_rz_stats_df.columns)

# Side-by-side view, one column of names per frame (shorter lists are padded with NaN)
comparison_df = pd.DataFrame({
    label: pd.Series(cols)
    for label, cols in (
        ("Basic Stats", basic_stats_cols),
        ("Advanced Stats", advanced_stats_cols),
        ("Red Zone Stats", redzone_stats_cols),
    )
})
comparison_df

In [None]:
# ✅ Shapes of the three FantasyPros frames before merging (one line each)
print(
    f"📊 **Shape of WR Basic Stats DataFrame:** {wr_fp_basic_stats_df.shape}",
    f"\n📊 **Shape of WR Advanced Stats DataFrame:** {wr_fp_advanced_stats_df.shape}",
    f"📊 **Shape of WR Red Zone Stats DataFrame:** {wr_fp_rz_stats_df.shape}",
    sep="\n",
)

In [None]:
# Null summaries, one per FantasyPros frame (each call prints its own header line)
basic_stats_nulls = check_nulls(wr_fp_basic_stats_df, name="FantasyPros Basic Stats")
advanced_stats_nulls = check_nulls(wr_fp_advanced_stats_df, name="FantasyPros Advanced Stats")
redzone_nulls = check_nulls(wr_fp_rz_stats_df, name="FantasyPros Red Zone Stats")

# Stack the three summaries under an outer index level naming the source frame
combined_nulls = pd.concat(
    objs=[basic_stats_nulls, advanced_stats_nulls, redzone_nulls],
    keys=["Basic Stats", "Advanced Stats", "Red Zone Stats"],
)
combined_nulls

In [None]:
### End:fantasypros webscraping ###

In [None]:
## Begin: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##
# ** Files must be in the local directory ** NFL-20xx-DFS-Dataset.xlsx

In [None]:
# create a helper function to clean the DFS salary data column names
#    - Cleans and flattens multi-index column names for DFS salary Excel files:
#    - Joins tuples if multi-index
#    - Removes special characters
#    - Normalizes spaces
#    - Converts to lowercase for matching
def clean_column_dfs(col):
    if isinstance(col, tuple):
        col = ' '.join(str(x) for x in col if x)

    return (
        str(col)
        .replace('\n', ' ')
        .replace('(', '')
        .replace(')', '')
        .replace('"', '')
        .replace('#', '')
        .replace('$', '')
        .replace('/', '')
        .replace('-', ' ')
        .strip()
        .lower()
        .replace('  ', ' ')
        .replace('   ', ' ')
    )

In [None]:
# Preview load of one season's workbook: read the two-row header, then
# flatten every header tuple with clean_column_dfs.
filepath = 'NFL-2024-DFS-Dataset.xlsx'
dfs_raw = pd.read_excel(filepath, header=[0, 1])
original_row_count = dfs_raw.shape[0]

dfs_raw.columns = list(map(clean_column_dfs, dfs_raw.columns))
dfs_raw.head()  # Optional preview

In [None]:
# Helper that builds the FanDuel / DraftKings salary dataframe for one season
# workbook (all positions) and reports data-validation results.

def create_DFS_dataframe(filepath, year):
    """Load one BigDataBall DFS workbook and return a tidy salary frame.

    Parameters
    ----------
    filepath : str
        Path to an ``NFL-<year>-DFS-Dataset.xlsx`` workbook with a two-row header.
    year : str or int
        Season year of the workbook; used for the expected-week count and
        echoed in the validation record.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The cleaned frame (columns: season, week, player_id, player,
        dk_position, fd_position, team, opponent, dk_salary, fd_salary,
        dk_fpts, fd_fpts) and a validation record with row counts, week
        range and 'Passed'/'Failed' flags.

    Raises
    ------
    KeyError
        If any expected (cleaned) column name is missing from the workbook.
    """
    # Step 1: read the workbook and flatten the two-level header
    dfs_raw = pd.read_excel(filepath, header=[0, 1])
    original_row_count = len(dfs_raw)
    dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]

    # Step 2: internal name -> cleaned workbook header
    expected_cols = {
        'player': 'game information player dst',
        'week': 'game information week',
        'date': 'game information date',
        'player_id': 'game information player id',
        'team': 'game information team',
        'opponent': 'game information opponent',
        'dk_position': 'position draftkings',
        'fd_position': 'position fanduel',
        'dk_salary': 'salary for draftkings classic contests',
        'fd_salary': 'salary for fanduel full roster contests',
        'dk_fpts': 'fantasy points scored draftkings',
        'fd_fpts': 'fantasy points scored fanduel'
    }

    # Fail with a message naming the file and the missing headers
    # (same exception type pandas would raise on the subset below).
    missing_cols = [c for c in expected_cols.values() if c not in dfs_raw.columns]
    if missing_cols:
        raise KeyError(f"{filepath}: expected columns not found after cleaning: {missing_cols}")

    # Subset with the cleaned names, then rename to the short identifiers
    dfs_subset = dfs_raw[list(expected_cols.values())].copy()
    dfs_subset.columns = list(expected_cols.keys())

    dfs_subset['date'] = pd.to_datetime(dfs_subset['date'])

    # DST rows carry a team code in player_id; map three-letter codes to the
    # short codes used elsewhere in the notebook.
    team_abbreviation_mapping = {
        'NWE': 'NE',
        'SFO': 'SF',
        'OAK': 'LV',
        'KAN': 'KC',
        'TAM': 'TB',
        'NOR': 'NO',
        'LAR': 'LA',
        'GNB': 'GB'
    }
    mask_dst = dfs_subset['dk_position'] == 'DST'
    dfs_subset.loc[mask_dst, 'player_id'] = dfs_subset.loc[mask_dst, 'player_id'].replace(team_abbreviation_mapping)

    # Season = calendar year of the game, except that January/February games
    # always belong to the season that started the previous autumn. The week
    # number is deliberately not consulted: late regular-season weeks can
    # also fall in early January. Rows with no date end up with NaN here.
    game_year = dfs_subset['date'].dt.year
    game_month = dfs_subset['date'].dt.month
    dfs_subset['season'] = game_year.where(game_month > 2, game_year - 1)

    # Track NaNs before dropping
    season_nulls_before = int(dfs_subset['season'].isna().sum())

    dfs_subset = dfs_subset.dropna(subset=['season']).copy()
    dfs_subset['season'] = dfs_subset['season'].astype(int)

    season_nulls_after = int(dfs_subset['season'].isna().sum())

    print(f"🔎 Season NaN rows dropped: {season_nulls_before}")
    print(f"Remaining NaN rows (should be 0): {season_nulls_after}")

    dfs_subset = dfs_subset.drop(columns=['date'])

    # Non-numeric salaries become NaN; rows missing either salary are dropped
    dfs_subset['dk_salary'] = pd.to_numeric(dfs_subset['dk_salary'], errors='coerce')
    dfs_subset['fd_salary'] = pd.to_numeric(dfs_subset['fd_salary'], errors='coerce')
    salary_missing = dfs_subset[['dk_salary', 'fd_salary']].isna().any(axis=1)
    salary_nulls = int(salary_missing.sum())
    dfs_subset = dfs_subset.loc[~salary_missing].copy()
    dfs_subset['dk_salary'] = dfs_subset['dk_salary'].astype(int)
    dfs_subset['fd_salary'] = dfs_subset['fd_salary'].astype(int)
    dfs_subset['week'] = dfs_subset['week'].astype(int)

    dfs_subset = dfs_subset[['season', 'week', 'player_id', 'player', 'dk_position', 'fd_position',
                             'team', 'opponent', 'dk_salary', 'fd_salary', 'dk_fpts', 'fd_fpts']]

    unique_weeks = dfs_subset['week'].nunique()
    min_week = dfs_subset['week'].min()
    max_week = dfs_subset['week'].max()
    expected_weeks = 21 if int(year) <= 2020 else 22

    rows_dropped = original_row_count - len(dfs_subset)

    print(f"\nProcessing file: {filepath}")
    print(f"Original rows in xlsx file: {original_row_count}")
    print(f"Number of players with no salary data found in xlsx: {salary_nulls}")
    print(f"Rows in csv file after dropping NaNs: {len(dfs_subset)}")

    # Reconcile independently counted drops against the overall row
    # difference, so an unexpected row loss elsewhere shows up as a failure.
    if rows_dropped == season_nulls_before + salary_nulls:
        print("✅ Salary Validation passed: Counts match after dropping NaNs.")
        salary_validation = 'Passed'
    else:
        print("❌ Salary Validation failed: Counts mismatch!")
        salary_validation = 'Failed'

    print(f"Weeks detected: {min_week} to {max_week}")
    print(f"Total unique weeks found: {unique_weeks}")
    print("🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.")

    # One missing week is tolerated (see reminder above)
    if unique_weeks == expected_weeks or unique_weeks == expected_weeks - 1:
        print(f"✅ Week Validation passed: {unique_weeks} weeks found (expected {expected_weeks}).\n")
        week_validation = 'Passed'
    else:
        print(f"❌ Week Validation failed: {unique_weeks} weeks found, expected {expected_weeks}.\n")
        week_validation = 'Failed'

    return dfs_subset, {
        'year': int(year),
        'original_rows': original_row_count,
        'nan_rows': rows_dropped,  # all dropped rows (season NaN + salary NaN)
        'rows_after_drop': len(dfs_subset),
        'min_week': min_week,
        'max_week': max_week,
        'unique_weeks': unique_weeks,
        'expected_weeks': expected_weeks,
        'salary_validation': salary_validation,
        'week_validation': week_validation
    }

In [None]:
# ** dataframe of Fanduel and Draft Kings Salaries FOR all positions **

# Main control flow: runs create_DFS_dataframe on every season workbook found
# in the working directory.
# output: nfl_fd_dk_salary_combined (all seasons) and validation_summary_df
# output: optional combined CSV (see SAVE_COMBINED_SALARY_CSV)

# Find all matching files
file_list = sorted(glob.glob('NFL-*-DFS-Dataset.xlsx'))

# The combined CSV is not written by default. Set to True to write it; an
# existing file is then copied to a timestamped backup first.
SAVE_COMBINED_SALARY_CSV = False

# Handle if no files found
if not file_list:
    print("❌ No xlsx files detected.\nPlease download and place the BigDataBall NFL DFS Excel files into the same directory as this Jupyter Notebook file.")
else:
    # Process each file
    all_years_dfs = []
    validation_records = []
    file_years = []

    for file in file_list:
        year = file.split('-')[1]  # 'NFL-2024-DFS-Dataset.xlsx' -> '2024'
        file_years.append(int(year))

        year_df, validation_info = create_DFS_dataframe(file, year)

        # ** csv file ***
        # Save per-year CSV
        # year_df.to_csv(f'nfl_fd_dk_salary_{year}.csv', index=False)

        # Append to master list
        all_years_dfs.append(year_df)
        validation_records.append(validation_info)

    # Create validation summary DataFrame
    validation_summary_df = pd.DataFrame(validation_records)
    print("\n📋 Validation Summary:")
    display(validation_summary_df)

    # Combine all years into one big dataframe
    nfl_fd_dk_salary_combined = pd.concat(all_years_dfs, ignore_index=True)

    # Latest season is taken from the workbook file names
    current_season = max(file_years)

    final_filename = f'nfl_fd_dk_salary_2017_{current_season}.csv'

    if SAVE_COMBINED_SALARY_CSV:
        # Back up only when the file is really about to be overwritten
        if os.path.exists(final_filename):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f'nfl_fd_dk_salary_2017_{current_season}_backup_{timestamp}.csv'
            shutil.copy(final_filename, backup_filename)
            print(f"🛡️ Backup created: {backup_filename}")

        # *** csv file ***
        nfl_fd_dk_salary_combined.to_csv(final_filename, index=False)
        print(f"\n✅ Final combined CSV saved as: {final_filename}")
    else:
        # Do not claim a save that did not happen
        print(f"\nℹ️ Combined CSV export is disabled; nothing was written to {final_filename}")

    # Display a quick preview
    display(nfl_fd_dk_salary_combined.head())

In [None]:
# ** WR dataframe of Fanduel and Draft Kings player Salaries **

# Latest season present in the combined salary data (used in the file name)
current_season = nfl_fd_dk_salary_combined['season'].max()

# Keep rows whose DraftKings position is WR. The explicit .copy() makes this
# an independent frame: later cells assign to its columns in place, which on
# a bare slice triggers SettingWithCopyWarning.
wr_fd_dk_salary_2017_current_df = nfl_fd_dk_salary_combined.loc[
    nfl_fd_dk_salary_combined['dk_position'] == 'WR'
].copy()

# *** csv file ***
wr_csv_filename = f'wr_fd_dk_salary_2017_{current_season}.csv'
wr_fd_dk_salary_2017_current_df.to_csv(wr_csv_filename, index=False)

print(f"✅ WR DFS dataframe created and saved as {wr_csv_filename}")

# Optional: Display a quick preview
display(wr_fd_dk_salary_2017_current_df.head())


In [None]:
## End: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##

In [None]:
# All dataframes - no features and no salary
# wr_ids_weekly_stats_df
# wr_ngs_df
# wr_fp_basic_stats_df
# wr_fp_advanced_stats_df
# wr_fp_rz_stats_df
# wr_fd_dk_salary_2017_current_df

In [None]:
## Begin: team abbreviation standardization ##

In [None]:
# Column names of every WR dataframe currently in memory
ids_weekly_cols = wr_ids_weekly_stats_df.columns.tolist()
ngs_cols = wr_ngs_df.columns.tolist()
fp_basic_cols = wr_fp_basic_stats_df.columns.tolist()
fp_adv_cols = wr_fp_advanced_stats_df.columns.tolist()
fp_rz_cols = wr_fp_rz_stats_df.columns.tolist()
dfs_fd_dk_cols = wr_fd_dk_salary_2017_current_df.columns.tolist()

# One output column per dataframe; each list is wrapped in a Series so lists
# of different lengths are padded with NaN.
all_column_lists = [
    ("IDs & Weekly Stats", ids_weekly_cols),
    ("NGS Stats", ngs_cols),
    ("FantasyPros Basic", fp_basic_cols),
    ("FantasyPros Adv", fp_adv_cols),
    ("FantasyPros RZ", fp_rz_cols),
    ("DFS FD DK", dfs_fd_dk_cols),
]
comparison_df = pd.DataFrame(
    {heading: pd.Series(names) for heading, names in all_column_lists}
)

comparison_df

In [None]:
def show_team_uniques():
    """Print the sorted unique team codes found in each WR dataframe.

    For every dataframe the first candidate column that exists is used; its
    non-null values are trimmed, upper-cased and stripped of dots and spaces
    before de-duplication. A warning is printed when no candidate exists.
    """
    def _normalized(series):
        as_text = series.astype('string').str.strip().str.upper()
        without_dots = as_text.str.replace(".", "", regex=False)
        return without_dots.str.replace(" ", "", regex=False)

    # (label, dataframe, candidate team columns)
    sources = [
        ("wr_ids_weekly_stats_df.recent_team", wr_ids_weekly_stats_df, ['recent_team']),
        ("wr_ngs_df.team_abbr", wr_ngs_df, ['team_abbr']),
        ("wr_fp_basic_stats_df.team", wr_fp_basic_stats_df, ['team_abbr']),
        ("wr_fp_advanced_stats_df.team", wr_fp_advanced_stats_df, ['team_abbr']),
        ("wr_fp_rz_stats_df.team", wr_fp_rz_stats_df, ['team_abbr']),
        ("wr_fd_dk_salary_2017_current_df.team", wr_fd_dk_salary_2017_current_df, ['team']),
    ]

    for label, frame, candidates in sources:
        print(f"\n{label}")

        team_col = None
        for candidate in candidates:
            if candidate in frame.columns:
                team_col = candidate
                break

        if team_col is None:
            print(f"  ⚠️ No team column found in {candidates}")
        else:
            vals = sorted(_normalized(frame[team_col].dropna()).unique())
            print(f"  column: {team_col} | uniques ({len(vals)}):")
            print(vals)

# Call to preview all six
show_team_uniques()

In [None]:
# Unique, cleaned team codes of one column
def get_team_set(df, col):
    """Return the set of non-null values of df[col], trimmed and upper-cased."""
    present = df[col].dropna()
    cleaned = present.astype(str).str.strip().str.upper()
    return {code for code in cleaned}

# Baseline: team codes in the nfl_data_py weekly stats frame
baseline_set = get_team_set(wr_ids_weekly_stats_df, 'recent_team')
print(f"Baseline (wr_ids_weekly_stats_df.recent_team) — {len(baseline_set)} uniques:\n{sorted(baseline_set)}\n")

# Frames to compare against the baseline. Holding the objects directly
# avoids a globals() lookup by name.
frames_to_compare = {
    "wr_ngs_df": wr_ngs_df,
    "wr_fp_basic_stats_df": wr_fp_basic_stats_df,
    "wr_fp_advanced_stats_df": wr_fp_advanced_stats_df,
    "wr_fp_rz_stats_df": wr_fp_rz_stats_df,
    "wr_fd_dk_salary_2017_current_df": wr_fd_dk_salary_2017_current_df,
}

# Other cells address the team column as 'team_abbr' in the NGS and
# FantasyPros frames and as 'team' in the salary frame, so accept either.
team_col_candidates = ("team_abbr", "team")

# Dataframe name -> team column found in it (None when neither exists)
compare_map = {
    name: next((c for c in team_col_candidates if c in frame.columns), None)
    for name, frame in frames_to_compare.items()
}

for name, col in compare_map.items():
    if col is None:
        print(f"{name}: ⚠️ No team column found or mismatch\n")
        continue

    other_set = get_team_set(frames_to_compare[name], col)
    diff_from_baseline = other_set - baseline_set
    diff_in_baseline = baseline_set - other_set
    print(f"{name}.{col}:")
    print(f"  Unique values: {len(other_set)}")
    print(f"  In {name} but not in baseline: {sorted(diff_from_baseline) if diff_from_baseline else 'None'}")
    print(f"  In baseline but not in {name}: {sorted(diff_in_baseline) if diff_in_baseline else 'None'}\n")


In [None]:
# standardize team abbreviations

In [None]:

# base mapping: full team names -> canonical abbreviations
# (the Rams are 'LA' throughout, matching the 'LAR' -> 'LA' fixup below)
full_name_to_abbr = {
    'Arizona Cardinals': 'ARI', 'Atlanta Falcons': 'ATL', 'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF', 'Carolina Panthers': 'CAR', 'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN', 'Cleveland Browns': 'CLE', 'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN', 'Detroit Lions': 'DET', 'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU', 'Indianapolis Colts': 'IND', 'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC', 'Las Vegas Raiders': 'LV', 'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA', 'Miami Dolphins': 'MIA', 'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE', 'New Orleans Saints': 'NO', 'New York Giants': 'NYG',
    'New York Jets': 'NYJ', 'Philadelphia Eagles': 'PHI', 'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF', 'Seattle Seahawks': 'SEA', 'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN', 'Washington Commanders': 'WAS',

    # legacy names (mapped to the franchise's current canonical code)
    # A dict-based Series.replace is a single pass, so a target of 'LAR'
    # would not be rewritten to 'LA' afterwards; it must be 'LA' here.
    'St. Louis Rams': 'LA',
    'San Diego Chargers': 'LAC',
    'Oakland Raiders': 'LV',
    'Washington Football Team': 'WAS',
    'Washington Redskins': 'WAS',

    # Free agent placeholder
    'Free Agent': 'FA'

}

# --- Upper-cased full names (spaces and punctuation kept) ---
alias_map = {name.upper(): abbr for name, abbr in full_name_to_abbr.items()}

# --- Compressed aliases: upper-cased, everything except A-Z / 0-9 removed ---
for name, abbr in full_name_to_abbr.items():
    no_space = re.sub(r'[^A-Z0-9]', '', name.upper())
    alias_map[no_space] = abbr

# --- Add free agent compressed form ---
alias_map['FREEAGENT'] = 'FA'

# --- Abbreviation fixups (site quirks, alternate short codes) ---
abbr_fixes = {
    'ARZ': 'ARI', 'TBB': 'TB', 'NEP': 'NE', 'GBP': 'GB',
    'KCC': 'KC', 'SFF': 'SF', 'NOS': 'NO', 'JAC': 'JAX',
    'LAR': 'LA', 'LVR': 'LV', 'WSH': 'WAS', 'WFT': 'WAS',
    # three-letter codes already translated for DST ids in create_DFS_dataframe
    'NWE': 'NE', 'SFO': 'SF', 'OAK': 'LV', 'KAN': 'KC',
    'TAM': 'TB', 'NOR': 'NO', 'GNB': 'GB'
}

# Merge fixups into alias_map so one lookup covers all cases
alias_map.update(abbr_fixes)

In [None]:
# create team abbreviation mapping logic
def standardize_team_abbr(df, col, mapping):
    """Standardize the team codes of ``df[col]`` in place.

    Non-null values are converted to text, trimmed, upper-cased, stripped of
    dots and spaces, and then replaced through ``mapping`` (exact-value
    lookup; values without a mapping entry are kept in cleaned form).
    Missing values stay missing rather than becoming the text 'NAN'/'NONE'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; nothing happens when ``col`` is not a column.
    col : str
        Name of the team column.
    mapping : dict
        Cleaned alias -> canonical abbreviation.

    Returns
    -------
    None
    """
    if col not in df.columns:
        return

    original = df[col]
    cleaned = (
        original
        .astype(str)
        .str.strip()
        .str.upper()
        .str.replace(".", "", regex=False)
        .str.replace(" ", "", regex=False)
        .replace(mapping)
    )
    # astype(str) stringifies NaN/None; mask those positions back to missing
    df[col] = cleaned.where(original.notna())

In [None]:
# (dataframe, team column) pairs that receive the alias_map standardization
datasets_to_standardize = [
    (wr_ids_weekly_stats_df, "recent_team"),
    (wr_ngs_df, "team_abbr"),
    (wr_fp_basic_stats_df, "team_abbr"),
    (wr_fp_advanced_stats_df, "team_abbr"),
    (wr_fp_rz_stats_df, "team_abbr"),
    (wr_fd_dk_salary_2017_current_df, "team")
]

# Pass 1: rewrite every team column in place
for df, col in datasets_to_standardize:
    standardize_team_abbr(df, col, alias_map)

# Pass 2: print the resulting distinct codes for each column that exists
for df, col in datasets_to_standardize:
    if col not in df.columns:
        continue
    remaining_codes = df[col].dropna().unique()
    print(f"{col} uniques in dataframe:")
    print(sorted(remaining_codes))
    print()


In [None]:
# Labelled (name, dataframe, team column) triples for a visual check of the
# team codes each frame now holds
dfs_and_cols = [
    ("IDs & Weekly", wr_ids_weekly_stats_df, "recent_team"),
    ("NGS", wr_ngs_df, "team_abbr"),
    ("FP Basic", wr_fp_basic_stats_df, "team_abbr"),
    ("FP Advanced", wr_fp_advanced_stats_df, "team_abbr"),
    ("FP RZ", wr_fp_rz_stats_df, "team_abbr"),
    ("DFS DK", wr_fd_dk_salary_2017_current_df, "team")
]

for label, df, col in dfs_and_cols:
    if col not in df.columns:
        continue
    distinct_codes = df[col].dropna().unique()
    print(f"\n[{label}] {col} uniques ({len(distinct_codes)}):")
    print(sorted(distinct_codes))


In [None]:
## End: team abbreviation standardization ##

In [None]:
## Begin: data type evaluation and normalization

In [None]:
# Dump the FantasyPros frames to CSV (no index column) for visual inspection
raw_fp_exports = (
    ('wr_fp_basic_stats.csv', wr_fp_basic_stats_df),
    ('wr_fp_advanced_stats.csv', wr_fp_advanced_stats_df),
    ('wr_fp_rz_stats.csv', wr_fp_rz_stats_df),
)
for csv_name, fp_frame in raw_fp_exports:
    fp_frame.to_csv(csv_name, index=False)

In [None]:
def strip_whitespace_columns(df):
    """Return a copy of ``df`` with object columns trimmed of whitespace.

    Every object-dtype column has its non-null values converted to text and
    stripped of leading/trailing whitespace. Missing values stay missing
    instead of being turned into the literal string 'nan'. A column that
    cannot be processed is reported and left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()
    for col in df.select_dtypes(include='object').columns:
        try:
            original = df[col]
            stripped = original.astype(str).str.strip()
            # astype(str) stringifies NaN/None; restore them as missing
            df[col] = stripped.where(original.notna())
        except Exception as e:
            print(f"⚠️ Could not strip column '{col}': {e}")
    return df


In [None]:
def lowercase_id_columns(df, keys=('fantasypros_id', 'player_name')):
    """Return a copy of ``df`` with the given key columns lower-cased.

    Non-null values are converted to text and lower-cased; missing values
    stay missing, so absent ids/names do not all collapse into the string
    'nan'. Keys that are not columns are skipped; a column that cannot be
    processed is reported and left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
    keys : iterable of str
        Column names to lower-case (default: 'fantasypros_id', 'player_name').

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()
    for key in keys:
        if key in df.columns:
            try:
                original = df[key]
                lowered = original.astype(str).str.lower()
                # astype(str) stringifies NaN/None; restore them as missing
                df[key] = lowered.where(original.notna())
            except Exception as e:
                print(f"⚠️ Could not lowercase column '{key}': {e}")
    return df


In [None]:
# # convert a column to int32, coercing invalid entries to NaN
# def clean_integer_column(df, column_name):
#     df = df.copy()
#     if column_name in df.columns:
#         try:
#             df[column_name] = (
#                 pd.to_numeric(df[column_name], errors='coerce')
#                 .astype('Int32')  # Pandas nullable integer
#             )
#         except Exception as e:
#             print(f"⚠️ Could not clean integer column '{column_name}': {e}")
#     return df


In [None]:
# Convert a column to Int32, safely handling empty strings and non-numeric entries
# *Note: int32 vs Int32 - Int32 (pandas nullable integer) can handle missing values
def clean_integer_column(df, column_name):
    """Return a copy of ``df`` with ``column_name`` converted to nullable Int32.

    Values are converted to text, trimmed and parsed with
    ``pd.to_numeric(errors='coerce')``, so empty strings, 'nan' and any other
    non-numeric entries become missing. The column is replaced only when the
    whole conversion succeeds; on failure (for example non-integral values
    such as 1.5) a warning is printed and the column keeps its original
    values instead of being left as stringified text.

    Parameters
    ----------
    df : pandas.DataFrame
    column_name : str
        Column to convert; the copy is returned untouched when it is absent.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()
    if column_name not in df.columns:
        return df

    try:
        as_text = df[column_name].astype(str).str.strip()
        converted = (
            pd.to_numeric(as_text, errors='coerce')  # invalid entries -> NaN
            .astype('Int32')                         # nullable integer
        )
    except Exception as e:
        print(f"⚠️ Could not clean integer column '{column_name}': {e}")
    else:
        df[column_name] = converted
    return df


In [None]:
def convert_percentage_columns(df, percent_cols):
    """Return a copy of ``df`` with percentage-text columns as float32 fractions.

    For each listed column that exists, '%' signs are removed, whitespace is
    trimmed, empty strings become NaN, and the value is divided by 100
    (so '50%' becomes 0.5). A column that fails to convert is reported and
    left unchanged; listed columns that do not exist are ignored.
    """
    result = df.copy()
    present_cols = [name for name in percent_cols if name in result.columns]
    for col in present_cols:
        try:
            as_text = result[col].astype(str)
            bare_numbers = as_text.str.replace('%', '', regex=False).str.strip()
            with_gaps = bare_numbers.replace('', np.nan)
            fractions = with_gaps.astype(float) / 100
            result[col] = fractions.astype('float32')
        except Exception as e:
            print(f"⚠️ Could not convert column '{col}' to float32 percentage: {e}")
    return result


In [None]:
# wr_fp_basic_stats_type_map
# Column name -> target dtype for the FantasyPros WR basic-stats frame.
# Consumed by cast_column_types; 'Int32' is the pandas nullable integer dtype.

wr_fp_basic_stats_type_map = {
    'season': 'Int32',
    'season_type': 'str',
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'team_abbr': 'str',
    'Rank': 'Int32',
    'Player': 'str',
    'REC': 'Int32',
    'TGT': 'Int32',
    'YDS': 'Int32',
    'Y/R': 'float32',
    'LG': 'Int32',
    '20+': 'Int32',
    'TD': 'Int32',
    'ATT': 'Int32',
    'FL': 'Int32',
    'G': 'Int32',
    'FPTS': 'float32',
    'FPTS/G': 'float32',
    'ROST': 'float32'  # Already converted from %
}



In [None]:
# wr_fp_advanced_stats_type_map
# Column name -> target dtype for the FantasyPros WR advanced-stats frame.
# Consumed by cast_column_types; 'Int32' is the pandas nullable integer dtype.

wr_fp_advanced_stats_type_map = {
    'season': 'Int32',
    'season_type': 'str',
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'Player': 'str',
    'team_abbr': 'str',
    'Rank': 'Int32',
    'G': 'Int32',
    'REC': 'Int32',
    'YDS': 'Int32',
    'Y/R': 'float32',
    'YBC': 'Int32',
    'YBC/R': 'float32',
    'AIR': 'Int32',
    'AIR/R': 'float32',
    'YAC': 'Int32',
    'YAC/R': 'float32',
    'YACON': 'Int32',
    'YACON/R': 'float32',
    'BRKTKL': 'Int32',
    'TGT': 'Int32',
    '% TM': 'float32',  # Already converted from %
    'CATCHABLE': 'Int32',
    'DROP': 'Int32',
    'RZ TGT': 'Int32',
    '10+ YDS': 'Int32',
    '20+ YDS': 'Int32',
    '30+ YDS': 'Int32',
    '40+ YDS': 'Int32',
    '50+ YDS': 'Int32',
    'LNG': 'Int32'
}


In [None]:
# wr_fp_rz_stats_type_map
# Column name -> target dtype for the FantasyPros WR red-zone frame.
# Consumed by cast_column_types; 'Int32' is the pandas nullable integer dtype.
# NOTE(review): the roster column is 'ROST %' here but 'ROST' in the basic-stats
# map — confirm both match the scraped headers.

wr_fp_rz_stats_type_map = {
    'season': 'Int32',
    'season_type': 'str',
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'team_abbr': 'str',
    'Rank': 'Int32',
    'Player': 'str',
    'REC': 'Int32',
    'TGT': 'Int32',
    'REC PCT': 'float32',  # Already converted from %
    'YDS': 'Int32',
    'Y/R': 'float32',
    'TD': 'Int32',
    'TGT PCT': 'float32',  # Already converted from %
    'ATT': 'Int32',
    'PCT': 'float32',      # Already converted from %
    'FL': 'Int32',
    'G': 'Int32',
    'FPTS': 'float32',
    'FPTS/G': 'float32',
    'ROST %': 'float32'    # Already converted from %
}




In [None]:
# def cast_column_types(df, type_map):
#     df = df.copy()
#     for col, dtype in type_map.items():
#         if col in df.columns:
#             try:
#                 df[col] = df[col].astype(dtype)
#             except Exception as e:
#                 print(f"⚠️ Warning: could not convert column '{col}' to {dtype}. Reason: {e}")
#     return df


In [None]:
# Cast dataframe columns to the dtypes named in a type map, logging each outcome.
def cast_column_types(df, type_map, df_name="DataFrame", verbose=True):
    """Return a copy of ``df`` with columns cast according to ``type_map``.

    Columns named in ``type_map`` but absent from ``df`` are reported and
    skipped. A failed cast is reported and leaves that column unchanged.
    Successful casts are reported only when ``verbose`` is true. ``df_name``
    is the label used in every message.
    """
    result = df.copy()
    for column, target in type_map.items():
        if column not in result.columns:
            print(f"ℹ️  [{df_name}] Column '{column}' not found — skipping.")
            continue
        try:
            result[column] = result[column].astype(target)
        except Exception as err:
            print(f"⚠️  [{df_name}] Failed to convert '{column}' to {target}: {err}")
        else:
            if verbose:
                print(f"✅ [{df_name}] {column} → {target}")
    return result


In [None]:
# apply cleaning and normalization to FantasyPros dataframes

def _normalize_fp_frame(frame, pct_cols, type_map, label):
    """Run the shared FantasyPros cleaning chain on one frame.

    Order: copy -> strip whitespace -> lower-case id columns -> convert the
    listed percentage columns -> cast dtypes from ``type_map`` (logged under
    ``label``).
    """
    return (
        frame.copy()
        .pipe(strip_whitespace_columns)
        .pipe(lowercase_id_columns)
        .pipe(convert_percentage_columns, pct_cols)
        .pipe(cast_column_types, type_map, df_name=label)
    )


# Basic Stats
wr_fp_basic_stats_df = _normalize_fp_frame(
    wr_fp_basic_stats_df, ['ROST'], wr_fp_basic_stats_type_map, "Basic Stats"
)
print(f"✅ wr_fp_basic_stats_df shape: {wr_fp_basic_stats_df.shape}")

# Advanced Stats
wr_fp_advanced_stats_df = _normalize_fp_frame(
    wr_fp_advanced_stats_df, ['% TM'], wr_fp_advanced_stats_type_map, "Advanced Stats"
)
print(f"✅ wr_fp_advanced_stats_df shape: {wr_fp_advanced_stats_df.shape}")

# Red Zone Stats
wr_fp_rz_stats_df = _normalize_fp_frame(
    wr_fp_rz_stats_df,
    ['REC PCT', 'TGT PCT', 'PCT', 'ROST %'],
    wr_fp_rz_stats_type_map,
    "Red Zone Stats",
)
print(f"✅ wr_fp_rz_stats_df shape: {wr_fp_rz_stats_df.shape}")


# Second pass over the games-played column 'G' with the tolerant integer cleaner
wr_fp_basic_stats_df = clean_integer_column(wr_fp_basic_stats_df, 'G')
wr_fp_advanced_stats_df = clean_integer_column(wr_fp_advanced_stats_df, 'G')
wr_fp_rz_stats_df = clean_integer_column(wr_fp_rz_stats_df, 'G')

# csv files
cleaned_fp_exports = (
    ("wr_fp_basic_stats_cleaned.csv", wr_fp_basic_stats_df),
    ("wr_fp_advanced_stats_cleaned.csv", wr_fp_advanced_stats_df),
    ("wr_fp_rz_stats_cleaned.csv", wr_fp_rz_stats_df),
)
for csv_name, fp_frame in cleaned_fp_exports:
    fp_frame.to_csv(csv_name, index=False)

In [None]:
# Inspect column dtypes / non-null counts and preview the WR salary frame
# ahead of type casting.
wr_fd_dk_salary_2017_current_df.info()
wr_fd_dk_salary_2017_current_df.head()

In [None]:
# wr_fd_dk_salary_type_map
# Column name -> target dtype for the WR FanDuel / DraftKings salary frame.
# Consumed by cast_column_types; 'Int32' is the pandas nullable integer dtype.

wr_fd_dk_salary_type_map = {
    'season': 'Int32',
    'week': 'Int32',
    'player_id': 'str',
    'player': 'str',
    'dk_position': 'str',
    'fd_position': 'str',
    'team': 'str',
    'opponent': 'str',
    'dk_salary': 'Int32',
    'fd_salary': 'Int32',
    'dk_fpts': 'float32',
    'fd_fpts': 'float32'
}


In [None]:
# Cast the WR salary frame to the dtypes in wr_fd_dk_salary_type_map, then
# write the result to CSV (no index column).
wr_fd_dk_salary_2017_current_df = wr_fd_dk_salary_2017_current_df.pipe(
    cast_column_types, wr_fd_dk_salary_type_map
)

salary_shape = wr_fd_dk_salary_2017_current_df.shape
print(f"✅ wr_fd_dk_salary_2017_current_df shape: {salary_shape}")
wr_fd_dk_salary_2017_current_df.to_csv("wr_fd_dk_salary_2017_current_cleaned.csv", index=False)

In [None]:
# Inspect column dtypes / non-null counts of the ids + weekly stats frame
# ahead of type casting.
wr_ids_weekly_stats_df.info()

In [None]:
# wr_ids_weekly_stats type map
# Column name -> target dtype for the ids + weekly stats frame.
# Consumed by cast_column_types, which skips (and reports) keys that are not
# columns of the frame; 'Int32' is the pandas nullable integer dtype.

wr_ids_weekly_stats_type_map = {
    'season': 'Int32',
    'season_type': 'str',
    'week': 'Int32',
    'player_id': 'str',
    'player_name': 'str',
    'position': 'str',
    'position_group': 'str',
    'recent_team': 'str',
    'fantasy_points': 'float32',
    'fantasy_points_ppr': 'float32',
    'pff_id': 'str',
    'nfl_id': 'str',
    'name': 'str',
    'stats_global_id': 'str',
    'mfl_id': 'Int32',
    'ff_id': 'str',
    'cbs_id': 'str',
    'fleaflicker_id': 'str',
    'sportradar_id': 'str',
    'rotoworld_id': 'str',
    'sleeper_id': 'str',
    'ktc_id': 'str',
    'stats_id': 'str',
    'fantasypros_id': 'str',
    'merge_name': 'str',
    # NOTE(review): 'cbfref_id' may be a typo for 'cfbref_id' — confirm against
    # the frame's actual column names; a non-matching key is silently skipped.
    'cbfref_id': 'str',
    'fantasy_data_id': 'str',
    'espn_id': 'str',
    'swish_id': 'str',
    'rotowire_id': 'str',
    'yahoo_id': 'str',
    'receptions': 'Int32',
    'targets': 'Int32',
    'receiving_yards': 'Int32',
    'receiving_tds': 'Int32',
    'receiving_fumbles': 'float32',
    'receiving_fumbles_lost': 'float32',
    'receiving_air_yards': 'float32',
    'receiving_yards_after_catch': 'float32',
    'receiving_first_downs': 'float32',
    'receiving_epa': 'float32',
    'receiving_2pt_conversions': 'Int32',
    'racr': 'float32',
    'target_share': 'float32',
    'air_yards_share': 'float32',
    'wopr': 'float32',
}


In [None]:
# apply type casting to wr_ids_weekly_stats_df
wr_ids_weekly_stats_df = cast_column_types(
    wr_ids_weekly_stats_df,
    wr_ids_weekly_stats_type_map
)

# Verification: compare each column's resulting dtype with the requested one.
print(f"\n✅ wr_ids_weekly_stats_df shape: {wr_ids_weekly_stats_df.shape}")
for col, dtype in wr_ids_weekly_stats_type_map.items():
    if col in wr_ids_weekly_stats_df.columns:
        actual_dtype = str(wr_ids_weekly_stats_df[col].dtype)
        acceptable_dtypes = {dtype, dtype.lower()}
        if dtype == 'str':
            # astype('str') yields 'object' (or a string dtype, depending on
            # the pandas version), never the literal name 'str' on older
            # pandas — accept those so successful casts are not flagged.
            acceptable_dtypes |= {'object', 'string'}
        checkmark = "✅" if actual_dtype in acceptable_dtypes else "⚠️"
        print(f"{checkmark} [DataFrame] {col} → {actual_dtype}")


In [None]:
# Inspect column dtypes / non-null counts and preview the NGS frame ahead of
# type casting.
wr_ngs_df.info()
wr_ngs_df.head()

In [None]:
# wr_ngs_df type map
# Column name -> target dtype for the NGS frame.
# Consumed by cast_column_types; 'Int32' is the pandas nullable integer dtype.

wr_ngs_df_type_map = {
    # int columns
    'season': 'Int32',
    'week': 'Int32',
    'receptions': 'Int32',
    'targets': 'Int32',
    'rec_touchdowns': 'Int32',

    # float columns
    'avg_cushion': 'float32',
    'avg_separation': 'float32',
    'avg_intended_air_yards': 'float32',
    'percent_share_of_intended_air_yards': 'float32',
    'catch_percentage': 'float32',
    'yards': 'float32',
    'avg_yac': 'float32',
    'avg_expected_yac': 'float32',
    'avg_yac_above_expectation': 'float32',

    # object → str
    'season_type': 'str',
    'player_display_name': 'str',
    'player_position': 'str',
    'team_abbr': 'str',
    'player_gsis_id': 'str',
    'player_first_name': 'str',
    'player_last_name': 'str',
    'player_short_name': 'str'
}


In [None]:
# Cast the NGS frame to the dtypes listed in wr_ngs_df_type_map
wr_ngs_df = wr_ngs_df.pipe(cast_column_types, wr_ngs_df_type_map)

# Shape after casting (casting does not add or remove rows/columns)
ngs_shape = wr_ngs_df.shape
print(f"✅ wr_ngs_df shape: {ngs_shape}")

In [None]:
# Write the type-cast frames to CSV (no index column)
cleaned_exports = (
    ("wr_fd_dk_salary_2017_current_cleaned.csv", wr_fd_dk_salary_2017_current_df),
    ("wr_ids_weekly_stats_cleaned.csv", wr_ids_weekly_stats_df),
    ("wr_ngs_df_cleaned.csv", wr_ngs_df),
)
for csv_name, cleaned_frame in cleaned_exports:
    cleaned_frame.to_csv(csv_name, index=False)


In [None]:
## Next Tasks
# minor refactor of code blocks to adjust where we output csv files
# prep to merge with dfs (using modified names "_dfs")
# there will be 10 dataframes total (dfs and non-dfs)
# build the feature engineering list
# conduct data normalization analysis in preparation to build the features
# data normalization 
# build the features
# eda analysis
# monte carlo simulation dataframes

In [None]:
## End: data type evaluation and normalization

In [None]:
### Begin: Merge Process ###

In [None]:
### End: Merge Process ###

In [None]:
### Begin: Feature Engineering ###

In [None]:
# Apply Rolling Averages and Aggregates - 3, 5, and 7 week averages
# Output: updated dataframe with aggregates (optional csv file)

# Start from sorted copy of the main DF
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols
    .sort_values(by=['name', 'season', 'week'])
    .reset_index(drop=True)
)

# Feature map: full column -> short prefix
feature_map = {
    'targets': 'tgt',
    'receptions': 'rec',
    'receiving_yards': 'rec_yds',
    'receiving_air_yards': 'rec_air_yards',
    'fpts': 'fpts'
}

windows = [3, 5, 7]

# Rolling averages and lag features, computed per (player name, season) so
# that values never carry over a season boundary.
for full_col, short in feature_map.items():
    grouped = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.groupby(['name', 'season'])

    # transform() returns a result aligned on the frame's own index, so the
    # assignment does not depend on group output order (a positional
    # reset_index realignment shifts rows when a group key is missing).
    for window in windows:
        col_name = f"{short}_{window}wk_avg"
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[col_name] = grouped[full_col].transform(
            # mean of a full window, then shift(1) so the current row only
            # sees earlier rows of the same group
            lambda x, w=window: x.rolling(window=w, min_periods=w).mean().shift(1)
        )

    # Lag feature (1-game lookback); groupby.shift is index-aligned as well
    lag_col = f"{short}_lag_1"
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[lag_col] = grouped[full_col].shift(1)

# Final integrity check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv")


In [None]:
# Validation tests of aggregates
# output: there should be no aggregates prior to week 4

# Structural Check — No aggregates in the first weeks of a season
def check_early_aggregates(df, cols, earliest_week=4):
    """Report aggregate columns that hold values before ``earliest_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'week' column and every column in ``cols``.
    cols : list of str
        Aggregate columns to inspect.
    earliest_week : int
        Rows with week < earliest_week are expected to be empty in ``cols``.
        The same threshold is applied to every column in ``cols``.

    Returns
    -------
    pandas.Series
        Non-null counts of the offending columns only (empty when the check
        passes). The alarm line is printed only when there is a violation.
    """
    early = df[df['week'] < earliest_week]
    counts = early[cols].notna().sum()
    violations = counts[counts > 0]

    if violations.empty:
        print(f"✅ No aggregates present before week {earliest_week}.")
    else:
        print("🚨 Aggregates present before week", earliest_week)
        print(violations)
    return violations

# Boundary Check — Rolling aggregates must reset per season
def check_season_boundaries(df, col_prefix):
    """Print whether any ``<prefix>_3wk_avg`` column leaks across seasons.

    A row counts as a season transition when, within one 'name', its season
    differs from the previous row's season (a name's first row is not a
    transition). A column is flagged when any transition row holds a
    non-null value in it.
    """
    def _has_leak(avg_col):
        season_step = df.groupby(['name'])['season'].diff().fillna(0)
        transition_rows = df[season_step != 0]
        return transition_rows[avg_col].notna().any()

    candidate_cols = (f'{short}_3wk_avg' for short in col_prefix)
    errors = [avg_col for avg_col in candidate_cols if _has_leak(avg_col)]

    if not errors:
        print("✅ No cross-season leakage detected.")
    else:
        print("❌ Rolling values leaked across seasons:", errors)

# Shape check
def check_shape(df, expected_cols_added):
    """Print the frame's shape and its trailing ``expected_cols_added`` columns.

    Args:
        df: DataFrame to summarize.
        expected_cols_added: Number of columns expected at the end of the
            frame. Zero (or a negative value) prints an empty column list.
    """
    print("✅ Final shape:", df.shape)
    # Slice from an explicit start position: `df.columns[-0:]` would return
    # every column instead of none when nothing was added.
    start = max(len(df.columns) - max(expected_cols_added, 0), 0)
    print("✅ Final columns:", df.columns[start:])

# === Apply Checks ===
# Short stat prefixes shared by every generated aggregate column name.
stat_prefixes = ['tgt', 'rec', 'rec_yds', 'rec_air_yards', 'fpts']

# Expected rolling-average and lag column names, in creation order.
rolling_cols = [
    f"{short}_{w}wk_avg"
    for short in stat_prefixes
    for w in [3, 5, 7]
]
lag_cols = [f"{short}_lag_1" for short in stat_prefixes]

# Run the three validation checks against the aggregated frame.
check_early_aggregates(
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg,
    rolling_cols,
)
check_season_boundaries(
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg,
    stat_prefixes,
)
check_shape(
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg,
    expected_cols_added=len(rolling_cols) + len(lag_cols),
)

In [None]:
# Add Trend Features (deltas) - recent performance over / under (3wk, 5wk, 7wk) averages
# output: updated dataframe with deltas (optional csv file)

# Work on a copy so the aggregated frame is left untouched
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.copy()
)

# Source stat column -> short prefix used in the generated column names
feature_map = {
    'targets': 'tgt',
    'receptions': 'rec',
    'receiving_yards': 'rec_yds',
    'receiving_air_yards': 'rec_air_yards',
    'fpts': 'fpts',
}

# Rolling window sizes (in games) that have a matching *_wk_avg column
windows = [3, 5, 7]

# Delta = this game's value minus the matching rolling average.
# Column names are collected as they are created for the summary below.
delta_cols = []
for stat_col, prefix in feature_map.items():
    for span in windows:
        delta_name = f"{prefix}_{span}wk_delta"
        observed = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[stat_col]
        baseline = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[f"{prefix}_{span}wk_avg"]
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[delta_name] = observed - baseline
        delta_cols.append(delta_name)

# Summary
print("✅ Added delta columns:", delta_cols)
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv")


In [None]:
# Create boolean columns
# output: updated dataframe with booleans (optional csv file)

# Copy the trend-enhanced frame so the flags are added to a fresh dataframe
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.copy()
)

# (new flag column, source stat column, minimum value for the flag to be 1).
# Order matters: it is the order in which the columns are appended.
at_least_flags = [
    ('tgt_ge_5', 'targets', 5),
    ('tgt_ge_7', 'targets', 7),
    ('rec_ge_5', 'receptions', 5),
    ('rec_ge_7', 'receptions', 7),
    ('target_share_ge_20', 'target_share', 0.2),
    ('target_share_ge_30', 'target_share', 0.3),
    ('over_100_yds', 'receiving_yards', 100),
    ('double_digit_targets', 'targets', 10),
    ('boom_week', 'fpts', 20),
]

# Each flag is stored as a 0/1 integer
for flag_name, stat_col, cutoff in at_least_flags:
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool[flag_name] = (
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool[stat_col]
        .ge(cutoff)
        .astype(int)
    )

# The only "below threshold" flag: fewer than 5 fantasy points
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['bust_week'] = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['fpts']
    .lt(5)
    .astype(int)
)

# 'home' is cast straight to int (expects a boolean-like column)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['is_home_game'] = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['home'].astype(int)
)

# Final shape and column check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.shape)
print("✅ New boolean columns added.")

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv")


In [None]:
# *** Dataframe: this can be used as the final dataframe but the filename is long ***

# split the over / under column into two columns: o_u and total
# output: updated dataframe with o_u and total columns (optional csv file) 

# Start from the boolean-enriched frame
wr_nfl_py_fp_odds_salary_features = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.copy()

# Raw over/under text that both new columns are derived from
over_under_text = wr_nfl_py_fp_odds_salary_features['over_under']

# First character ('O' / 'U') -> "over" / "under"; anything else becomes NaN
side_labels = {'O': 'over', 'U': 'under'}
wr_nfl_py_fp_odds_salary_features['O_U'] = over_under_text.str.get(0).map(side_labels)

# First run of digits (with optional decimal part) -> float total
total_text = over_under_text.str.extract(r'(\d+\.?\d*)', expand=False)
wr_nfl_py_fp_odds_salary_features['Total'] = total_text.astype(float)

# Validation
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_features.shape)
print("✅ Sample 'O_U' values:", wr_nfl_py_fp_odds_salary_features['O_U'].unique())
print("✅ Sample 'Total' values:", wr_nfl_py_fp_odds_salary_features['Total'].dropna().unique()[:5])

# ** csv file **
# wr_nfl_py_fp_odds_salary_features.to_csv(
#     "wr_nfl_py_fp_odds_salary_features.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_features.csv")

In [None]:
### End: Feature Engineering ###