In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# the NFL python library seem to not work on Tuesday probably due to updates (not confirmed)

In [3]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the number of weeks if the NFL adds regular season games to the schedule
# Update the season start date each year

In [4]:
## Required installations
!pip install nfl_data_py
# Ensure all required packages are installed within the notebook
# !pip install --quiet nfl_data_py
!pip install --quiet rapidfuzz




In [5]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime, timedelta
import nfl_data_py as nfl
import os
import re
import time
from random import sample, uniform, seed
import io
from rapidfuzz import fuzz, process
import numpy as np
import hashlib
import shutil

In [6]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [7]:
## Begin: time calculators ##

In [8]:
# modify the number of weeks if the NFL adds regular season games to the schedule
# Update this each year
season_start_date = datetime(2025, 9, 4)  
REG_WEEKS = 18

In [9]:
def get_current_week(today=None):
    if today is None:
        today = datetime.now()
    delta_days = (today.date() - season_start_date.date()).days
    week_num = (delta_days // 7) + 1
    return max(0, week_num)  # clamp to 0 for preseason

In [10]:
# 1=preseason, 2=regular, 3=playoffs
def get_season_type(current_week, reg_weeks=REG_WEEKS):
    if current_week == 0:
        return 1
    elif current_week <= reg_weeks:
        return 2
    else:
        return 3

current_year = season_start_date.year
current_week = get_current_week()
season_type = get_season_type(current_week, REG_WEEKS)

print("current_year:", current_year)
print("current_week:", current_week)
print("season_type:", season_type)

current_year: 2025
current_week: 0
season_type: 1


In [11]:
# Returns a list of years to pull.
def get_year_range(current_year, current_week, start_year=2017, reg_weeks=18):
    if current_week == 0:
        return list(range(start_year, current_year))
    else:
        return list(range(start_year, current_year + 1))

In [12]:
# Builds (year, week) pairs for scraping.
# - 2017–2020: weeks 1–17
# - 2021+: weeks 1–18
def generate_year_week_combinations(start_year, end_year, current_year=None, current_week=None):
    combos = []
    for year in range(start_year, end_year + 1):
        max_regular = 17 if year <= 2020 else 18

        # Handle the current year
        if current_year is not None and year == current_year:
            if current_week is None or current_week == 0:
                # preseason: don't add any weeks for this year
                continue
            upper = min(max_regular, int(current_week))
        else:
            upper = max_regular

        combos.extend([(year, wk) for wk in range(1, upper + 1)])
    return combos

In [13]:
# define the year, week, and season type
current_year = season_start_date.year
current_week = get_current_week()
season_type  = get_season_type(current_week, REG_WEEKS)

years = get_year_range(current_year, current_week, start_year=2017)
year_week_pairs = generate_year_week_combinations(
    start_year=years[0] if years else 2017,
    end_year=years[-1] if years else current_year - 1,
    current_year=current_year,
    current_week=current_week
)


In [14]:
# test years and weeks to pull

# Years list should exclude current year during preseason
print("years:", years)                      # expect no 2025 when current_week == 0
print("contains current_year?", current_year in years)

# Year-week pairs should have no current_year and valid week caps
yrs_in_pairs = sorted({y for (y, _) in year_week_pairs})
print("years in pairs:", yrs_in_pairs)
print("pairs count:", len(year_week_pairs))
print("first 5:", year_week_pairs[:5])
print("last 5:", year_week_pairs[-5:])

# Validate week caps per year (≤17 for <=2020, ≤18 otherwise)
violations = []
for y in yrs_in_pairs:
    max_reg = 17 if y <= 2020 else 18
    max_week = max(w for (yy, w) in year_week_pairs if yy == y)
    if max_week > max_reg:
        violations.append((y, max_week, max_reg))
print("week-cap violations:", violations)   # expect []

# Ensure current year is COMPLETELY absent during preseason
has_current_year = any(yy == current_year for (yy, _) in year_week_pairs)
print("current year present in pairs?", has_current_year)  # expect False


years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
contains current_year? False
years in pairs: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
pairs count: 140
first 5: [(2017, 1), (2017, 2), (2017, 3), (2017, 4), (2017, 5)]
last 5: [(2024, 14), (2024, 15), (2024, 16), (2024, 17), (2024, 18)]
week-cap violations: []
current year present in pairs? False


In [15]:
## End: time calculators ##

In [16]:
def check_nulls(df, name=None):
    """
    Returns a dataframe summarizing missing values for a given DataFrame.
    
    Parameters:
        df (pd.DataFrame): The DataFrame to analyze.
        name (str): Optional label for output clarity.
    
    Returns:
        pd.DataFrame: Summary of missing values (count and %), sorted.
    """
    null_counts = df.isnull().sum()
    null_percent = (null_counts / len(df)).round(4)
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': null_percent
    })
    summary = summary[summary['Missing Count'] > 0].sort_values(by='Missing %', ascending=False)
    
    if name:
        print(f"\n📊 Missing Value Summary for: {name}")
    return summary


In [17]:
### Begin: Python NFL Library Dataframe ###

In [18]:
# Validate years to pull from the nfl library
print("years:", years)                         
assert current_year not in years

wr_weekly = nfl.import_weekly_data(years=years, downcast=True)
print(wr_weekly[['season','week']].agg(['min','max']))
print("unique seasons:", sorted(wr_weekly['season'].unique()))

# sanity: no week beyond league cap per year
violations = []
for y, g in wr_weekly.groupby('season'):
    # Regular season cap: 17 weeks (<=2020) or 18 weeks (>=2021)
    # Postseason cap: up to week 22 (including Super Bowl)
    max_allowed = 22
    max_week = int(g['week'].max())
    if max_week > max_allowed:
        violations.append((y, max_week, f"> {max_allowed} not allowed"))
print("week-cap violations:", violations)  # expect []



years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Downcasting floats.
     season  week
min    2017     1
max    2024    22
unique seasons: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
week-cap violations: []


In [19]:
# display all available columns in the nfl python API for weekly stats
nfl.see_weekly_cols()

Index(['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr'], dtype='object')

In [20]:
# define the base columns. 
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team',
    'fantasy_points', 'fantasy_points_ppr'
]

In [21]:
# Import the player IDs from nfl.import_ids() - without parameters
ids_data = nfl.import_ids()

# Drop the unnecessary columns
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year', 
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username', 
    'height', 'weight', 'college', 'db_season'
]
ids_data = ids_data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [22]:
# import the weekly data from nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [23]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# Merge the two dataframes on 'player_id' and 'gsis_id'
# Align column names for merging
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})  
id_dataframe = pd.merge(weekly_data, ids_data, on='player_id', how='inner')

# Assign the resulting dataframe to a variable
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [24]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
wide_receiver_ids = all_players_id_data[all_players_id_data['position'] == 'WR']

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17384, 31)


In [25]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Pull WR-specific columns from weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns  # Include keys for merging
)

# Merge WR-specific stats with wide_receiver_ids
wr_ids_weekly_stats_df = pd.merge(
    wide_receiver_ids,
    wr_stats,
    on=['player_id', 'season', 'week'],  # Ensure correct alignment
    how='inner'
)

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wr_ids_weekly_stats_df.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats_df.shape[0] == wide_receiver_ids.shape[0]}"
)

# display the df
display(wr_ids_weekly_stats_df)

# csv file
# wr_ids_weekly_stats_df.to_csv('wr_ids_weekly_stats_df.csv', index=False)

Downcasting floats.
Shape of merged dataframe: (17384, 46)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,fantasy_points,fantasy_points_ppr,merge_name,mfl_id,stats_global_id,yahoo_id,ktc_id,rotowire_id,pfr_id,espn_id,sportradar_id,fantasy_data_id,sleeper_id,name,swish_id,cbs_id,fleaflicker_id,stats_id,cfbref_id,nfl_id,pff_id,fantasypros_id,rotoworld_id,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7.4,13.400000,larry fitzgerald,7393,246053.0,6762.0,,3730.0,FitzLa00,5528.0,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,5571.0,223.0,Larry Fitzgerald,,492934.0,1732.0,6762.0,larry-fitzgerald-1,larryfitzgerald/2506106,1724.0,9383.0,1661.0,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,2.1,5.100000,larry fitzgerald,7393,246053.0,6762.0,,3730.0,FitzLa00,5528.0,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,5571.0,223.0,Larry Fitzgerald,,492934.0,1732.0,6762.0,larry-fitzgerald-1,larryfitzgerald/2506106,1724.0,9383.0,1661.0,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,20.9,33.900002,larry fitzgerald,7393,246053.0,6762.0,,3730.0,FitzLa00,5528.0,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,5571.0,223.0,Larry Fitzgerald,,492934.0,1732.0,6762.0,larry-fitzgerald-1,larryfitzgerald/2506106,1724.0,9383.0,1661.0,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,9.2,13.200000,larry fitzgerald,7393,246053.0,6762.0,,3730.0,FitzLa00,5528.0,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,5571.0,223.0,Larry Fitzgerald,,492934.0,1732.0,6762.0,larry-fitzgerald-1,larryfitzgerald/2506106,1724.0,9383.0,1661.0,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,5.1,11.100000,larry fitzgerald,7393,246053.0,6762.0,,3730.0,FitzLa00,5528.0,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,5571.0,223.0,Larry Fitzgerald,,492934.0,1732.0,6762.0,larry-fitzgerald-1,larryfitzgerald/2506106,1724.0,9383.0,1661.0,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17379,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,0.4,1.400000,malachi corley,16636,0.0,40944.0,1607.0,17777.0,CorlMa00,4613104.0,bae59933-8b94-4837-990e-f0a4ced3cdbb,,11617.0,Malachi Corley,1215291.0,3162613.0,,40944.0,malachi-corley-1,,,26023.0,,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119
17380,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,1.8,1.800000,malachi corley,16636,0.0,40944.0,1607.0,17777.0,CorlMa00,4613104.0,bae59933-8b94-4837-990e-f0a4ced3cdbb,,11617.0,Malachi Corley,1215291.0,3162613.0,,40944.0,malachi-corley-1,,,26023.0,,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,
17381,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,0.2,1.200000,malachi corley,16636,0.0,40944.0,1607.0,17777.0,CorlMa00,4613104.0,bae59933-8b94-4837-990e-f0a4ced3cdbb,,11617.0,Malachi Corley,1215291.0,3162613.0,,40944.0,malachi-corley-1,,,26023.0,,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294
17382,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,1.0,2.000000,malachi corley,16636,0.0,40944.0,1607.0,17777.0,CorlMa00,4613104.0,bae59933-8b94-4837-990e-f0a4ced3cdbb,,11617.0,Malachi Corley,1215291.0,3162613.0,,40944.0,malachi-corley-1,,,26023.0,,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176


In [26]:
# check for nulls
# ✅ Updated null value analysis using helper function
null_summary_wr_ids_weekly = check_nulls(wr_ids_weekly_stats_df, name="WR Weekly Stats")

# Filter out columns containing '_id'
null_summary_wr_ids_weekly = null_summary_wr_ids_weekly[~null_summary_wr_ids_weekly.index.str.contains('_id')]

display(null_summary_wr_ids_weekly)


📊 Missing Value Summary for: WR Weekly Stats


Unnamed: 0,Missing Count,Missing %
racr,327,0.0188
receiving_epa,285,0.0164
air_yards_share,285,0.0164
target_share,285,0.0164
wopr,285,0.0164


In [27]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Pull NGS receiving data for the specified years
wr_ngs_df = nfl.import_ngs_data('receiving', years)

# Exclude rows where 'week' == 0 and filter for 'WR' position in one step
wr_ngs_df = wr_ngs_df[(wr_ngs_df['week'] != 0) & (wr_ngs_df['player_position'] == 'WR')]

# Drop unnecessary columns (already in the nfl python baseline dataframe)
wr_ngs_df = wr_ngs_df.drop(columns=['player_jersey_number'], errors='ignore')

# Display the resulting dataframe
print(f"Shape of NGS WR DataFrame after dropping columns: {wr_ngs_df.shape}")
display(wr_ngs_df)

# ***csv file***
# wr_ngs_df.to_csv('wr_ngs_df.csv', index=False)

Shape of NGS WR DataFrame after dropping columns: (8249, 22)


Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,REG,1,Ryan Grant,WR,WAS,9.936667,2.894592,4.410000,7.154639,4,6,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,REG,1,Martavis Bryant,WR,PIT,8.300000,4.122054,12.688333,33.327496,2,6,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,REG,1,Jamison Crowder,WR,WAS,7.655000,3.177793,10.540000,19.949707,3,7,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,REG,1,Nelson Agholor,WR,PHI,7.423750,2.462620,10.463750,20.274656,6,8,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,REG,1,John Brown,WR,ARI,7.360000,2.751526,13.422222,28.208481,4,9,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13323,2024,POST,23,Xavier Worthy,WR,KC,8.160000,4.959113,14.276250,44.737358,8,8,100.000000,157.0,2,6.250000,6.154624,0.095376,00-0039894,Xavier,Worthy,X.Worthy
13324,2024,POST,23,DeAndre Hopkins,WR,KC,7.676000,3.446231,11.974000,23.451761,2,5,40.000000,18.0,1,0.565000,0.798474,-0.233474,00-0030564,DeAndre,Hopkins,D.Hopkins
13325,2024,POST,23,DeVonta Smith,WR,PHI,7.470000,2.221577,14.752000,40.028219,4,5,80.000000,69.0,1,0.340000,0.600076,-0.260076,00-0036912,DeVonta,Smith,D.Smith
13327,2024,POST,23,Marquise Brown,WR,KC,4.943333,3.302615,6.356667,14.939872,2,6,33.333333,15.0,0,2.450000,3.533891,-1.083891,00-0035662,Marquise,Brown,M.Brown


In [28]:
print(wr_ngs_df.columns.tolist())


['season', 'season_type', 'week', 'player_display_name', 'player_position', 'team_abbr', 'avg_cushion', 'avg_separation', 'avg_intended_air_yards', 'percent_share_of_intended_air_yards', 'receptions', 'targets', 'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation', 'player_gsis_id', 'player_first_name', 'player_last_name', 'player_short_name']


In [29]:
# ✅ Updated null analysis using helper function
wr_ngs_null_summary_df = check_nulls(wr_ngs_df, name="NGS WR Stats")
display(wr_ngs_null_summary_df)



📊 Missing Value Summary for: NGS WR Stats


Unnamed: 0,Missing Count,Missing %
avg_expected_yac,42,0.0051
avg_yac_above_expectation,42,0.0051
avg_yac,33,0.004
yards,28,0.0034
avg_cushion,2,0.0002


In [30]:
### End: Python NFL Library Dataframe ###

In [31]:
### Begin:fantasypros webscraping ###

In [32]:
# a scraper function for a single (year, week) to test parsing logic
def test_scraper_sample(scraper_func, year_week_pair=(2024, 1), **kwargs):
    # Wrap the pair in a list so it matches the scraper signature
    year_week_pairs = [year_week_pair]
    
    sample_df, sample_errors = scraper_func(
        year_week_pairs=year_week_pairs,
        **kwargs
    )
    
    print("Sample shape:", sample_df.shape)
    print("team_abbr unique values:", sample_df["team_abbr"].unique()[:15])
    display(sample_df.head(10))
    
    return sample_df, sample_errors

In [33]:
# scrape FantasyPros weekly WR basic stats 
def wr_scrape_fp_basic_stats(
    year_week_pairs,
    save_csv_path=None,              # e.g., "wr_fp_basic_stats.csv" (None => don't save)
    sleep_range=(0.35, 0.85),
    timeout=20
):
    """
    Scrape FantasyPros weekly WR *basic* stats for all (year, week) pairs provided.
    Expects year_week_pairs from generate_year_week_combinations(...) so preseason is skipped
    and in-season weeks are capped at current_week.

    Returns
    -------
    df : pandas.DataFrame
    errors : list[dict]
    """

    # BASIC stats page
    url_tpl = "https://www.fantasypros.com/nfl/stats/wr.php?year={y}&week={w}&range=week"

    sess = requests.Session()
    sess.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    })

    rows, errors = [], []

    def _extract_fp_id(a_tag):
        if not a_tag: return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m: return m.group(1)
        if a_tag.has_attr("data-player-id"): return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m: return m.group(1)
        return None

    def _normalize_team(t):
        t = (t or "").upper().strip()
        alias = {"JAX":"JAC", "WSH":"WAS", "LAR":"LA", "STL":"LA", "OAK":"LV", "SD":"LAC"}
        return alias.get(t, t)

    for (year, week) in year_week_pairs:
        url = url_tpl.format(y=year, w=week)
        try:
            resp = sess.get(url, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table")
            if table is None:
                errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                time.sleep(uniform(*sleep_range)); continue

            thead = table.find("thead")
            headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]

            tbody = table.find("tbody")
            tr_list = tbody.find_all("tr") if tbody else []
            for tr in tr_list:
                tds = tr.find_all(["td","th"])
                if not tds: continue

                # locate player cell
                player_idx = None
                for i, h in enumerate(headers):
                    if h.lower() == "player": player_idx = i; break
                if player_idx is None:
                    for i, td in enumerate(tds):
                        if td.find("a", class_=re.compile(r"\bfp-player-link\b")):
                            player_idx = i; break
                player_td = tds[player_idx] if player_idx is not None else tr

                a = player_td.find("a", class_=re.compile(r"\bfp-player-link\b"))
                fantasypros_id = _extract_fp_id(a)
                player_name = a.get_text(strip=True) if a else None

                # --- TEAM EXTRACTION (mirrors advanced scraper) ---
                team_abbr = None
                # Attempt regex from player_td text like "Jayden Reed(GB)"
                m = re.search(r"\(([A-Z]{2,4})\)", player_td.get_text(" ", strip=True))
                if m:
                    team_abbr = m.group(1)

                team_abbr = _normalize_team(team_abbr)

                # record
                cell_vals = [td.get_text(strip=True) for td in tds]
                rec = {
                    "season": year,
                    "season_type": "REG",
                    "week": week,
                    "fantasypros_id": fantasypros_id,
                    "player_name": player_name,
                    "team_abbr": team_abbr,
                }
                for col, val in zip(headers, cell_vals):
                    rec[col] = val
                rows.append(rec)

            time.sleep(uniform(*sleep_range))

        except Exception as e:
            errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            time.sleep(uniform(*sleep_range))
            continue

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"]   = pd.to_numeric(df["week"],   errors="coerce").astype("Int64")
        if "fantasypros_id" in df.columns:
            df = df.drop_duplicates(subset=["season","week","fantasypros_id"], keep="first")
        else:
            df = df.drop_duplicates()

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [34]:
# pull a sample of the scraped fantasypros data for visual inspection
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_basic_stats,
    year_week_pair=(2024, 1),
    save_csv_path=None  # prevent saving during test
)


Sample shape: (430, 21)
team_abbr unique values: ['GB' 'NYJ' 'DET' 'MIA' 'KC' 'IND' 'TB' 'SEA' 'PHI' 'NE' 'WAS' 'NO' 'MIN'
 'HOU' 'JAC']


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,REC,TGT,YDS,Y/R,LG,20+,TD,ATT,FL,G,FPTS,FPTS/G,ROST
0,2024,REG,1,23020,Jayden Reed,GB,1,Jayden Reed(GB),4,6,33,34.5,70,3,1,1,0,1,29.1,29.1,79.9%
1,2024,REG,1,17301,Allen Lazard,NYJ,2,Allen Lazard(NYJ),6,9,0,14.8,36,2,0,0,0,1,20.9,20.9,3.3%
2,2024,REG,1,23677,Jameson Williams,DET,3,Jameson Williams(DET),5,9,13,24.2,52,3,0,1,0,1,19.4,19.4,90.8%
3,2024,REG,1,15802,Tyreek Hill,MIA,4,Tyreek Hill(MIA),7,12,0,18.6,80,2,0,0,0,1,19.0,19.0,98.3%
4,2024,REG,1,23019,Xavier Worthy,KC,5,Xavier Worthy(KC),2,3,21,23.5,35,1,1,1,0,1,18.8,18.8,94.1%
5,2024,REG,1,23791,Alec Pierce,IND,6,Alec Pierce(IND),3,3,0,41.7,60,2,0,0,0,1,18.5,18.5,12.9%
6,2024,REG,1,12119,Mike Evans,TB,7,Mike Evans(TB),5,6,0,12.2,24,1,0,0,0,1,18.1,18.1,97.6%
7,2024,REG,1,16433,Cooper Kupp,SEA,8,Cooper Kupp(SEA),14,21,10,7.9,21,2,0,2,0,1,18.0,18.0,83.5%
8,2024,REG,1,18218,A.J. Brown,PHI,9,A.J. Brown(PHI),5,10,0,23.8,67,2,0,0,0,1,17.9,17.9,99.3%
9,2024,REG,1,13981,Stefon Diggs,NE,10,Stefon Diggs(NE),6,6,6,5.5,10,0,0,1,0,1,15.9,15.9,83.6%


In [35]:
# output: a dataframe of WR basic stats
wr_fp_basic_stats_df, fp_basic_errors = wr_scrape_fp_basic_stats(
    year_week_pairs,
    save_csv_path="wr_fp_basic_stats.csv"
)

print(f"Shape: {wr_fp_basic_stats_df.shape}")
display(wr_fp_basic_stats_df)

display(check_nulls(wr_fp_basic_stats_df, name="FantasyPros WR Basic Stats"))

Shape: (25785, 21)


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,REC,TGT,YDS,Y/R,LG,20+,TD,ATT,FL,G,FPTS,FPTS/G,ROST
0,2017,REG,1,13981,Stefon Diggs,NE,1,Stefon Diggs(NE),7,8,-6,13.3,30,1,0,1,0,1,20.7,20.7,83.6%
1,2017,REG,1,15802,Tyreek Hill,MIA,2,Tyreek Hill(MIA),7,8,5,19.0,75,0,0,2,0,1,19.8,19.8,98.3%
2,2017,REG,1,16488,Kenny Golladay,FA,3,Kenny Golladay(FA),4,7,0,17.3,45,0,0,0,0,1,18.9,18.9,4.0%
3,2017,REG,1,9808,Antonio Brown,FA,4,Antonio Brown(FA),11,11,0,16.5,50,0,0,0,0,1,18.2,18.2,1.0%
4,2017,REG,1,13429,Adam Thielen,CAR,5,Adam Thielen(CAR),9,10,0,17.4,44,2,0,0,0,1,15.7,15.7,36.4%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25780,2024,REG,18,15706,Alex Erickson,FA,424,Alex Erickson(FA),0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0%
25781,2024,REG,18,15495,Laquon Treadwell,IND,425,Laquon Treadwell(IND),0,0,0,0,0,0,0,0,0,0,0,0,0.0%
25782,2024,REG,18,28024,Jaden Smith,TB,426,Jaden Smith(TB),0,0,0,0,0,0,0,0,0,0,0,0,0.0%
25783,2024,REG,18,23739,Calvin Austin III,PIT,427,Calvin Austin III(PIT),0,1,0,0,0,0,0,0,1,1,-2.0,-2.0,10.0%



📊 Missing Value Summary for: FantasyPros WR Basic Stats


Unnamed: 0,Missing Count,Missing %


In [36]:
# Scrape FantasyPros weekly WR advanced stats
def wr_scrape_fp_adv_stats(
    year_week_pairs,
    save_csv_path=None,              
    sleep_range=(0.35, 0.85),
    timeout=20
):
    url_tpl = "https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={y}&week={w}&range=week&type=reg&mode=pergame"

    sess = requests.Session()
    sess.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    })

    rows, errors = [], []
    def _extract_fp_id(a_tag):
        if not a_tag: return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m: return m.group(1)
        if a_tag.has_attr("data-player-id"): return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m: return m.group(1)
        return None

    def _normalize_team(t):
        t = (t or "").upper().strip()
        alias = {"JAX": "JAC", "WSH": "WAS", "LAR": "LA", "STL": "LA", "OAK": "LV", "SD": "LAC"}
        return alias.get(t, t)

    for (year, week) in year_week_pairs:
        url = url_tpl.format(y=year, w=week)
        try:
            resp = sess.get(url, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table")
            if table is None:
                errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                time.sleep(uniform(*sleep_range)); continue

            thead = table.find("thead")
            headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]

            tbody = table.find("tbody")
            tr_list = tbody.find_all("tr") if tbody else []
            for tr in tr_list:
                tds = tr.find_all(["td","th"])
                if not tds: continue

                # Locate the player cell (by header name or anchor class)
                player_idx = None
                for i, h in enumerate(headers):
                    if h.lower() == "player": player_idx = i; break
                if player_idx is None:
                    for i, td in enumerate(tds):
                        if td.find("a", class_=re.compile(r"\bfp-player-link\b")):
                            player_idx = i; break
                player_td = tds[player_idx] if player_idx is not None else tr

                a = player_td.find("a", class_=re.compile(r"\bfp-player-link\b"))
                fantasypros_id = _extract_fp_id(a)
                player_name = a.get_text(strip=True) if a else None

                # --- TEAM EXTRACTION (priority order) ---
                team_abbr = None
                # 1) “Team” column if present
                try:
                    team_col_idx = headers.index("Team")
                    team_abbr = tds[team_col_idx].get_text(strip=True)
                except ValueError:
                    pass
                # 2) small/span near player name like "(MIA)"
                if not team_abbr:
                    tag = player_td.select_one("small") or player_td.select_one("span") \
                          or player_td.select_one('span[class*="team"]')
                    if tag:
                        txt = tag.get_text(strip=True)
                        m = re.search(r"\(([A-Z]{2,4})\)", txt)
                        team_abbr = m.group(1) if m else txt
                # 3) logo alt/title
                if not team_abbr and a:
                    img = a.find_next("img")
                    if img:
                        team_abbr = img.get("alt") or img.get("title")
                # 4) regex fallback on full player cell text
                if not team_abbr:
                    m = re.search(r"\(([A-Z]{2,4})\)", player_td.get_text(" ", strip=True))
                    if m: team_abbr = m.group(1)

                team_abbr = _normalize_team(team_abbr)

                # Build row dict
                cell_vals = [td.get_text(strip=True) for td in tds]
                rec = {
                    "season": year,
                    "season_type": "REG",
                    "week": week,
                    "fantasypros_id": fantasypros_id,
                    "player_name": player_name,
                    "team_abbr": team_abbr,
                }
                for col, val in zip(headers, cell_vals):
                    rec[col] = val
                rows.append(rec)

            time.sleep(uniform(*sleep_range))

        except Exception as e:
            errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            time.sleep(uniform(*sleep_range))
            continue

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
        if "fantasypros_id" in df.columns:
            df = df.drop_duplicates(subset=["season", "week", "fantasypros_id"], keep="first")
        else:
            df = df.drop_duplicates()

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [37]:
# pull a sample of the scraped fantasypros for visual inspection
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_adv_stats,
    year_week_pair=(2024, 1),
    save_csv_path=None  # prevent saving during test
)


Sample shape: (8, 32)
team_abbr unique values: ['GB' 'NYJ' 'DET' 'MIA' 'KC' 'IND' 'TB' 'SEA']


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,G,REC,YDS,Y/R,YBC,YBC/R,AIR,AIR/R,YAC,YAC/R,YACON,YACON/R,BRKTKL,TGT,% TM,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,LNG
0,2024,REG,1,23020,Jayden Reed,GB,1,Jayden Reed(GB),1,4,138,34.5,83,20.8,104,26.0,55,13.8,34,8.5,1,6,18.8%,4,0,0,3,3,2,1,1,70
1,2024,REG,1,17301,Allen Lazard,NYJ,2,Allen Lazard(NYJ),1,6,89,14.8,60,10.0,91,15.2,29,4.8,7,1.2,0,9,31.0%,6,0,3,3,2,1,0,0,36
2,2024,REG,1,23677,Jameson Williams,DET,3,Jameson Williams(DET),1,5,121,24.2,58,11.6,127,25.4,63,12.6,5,1.0,1,9,32.1%,5,0,1,3,3,2,1,1,52
3,2024,REG,1,15802,Tyreek Hill,MIA,4,Tyreek Hill(MIA),1,7,130,18.6,46,6.6,143,20.4,84,12.0,13,1.9,1,12,33.3%,7,0,2,5,2,1,1,1,80
4,2024,REG,1,23019,Xavier Worthy,KC,5,Xavier Worthy(KC),1,2,47,23.5,24,12.0,37,18.5,23,11.5,1,0.5,0,3,11.1%,2,0,1,2,1,1,0,0,35
5,2024,REG,1,23791,Alec Pierce,IND,6,Alec Pierce(IND),1,3,125,41.7,119,39.7,119,39.7,6,2.0,0,0.0,0,3,15.8%,3,0,0,2,2,2,2,2,60
6,2024,REG,1,12119,Mike Evans,TB,7,Mike Evans(TB),1,5,61,12.2,53,10.6,64,12.8,8,1.6,1,0.2,0,6,20.0%,5,0,3,3,1,0,0,0,24
7,2024,REG,1,16433,Cooper Kupp,SEA,8,Cooper Kupp(SEA),1,14,110,7.9,69,4.9,153,10.9,41,2.9,21,1.5,1,21,43.8%,15,1,3,4,2,0,0,0,21


In [38]:
# output: a dataframe of WR fantasypros advanced stats
wr_fp_advanced_stats_df, fp_errors = wr_scrape_fp_adv_stats(
    year_week_pairs,
    save_csv_path="wr_fp_advanced_stats.csv"
)

# ✅ Updated null analysis using helper function
print(f"Shape of FantasyPros WR Advanced Stats DataFrame: {wr_fp_advanced_stats_df.shape}")

display(wr_fp_advanced_stats_df.head(25))
display(check_nulls(wr_fp_advanced_stats_df, name="FantasyPros WR Advanced Stats"))

Shape of FantasyPros WR Advanced Stats DataFrame: (1120, 32)


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,G,REC,YDS,Y/R,YBC,YBC/R,AIR,AIR/R,YAC,YAC/R,YACON,YACON/R,BRKTKL,TGT,% TM,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,LNG
0,2017,REG,1,13981,Stefon Diggs,NE,1,Stefon Diggs(NE),1,7,93,13.3,76,10.9,0,0.0,17,2.4,6,0.9,0,8,25.0%,7,0,2,5,2,1,0,0,30
1,2017,REG,1,15802,Tyreek Hill,MIA,2,Tyreek Hill(MIA),1,7,133,19.0,78,11.1,0,0.0,55,7.9,1,0.1,0,8,23.5%,7,0,0,4,1,1,1,1,75
2,2017,REG,1,16488,Kenny Golladay,FA,3,Kenny Golladay(FA),1,4,69,17.3,64,16.0,0,0.0,5,1.3,0,0.0,0,7,17.9%,5,1,1,2,1,1,1,0,45
3,2017,REG,1,9808,Antonio Brown,FA,4,Antonio Brown(FA),1,11,182,16.5,90,8.2,0,0.0,92,8.4,50,4.5,0,11,30.6%,11,0,0,7,2,2,1,1,50
4,2017,REG,1,13429,Adam Thielen,CAR,5,Adam Thielen(CAR),1,9,157,17.4,92,10.2,0,0.0,65,7.2,17,1.9,0,10,31.3%,10,0,0,4,4,2,1,0,44
5,2017,REG,1,13969,Nelson Agholor,FA,6,Nelson Agholor(FA),1,6,86,14.3,51,8.5,0,0.0,35,5.8,18,3.0,0,8,21.1%,6,0,1,3,1,1,1,1,58
6,2017,REG,1,13081,Bennie Fowler III,FA,7,Bennie Fowler III(FA),1,3,21,7.0,21,7.0,0,0.0,0,0.0,0,0.0,0,4,14.3%,3,0,2,1,0,0,0,0,10
7,2017,REG,1,9320,Jordy Nelson,FA,8,Jordy Nelson(FA),1,7,79,11.3,73,10.4,0,0.0,6,0.9,1,0.1,0,8,19.0%,7,0,0,3,1,1,0,0,32
8,2017,REG,2,9460,Michael Crabtree,FA,1,Michael Crabtree(FA),1,6,80,13.3,50,8.3,0,0.0,30,5.0,0,0.0,0,6,21.4%,6,0,0,3,3,0,0,0,26
9,2017,REG,2,11548,Jermaine Kearse,FA,2,Jermaine Kearse(FA),1,4,64,16.0,57,14.3,0,0.0,7,1.8,0,0.0,0,5,20.8%,4,0,0,3,1,1,0,0,34



📊 Missing Value Summary for: FantasyPros WR Advanced Stats


Unnamed: 0,Missing Count,Missing %


In [39]:
# scrape WR fantasypros redzone stats 
def wr_scrape_fp_rz_stats(
    year_week_pairs,
    save_csv_path=None,              
    sleep_range=(0.35, 0.85),
    timeout=20
):
    url_tpl = "https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={y}&week={w}&range=week"

    sess = requests.Session()
    sess.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    })

    rows, errors = [], []

    def _extract_fp_id(a_tag):
        if not a_tag:
            return None
        for cls in a_tag.get("class", []):
            m = re.match(r"fp-id-(\d+)", cls)
            if m:
                return m.group(1)
        if a_tag.has_attr("data-player-id"):
            return str(a_tag["data-player-id"])
        if a_tag.has_attr("href"):
            m = re.search(r"(\d+)(?:/|$)", a_tag["href"])
            if m:
                return m.group(1)
        return None

    def _normalize_team(t):
        t = (t or "").upper().strip()
        alias = {"JAX":"JAC", "WSH":"WAS", "LAR":"LA", "STL":"LA", "OAK":"LV", "SD":"LAC"}
        if t in {"FANTASYPROS", "FANTASY PROS", "FANTASY-PROS", "FP", ""}:
            return None
        if t != "FA" and not (2 <= len(t) <= 4 and t.isalpha()):
            return None
        return alias.get(t, t)

    for (year, week) in year_week_pairs:
        url = url_tpl.format(y=year, w=week)
        try:
            resp = sess.get(url, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table")
            if table is None:
                errors.append({"year": year, "week": week, "url": url, "error": "no_table"})
                time.sleep(uniform(*sleep_range))
                continue

            thead = table.find("thead")
            headers = [h.get_text(strip=True) for h in (thead.find_all("th") if thead else [])]

            tbody = table.find("tbody")
            tr_list = tbody.find_all("tr") if tbody else []
            for tr in tr_list:
                tds = tr.find_all(["td","th"])
                if not tds:
                    continue

                # locate player cell
                player_idx = None
                for i, h in enumerate(headers):
                    if h.lower() == "player":
                        player_idx = i
                        break
                if player_idx is None:
                    for i, td in enumerate(tds):
                        if td.find("a", class_=re.compile(r"\bfp-player-link\b")):
                            player_idx = i
                            break
                player_td = tds[player_idx] if player_idx is not None else tr

                a = player_td.find("a", class_=re.compile(r"\bfp-player-link\b"))
                fantasypros_id = _extract_fp_id(a)
                player_name = a.get_text(strip=True) if a else None

                # --- TEAM extraction fix ---
                team_abbr = None

                # 1) Regex directly from full player cell text
                m = re.search(r"\(([A-Z]{2,4})\)", player_td.get_text(" ", strip=True))
                if m:
                    team_abbr = m.group(1)

                # 2) 'Team' column if present
                if not team_abbr:
                    try:
                        team_col_idx = headers.index("Team")
                        team_abbr = tds[team_col_idx].get_text(strip=True)
                    except ValueError:
                        pass

                # 3) small/span near player name
                if not team_abbr:
                    tag = (player_td.select_one("small")
                           or player_td.select_one("span")
                           or player_td.select_one('span[class*="team"]'))
                    if tag:
                        txt = tag.get_text(strip=True)
                        m = re.search(r"\(([A-Z]{2,4})\)", txt)
                        team_abbr = m.group(1) if m else txt

                # 4) logo alt/title
                if not team_abbr and a:
                    img = a.find_next("img")
                    if img:
                        team_abbr = img.get("alt") or img.get("title")

                team_abbr = _normalize_team(team_abbr)

                # record
                cell_vals = [td.get_text(strip=True) for td in tds]
                rec = {
                    "season": year,
                    "season_type": "REG",
                    "week": week,
                    "fantasypros_id": fantasypros_id,
                    "player_name": player_name,
                    "team_abbr": team_abbr,
                }
                for col, val in zip(headers, cell_vals):
                    rec[col] = val
                rows.append(rec)

            time.sleep(uniform(*sleep_range))

        except Exception as e:
            errors.append({"year": year, "week": week, "url": url, "error": str(e)})
            time.sleep(uniform(*sleep_range))
            continue

    df = pd.DataFrame(rows)

    if not df.empty:
        df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
        df["week"]   = pd.to_numeric(df["week"],   errors="coerce").astype("Int64")
        if "fantasypros_id" in df.columns:
            df = df.drop_duplicates(subset=["season","week","fantasypros_id"], keep="first")
        else:
            df = df.drop_duplicates()

    if save_csv_path:
        df.to_csv(save_csv_path, index=False)

    return df, errors


In [40]:
# pull a sample of the scraped fantasypros data for visual inspection
sample_df, sample_errors = test_scraper_sample(
    wr_scrape_fp_rz_stats,
    year_week_pair=(2024, 1),
    save_csv_path=None  # prevent saving during test
)


Sample shape: (8, 22)
team_abbr unique values: ['TB' 'NE' 'JAC' 'SEA' 'BUF' 'LAC' 'NYJ']


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,REC,TGT,REC PCT,YDS,Y/R,TD,TGT PCT,ATT,PCT,FL,G,FPTS,FPTS/G,ROST %
0,2024,REG,1,12119,Mike Evans,TB,1,Mike Evans(TB),2,3,66.7%,0,9.0,0,75.0%,0,0%,0,1,13.8,13.8,97.6%
1,2024,REG,1,13981,Stefon Diggs,NE,2,Stefon Diggs(NE),3,3,100.0%,0,3.7,0,75.0%,0,0%,0,1,13.1,13.1,83.6%
2,2024,REG,1,23000,Brian Thomas Jr.,JAC,3,Brian Thomas Jr.(JAC),1,1,100.0%,0,14.0,0,100.0%,0,0%,0,1,7.4,7.4,99.1%
3,2024,REG,1,16433,Cooper Kupp,SEA,4,Cooper Kupp(SEA),2,3,66.7%,1,6.5,0,60.0%,1,100.0%,0,1,7.4,7.4,83.5%
4,2024,REG,1,16489,Mack Hollins,NE,5,Mack Hollins(NE),1,1,100.0%,0,11.0,0,25.0%,0,0%,0,1,7.1,7.1,1.5%
5,2024,REG,1,23748,Khalil Shakir,BUF,6,Khalil Shakir(BUF),1,1,100.0%,0,11.0,0,25.0%,0,0%,0,1,7.1,7.1,79.1%
6,2024,REG,1,26122,Ladd McConkey,LAC,7,Ladd McConkey(LAC),1,2,50.0%,0,10.0,0,50.0%,0,0%,0,1,7.0,7.0,98.6%
7,2024,REG,1,17301,Allen Lazard,NYJ,8,Allen Lazard(NYJ),2,3,66.7%,0,3.5,0,50.0%,0,0%,0,1,6.7,6.7,3.3%


In [41]:
# output: a dataframe of WR fantasypros advanced stats
wr_fp_rz_stats_df, fp_rz_errors = wr_scrape_fp_rz_stats(
    year_week_pairs,
    save_csv_path="wr_fp_rz_stats.csv"
)

# 📊 Display shape
print(f"Shape of FantasyPros WR Red Zone Stats DataFrame: {wr_fp_rz_stats_df.shape}")

# 👀 Display first few rows
display(wr_fp_rz_stats_df.head(25))

# 🔍 Display missing value summary
display(check_nulls(wr_fp_rz_stats_df, name="FantasyPros WR Red Zone Stats"))

Shape of FantasyPros WR Red Zone Stats DataFrame: (1120, 22)


Unnamed: 0,season,season_type,week,fantasypros_id,player_name,team_abbr,Rank,Player,REC,TGT,REC PCT,YDS,Y/R,TD,TGT PCT,ATT,PCT,FL,G,FPTS,FPTS/G,ROST %
0,2017,REG,1,13981,Stefon Diggs,NE,1,Stefon Diggs(NE),3,3,100.0%,0,7.3,0,60.0%,0,0%,0,1,14.2,14.2,83.6%
1,2017,REG,1,13081,Bennie Fowler III,FA,2,Bennie Fowler III(FA),2,2,100.0%,0,5.5,0,66.7%,0,0%,0,1,13.1,13.1,0.0%
2,2017,REG,1,13840,Seth Roberts,FA,3,Seth Roberts(FA),1,1,100.0%,0,19.0,0,20.0%,0,0%,0,1,7.9,7.9,0.0%
3,2017,REG,1,16433,Cooper Kupp,SEA,4,Cooper Kupp(SEA),1,1,100.0%,0,18.0,0,100.0%,0,0%,0,1,7.8,7.8,83.5%
4,2017,REG,1,11606,DeAndre Hopkins,BAL,5,DeAndre Hopkins(BAL),2,3,66.7%,0,5.5,0,75.0%,0,0%,0,1,7.1,7.1,17.9%
5,2017,REG,1,16488,Kenny Golladay,FA,6,Kenny Golladay(FA),1,1,100.0%,0,10.0,0,33.3%,0,0%,0,1,7.0,7.0,4.0%
6,2017,REG,1,13894,Amari Cooper,FA,7,Amari Cooper(FA),1,4,25.0%,0,8.0,0,80.0%,0,0%,0,1,6.8,6.8,10.8%
7,2017,REG,1,11215,Marvin Jones Jr.,FA,8,Marvin Jones Jr.(FA),1,1,100.0%,0,6.0,0,33.3%,0,0%,0,1,6.6,6.6,0.0%
8,2017,REG,2,9707,Emmanuel Sanders,FA,1,Emmanuel Sanders(FA),3,3,100.0%,0,7.0,0,75.0%,0,0%,0,1,14.1,14.1,0.0%
9,2017,REG,2,9460,Michael Crabtree,FA,2,Michael Crabtree(FA),2,2,100.0%,0,1.5,0,66.7%,0,0%,0,1,12.3,12.3,0.0%



📊 Missing Value Summary for: FantasyPros WR Red Zone Stats


Unnamed: 0,Missing Count,Missing %


In [42]:
# Listing columns of all three FantasyPros dataframes
basic_stats_cols = wr_fp_basic_stats_df.columns.tolist()
advanced_stats_cols = wr_fp_advanced_stats_df.columns.tolist()
redzone_stats_cols = wr_fp_rz_stats_df.columns.tolist()

# Combine into a dataframe for comparison
comparison_df = pd.DataFrame({
    "Basic Stats": pd.Series(basic_stats_cols),
    "Advanced Stats": pd.Series(advanced_stats_cols),
    "Red Zone Stats": pd.Series(redzone_stats_cols)
})
comparison_df

Unnamed: 0,Basic Stats,Advanced Stats,Red Zone Stats
0,season,season,season
1,season_type,season_type,season_type
2,week,week,week
3,fantasypros_id,fantasypros_id,fantasypros_id
4,player_name,player_name,player_name
5,team_abbr,team_abbr,team_abbr
6,Rank,Rank,Rank
7,Player,Player,Player
8,REC,G,REC
9,TGT,REC,TGT


In [43]:
# ✅ Display the shape of each dataframe before merging
print(f"📊 **Shape of WR Basic Stats DataFrame:** {wr_fp_basic_stats_df.shape}")
print(f"\n📊 **Shape of WR Advanced Stats DataFrame:** {wr_fp_advanced_stats_df.shape}")
print(f"📊 **Shape of WR Red Zone Stats DataFrame:** {wr_fp_rz_stats_df.shape}")

📊 **Shape of WR Basic Stats DataFrame:** (25785, 21)

📊 **Shape of WR Advanced Stats DataFrame:** (1120, 32)
📊 **Shape of WR Red Zone Stats DataFrame:** (1120, 22)


In [46]:
# check nulls
# Apply helper function to each FantasyPros DataFrame
basic_stats_nulls = check_nulls(wr_fp_basic_stats_df, "FantasyPros Basic Stats")
advanced_stats_nulls = check_nulls(wr_fp_advanced_stats_df, "FantasyPros Advanced Stats")
redzone_nulls = check_nulls(wr_fp_rz_stats_df, "FantasyPros Red Zone Stats")

# Concatenate all results (only non-empty will be shown)
combined_nulls = pd.concat(
    [basic_stats_nulls, advanced_stats_nulls, redzone_nulls],
    keys=["Basic Stats", "Advanced Stats", "Red Zone Stats"]
)
combined_nulls


📊 Missing Value Summary for: FantasyPros Basic Stats

📊 Missing Value Summary for: FantasyPros Advanced Stats

📊 Missing Value Summary for: FantasyPros Red Zone Stats


Unnamed: 0,Unnamed: 1,Missing Count,Missing %


In [None]:
### End:fantasypros webscraping ###

In [48]:
# All dataframes - no features and no salary
# wr_ids_weekly_stats_df
# wr_ngs_df
# wr_fp_basic_stats_df
# wr_fp_advanced_stats_df
# wr_fp_rz_stats_df


In [49]:
## Begin: team abbreviation standardization ##

In [50]:
# List columns from each dataframe in memory
ids_weekly_cols = wr_ids_weekly_stats_df.columns.tolist()
ngs_cols = wr_ngs_df.columns.tolist()
fp_basic_cols = wr_fp_basic_stats_df.columns.tolist()
fp_adv_cols = wr_fp_advanced_stats_df.columns.tolist()
fp_rz_cols = wr_fp_rz_stats_df.columns.tolist()

# Combine into a dataframe for side-by-side comparison
comparison_df = pd.DataFrame({
    "IDs & Weekly Stats": pd.Series(ids_weekly_cols),
    "NGS Stats": pd.Series(ngs_cols),
    "FantasyPros Basic": pd.Series(fp_basic_cols),
    "FantasyPros Adv": pd.Series(fp_adv_cols),
    "FantasyPros RZ": pd.Series(fp_rz_cols)
})

comparison_df

Unnamed: 0,IDs & Weekly Stats,NGS Stats,FantasyPros Basic,FantasyPros Adv,FantasyPros RZ
0,season,season,season,season,season
1,season_type,season_type,season_type,season_type,season_type
2,week,week,week,week,week
3,player_id,player_display_name,fantasypros_id,fantasypros_id,fantasypros_id
4,player_name,player_position,player_name,player_name,player_name
5,position,team_abbr,team_abbr,team_abbr,team_abbr
6,position_group,avg_cushion,Rank,Rank,Rank
7,recent_team,avg_separation,Player,Player,Player
8,fantasy_points,avg_intended_air_yards,REC,G,REC
9,fantasy_points_ppr,percent_share_of_intended_air_yards,TGT,REC,TGT


In [None]:
## Next Tasks
# standardize the team abbr across all dataframes
# prep to merge with dfs (using modified names "_dfs")
# there will be 10 dataframes total (dfs and non-dfs)
# build the feature engineering list
# data normalization in preparation to build the features
# build the features

In [51]:
def show_team_uniques():
    def norm(s):
        return (s.astype('string')
                 .str.strip()
                 .str.upper()
                 .str.replace(".", "", regex=False)
                 .str.replace(" ", "", regex=False))

    out = {}
    if 'recent_team' in wr_ids_weekly_stats_df.columns:
        out['wr_ids_weekly_stats_df.recent_team'] = sorted(norm(wr_ids_weekly_stats_df['recent_team'].dropna()).unique())
    if 'team_abbr' in wr_ngs_df.columns:
        out['wr_ngs_df.team_abbr'] = sorted(norm(wr_ngs_df['team_abbr'].dropna()).unique())

    for k, v in out.items():
        print(f"\n{k} ({len(v)} uniques):")
        print(v)

# Call to preview only
show_team_uniques()



wr_ids_weekly_stats_df.recent_team (32 uniques):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_ngs_df.team_abbr (32 uniques):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']


In [52]:
# standardize team abbreviations
# output: none

# Create One Unified Map
# Canonical full team name → abbreviation
full_name_to_abbr = {
    'Arizona Cardinals': 'ARI', 'Atlanta Falcons': 'ATL', 'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF', 'Carolina Panthers': 'CAR', 'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN', 'Cleveland Browns': 'CLE', 'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN', 'Detroit Lions': 'DET', 'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU', 'Indianapolis Colts': 'IND', 'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC', 'Las Vegas Raiders': 'LV', 'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA', 'Miami Dolphins': 'MIA', 'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE', 'New Orleans Saints': 'NO', 'New York Giants': 'NYG',
    'New York Jets': 'NYJ', 'Philadelphia Eagles': 'PHI', 'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF', 'Seattle Seahawks': 'SEA', 'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN', 'Washington Commanders': 'WAS', 'Washington Football Team': 'WAS'

}

# Abbreviation fix-ups
abbr_fix_map = {
    'ARZ': 'ARI', 'TBB': 'TB', 'NEP': 'NE', 'GBP': 'GB', 'KCC': 'KC',
    'SFF': 'SF', 'LAR': 'LA', 'NOS': 'NO', 'JAC': 'JAX', 'LVR': 'LV'
}


# abbreviation fixes
team_map = {
    "OAK": "LV",
    "SD": "LAC",
    "STL": "LAR",
    "WSH": "WAS",
    "LA": "LAR"
}

In [53]:
# create team abbreviation mapping logic
def standardize_team_abbr(df, col, mapping):
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.upper()
            .str.replace(".", "", regex=False)
            .str.replace(" ", "", regex=False)
            .replace(mapping)
        )

In [54]:
# apply the team abbreviation mapping logic
standardize_team_abbr(wr_ids_weekly_stats_df, "recent_team", team_map)
standardize_team_abbr(wr_ngs_df, "team_abbr", team_map)

# Preview uniques after standardization
print("wr_ids_weekly_stats_df.recent_team uniques:")
print(sorted(wr_ids_weekly_stats_df["recent_team"].unique()))

print("\nwr_ngs_df.team_abbr uniques:")
print(sorted(wr_ngs_df["team_abbr"].unique()))

wr_ids_weekly_stats_df.recent_team uniques:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_ngs_df.team_abbr uniques:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']


In [None]:
## End: team abbreviation standardization ##

In [None]:
## Begin: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##
# ** Files must be in the local directory ** NFL-20xx-DFS-Dataset.xlsx

In [None]:
# create a helper function to clean the the dfs salary data
# no output

def clean_column_dfs(col):
    """
    Cleans and flattens multi-index column names for DFS salary Excel files:
    - Joins tuples if multi-index
    - Removes special characters
    - Normalizes spaces
    - Converts to lowercase for matching
    """
    if isinstance(col, tuple):
        col = ' '.join(str(x) for x in col if x)

    return (
        str(col)
        .replace('\n', ' ')
        .replace('(', '')
        .replace(')', '')
        .replace('"', '')
        .replace('#', '')
        .replace('$', '')
        .replace('/', '')
        .replace('-', ' ')
        .strip()
        .lower()
        .replace('  ', ' ')
        .replace('   ', ' ')
    )

In [None]:
# Read the excel files
filepath = 'NFL-2024-DFS-Dataset.xlsx'
dfs_raw = pd.read_excel(filepath, header=[0, 1])
original_row_count = len(dfs_raw)

dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]
dfs_raw.head()  # Optional preview

In [None]:
# helper function does the following:
# Fanduel and Draft Kings player salary data for all positions (QB, RB, TE, WR, DST)
# creates and combines the dataframes for years 2017 - present 
# performs data validation checks
# output: none

def process_single_year(filepath, year):
    
    # Step 1: Read and clean the headers
    dfs_raw = pd.read_excel(filepath, header=[0, 1])
    original_row_count = len(dfs_raw)
    dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]

    # ✅ Step 2: Extract only relevant columns using cleaned names
    expected_cols = {
        'player': 'game information player dst',
        'week': 'game information week',
        'date': 'game information date',
        'player_id': 'game information player id',
        'dk_position': 'position draftkings',
        'fd_position': 'position fanduel',
        'dk_salary': 'salary for draftkings classic contests',
        'fd_salary': 'salary for fanduel full roster contests'
    }

    # Subset the dataframe using cleaned column names
    dfs_subset = dfs_raw[list(expected_cols.values())].copy()

    # Rename them to simple identifiers for internal use
    dfs_subset.columns = list(expected_cols.keys())

    
    dfs_subset['date'] = pd.to_datetime(dfs_subset['date'])

    team_abbreviation_mapping = {
        'NWE': 'NE',
        'SFO': 'SF',
        'OAK': 'LV',
        'KAN': 'KC',
        'TAM': 'TB',
        'NOR': 'NO',
        'LAR': 'LA',
        'GNB': 'GB'
    }
    mask_dst = dfs_subset['dk_position'] == 'DST'
    dfs_subset.loc[mask_dst, 'player_id'] = dfs_subset.loc[mask_dst, 'player_id'].replace(team_abbreviation_mapping)

    def fix_season(row):
        game_year = row['date'].year
        game_month = row['date'].month
        game_week = row['week']
        
        if game_month in [1, 2]:
            if (game_year <= 2020 and game_week >= 18):
                return game_year - 1
            elif (game_year >= 2021 and game_week >= 19):
                return game_year - 1
            elif (game_year >= 2021 and game_week == 18):
                return game_year - 1
        return game_year

    dfs_subset['season'] = dfs_subset.apply(fix_season, axis=1)

    # 🔥 Track NaNs before dropping
    season_nulls_before = dfs_subset['season'].isna().sum()

    dfs_subset = dfs_subset.dropna(subset=['season'])
    dfs_subset['season'] = dfs_subset['season'].astype(int)

    season_nulls_after = dfs_subset['season'].isna().sum()

    print(f"🔎 Season NaN rows dropped: {season_nulls_before}")
    print(f"Remaining NaN rows (should be 0): {season_nulls_after}")

    dfs_subset = dfs_subset.drop(columns=['date'])

    dfs_subset['dk_salary'] = pd.to_numeric(dfs_subset['dk_salary'], errors='coerce')
    dfs_subset['fd_salary'] = pd.to_numeric(dfs_subset['fd_salary'], errors='coerce')
    dfs_subset = dfs_subset.dropna(subset=['dk_salary', 'fd_salary'])
    dfs_subset['dk_salary'] = dfs_subset['dk_salary'].astype(int)
    dfs_subset['fd_salary'] = dfs_subset['fd_salary'].astype(int)
    dfs_subset['week'] = dfs_subset['week'].astype(int)
    
    dfs_subset = dfs_subset[['season', 'week', 'player_id', 'player', 'dk_position', 'fd_position', 'dk_salary', 'fd_salary']]
    
    unique_weeks = dfs_subset['week'].nunique()
    min_week = dfs_subset['week'].min()
    max_week = dfs_subset['week'].max()
    expected_weeks = 21 if int(year) <= 2020 else 22

    print(f"\nProcessing file: {filepath}")
    print(f"Original rows in xlsx file: {original_row_count}")
    print(f"Number of players with no salary data found in xlsx: {original_row_count - len(dfs_subset)}")
    print(f"Rows in csv file after dropping NaNs: {len(dfs_subset)}")

    if original_row_count - (original_row_count - len(dfs_subset)) == len(dfs_subset):
        print("✅ Salary Validation passed: Counts match after dropping NaNs.")
        salary_validation = 'Passed'
    else:
        print("❌ Salary Validation failed: Counts mismatch!")
        salary_validation = 'Failed'

    print(f"Weeks detected: {min_week} to {max_week}")
    print(f"Total unique weeks found: {unique_weeks}")
    print("🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.")

    if unique_weeks == expected_weeks or unique_weeks == expected_weeks - 1:
        print(f"✅ Week Validation passed: {unique_weeks} weeks found (expected {expected_weeks}).\n")
        week_validation = 'Passed'
    else:
        print(f"❌ Week Validation failed: {unique_weeks} weeks found, expected {expected_weeks}.\n")
        week_validation = 'Failed'
    
    return dfs_subset, {
        'year': int(year),
        'original_rows': original_row_count,
        'nan_rows': original_row_count - len(dfs_subset),
        'rows_after_drop': len(dfs_subset),
        'min_week': min_week,
        'max_week': max_week,
        'unique_weeks': unique_weeks,
        'expected_weeks': expected_weeks,
        'salary_validation': salary_validation,
        'week_validation': week_validation
    }

In [None]:
# ** dataframe of Fanduel and Draft Kings Salaries FOR all positions ** 

# main control flow implements the helper function 
# output: combined dataframe and csv files of all seasons fanduel draft kings player salary data 
# output: data validation checks

# Find all matching files
file_list = sorted(glob.glob('NFL-*-DFS-Dataset.xlsx'))

# Handle if no files found
if not file_list:
    print("❌ No xlsx files detected.\nPlease download and place the BigDataBall NFL DFS Excel files into the same directory as this Jupyter Notebook file.")
else:
    # Process each file
    all_years_dfs = []
    validation_records = []
    file_years = []

    for file in file_list:
        year = file.split('-')[1]  # Extract year from filename
        file_years.append(int(year))
        
        year_df, validation_info = process_single_year(file, year)

        # ** csv file ***
        # Save per-year CSV
        # year_df.to_csv(f'nfl_fd_dk_salary_{year}.csv', index=False)
        
        # Append to master list
        all_years_dfs.append(year_df)
        validation_records.append(validation_info)

    # Create validation summary DataFrame
    validation_summary_df = pd.DataFrame(validation_records)
    print("\n📋 Validation Summary:")
    display(validation_summary_df)

    # Combine all years into one big dataframe
    nfl_fd_dk_salary_combined = pd.concat(all_years_dfs, ignore_index=True)

    # Determine latest season dynamically
    current_season = max(file_years)

    # Export final combined CSV
    final_filename = f'nfl_fd_dk_salary_2017_{current_season}.csv'

    # If the file already exists, create a backup
    if os.path.exists(final_filename):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_filename = f'nfl_fd_dk_salary_2017_{current_season}_backup_{timestamp}.csv'
        shutil.copy(final_filename, backup_filename)
        print(f"🛡️ Backup created: {backup_filename}")


    # *** csv file ***
    # nfl_fd_dk_salary_combined.to_csv(final_filename, index=False)

    print(f"\n✅ Final combined CSV saved as: {final_filename}")

    # Display a quick preview
    display(nfl_fd_dk_salary_combined.head())

In [None]:
# team abbreviations validations check
def validate_team_defenses(dfs_df, year_label, baseline_set):
    dfs_team_defense = dfs_df.loc[dfs_df['dk_position'] == 'DST', 'player_id']
    dfs_nfl_tm_abbr = set(dfs_team_defense.dropna().unique())
    
    difference_dfs = dfs_nfl_tm_abbr - baseline_set
    difference_baseline = baseline_set - dfs_nfl_tm_abbr
    
    print(f"\nValidating Team Defenses for {year_label}:")
    if not difference_dfs and not difference_baseline:
        print(f"✅ Team defenses match for {year_label}")
    else:
        print(f"❌ Team mismatch detected for {year_label}")
        print(f"Teams only in DFS: {difference_dfs}")
        print(f"Teams only in Baseline: {difference_baseline}")

# Validate each yearly dataframe
for df, info in zip(all_years_dfs, validation_records):
    validate_team_defenses(df, year_label=info['year'], baseline_set=baseline_nfl_tm_abbr)

# Validate the full combined dataframe
validate_team_defenses(nfl_fd_dk_salary_combined, year_label='Combined', baseline_set=baseline_nfl_tm_abbr)

In [None]:
# ** Final WR dataframe of Fanduel and Draft Kings player Salaries ** 

# output: 

# Determine current season based on available data
current_season = nfl_fd_dk_salary_combined['season'].max()

# Extract WR players where DraftKings position is WR
wr_fd_dk_salary_2017_current_df = nfl_fd_dk_salary_combined.loc[
    nfl_fd_dk_salary_combined['dk_position'] == 'WR'
]

# *** csv file ***
# wr_csv_filename = f'wr_fd_dk_salary_2017_{current_season}.csv'
# wr_fd_dk_salary_2017_current_df.to_csv(wr_csv_filename, index=False)

print(f"✅ WR DFS dataframe created and saved as {wr_csv_filename}")

# Optional: Display a quick preview
display(wr_fd_dk_salary_2017_current_df.head())


In [None]:
## End: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##

In [None]:
### Begin: Data Normalization and Merge Process ###

In [None]:
# helper function for normalization methods
def normalize_dataframe(df: pd.DataFrame, type_map: dict = None) -> pd.DataFrame:
    """
    Clean and normalize dataframe:
    - Strip whitespace from object columns
    - Lowercase common ID/name fields
    - Replace special NA tokens with np.nan
    - Cast to types from provided type_map
    """
    df = df.copy()

    # Replace common string-based missing values with np.nan
    df.replace(["N/A", "NA", "-", ""], np.nan, inplace=True)

    # Strip whitespace from string/object columns
    for col in df.select_dtypes(include='object').columns:
        try:
            df[col] = df[col].astype(str).str.strip()
        except Exception as e:
            print(f"⚠️ Could not strip column '{col}': {e}")

    # Lowercase likely ID/name fields (if present)
    for key in ['player', 'player_name', 'player_id', 'fantasypros_id', 'FantasyPros_ID', 'merge_name']:
        if key in df.columns:
            df[key] = df[key].astype(str).str.lower()

    # Apply type conversions as defined in type_map
    if type_map:
        for col, dtype in type_map.items():
            if col in df.columns:
                try:
                    df[col] = df[col].astype(dtype)
                except Exception as e:
                    print(f"⚠️ Warning: could not convert column '{col}' to {dtype}. Reason: {e}")

    return df

In [None]:
def clean_percentage_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    df = df.copy()
    for col in columns:
        if col in df.columns:
            try:
                # Convert to string first (safe for NaNs)
                df[col] = df[col].astype(str).str.replace('%', '', regex=False)
                df[col] = df[col].astype(float)
            except Exception as e:
                print(f"⚠️ Could not clean and convert '{col}': {e}")
    return df

In [None]:
def inspect_dataframe_types(df: pd.DataFrame, name: str = "DataFrame") -> None:
    print(f"📋 Inspecting: {name}")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    summary = pd.DataFrame({
        'dtype': df.dtypes,
        'na_count': df.isna().sum()
    }).sort_index()
    display(summary)

In [None]:
# normalize data types for merging 
# WR Baseline type map
type_map_wr = {
    'season': 'int32',
    'week': 'int32',
    'player_id': 'string',
    'fantasypros_id': 'string',
    # Add others as needed...
}

# FP Advanced Stats type map
type_map_fp_adv = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'Player': 'string',
    'G': 'int32',
    'REC': 'float32',
    'YDS': 'float32',
    'YBC': 'float32',
    'AIR': 'float32',
    'YAC': 'float32',
    'YACON': 'float32',
    'BRKTKL': 'float32',
    'TGT': 'float32',
    'CATCHABLE': 'float32',
    'DROP': 'float32',
    'RZ TGT': 'float32',
    '10+ YDS': 'float32',
    '20+ YDS': 'float32',
    '30+ YDS': 'float32',
    '40+ YDS': 'float32',
    '50+ YDS': 'float32',
}

# FP Fantasy Points + Rostered type map
type_map_fp_fpts = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'FPTS': 'float32',
    'ROST': 'string',
}

# FP Redzone type map
type_map_fp_rz = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'REC PCT_rz': 'float32',
    'REC_rz': 'int32',
    'TGT PCT_rz': 'float32',
    'TGT_rz': 'int32',
    'Y/R_rz': 'float32',
}


In [None]:
### End: Data Normalization and Merge Process ###

In [None]:
### Begin: Feature Engineering ###

In [None]:
# drop unnecessary columns
# output: updated dataframe and csv file with modified columns
# List of columns to drop
cols_to_drop = [
    'player_id', 'player_name', 'position_group', 'mfl_id', 'sportradar_id',
    'fantasypros_id', 'pff_id', 'sleeper_id', 'nfl_id', 'espn_id', 'yahoo_id',
    'fleaflicker_id', 'cbs_id', 'pfr_id', 'cfbref_id', 'rotowire_id',
    'rotoworld_id', 'ktc_id', 'stats_id', 'stats_global_id', 'fantasy_data_id',
    'swish_id', 'merge_name', 'player_gsis_id', 'player_first_name',
    'player_last_name', 'player_short_name', 'game_type', 'team', 'opponent',
    'pfr_player_name', 'pfr_player_id', 'player', 'team_abbr_y', 'merge_key'
]

# Initial state
initial_shape = wr_nfl_py_fp_odds_salary_merged.shape
initial_col_count = initial_shape[1]

print("Before modification:")
print(f"Shape: {initial_shape}")
print(f"Total Columns: {initial_col_count}")

# Drop only existing columns
cols_existing = [col for col in cols_to_drop if col in wr_nfl_py_fp_odds_salary_merged.columns]
wr_nfl_py_fp_odds_salary_merged_mod_cols = wr_nfl_py_fp_odds_salary_merged.drop(columns=cols_existing)

# Post-drop validation
final_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
final_col_count = final_shape[1]
dropped_count = len(cols_existing)
expected_final_col_count = initial_col_count - dropped_count

print("\nAfter modification:")
print(f"Shape: {final_shape}")
print(f"Total Columns After Drop: {final_col_count}")
print(f"Number of Columns Dropped: {dropped_count}")
print(f"Expected Final Column Count: {expected_final_col_count}")
print(f"Column Count Validation Passed: {final_col_count == expected_final_col_count}")

# Confirm none of the dropped columns remain
all_dropped = all(col not in wr_nfl_py_fp_odds_salary_merged_mod_cols.columns for col in cols_to_drop)
print(f"All Specified Columns Successfully Dropped: {all_dropped}")

# ** csv file **
output_csv_path = "wr_nfl_py_fp_odds_salary_merged_mod_cols.csv"
# wr_nfl_py_fp_odds_salary_merged_mod_cols.to_csv(output_csv_path, index=False)
# print(f"\nModified dataframe saved to {output_csv_path}")

In [None]:
# reorder columns
# Define new column order
priority_cols = [
    'season', 'season_type', 'week', 'name', 'position', 'recent_team',
    'fpts', 'dk_salary', 'fd_salary', 'rost',
    'opponent_abbr', 'home', 'role', 'result', 'score', 'spread', 'over_under'
]

# Capture original state
original_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
original_columns = wr_nfl_py_fp_odds_salary_merged_mod_cols.columns.tolist()

print("Original Shape:", original_shape)
print("Original First 20 Columns:", original_columns[:20])

# Build final column order
remaining_cols = [col for col in original_columns if col not in priority_cols]
final_col_order = priority_cols + remaining_cols

# Reorder the columns
wr_nfl_py_fp_odds_salary_merged_mod_cols = wr_nfl_py_fp_odds_salary_merged_mod_cols[final_col_order]

# Post-reorder validation
new_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
new_columns = wr_nfl_py_fp_odds_salary_merged_mod_cols.columns.tolist()

print("\nNew Shape:", new_shape)
print("New First 20 Columns:", new_columns[:20])

# Validation Checks
print("\n✅ Shape Integrity:", original_shape == new_shape)
print("✅ Column Count Matches:", len(original_columns) == len(new_columns))
print("✅ All Columns Preserved:", set(original_columns) == set(new_columns))

# ** csv file **
output_reordered_csv = "wr_nfl_py_fp_odds_salary_merged_mod_cols_reordered.csv"
# wr_nfl_py_fp_odds_salary_merged_mod_cols.to_csv(output_reordered_csv, index=False)
# print(f"\nReordered dataframe saved to {output_reordered_csv}")

In [None]:
# Apply Rolling Averages and Aggregates - 3,5,and 7 week averages
# Ouput: updated dataframe with aggregates (optional csv file)

# Start from sorted copy of the main DF
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols
    .sort_values(by=['name', 'season', 'week'])
    .reset_index(drop=True)
)

# Feature map: full column -> short prefix
feature_map = {
    'targets': 'tgt',
    'receptions': 'rec',
    'receiving_yards': 'rec_yds',
    'receiving_air_yards': 'rec_air_yards',
    'fpts': 'fpts'
}

windows = [3, 5, 7]

# Apply rolling averages and lag features
for full_col, short in feature_map.items():
    # Group once
    grouped = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.groupby(['name', 'season'])

    # Rolling averages using apply (preserves group boundaries)
    for window in windows:
        col_name = f"{short}_{window}wk_avg"
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[col_name] = grouped[full_col].apply(
            lambda x: x.rolling(window=window, min_periods=window).mean().shift(1)
        ).reset_index(drop=True)

    # Lag feature (1-game lookback)
    lag_col = f"{short}_lag_1"
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[lag_col] = grouped[full_col].shift(1).reset_index(drop=True)

# Final integrity check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv")


In [None]:
# Validation tests of aggregates
# output: there should be no aggregates prior to week 4

# Structural Check — No aggregates in first 3 weeks of a season
def check_early_aggregates(df, cols, earliest_week=4):
    early = df[df['week'] < earliest_week]
    violations = early[cols].notna().sum()
    print("🚨 Aggregates present before week", earliest_week)
    print(violations[violations > 0])

# Boundary Check — Rolling aggregates must reset per season
def check_season_boundaries(df, col_prefix):
    errors = []
    for short in col_prefix:
        col_name = f'{short}_3wk_avg'
        season_transitions = df.groupby(['name'])['season'].diff().fillna(0)
        cross_season_rows = df[season_transitions != 0]
        if cross_season_rows[col_name].notna().any():
            errors.append(col_name)
    if errors:
        print("❌ Rolling values leaked across seasons:", errors)
    else:
        print("✅ No cross-season leakage detected.")

# Shape check
def check_shape(df, expected_cols_added):
    print("✅ Final shape:", df.shape)
    print("✅ Final columns:", df.columns[-expected_cols_added:])

# === Apply Checks ===
rolling_cols = [f"{short}_{w}wk_avg" for short in ['tgt', 'rec', 'rec_yds', 'rec_air_yards', 'fpts'] for w in [3, 5, 7]]
lag_cols = [f"{short}_lag_1" for short in ['tgt', 'rec', 'rec_yds', 'rec_air_yards', 'fpts']]
check_early_aggregates(wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg, rolling_cols)
check_season_boundaries(wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg, ['tgt', 'rec', 'rec_yds', 'rec_air_yards', 'fpts'])
check_shape(wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg, expected_cols_added=len(rolling_cols + lag_cols))

In [None]:
# Add Trend Features (deltas) - recent performance over / under (3wk, 5wk, 7wk) averages
# output: updated dataframe with deltas (optional csv file)

# new dataframe
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.copy()

# Column map
feature_map = {
    'targets': 'tgt',
    'receptions': 'rec',
    'receiving_yards': 'rec_yds',
    'receiving_air_yards': 'rec_air_yards',
    'fpts': 'fpts'
}

windows = [3, 5, 7]

# Create delta (deviation from trend) features
for full_col, short in feature_map.items():
    for window in windows:
        avg_col = f"{short}_{window}wk_avg"
        delta_col = f"{short}_{window}wk_delta"
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[delta_col] = (
            wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[full_col] -
            wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[avg_col]
        )

# Summary and export
delta_cols = [f"{short}_{w}wk_delta" for short in feature_map.values() for w in windows]
print("✅ Added delta columns:", delta_cols)
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv")


In [None]:
# Create boolean columns
# output: updated dataframe with booleans (optional csv file)

# Start from the previous trend-enhanced dataframe
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.copy()

# Define boolean columns as 0/1 integers
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['tgt_ge_5'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['targets'] >= 5).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['tgt_ge_7'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['targets'] >= 7).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['rec_ge_5'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['receptions'] >= 5).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['rec_ge_7'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['receptions'] >= 7).astype(int)

wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['target_share_ge_20'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['target_share'] >= 0.2).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['target_share_ge_30'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['target_share'] >= 0.3).astype(int)

wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['over_100_yds'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['receiving_yards'] >= 100).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['double_digit_targets'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['targets'] >= 10).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['boom_week'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['fpts'] >= 20).astype(int)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['bust_week'] = (wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['fpts'] < 5).astype(int)

# If 'home' is already boolean, convert to int
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['is_home_game'] = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['home'].astype(int)

# Final shape and column check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.shape)
print("✅ New boolean columns added.")

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv")


In [None]:
# *** Dataframe: this can be used as the final dataframe but the filename is long ***

# split the over / under column into two columns: o_u and total
# output: updated dataframe with o_u and total columns (optional csv file) 

# Copy from final boolean-enriched dataframe
wr_nfl_py_fp_odds_salary_features = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.copy()

# Extract 'O' or 'U' and map to "over"/"under"
wr_nfl_py_fp_odds_salary_features['O_U'] = (
    wr_nfl_py_fp_odds_salary_features['over_under']
    .str[0]
    .map({'O': 'over', 'U': 'under'})
)

# Extract the numeric total (handles int or float)
wr_nfl_py_fp_odds_salary_features['Total'] = (
    wr_nfl_py_fp_odds_salary_features['over_under']
    .str.extract(r'(\d+\.?\d*)')[0]
    .astype(float)
)

# Validation
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_features.shape)
print("✅ Sample 'O_U' values:", wr_nfl_py_fp_odds_salary_features['O_U'].unique())
print("✅ Sample 'Total' values:", wr_nfl_py_fp_odds_salary_features['Total'].dropna().unique()[:5])

# ** csv file **
# wr_nfl_py_fp_odds_salary_features.to_csv(
#     "wr_nfl_py_fp_odds_salary_features.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_features.csv")

In [None]:
### End: Feature Engineering ###

In [None]:
### Begin: Final Dataframe ###

In [None]:
### *** Final Dataframe *** ###

# Rename final dataframe
wr_nfl_df_sorted_final = wr_nfl_py_fp_odds_salary_features.copy()

# **csv file **
wr_nfl_df_sorted_final.to_csv(
    "wr_nfl_df_sorted_final.csv",
    index=False,
    float_format="%.2f"
)

print("✅ Final dataframe saved as 'wr_nfl_df_sorted_final.csv'")
print("✅ Final shape:", wr_nfl_df_sorted_final.shape)

In [None]:
### End: Final Dataframe ###

In [None]:
### Begin: Final Dataframe Summary Statistics ###

In [None]:
# Final Dataframe Summary Statistics
# Output: csv output of the dataframe summary statistics on data types and missing values

# Total columns and datatypes
total_cols = wr_nfl_df_sorted_final.shape[1]
dtypes_summary = wr_nfl_df_sorted_final.dtypes.value_counts().sort_values(ascending=False)
dtypes_percent = (dtypes_summary / total_cols * 100).round(2)

print("🧠 Data Type Distribution (by count and %):")
for dtype, count in dtypes_summary.items():
    print(f"{str(dtype):15} {count:>3} columns  ({dtypes_percent[dtype]:>5.1f}%)")

# Object-type columns
object_cols = wr_nfl_df_sorted_final.select_dtypes(include='object').columns.tolist()
print(f"\n🚨 Object-type columns found ({len(object_cols)} total / {total_cols} columns):")
print(object_cols)

# Missing value summary by count and % of rows
row_count = len(wr_nfl_df_sorted_final)
na_counts = wr_nfl_df_sorted_final.isna().sum()
na_percent = (na_counts / row_count * 100).round(2)
na_summary = pd.DataFrame({'Missing': na_counts, 'Percent': na_percent})
na_summary = na_summary[na_summary['Missing'] > 0].sort_values(by='Percent', ascending=False)

print(f"\n⚠️ Missing Value Summary (non-zero only) — Top {len(na_summary)} columns:")
display(na_summary)

# ** csv file **
na_summary.to_csv("wr_df_final_summary_stats.csv")

In [None]:
### End: Final Dataframe Summary Statistics ###