In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# the NFL python library seem to not work on Tuesday probably due to updates (not confirmed)

In [3]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the number of weeks if the NFL adds regular season games to the schedule
# Update the season start date each year

In [4]:
## REQUIRED ACTIONS - Include in a README doc ## 
# ensure the directories exists in your local directory
# ./csv_files 
# ./dfs_files

In [5]:
## Required installations
!pip install nfl_data_py
# Ensure all required packages are installed within the notebook
# !pip install --quiet nfl_data_py
!pip install --quiet rapidfuzz




In [6]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime, timedelta
import nfl_data_py as nfl
import os
import re
import time
import random
from random import sample, uniform, seed
import io
from rapidfuzz import fuzz, process
import numpy as np
import hashlib
import shutil

In [7]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [8]:
## Begin: time calculators ##

In [9]:
# modify the number of weeks if the NFL adds regular season games to the schedule
# Update this each year
season_start_date = datetime(2025, 9, 4)  
REG_WEEKS = 18

In [10]:
def get_current_week(today=None):
    if today is None:
        today = datetime.now()
    delta_days = (today.date() - season_start_date.date()).days
    week_num = (delta_days // 7) + 1
    return max(0, week_num)  # clamp to 0 for preseason

In [11]:
# 1=preseason, 2=regular, 3=playoffs
def get_season_type(current_week, reg_weeks=REG_WEEKS):
    if current_week == 0:
        return 1
    elif current_week <= reg_weeks:
        return 2
    else:
        return 3

current_year = season_start_date.year
current_week = get_current_week()
season_type = get_season_type(current_week, REG_WEEKS)

print("current_year:", current_year)
print("current_week:", current_week)
print("season_type:", season_type)

current_year: 2025
current_week: 0
season_type: 1


In [12]:
# Returns a list of years to pull.
def get_year_range(current_year, current_week, start_year=2017, reg_weeks=18):
    if current_week == 0:
        return list(range(start_year, current_year))
    else:
        return list(range(start_year, current_year + 1))

In [13]:
# Builds (year, week) pairs for scraping.
# - 2017–2020: weeks 1–17
# - 2021+: weeks 1–18
def generate_year_week_combinations(start_year, end_year, current_year=None, current_week=None):
    combos = []
    for year in range(start_year, end_year + 1):
        max_regular = 17 if year <= 2020 else 18

        # Handle the current year
        if current_year is not None and year == current_year:
            if current_week is None or current_week == 0:
                # preseason: don't add any weeks for this year
                continue
            upper = min(max_regular, int(current_week))
        else:
            upper = max_regular

        combos.extend([(year, wk) for wk in range(1, upper + 1)])
    return combos

In [14]:
# define the year, week, and season type
current_year = season_start_date.year
current_week = get_current_week()
season_type  = get_season_type(current_week, REG_WEEKS)

years = get_year_range(current_year, current_week, start_year=2017)
year_week_pairs = generate_year_week_combinations(
    start_year=years[0] if years else 2017,
    end_year=years[-1] if years else current_year - 1,
    current_year=current_year,
    current_week=current_week
)


In [15]:
# test years and weeks to pull

# Years list should exclude current year during preseason
print("years:", years)                      # expect no 2025 when current_week == 0
print("contains current_year?", current_year in years)

# Year-week pairs should have no current_year and valid week caps
yrs_in_pairs = sorted({y for (y, _) in year_week_pairs})
print("years in pairs:", yrs_in_pairs)
print("pairs count:", len(year_week_pairs))
print("first 5:", year_week_pairs[:5])
print("last 5:", year_week_pairs[-5:])

# Validate week caps per year (≤17 for <=2020, ≤18 otherwise)
violations = []
for y in yrs_in_pairs:
    max_reg = 17 if y <= 2020 else 18
    max_week = max(w for (yy, w) in year_week_pairs if yy == y)
    if max_week > max_reg:
        violations.append((y, max_week, max_reg))
print("week-cap violations:", violations)   # expect []

# Ensure current year is COMPLETELY absent during preseason
has_current_year = any(yy == current_year for (yy, _) in year_week_pairs)
print("current year present in pairs?", has_current_year)  # expect False


years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
contains current_year? False
years in pairs: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
pairs count: 140
first 5: [(2017, 1), (2017, 2), (2017, 3), (2017, 4), (2017, 5)]
last 5: [(2024, 14), (2024, 15), (2024, 16), (2024, 17), (2024, 18)]
week-cap violations: []
current year present in pairs? False


In [16]:
## End: time calculators ##

In [17]:
# returns a dataframe summarizing missing values for a given dataFrame.
def check_nulls(df, name=None):
    null_counts = df.isnull().sum()
    null_percent = (null_counts / len(df)).round(4)
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': null_percent
    })
    summary = summary[summary['Missing Count'] > 0].sort_values(by='Missing %', ascending=False)
    
    if name:
        print(f"\n📊 Missing Value Summary for: {name}")
    return summary


In [18]:
# save a dataFrame to the ./csv_files directory with the given filename

# Ensure the output directory exists
os.makedirs("./csv_files", exist_ok=True)

def save_csv(df, filename, index=False, float_format=None):
    if not filename.endswith('.csv'):
        filename += '.csv'
    path = os.path.join("./csv_files", filename)
    df.to_csv(path, index=index, float_format=float_format)
    print(f"Saved: {path}")


In [19]:
### Begin: Python NFL Library Dataframe ###

In [20]:
# Validate years to pull from the nfl library
print("years:", years)                         
assert current_year not in years

wr_weekly = nfl.import_weekly_data(years=years, downcast=True)
print(wr_weekly[['season','week']].agg(['min','max']))
print("unique seasons:", sorted(wr_weekly['season'].unique()))

# sanity: no week beyond league cap per year
violations = []
for y, g in wr_weekly.groupby('season'):
    # Regular season cap: 17 weeks (<=2020) or 18 weeks (>=2021)
    # Postseason cap: up to week 22 (including Super Bowl)
    max_allowed = 22
    max_week = int(g['week'].max())
    if max_week > max_allowed:
        violations.append((y, max_week, f"> {max_allowed} not allowed"))
print("week-cap violations:", violations)  # expect []



years: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Downcasting floats.
     season  week
min    2017     1
max    2024    22
unique seasons: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
week-cap violations: []


In [21]:
# display all available columns in the nfl python API for weekly stats
nfl.see_weekly_cols()

Index(['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr'], dtype='object')

In [22]:
# define the base columns. 
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team',
    'fantasy_points', 'fantasy_points_ppr'
]

In [23]:
# Import the player IDs from nfl.import_ids() - without parameters
ids_data = nfl.import_ids()

# Drop the unnecessary columns
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year', 
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username', 
    'height', 'weight', 'college', 'db_season'
]
ids_data = ids_data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [24]:
# import the weekly data from nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [25]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# Merge the two dataframes on 'player_id' and 'gsis_id'
# Align column names for merging
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})  
id_dataframe = pd.merge(weekly_data, ids_data, on='player_id', how='inner')

# Assign the resulting dataframe to a variable
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [26]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
wide_receiver_ids = all_players_id_data[all_players_id_data['position'] == 'WR']

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17384, 31)


In [27]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Pull WR-specific columns from weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns  # Include keys for merging
)

# Merge WR-specific stats with wide_receiver_ids
wr_ids_weekly_stats_df = pd.merge(
    wide_receiver_ids,
    wr_stats,
    on=['player_id', 'season', 'week'],  # Ensure correct alignment
    how='inner'
)

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wr_ids_weekly_stats_df.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats_df.shape[0] == wide_receiver_ids.shape[0]}"
)

# display the df
display(wr_ids_weekly_stats_df)

Downcasting floats.
Shape of merged dataframe: (17384, 46)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,fantasy_points,fantasy_points_ppr,sleeper_id,nfl_id,stats_global_id,mfl_id,swish_id,yahoo_id,fantasy_data_id,rotowire_id,ktc_id,pff_id,stats_id,fleaflicker_id,rotoworld_id,merge_name,espn_id,fantasypros_id,cbs_id,cfbref_id,pfr_id,sportradar_id,name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7.4,13.400000,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,2.1,5.100000,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,20.9,33.900002,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,9.2,13.200000,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,5.1,11.100000,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17379,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,0.4,1.400000,11617.0,,0.0,16636,1215291.0,40944.0,,17777.0,1607.0,,40944.0,,,malachi corley,4613104.0,26023.0,3162613.0,malachi-corley-1,CorlMa00,bae59933-8b94-4837-990e-f0a4ced3cdbb,Malachi Corley,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119
17380,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,1.8,1.800000,11617.0,,0.0,16636,1215291.0,40944.0,,17777.0,1607.0,,40944.0,,,malachi corley,4613104.0,26023.0,3162613.0,malachi-corley-1,CorlMa00,bae59933-8b94-4837-990e-f0a4ced3cdbb,Malachi Corley,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,
17381,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,0.2,1.200000,11617.0,,0.0,16636,1215291.0,40944.0,,17777.0,1607.0,,40944.0,,,malachi corley,4613104.0,26023.0,3162613.0,malachi-corley-1,CorlMa00,bae59933-8b94-4837-990e-f0a4ced3cdbb,Malachi Corley,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294
17382,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,1.0,2.000000,11617.0,,0.0,16636,1215291.0,40944.0,,17777.0,1607.0,,40944.0,,,malachi corley,4613104.0,26023.0,3162613.0,malachi-corley-1,CorlMa00,bae59933-8b94-4837-990e-f0a4ced3cdbb,Malachi Corley,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176


In [28]:
# csv file
# save_csv(wr_ids_weekly_stats_df, "wr_ids_weekly_stats_df")

In [29]:
# check for nulls
# updated null value analysis using helper function
null_summary_wr_ids_weekly = check_nulls(wr_ids_weekly_stats_df, name="WR Weekly Stats")

# Filter out columns containing '_id'
null_summary_wr_ids_weekly = null_summary_wr_ids_weekly[~null_summary_wr_ids_weekly.index.str.contains('_id')]

display(null_summary_wr_ids_weekly)


📊 Missing Value Summary for: WR Weekly Stats


Unnamed: 0,Missing Count,Missing %
racr,327,0.0188
receiving_epa,285,0.0164
air_yards_share,285,0.0164
target_share,285,0.0164
wopr,285,0.0164


In [30]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Pull NGS receiving data for the specified years
wr_ngs_df = nfl.import_ngs_data('receiving', years)

# Exclude rows where 'week' == 0 and filter for 'WR' position in one step
wr_ngs_df = wr_ngs_df[(wr_ngs_df['week'] != 0) & (wr_ngs_df['player_position'] == 'WR')]

# Drop unnecessary columns (already in the nfl python baseline dataframe)
wr_ngs_df = wr_ngs_df.drop(columns=['player_jersey_number'], errors='ignore')

# Display the resulting dataframe
print(f"Shape of NGS WR DataFrame after dropping columns: {wr_ngs_df.shape}")
display(wr_ngs_df)

Shape of NGS WR DataFrame after dropping columns: (8249, 22)


Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,REG,1,Ryan Grant,WR,WAS,9.936667,2.894592,4.410000,7.154639,4,6,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,REG,1,Martavis Bryant,WR,PIT,8.300000,4.122054,12.688333,33.327496,2,6,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,REG,1,Jamison Crowder,WR,WAS,7.655000,3.177793,10.540000,19.949707,3,7,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,REG,1,Nelson Agholor,WR,PHI,7.423750,2.462620,10.463750,20.274656,6,8,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,REG,1,John Brown,WR,ARI,7.360000,2.751526,13.422222,28.208481,4,9,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13323,2024,POST,23,Xavier Worthy,WR,KC,8.160000,4.959113,14.276250,44.737358,8,8,100.000000,157.0,2,6.250000,6.154624,0.095376,00-0039894,Xavier,Worthy,X.Worthy
13324,2024,POST,23,DeAndre Hopkins,WR,KC,7.676000,3.446231,11.974000,23.451761,2,5,40.000000,18.0,1,0.565000,0.798474,-0.233474,00-0030564,DeAndre,Hopkins,D.Hopkins
13325,2024,POST,23,DeVonta Smith,WR,PHI,7.470000,2.221577,14.752000,40.028219,4,5,80.000000,69.0,1,0.340000,0.600076,-0.260076,00-0036912,DeVonta,Smith,D.Smith
13327,2024,POST,23,Marquise Brown,WR,KC,4.943333,3.302615,6.356667,14.939872,2,6,33.333333,15.0,0,2.450000,3.533891,-1.083891,00-0035662,Marquise,Brown,M.Brown


In [31]:
# csv file
# save_csv(wr_ngs_df, "wr_ngs_df")

In [32]:
print(wr_ngs_df.columns.tolist())


['season', 'season_type', 'week', 'player_display_name', 'player_position', 'team_abbr', 'avg_cushion', 'avg_separation', 'avg_intended_air_yards', 'percent_share_of_intended_air_yards', 'receptions', 'targets', 'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation', 'player_gsis_id', 'player_first_name', 'player_last_name', 'player_short_name']


In [33]:
# updated null analysis using helper function
wr_ngs_null_summary_df = check_nulls(wr_ngs_df, name="NGS WR Stats")
display(wr_ngs_null_summary_df)



📊 Missing Value Summary for: NGS WR Stats


Unnamed: 0,Missing Count,Missing %
avg_expected_yac,42,0.0051
avg_yac_above_expectation,42,0.0051
avg_yac,33,0.004
yards,28,0.0034
avg_cushion,2,0.0002


In [34]:
### End: Python NFL Library Dataframe ###

In [35]:
### Begin:fantasypros webscraping ###

In [36]:
# scrape FantasyPros weekly WR basic stats 

wr_fp_basic_stats_columns = [
    "rec", "tgt", "rec_yds", "rec_ypc", "rec_lg", "rec_20+", "rec_td",
    "rush_att", "rush_yds", "rush_td",
    "fum", "games", "fpts", "fpts_per_game", "rost_pct"
]


In [37]:
# scrape FantasyPros weekly WR basic stats 
def parse_fp_basic_row(row, year, week):
    try:
        tds = row.find_all("td")
        anchor = tds[1].find("a")

     
        # FantasyPros ID        
        # Extract class list
        classes = anchor.get("class", [])
        fp_id = None
        
        # Find the one that starts with 'fp-id-' and extract the number
        for cls in classes:
            if cls.startswith("fp-id-"):
                fp_id = cls.split("fp-id-")[-1]
                break


        # Player name and team
        player_text = anchor.get_text(strip=True)
        team_text = tds[1].get_text(strip=True).replace(player_text, "").strip()
        team = team_text.strip("()")

        # Stat values (from REC to ROST)
        stat_values = [td.get_text(strip=True) for td in tds[2:]]

        # Ensure correct length
        if len(stat_values) != len(wr_fp_basic_stats_columns):
            print(f"⚠️ Row stat length mismatch: {len(stat_values)} found, {len(wr_fp_basic_stats_columns)} expected.")
            return None

        # Build dict
        row_data = {
            "year": year,
            "week": week,
            "fantasypros_id": fp_id,
            "player_name": player_text,
            "team": team
        }

        for col, val in zip(wr_fp_basic_stats_columns, stat_values):
            row_data[col] = val

        return row_data
    
    except Exception as e:
        print(f"❌ Error parsing row: {e}")
        return None


In [38]:
# scrape FantasyPros weekly WR basic stats 
def wr_scrape_fp_basic_stats(
    year_week_pairs,
    sleep_range=(0.35, 0.85),
    timeout=20
):

    url_tpl = "https://www.fantasypros.com/nfl/stats/wr.php?year={y}&week={w}&range=week"
    all_rows = []

    for year, week in year_week_pairs:
        url = url_tpl.format(y=year, w=week)
        # print(f"Scraping: {year}-W{week} → {url}")

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            print(f"❌ Failed to fetch {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        player_rows = soup.find_all("tr", class_=lambda x: x and x.startswith("mpb-player-"))
        
        for row in player_rows:
            row_data = parse_fp_basic_row(row, year, week)  # parse player row stats
            if row_data:
                all_rows.append(row_data)

        time.sleep(uniform(*sleep_range))

    return pd.DataFrame(all_rows)


In [39]:
# test scraping results
# ** Do a visual test and use the LLM to assist with snapshots **

# one year-week pair only
year_week_pairs = [(2024, 1)]

# output: a dataframe of WR basic stats
wr_fp_basic_stats_df = wr_scrape_fp_basic_stats(year_week_pairs)
fp_basic_errors = None  # placeholder for consistency

print(f"Shape: {wr_fp_basic_stats_df.shape}")
display(wr_fp_basic_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_basic_stats_df, name="FantasyPros WR Basic Stats"))


Shape: (427, 20)


Unnamed: 0,year,week,fantasypros_id,player_name,team,rec,tgt,rec_yds,rec_ypc,rec_lg,rec_20+,rec_td,rush_att,rush_yds,rush_td,fum,games,fpts,fpts_per_game,rost_pct
0,2024,1,23020,Jayden Reed,GB,4,6,138,34.5,70,3,1,1,33,1,0,1,29.1,29.1,77.4%
1,2024,1,17301,Allen Lazard,NYJ,6,9,89,14.8,36,2,2,0,0,0,0,1,20.9,20.9,1.9%
2,2024,1,23677,Jameson Williams,DET,5,9,121,24.2,52,3,1,1,13,0,0,1,19.4,19.4,91.5%
3,2024,1,15802,Tyreek Hill,MIA,7,12,130,18.6,80,2,1,0,0,0,0,1,19.0,19.0,98.6%
4,2024,1,23019,Xavier Worthy,KC,2,3,47,23.5,35,1,1,1,21,1,0,1,18.8,18.8,95.6%
5,2024,1,23791,Alec Pierce,IND,3,3,125,41.7,60,2,1,0,0,0,0,1,18.5,18.5,8.8%
6,2024,1,12119,Mike Evans,TB,5,6,61,12.2,24,1,2,0,0,0,0,1,18.1,18.1,98.0%
7,2024,1,16433,Cooper Kupp,SEA,14,21,110,7.9,21,2,1,2,10,0,0,1,18.0,18.0,84.5%
8,2024,1,18218,A.J. Brown,PHI,5,10,119,23.8,67,2,1,0,0,0,0,1,17.9,17.9,99.5%
9,2024,1,13981,Stefon Diggs,NE,6,6,33,5.5,10,0,2,1,6,0,0,1,15.9,15.9,85.1%



📊 Missing Value Summary for: FantasyPros WR Basic Stats


Unnamed: 0,Missing Count,Missing %


In [40]:
## Dataframe ##
# scrape FantasyPros weekly WR basic stats 
# full pull: 2017 to current year/week

current_year = datetime.now().year
current_week = get_current_week()
year_week_pairs = generate_year_week_combinations(start_year=2017, end_year=current_year, current_year=current_year, current_week=current_week)

# output: full WR basic stats dataframe
wr_fp_basic_stats_df = wr_scrape_fp_basic_stats(year_week_pairs)
fp_basic_errors = None  # placeholder

print(f"Shape: {wr_fp_basic_stats_df.shape}")
display(wr_fp_basic_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_basic_stats_df, name="FantasyPros WR Basic Stats"))


Shape: (25726, 20)


Unnamed: 0,year,week,fantasypros_id,player_name,team,rec,tgt,rec_yds,rec_ypc,rec_lg,rec_20+,rec_td,rush_att,rush_yds,rush_td,fum,games,fpts,fpts_per_game,rost_pct
0,2017,1,13981,Stefon Diggs,NE,7,8,93,13.3,30,1,2,1,-6,0,0,1,20.7,20.7,85.1%
1,2017,1,15802,Tyreek Hill,MIA,7,8,133,19.0,75,0,1,2,5,0,0,1,19.8,19.8,98.6%
2,2017,1,16488,Kenny Golladay,FA,4,7,69,17.3,45,0,2,0,0,0,0,1,18.9,18.9,4.0%
3,2017,1,9808,Antonio Brown,FA,11,11,182,16.5,50,0,0,0,0,0,0,1,18.2,18.2,1.0%
4,2017,1,13429,Adam Thielen,CAR,9,10,157,17.4,44,2,0,0,0,0,0,1,15.7,15.7,36.4%
5,2017,1,13969,Nelson Agholor,FA,6,8,86,14.3,58,0,1,0,0,0,0,1,14.6,14.6,1.0%
6,2017,1,13081,Bennie Fowler III,FA,3,4,21,7.0,10,0,2,0,0,0,0,1,14.1,14.1,0.0%
7,2017,1,9320,Jordy Nelson,FA,7,8,79,11.3,32,0,1,0,0,0,0,1,13.9,13.9,0.1%
8,2017,1,16433,Cooper Kupp,SEA,4,6,76,19.0,28,2,1,0,0,0,0,1,13.6,13.6,84.5%
9,2017,1,13894,Amari Cooper,FA,5,13,62,12.4,23,1,1,0,0,0,0,1,12.2,12.2,7.2%



📊 Missing Value Summary for: FantasyPros WR Basic Stats


Unnamed: 0,Missing Count,Missing %


In [41]:
# scsv file
# save_csv(wr_fp_basic_stats_df, "wr_fp_basic_stats_df")


In [42]:
# scrape FantasyPros weekly WR advanced stats
wr_fp_advanced_stats_columns = [
    "games", "rec", "yds", "ypr", "ybc", "ybc_per_rec", "air", "air_per_rec",
    "yac", "yac_per_rec", "yacon", "yacon_per_rec", "brktkl", "tgt", "% TM",
    "catchable", "drop", "rz_tgt", "10_plus", "20_plus", "30_plus",
    "40_plus", "50_plus", "lng"
]


In [43]:
# scrape FantasyPros weekly WR advanced stats

# parse a single row of advanced WR stats
def parse_fp_adv_row(row, year, week):
    try:
        tds = row.find_all("td")
        anchor = tds[1].find("a")

        # FantasyPros ID
        classes = anchor.get("class", [])
        fp_id = None
        for cls in classes:
            if cls.startswith("fp-id-"):
                fp_id = cls.split("fp-id-")[-1]
                break

        # Player name
        player_name = anchor.text.strip()

        # Player name and team
        player_text = anchor.get_text(strip=True)
        team_text = tds[1].get_text(strip=True).replace(player_text, "").strip()
        team = team_text.strip("()")

        # Get stat values (skipping rank and player)
        stats = [td.text.strip() for td in tds[2:]]

        if len(stats) != len(wr_fp_advanced_stats_columns):
            print(f"⚠️ Stat length mismatch at year={year}, week={week}")
            print(f"  Parsed stats length: {len(stats)}")
            print(f"  Expected: {len(wr_fp_advanced_stats_columns)}")
            print(f"  Raw values: {stats}")
            return None


        return {
            "year": year,
            "week": week,
            "fantasypros_id": fp_id,
            "player_name": player_name,
            "team": team,
            **dict(zip(wr_fp_advanced_stats_columns, stats))
        }

    except Exception as e:
        print(f"Row parsing error (year={year}, week={week}): {e}")
        return None

In [44]:
# scrape FantasyPros weekly WR advanced stats
def wr_scrape_fp_adv_stats(
    year_week_pairs,
    save_csv_path=None,
    sleep_range=(0.35, 0.85),
    timeout=20
):
    url_tpl = "https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={y}&week={w}&range=week&type=reg&mode=pergame"

    all_data = []

    for (y, w) in year_week_pairs:
        # print(f"Scraping: {y}-W{w} → {url_tpl.format(y=y, w=w)}")

        try:
            response = requests.get(url_tpl.format(y=y, w=w), timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")
            table = soup.find("table", {"id": "data"})
            rows = table.tbody.find_all("tr")

            for row in rows:
                parsed = parse_fp_adv_row(row, y, w)
                if parsed:
                    all_data.append(parsed)

            # polite scraping
            time.sleep(random.uniform(*sleep_range))

        except Exception as e:
            print(f"Error fetching {y}-W{w}: {e}")

    # build the DataFrame
    return pd.DataFrame(all_data)


In [45]:
# test scraping results
# ** Do a visual test and use the LLM to assist with snapshots **

# one year-week pair only
year_week_pairs = [(2024, 1)]

# output: a dataframe of WR advanced stats
wr_fp_advanced_stats_df = wr_scrape_fp_adv_stats(year_week_pairs)
fp_adv_errors = None  # placeholder for consistency

print(f"Shape: {wr_fp_advanced_stats_df.shape}")
display(wr_fp_advanced_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_advanced_stats_df, name="FantasyPros WR Advanced Stats"))



Shape: (8, 29)


Unnamed: 0,year,week,fantasypros_id,player_name,team,games,rec,yds,ypr,ybc,ybc_per_rec,air,air_per_rec,yac,yac_per_rec,yacon,yacon_per_rec,brktkl,tgt,% TM,catchable,drop,rz_tgt,10_plus,20_plus,30_plus,40_plus,50_plus,lng
0,2024,1,23020,Jayden Reed,GB,1,4,138,34.5,83,20.8,104,26.0,55,13.8,34,8.5,1,6,18.8%,4,0,0,3,3,2,1,1,70
1,2024,1,17301,Allen Lazard,NYJ,1,6,89,14.8,60,10.0,91,15.2,29,4.8,7,1.2,0,9,31.0%,6,0,3,3,2,1,0,0,36
2,2024,1,23677,Jameson Williams,DET,1,5,121,24.2,58,11.6,127,25.4,63,12.6,5,1.0,1,9,32.1%,5,0,1,3,3,2,1,1,52
3,2024,1,15802,Tyreek Hill,MIA,1,7,130,18.6,46,6.6,143,20.4,84,12.0,13,1.9,1,12,33.3%,7,0,2,5,2,1,1,1,80
4,2024,1,23019,Xavier Worthy,KC,1,2,47,23.5,24,12.0,37,18.5,23,11.5,1,0.5,0,3,11.1%,2,0,1,2,1,1,0,0,35
5,2024,1,23791,Alec Pierce,IND,1,3,125,41.7,119,39.7,119,39.7,6,2.0,0,0.0,0,3,15.8%,3,0,0,2,2,2,2,2,60
6,2024,1,12119,Mike Evans,TB,1,5,61,12.2,53,10.6,64,12.8,8,1.6,1,0.2,0,6,20.0%,5,0,3,3,1,0,0,0,24
7,2024,1,16433,Cooper Kupp,SEA,1,14,110,7.9,69,4.9,153,10.9,41,2.9,21,1.5,1,21,43.8%,15,1,3,4,2,0,0,0,21



📊 Missing Value Summary for: FantasyPros WR Advanced Stats


Unnamed: 0,Missing Count,Missing %


In [46]:
## Dataframe ##
# scrape FantasyPros weekly WR advanced stats
# full pull: 2017 to current year/week

current_year = datetime.now().year
current_week = get_current_week()
year_week_pairs = generate_year_week_combinations(start_year=2017, end_year=current_year, current_year=current_year, current_week=current_week)

# output: full WR advanced stats dataframe
wr_fp_advanced_stats_df = wr_scrape_fp_adv_stats(year_week_pairs)
fp_adv_errors = None  # placeholder

print(f"Shape: {wr_fp_advanced_stats_df.shape}")
display(wr_fp_advanced_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_advanced_stats_df, name="FantasyPros WR Advanced Stats"))


Shape: (1120, 29)


Unnamed: 0,year,week,fantasypros_id,player_name,team,games,rec,yds,ypr,ybc,ybc_per_rec,air,air_per_rec,yac,yac_per_rec,yacon,yacon_per_rec,brktkl,tgt,% TM,catchable,drop,rz_tgt,10_plus,20_plus,30_plus,40_plus,50_plus,lng
0,2017,1,13981,Stefon Diggs,NE,1,7,93,13.3,76,10.9,0,0.0,17,2.4,6,0.9,0,8,25.0%,7,0,2,5,2,1,0,0,30
1,2017,1,15802,Tyreek Hill,MIA,1,7,133,19.0,78,11.1,0,0.0,55,7.9,1,0.1,0,8,23.5%,7,0,0,4,1,1,1,1,75
2,2017,1,16488,Kenny Golladay,FA,1,4,69,17.3,64,16.0,0,0.0,5,1.3,0,0.0,0,7,17.9%,5,1,1,2,1,1,1,0,45
3,2017,1,9808,Antonio Brown,FA,1,11,182,16.5,90,8.2,0,0.0,92,8.4,50,4.5,0,11,30.6%,11,0,0,7,2,2,1,1,50
4,2017,1,13429,Adam Thielen,CAR,1,9,157,17.4,92,10.2,0,0.0,65,7.2,17,1.9,0,10,31.3%,10,0,0,4,4,2,1,0,44
5,2017,1,13969,Nelson Agholor,FA,1,6,86,14.3,51,8.5,0,0.0,35,5.8,18,3.0,0,8,21.1%,6,0,1,3,1,1,1,1,58
6,2017,1,13081,Bennie Fowler III,FA,1,3,21,7.0,21,7.0,0,0.0,0,0.0,0,0.0,0,4,14.3%,3,0,2,1,0,0,0,0,10
7,2017,1,9320,Jordy Nelson,FA,1,7,79,11.3,73,10.4,0,0.0,6,0.9,1,0.1,0,8,19.0%,7,0,0,3,1,1,0,0,32
8,2017,2,9460,Michael Crabtree,FA,1,6,80,13.3,50,8.3,0,0.0,30,5.0,0,0.0,0,6,21.4%,6,0,0,3,3,0,0,0,26
9,2017,2,11548,Jermaine Kearse,FA,1,4,64,16.0,57,14.3,0,0.0,7,1.8,0,0.0,0,5,20.8%,4,0,0,3,1,1,0,0,34



📊 Missing Value Summary for: FantasyPros WR Advanced Stats


Unnamed: 0,Missing Count,Missing %


In [47]:
# csv file
# save_csv(wr_fp_advanced_stats_df, "wr_fp_advanced_stats_df")


In [48]:
# scrape WR fantasypros redzone stats 
wr_fp_rz_stats_columns = [
    "rec_rz", "tgt_rz", "rec_pct_rz", "yds_rz", "yds_per_rec_rz", "td_rz", "tgt_pct_rz",
    "rush_att_rz", "rush_yds_rz", "rush_td_rz", "rush_pct_rz", "fl_rz",
    "games", "fpts_rz", "fpts_pg_rz", "rost_pct"
]


In [49]:
# scrape FantasyPros weekly WR redzone stats
def parse_fp_rz_row(row, year, week):
    try:
        tds = row.find_all("td")
        anchor = tds[1].find("a")

        # FantasyPros ID
        classes = anchor.get("class", [])
        fp_id = None
        for cls in classes:
            if cls.startswith("fp-id-"):
                fp_id = cls.split("fp-id-")[-1]
                break

        # Player name and team
        player_text = anchor.get_text(strip=True)
        team_text = tds[1].get_text(strip=True).replace(player_text, "").strip()
        team = team_text.strip("()")

        # Stat values (from REC_RZ to FPTS_RZ)
        stat_values = [td.get_text(strip=True) for td in tds[2:]]

        # Ensure correct length
        if len(stat_values) != len(wr_fp_rz_stats_columns):
            print(f"⚠️ Row stat length mismatch: {len(stat_values)} found, {len(wr_fp_rz_stats_columns)} expected.")
            return None

        # Build dict
        row_data = {
            "year": year,
            "week": week,
            "fantasypros_id": fp_id,
            "player_name": player_text,
            "team": team
        }

        for col, val in zip(wr_fp_rz_stats_columns, stat_values):
            row_data[col] = val

        return row_data

    except Exception as e:
        print(f"❌ Error parsing row: {e}")
        return None


In [50]:
# scrape WR fantasypros redzone stats
def wr_scrape_fp_rz_stats(
    year_week_pairs,
    save_csv_path=None,
    sleep_range=(0.35, 0.85),
    timeout=20
):
    url_tpl = "https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={y}&week={w}&range=week"
    records = []

    for (year, week) in year_week_pairs:
        # print(f"Fetching {year}-W{week} ...", end=" ")

        try:
            url = url_tpl.format(y=year, w=week)
            time.sleep(uniform(*sleep_range))
            res = requests.get(url, timeout=timeout)
            if res.status_code != 200:
                print(f"❌ Status code: {res.status_code}")
                continue

            soup = BeautifulSoup(res.text, "html.parser")
            table = soup.find("table")
            if not table:
                print("❌ No table found.")
                continue

            rows = table.find("tbody").find_all("tr")
            for row in rows:
                parsed = parse_fp_rz_row(row, year, week)
                if parsed:
                    records.append(parsed)

            # print(f"✅ {len(records)} rows")

        except Exception as e:
            print(f"❌ Error: {e}")

    df = pd.DataFrame(records)

    return df


In [51]:
# test scraping results
# ** Do a visual test and use the LLM to assist with snapshots **

# one year-week pair only
year_week_pairs = [(2024, 1)]

# output: a dataframe of WR redzone stats
wr_fp_rz_stats_df = wr_scrape_fp_rz_stats(year_week_pairs)
fp_rz_errors = None  # placeholder for consistency

print(f"Shape: {wr_fp_rz_stats_df.shape}")
display(wr_fp_rz_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_rz_stats_df, name="FantasyPros WR Redzone Stats"))



Shape: (8, 21)


Unnamed: 0,year,week,fantasypros_id,player_name,team,rec_rz,tgt_rz,rec_pct_rz,yds_rz,yds_per_rec_rz,td_rz,tgt_pct_rz,rush_att_rz,rush_yds_rz,rush_td_rz,rush_pct_rz,fl_rz,games,fpts_rz,fpts_pg_rz,rost_pct
0,2024,1,12119,Mike Evans,TB,2,3,66.7%,18,9.0,2,75.0%,0,0,0,0%,0,1,13.8,13.8,98.0%
1,2024,1,13981,Stefon Diggs,NE,3,3,100.0%,11,3.7,2,75.0%,0,0,0,0%,0,1,13.1,13.1,85.1%
2,2024,1,23000,Brian Thomas Jr.,JAC,1,1,100.0%,14,14.0,1,100.0%,0,0,0,0%,0,1,7.4,7.4,99.5%
3,2024,1,16433,Cooper Kupp,SEA,2,3,66.7%,13,6.5,1,60.0%,1,1,0,100.0%,0,1,7.4,7.4,84.5%
4,2024,1,16489,Mack Hollins,NE,1,1,100.0%,11,11.0,1,25.0%,0,0,0,0%,0,1,7.1,7.1,0.9%
5,2024,1,23748,Khalil Shakir,BUF,1,1,100.0%,11,11.0,1,25.0%,0,0,0,0%,0,1,7.1,7.1,78.1%
6,2024,1,26122,Ladd McConkey,LAC,1,2,50.0%,10,10.0,1,50.0%,0,0,0,0%,0,1,7.0,7.0,99.0%
7,2024,1,17301,Allen Lazard,NYJ,2,3,66.7%,7,3.5,1,50.0%,0,0,0,0%,0,1,6.7,6.7,1.9%



📊 Missing Value Summary for: FantasyPros WR Redzone Stats


Unnamed: 0,Missing Count,Missing %


In [52]:
## Dataframe ##
# scrape FantasyPros weekly WR redzone stats
# full pull: 2017 to current year/week

current_year = datetime.now().year
current_week = get_current_week()
year_week_pairs = generate_year_week_combinations(start_year=2017, end_year=current_year, current_year=current_year, current_week=current_week)

# output: full WR redzone stats dataframe
wr_fp_rz_stats_df = wr_scrape_fp_rz_stats(year_week_pairs)
fp_rz_errors = None  # placeholder

print(f"Shape: {wr_fp_rz_stats_df.shape}")
display(wr_fp_rz_stats_df.head(10))

# null summary
display(check_nulls(wr_fp_rz_stats_df, name="FantasyPros WR Redzone Stats"))


Shape: (1120, 21)


Unnamed: 0,year,week,fantasypros_id,player_name,team,rec_rz,tgt_rz,rec_pct_rz,yds_rz,yds_per_rec_rz,td_rz,tgt_pct_rz,rush_att_rz,rush_yds_rz,rush_td_rz,rush_pct_rz,fl_rz,games,fpts_rz,fpts_pg_rz,rost_pct
0,2017,1,13981,Stefon Diggs,NE,3,3,100.0%,22,7.3,2,60.0%,0,0,0,0%,0,1,14.2,14.2,85.1%
1,2017,1,13081,Bennie Fowler III,FA,2,2,100.0%,11,5.5,2,66.7%,0,0,0,0%,0,1,13.1,13.1,0.0%
2,2017,1,13840,Seth Roberts,FA,1,1,100.0%,19,19.0,1,20.0%,0,0,0,0%,0,1,7.9,7.9,0.0%
3,2017,1,16433,Cooper Kupp,SEA,1,1,100.0%,18,18.0,1,100.0%,0,0,0,0%,0,1,7.8,7.8,84.5%
4,2017,1,11606,DeAndre Hopkins,BAL,2,3,66.7%,11,5.5,1,75.0%,0,0,0,0%,0,1,7.1,7.1,15.9%
5,2017,1,16488,Kenny Golladay,FA,1,1,100.0%,10,10.0,1,33.3%,0,0,0,0%,0,1,7.0,7.0,4.0%
6,2017,1,13894,Amari Cooper,FA,1,4,25.0%,8,8.0,1,80.0%,0,0,0,0%,0,1,6.8,6.8,7.2%
7,2017,1,11215,Marvin Jones Jr.,FA,1,1,100.0%,6,6.0,1,33.3%,0,0,0,0%,0,1,6.6,6.6,0.0%
8,2017,2,9707,Emmanuel Sanders,FA,3,3,100.0%,21,7.0,2,75.0%,0,0,0,0%,0,1,14.1,14.1,0.0%
9,2017,2,9460,Michael Crabtree,FA,2,2,100.0%,3,1.5,2,66.7%,0,0,0,0%,0,1,12.3,12.3,0.0%



📊 Missing Value Summary for: FantasyPros WR Redzone Stats


Unnamed: 0,Missing Count,Missing %


In [53]:
# csv file
# save_csv(wr_fp_rz_stats_df, "wr_fp_rz_stats_df")


In [54]:
# Listing columns of all three FantasyPros dataframes
basic_stats_cols = wr_fp_basic_stats_df.columns.tolist()
advanced_stats_cols = wr_fp_advanced_stats_df.columns.tolist()
redzone_stats_cols = wr_fp_rz_stats_df.columns.tolist()

# Combine into a dataframe for comparison
comparison_df = pd.DataFrame({
    "Basic Stats": pd.Series(basic_stats_cols),
    "Advanced Stats": pd.Series(advanced_stats_cols),
    "Red Zone Stats": pd.Series(redzone_stats_cols)
})
comparison_df

Unnamed: 0,Basic Stats,Advanced Stats,Red Zone Stats
0,year,year,year
1,week,week,week
2,fantasypros_id,fantasypros_id,fantasypros_id
3,player_name,player_name,player_name
4,team,team,team
5,rec,games,rec_rz
6,tgt,rec,tgt_rz
7,rec_yds,yds,rec_pct_rz
8,rec_ypc,ypr,yds_rz
9,rec_lg,ybc,yds_per_rec_rz


In [55]:
# Display the shape of each dataframe
print(f"📊 **Shape of WR Basic Stats DataFrame:** {wr_fp_basic_stats_df.shape}")
print(f"\n📊 **Shape of WR Advanced Stats DataFrame:** {wr_fp_advanced_stats_df.shape}")
print(f"📊 **Shape of WR Red Zone Stats DataFrame:** {wr_fp_rz_stats_df.shape}")

📊 **Shape of WR Basic Stats DataFrame:** (25726, 20)

📊 **Shape of WR Advanced Stats DataFrame:** (1120, 29)
📊 **Shape of WR Red Zone Stats DataFrame:** (1120, 21)


In [56]:
# check nulls
# Apply helper function to each FantasyPros DataFrame
basic_stats_nulls = check_nulls(wr_fp_basic_stats_df, "FantasyPros Basic Stats")
advanced_stats_nulls = check_nulls(wr_fp_advanced_stats_df, "FantasyPros Advanced Stats")
redzone_nulls = check_nulls(wr_fp_rz_stats_df, "FantasyPros Red Zone Stats")

# Concatenate all results (only non-empty will be shown)
combined_nulls = pd.concat(
    [basic_stats_nulls, advanced_stats_nulls, redzone_nulls],
    keys=["Basic Stats", "Advanced Stats", "Red Zone Stats"]
)
combined_nulls


📊 Missing Value Summary for: FantasyPros Basic Stats

📊 Missing Value Summary for: FantasyPros Advanced Stats

📊 Missing Value Summary for: FantasyPros Red Zone Stats


Unnamed: 0,Unnamed: 1,Missing Count,Missing %


In [57]:
### End:fantasypros webscraping ###

In [58]:
## Begin: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##
# ** Files must be in the local directory ** NFL-20xx-DFS-Dataset.xlsx

In [59]:
# create a helper function to clean the the dfs salary data
#    - Cleans and flattens multi-index column names for DFS salary Excel files:
#    - Joins tuples if multi-index
#    - Removes special characters
#    - Normalizes spaces
#    - Converts to lowercase for matching
def clean_column_dfs(col):
    if isinstance(col, tuple):
        col = ' '.join(str(x) for x in col if x)

    return (
        str(col)
        .replace('\n', ' ')
        .replace('(', '')
        .replace(')', '')
        .replace('"', '')
        .replace('#', '')
        .replace('$', '')
        .replace('/', '')
        .replace('-', ' ')
        .strip()
        .lower()
        .replace('  ', ' ')
        .replace('   ', ' ')
    )

In [60]:
# Read the excel files
filepath = './dfs_files/NFL-2024-DFS-Dataset.xlsx'
dfs_raw = pd.read_excel(filepath, header=[0, 1])
original_row_count = len(dfs_raw)

dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]
dfs_raw.head()  # Optional preview

Unnamed: 0,game information bigdataball dataset,game information game id,game information date,game information week,game information start time et,game information player id,game information player dst,game information team,game information opponent,game information venue rh,position draftkings,position fanduel,salary for draftkings classic contests,salary for fanduel full roster contests,fantasy points scored draftkings,fantasy points scored fanduel
0,,,NaT,,,,,,,,#N/A MEANING: The game was not included in any...,,,,,
1,NFL 2024 Regular Season,45540-BAL@KAN,2024-09-05,1.0,8:20 PM,BAL,Baltimore Ravens,Baltimore Ravens,Kansas City Chiefs,Road,DST,D,2700.0,3900.0,4.0,4.0
2,NFL 2024 Regular Season,45540-BAL@KAN,2024-09-05,1.0,8:20 PM,nelson-agholor,Nelson Agholor,Baltimore Ravens,Kansas City Chiefs,Road,WR,WR,3800.0,4200.0,1.6,1.1
3,NFL 2024 Regular Season,45540-BAL@KAN,2024-09-05,1.0,8:20 PM,derrick-henry,Derrick Henry,Baltimore Ravens,Kansas City Chiefs,Road,RB,RB,6900.0,7900.0,10.6,10.6
4,NFL 2024 Regular Season,45540-BAL@KAN,2024-09-05,1.0,8:20 PM,patrick-ricard,Patrick Ricard,Baltimore Ravens,Kansas City Chiefs,Road,RB,RB,4000.0,4100.0,0.0,0.0


In [61]:
# helper function does the following:
# Fanduel and Draft Kings player salary data for all positions (QB, RB, TE, WR, DST)
# creates and combines the dataframes for years 2017 - present 
# performs data validation checks

def create_DFS_dataframe(filepath, year):
    
    # Step 1: Read and clean the headers
    dfs_raw = pd.read_excel(filepath, header=[0, 1])
    original_row_count = len(dfs_raw)
    dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]

    # ✅ Step 2: Extract only relevant columns using cleaned names
    expected_cols = {
        'player': 'game information player dst',
        'week': 'game information week',
        'date': 'game information date',
        'player_id': 'game information player id',
        'team': 'game information team', 
        'opponent': 'game information opponent',
        'dk_position': 'position draftkings',
        'fd_position': 'position fanduel',
        'dk_salary': 'salary for draftkings classic contests',
        'fd_salary': 'salary for fanduel full roster contests',
        'dk_fpts': 'fantasy points scored draftkings',
        'fd_fpts': 'fantasy points scored fanduel'
    }

    # Subset the dataframe using cleaned column names
    dfs_subset = dfs_raw[list(expected_cols.values())].copy()

    # Rename them to simple identifiers for internal use
    dfs_subset.columns = list(expected_cols.keys())

    
    dfs_subset['date'] = pd.to_datetime(dfs_subset['date'])

    team_abbreviation_mapping = {
        'NWE': 'NE',
        'SFO': 'SF',
        'OAK': 'LV',
        'KAN': 'KC',
        'TAM': 'TB',
        'NOR': 'NO',
        'LAR': 'LA',
        'GNB': 'GB'
    }
    mask_dst = dfs_subset['dk_position'] == 'DST'
    dfs_subset.loc[mask_dst, 'player_id'] = dfs_subset.loc[mask_dst, 'player_id'].replace(team_abbreviation_mapping)

    def fix_season(row):
        game_year = row['date'].year
        game_month = row['date'].month
        game_week = row['week']
        
        if game_month in [1, 2]:
            if (game_year <= 2020 and game_week >= 18):
                return game_year - 1
            elif (game_year >= 2021 and game_week >= 19):
                return game_year - 1
            elif (game_year >= 2021 and game_week == 18):
                return game_year - 1
        return game_year

    dfs_subset['season'] = dfs_subset.apply(fix_season, axis=1)

    # 🔥 Track NaNs before dropping
    season_nulls_before = dfs_subset['season'].isna().sum()

    dfs_subset = dfs_subset.dropna(subset=['season'])
    dfs_subset['season'] = dfs_subset['season'].astype(int)

    season_nulls_after = dfs_subset['season'].isna().sum()

    print(f"🔎 Season NaN rows dropped: {season_nulls_before}")
    print(f"Remaining NaN rows (should be 0): {season_nulls_after}")

    dfs_subset = dfs_subset.drop(columns=['date'])

    dfs_subset['dk_salary'] = pd.to_numeric(dfs_subset['dk_salary'], errors='coerce')
    dfs_subset['fd_salary'] = pd.to_numeric(dfs_subset['fd_salary'], errors='coerce')
    dfs_subset = dfs_subset.dropna(subset=['dk_salary', 'fd_salary'])
    dfs_subset['dk_salary'] = dfs_subset['dk_salary'].astype(int)
    dfs_subset['fd_salary'] = dfs_subset['fd_salary'].astype(int)
    dfs_subset['week'] = dfs_subset['week'].astype(int)
    
    dfs_subset = dfs_subset[['season', 'week', 'player_id', 'player', 'dk_position', 'fd_position', 
                             'team', 'opponent', 'dk_salary', 'fd_salary', 'dk_fpts', 'fd_fpts']]
    
    unique_weeks = dfs_subset['week'].nunique()
    min_week = dfs_subset['week'].min()
    max_week = dfs_subset['week'].max()
    expected_weeks = 21 if int(year) <= 2020 else 22

    print(f"\nProcessing file: {filepath}")
    print(f"Original rows in xlsx file: {original_row_count}")
    print(f"Number of players with no salary data found in xlsx: {original_row_count - len(dfs_subset)}")
    print(f"Rows in csv file after dropping NaNs: {len(dfs_subset)}")

    if original_row_count - (original_row_count - len(dfs_subset)) == len(dfs_subset):
        print("✅ Salary Validation passed: Counts match after dropping NaNs.")
        salary_validation = 'Passed'
    else:
        print("❌ Salary Validation failed: Counts mismatch!")
        salary_validation = 'Failed'

    print(f"Weeks detected: {min_week} to {max_week}")
    print(f"Total unique weeks found: {unique_weeks}")
    print("🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.")

    if unique_weeks == expected_weeks or unique_weeks == expected_weeks - 1:
        print(f"✅ Week Validation passed: {unique_weeks} weeks found (expected {expected_weeks}).\n")
        week_validation = 'Passed'
    else:
        print(f"❌ Week Validation failed: {unique_weeks} weeks found, expected {expected_weeks}.\n")
        week_validation = 'Failed'
    
    return dfs_subset, {
        'year': int(year),
        'original_rows': original_row_count,
        'nan_rows': original_row_count - len(dfs_subset),
        'rows_after_drop': len(dfs_subset),
        'min_week': min_week,
        'max_week': max_week,
        'unique_weeks': unique_weeks,
        'expected_weeks': expected_weeks,
        'salary_validation': salary_validation,
        'week_validation': week_validation
    }

In [62]:
# ** dataframe of Fanduel and Draft Kings Salaries FOR all positions ** 

# main control flow implements the helper function 
# output: combined dataframe and csv files of all seasons fanduel draft kings player salary data 
# output: data validation checks

# Find all matching files
file_list = sorted(glob.glob('./dfs_files/NFL-*-DFS-Dataset.xlsx'))

# Handle if no files found
if not file_list:
    print("❌ No xlsx files detected.\nPlease download and place the BigDataBall NFL DFS Excel files into the same directory as this Jupyter Notebook file.")
else:
    # Process each file
    all_years_dfs = []
    validation_records = []
    file_years = []

    for file in file_list:
        year = file.split('-')[1]  # Extract year from filename
        file_years.append(int(year))
        
        year_df, validation_info = create_DFS_dataframe(file, year)
        
        # Append to master list
        all_years_dfs.append(year_df)
        validation_records.append(validation_info)

    # Create validation summary DataFrame
    validation_summary_df = pd.DataFrame(validation_records)
    print("\n📋 Validation Summary:")
    display(validation_summary_df)

    # Combine all years into one big dataframe
    nfl_fd_dk_salary_combined = pd.concat(all_years_dfs, ignore_index=True)

    # Determine latest season dynamically
    current_season = max(file_years)

    # Export final combined CSV
    final_filename = f'nfl_fd_dk_salary_2017_{current_season}.csv'

    # If the file already exists, create a backup
    if os.path.exists(final_filename):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_filename = f'nfl_fd_dk_salary_2017_{current_season}_backup_{timestamp}.csv'
        shutil.copy(final_filename, backup_filename)
        print(f"🛡️ Backup created: {backup_filename}")

    # Display a quick preview
    display(nfl_fd_dk_salary_combined.head())

🔎 Season NaN rows dropped: 1
Remaining NaN rows (should be 0): 0

Processing file: ./dfs_files/NFL-2017-DFS-Dataset.xlsx
Original rows in xlsx file: 7664
Number of players with no salary data found in xlsx: 799
Rows in csv file after dropping NaNs: 6865
✅ Salary Validation passed: Counts match after dropping NaNs.
Weeks detected: 1 to 20
Total unique weeks found: 20
🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.
✅ Week Validation passed: 20 weeks found (expected 21).

🔎 Season NaN rows dropped: 1
Remaining NaN rows (should be 0): 0

Processing file: ./dfs_files/NFL-2018-DFS-Dataset.xlsx
Original rows in xlsx file: 7216
Number of players with no salary data found in xlsx: 197
Rows in csv file after dropping NaNs: 7019
✅ Salary Validation passed: Counts match after dropping NaNs.
Weeks detected: 1 to 20
Total unique weeks found: 20
🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.
✅ Week Validation

Unnamed: 0,year,original_rows,nan_rows,rows_after_drop,min_week,max_week,unique_weeks,expected_weeks,salary_validation,week_validation
0,2017,7664,799,6865,1,20,20,21,Passed,Passed
1,2018,7216,197,7019,1,20,20,21,Passed,Passed
2,2019,7252,268,6984,1,20,20,21,Passed,Passed
3,2020,7572,385,7187,1,20,20,21,Passed,Passed
4,2021,7994,1650,6344,1,21,21,22,Passed,Passed
5,2022,7872,138,7734,1,21,21,22,Passed,Passed
6,2023,7897,65,7832,1,21,21,22,Passed,Passed
7,2024,7912,136,7776,1,21,21,22,Passed,Passed


Unnamed: 0,season,week,player_id,player,dk_position,fd_position,team,opponent,dk_salary,fd_salary,dk_fpts,fd_fpts
0,2017,1,KC,Kansas City Chiefs,DST,DST,Kansas City Chiefs,New England Patriots,2300,4300,3.0,3.0
1,2017,1,NE,New England Patriots,DST,DST,New England Patriots,Kansas City Chiefs,3400,4500,1.0,1.0
2,2017,1,,Alex Smith,QB,QB,Kansas City Chiefs,New England Patriots,5400,6900,34.02,31.02
3,2017,1,,Tom Brady,QB,QB,New England Patriots,Kansas City Chiefs,7600,8900,10.68,10.68
4,2017,1,,Anthony Sherman,RB,RB,Kansas City Chiefs,New England Patriots,3000,4500,0.0,0.0


In [63]:
# csv file
# save_csv(nfl_fd_dk_salary_combined, f"nfl_fd_dk_salary_2017_{current_season}_df")


In [64]:
# ** WR dataframe of Fanduel and Draft Kings player Salaries ** 

# Determine current season based on available data
current_season = nfl_fd_dk_salary_combined['season'].max()

# Extract WR players where DraftKings position is WR
wr_fd_dk_salary_2017_current_df = nfl_fd_dk_salary_combined.loc[
    nfl_fd_dk_salary_combined['dk_position'] == 'WR'
]

# Display a quick preview
display(wr_fd_dk_salary_2017_current_df.head())


Unnamed: 0,season,week,player_id,player,dk_position,fd_position,team,opponent,dk_salary,fd_salary,dk_fpts,fd_fpts
18,2017,1,,Albert Wilson,WR,WR,Kansas City Chiefs,New England Patriots,3200,4500,8.7,6.2
19,2017,1,,Chris Conley,WR,WR,Kansas City Chiefs,New England Patriots,3300,5100,6.3,5.3
20,2017,1,,De'Anthony Thomas,WR,WR,Kansas City Chiefs,New England Patriots,3000,4500,1.2,0.7
21,2017,1,,Demarcus Robinson,WR,WR,Kansas City Chiefs,New England Patriots,3000,4500,0.0,0.0
22,2017,1,,Jehu Chesson,WR,WR,Kansas City Chiefs,New England Patriots,3000,4500,0.0,0.0


In [65]:
# csv file
# save_csv(wr_fd_dk_salary_2017_current_df, f"wr_fd_dk_salary_2017_{current_season}_df")


In [66]:
## End: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##

In [67]:
## Begin: team abbreviation standardization ##

In [68]:
## *** NEXT TASKS ***
# Review normalization section of the notebook,...
# modify the column names in the fantasypros dataframes, ...
# then  update the data appropriate sections / functions to conform to our modifications

In [69]:
# redefined as an explicit .copy() to:
# Prevent SettingWithCopyWarning
# Avoid chained assignment issues during column standardization
# Ensure full memory independence from prior transformations

wr_fp_basic_stats_df = wr_fp_basic_stats_df.copy()
wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.copy()
wr_fp_rz_stats_df = wr_fp_rz_stats_df.copy()
wr_ngs_df = wr_ngs_df.copy()
wr_ids_weekly_stats_df = wr_ids_weekly_stats_df.copy()
wr_fd_dk_salary_2017_current_df = wr_fd_dk_salary_2017_current_df.copy()


In [70]:
# List columns from each dataframe in memory
ids_weekly_cols = wr_ids_weekly_stats_df.columns.tolist()
ngs_cols = wr_ngs_df.columns.tolist()
fp_basic_cols = wr_fp_basic_stats_df.columns.tolist()
fp_adv_cols = wr_fp_advanced_stats_df.columns.tolist()
fp_rz_cols = wr_fp_rz_stats_df.columns.tolist()
dfs_fd_dk_cols = wr_fd_dk_salary_2017_current_df.columns.tolist()

# Combine into a dataframe for side-by-side comparison
comparison_df = pd.DataFrame({
    "IDs & Weekly Stats": pd.Series(ids_weekly_cols),
    "NGS Stats": pd.Series(ngs_cols),
    "FantasyPros Basic": pd.Series(fp_basic_cols),
    "FantasyPros Adv": pd.Series(fp_adv_cols),
    "FantasyPros RZ": pd.Series(fp_rz_cols),
    "DFS FD DK": pd.Series(dfs_fd_dk_cols) 
})

comparison_df

Unnamed: 0,IDs & Weekly Stats,NGS Stats,FantasyPros Basic,FantasyPros Adv,FantasyPros RZ,DFS FD DK
0,season,season,year,year,year,season
1,season_type,season_type,week,week,week,week
2,week,week,fantasypros_id,fantasypros_id,fantasypros_id,player_id
3,player_id,player_display_name,player_name,player_name,player_name,player
4,player_name,player_position,team,team,team,dk_position
5,position,team_abbr,rec,games,rec_rz,fd_position
6,position_group,avg_cushion,tgt,rec,tgt_rz,team
7,recent_team,avg_separation,rec_yds,yds,rec_pct_rz,opponent
8,fantasy_points,avg_intended_air_yards,rec_ypc,ypr,yds_rz,dk_salary
9,fantasy_points_ppr,percent_share_of_intended_air_yards,rec_lg,ybc,yds_per_rec_rz,fd_salary


In [71]:
def show_team_uniques():
    def norm(s):
        return (
            s.astype('string')
             .str.strip()
             .str.upper()
             .str.replace(".", "", regex=False)
             .str.replace(" ", "", regex=False)
        )

    datasets = {
        "wr_ids_weekly_stats_df.recent_team": (wr_ids_weekly_stats_df, ['recent_team']),
        "wr_ngs_df.team_abbr":               (wr_ngs_df,             ['team_abbr']),
        "wr_fp_basic_stats_df.team":         (wr_fp_basic_stats_df,  ['team']),
        "wr_fp_advanced_stats_df.team":      (wr_fp_advanced_stats_df,['team']),
        "wr_fp_rz_stats_df.team":            (wr_fp_rz_stats_df,     ['team']),
        "wr_fd_dk_salary_2017_current_df.team": (wr_fd_dk_salary_2017_current_df, ['team']),
    }

    for label, (df, candidates) in datasets.items():
        team_col = next((c for c in candidates if c in df.columns), None)
        print(f"\n{label}")
        if not team_col:
            print(f"  ⚠️ No team column found in {candidates}")
            continue

        vals = sorted(norm(df[team_col].dropna()).unique())
        print(f"  column: {team_col} | uniques ({len(vals)}):")
        print(vals)

# Call to preview all six
show_team_uniques()


wr_ids_weekly_stats_df.recent_team
  column: recent_team | uniques (32):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_ngs_df.team_abbr
  column: team_abbr | uniques (32):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_fp_basic_stats_df.team
  column: team | uniques (33):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', 'HOU', 'IND', 'JAC', 'KC', 'LAC', 'LAR', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_fp_advanced_stats_df.team
  column: team | uniques (33):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', '

In [72]:
# clean & get unique values
def get_team_set(df, col):
    return set(df[col].dropna().astype(str).str.strip().str.upper())

baseline_set = get_team_set(wr_ids_weekly_stats_df, 'recent_team')
print(f"Baseline (wr_ids_weekly_stats_df.recent_team) — {len(baseline_set)} uniques:\n{sorted(baseline_set)}\n")

# Dataframe -> column to compare
compare_map = {
    "wr_ngs_df": ("team_abbr" if "team_abbr" in wr_ngs_df.columns else None),
    "wr_fp_basic_stats_df": ("team" if "team" in wr_fp_basic_stats_df.columns else None),
    "wr_fp_advanced_stats_df": ("team" if "team" in wr_fp_advanced_stats_df.columns else None),
    "wr_fp_rz_stats_df": ("team" if "team" in wr_fp_rz_stats_df.columns else None),
    "wr_fd_dk_salary_2017_current_df": ("team" if "team" in wr_fd_dk_salary_2017_current_df.columns else None),
}

for name, col in compare_map.items():
    if col and col in globals()[name].columns:
        other_set = get_team_set(globals()[name], col)
        diff_from_baseline = other_set - baseline_set
        diff_in_baseline = baseline_set - other_set
        print(f"{name}.{col}:")
        print(f"  Unique values: {len(other_set)}")
        print(f"  In {name} but not in baseline: {sorted(diff_from_baseline) if diff_from_baseline else 'None'}")
        print(f"  In baseline but not in {name}: {sorted(diff_in_baseline) if diff_in_baseline else 'None'}\n")
    else:
        print(f"{name}: ⚠️ No team column found or mismatch\n")


Baseline (wr_ids_weekly_stats_df.recent_team) — 32 uniques:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

wr_ngs_df.team_abbr:
  Unique values: 32
  In wr_ngs_df but not in baseline: ['LAR']
  In baseline but not in wr_ngs_df: ['LA']

wr_fp_basic_stats_df.team:
  Unique values: 33
  In wr_fp_basic_stats_df but not in baseline: ['FA', 'JAC', 'LAR']
  In baseline but not in wr_fp_basic_stats_df: ['JAX', 'LA']

wr_fp_advanced_stats_df.team:
  Unique values: 33
  In wr_fp_advanced_stats_df but not in baseline: ['FA', 'JAC', 'LAR']
  In baseline but not in wr_fp_advanced_stats_df: ['JAX', 'LA']

wr_fp_rz_stats_df.team:
  Unique values: 33
  In wr_fp_rz_stats_df but not in baseline: ['FA', 'JAC', 'LAR']
  In baseline but not in wr_fp_rz_stats_df: ['JAX', 'LA']

wr_fd_dk_salary_2017_current_df.team:
  Unique values: 35
  In

In [73]:
# standardize team abbreviations

In [74]:

# base mapping: full team names -> abbreviations
full_name_to_abbr = {
    'Arizona Cardinals': 'ARI', 'Atlanta Falcons': 'ATL', 'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF', 'Carolina Panthers': 'CAR', 'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN', 'Cleveland Browns': 'CLE', 'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN', 'Detroit Lions': 'DET', 'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU', 'Indianapolis Colts': 'IND', 'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC', 'Las Vegas Raiders': 'LV', 'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA', 'Miami Dolphins': 'MIA', 'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE', 'New Orleans Saints': 'NO', 'New York Giants': 'NYG',
    'New York Jets': 'NYJ', 'Philadelphia Eagles': 'PHI', 'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF', 'Seattle Seahawks': 'SEA', 'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN', 'Washington Commanders': 'WAS', 

    # legacy names
    'St. Louis Rams': 'LAR',
    'San Diego Chargers': 'LAC',
    'Oakland Raiders': 'LV',
    'Washington Football Team': 'WAS',
    'Washington Redskins': 'WAS',
    
    # Free agent placeholder
    'Free Agent': 'FA'
    
}

# --- Start with exact-case mapping ---
alias_map = {name.upper(): abbr for name, abbr in full_name_to_abbr.items()}

# --- Add no-space/punctuation aliases ---
for name, abbr in full_name_to_abbr.items():
    no_space = re.sub(r'[^A-Z0-9]', '', name.upper())
    alias_map[no_space] = abbr

# --- Add free agent compressed form ---
alias_map['FREEAGENT'] = 'FA'

# --- Abbreviation fixups (site quirks, alternate short codes) ---
abbr_fixes = {
    'ARZ': 'ARI', 'TBB': 'TB', 'NEP': 'NE', 'GBP': 'GB',
    'KCC': 'KC', 'SFF': 'SF', 'NOS': 'NO', 'JAC': 'JAX',
    'LAR': 'LA', 'LVR': 'LV', 'WSH': 'WAS', 'WFT': 'WAS'
}

# Merge fixups into alias_map so one lookup covers all cases
alias_map.update(abbr_fixes)

In [75]:
# create team abbreviation mapping logic
def standardize_team_abbr(df, col, mapping):
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.upper()
            .str.replace(".", "", regex=False)
            .str.replace(" ", "", regex=False)
            .replace(mapping)
        )

In [76]:
# Apply standardization to all relevant dataframes/columns in one go
datasets_to_standardize = [
    (wr_ids_weekly_stats_df, "recent_team"),
    (wr_ngs_df, "team_abbr"),
    (wr_fp_basic_stats_df, "team"),
    (wr_fp_advanced_stats_df, "team"),
    (wr_fp_rz_stats_df, "team"),
    (wr_fd_dk_salary_2017_current_df, "team")
]

for df, col in datasets_to_standardize:
    standardize_team_abbr(df, col, alias_map)

# Quick check after standardization
for df, col in datasets_to_standardize:
    if col in df.columns:
        print(f"{col} uniques in dataframe:")
        print(sorted(df[col].dropna().unique()))
        print()


recent_team uniques in dataframe:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

team_abbr uniques in dataframe:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

team uniques in dataframe:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

team uniques in dataframe:
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS

In [77]:
# list the unique values for each specified team column for visual inspection
dfs_and_cols = [
    ("IDs & Weekly", wr_ids_weekly_stats_df, "recent_team"),
    ("NGS", wr_ngs_df, "team_abbr"),
    ("FP Basic", wr_fp_basic_stats_df, "team"),
    ("FP Advanced", wr_fp_advanced_stats_df, "team"),
    ("FP RZ", wr_fp_rz_stats_df, "team"),
    ("DFS DK", wr_fd_dk_salary_2017_current_df, "team")
]

for label, df, col in dfs_and_cols:
    if col in df.columns:
        print(f"\n[{label}] {col} uniques ({len(df[col].dropna().unique())}):")
        print(sorted(df[col].dropna().unique()))



[IDs & Weekly] recent_team uniques (32):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

[NGS] team_abbr uniques (32):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

[FP Basic] team uniques (33):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'TB', 'TEN', 'WAS']

[FP Advanced] team uniques (33):
['ARI', 'ATL', 'BAL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'DAL', 'DEN', 'DET', 'FA', 'GB', 'HOU', 'IND', 'JAX', 'KC', 'LA', 'LAC', 'LV', 'MIA', 'MIN', 'NE', 'NO', 'NYG', 'NYJ', 'PHI', 'PIT', 'SEA', 'SF', 'T

In [78]:
## End: team abbreviation standardization ##

In [79]:
## Begin: Column Renaming – Normalize FantasyPros Column Names

In [80]:
# Column Renaming – Normalize FantasyPros Column Names

# wr_fp_basic_stats_df
wr_fp_basic_stats_df.rename(columns={
    'year': 'season',
    'ROST': 'rost_pct'
}, inplace=True)

# wr_fp_advanced_stats_df
wr_fp_advanced_stats_df.rename(columns={
    'year': 'season',
    '% TM': 'pct_tm_tgts',
    'yacon': 'rec_yds_after_contact'
}, inplace=True)

# wr_fp_rz_stats_df
wr_fp_rz_stats_df.rename(columns={
    'year': 'season',
    'REC PCT': 'rec_pct_rz',
    'TGT PCT': 'tgt_pct_rz',
    'PCT': 'pct_rz',
    'ROST %': 'rost_pct_rz'
}, inplace=True)


In [81]:
# List columns from each dataframe in memory
ids_weekly_cols = wr_ids_weekly_stats_df.columns.tolist()
ngs_cols = wr_ngs_df.columns.tolist()
fp_basic_cols = wr_fp_basic_stats_df.columns.tolist()
fp_adv_cols = wr_fp_advanced_stats_df.columns.tolist()
fp_rz_cols = wr_fp_rz_stats_df.columns.tolist()
dfs_fd_dk_cols = wr_fd_dk_salary_2017_current_df.columns.tolist()

# Combine into a dataframe for side-by-side comparison
comparison_df = pd.DataFrame({
    "IDs & Weekly Stats": pd.Series(ids_weekly_cols),
    "NGS Stats": pd.Series(ngs_cols),
    "FantasyPros Basic": pd.Series(fp_basic_cols),
    "FantasyPros Adv": pd.Series(fp_adv_cols),
    "FantasyPros RZ": pd.Series(fp_rz_cols),
    "DFS FD DK": pd.Series(dfs_fd_dk_cols) 
})

comparison_df

Unnamed: 0,IDs & Weekly Stats,NGS Stats,FantasyPros Basic,FantasyPros Adv,FantasyPros RZ,DFS FD DK
0,season,season,season,season,season,season
1,season_type,season_type,week,week,week,week
2,week,week,fantasypros_id,fantasypros_id,fantasypros_id,player_id
3,player_id,player_display_name,player_name,player_name,player_name,player
4,player_name,player_position,team,team,team,dk_position
5,position,team_abbr,rec,games,rec_rz,fd_position
6,position_group,avg_cushion,tgt,rec,tgt_rz,team
7,recent_team,avg_separation,rec_yds,yds,rec_pct_rz,opponent
8,fantasy_points,avg_intended_air_yards,rec_ypc,ypr,yds_rz,dk_salary
9,fantasy_points_ppr,percent_share_of_intended_air_yards,rec_lg,ybc,yds_per_rec_rz,fd_salary


In [82]:
## End: Column Renaming – Normalize FantasyPros Column Names

In [83]:
# redefined as an explicit .copy() to:
# Prevent SettingWithCopyWarning
# Avoid chained assignment issues during column standardization
# Ensure full memory independence from prior transformations

wr_fp_basic_stats_df = wr_fp_basic_stats_df.copy()
wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.copy()
wr_fp_rz_stats_df = wr_fp_rz_stats_df.copy()
wr_ngs_df = wr_ngs_df.copy()
wr_fd_dk_salary_2017_current_df = wr_fd_dk_salary_2017_current_df.copy()

# No need to redefine wr_ids_weekly_stats_df again unless you also normalize recent_team


In [84]:
## Begin: data type evaluation and normalization

In [85]:
def strip_whitespace_columns(df):
    df = df.copy()
    for col in df.select_dtypes(include='object').columns:
        try:
            df[col] = df[col].astype(str).str.strip()
        except Exception as e:
            print(f"⚠️ Could not strip column '{col}': {e}")
    return df


In [86]:
def lowercase_id_columns(df, keys=['fantasypros_id', 'player_name']):
    df = df.copy()
    for key in keys:
        if key in df.columns:
            try:
                df[key] = df[key].astype(str).str.lower()
            except Exception as e:
                print(f"⚠️ Could not lowercase column '{key}': {e}")
    return df


In [87]:
# Convert a column to Int32, safely handling empty strings and non-numeric entries
# *Note: int32 vs Int32 - Int32 can hanlde NaN
def clean_integer_column(df, column_name):
    df = df.copy()
    if column_name in df.columns:
        try:
            df[column_name] = (
                df[column_name]
                .astype(str)                    # Ensure it's string type
                .str.strip()                    # Remove extra whitespace
                .replace('', np.nan)            # Replace empty string with NaN
                .replace('nan', np.nan)         # Optional: if string "nan" exists
            )
            df[column_name] = (
                pd.to_numeric(df[column_name], errors='coerce')  # Coerce invalids to NaN
                .astype('Int32')                                 # Nullable integer
            )
        except Exception as e:
            print(f"⚠️ Could not clean integer column '{column_name}': {e}")
    return df


In [88]:
def convert_percentage_columns(df, percent_cols):
    df = df.copy()
    for col in percent_cols:
        if col in df.columns:
            try:
                df[col] = (
                    df[col]
                    .astype(str)
                    .str.replace('%', '', regex=False)
                    .str.strip()
                    .replace('', np.nan)
                    .astype(float) / 100
                ).astype('float32')
            except Exception as e:
                print(f"⚠️ Could not convert column '{col}' to float32 percentage: {e}")
    return df


In [89]:
# wr_fp_basic_stats_type_map 

wr_fp_basic_stats_type_map = {
    'season': 'Int32',
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'team': 'str',
    'rec': 'Int32',
    'tgt': 'Int32',
    'rec_yds': 'Int32',
    'rec_ypc': 'float32',
    'rec_lg': 'Int32',
    'rec_20+': 'Int32',
    'rec_td': 'Int32',
    'rush_att': 'Int32',
    'rush_yds': 'Int32',
    'rush_td': 'Int32',
    'fum': 'Int32',
    'games': 'Int32',
    'fpts': 'float32',
    'fpts_per_game': 'float32',
    'rost_pct': 'float32'
}



In [90]:
# wr_fp_advanced_stats_type_map 

wr_fp_advanced_stats_type_map = {
    'season': 'Int32',               # renamed from 'year'
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'team': 'str',
    'games': 'Int32',
    'rec': 'Int32',
    'yds': 'Int32',
    'ypr': 'float32',
    'ybc': 'Int32',
    'ybc_per_rec': 'float32',
    'air': 'Int32',
    'air_per_rec': 'float32',
    'yac': 'Int32',
    'yac_per_rec': 'float32',
    'rec_yds_after_contact': 'float32',         # ✅ formerly 'yacon'
    'yacon_per_rec': 'float32',
    'brktkl': 'Int32',
    'tgt': 'Int32',
    'pct_tm_tgts': 'float32',               # ✅ formerly '% TM'
    'catchable': 'Int32',
    'drop': 'Int32',
    'rz_tgt': 'Int32',
    '10_plus': 'Int32',
    '20_plus': 'Int32',
    '30_plus': 'Int32',
    '40_plus': 'Int32',
    '50_plus': 'Int32',
    'lng': 'Int32'
}



In [91]:
# wr_fp_rz_stats_type_map 

wr_fp_rz_stats_type_map = {
    'season': 'Int32',             # renamed from 'year'
    'week': 'Int32',
    'fantasypros_id': 'str',
    'player_name': 'str',
    'team': 'str',
    'rec_rz': 'Int32',
    'tgt_rz': 'Int32',
    'rec_pct_rz': 'float32',
    'yds_rz': 'Int32',
    'yds_per_rec_rz': 'float32',
    'td_rz': 'Int32',
    'tgt_pct_rz': 'float32',
    'rush_att_rz': 'Int32',
    'rush_yds_rz': 'Int32',
    'rush_td_rz': 'Int32',
    'rush_pct_rz': 'float32',
    'fl_rz': 'Int32',
    'games': 'Int32',
    'fpts_rz': 'float32',
    'fpts_pg_rz': 'float32',
    'rost_pct': 'float32'  # ✅ already converted
}




In [92]:
# cast dataframe columns to specified types with error logging.
def cast_column_types(df, type_map, df_name="DataFrame", verbose=True):
    df = df.copy()
    for col, dtype in type_map.items():
        if col in df.columns:
            try:
                df[col] = df[col].astype(dtype)
                if verbose:
                    print(f"✅ [{df_name}] {col} → {dtype}")
            except Exception as e:
                print(f"⚠️  [{df_name}] Failed to convert '{col}' to {dtype}: {e}")
        else:
            print(f"ℹ️  [{df_name}] Column '{col}' not found — skipping.")
    return df


In [93]:
# apply cleaning and normalization to FantasyPros dataframes

# Apply to Basic Stats
wr_fp_basic_stats_df = strip_whitespace_columns(wr_fp_basic_stats_df)
wr_fp_basic_stats_df = lowercase_id_columns(wr_fp_basic_stats_df)
wr_fp_basic_stats_df = convert_percentage_columns(wr_fp_basic_stats_df, ['rost_pct'])  # ✅ added back
wr_fp_basic_stats_df = cast_column_types(wr_fp_basic_stats_df, wr_fp_basic_stats_type_map, df_name="Basic Stats")
print(f"✅ wr_fp_basic_stats_df shape: {wr_fp_basic_stats_df.shape}")

# Apply to Advanced Stats
wr_fp_advanced_stats_df = strip_whitespace_columns(wr_fp_advanced_stats_df)
wr_fp_advanced_stats_df = lowercase_id_columns(wr_fp_advanced_stats_df)
wr_fp_advanced_stats_df = convert_percentage_columns(wr_fp_advanced_stats_df, ['pct_tm_tgts'])  # ✅ added back
wr_fp_advanced_stats_df = cast_column_types(wr_fp_advanced_stats_df, wr_fp_advanced_stats_type_map, df_name="Advanced Stats")
print(f"✅ wr_fp_advanced_stats_df shape: {wr_fp_advanced_stats_df.shape}")

# Apply to Red Zone Stats
wr_fp_rz_stats_df = strip_whitespace_columns(wr_fp_rz_stats_df)
wr_fp_rz_stats_df = lowercase_id_columns(wr_fp_rz_stats_df)
wr_fp_rz_stats_df = convert_percentage_columns(
    wr_fp_rz_stats_df,
    ['rec_pct_rz', 'tgt_pct_rz', 'rush_pct_rz', 'rost_pct']  # ✅ renamed and added back
)
wr_fp_rz_stats_df = cast_column_types(wr_fp_rz_stats_df, wr_fp_rz_stats_type_map, df_name="Red Zone Stats")
print(f"✅ wr_fp_rz_stats_df shape: {wr_fp_rz_stats_df.shape}")


✅ [Basic Stats] season → Int32
✅ [Basic Stats] week → Int32
✅ [Basic Stats] fantasypros_id → str
✅ [Basic Stats] player_name → str
✅ [Basic Stats] team → str
✅ [Basic Stats] rec → Int32
✅ [Basic Stats] tgt → Int32
✅ [Basic Stats] rec_yds → Int32
✅ [Basic Stats] rec_ypc → float32
✅ [Basic Stats] rec_lg → Int32
✅ [Basic Stats] rec_20+ → Int32
✅ [Basic Stats] rec_td → Int32
✅ [Basic Stats] rush_att → Int32
✅ [Basic Stats] rush_yds → Int32
✅ [Basic Stats] rush_td → Int32
✅ [Basic Stats] fum → Int32
✅ [Basic Stats] games → Int32
✅ [Basic Stats] fpts → float32
✅ [Basic Stats] fpts_per_game → float32
✅ [Basic Stats] rost_pct → float32
✅ wr_fp_basic_stats_df shape: (25726, 20)
✅ [Advanced Stats] season → Int32
✅ [Advanced Stats] week → Int32
✅ [Advanced Stats] fantasypros_id → str
✅ [Advanced Stats] player_name → str
✅ [Advanced Stats] team → str
✅ [Advanced Stats] games → Int32
✅ [Advanced Stats] rec → Int32
✅ [Advanced Stats] yds → Int32
✅ [Advanced Stats] ypr → float32
✅ [Advanced Stats] yb

In [94]:
# view data types
wr_fd_dk_salary_2017_current_df.info()
wr_fd_dk_salary_2017_current_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20985 entries, 18 to 57739
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   season       20985 non-null  int64  
 1   week         20985 non-null  int64  
 2   player_id    18564 non-null  object 
 3   player       20985 non-null  object 
 4   dk_position  20985 non-null  object 
 5   fd_position  20985 non-null  object 
 6   team         20985 non-null  object 
 7   opponent     20985 non-null  object 
 8   dk_salary    20985 non-null  int64  
 9   fd_salary    20985 non-null  int64  
 10  dk_fpts      20982 non-null  float64
 11  fd_fpts      20982 non-null  float64
dtypes: float64(2), int64(4), object(6)
memory usage: 2.1+ MB


Unnamed: 0,season,week,player_id,player,dk_position,fd_position,team,opponent,dk_salary,fd_salary,dk_fpts,fd_fpts
18,2017,1,,Albert Wilson,WR,WR,KC,New England Patriots,3200,4500,8.7,6.2
19,2017,1,,Chris Conley,WR,WR,KC,New England Patriots,3300,5100,6.3,5.3
20,2017,1,,De'Anthony Thomas,WR,WR,KC,New England Patriots,3000,4500,1.2,0.7
21,2017,1,,Demarcus Robinson,WR,WR,KC,New England Patriots,3000,4500,0.0,0.0
22,2017,1,,Jehu Chesson,WR,WR,KC,New England Patriots,3000,4500,0.0,0.0


In [95]:
# wr_fd_dk_salary_type_map

wr_fd_dk_salary_type_map = {
    'season': 'Int32',
    'week': 'Int32',
    'player_id': 'str',
    'player': 'str',
    'dk_position': 'str',
    'fd_position': 'str',
    'team': 'str',
    'opponent': 'str',
    'dk_salary': 'Int32',
    'fd_salary': 'Int32',
    'dk_fpts': 'float32',
    'fd_fpts': 'float32'
}


In [96]:
# apply type casting to wr_fd_dk_salary_2017_current_df
wr_fd_dk_salary_2017_current_df = cast_column_types(
    wr_fd_dk_salary_2017_current_df,
    wr_fd_dk_salary_type_map
)


✅ [DataFrame] season → Int32
✅ [DataFrame] week → Int32
✅ [DataFrame] player_id → str
✅ [DataFrame] player → str
✅ [DataFrame] dk_position → str
✅ [DataFrame] fd_position → str
✅ [DataFrame] team → str
✅ [DataFrame] opponent → str
✅ [DataFrame] dk_salary → Int32
✅ [DataFrame] fd_salary → Int32
✅ [DataFrame] dk_fpts → float32
✅ [DataFrame] fd_fpts → float32


In [97]:
# view data types
wr_ids_weekly_stats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17384 entries, 0 to 17383
Data columns (total 46 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   season                       17384 non-null  int32  
 1   season_type                  17384 non-null  object 
 2   week                         17384 non-null  int32  
 3   player_id                    17384 non-null  object 
 4   player_name                  17384 non-null  object 
 5   position                     17384 non-null  object 
 6   position_group               17384 non-null  object 
 7   recent_team                  17384 non-null  object 
 8   fantasy_points               17384 non-null  float32
 9   fantasy_points_ppr           17384 non-null  float32
 10  sleeper_id                   17384 non-null  float64
 11  nfl_id                       4738 non-null   object 
 12  stats_global_id              17384 non-null  float64
 13  mfl_id          

In [98]:
# wr_ids_weekly_stats type map

wr_ids_weekly_stats_type_map = {
    'season': 'Int32',
    'season_type': 'str',
    'week': 'Int32',
    'player_id': 'str',
    'player_name': 'str',
    'position': 'str',
    'position_group': 'str',
    'recent_team': 'str',
    'fantasy_points': 'float32',
    'fantasy_points_ppr': 'float32',
    'pff_id': 'str',
    'nfl_id': 'str',
    'name': 'str',
    'stats_global_id': 'str',
    'mfl_id': 'Int32',
    'ff_id': 'str',
    'cbs_id': 'str',
    'fleaflicker_id': 'str',
    'sportradar_id': 'str',
    'rotoworld_id': 'str',
    'sleeper_id': 'str',
    'ktc_id': 'str',
    'stats_id': 'str',
    'fantasypros_id': 'str',
    'merge_name': 'str',
    'cbfref_id': 'str',
    'fantasy_data_id': 'str',
    'espn_id': 'str',
    'swish_id': 'str',
    'rotowire_id': 'str',
    'yahoo_id': 'str',
    'receptions': 'Int32',
    'targets': 'Int32',
    'receiving_yards': 'Int32',
    'receiving_tds': 'Int32',
    'receiving_fumbles': 'float32',
    'receiving_fumbles_lost': 'float32',
    'receiving_air_yards': 'float32',
    'receiving_yards_after_catch': 'float32',
    'receiving_first_downs': 'float32',
    'receiving_epa': 'float32',
    'receiving_2pt_conversions': 'Int32',
    'racr': 'float32',
    'target_share': 'float32',
    'air_yards_share': 'float32',
    'wopr': 'float32',
}


In [99]:
# apply type casting to wr_ids_weekly_stats_df
wr_ids_weekly_stats_df = cast_column_types(
    wr_ids_weekly_stats_df,
    wr_ids_weekly_stats_type_map
)


✅ [DataFrame] season → Int32
✅ [DataFrame] season_type → str
✅ [DataFrame] week → Int32
✅ [DataFrame] player_id → str
✅ [DataFrame] player_name → str
✅ [DataFrame] position → str
✅ [DataFrame] position_group → str
✅ [DataFrame] recent_team → str
✅ [DataFrame] fantasy_points → float32
✅ [DataFrame] fantasy_points_ppr → float32
✅ [DataFrame] pff_id → str
✅ [DataFrame] nfl_id → str
✅ [DataFrame] name → str
✅ [DataFrame] stats_global_id → str
✅ [DataFrame] mfl_id → Int32
ℹ️  [DataFrame] Column 'ff_id' not found — skipping.
✅ [DataFrame] cbs_id → str
✅ [DataFrame] fleaflicker_id → str
✅ [DataFrame] sportradar_id → str
✅ [DataFrame] rotoworld_id → str
✅ [DataFrame] sleeper_id → str
✅ [DataFrame] ktc_id → str
✅ [DataFrame] stats_id → str
✅ [DataFrame] fantasypros_id → str
✅ [DataFrame] merge_name → str
ℹ️  [DataFrame] Column 'cbfref_id' not found — skipping.
✅ [DataFrame] fantasy_data_id → str
✅ [DataFrame] espn_id → str
✅ [DataFrame] swish_id → str
✅ [DataFrame] rotowire_id → str
✅ [DataFram

In [100]:
# view data types
wr_ngs_df.info()
wr_ngs_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8249 entries, 1725 to 13328
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   season                               8249 non-null   int32  
 1   season_type                          8249 non-null   object 
 2   week                                 8249 non-null   int32  
 3   player_display_name                  8249 non-null   object 
 4   player_position                      8249 non-null   object 
 5   team_abbr                            8249 non-null   object 
 6   avg_cushion                          8247 non-null   float64
 7   avg_separation                       8249 non-null   float64
 8   avg_intended_air_yards               8249 non-null   float64
 9   percent_share_of_intended_air_yards  8249 non-null   float64
 10  receptions                           8249 non-null   int32  
 11  targets                   

Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,REG,1,Ryan Grant,WR,WAS,9.936667,2.894592,4.41,7.154639,4,6,66.666667,61.0,0,11.2325,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,REG,1,Martavis Bryant,WR,PIT,8.3,4.122054,12.688333,33.327496,2,6,33.333333,14.0,0,0.155,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,REG,1,Jamison Crowder,WR,WAS,7.655,3.177793,10.54,19.949707,3,7,42.857143,14.0,0,1.45,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,REG,1,Nelson Agholor,WR,PHI,7.42375,2.46262,10.46375,20.274656,6,8,75.0,86.0,1,5.611667,3.26247,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,REG,1,John Brown,WR,ARI,7.36,2.751526,13.422222,28.208481,4,9,44.444444,32.0,0,-0.3775,0.961993,-1.339493,00-0031051,John,Brown,J.Brown


In [101]:
# wr_ngs_df type map

wr_ngs_df_type_map = {
    # int columns
    'season': 'Int32',
    'week': 'Int32',
    'receptions': 'Int32',
    'targets': 'Int32',
    'rec_touchdowns': 'Int32',

    # float columns
    'avg_cushion': 'float32',
    'avg_separation': 'float32',
    'avg_intended_air_yards': 'float32',
    'percent_share_of_intended_air_yards': 'float32',
    'catch_percentage': 'float32',
    'yards': 'float32',
    'avg_yac': 'float32',
    'avg_expected_yac': 'float32',
    'avg_yac_above_expectation': 'float32',

    # object → str
    'season_type': 'str',
    'player_display_name': 'str',
    'player_position': 'str',
    'team_abbr': 'str',
    'player_gsis_id': 'str',
    'player_first_name': 'str',
    'player_last_name': 'str',
    'player_short_name': 'str'
}


In [102]:
# Apply type casting to wr_ngs_df using the type map
wr_ngs_df = cast_column_types(
    wr_ngs_df,
    wr_ngs_df_type_map
)

# Display verification summary
print(f"✅ wr_ngs_df shape: {wr_ngs_df.shape}")

✅ [DataFrame] season → Int32
✅ [DataFrame] week → Int32
✅ [DataFrame] receptions → Int32
✅ [DataFrame] targets → Int32
✅ [DataFrame] rec_touchdowns → Int32
✅ [DataFrame] avg_cushion → float32
✅ [DataFrame] avg_separation → float32
✅ [DataFrame] avg_intended_air_yards → float32
✅ [DataFrame] percent_share_of_intended_air_yards → float32
✅ [DataFrame] catch_percentage → float32
✅ [DataFrame] yards → float32
✅ [DataFrame] avg_yac → float32
✅ [DataFrame] avg_expected_yac → float32
✅ [DataFrame] avg_yac_above_expectation → float32
✅ [DataFrame] season_type → str
✅ [DataFrame] player_display_name → str
✅ [DataFrame] player_position → str
✅ [DataFrame] team_abbr → str
✅ [DataFrame] player_gsis_id → str
✅ [DataFrame] player_first_name → str
✅ [DataFrame] player_last_name → str
✅ [DataFrame] player_short_name → str
✅ wr_ngs_df shape: (8249, 22)


In [103]:
### End: data type evaluation and normalization ###

In [104]:
### Begin: Merge Process ###

In [105]:
# redefined as an explicit .copy() to:
# Prevent SettingWithCopyWarning
# Avoid chained assignment issues during column standardization
# Ensure full memory independence from prior transformations

wr_ids_weekly_stats_df = wr_ids_weekly_stats_df.copy()
wr_ngs_df = wr_ngs_df.copy()

wr_fp_basic_stats_df = wr_fp_basic_stats_df.copy()
wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.copy()
wr_fp_rz_stats_df = wr_fp_rz_stats_df.copy()

wr_fd_dk_salary_2017_current_df = wr_fd_dk_salary_2017_current_df.copy()


In [106]:
# generate merge_name and merge_key
def add_merge_keys(df, player_col='player', season_col='season', week_col='week', new_prefix='merge'):
    df[f'{new_prefix}_name'] = (
        df[player_col]
        .str.lower()
        .str.replace(r'[^a-z ]', '', regex=True)
        .str.strip()
    )

    df[f'{new_prefix}_key'] = (
        df[f'{new_prefix}_name'] + '_' +
        df[season_col].astype(str) + '_' +
        df[week_col].astype(str)
    )

    return df

In [107]:
def display_null_summary(df, cols, label=""):
    title = f"[NaN Summary] {label}" if label else "[NaN Summary]"
    print(title)
    display(df[cols].isnull().sum())

    print(f"\n[NaN %] {label}" if label else "\n[NaN %]")
    display((df[cols].isnull().mean() * 100).round(2))


In [108]:
# apply merge keys

# apply to NGS
wr_ngs_df = add_merge_keys(wr_ngs_df, player_col='player_display_name')

# apply to DFS
wr_fd_dk_salary_2017_current_df = add_merge_keys(wr_fd_dk_salary_2017_current_df, player_col='player')

# apply ONLY merge_key to weekly stats (merge_name is already clean)
wr_ids_weekly_stats_df['merge_key'] = (
    wr_ids_weekly_stats_df['merge_name'] + '_' +
    wr_ids_weekly_stats_df['season'].astype(str) + '_' +
    wr_ids_weekly_stats_df['week'].astype(str)
)


In [109]:
# merge salary into Weekly Stats
wr_ids_weekly_stats_salary_df = pd.merge(
    wr_ids_weekly_stats_df,
    wr_fd_dk_salary_2017_current_df,
    how='left',
    on='merge_key',
    suffixes=('', '_dfs')
)


In [110]:
# merge salary into NGS
wr_ngs_salary_df = pd.merge(
    wr_ngs_df,
    wr_fd_dk_salary_2017_current_df,
    how='left',
    on='merge_key',
    suffixes=('', '_dfs')
)


In [111]:
# Ddsplay shape

# weekly stats
print(f"[DataFrame] wr_ids_weekly_stats_salary_df shape: {wr_ids_weekly_stats_salary_df.shape}")
display(wr_ids_weekly_stats_salary_df.head())


# NGS
print(f"[DataFrame] wr_ngs_salary_df shape: {wr_ngs_salary_df.shape}")
display(wr_ngs_salary_df.head())


[DataFrame] wr_ids_weekly_stats_salary_df shape: (17449, 60)


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,fantasy_points,fantasy_points_ppr,sleeper_id,nfl_id,stats_global_id,mfl_id,swish_id,yahoo_id,fantasy_data_id,rotowire_id,ktc_id,pff_id,stats_id,fleaflicker_id,rotoworld_id,merge_name,espn_id,fantasypros_id,cbs_id,cfbref_id,pfr_id,sportradar_id,name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,merge_key,season_dfs,week_dfs,player_id_dfs,player,dk_position,fd_position,team,opponent,dk_salary,fd_salary,dk_fpts,fd_fpts,merge_name_dfs
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7.4,13.4,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,6,13,74,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324,larry fitzgerald_2017_1,2017,1,,Larry Fitzgerald,WR,WR,ARI,Detroit Lions,5900,6400,13.4,10.4,larry fitzgerald
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,2.1,5.1,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,3,6,21,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565,larry fitzgerald_2017_2,2017,2,,Larry Fitzgerald,WR,WR,ARI,Indianapolis Colts,6500,6600,5.1,3.6,larry fitzgerald
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,20.9,33.900002,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,13,15,149,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.07971,0.3125,0.369973,0.727731,larry fitzgerald_2017_3,2017,3,,Larry Fitzgerald,WR,WR,ARI,Dallas Cowboys,6400,6500,36.900002,27.4,larry fitzgerald
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,9.2,13.2,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,4,7,32,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539,larry fitzgerald_2017_4,2017,4,,Larry Fitzgerald,WR,WR,ARI,San Francisco 49ers,6100,6700,13.2,11.2,larry fitzgerald
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,5.1,11.1,223.0,larryfitzgerald/2506106,246053.0,7393,,6762.0,5571.0,3730.0,,1724.0,6762.0,1732.0,1661.0,larry fitzgerald,5528.0,9383.0,492934.0,larry-fitzgerald-1,FitzLa00,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,Larry Fitzgerald,6,10,51,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.41477,larry fitzgerald_2017_5,2017,5,,Larry Fitzgerald,WR,WR,ARI,Philadelphia Eagles,6800,6700,11.1,8.1,larry fitzgerald


[DataFrame] wr_ngs_salary_df shape: (8280, 37)


Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name,merge_name,merge_key,season_dfs,week_dfs,player_id,player,dk_position,fd_position,team,opponent,dk_salary,fd_salary,dk_fpts,fd_fpts,merge_name_dfs
0,2017,REG,1,Ryan Grant,WR,WAS,9.936666,2.894592,4.41,7.154639,4,6,66.666664,61.0,0,11.2325,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant,ryan grant,ryan grant_2017_1,2017,1,,Ryan Grant,WR,WR,WAS,Philadelphia Eagles,3400,4700,10.1,8.1,ryan grant
1,2017,REG,1,Martavis Bryant,WR,PIT,8.3,4.122054,12.688334,33.327496,2,6,33.333332,14.0,0,0.155,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant,martavis bryant,martavis bryant_2017_1,2017,1,,Martavis Bryant,WR,WR,PIT,Cleveland Browns,6000,6500,3.4,2.4,martavis bryant
2,2017,REG,1,Jamison Crowder,WR,WAS,7.655,3.177794,10.54,19.949707,3,7,42.857143,14.0,0,1.45,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder,jamison crowder,jamison crowder_2017_1,2017,1,,Jamison Crowder,WR,WR,WAS,Philadelphia Eagles,5600,6500,3.4,0.9,jamison crowder
3,2017,REG,1,Nelson Agholor,WR,PHI,7.42375,2.46262,10.46375,20.274656,6,8,75.0,86.0,1,5.611667,3.26247,2.349197,00-0031549,Nelson,Agholor,N.Agholor,nelson agholor,nelson agholor_2017_1,2017,1,,Nelson Agholor,WR,WR,PHI,Washington Redskins,3800,4900,20.6,17.6,nelson agholor
4,2017,REG,1,John Brown,WR,ARI,7.36,2.751526,13.422222,28.208481,4,9,44.444443,32.0,0,-0.3775,0.961993,-1.339493,00-0031051,John,Brown,J.Brown,john brown,john brown_2017_1,2017,1,,John Brown,WR,WR,ARI,Detroit Lions,4800,5700,8.2,6.2,john brown


In [112]:
salary_cols = ['dk_salary', 'fd_salary', 'dk_fpts', 'fd_fpts']

# weekly stats null summary
display_null_summary(wr_ids_weekly_stats_salary_df, salary_cols, label="Weekly Stats Salary Columns")

# NGS null summary
display_null_summary(wr_ngs_salary_df, salary_cols, label="NGS Salary Columns")


[NaN Summary] Weekly Stats Salary Columns


dk_salary    2193
fd_salary    2193
dk_fpts      2193
fd_fpts      2193
dtype: int64


[NaN %] Weekly Stats Salary Columns


dk_salary    12.57
fd_salary    12.57
dk_fpts      12.57
fd_fpts      12.57
dtype: float64

[NaN Summary] NGS Salary Columns


dk_salary    485
fd_salary    485
dk_fpts      485
fd_fpts      485
dtype: int64


[NaN %] NGS Salary Columns


dk_salary    5.86
fd_salary    5.86
dk_fpts      5.86
fd_fpts      5.86
dtype: float64

In [113]:
# csv file
# save_csv(wr_ngs_salary_df, "wr_ngs_salary_df.csv")
# save_csv(wr_ids_weekly_stats_salary_df, "wr_ids_weekly_stats_salary_df.csv")


In [114]:
# apply merge keys
wr_fp_basic_stats_df = add_merge_keys(wr_fp_basic_stats_df, player_col='player_name')
wr_fp_advanced_stats_df = add_merge_keys(wr_fp_advanced_stats_df, player_col='player_name')
wr_fp_rz_stats_df = add_merge_keys(wr_fp_rz_stats_df, player_col='player_name')


In [115]:
# merge salary into FantasyPros dataframes
wr_fp_basic_stats_salary_df = pd.merge(
    wr_fp_basic_stats_df,
    wr_fd_dk_salary_2017_current_df,
    how='left',
    on='merge_key',
    suffixes=('', '_dfs')
)

wr_fp_advanced_stats_salary_df = pd.merge(
    wr_fp_advanced_stats_df,
    wr_fd_dk_salary_2017_current_df,
    how='left',
    on='merge_key',
    suffixes=('', '_dfs')
)

wr_fp_rz_stats_salary_df = pd.merge(
    wr_fp_rz_stats_df,
    wr_fd_dk_salary_2017_current_df,
    how='left',
    on='merge_key',
    suffixes=('', '_dfs')
)


In [116]:
# null summary for salary columns
salary_cols = ['dk_salary', 'fd_salary', 'dk_fpts', 'fd_fpts']
display_null_summary(wr_fp_basic_stats_salary_df, salary_cols, label="FP Basic Salary Columns")
display_null_summary(wr_fp_advanced_stats_salary_df, salary_cols, label="FP Advanced Salary Columns")
display_null_summary(wr_fp_rz_stats_salary_df, salary_cols, label="FP Red Zone Salary Columns")


[NaN Summary] FP Basic Salary Columns


dk_salary    8647
fd_salary    8647
dk_fpts      8649
fd_fpts      8649
dtype: int64


[NaN %] FP Basic Salary Columns


dk_salary    33.52
fd_salary    33.52
dk_fpts      33.53
fd_fpts      33.53
dtype: float64

[NaN Summary] FP Advanced Salary Columns


dk_salary    141
fd_salary    141
dk_fpts      141
fd_fpts      141
dtype: int64


[NaN %] FP Advanced Salary Columns


dk_salary    12.54
fd_salary    12.54
dk_fpts      12.54
fd_fpts      12.54
dtype: float64

[NaN Summary] FP Red Zone Salary Columns


dk_salary    148
fd_salary    148
dk_fpts      148
fd_fpts      148
dtype: int64


[NaN %] FP Red Zone Salary Columns


dk_salary    13.17
fd_salary    13.17
dk_fpts      13.17
fd_fpts      13.17
dtype: float64

In [117]:
# csv file - non-salary dataframes
save_csv(wr_fp_basic_stats_df, "wr_fp_basic_stats_df.csv")
save_csv(wr_fp_advanced_stats_df, "wr_fp_advanced_stats_df.csv")
save_csv(wr_fp_rz_stats_df, "wr_fp_rz_stats_df.csv")


Saved: ./csv_files/wr_fp_basic_stats_df.csv
Saved: ./csv_files/wr_fp_advanced_stats_df.csv
Saved: ./csv_files/wr_fp_rz_stats_df.csv


In [118]:
# csv file - salary dataframes
save_csv(wr_fp_basic_stats_salary_df, "wr_fp_basic_stats_salary_df.csv")
save_csv(wr_fp_advanced_stats_salary_df, "wr_fp_advanced_stats_salary_df.csv")
save_csv(wr_fp_rz_stats_salary_df, "wr_fp_rz_stats_salary_df.csv")


Saved: ./csv_files/wr_fp_basic_stats_salary_df.csv
Saved: ./csv_files/wr_fp_advanced_stats_salary_df.csv
Saved: ./csv_files/wr_fp_rz_stats_salary_df.csv


In [119]:
### End: Merge Process ###

In [None]:
### Begin: Feature Engineering ###

In [120]:
# redefined as an explicit .copy() to:
# Prevent SettingWithCopyWarning
# Avoid chained assignment issues during column standardization
# Ensure full memory independence from prior transformations

wr_fp_basic_stats_df = wr_fp_basic_stats_df.copy()
wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.copy()
wr_fp_rz_stats_df = wr_fp_rz_stats_df.copy()
wr_ngs_df = wr_ngs_df.copy()
wr_ids_weekly_stats_df = wr_ids_weekly_stats_df.copy()
wr_fd_dk_salary_2017_current_df = wr_fd_dk_salary_2017_current_df.copy()

# redefine salary-merged dataframes

# weekly stats with salary
wr_ids_weekly_stats_salary_df = wr_ids_weekly_stats_salary_df.copy()

# NGS with salary
wr_ngs_salary_df = wr_ngs_salary_df.copy()

# FantasyPros (basic, advanced, red zone) with salary
wr_fp_basic_stats_salary_df = wr_fp_basic_stats_salary_df.copy()
wr_fp_advanced_stats_salary_df = wr_fp_advanced_stats_salary_df.copy()
wr_fp_rz_stats_salary_df = wr_fp_rz_stats_salary_df.copy()


In [122]:
# list the dataframes

# list columns from each dataframe in memory
ids_weekly_cols = wr_ids_weekly_stats_df.columns.tolist()
ngs_cols = wr_ngs_df.columns.tolist()
fp_basic_cols = wr_fp_basic_stats_df.columns.tolist()
fp_basic_salary_cols = wr_fp_basic_stats_salary_df.columns.tolist()
fp_adv_cols = wr_fp_advanced_stats_df.columns.tolist()
fp_adv_salary_cols = wr_fp_advanced_stats_salary_df.columns.tolist()
fp_rz_cols = wr_fp_rz_stats_df.columns.tolist()
fp_rz_salary_cols = wr_fp_rz_stats_salary_df.columns.tolist()
dfs_fd_dk_cols = wr_fd_dk_salary_2017_current_df.columns.tolist()

# combine into a dataframe for side-by-side comparison
comparison_df_fp = pd.DataFrame({
    "FP Basic": pd.Series(fp_basic_cols),
    "FP Basic (Salary)": pd.Series(fp_basic_salary_cols),
    "FP Adv": pd.Series(fp_adv_cols),
    "FP Adv (Salary)": pd.Series(fp_adv_salary_cols),
    "FP RZ": pd.Series(fp_rz_cols),
    "FP RZ (Salary)": pd.Series(fp_rz_salary_cols),
    "DFS FD DK": pd.Series(dfs_fd_dk_cols)
})

comparison_df_fp


# combine into a dataframe for side-by-side comparison
# comparison_df = pd.DataFrame({
#     "IDs & Weekly Stats": pd.Series(ids_weekly_cols),
#     "NGS Stats": pd.Series(ngs_cols),
#     "DFS FD DK": pd.Series(dfs_fd_dk_cols)
# })

# comparison_df

Unnamed: 0,FP Basic,FP Basic (Salary),FP Adv,FP Adv (Salary),FP RZ,FP RZ (Salary),DFS FD DK
0,season,season,season,season,season,season,season
1,week,week,week,week,week,week,week
2,fantasypros_id,fantasypros_id,fantasypros_id,fantasypros_id,fantasypros_id,fantasypros_id,player_id
3,player_name,player_name,player_name,player_name,player_name,player_name,player
4,team,team,team,team,team,team,dk_position
5,rec,rec,games,games,rec_rz,rec_rz,fd_position
6,tgt,tgt,rec,rec,tgt_rz,tgt_rz,team
7,rec_yds,rec_yds,yds,yds,rec_pct_rz,rec_pct_rz,opponent
8,rec_ypc,rec_ypc,ypr,ypr,yds_rz,yds_rz,dk_salary
9,rec_lg,rec_lg,ybc,ybc,yds_per_rec_rz,yds_per_rec_rz,fd_salary


In [124]:
# **fix fpts_per_game using accurate games_played count

# Basic FantasyPros dataframe
wr_fp_basic_stats_df['games_played'] = (
    wr_fp_basic_stats_df.groupby(['season', 'player_name']).cumcount() + 1
)
wr_fp_basic_stats_df['fpts_per_game'] = (
    wr_fp_basic_stats_df['fpts'] / wr_fp_basic_stats_df['games_played']
)

# FantasyPros + Salary dataframe
wr_fp_basic_stats_salary_df['games_played'] = (
    wr_fp_basic_stats_salary_df.groupby(['season', 'player_name']).cumcount() + 1
)
wr_fp_basic_stats_salary_df['fpts_per_game'] = (
    wr_fp_basic_stats_salary_df['fpts'] / wr_fp_basic_stats_salary_df['games_played']
)


In [125]:
# features - rolling averages, percentiles, and ratios

# calculates rolling average (shifted) grouped by season and player
def feat_eng_rolling_avg(df, group_col, target_col, window):
    return (
        df.groupby(['season', group_col])[target_col]
          .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean())
    )

# calculates std deviation of past N games grouped by season and player
def feat_eng_rolling_std(df, group_col, target_col, window):
    return (
        df.groupby(['season', group_col])[target_col]
          .transform(lambda x: x.shift(1).rolling(window, min_periods=1).std())
    )

# calculates percentile rank within a group (e.g., season/week)
def feat_eng_weekly_percentile(df, group_cols, target_col):
    return df.groupby(group_cols)[target_col].rank(pct=True)

# returns binary flags for high and low performance tiers
def feat_eng_percentile_flags(percentile_series):
    return {
        'top_5pct':    (percentile_series >= 0.95).astype(int),
        'top_10pct':   (percentile_series >= 0.90).astype(int),
        'top_15pct':   (percentile_series >= 0.85).astype(int),
        'top_20pct':   (percentile_series >= 0.80).astype(int),
        'bottom_20pct': (percentile_series <= 0.20).astype(int),
    }

# calculates player share of a team-level stat (e.g., targets)
def feat_eng_team_share(df, group_cols, player_col):
    team_total = df.groupby(group_cols)[player_col].transform('sum')
    return df[player_col] / team_total.replace(0, np.nan)

# safely computes a ratio, avoiding divide-by-zero
def feat_eng_ratio(numerator, denominator):
    return numerator / denominator.replace(0, np.nan)

In [126]:
# features - binaries

# returns 1 if column >= threshold, else 0
def feat_eng_binary_flag(df, target_col, threshold):
    return (df[target_col] >= threshold).astype(int)

# returns 1 if column == value, else 0
def feat_eng_exact_match_flag(df, target_col, value):
    return (df[target_col] == value).astype(int)

# returns 1 if column > 0, else 0
def feat_eng_nonzero_flag(df, target_col):
    return (df[target_col] > 0).astype(int)

In [127]:
# features - volume, bins, and consistency score tier

# calculates a weighted sum of target stats (e.g., TGT, REC, YDS)
def feat_eng_volume_score(df, tgt_col='TGT', rec_col='REC', yds_col='YDS', 
                          tgt_weight=1.0, rec_weight=1.5, yds_weight=0.05):
    return (
        tgt_weight * df[tgt_col] +
        rec_weight * df[rec_col] +
        yds_weight * df[yds_col]
    )


# bins roster percentage into low, medium, high usage tiers
def feat_eng_roster_tier(df, rost_col='ROST'):
    return pd.cut(df[rost_col], bins=[0, 0.1, 0.3, 1.0], labels=['low', 'med', 'high'])


In [None]:
# features - apply cumulative stats for flooring

wr_fp_basic_stats_df['cum_tgt'] = (
    wr_fp_basic_stats_df.groupby(['season', 'player_name'])['TGT']
    .transform(lambda x: x.shift(1).cumsum())
)

wr_fp_basic_stats_df['cum_yds'] = (
    wr_fp_basic_stats_df.groupby(['season', 'player_name'])['YDS']
    .transform(lambda x: x.shift(1).cumsum())
)


In [None]:
# features - apply efficiency + share features

wr_fp_basic_stats_df['catch_rate'] = feat_eng_ratio(
    wr_fp_basic_stats_df['REC'], wr_fp_basic_stats_df['TGT']
)

wr_fp_basic_stats_df['yards_per_target'] = feat_eng_ratio(
    wr_fp_basic_stats_df['YDS'], wr_fp_basic_stats_df['TGT']
)

wr_fp_basic_stats_df['td_rate'] = feat_eng_ratio(
    wr_fp_basic_stats_df['TD'], wr_fp_basic_stats_df['TGT']
)

wr_fp_basic_stats_df['target_share'] = feat_eng_team_share(
    wr_fp_basic_stats_df, ['season', 'week', 'team_abbr'], 'TGT'
)

wr_fp_basic_stats_df['yard_share'] = feat_eng_team_share(
    wr_fp_basic_stats_df, ['season', 'week', 'team_abbr'], 'YDS'
)


In [None]:
# features - apply flooring 

wr_fp_basic_stats_df.loc[
    wr_fp_basic_stats_df['cum_tgt'] < 5,
    ['catch_rate', 'yards_per_target', 'td_rate', 'target_share']
] = np.nan

wr_fp_basic_stats_df.loc[
    wr_fp_basic_stats_df['cum_yds'] < 25,
    'yard_share'
] = np.nan


In [None]:
# features - apply rolling averages for raw stats

wr_fp_basic_stats_df['fpts_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'FPTS', 3)
wr_fp_basic_stats_df['rec_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'REC', 3)
wr_fp_basic_stats_df['tgt_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'TGT', 3)
wr_fp_basic_stats_df['yds_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'YDS', 3)
wr_fp_basic_stats_df['td_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'TD', 3)
wr_fp_basic_stats_df['att_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'ATT', 3)
wr_fp_basic_stats_df['longplay_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', '20+', 3)


In [None]:
# features - apply rolling averages for engineered features

wr_fp_basic_stats_df['catch_rate_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'catch_rate', 3)
wr_fp_basic_stats_df['yards_per_target_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'yards_per_target', 3)
wr_fp_basic_stats_df['target_share_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'target_share', 3)
wr_fp_basic_stats_df['yard_share_3game_avg'] = feat_eng_rolling_avg(wr_fp_basic_stats_df, 'player_name', 'yard_share', 3)


In [None]:
# features - apply selected flags

# flag: 7+ targets
wr_fp_basic_stats_df['had_7plus_targets'] = feat_eng_binary_flag(
    wr_fp_basic_stats_df, 'TGT', 7
)

# flag: 100+ receiving yards
wr_fp_basic_stats_df['had_100plus_yards'] = feat_eng_binary_flag(
    wr_fp_basic_stats_df, 'YDS', 100
)

# flag: at least one 20+ yard play
wr_fp_basic_stats_df['had_big_play'] = feat_eng_nonzero_flag(
    wr_fp_basic_stats_df, '20+'
)

# flag: long reception ≥ 40 yards
wr_fp_basic_stats_df['had_long_gain'] = feat_eng_binary_flag(
    wr_fp_basic_stats_df, 'LG', 40
)


In [None]:
# features - apply percentiles and tier flags

# compute FPTS percentile rank
wr_fp_basic_stats_df['fpts_game_percentile'] = feat_eng_weekly_percentile(
    wr_fp_basic_stats_df, ['season', 'week'], 'FPTS'
)

# apply tier flags
fpts_pct_flags = feat_eng_percentile_flags(wr_fp_basic_stats_df['fpts_game_percentile'])

wr_fp_basic_stats_df['top_5pct_fpts'] = fpts_pct_flags['top_5pct']
wr_fp_basic_stats_df['top_10pct_fpts'] = fpts_pct_flags['top_10pct']
wr_fp_basic_stats_df['top_15pct_fpts'] = fpts_pct_flags['top_15pct']
wr_fp_basic_stats_df['top_20pct_fpts'] = fpts_pct_flags['top_20pct']
wr_fp_basic_stats_df['bottom_20pct_fpts'] = fpts_pct_flags['bottom_20pct']


In [None]:
# features - apply volume_score

wr_fp_basic_stats_df['volume_score'] = feat_eng_volume_score(
    wr_fp_basic_stats_df,
    tgt_col='TGT',
    rec_col='REC',
    yds_col='YDS',
    tgt_weight=1.0,
    rec_weight=1.5,
    yds_weight=0.05
)


In [None]:
# features - apply roster_tier

wr_fp_basic_stats_df['roster_tier'] = feat_eng_roster_tier(
    wr_fp_basic_stats_df,
    rost_col='ROST'
)


In [None]:
# features - apply rolling volume score

wr_fp_basic_stats_df['volume_score_3game_avg'] = feat_eng_rolling_avg(
    wr_fp_basic_stats_df, 'player_name', 'volume_score', 3
)


In [None]:
# null summary report

null_summary = (
    wr_fp_basic_stats_df.isnull().sum()
    .to_frame(name='null_count')
    .assign(null_pct=lambda df: (df['null_count'] / len(wr_fp_basic_stats_df)).round(4))
    .sort_values(by='null_count', ascending=False)
)

display(null_summary)


In [None]:
save_csv(wr_fp_basic_stats_df, "wr_fp_basic_stats_features.csv")


In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
# features - apply 

In [None]:
### End: Feature Engineering ###

In [None]:
## Next Tasks


# continue to build the features
# eda analysis
# monte carlo simulation dataframes