In [1]:
# This produces the dataframe for WR

In [4]:
## Notes on the NFL Library ##
# the NFL python library seem to not work on Tuesday probably due to updates (not confirmed)
# unbalanced dataframe - pfr stats start at 2018; all other stats go back to 2017

In [6]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the season start date in the 'get_current_week' function
# modify the number of weeks if the NFL adds regular season games to the schedule

In [112]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re
import time
from random import sample, uniform
import io

In [13]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [16]:
# Function to calculate the current week of the NFL season
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)  # Update for the season start
    current_week = ((current_date - season_start_date).days // 7) + 1
    return current_week

# Define the current NFL year, week, and season type
current_year = datetime.now().year
current_week = get_current_week()
seasontype = 2 if current_week <= 18 else 3  # Regular season or playoffs

In [19]:
# define the years to pull
# nfl.import_weekly_data(years, columns, downcast)
def get_year_range(current_year, current_week, start_year=2017):
    if current_week <= 18:  # Regular season
        return list(range(start_year, current_year + 1))
    else:  # Playoffs
        return list(range(start_year, current_year))

# Use the function
years = get_year_range(current_year, current_week)

In [22]:
# define the base columns. 
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team'
]

In [25]:
# Import the player IDs from nfl.import_ids() - without parameters
ids_data = nfl.import_ids()

# Drop the unnecessary columns
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year', 
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username', 
    'height', 'weight', 'college', 'db_season'
]
ids_data = ids_data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [28]:
# import the weekly data from nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [31]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# Merge the two dataframes on 'player_id' and 'gsis_id'
# Align column names for merging
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})  
id_dataframe = pd.merge(weekly_data, ids_data, on='player_id', how='inner')

# Assign the resulting dataframe to a variable
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [34]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
wide_receiver_ids = all_players_id_data[all_players_id_data['position'] == 'WR']

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17396, 29)


In [37]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Pull WR-specific columns from weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns  # Include keys for merging
)

# Merge WR-specific stats with wide_receiver_ids
wr_ids_weekly_stats = pd.merge(
    wide_receiver_ids,
    wr_stats,
    on=['player_id', 'season', 'week'],  # Ensure correct alignment
    how='inner'
)

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wr_ids_weekly_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wide_receiver_ids.shape[0]}"
)

# display the df
# display(wr_ids_weekly_stats)

# csv file
# wr_ids_weekly_stats.to_csv('wr_ids_weekly_stats.csv', index=False)

Downcasting floats.
Shape of merged dataframe: (17396, 44)
Row count matches: True


In [40]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Pull NGS receiving data for the specified years
ngs_wr_df = nfl.import_ngs_data('receiving', years)

# Exclude rows where 'week' == 0 and filter for 'WR' position in one step
ngs_wr_df = ngs_wr_df[(ngs_wr_df['week'] != 0) & (ngs_wr_df['player_position'] == 'WR')]

# Drop unnecessary columns
ngs_wr_df = ngs_wr_df.drop(columns=['season_type', 'player_position', 'receptions', 'targets','player_jersey_number'], errors='ignore')

# Display the resulting dataframe
print(f"Shape of NGS WR DataFrame after dropping columns: {ngs_wr_df.shape}")
display(ngs_wr_df)

# csv file
# ngs_wr_df.to_csv('ngs_wr_df.csv', index=False)

Shape of NGS WR DataFrame after dropping columns: (8244, 18)


Unnamed: 0,season,week,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,1,Ryan Grant,WAS,9.936667,2.894592,4.410000,7.154639,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,1,Martavis Bryant,PIT,8.300000,4.122054,12.688333,33.327496,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,1,Jamison Crowder,WAS,7.655000,3.177793,10.540000,19.949707,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,1,Nelson Agholor,PHI,7.423750,2.462620,10.463750,20.274656,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,1,John Brown,ARI,7.360000,2.751526,13.422222,28.208481,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13317,2024,21,Khalil Shakir,BUF,5.627143,4.045787,4.130000,12.342569,85.714286,46.0,0,4.806667,4.824670,-0.018003,00-0037261,Khalil,Shakir,K.Shakir
13318,2024,21,A.J. Brown,PHI,5.396250,2.949668,11.140000,64.022989,75.000000,96.0,1,5.643333,3.051371,2.591962,00-0035676,Arthur,Brown,A.Brown
13320,2024,21,Xavier Worthy,KC,5.067143,3.682350,6.770000,27.904375,85.714286,85.0,1,5.901667,6.749673,-0.848007,00-0039894,Xavier,Worthy,X.Worthy
13321,2024,21,Terry McLaurin,WAS,4.690000,1.882737,13.782857,29.321663,42.857143,51.0,1,11.360000,7.610875,3.749125,00-0035659,Terry,McLaurin,T.McLaurin


In [43]:
# Output: a dataframe of NFL WR info, ids, weekly stats, and next-gen stats since 2017

# Joins wr_ids_weekly_stats dataframe with ngs_wr_df using the keys 'player_id' and 'player_gsis_id'
# Merge wr_ids_weekly_stats with ngs_wr_df using a left join
wr_ids_ngs_weekly_stats = pd.merge(
    wr_ids_weekly_stats,
    ngs_wr_df,
    left_on=['player_id', 'season', 'week'],  # Keys from wr_ids_weekly_stats
    right_on=['player_gsis_id', 'season', 'week'],  # Keys from ngs_wr_df
    how='left'  # Retain all rows from wr_ids_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_weekly_stats.shape}")
print(f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0]}")

# Display a sample of the merged dataframe
# display(wr_ids_ngs_weekly_stats)

# csv file
# wr_ids_ngs_weekly_stats.to_csv('wr_ids_ngs_weekly_stats.csv', index=False)

Shape of merged dataframe: (17396, 60)
Row count matches: True


In [46]:
# Output: pro-football reference dataframe for receiving data from the python nfl library
# note: PFR data not available before 2018
# there is no position info so the data will pull WR, TE, and RB receiving data

# Define the range of years for PFR data (2018 to the current year)
pfr_years = list(range(2018, current_year))

# import pro-football reference data
pfr_rec_df = nfl.import_weekly_pfr('rec',pfr_years)

# Drop unnecessary columns
pfr_rec_df = pfr_rec_df.drop(
    columns=['game_id','pfr_game_id','receiving_int','rushing_broken_tackles', 
             'passing_drops', 'passing_drop_pct'], errors='ignore')

# display dataframe
print(f"Shape of PFR dataframe: {pfr_rec_df.shape}")
# display(pfr_rec_df)

# csv file
# pfr_rec_df.to_csv('pfr_rec_df.csv', index=False)

Shape of PFR dataframe: (31176, 11)


In [49]:
# Output: a dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-football reference data
# NOTE: unbalanced dataframe - pfr stats start at 2018

# merge the pfr_rec_df with the wr_ids_ngs_weekly_stats dataframe
# match with ids then filter out the unmatched rows as they are likely (TE)
# Step 1: Merge the dataframes with a LEFT JOIN
wr_ids_ngs_pfr_stats = pd.merge(
    wr_ids_ngs_weekly_stats,
    pfr_rec_df,  # Use the full PFR dataframe as position data is unavailable
    left_on=['pfr_id', 'season', 'week'],  # Keys from wr_ids_ngs_weekly_stats
    right_on=['pfr_player_id', 'season', 'week'],  # Keys from pfr_rec_df
    how='left'  # Retain all rows from wr_ids_ngs_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0]}"
)

# Display the first few rows of the merged dataframe for review
# display(wr_ids_ngs_pfr_stats)

# csv file
# wr_ids_ngs_pfr_stats.to_csv('wr_ids_ngs_pfr_stats.csv', index=False)

Shape of merged dataframe: (17396, 69)
Row count matches: True


In [52]:
# Output: an ordered dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-footeball reference data
# Output: Ordered the df by year, week, and receiving yards
# NOTE: unbalanced dataframe - pfr stats start at 2018

# Order the dataframe by season (year), week, and receiving_yards
wr_ids_ngs_pfr_stats_sorted = wr_ids_ngs_pfr_stats.sort_values(
    by=['season', 'week', 'receiving_yards'], 
    ascending=[True, True, False]  # Ascending for season and week, descending for receiving_yards
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats_sorted.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0] == wr_ids_ngs_pfr_stats_sorted.shape[0]}"
)

# Display the sorted dataframe
print("Dataframe sorted by season, week, and receiving_yards:")
display(wr_ids_ngs_pfr_stats_sorted)


# Save the sorted dataframe to a csv
# wr_ids_ngs_pfr_stats_sorted.to_csv('wr_ids_ngs_pfr_stats_sorted.csv', index=False)

Shape of merged dataframe: (17396, 69)
Row count matches: True
Dataframe sorted by season, week, and receiving_yards:


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name,game_type,team,opponent,pfr_player_name,pfr_player_id,receiving_broken_tackles,receiving_drop,receiving_drop_pct,receiving_rat
214,2017,REG,1,00-0027793,A.Brown,WR,WR,PIT,9988,16e33176-b73e-49b7-b0aa-c405b47a706e,9808.0,5718.0,536.0,antoniobrown/2508061,13934.0,24171.0,,1272852.0,BrowAn04,antonio-brown-1,6454.0,5698.0,,24171.0,406214.0,11056.0,406214.0,Antonio Brown,antonio brown,11,11,182.0,0,1.0,0.0,90.0,92.0,8.0,10.870283,0,2.022222,0.305556,0.400000,0.738333,Antonio Brown,PIT,4.442727,4.311392,7.329091,35.293088,100.000000,182.0,0.0,9.137273,6.636465,2.500807,00-0027793,Antonio,Brown,A.Brown,,,,,,,,,
645,2017,REG,1,00-0030035,A.Thielen,WR,WR,MIN,11938,2fa2b2da-4aa9-44b5-b27e-56876dfe2ad4,13429.0,8288.0,1689.0,,16460.0,27277.0,,2059362.0,ThieAd00,,8986.0,9054.0,308.0,27277.0,733643.0,15534.0,733643.0,Adam Thielen,adam thielen,9,10,157.0,0,0.0,0.0,105.0,59.0,4.0,6.655833,0,1.495238,0.312500,0.439331,0.776281,Adam Thielen,MIN,3.920000,2.277252,10.673000,42.259265,90.000000,157.0,0.0,6.504444,7.789293,-1.284848,00-0030035,Adam,Thielen,A.Thielen,,,,,,,,,
1673,2017,REG,1,00-0033040,T.Hill,WR,WR,KC,12801,01d8aee3-e1c4-4988-970a-8c0c2d08bd83,15802.0,10799.0,3321.0,,3116406.0,29399.0,,2131163.0,HillTy00,,11222.0,11458.0,286.0,29399.0,823156.0,18082.0,823156.0,Tyreek Hill,tyreek hill,7,8,133.0,1,0.0,0.0,94.0,55.0,5.0,9.070634,0,1.414894,0.235294,0.361538,0.606018,Tyreek Hill,KC,7.078571,5.210156,11.551250,35.725055,87.500000,133.0,1.0,7.945714,10.144182,-2.198467,00-0033040,Tyreek,Hill,T.Hill,,,,,,,,,
245,2017,REG,1,00-0027891,G.Tate,WR,WR,DET,9831,c88d9352-b835-45ed-a909-1cfec09a58bc,9683.0,5585.0,642.0,goldentate/497326,13217.0,24035.0,,1265470.0,TateGo00,golden-tate-1,6389.0,5583.0,,24035.0,400490.0,11611.0,400490.0,Golden Tate,golden tate,10,12,107.0,0,0.0,0.0,68.0,43.0,5.0,-5.569108,0,1.573529,0.307692,0.232877,0.624552,Golden Tate,DET,5.639167,2.270264,5.055833,20.000659,83.333333,107.0,0.0,4.657000,4.412235,0.244765,00-0027891,Golden,Tate,G.Tate,,,,,,,,,
38,2017,REG,1,00-0026035,D.Amendola,WR,WR,NE,9308,973bfe3c-6d0d-4130-a79c-f860650b1da6,9146.0,4717.0,491.0,dannyamendola/2649,11674.0,9037.0,5595.0,516968.0,AmenDa00,,5813.0,4991.0,,9037.0,263758.0,9906.0,263758.0,Danny Amendola,danny amendola,6,7,100.0,0,1.0,0.0,54.0,49.0,5.0,6.142825,0,1.851852,0.194444,0.095238,0.358333,Danny Amendola,NE,2.763333,3.564135,7.581429,9.771861,85.714286,100.0,0.0,8.360000,8.952487,-0.592487,00-0026035,Daniel,Amendola,D.Amendola,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15247,2024,POST,21,00-0030564,D.Hopkins,WR,WR,KC,11232,5c48ade7-4b9a-4757-9643-87a6e3839e2b,11606.0,7808.0,1426.0,deandrehopkins/2540165,15795.0,26650.0,,1737078.0,HopkDe00,deandre-hopkins-1,8619.0,8404.0,,26650.0,560241.0,14986.0,560241.0,DeAndre Hopkins,deandre hopkins,1,2,11.0,0,0.0,0.0,11.0,3.0,1.0,0.274447,0,1.000000,0.080000,0.061453,0.163017,,,,,,,,,,,,,,,,,CON,KC,BUF,DeAndre Hopkins,HopkDe00,0.0,0.0,0.000,66.7
15350,2024,POST,21,00-0031941,J.Crowder,WR,WR,WAS,12184,8002dd5e-a75a-4d72-9a8c-0f4dbc80d459,13976.0,9538.0,2410.0,jamisoncrowder/2552415,2576716.0,28493.0,,1850749.0,CrowJa00,jamison-crowder-1,10224.0,10373.0,,28493.0,599649.0,16866.0,599649.0,Jamison Crowder,jamison crowder,1,3,4.0,0,0.0,0.0,18.0,0.0,0.0,-1.918393,0,0.222222,0.063830,0.051282,0.131642,,,,,,,,,,,,,,,,,CON,WAS,PHI,Jamison Crowder,CrowJa00,0.0,0.0,0.000,42.4
15467,2024,POST,21,00-0033282,C.Samuel,WR,WR,BUF,13157,66a21b6d-97e5-4732-8bb0-062145d6bbc6,16434.0,11795.0,4082.0,,3121427.0,30153.0,,2131252.0,SamuCu00,curtis-samuel-1,11710.0,12219.0,244.0,30153.0,821389.0,18928.0,821389.0,Curtis Samuel,curtis samuel,1,3,4.0,1,0.0,0.0,9.0,0.0,1.0,3.039294,0,0.444444,0.096774,0.036885,0.170981,,,,,,,,,,,,,,,,,CON,BUF,KC,Curtis Samuel,SamuCu00,0.0,1.0,0.333,81.9
15656,2024,POST,21,00-0034386,J.Watson,WR,WR,KC,13776,bdb77276-7191-4454-85c2-e1693a33d709,17603.0,66915.0,5374.0,,3118892.0,31114.0,,2137198.0,WatsJu01,,12746.0,13208.0,178.0,31114.0,832220.0,19922.0,832220.0,Justin Watson,justin watson,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
## Data Integrity Checks ##
# ids and weekly stats, pfr, and ngs

# merged dataframe
# wr_ids_ngs_pfr_stats_sorted
# wr_ids_ngs_pfr_stats_sorted.csv

# player ids and weekly stats dataframe
# wr_ids_weekly_stats

# next-gen stats dataframe
# ngs_wr_df
# ngs_wr_df.csv


# pro-football reference dataframe
# pfr_rec_df
# pfr_rec_df.csv

In [55]:
### The next section scrapes data from fantasypros ###

In [57]:
# Generates a list of (year, week) combinations for web scraping.
# - 2017-2020: Weeks 1-17
# - 2021 and beyond: Weeks 1-18
def generate_year_week_combinations(start_year, end_year):
    year_week_combinations = []
    for year in range(start_year, end_year + 1):
        max_week = 17 if year <= 2020 else 18
        year_week_combinations.extend([(year, week) for week in range(1, max_week + 1)])
    return year_week_combinations

In [105]:
# output a dataframe of weekly WR fantasypros advanced stats week-by-week
# Define the function to scrape weekly WR advanced stats from Fantasy Pros
def wr_scrape_fantasypros_advanced_stats(start_year, end_year):
    
    # Generate year-week combinations
    year_week_combinations = generate_year_week_combinations(start_year, end_year)
    
    # Initialize an empty list to store data
    all_data = []

    for year, week in year_week_combinations:
        try:
            # Construct URL
            url = f"https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={year}&week={week}&range=week&view=pergame"
            # print(f"Scraping data for Year: {year}, Week: {week} from {url}")
            
            # Send GET request
            response = requests.get(url)
            response.raise_for_status()

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find table headers and rows
            table_headers = [header.text.strip() for header in soup.find('thead').find_all('th')]
            table_rows = soup.find('tbody').find_all('tr')

            # Parse each row
            for row in table_rows:
                row_data = [cell.text.strip() for cell in row.find_all('td')]
                if len(row_data) == len(table_headers):  # Ensure the row matches headers
                    all_data.append([year, week] + row_data)

            # Random delay to avoid server overload
            time.sleep(uniform(0.3, 0.9))

        except Exception as e:
            print(f"Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # Convert data to DataFrame
    column_names = ['Year', 'Week'] + table_headers
    wr_fp_advanced_stats_df = pd.DataFrame(all_data, columns=column_names)

    # Drop the 'Rank' column if it exists
    if 'Rank' in wr_fp_advanced_stats_df.columns:
        wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.drop(columns=['Rank'])
    
    return wr_fp_advanced_stats_df

# define the years
wr_fp_advanced_stats_df = wr_scrape_fantasypros_advanced_stats(2017, 2024)

# Display the resulting dataframe shape
print(f"Shape of the resulting dataframe: {wr_fp_advanced_stats_df.shape}")

# Display the resulting dataframe
display(wr_fp_advanced_stats_df)

# Save to CSV with the updated name
# wr_fp_advanced_stats_df.to_csv("fantasypros_wr_advanced_stats.csv", index=False)

Shape of the resulting dataframe: (26748, 20)


Unnamed: 0,Year,Week,Player,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS
0,2017,1,Stefon Diggs (HOU),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0
1,2017,1,Tyreek Hill (MIA),1,7,133,78,0,55,1,0,8,7,0,0,4,1,1,1,1
2,2017,1,Kenny Golladay (FA),1,4,69,64,0,5,0,0,7,5,1,1,2,1,1,1,0
3,2017,1,Antonio Brown (FA),1,11,182,90,0,92,50,0,11,11,0,0,7,2,2,1,1
4,2017,1,Adam Thielen (CAR),1,9,157,92,0,65,17,0,10,10,0,0,4,4,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26743,2024,18,Dee Williams (NYG),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26744,2024,18,Amari Cooper (BUF),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26745,2024,18,Cedric Tillman (CLE),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26746,2024,18,Jaelon Darden (SEA),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [108]:
# Output: a dataframe of weekly WR fantasy points and % rostered data from FantasyPros
# Scrape the weekly WR fantasy points column and the % rostered from the FantasyPros website
def wr_scrape_fantasypros_fpts_rost(start_year, end_year):
    
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        # Build the URL
        url = f"https://www.fantasypros.com/nfl/stats/wr.php?year={year}&week={week}&range=week"
        # print(f"Scraping data for Year: {year}, Week: {week} from {url}")
        
        # Request the page
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the table and extract data
        table = soup.find("table", {"id": "data"})
        if table:
            table_str = str(table)
            # Skip multi-level headers using skiprows
            df = pd.read_html(io.StringIO(table_str), header=1)[0]
            df["Year"] = year
            df["Week"] = week
            all_data.append(df)
        
        # Delay to avoid bombarding the server
        time.sleep(uniform(0.3, 0.9))  # Randomized delay

    # Combine all data into a single dataframe
    wr_fpts_perct_rost_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = [
        "Rank", "REC", "TGT", "YDS", "Y/R", "LG", "20+", "TD", 
        "ATT", "YDS.1", "TD.1", "FL", "G", "FPTS/G"
    ]
    wr_fpts_perct_rost_df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

    # Save to CSV
    wr_fpts_perct_rost_df.to_csv("fantasypros_wr_fpts_perct_rost.csv", index=False)

    # Display shape of the dataframe
    print(f"Shape of WR FPTS and % Rostered dataframe after column removal: {wr_fpts_perct_rost_df.shape}")
    return wr_fpts_perct_rost_df

# define the years
wr_fpts_perct_rost_df = wr_scrape_fantasypros_fpts_rost(2017, 2024)
display(wr_fpts_perct_rost_df.head())

Scraping data for Year: 2017, Week: 1 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=1&range=week
Scraping data for Year: 2017, Week: 2 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=2&range=week
Scraping data for Year: 2017, Week: 3 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=3&range=week
Scraping data for Year: 2017, Week: 4 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=4&range=week
Scraping data for Year: 2017, Week: 5 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=5&range=week
Scraping data for Year: 2017, Week: 6 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=6&range=week
Scraping data for Year: 2017, Week: 7 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=7&range=week
Scraping data for Year: 2017, Week: 8 from https://www.fantasypros.com/nfl/stats/wr.php?year=2017&week=8&range=week
Scraping data for Year: 2017, Week: 9 from https://www.fantasypros.com/n

Unnamed: 0,Player,FPTS,ROST,Year,Week
0,Stefon Diggs (HOU),20.7,37.5%,2017,1
1,Tyreek Hill (MIA),19.8,99.2%,2017,1
2,Kenny Golladay (FA),18.9,4.0%,2017,1
3,Antonio Brown (FA),18.2,1.3%,2017,1
4,Adam Thielen (CAR),15.7,70.1%,2017,1


In [110]:
# output: a dataframe of weekly WR redzone stats from FantasyPros
# scrape the weekly WR redzone stats from the FantasyPros
def wr_scrape_fantasypros_redzone_stats(start_year, end_year):
    
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        url = f"https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={year}&range=week&week={week}"
        # print(f"Scraping data for Year: {year}, Week: {week} from {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the table and extract data
        table = soup.find("table", {"id": "data"})
        if table:
            table_str = str(table)
            df = pd.read_html(io.StringIO(table_str))[0]  # Wrap table HTML in StringIO

            # Flatten the multi-level column headers
            df.columns = df.columns.droplevel(0)
            
            # Drop the extra header rows (if any)
            df = df[df['Player'] != 'Player']  # Filter out duplicate header rows

            # Add Year and Week columns
            df["Year"] = year
            df["Week"] = week
            all_data.append(df)
        
        # Delay to avoid bombarding the server
        time.sleep(uniform(0.3, 0.9))  # Randomized delay

    # Combine all data into a single dataframe
    wr_redzone_stats_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = ['Rank', 'ATT', 'YDS', 'TD', 'PCT', 'FL', 'G', 'FPTS', 'FPTS/G', 'ROST %']
    wr_redzone_stats_df = wr_redzone_stats_df.drop(columns=columns_to_drop, errors='ignore')

    # Rename columns to add 'rz' prefix
    wr_redzone_stats_df = wr_redzone_stats_df.rename(
        columns={
            col: f"{col}_rz" for col in wr_redzone_stats_df.columns
            if col not in ['Player', 'Year', 'Week']
        }
    )

    # Save to CSV
    wr_redzone_stats_df.to_csv("fantasypros_wr_redzone_stats.csv", index=False)

    # Display shape of the dataframe
    print(f"Shape of WR Red Zone Stats dataframe: {wr_redzone_stats_df.shape}")
    return wr_redzone_stats_df

# define the years
wr_redzone_stats_df = wr_scrape_fantasypros_redzone_stats(2017, 2024)
display(wr_redzone_stats_df.head())

Shape of WR Red Zone Stats dataframe: (7159, 8)


Unnamed: 0,Player,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz,Year,Week
0,Stefon Diggs (HOU),3,3,100.0%,7.3,60.0%,2017,1
1,Bennie Fowler III (FA),2,2,100.0%,5.5,66.7%,2017,1
2,Seth Roberts (FA),1,1,100.0%,19.0,20.0%,2017,1
3,Cooper Kupp (LAR),1,1,100.0%,18.0,100.0%,2017,1
4,DeAndre Hopkins (KC),2,3,66.7%,5.5,75.0%,2017,1


In [115]:
# output: a dataframe of weekly WR advanced stats, fantasy and rosterd data, and redzone from fantasyPros
# Merge WR Advanced Stats with Fantasy Points and % Rostered
wr_adv_fp_rost_merged_df = pd.merge(
    wr_fp_advanced_stats_df,
    wr_fpts_perct_rost_df,
    on=['Player', 'Year', 'Week'],
    how='outer'
)

# Merge the resulting dataframe with Red Zone Stats
wr_adv_fp_rost_rz_merged_df = pd.merge(
    wr_adv_fp_rost_merged_df,
    wr_redzone_stats_df,
    on=['Player', 'Year', 'Week'],
    how='outer'
)

# Display the shape and a sample of the merged dataframe
print(f"Shape of the merged dataframe: {wr_adv_fp_rost_rz_merged_df.shape}")
# display(wr_adv_fp_rost_rz_merged_df.head())

# Save the merged dataframe to a CSV file
# wr_adv_fp_rost_rz_merged_df.to_csv('wr_fantasypros.csv', index=False)
# print("Merged dataframe saved as 'wr_fantasypros.csv'.")

Shape of the merged dataframe: (28286, 27)


In [118]:
# output: a dataframe of weekly WR advanced stats, fantasy and rosterd data, and redzone from fantasyPros
# Order the merged dataframe by Year, Week, and YDS
wr_adv_fp_rost_rz_merged_df_sorted = wr_adv_fp_rost_rz_merged_df.sort_values(
    by=['Year', 'Week', 'YDS'], 
    ascending=[True, True, False]  # Ascending for Year and Week, Descending for YDS
)

# Display the shape and a sample of the sorted dataframe
print(f"Shape of the sorted dataframe: {wr_adv_fp_rost_rz_merged_df_sorted.shape}")
display(wr_adv_fp_rost_rz_merged_df_sorted.head())

# Save the sorted dataframe to a CSV file
wr_adv_fp_rost_rz_merged_df_sorted.to_csv('wr_fantasypros_sorted.csv', index=False)
print("Sorted dataframe saved as 'wr_fantasypros_sorted.csv'.")

Shape of the sorted dataframe: (28286, 27)


Unnamed: 0,Year,Week,Player,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,FPTS,ROST,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz
24289,2017,1,Stefon Diggs (HOU),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0,20.7,37.5%,3.0,3.0,100.0%,7.3,60.0%
2402,2017,1,Brandin Cooks (DAL),1,3,88,71,0,17,4,0,7,4,1,0,3,1,1,1,1,8.8,25.4%,,,,,
20538,2017,1,Nelson Agholor (BAL),1,6,86,51,0,35,18,0,8,6,0,1,3,1,1,1,1,14.6,0.4%,1.0,1.0,100.0%,5.0,100.0%
22155,2017,1,Randall Cobb (FA),1,9,85,19,0,66,9,0,13,9,0,0,4,1,0,0,0,8.5,0.0%,,,,,
19457,2017,1,Michael Crabtree (FA),1,6,83,50,0,33,31,2,7,7,1,0,4,1,0,0,0,8.3,0.0%,,,,,


Sorted dataframe saved as 'wr_fantasypros_sorted.csv'.


In [120]:
## Data Integrity Checks ##
# Extract unique years and weeks from each dataframe
years_merged = sorted(wr_adv_fp_rost_rz_merged_df_sorted['Year'].unique())
weeks_merged = sorted(wr_adv_fp_rost_rz_merged_df_sorted['Week'].unique())

years_redzone = sorted(wr_redzone_stats_df['Year'].unique())
weeks_redzone = sorted(wr_redzone_stats_df['Week'].unique())

years_fpts = sorted(wr_fpts_perct_rost_df['Year'].unique())
weeks_fpts = sorted(wr_fpts_perct_rost_df['Week'].unique())

years_adv_stats = sorted(wr_fp_advanced_stats_df['Year'].unique())
weeks_adv_stats = sorted(wr_fp_advanced_stats_df['Week'].unique())

# Print the results for comparison
print(f"Years in Merged DF: {years_merged}")
print(f"Years in RedZone DF: {years_redzone}")
print(f"Years in FPTS DF: {years_fpts}")
print(f"Years in Advanced Stats DF: {years_adv_stats}\n")

print(f"Weeks in Merged DF: {weeks_merged}")
print(f"Weeks in RedZone DF: {weeks_redzone}")
print(f"Weeks in FPTS DF: {weeks_fpts}")
print(f"Weeks in Advanced Stats DF: {weeks_adv_stats}")


Years in Merged DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in RedZone DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in FPTS DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in Advanced Stats DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

Weeks in Merged DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in RedZone DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in FPTS DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in Advanced Stats DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


In [136]:
## Data Integrity Checks ##
# load the datframe from the variables in memory or csv files in the current working directory
def load_dataframe(var_name, file_name):
    """ Load dataframe from a variable if it exists, otherwise read from CSV. """
    try:
        return globals()[var_name]  # Try to get the variable from the global namespace
    except KeyError:
        print(f"⚠️ {var_name} not found in memory. Reading from {file_name}")
        return pd.read_csv(file_name)

In [163]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df of advanced stats
def test_wr_advanced_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1, seed=42):
    """
    Integrity check for WR advanced stats.
    
    - Uses Stratified Random Sampling by year.
    - Tests key stats (REC, YDS, YBC, AIR, YAC, TGT, CATCHABLE).
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    random.seed(seed)  # Set seed for reproducibility
    mismatches = []
    
    print("🔹 Running Advanced Stats Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']\n")
    
    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fp_rost_rz_merged_df_sorted", "wr_fantasypros_sorted.csv")
    adv_stats_df = load_dataframe("wr_fp_advanced_stats_df", "fantasypros_wr_advanced_stats.csv")
    
    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(adv_stats_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")
        
        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        adv_year_df = adv_stats_df[adv_stats_df['Year'] == year]
        
        # Skip if no data
        if merged_year_df.empty or adv_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue
        
        # Select random players
        sampled_players = random.sample(list(merged_year_df['Player'].dropna()), min(num_samples, len(merged_year_df)))
        
        total_comparisons = 0
        match_count = 0
        
        for player in sampled_players:
            for week in merged_year_df[merged_year_df['Player'] == player]['Week'].unique():
                
                merged_row = merged_year_df[(merged_year_df['Player'] == player) & (merged_year_df['Week'] == week)]
                adv_row = adv_year_df[(adv_year_df['Player'] == player) & (adv_year_df['Week'] == week)]
                
                if merged_row.empty or adv_row.empty:
                    continue  # Skip if missing
                
                for col in ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    adv_val = adv_row[col].values[0] if col in adv_row else None
                    
                    if merged_val is not None and adv_val is not None:
                        total_comparisons += 1
                        if merged_val == adv_val:
                            match_count += 1
                        else:
                            mismatches.append((year, player, week, col, merged_val, adv_val))
        
        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_players)} players in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")
    
    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")
    
    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# Run the test
test_wr_advanced_stats()


🔹 Running Advanced Stats Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 players in Year 2017 (2415 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 players in Year 2018 (2310 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 players in Year 2019 (2653 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 players in Year 2020 (2737 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 players in Year 2021 (2877 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 players in Year 2022 (2688 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 players in Year 2023 (3059 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 3059
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [167]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df fpts and rost stats
def test_wr_fpts_rost(years=range(2017, 2024), num_samples=25, tolerance=0.1, seed=42):
    """
    Integrity check for WR Fantasy Points and % Rostered.
    
    - Uses Stratified Random Sampling by year.
    - Tests key stats (FPTS, ROST).
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    random.seed(seed)  # Set seed for reproducibility
    mismatches = []
    
    print("\n🔹 Running Fantasy Points & % Rostered Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['FPTS', 'ROST']\n")
    
    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fp_rost_rz_merged_df_sorted", "wr_fantasypros_sorted.csv")
    fpts_df = load_dataframe("wr_fpts_perct_rost_df", "fantasypros_wr_fpts_perct_rost.csv")
    
    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(fpts_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")
        
        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        fpts_year_df = fpts_df[fpts_df['Year'] == year]
        
        # Skip if no data
        if merged_year_df.empty or fpts_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue
        
        # Select random players
        sampled_players = random.sample(list(merged_year_df['Player'].dropna()), min(num_samples, len(merged_year_df)))
        
        total_comparisons = 0
        match_count = 0
        
        for player in sampled_players:
            for week in merged_year_df[merged_year_df['Player'] == player]['Week'].unique():
                
                merged_row = merged_year_df[(merged_year_df['Player'] == player) & (merged_year_df['Week'] == week)]
                fpts_row = fpts_year_df[(fpts_year_df['Player'] == player) & (fpts_year_df['Week'] == week)]
                
                if merged_row.empty or fpts_row.empty:
                    continue  # Skip if missing
                
                for col in ['FPTS', 'ROST']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    fpts_val = fpts_row[col].values[0] if col in fpts_row else None
                    
                    if merged_val is not None and fpts_val is not None:
                        total_comparisons += 1
                        if merged_val == fpts_val:
                            match_count += 1
                        else:
                            mismatches.append((year, player, week, col, merged_val, fpts_val))
        
        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_players)} players in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")
    
    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")
    
    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# Run the test
test_wr_fpts_rost()



🔹 Running Fantasy Points & % Rostered Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['FPTS', 'ROST']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 players in Year 2017 (676 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 players in Year 2018 (638 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 players in Year 2019 (644 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 players in Year 2020 (698 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 players in Year 2021 (540 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 players in Year 2022 (574 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 players in Year 2023 (548 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 548
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [169]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df of redzone stats
def test_wr_redzone_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1, seed=42):
    """
    Integrity check for WR Red Zone Stats.
    
    - Uses Stratified Random Sampling by year.
    - Tests key stats: 'REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz'.
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    random.seed(seed)  # Set seed for reproducibility
    mismatches = []
    
    print("\n🔹 Running Red Zone Stats Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']\n")
    
    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fp_rost_rz_merged_df_sorted", "wr_fantasypros_sorted.csv")
    rz_df = load_dataframe("wr_redzone_stats_df", "fantasypros_wr_redzone_stats.csv")
    
    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(rz_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")
        
        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        rz_year_df = rz_df[rz_df['Year'] == year]
        
        # Skip if no data
        if merged_year_df.empty or rz_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue
        
        # Select random players
        sampled_players = random.sample(list(merged_year_df['Player'].dropna()), min(num_samples, len(merged_year_df)))
        
        total_comparisons = 0
        match_count = 0
        
        for player in sampled_players:
            for week in merged_year_df[merged_year_df['Player'] == player]['Week'].unique():
                
                merged_row = merged_year_df[(merged_year_df['Player'] == player) & (merged_year_df['Week'] == week)]
                rz_row = rz_year_df[(rz_year_df['Player'] == player) & (rz_year_df['Week'] == week)]
                
                if merged_row.empty or rz_row.empty:
                    continue  # Skip if missing
                
                for col in ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    rz_val = rz_row[col].values[0] if col in rz_row else None
                    
                    if merged_val is not None and rz_val is not None:
                        total_comparisons += 1
                        if merged_val == rz_val:
                            match_count += 1
                        else:
                            mismatches.append((year, player, week, col, merged_val, rz_val))
        
        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_players)} players in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")
    
    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")
    
    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# Run the test
test_wr_redzone_stats()



🔹 Running Red Zone Stats Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 players in Year 2017 (505 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 players in Year 2018 (550 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 players in Year 2019 (675 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 players in Year 2020 (720 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 players in Year 2021 (535 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 players in Year 2022 (490 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 players in Year 2023 (430 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 430
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [76]:
## Next tasks
# split the team abbreviation from the 'player' column into a separate column 
# merge the dataframe with the wr_ids_ngs_pfr_stats_sorted dataframe