In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# the NFL python library seem to not work on Tuesday probably due to updates (not confirmed)
# unbalanced dataframe - pfr stats start at 2018; all other stats go back to 2017

In [3]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the season start date in the 'get_current_week' function
# modify the number of weeks if the NFL adds regular season games to the schedule

In [352]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re
import time
from random import sample, uniform, seed
import io
from rapidfuzz import fuzz, process
import numpy as np
import hashlib

In [5]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [6]:
# Function to calculate the current week of the NFL season
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)  # Update for the season start
    current_week = ((current_date - season_start_date).days // 7) + 1
    return current_week

# Define the current NFL year, week, and season type
current_year = datetime.now().year
current_week = get_current_week()
seasontype = 2 if current_week <= 18 else 3  # Regular season or playoffs

In [7]:
### Begin: Python NFL Library Dataframe ###

In [8]:
# define the years to pull
# nfl.import_weekly_data(years, columns, downcast)
def get_year_range(current_year, current_week, start_year=2017):
    if current_week <= 18:  # Regular season
        return list(range(start_year, current_year + 1))
    else:  # Playoffs
        return list(range(start_year, current_year))

# Use the function
years = get_year_range(current_year, current_week)

In [9]:
# define the base columns. 
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team'
]

In [10]:
# Import the player IDs from nfl.import_ids() - without parameters
ids_data = nfl.import_ids()

# Drop the unnecessary columns
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year', 
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username', 
    'height', 'weight', 'college', 'db_season'
]
ids_data = ids_data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [11]:
# import the weekly data from nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [12]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# Merge the two dataframes on 'player_id' and 'gsis_id'
# Align column names for merging
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})  
id_dataframe = pd.merge(weekly_data, ids_data, on='player_id', how='inner')

# Assign the resulting dataframe to a variable
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [13]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
wide_receiver_ids = all_players_id_data[all_players_id_data['position'] == 'WR']

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17405, 29)


In [14]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Pull WR-specific columns from weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns  # Include keys for merging
)

# Merge WR-specific stats with wide_receiver_ids
wr_ids_weekly_stats = pd.merge(
    wide_receiver_ids,
    wr_stats,
    on=['player_id', 'season', 'week'],  # Ensure correct alignment
    how='inner'
)

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wr_ids_weekly_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wide_receiver_ids.shape[0]}"
)

# display the df
display(wr_ids_weekly_stats)

# csv file
wr_ids_weekly_stats.to_csv('wr_ids_weekly_stats.csv', index=False)

Downcasting floats.
Shape of merged dataframe: (17405, 44)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17400,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119
17401,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,
17402,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294
17403,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176


In [15]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Pull NGS receiving data for the specified years
ngs_wr_df = nfl.import_ngs_data('receiving', years)

# Exclude rows where 'week' == 0 and filter for 'WR' position in one step
ngs_wr_df = ngs_wr_df[(ngs_wr_df['week'] != 0) & (ngs_wr_df['player_position'] == 'WR')]

# Drop unnecessary columns
ngs_wr_df = ngs_wr_df.drop(columns=['season_type', 'player_position', 'receptions', 'targets','player_jersey_number'], errors='ignore')

# Display the resulting dataframe
print(f"Shape of NGS WR DataFrame after dropping columns: {ngs_wr_df.shape}")
display(ngs_wr_df)

# csv file
ngs_wr_df.to_csv('ngs_wr_df.csv', index=False)

Shape of NGS WR DataFrame after dropping columns: (8249, 18)


Unnamed: 0,season,week,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,1,Ryan Grant,WAS,9.936667,2.894592,4.410000,7.154639,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,1,Martavis Bryant,PIT,8.300000,4.122054,12.688333,33.327496,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,1,Jamison Crowder,WAS,7.655000,3.177793,10.540000,19.949707,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,1,Nelson Agholor,PHI,7.423750,2.462620,10.463750,20.274656,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,1,John Brown,ARI,7.360000,2.751526,13.422222,28.208481,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13323,2024,23,Xavier Worthy,KC,8.160000,4.959113,14.276250,44.737358,100.000000,157.0,2,6.250000,6.154624,0.095376,00-0039894,Xavier,Worthy,X.Worthy
13324,2024,23,DeAndre Hopkins,KC,7.676000,3.446231,11.974000,23.451761,40.000000,18.0,1,0.565000,0.798474,-0.233474,00-0030564,DeAndre,Hopkins,D.Hopkins
13325,2024,23,DeVonta Smith,PHI,7.470000,2.221577,14.752000,40.028219,80.000000,69.0,1,0.340000,0.600076,-0.260076,00-0036912,DeVonta,Smith,D.Smith
13327,2024,23,Marquise Brown,KC,4.943333,3.302615,6.356667,14.939872,33.333333,15.0,0,2.450000,3.533891,-1.083891,00-0035662,Marquise,Brown,M.Brown


In [16]:
# Output: a dataframe of NFL WR info, ids, weekly stats, and next-gen stats since 2017

# Joins wr_ids_weekly_stats dataframe with ngs_wr_df using the keys 'player_id' and 'player_gsis_id'
# Merge wr_ids_weekly_stats with ngs_wr_df using a left join
wr_ids_ngs_weekly_stats = pd.merge(
    wr_ids_weekly_stats,
    ngs_wr_df,
    left_on=['player_id', 'season', 'week'],  # Keys from wr_ids_weekly_stats
    right_on=['player_gsis_id', 'season', 'week'],  # Keys from ngs_wr_df
    how='left'  # Retain all rows from wr_ids_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_weekly_stats.shape}")
print(f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0]}")

# Display a sample of the merged dataframe
display(wr_ids_ngs_weekly_stats)

# csv file
wr_ids_ngs_weekly_stats.to_csv('wr_ids_ngs_weekly_stats.csv', index=False)

Shape of merged dataframe: (17405, 60)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324,Larry Fitzgerald,ARI,5.936667,2.293974,10.764615,32.677938,46.153846,74.0,0.0,7.375000,8.630824,-1.255824,00-0022921,Larry,Fitzgerald,L.Fitzgerald
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565,Larry Fitzgerald,ARI,4.746667,2.808189,5.010000,7.075605,50.000000,21.0,0.0,5.873333,6.784866,-0.911533,00-0022921,Larry,Fitzgerald,L.Fitzgerald
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731,Larry Fitzgerald,ARI,6.556000,3.289943,9.675333,33.997048,86.666667,149.0,1.0,3.536154,3.383512,0.152642,00-0022921,Larry,Fitzgerald,L.Fitzgerald
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539,Larry Fitzgerald,ARI,8.400000,2.609960,5.661429,8.939165,57.142857,32.0,1.0,4.125000,3.548166,0.576834,00-0022921,Larry,Fitzgerald,L.Fitzgerald
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770,Larry Fitzgerald,ARI,6.392222,2.449503,4.217000,11.136346,60.000000,51.0,0.0,6.521667,5.652730,0.868937,00-0022921,Larry,Fitzgerald,L.Fitzgerald
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17400,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119,,,,,,,,,,,,,,,,
17401,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,,,,,,,,,,,,,,,,,
17402,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294,,,,,,,,,,,,,,,,
17403,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176,,,,,,,,,,,,,,,,


In [17]:
# Output: pro-football reference dataframe for receiving data from the python nfl library
# note: PFR data not available before 2018
# there is no position info so the data will pull WR, TE, and RB receiving data

# Define the range of years for PFR data (2018 to the current year)
pfr_years = list(range(2018, current_year))

# import pro-football reference data
pfr_rec_df = nfl.import_weekly_pfr('rec',pfr_years)

# Drop unnecessary columns
pfr_rec_df = pfr_rec_df.drop(
    columns=['game_id','pfr_game_id','receiving_int','rushing_broken_tackles', 
             'passing_drops', 'passing_drop_pct'], errors='ignore')

# display dataframe
print(f"Shape of PFR dataframe: {pfr_rec_df.shape}")
display(pfr_rec_df)

# csv file
pfr_rec_df.to_csv('pfr_rec_df.csv', index=False)

Shape of PFR dataframe: (31191, 11)


Unnamed: 0,season,week,game_type,team,opponent,pfr_player_name,pfr_player_id,receiving_broken_tackles,receiving_drop,receiving_drop_pct,receiving_rat
0,2018,1,REG,PHI,ATL,Nelson Agholor,AghoNe00,0.0,0.0,0.000,80.4
1,2018,1,REG,PHI,ATL,Zach Ertz,ErtzZa00,0.0,2.0,0.200,63.7
2,2018,1,REG,PHI,ATL,Darren Sproles,SproDa00,1.0,1.0,0.143,62.8
3,2018,1,REG,PHI,ATL,DeAndre Carter,CartDe02,0.0,0.0,0.000,108.3
4,2018,1,REG,PHI,ATL,Nick Foles,FoleNi00,0.0,0.0,0.000,118.7
...,...,...,...,...,...,...,...,...,...,...,...
4448,2024,22,SB,KC,PHI,JuJu Smith-Schuster,SmitJu00,0.0,0.0,0.000,100.0
4449,2024,22,SB,KC,PHI,Noah Gray,GrayNo00,0.0,0.0,0.000,79.2
4450,2024,22,SB,KC,PHI,Kareem Hunt,HuntKa00,0.0,0.0,0.000,87.5
4451,2024,22,SB,KC,PHI,Isiah Pacheco,PachIs00,0.0,0.0,0.000,56.2


In [18]:
# Output: a dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-football reference data
# NOTE: unbalanced dataframe - pfr stats start at 2018

# merge the pfr_rec_df with the wr_ids_ngs_weekly_stats dataframe
# match with ids then filter out the unmatched rows as they are likely (TE)
# Step 1: Merge the dataframes with a LEFT JOIN
wr_ids_ngs_pfr_stats = pd.merge(
    wr_ids_ngs_weekly_stats,
    pfr_rec_df,  # Use the full PFR dataframe as position data is unavailable
    left_on=['pfr_id', 'season', 'week'],  # Keys from wr_ids_ngs_weekly_stats
    right_on=['pfr_player_id', 'season', 'week'],  # Keys from pfr_rec_df
    how='left'  # Retain all rows from wr_ids_ngs_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0]}"
)

# Display the first few rows of the merged dataframe for review
display(wr_ids_ngs_pfr_stats)

# csv file
wr_ids_ngs_pfr_stats.to_csv('wr_ids_ngs_pfr_stats.csv', index=False)

Shape of merged dataframe: (17405, 69)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name,game_type,team,opponent,pfr_player_name,pfr_player_id,receiving_broken_tackles,receiving_drop,receiving_drop_pct,receiving_rat
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324,Larry Fitzgerald,ARI,5.936667,2.293974,10.764615,32.677938,46.153846,74.0,0.0,7.375000,8.630824,-1.255824,00-0022921,Larry,Fitzgerald,L.Fitzgerald,,,,,,,,,
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565,Larry Fitzgerald,ARI,4.746667,2.808189,5.010000,7.075605,50.000000,21.0,0.0,5.873333,6.784866,-0.911533,00-0022921,Larry,Fitzgerald,L.Fitzgerald,,,,,,,,,
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731,Larry Fitzgerald,ARI,6.556000,3.289943,9.675333,33.997048,86.666667,149.0,1.0,3.536154,3.383512,0.152642,00-0022921,Larry,Fitzgerald,L.Fitzgerald,,,,,,,,,
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539,Larry Fitzgerald,ARI,8.400000,2.609960,5.661429,8.939165,57.142857,32.0,1.0,4.125000,3.548166,0.576834,00-0022921,Larry,Fitzgerald,L.Fitzgerald,,,,,,,,,
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,7393,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,9383.0,1724.0,223.0,larryfitzgerald/2506106,5528.0,6762.0,1732.0,492934.0,FitzLa00,larry-fitzgerald-1,3730.0,1661.0,,6762.0,246053.0,5571.0,,Larry Fitzgerald,larry fitzgerald,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770,Larry Fitzgerald,ARI,6.392222,2.449503,4.217000,11.136346,60.000000,51.0,0.0,6.521667,5.652730,0.868937,00-0022921,Larry,Fitzgerald,L.Fitzgerald,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17400,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119,,,,,,,,,,,,,,,,,REG,NYJ,TEN,Malachi Corley,CorlMa00,0.0,0.0,0.0,83.3
17401,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
17402,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294,,,,,,,,,,,,,,,,,REG,NYJ,ARI,Malachi Corley,CorlMa00,0.0,0.0,0.0,56.2
17403,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,16636,bae59933-8b94-4837-990e-f0a4ced3cdbb,26023.0,,11617.0,,4613104.0,40944.0,,3162613.0,CorlMa00,malachi-corley-1,17777.0,,1607.0,40944.0,0.0,,1215291.0,Malachi Corley,malachi corley,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176,,,,,,,,,,,,,,,,,REG,NYJ,IND,Malachi Corley,CorlMa00,0.0,0.0,0.0,108.3


In [19]:
# Output: an ordered dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-footeball reference data
# Output: Ordered the df by year, week, and receiving yards
# NOTE: unbalanced dataframe - pfr stats start at 2018

# Order the dataframe by season (year), week, and receiving_yards
wr_ids_ngs_pfr_stats_sorted = wr_ids_ngs_pfr_stats.sort_values(
    by=['season', 'week', 'receiving_yards'], 
    ascending=[True, True, False]  # Ascending for season and week, descending for receiving_yards
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats_sorted.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0] == wr_ids_ngs_pfr_stats_sorted.shape[0]}"
)

# Display the sorted dataframe
print("Dataframe sorted by season, week, and receiving_yards:")
display(wr_ids_ngs_pfr_stats_sorted)


# Save the sorted dataframe to a csv
wr_ids_ngs_pfr_stats_sorted.to_csv('wr_ids_ngs_pfr_stats_sorted.csv', index=False)

Shape of merged dataframe: (17405, 69)
Row count matches: True
Dataframe sorted by season, week, and receiving_yards:


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name,game_type,team,opponent,pfr_player_name,pfr_player_id,receiving_broken_tackles,receiving_drop,receiving_drop_pct,receiving_rat
214,2017,REG,1,00-0027793,A.Brown,WR,WR,PIT,9988,16e33176-b73e-49b7-b0aa-c405b47a706e,9808.0,5718.0,536.0,antoniobrown/2508061,13934.0,24171.0,,1272852.0,BrowAn04,antonio-brown-1,6454.0,5698.0,,24171.0,406214.0,11056.0,406214.0,Antonio Brown,antonio brown,11,11,182.0,0,1.0,0.0,90.0,92.0,8.0,10.870283,0,2.022222,0.305556,0.400000,0.738333,Antonio Brown,PIT,4.442727,4.311392,7.329091,35.293088,100.000000,182.0,0.0,9.137273,6.636465,2.500807,00-0027793,Antonio,Brown,A.Brown,,,,,,,,,
645,2017,REG,1,00-0030035,A.Thielen,WR,WR,MIN,11938,2fa2b2da-4aa9-44b5-b27e-56876dfe2ad4,13429.0,8288.0,1689.0,,16460.0,27277.0,,2059362.0,ThieAd00,,8986.0,9054.0,308.0,27277.0,733643.0,15534.0,733643.0,Adam Thielen,adam thielen,9,10,157.0,0,0.0,0.0,105.0,59.0,4.0,6.655833,0,1.495238,0.312500,0.439331,0.776281,Adam Thielen,MIN,3.920000,2.277252,10.673000,42.259265,90.000000,157.0,0.0,6.504444,7.789293,-1.284848,00-0030035,Adam,Thielen,A.Thielen,,,,,,,,,
1673,2017,REG,1,00-0033040,T.Hill,WR,WR,KC,12801,01d8aee3-e1c4-4988-970a-8c0c2d08bd83,15802.0,10799.0,3321.0,,3116406.0,29399.0,,2131163.0,HillTy00,,11222.0,11458.0,286.0,29399.0,823156.0,18082.0,823156.0,Tyreek Hill,tyreek hill,7,8,133.0,1,0.0,0.0,94.0,55.0,5.0,9.070634,0,1.414894,0.235294,0.361538,0.606018,Tyreek Hill,KC,7.078571,5.210156,11.551250,35.725055,87.500000,133.0,1.0,7.945714,10.144182,-2.198467,00-0033040,Tyreek,Hill,T.Hill,,,,,,,,,
245,2017,REG,1,00-0027891,G.Tate,WR,WR,DET,9831,c88d9352-b835-45ed-a909-1cfec09a58bc,9683.0,5585.0,642.0,goldentate/497326,13217.0,24035.0,,1265470.0,TateGo00,golden-tate-1,6389.0,5583.0,,24035.0,400490.0,11611.0,400490.0,Golden Tate,golden tate,10,12,107.0,0,0.0,0.0,68.0,43.0,5.0,-5.569108,0,1.573529,0.307692,0.232877,0.624552,Golden Tate,DET,5.639167,2.270264,5.055833,20.000659,83.333333,107.0,0.0,4.657000,4.412235,0.244765,00-0027891,Golden,Tate,G.Tate,,,,,,,,,
38,2017,REG,1,00-0026035,D.Amendola,WR,WR,NE,9308,973bfe3c-6d0d-4130-a79c-f860650b1da6,9146.0,4717.0,491.0,dannyamendola/2649,11674.0,9037.0,5595.0,516968.0,AmenDa00,,5813.0,4991.0,,9037.0,263758.0,9906.0,263758.0,Danny Amendola,danny amendola,6,7,100.0,0,1.0,0.0,54.0,49.0,5.0,6.142825,0,1.851852,0.194444,0.095238,0.358333,Danny Amendola,NE,2.763333,3.564135,7.581429,9.771861,85.714286,100.0,0.0,8.360000,8.952487,-0.592487,00-0026035,Daniel,Amendola,D.Amendola,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15248,2024,POST,22,00-0030564,D.Hopkins,WR,WR,KC,11232,5c48ade7-4b9a-4757-9643-87a6e3839e2b,11606.0,7808.0,1426.0,deandrehopkins/2540165,15795.0,26650.0,,1737078.0,HopkDe00,deandre-hopkins-1,8619.0,8404.0,,26650.0,560241.0,14986.0,560241.0,DeAndre Hopkins,deandre hopkins,2,5,18.0,1,0.0,0.0,54.0,1.0,2.0,-6.099530,1,0.333333,0.156250,0.219512,0.388034,,,,,,,,,,,,,,,,,SB,KC,PHI,DeAndre Hopkins,HopkDe00,0.0,1.0,0.2,50.4
15570,2024,POST,22,00-0033857,J.Smith-Schuster,WR,WR,KC,13156,9547fbb1-0d4f-4d9e-83b9-e2fa30463bb9,16427.0,11817.0,4040.0,,3120348.0,30175.0,,2139620.0,SmitJu00,juju-smith-1,11877.0,12184.0,,30175.0,835909.0,18883.0,835909.0,JuJu Smith-Schuster,juju smith-schuster,2,2,16.0,0,0.0,0.0,6.0,10.0,1.0,0.514393,0,2.666667,0.062500,0.024390,0.110823,,,,,,,,,,,,,,,,,SB,KC,PHI,JuJu Smith-Schuster,SmitJu00,0.0,0.0,0.0,100.0
15943,2024,POST,22,00-0035662,M.Brown,WR,WR,KC,14105,feeee40a-dd63-41a7-89cd-6c95b5456833,18226.0,61568.0,5848.0,,4241372.0,31857.0,,2804128.0,BrowMa04,marquise-brown-1,13502.0,14010.0,311.0,31857.0,976220.0,21045.0,976220.0,Marquise Brown,marquise brown,2,6,15.0,0,0.0,0.0,39.0,5.0,1.0,-8.434413,0,0.384615,0.187500,0.158537,0.392226,,,,,,,,,,,,,,,,,SB,KC,PHI,Marquise Brown,BrowMa04,0.0,0.0,0.0,2.8
15659,2024,POST,22,00-0034386,J.Watson,WR,WR,KC,13776,bdb77276-7191-4454-85c2-e1693a33d709,17603.0,66915.0,5374.0,,3118892.0,31114.0,,2137198.0,WatsJu01,,12746.0,13208.0,178.0,31114.0,832220.0,19922.0,832220.0,Justin Watson,justin watson,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
### End: Python NFL Library Dataframe ###

In [21]:
### Begin: Python NFL Library Data Integrity Checks ###

In [22]:
## Data Integrity Checks ##
# load the datframe from the variables in memory or csv files in the current working directory
def load_dataframe(var_name, file_name):
    try:
        return globals()[var_name]  # Try to get the variable from the global namespace
    except KeyError:
        print(f"⚠️ {var_name} not found in memory. Reading from {file_name}")
        return pd.read_csv(file_name)

In [23]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the python nfl library
# test to ensure merged df values match the ids and weekly stats from the unmerged df
def test_wr_weekly_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1):
    """
    Perform integrity checks on WR Weekly Stats data by comparing merged and unmerged dataframes.
    This function checks consistency across selected numerical columns.

    Parameters:
    years (range): Range of years to test.
    num_samples (int): Number of random samples per year.
    tolerance (float): Allowed numerical difference for matching values.

    Returns:
    None
    """
    print("\n🔹 Running Weekly Stats Integrity Check...")

    # Load DataFrames
    merged_df = load_dataframe("wr_ids_ngs_pfr_stats_sorted", "wr_ids_ngs_pfr_stats_sorted.csv")
    weekly_df = load_dataframe("wr_ids_weekly_stats", "wr_ids_weekly_stats.csv")

    print(f"✅ Using {'in-memory variable' if 'wr_ids_ngs_pfr_stats_sorted' in globals() else 'CSV file'}: wr_ids_ngs_pfr_stats_sorted")
    print(f"✅ Using {'in-memory variable' if 'wr_ids_weekly_stats' in globals() else 'CSV file'}: wr_ids_weekly_stats")

    mismatches = []

    for year in years:
        print(f"\nTesting Year: {year}")

        # Filter data for the specified year
        merged_year_df = merged_df[merged_df["season"] == year]
        weekly_year_df = weekly_df[weekly_df["season"] == year]

        # Ensure numeric columns
        numeric_cols = ["receptions", "targets", "receiving_yards", "receiving_air_yards", "receiving_yards_after_catch", "receiving_first_downs"]
        merged_year_df.loc[:, numeric_cols] = merged_year_df[numeric_cols].apply(pd.to_numeric, errors="coerce")
        weekly_year_df.loc[:, numeric_cols] = weekly_year_df[numeric_cols].apply(pd.to_numeric, errors="coerce")

        # Select random sample of players for this year
        sampled_players = sample(list(merged_year_df["player_id"].dropna()), min(num_samples, len(merged_year_df)))

        for player_id in sampled_players:
            # Filter player data from each DataFrame
            merged_player = merged_year_df[merged_year_df["player_id"] == player_id]
            weekly_player = weekly_year_df[weekly_year_df["player_id"] == player_id]

            # Loop through each week and compare values
            for week in merged_player["week"].unique():
                merged_row = merged_player[merged_player["week"] == week]
                weekly_row = weekly_player[weekly_player["week"] == week]

                if merged_row.empty or weekly_row.empty:
                    continue  # Skip if there's no data for this player/week

                # Compare numerical columns
                for col in numeric_cols:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    weekly_val = weekly_row[col].values[0] if col in weekly_row else None

                    if merged_val is not None and weekly_val is not None:
                        if abs(float(merged_val) - float(weekly_val)) > tolerance:
                            mismatches.append((year, player_id, week, col, merged_val, weekly_val))

        print(f"Tested {len(sampled_players)} players in Year {year}.")

    # Summary of mismatches
    print(f"\nTotal mismatches found in Weekly Stats: {len(mismatches)}")
    if mismatches:
        print("\nMismatch Details (Year, Player ID, Week, Column, Merged Value, Weekly Value):")
        for m in mismatches[:10]:  # Print only first 10 mismatches
            print(m)
    else:
        print("✅ Weekly Stats Integrity Check Passed!")

# Run the test
test_wr_weekly_stats()


🔹 Running Weekly Stats Integrity Check...
✅ Using in-memory variable: wr_ids_ngs_pfr_stats_sorted
✅ Using in-memory variable: wr_ids_weekly_stats

Testing Year: 2017
Tested 25 players in Year 2017.

Testing Year: 2018
Tested 25 players in Year 2018.

Testing Year: 2019
Tested 25 players in Year 2019.

Testing Year: 2020
Tested 25 players in Year 2020.

Testing Year: 2021
Tested 25 players in Year 2021.

Testing Year: 2022
Tested 25 players in Year 2022.

Testing Year: 2023
Tested 25 players in Year 2023.

Total mismatches found in Weekly Stats: 0
✅ Weekly Stats Integrity Check Passed!


In [24]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the python nfl library
# test to ensure merged df values match the next-gen stats within the unmerged df
def test_wr_ngs_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1):
    """
    Perform integrity checks on WR Next-Gen Stats (NGS) data.
    Compares key stats across merged and unmerged dataframes.
    """
    print("\n🔹 Running Next-Gen Stats (NGS) Integrity Check...")

    # Load dataframes
    merged_df = load_dataframe("wr_ids_ngs_pfr_stats_sorted", "wr_ids_ngs_pfr_stats_sorted.csv")
    ngs_df = load_dataframe("ngs_wr_df", "ngs_wr_df.csv")

    # Validation message
    print("✅ Using in-memory variable: wr_ids_ngs_pfr_stats_sorted" if "wr_ids_ngs_pfr_stats_sorted" in globals() else "✅ Loaded wr_ids_ngs_pfr_stats_sorted from CSV")
    print("✅ Using in-memory variable: ngs_wr_df" if "ngs_wr_df" in globals() else "✅ Loaded ngs_wr_df from CSV")

    mismatches = []

    for year in years:
        print(f"\nTesting Season: {year}")

        # Filter data for the given season
        merged_year_df = merged_df[merged_df["season"] == year]
        ngs_year_df = ngs_df[ngs_df["season"] == year]

        # Select random sample of players for this season
        sampled_players = sample(list(merged_year_df["player_id"].dropna()), min(num_samples, len(merged_year_df)))

        for player_id in sampled_players:
            # Filter player data from each DataFrame
            merged_player = merged_year_df[merged_year_df["player_id"] == player_id]
            ngs_player = ngs_year_df[ngs_year_df["player_gsis_id"] == player_id]

            for week in merged_player["week"].unique():
                merged_row = merged_player[merged_player["week"] == week]
                ngs_row = ngs_player[ngs_player["week"] == week]

                if merged_row.empty or ngs_row.empty:
                    continue  # Skip if there's no data for this player/week
                
                # Compare Next-Gen Stats columns
                for col in [
                    "avg_cushion", "avg_separation", "avg_intended_air_yards",
                    "percent_share_of_intended_air_yards", "catch_percentage",
                    "avg_yac", "avg_expected_yac", "avg_yac_above_expectation"
                ]:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    ngs_val = ngs_row[col].values[0] if col in ngs_row else None

                    if merged_val is not None and ngs_val is not None:
                        merged_val = float(merged_val)  # Ensure numeric conversion
                        ngs_val = float(ngs_val)  # Ensure numeric conversion

                        if abs(merged_val - ngs_val) > tolerance:
                            mismatches.append((year, player_id, week, col, merged_val, ngs_val))

        print(f"Tested {len(sampled_players)} players in Season {year}.")

    print(f"\nTotal mismatches found in Next-Gen Stats: {len(mismatches)}")

    if mismatches:
        print("\nMismatch Details (Season, Player ID, Week, Column, Merged Value, NGS Value):")
        for m in mismatches[:10]:  # Print only the first 10 mismatches
            print(m)
    else:
        print("✅ Next-Gen Stats Integrity Check Passed!")

# Run the test
test_wr_ngs_stats()


🔹 Running Next-Gen Stats (NGS) Integrity Check...
✅ Using in-memory variable: wr_ids_ngs_pfr_stats_sorted
✅ Using in-memory variable: ngs_wr_df

Testing Season: 2017
Tested 25 players in Season 2017.

Testing Season: 2018
Tested 25 players in Season 2018.

Testing Season: 2019
Tested 25 players in Season 2019.

Testing Season: 2020
Tested 25 players in Season 2020.

Testing Season: 2021
Tested 25 players in Season 2021.

Testing Season: 2022
Tested 25 players in Season 2022.

Testing Season: 2023
Tested 25 players in Season 2023.

Total mismatches found in Next-Gen Stats: 0
✅ Next-Gen Stats Integrity Check Passed!


In [25]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the python nfl library
# test to ensure merged df values match the pro-football reference stats within the unmerged df
def test_wr_pfr_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1):
    """
    Perform integrity checks on WR data from Pro-Football Reference (PFR).
    Compares key stats across merged and unmerged dataframes.
    """
    print("\n🔹 Running Pro-Football Reference (PFR) Integrity Check...")

    # Load dataframes
    merged_df = load_dataframe("wr_ids_ngs_pfr_stats_sorted", "wr_ids_ngs_pfr_stats_sorted.csv")
    pfr_df = load_dataframe("pfr_rec_df", "pfr_rec_df.csv")

    # Validation message
    print("✅ Using in-memory variable: wr_ids_ngs_pfr_stats_sorted" if "wr_ids_ngs_pfr_stats_sorted" in globals() else "✅ Loaded wr_ids_ngs_pfr_stats_sorted from CSV")
    print("✅ Using in-memory variable: pfr_rec_df" if "pfr_rec_df" in globals() else "✅ Loaded pfr_rec_df from CSV")

    mismatches = []

    for year in years:
        print(f"\nTesting Season: {year}")

        # Filter data for the given season
        merged_year_df = merged_df[merged_df["season"] == year].copy()
        pfr_year_df = pfr_df[pfr_df["season"] == year].copy()

        # Ensure 'pfr_id' and 'pfr_player_id' are treated as strings
        merged_year_df["pfr_id"] = merged_year_df["pfr_id"].astype(str)
        pfr_year_df["pfr_player_id"] = pfr_year_df["pfr_player_id"].astype(str)

        # Select random sample of players for this season
        sampled_players = sample(list(merged_year_df["pfr_id"].dropna()), min(num_samples, len(merged_year_df)))

        for player_id in sampled_players:
            # Filter player data from each DataFrame
            merged_player = merged_year_df[merged_year_df["pfr_id"] == player_id]
            pfr_player = pfr_year_df[pfr_year_df["pfr_player_id"] == player_id]

            for week in merged_player["week"].unique():
                merged_row = merged_player[merged_player["week"] == week]
                pfr_row = pfr_player[pfr_player["week"] == week]

                if merged_row.empty or pfr_row.empty:
                    continue  # Skip if there's no data for this player/week

                # Compare PFR Stats columns
                for col in [
                    "receiving_broken_tackles", "receiving_drop", "receiving_drop_pct", "receiving_rat"
                ]:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    pfr_val = pfr_row[col].values[0] if col in pfr_row else None

                    if merged_val is not None and pfr_val is not None:
                        merged_val = float(merged_val)  # Ensure numeric conversion
                        pfr_val = float(pfr_val)  # Ensure numeric conversion

                        if abs(merged_val - pfr_val) > tolerance:
                            mismatches.append((year, player_id, week, col, merged_val, pfr_val))

        print(f"Tested {len(sampled_players)} players in Season {year}.")

    print(f"\nTotal mismatches found in PFR Stats: {len(mismatches)}")

    if mismatches:
        print("\nMismatch Details (Season, Player ID, Week, Column, Merged Value, PFR Value):")
        for m in mismatches[:10]:  # Print only the first 10 mismatches
            print(m)
    else:
        print("✅ Pro-Football Reference (PFR) Integrity Check Passed!")

# Run the test
test_wr_pfr_stats()


🔹 Running Pro-Football Reference (PFR) Integrity Check...
✅ Using in-memory variable: wr_ids_ngs_pfr_stats_sorted
✅ Using in-memory variable: pfr_rec_df

Testing Season: 2017
Tested 25 players in Season 2017.

Testing Season: 2018
Tested 25 players in Season 2018.

Testing Season: 2019
Tested 25 players in Season 2019.

Testing Season: 2020
Tested 25 players in Season 2020.

Testing Season: 2021
Tested 25 players in Season 2021.

Testing Season: 2022
Tested 25 players in Season 2022.

Testing Season: 2023
Tested 25 players in Season 2023.

Total mismatches found in PFR Stats: 0
✅ Pro-Football Reference (PFR) Integrity Check Passed!


In [26]:
### End: Python NFL Library Data Integrity Checks ###

In [27]:
### Begin:fantasypros webscraping ###

In [28]:
# Generates a list of (year, week) combinations for web scraping.
# - 2017-2020: Weeks 1-17
# - 2021 and beyond: Weeks 1-18
def generate_year_week_combinations(start_year, end_year):
    year_week_combinations = []
    for year in range(start_year, end_year + 1):
        max_week = 17 if year <= 2020 else 18
        year_week_combinations.extend([(year, week) for week in range(1, max_week + 1)])
    return year_week_combinations

In [29]:
# output: a dataframe of weekly WR fantasypros advanced stats week-by-week for defined years
# Define the function to scrape weekly WR advanced stats from Fantasy Pros

def scrape_fantasypros_all_weeks_years(start_year, end_year):
    """
    Scrapes FantasyPros WR advanced stats for all weeks and years with correct ID extraction.
    """
    year_week_combinations = generate_year_week_combinations(start_year, end_year)
    all_data = []

    for year, week in year_week_combinations:
        try:
            # Construct the URL
            url = f"https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={year}&week={week}&range=week&view=pergame"
            response = requests.get(url)
            response.raise_for_status()

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract table headers
            table_headers = [header.text.strip() for header in soup.find('thead').find_all('th')]

            # Find table rows
            table_rows = soup.find('tbody').find_all('tr')

            for row in table_rows:
                # Extract FantasyPros Player ID correctly
                fantasypros_id = "Unknown"
                player_name = "Unknown"
                
                player_link = row.find("a", class_="fp-player-link")
                if player_link:
                    class_list = player_link.get("class", [])
                    for class_name in class_list:
                        if class_name.startswith("fp-id-"):
                            fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                            break
                    player_name = player_link.text.strip()  # Extract player name explicitly from <a> tag

                # Extract the rest of the row data
                row_data = [cell.text.strip() for cell in row.find_all('td')]

                # Ensure data matches headers before adding
                if len(row_data) == len(table_headers):  
                    all_data.append([year, week, fantasypros_id, player_name] + row_data)

            # Random delay to avoid server overload
            time.sleep(uniform(0.3, 0.9))

        except Exception as e:
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # Convert data to DataFrame
    column_names = ['Year', 'Week', 'FantasyPros_ID', 'Player'] + table_headers  # No duplicate FantasyPros_ID
    wr_fp_advanced_stats_df = pd.DataFrame(all_data, columns=column_names)

    # Drop the redundant 'Rank' column if it exists
    wr_fp_advanced_stats_df.drop(columns=['Rank'], inplace=True, errors='ignore')

    return wr_fp_advanced_stats_df

# ✅ Run the full scrape
wr_fp_advanced_stats_df = scrape_fantasypros_all_weeks_years(2017, 2024)

# ✅ Display dataset shape for verification
print(f"\n📊 **Shape of FantasyPros advanced DataFrame:** {wr_fp_advanced_stats_df.shape}")

# ✅ Display first few rows to confirm FantasyPros IDs are correctly extracted
display(wr_fp_advanced_stats_df.head(10))

# ✅ Save to CSV for manual inspection
# wr_fp_advanced_stats_df.to_csv("fantasypros_wr_advanced_stats.csv", index=False)



📊 **Shape of FantasyPros advanced DataFrame:** (26865, 22)


Unnamed: 0,Year,Week,FantasyPros_ID,Player,Player.1,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS
0,2017,1,13981,Stefon Diggs,Stefon Diggs (HOU),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0
1,2017,1,15802,Tyreek Hill,Tyreek Hill (MIA),1,7,133,78,0,55,1,0,8,7,0,0,4,1,1,1,1
2,2017,1,16488,Kenny Golladay,Kenny Golladay (FA),1,4,69,64,0,5,0,0,7,5,1,1,2,1,1,1,0
3,2017,1,9808,Antonio Brown,Antonio Brown (FA),1,11,182,90,0,92,50,0,11,11,0,0,7,2,2,1,1
4,2017,1,13429,Adam Thielen,Adam Thielen (CAR),1,9,157,92,0,65,17,0,10,10,0,0,4,4,2,1,0
5,2017,1,13969,Nelson Agholor,Nelson Agholor (BAL),1,6,86,51,0,35,18,0,8,6,0,1,3,1,1,1,1
6,2017,1,13081,Bennie Fowler III,Bennie Fowler III (FA),1,3,21,21,0,0,0,0,4,3,0,2,1,0,0,0,0
7,2017,1,9320,Jordy Nelson,Jordy Nelson (FA),1,7,79,73,0,6,1,0,8,7,0,0,3,1,1,0,0
8,2017,1,16433,Cooper Kupp,Cooper Kupp (LAR),1,4,76,63,0,13,7,0,6,5,1,1,3,2,0,0,0
9,2017,1,13894,Amari Cooper,Amari Cooper (BUF),1,5,62,20,0,42,27,1,13,8,3,2,3,1,0,0,0


In [30]:
# output: a dataframe of WR fantasy points and % rostered stats week-by-week for defined years

# Scrape the weekly WR fantasy points column and the % rostered from the FantasyPros website
def wr_scrape_fantasypros_fpts_rost(start_year, end_year):
    """
    Scrapes FantasyPros WR FPTS and % Rostered data for all weeks and years, including FantasyPros IDs.
    """
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        try:
            # Build the URL
            url = f"https://www.fantasypros.com/nfl/stats/wr.php?year={year}&week={week}&range=week"
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Locate the table
            table = soup.find("table", {"id": "data"})
            if table:
                table_str = str(table)

                # Read the table while skipping multi-level headers
                df = pd.read_html(io.StringIO(table_str), header=1)[0]

                # Extract FantasyPros Player IDs correctly
                table_rows = table.find("tbody").find_all("tr")

                fantasypros_ids = []
                player_names = []

                for row in table_rows:
                    # Extract FantasyPros Player ID
                    fantasypros_id = "Unknown"
                    player_name = "Unknown"
                    
                    player_link = row.find("a", class_="fp-player-link")
                    if player_link:
                        class_list = player_link.get("class", [])
                        for class_name in class_list:
                            if class_name.startswith("fp-id-"):
                                fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                                break
                        player_name = player_link.text.strip()  # Extract player name explicitly from <a> tag

                    fantasypros_ids.append(fantasypros_id)
                    player_names.append(player_name)

                # Add extracted FantasyPros IDs and correct player names
                df.insert(1, "FantasyPros_ID", fantasypros_ids)
                df["Player"] = player_names  # Replace with properly scraped names

                # Add Year and Week columns
                df["Year"] = year
                df["Week"] = week

                # Append dataframe to list
                all_data.append(df)

            # Delay to avoid bombarding the server
            time.sleep(uniform(0.3, 0.9))  # Randomized delay

        except Exception as e:
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # Combine all data into a single dataframe
    wr_fpts_perct_rost_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = [
        "Rank", "REC", "TGT", "YDS", "Y/R", "LG", "20+", "TD", 
        "ATT", "YDS.1", "TD.1", "FL", "G", "FPTS/G"
    ]
    wr_fpts_perct_rost_df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

    # Save to CSV
    # wr_fpts_perct_rost_df.to_csv("fantasypros_wr_fpts_perct_rost.csv", index=False)

    # Display shape of the dataframe
    print(f"\n📊 **Shape of WR FPTS and % Rostered dataframe after column removal:** {wr_fpts_perct_rost_df.shape}")

    # Display first few rows for verification
    display(wr_fpts_perct_rost_df.head(10))

    return wr_fpts_perct_rost_df

# ✅ Run the scraping function for all years
wr_fpts_perct_rost_df = wr_scrape_fantasypros_fpts_rost(2017, 2024)



📊 **Shape of WR FPTS and % Rostered dataframe after column removal:** (24127, 6)


Unnamed: 0,FantasyPros_ID,Player,FPTS,ROST,Year,Week
0,13981,Stefon Diggs,20.7,44.6%,2017,1
1,15802,Tyreek Hill,19.8,88.7%,2017,1
2,16488,Kenny Golladay,18.9,4.0%,2017,1
3,9808,Antonio Brown,18.2,1.0%,2017,1
4,13429,Adam Thielen,15.7,59.8%,2017,1
5,13969,Nelson Agholor,14.6,1.5%,2017,1
6,13081,Bennie Fowler III,14.1,0.0%,2017,1
7,9320,Jordy Nelson,13.9,0.1%,2017,1
8,16433,Cooper Kupp,13.6,89.8%,2017,1
9,13894,Amari Cooper,12.2,73.8%,2017,1


In [31]:
# output: a dataframe of weekly WR redzone stats week-by-week for defined years
# scrape the weekly WR redzone stats from the FantasyPros
def wr_scrape_fantasypros_redzone_stats(start_year, end_year):
    """
    Scrapes FantasyPros WR Red Zone Stats for all weeks and years, including FantasyPros IDs.
    """
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        try:
            # Build the URL
            url = f"https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={year}&range=week&week={week}"
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Locate the table
            table = soup.find("table", {"id": "data"})
            if table:
                table_str = str(table)

                # Read the table while skipping multi-level headers
                df = pd.read_html(io.StringIO(table_str))[0]

                # Flatten the multi-level column headers
                df.columns = df.columns.droplevel(0)

                # Drop the extra header rows (if any)
                df = df[df['Player'] != 'Player']  # Filter out duplicate header rows

                # Extract FantasyPros Player IDs correctly
                table_rows = table.find("tbody").find_all("tr")

                fantasypros_ids = []
                player_names = []

                for row in table_rows:
                    # Extract FantasyPros Player ID
                    fantasypros_id = "Unknown"
                    player_name = "Unknown"

                    player_link = row.find("a", class_="fp-player-link")
                    if player_link:
                        class_list = player_link.get("class", [])
                        for class_name in class_list:
                            if class_name.startswith("fp-id-"):
                                fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                                break
                        player_name = player_link.text.strip()  # Extract player name explicitly from <a> tag

                    fantasypros_ids.append(fantasypros_id)
                    player_names.append(player_name)

                # Add extracted FantasyPros IDs and correct player names
                df.insert(1, "FantasyPros_ID", fantasypros_ids)
                df["Player"] = player_names  # Replace with properly scraped names

                # Add Year and Week columns
                df["Year"] = year
                df["Week"] = week

                # Append dataframe to list
                all_data.append(df)

            # Delay to avoid bombarding the server
            time.sleep(uniform(0.3, 0.9))  # Randomized delay

        except Exception as e:
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # Combine all data into a single dataframe
    wr_redzone_stats_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = ['Rank', 'ATT', 'YDS', 'TD', 'PCT', 'FL', 'G', 'FPTS', 'FPTS/G', 'ROST %']
    wr_redzone_stats_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

    # Rename columns to add 'rz' prefix, except for 'Player', 'FantasyPros_ID', 'Year', 'Week'
    wr_redzone_stats_df.rename(
        columns={
            col: f"{col}_rz" for col in wr_redzone_stats_df.columns
            if col not in ['Player', 'FantasyPros_ID', 'Year', 'Week']
        },
        inplace=True
    )

    # Save to CSV
    # wr_redzone_stats_df.to_csv("fantasypros_wr_redzone_stats.csv", index=False)

    # Display shape of the dataframe
    print(f"\n📊 **Shape of WR Red Zone Stats dataframe after processing:** {wr_redzone_stats_df.shape}")

    # Display first few rows for verification
    display(wr_redzone_stats_df.head(10))

    return wr_redzone_stats_df

# ✅ Run the scraping function for all years
wr_redzone_stats_df = wr_scrape_fantasypros_redzone_stats(2017, 2024)



📊 **Shape of WR Red Zone Stats dataframe after processing:** (7159, 9)


Unnamed: 0,FantasyPros_ID,Player,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz,Year,Week
0,13981,Stefon Diggs,3,3,100.0%,7.3,60.0%,2017,1
1,13081,Bennie Fowler III,2,2,100.0%,5.5,66.7%,2017,1
2,13840,Seth Roberts,1,1,100.0%,19.0,20.0%,2017,1
3,16433,Cooper Kupp,1,1,100.0%,18.0,100.0%,2017,1
4,11606,DeAndre Hopkins,2,3,66.7%,5.5,75.0%,2017,1
5,16488,Kenny Golladay,1,1,100.0%,10.0,33.3%,2017,1
6,13894,Amari Cooper,1,4,25.0%,8.0,80.0%,2017,1
7,11215,Marvin Jones Jr.,1,1,100.0%,6.0,33.3%,2017,1
8,11616,Keenan Allen,1,1,100.0%,5.0,100.0%,2017,1
9,15506,Corey Coleman,1,1,100.0%,3.0,50.0%,2017,1


In [32]:
# output: a finalized merged fantasypros dataframe 

# ✅ Display the shape of each dataframe before merging
print(f"\n📊 **Shape of WR Advanced Stats DataFrame:** {wr_fp_advanced_stats_df.shape}")
print(f"📊 **Shape of WR FPTS and % Rostered DataFrame:** {wr_fpts_perct_rost_df.shape}")
print(f"📊 **Shape of WR Red Zone Stats DataFrame:** {wr_redzone_stats_df.shape}")

# ✅ Drop 'Player' from fpts and redzone dataframes before merge
wr_fpts_perct_rost_df = wr_fpts_perct_rost_df.drop(columns=["Player"], errors="ignore")
wr_redzone_stats_df = wr_redzone_stats_df.drop(columns=["Player"], errors="ignore")

# ✅ Merge WR Advanced Stats with Fantasy Points and % Rostered
wr_adv_fpts_rost_merged_df = pd.merge(
    wr_fp_advanced_stats_df,  # Baseline DF (keeps 'Player')
    wr_fpts_perct_rost_df,
    on=['FantasyPros_ID', 'Year', 'Week'],
    how='left'  # Retains all rows from the baseline DF
)

# ✅ Merge with Red Zone Stats
wr_adv_fpts_rost_rz_merged_df = pd.merge(
    wr_adv_fpts_rost_merged_df,
    wr_redzone_stats_df,
    on=['FantasyPros_ID', 'Year', 'Week'],
    how='left'
)

# ✅ Sort for easier verification
wr_adv_fpts_rost_rz_merged_df_sorted = wr_adv_fpts_rost_rz_merged_df.sort_values(
    by=['Year', 'Week', 'YDS'], 
    ascending=[True, True, False]
)

# ✅ Display the updated shape after dropping duplicate columns
print(f"\n📊 **fantasypros Shape After Merge:** {wr_adv_fpts_rost_rz_merged_df_sorted.shape}")

# ✅ Save to CSV
wr_adv_fpts_rost_rz_merged_df_sorted.to_csv('fantasypros_wr_adv_fpts_rost_rz_df.csv', index=False)
display(wr_adv_fpts_rost_merged_df)


📊 **Shape of WR Advanced Stats DataFrame:** (26865, 22)
📊 **Shape of WR FPTS and % Rostered DataFrame:** (24127, 6)
📊 **Shape of WR Red Zone Stats DataFrame:** (7159, 9)

📊 **fantasypros Shape After Merge:** (26865, 29)


Unnamed: 0,Year,Week,FantasyPros_ID,Player,Player.1,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,FPTS,ROST
0,2017,1,13981,Stefon Diggs,Stefon Diggs (HOU),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0,20.7,44.6%
1,2017,1,15802,Tyreek Hill,Tyreek Hill (MIA),1,7,133,78,0,55,1,0,8,7,0,0,4,1,1,1,1,19.8,88.7%
2,2017,1,16488,Kenny Golladay,Kenny Golladay (FA),1,4,69,64,0,5,0,0,7,5,1,1,2,1,1,1,0,18.9,4.0%
3,2017,1,9808,Antonio Brown,Antonio Brown (FA),1,11,182,90,0,92,50,0,11,11,0,0,7,2,2,1,1,18.2,1.0%
4,2017,1,13429,Adam Thielen,Adam Thielen (CAR),1,9,157,92,0,65,17,0,10,10,0,0,4,4,2,1,0,15.7,59.8%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26860,2024,18,18587,Mecole Hardman Jr.,Mecole Hardman Jr. (KC),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.2%
26861,2024,18,23905,Skyy Moore,Skyy Moore (KC),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.1%
26862,2024,18,26354,Bub Means,Bub Means (NO),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.1%
26863,2024,18,26205,Ja'Lynn Polk,Ja'Lynn Polk (NE),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,11.3%


In [33]:
### End:fantasypros webscraping ###

In [34]:
### Begin: FantasyPros Data Integrity Checks Section ###

In [35]:
## Data Integrity Checks ##
# Extract unique years and weeks from each dataframe
years_merged = sorted(wr_adv_fpts_rost_rz_merged_df_sorted['Year'].unique())
weeks_merged = sorted(wr_adv_fpts_rost_rz_merged_df_sorted['Week'].unique())

years_redzone = sorted(wr_redzone_stats_df['Year'].unique())
weeks_redzone = sorted(wr_redzone_stats_df['Week'].unique())

years_fpts = sorted(wr_fpts_perct_rost_df['Year'].unique())
weeks_fpts = sorted(wr_fpts_perct_rost_df['Week'].unique())

years_adv_stats = sorted(wr_fp_advanced_stats_df['Year'].unique())
weeks_adv_stats = sorted(wr_fp_advanced_stats_df['Week'].unique())

# Print the results for comparison
print(f"Years in Merged DF: {years_merged}")
print(f"Years in RedZone DF: {years_redzone}")
print(f"Years in FPTS DF: {years_fpts}")
print(f"Years in Advanced Stats DF: {years_adv_stats}\n")

print(f"Weeks in Merged DF: {weeks_merged}")
print(f"Weeks in RedZone DF: {weeks_redzone}")
print(f"Weeks in FPTS DF: {weeks_fpts}")
print(f"Weeks in Advanced Stats DF: {weeks_adv_stats}")

Years in Merged DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in RedZone DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in FPTS DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Years in Advanced Stats DF: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

Weeks in Merged DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in RedZone DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in FPTS DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Weeks in Advanced Stats DF: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


In [36]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df of advanced stats
# ✅ Integrity Check for WR Advanced Stats (using FantasyPros_ID)
def test_wr_advanced_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1, random_seed=42):
    """
    Integrity check for WR advanced stats.

    - Uses Stratified Random Sampling by year.
    - Tests key stats (REC, YDS, YBC, AIR, YAC, TGT, CATCHABLE).
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    seed(random_seed)  # Set seed for reproducibility
    mismatches = []

    print("🔹 Running Advanced Stats Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']\n")

    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fpts_rost_rz_merged_df_sorted", "fantasypros_wr_adv_fpts_rost_rz_df.csv")
    adv_stats_df = load_dataframe("wr_fp_advanced_stats_df", "fantasypros_wr_advanced_stats.csv")

    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(adv_stats_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")

        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        adv_year_df = adv_stats_df[adv_stats_df['Year'] == year]

        # Skip if no data
        if merged_year_df.empty or adv_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue

        # Select random IDs instead of player names
        sampled_ids = sample(list(merged_year_df['FantasyPros_ID'].dropna()), min(num_samples, len(merged_year_df)))

        total_comparisons = 0
        match_count = 0

        for fp_id in sampled_ids:
            for week in merged_year_df[merged_year_df['FantasyPros_ID'] == fp_id]['Week'].unique():

                merged_row = merged_year_df[(merged_year_df['FantasyPros_ID'] == fp_id) & (merged_year_df['Week'] == week)]
                adv_row = adv_year_df[(adv_year_df['FantasyPros_ID'] == fp_id) & (adv_year_df['Week'] == week)]

                if merged_row.empty or adv_row.empty:
                    continue  # Skip if missing

                for col in ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    adv_val = adv_row[col].values[0] if col in adv_row else None

                    if merged_val is not None and adv_val is not None:
                        total_comparisons += 1
                        if merged_val == adv_val:
                            match_count += 1
                        else:
                            mismatches.append((year, fp_id, week, col, merged_val, adv_val))

        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_ids)} IDs in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")

    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")

    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# ✅ Run the updated test
test_wr_advanced_stats()


🔹 Running Advanced Stats Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 IDs in Year 2017 (2646 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 IDs in Year 2018 (2485 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 IDs in Year 2019 (2583 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 IDs in Year 2020 (2576 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 IDs in Year 2021 (2975 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 IDs in Year 2022 (2982 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 IDs in Year 2023 (2968 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 2968
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [37]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df of %rostered and fantasypoints stats
# ✅ Integrity Check for WR Red Zone Stats (using FantasyPros_ID)

# ✅ Integrity Check for WR Fantasy Points & % Rostered (using FantasyPros_ID)
def test_wr_fpts_rost(years=range(2017, 2024), num_samples=25, tolerance=0.1, random_seed=42):
    """
    Integrity check for WR Fantasy Points and % Rostered.

    - Uses Stratified Random Sampling by year.
    - Tests key stats (FPTS, ROST).
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    seed(random_seed)  # Set seed for reproducibility
    mismatches = []

    print("\n🔹 Running Fantasy Points & % Rostered Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['FPTS', 'ROST']\n")

    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fpts_rost_rz_merged_df_sorted", "fantasypros_wr_adv_fpts_rost_rz_df.csv")
    fpts_df = load_dataframe("wr_fpts_perct_rost_df", "fantasypros_wr_fpts_perct_rost.csv")

    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(fpts_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")

        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        fpts_year_df = fpts_df[fpts_df['Year'] == year]

        # Skip if no data
        if merged_year_df.empty or fpts_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue

        # Select random FantasyPros_IDs instead of player names
        sampled_ids = sample(list(merged_year_df['FantasyPros_ID'].dropna()), min(num_samples, len(merged_year_df)))

        total_comparisons = 0
        match_count = 0

        for fp_id in sampled_ids:
            for week in merged_year_df[merged_year_df['FantasyPros_ID'] == fp_id]['Week'].unique():

                merged_row = merged_year_df[(merged_year_df['FantasyPros_ID'] == fp_id) & (merged_year_df['Week'] == week)]
                fpts_row = fpts_year_df[(fpts_year_df['FantasyPros_ID'] == fp_id) & (fpts_year_df['Week'] == week)]

                if merged_row.empty or fpts_row.empty:
                    continue  # Skip if missing

                for col in ['FPTS', 'ROST']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    fpts_val = fpts_row[col].values[0] if col in fpts_row else None

                    if merged_val is not None and fpts_val is not None:
                        total_comparisons += 1
                        if merged_val == fpts_val:
                            match_count += 1
                        else:
                            mismatches.append((year, fp_id, week, col, merged_val, fpts_val))

        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_ids)} IDs in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")

    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")

    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# ✅ Run the updated test
test_wr_fpts_rost()



🔹 Running Fantasy Points & % Rostered Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['FPTS', 'ROST']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 IDs in Year 2017 (744 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 IDs in Year 2018 (652 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 IDs in Year 2019 (626 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 IDs in Year 2020 (610 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 IDs in Year 2021 (500 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 IDs in Year 2022 (558 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 IDs in Year 2023 (586 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 586
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [38]:
## Data Integrity Checks ##
# Output: A Stratified Random Sample of years and players from the scraped FantasyPros data
# test to ensure merged df values match the individual unmerged df of redzone stats
# ✅ Integrity Check for WR Red Zone Stats (using FantasyPros_ID)
def test_wr_redzone_stats(years=range(2017, 2024), num_samples=25, tolerance=0.1, random_seed=42):
    """
    Integrity check for WR Red Zone stats.

    - Uses Stratified Random Sampling by year.
    - Tests key stats (REC_rz, TGT_rz, REC PCT_rz, Y/R_rz, TGT PCT_rz).
    - Reports % of matches and mismatches found.
    - Logs whether data is loaded from variables or CSV files.
    """
    seed(random_seed)  # Set seed for reproducibility
    mismatches = []

    print("\n🔹 Running Red Zone Stats Integrity Check...")
    print(f"📌 Test Details: Stratified Random Sample by Year")
    print(f"📌 Columns Tested: ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']\n")

    # Load data from memory or CSV
    merged_df = load_dataframe("wr_adv_fpts_rost_rz_merged_df_sorted", "fantasypros_wr_adv_fpts_rost_rz_df.csv")
    redzone_df = load_dataframe("wr_redzone_stats_df", "fantasypros_wr_redzone_stats.csv")

    # Log the source of the data
    if isinstance(merged_df, pd.DataFrame) and isinstance(redzone_df, pd.DataFrame):
        print("📥 Data loaded from **variables** in memory.\n")
    else:
        print("📥 Data loaded from **CSV files**.\n")

    for year in years:
        print(f"\nTesting Year: {year}")

        # Filter data by year
        merged_year_df = merged_df[merged_df['Year'] == year]
        redzone_year_df = redzone_df[redzone_df['Year'] == year]

        # Skip if no data
        if merged_year_df.empty or redzone_year_df.empty:
            print(f"⚠️ Skipping Year {year} (No data available)")
            continue

        # Select random IDs instead of player names
        sampled_ids = sample(list(merged_year_df['FantasyPros_ID'].dropna()), min(num_samples, len(merged_year_df)))

        total_comparisons = 0
        match_count = 0

        for fp_id in sampled_ids:
            for week in merged_year_df[merged_year_df['FantasyPros_ID'] == fp_id]['Week'].unique():

                merged_row = merged_year_df[(merged_year_df['FantasyPros_ID'] == fp_id) & (merged_year_df['Week'] == week)]
                redzone_row = redzone_year_df[(redzone_year_df['FantasyPros_ID'] == fp_id) & (redzone_year_df['Week'] == week)]

                if merged_row.empty or redzone_row.empty:
                    continue  # Skip if missing

                for col in ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']:
                    merged_val = merged_row[col].values[0] if col in merged_row else None
                    redzone_val = redzone_row[col].values[0] if col in redzone_row else None

                    if merged_val is not None and redzone_val is not None:
                        total_comparisons += 1
                        if merged_val == redzone_val:
                            match_count += 1
                        else:
                            mismatches.append((year, fp_id, week, col, merged_val, redzone_val))

        # Yearly summary
        match_percentage = (match_count / total_comparisons * 100) if total_comparisons else 0
        print(f"✅ Tested {len(sampled_ids)} IDs in Year {year} ({total_comparisons} values compared, {match_percentage:.2f}% matched)")

    # Final summary
    print("\n🔍 Integrity Check Summary:")
    print(f"✔ Total Comparisons: {sum(len(m) for m in mismatches) + match_count}")
    print(f"✔ Overall Match Rate: {(match_count / (match_count + len(mismatches)) * 100) if (match_count + len(mismatches)) > 0 else 0:.2f}%")

    if mismatches:
        print("\n❌ Mismatch Details (Showing up to 10 cases):")
        for m in mismatches[:10]:
            print(m)
    else:
        print("✅ All values matched successfully!")

# ✅ Run the updated test
test_wr_redzone_stats()



🔹 Running Red Zone Stats Integrity Check...
📌 Test Details: Stratified Random Sample by Year
📌 Columns Tested: ['REC_rz', 'TGT_rz', 'REC PCT_rz', 'Y/R_rz', 'TGT PCT_rz']

📥 Data loaded from **variables** in memory.


Testing Year: 2017
✅ Tested 25 IDs in Year 2017 (450 values compared, 100.00% matched)

Testing Year: 2018
✅ Tested 25 IDs in Year 2018 (525 values compared, 100.00% matched)

Testing Year: 2019
✅ Tested 25 IDs in Year 2019 (660 values compared, 100.00% matched)

Testing Year: 2020
✅ Tested 25 IDs in Year 2020 (560 values compared, 100.00% matched)

Testing Year: 2021
✅ Tested 25 IDs in Year 2021 (255 values compared, 100.00% matched)

Testing Year: 2022
✅ Tested 25 IDs in Year 2022 (485 values compared, 100.00% matched)

Testing Year: 2023
✅ Tested 25 IDs in Year 2023 (485 values compared, 100.00% matched)

🔍 Integrity Check Summary:
✔ Total Comparisons: 485
✔ Overall Match Rate: 100.00%
✅ All values matched successfully!


In [None]:
### End: FantasyPros Data Integrity Checks Section ###

In [None]:
## Begin: data normalization for fantasypros dataframes ##

In [389]:
# # output: summary of data types in each dataframe
# # Pre-data normalization step to prepare for row hashing

# # List of dataframes to analyze
# dataframes = {
#     "wr_adv_fpts_rost_rz_merged_df_sorted": wr_adv_fpts_rost_rz_merged_df_sorted,
#     "wr_fp_advanced_stats_df": wr_fp_advanced_stats_df,
#     "wr_fpts_perct_rost_df": wr_fpts_perct_rost_df,
#     "wr_redzone_stats_df": wr_redzone_stats_df,
# }

# # Dictionary to store results
# dtype_summary = {}

# # Print data type summary for each dataframe
# for name, df in dataframes.items():
#     print(f"\n📊 **Data Types Summary for {name}**")
#     print(df.dtypes)

# # Convert dtype summary into a structured dataframe for better visualization
# dtype_summary_df = pd.DataFrame(dtype_summary)

# # Display the summary table
# dtype_summary_df

In [392]:
# output: columns of normalized datatypes
# data normalization step to prepare for row hashing

# # Rename dataframes
# wr_adv_fpts_rost_rz_merged_df_sorted_data_norm = wr_adv_fpts_rost_rz_merged_df_sorted.copy()
# wr_fp_advanced_stats_df_data_norm = wr_fp_advanced_stats_df.copy()
# wr_fpts_perct_rost_df_data_norm = wr_fpts_perct_rost_df.copy()
# wr_redzone_stats_df_data_norm = wr_redzone_stats_df.copy()

# # Store all dataframes in a dictionary for streamlined processing
# dataframes = {
#     "wr_adv_fpts_rost_rz_merged_df_sorted_data_norm": wr_adv_fpts_rost_rz_merged_df_sorted_data_norm,
#     "wr_fp_advanced_stats_df_data_norm": wr_fp_advanced_stats_df_data_norm,
#     "wr_fpts_perct_rost_df_data_norm": wr_fpts_perct_rost_df_data_norm,
#     "wr_redzone_stats_df_data_norm": wr_redzone_stats_df_data_norm
# }

# # Display confirmation
# print("✅ DataFrames successfully renamed and stored for processing.\n")

# # Rename 'Year' to 'Season' in all dataframes
# for name, df in dataframes.items():
#     if 'Year' in df.columns:
#         df.rename(columns={'Year': 'Season'}, inplace=True)

# # Display column updates
# for name, df in dataframes.items():
#     print(f"\n🔍 Updated Columns in {name}:")
#     print(df.dtypes)

# # Convert Numeric Object Columns to Proper Data Types
# numeric_columns_adv = [
#     'REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE', 'DROP', 'RZ TGT', 
#     '10+ YDS', '20+ YDS', '30+ YDS', '40+ YDS', '50+ YDS'
# ]

# for col in numeric_columns_adv:
#     for df_name, df in dataframes.items():
#         if col in df.columns:
#             df[col] = pd.to_numeric(df[col], errors='coerce')

# ### Drop Duplicate 'Player' Columns
# for df_name, df in dataframes.items():
#     duplicate_player_cols = [col for col in df.columns if 'Player' in col]
#     if len(duplicate_player_cols) > 1:
#         df.drop(columns=duplicate_player_cols[1:], inplace=True)

# # Fix Percentage Columns ('ROST', 'REC PCT_rz', 'TGT PCT_rz')
# # Convert 'ROST' column to decimal format
# if 'ROST' in wr_fpts_perct_rost_df_data_norm.columns:
#     wr_fpts_perct_rost_df_data_norm['ROST'] = (
#         wr_fpts_perct_rost_df_data_norm['ROST']
#         .astype(str)
#         .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
#         .round(3)
#     )

# # Convert 'ROST' column to decimal format in merged dataframe
# if 'ROST' in wr_adv_fpts_rost_rz_merged_df_sorted_data_norm.columns:
#     wr_adv_fpts_rost_rz_merged_df_sorted_data_norm['ROST'] = (
#         wr_adv_fpts_rost_rz_merged_df_sorted_data_norm['ROST']
#         .astype(str)
#         .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
#         .round(3)
#     )

# # Convert 'REC PCT_rz' and 'TGT PCT_rz' to decimal format
# for col in ['REC PCT_rz', 'TGT PCT_rz']:
#     if col in wr_redzone_stats_df_data_norm.columns:
#         wr_redzone_stats_df_data_norm[col] = (
#             wr_redzone_stats_df_data_norm[col]
#             .astype(str)
#             .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
#             .round(3)
#         )

# # Convert 'REC PCT_rz' and 'TGT PCT_rz' to decimal format in merged dataframe
# for col in ['REC PCT_rz', 'TGT PCT_rz']:
#     if col in wr_adv_fpts_rost_rz_merged_df_sorted_data_norm.columns:
#         wr_adv_fpts_rost_rz_merged_df_sorted_data_norm[col] = (
#             wr_adv_fpts_rost_rz_merged_df_sorted_data_norm[col]
#             .astype(str)
#             .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
#             .round(3)
#         )

# # Re-Check Data Types
# data_types_summary = {
#     "wr_adv_fpts_rost_rz_merged_df_sorted_data_norm": wr_adv_fpts_rost_rz_merged_df_sorted_data_norm.dtypes,
#     "wr_fp_advanced_stats_df_data_norm": wr_fp_advanced_stats_df_data_norm.dtypes,
#     "wr_fpts_perct_rost_df_data_norm": wr_fpts_perct_rost_df_data_norm.dtypes,
#     "wr_redzone_stats_df_data_norm": wr_redzone_stats_df_data_norm.dtypes
# }

# # Display final data types summary
# for name, dtype_summary in data_types_summary.items():
#     print(f"\n🔍 Final Data Types in {name}:")
#     print(dtype_summary)

In [395]:
# visually inspect data normalization consistency
# data normalization step to prepare for row hashing
# Display the first two rows of each dataframe
# for name, df in dataframes.items():
#     print(f"\n🔍 **First Two Rows of {name}:**")
#     display(df.head(2))  # Display first two rows

In [398]:
# output: reordered columns in the dataframes
# data normalization step to prepare for row hashing

# Define the correct column order
# def reorder_columns(df, first_three=["Season", "Week", "FantasyPros_ID"]):
#     """Reorders the dataframe to ensure the first three columns are consistent."""
#     cols = list(df.columns)
#     remaining_cols = [col for col in cols if col not in first_three]
#     return df[first_three + remaining_cols]

# # Apply column reordering to the unmerged dataframes
# wr_fp_advanced_stats_df_data_norm_hash = reorder_columns(wr_fp_advanced_stats_df_data_norm_hash)
# wr_fpts_perct_rost_df_data_norm_hash = reorder_columns(wr_fpts_perct_rost_df_data_norm_hash)
# wr_redzone_stats_df_data_norm_hash = reorder_columns(wr_redzone_stats_df_data_norm_hash)

# # Verify the new order of columns
# print("✅ Column Order Adjusted")
# print("\n🔍 First three columns in each dataframe:")
# print(f"Advanced Stats: {wr_fp_advanced_stats_df_data_norm_hash.columns[:3].tolist()}")
# print(f"FPTS & ROST: {wr_fpts_perct_rost_df_data_norm_hash.columns[:3].tolist()}")
# print(f"Redzone Stats: {wr_redzone_stats_df_data_norm_hash.columns[:3].tolist()}")


In [401]:
# visually inspect data normalization consistency
# data normalization step to prepare for row hashing
# Display the first two rows of each dataframe
# Display the first two rows of each dataframe for visual inspection
# dataframes_to_display = {
#     "Merged DataFrame": wr_adv_fpts_rost_rz_merged_df_sorted_data_norm_hash,
#     "Advanced Stats DataFrame": wr_fp_advanced_stats_df_data_norm_hash,
#     "FPTS & ROST DataFrame": wr_fpts_perct_rost_df_data_norm_hash,
#     "Redzone Stats DataFrame": wr_redzone_stats_df_data_norm_hash
# }

# for name, df in dataframes_to_display.items():
#     print(f"\n🔍 First two rows of {name}:")
#     display(df.head(2))


In [403]:
# ✅ Step 1: Data Type Inspection (Pre-Normalization)
# Output: summary of data types in each dataframe

dataframes = {
    "wr_adv_fpts_rost_rz_merged_df_sorted": wr_adv_fpts_rost_rz_merged_df_sorted,
    "wr_fp_advanced_stats_df": wr_fp_advanced_stats_df,
    "wr_fpts_perct_rost_df": wr_fpts_perct_rost_df,
    "wr_redzone_stats_df": wr_redzone_stats_df,
}

for name, df in dataframes.items():
    print(f"\n📊 **Data Types Summary for {name}**")
    print(df.dtypes)

# ✅ Step 2: Copy DataFrames for Normalization
wr_adv_fpts_rost_rz_merged_df_sorted_data_norm = wr_adv_fpts_rost_rz_merged_df_sorted.copy()
wr_fp_advanced_stats_df_data_norm = wr_fp_advanced_stats_df.copy()
wr_fpts_perct_rost_df_data_norm = wr_fpts_perct_rost_df.copy()
wr_redzone_stats_df_data_norm = wr_redzone_stats_df.copy()

dataframes_norm = {
    "wr_adv_fpts_rost_rz_merged_df_sorted_data_norm": wr_adv_fpts_rost_rz_merged_df_sorted_data_norm,
    "wr_fp_advanced_stats_df_data_norm": wr_fp_advanced_stats_df_data_norm,
    "wr_fpts_perct_rost_df_data_norm": wr_fpts_perct_rost_df_data_norm,
    "wr_redzone_stats_df_data_norm": wr_redzone_stats_df_data_norm
}

print("✅ DataFrames successfully renamed and stored for processing.\n")

# ✅ Step 3: Rename 'Year' to 'Season' in All DataFrames
for name, df in dataframes_norm.items():
    if 'Year' in df.columns:
        df.rename(columns={'Year': 'Season'}, inplace=True)

# ✅ Step 4: Convert Object Columns to Numeric Types
numeric_columns_adv = [
    'REC', 'YDS', 'YBC', 'AIR', 'YAC', 'TGT', 'CATCHABLE', 'DROP', 'RZ TGT', 
    '10+ YDS', '20+ YDS', '30+ YDS', '40+ YDS', '50+ YDS'
]

for col in numeric_columns_adv:
    for df_name, df in dataframes_norm.items():
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

# ✅ Step 5: Fix Percentage Columns ('ROST', 'REC PCT_rz', 'TGT PCT_rz')
# Convert 'ROST' column to decimal format
if 'ROST' in wr_fpts_perct_rost_df_data_norm.columns:
    wr_fpts_perct_rost_df_data_norm['ROST'] = (
        wr_fpts_perct_rost_df_data_norm['ROST']
        .astype(str)
        .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
        .round(3)
    )

if 'ROST' in wr_adv_fpts_rost_rz_merged_df_sorted_data_norm.columns:
    wr_adv_fpts_rost_rz_merged_df_sorted_data_norm['ROST'] = (
        wr_adv_fpts_rost_rz_merged_df_sorted_data_norm['ROST']
        .astype(str)
        .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
        .round(3)
    )

# Convert 'REC PCT_rz' and 'TGT PCT_rz' to decimal format
for col in ['REC PCT_rz', 'TGT PCT_rz']:
    if col in wr_redzone_stats_df_data_norm.columns:
        wr_redzone_stats_df_data_norm[col] = (
            wr_redzone_stats_df_data_norm[col]
            .astype(str)
            .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
            .round(3)
        )

for col in ['REC PCT_rz', 'TGT PCT_rz']:
    if col in wr_adv_fpts_rost_rz_merged_df_sorted_data_norm.columns:
        wr_adv_fpts_rost_rz_merged_df_sorted_data_norm[col] = (
            wr_adv_fpts_rost_rz_merged_df_sorted_data_norm[col]
            .astype(str)
            .apply(lambda x: float(x.replace('%', '')) / 100 if '%' in x else float(x))
            .round(3)
        )

# ✅ Step 6: Drop Duplicate 'Player' Columns
for df_name, df in dataframes_norm.items():
    duplicate_player_cols = [col for col in df.columns if 'Player' in col]
    if len(duplicate_player_cols) > 1:
        df.drop(columns=duplicate_player_cols[1:], inplace=True)

# ✅ Step 7: Reorder Columns (Season, Week, FantasyPros_ID First)
def reorder_columns(df, first_three=["Season", "Week", "FantasyPros_ID"]):
    """Reorders the dataframe to ensure the first three columns are consistent."""
    cols = list(df.columns)
    remaining_cols = [col for col in cols if col not in first_three]
    return df[first_three + remaining_cols]

wr_fp_advanced_stats_df_data_norm = reorder_columns(wr_fp_advanced_stats_df_data_norm)
wr_fpts_perct_rost_df_data_norm = reorder_columns(wr_fpts_perct_rost_df_data_norm)
wr_redzone_stats_df_data_norm = reorder_columns(wr_redzone_stats_df_data_norm)

print("✅ Column Order Adjusted")
print("\n🔍 First three columns in each dataframe:")
print(f"Advanced Stats: {wr_fp_advanced_stats_df_data_norm.columns[:3].tolist()}")
print(f"FPTS & ROST: {wr_fpts_perct_rost_df_data_norm.columns[:3].tolist()}")
print(f"Redzone Stats: {wr_redzone_stats_df_data_norm.columns[:3].tolist()}")

# ✅ Step 8: Display First Two Rows of Each DataFrame for Visual Inspection
dataframes_to_display = {
    "Merged DataFrame": wr_adv_fpts_rost_rz_merged_df_sorted_data_norm,
    "Advanced Stats DataFrame": wr_fp_advanced_stats_df_data_norm,
    "FPTS & ROST DataFrame": wr_fpts_perct_rost_df_data_norm,
    "Redzone Stats DataFrame": wr_redzone_stats_df_data_norm
}

for name, df in dataframes_to_display.items():
    print(f"\n🔍 First two rows of {name}:")
    display(df.head(2))

# ✅ Step 9: Final Data Type Verification
for name, df in dataframes_norm.items():
    print(f"\n🔍 Final Data Types in {name}:")
    print(df.dtypes)



📊 **Data Types Summary for wr_adv_fpts_rost_rz_merged_df_sorted**
Season              int64
Week                int64
FantasyPros_ID     object
Player             object
Player             object
G                  object
REC                object
YDS                object
YBC                object
AIR                object
YAC                object
YACON              object
BRKTKL             object
TGT                object
CATCHABLE          object
DROP               object
RZ TGT             object
10+ YDS            object
20+ YDS            object
30+ YDS            object
40+ YDS            object
50+ YDS            object
FPTS              float64
ROST               object
REC_rz            float64
TGT_rz            float64
REC PCT_rz         object
Y/R_rz            float64
TGT PCT_rz         object
dtype: object

📊 **Data Types Summary for wr_fp_advanced_stats_df**
Year               int64
Week               int64
FantasyPros_ID    object
Player            object
Player     

Unnamed: 0,Season,Week,FantasyPros_ID,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,FPTS,ROST,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz
0,2017,1,13981,1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0,20.7,0.446,3.0,3.0,1.0,7.3,0.6
20,2017,1,12122,1,3,88,71,0,17,4,0,7,4,1,0,3,1,1,1,1,8.8,0.146,,,,,



🔍 First two rows of Advanced Stats DataFrame:


Unnamed: 0,Season,Week,FantasyPros_ID,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS
0,2017,1,13981,1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0
1,2017,1,15802,1,7,133,78,0,55,1,0,8,7,0,0,4,1,1,1,1



🔍 First two rows of FPTS & ROST DataFrame:


Unnamed: 0,Season,Week,FantasyPros_ID,FPTS,ROST
0,2017,1,13981,20.7,0.4
1,2017,1,15802,19.8,0.9



🔍 First two rows of Redzone Stats DataFrame:


Unnamed: 0,Season,Week,FantasyPros_ID,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz
0,2017,1,13981,3,3,1.0,7.3,0.6
1,2017,1,13081,2,2,1.0,5.5,0.7



🔍 Final Data Types in wr_adv_fpts_rost_rz_merged_df_sorted_data_norm:
Season              int64
Week                int64
FantasyPros_ID     object
G                  object
REC                 int64
YDS                 int64
YBC                 int64
AIR                 int64
YAC                 int64
YACON              object
BRKTKL             object
TGT                 int64
CATCHABLE           int64
DROP                int64
RZ TGT              int64
10+ YDS             int64
20+ YDS             int64
30+ YDS             int64
40+ YDS             int64
50+ YDS             int64
FPTS              float64
ROST              float64
REC_rz            float64
TGT_rz            float64
REC PCT_rz        float64
Y/R_rz            float64
TGT PCT_rz        float64
dtype: object

🔍 Final Data Types in wr_fp_advanced_stats_df_data_norm:
Season             int64
Week               int64
FantasyPros_ID    object
G                 object
REC                int64
YDS                int64
YBC  

In [None]:
## End: data normalization for fantasypros dataframes ##

In [406]:
## Begin: Enhanced data integrity checks for the fantasypros dataframes ##

In [408]:
## next tasks
# implement enhanced data integrity check via checksum hashes of rows for both python and fantasypros data

In [None]:
# output: 
# enhanced data integrity check for the fantasypros dataframe

In [None]:
### End: Enhanced data integrity checks for the fantasypros dataframes ##