In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# The NFL Python library (nfl_data_py) seems not to work on Tuesdays, probably due to updates (not confirmed)

In [3]:
## Required installations
!pip install nfl_data_py
# Ensure all required packages are installed within the notebook
# !pip install --quiet nfl_data_py
!pip install --quiet rapidfuzz




In [4]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re
import time
from random import sample, uniform, seed
import io
from rapidfuzz import fuzz, process
import numpy as np
import hashlib
import shutil

In [5]:
## REQUIRED ACTIONS - Include in a README doc ##
# 1. Update `season_start_date` below at the start of every season; `get_current_week`
#    counts weeks from this date.
# 2. If the NFL adds regular-season games, update the week limits used in this notebook
#    (the `18` thresholds and the 17/18 split in `generate_year_week_combinations`).
season_start_date = datetime(2024, 9, 4)  # Update for the season start (Yr, month, day)

In [6]:
# Pandas display options: show every column (no column truncation) and allow
# up to 1000 characters of display width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [7]:
# Function to calculate the current week of the NFL season
def get_current_week(current_date=None, start_date=None):
    """
    Return the 1-based week number of the NFL season for a given date.

    Week 1 covers the first seven days starting at ``start_date``; each further
    block of seven whole days adds one week.

    Parameters:
        current_date (datetime, optional): Moment to evaluate. Defaults to
            ``datetime.now()``.
        start_date (datetime, optional): First day of week 1. Defaults to the
            notebook-level ``season_start_date``.

    Returns:
        int: Week number. Values of 0 or below are returned unchanged when
        ``current_date`` is earlier than ``start_date`` (floor division).
    """
    if current_date is None:
        current_date = datetime.now()
    if start_date is None:
        # Resolved at call time so edits to the config cell are picked up
        start_date = season_start_date
    elapsed_days = (current_date - start_date).days
    return (elapsed_days // 7) + 1

# Snapshot of "now": calendar year, NFL week, and the season-type code
current_year = datetime.now().year
current_week = get_current_week()
if current_week > 18:
    seasontype = 3  # playoffs
else:
    seasontype = 2  # regular season

In [8]:
# define the years to pull
# nfl.import_weekly_data(years, columns, downcast)
def get_year_range(current_year, current_week, start_year=2017):
    """
    Build the list of seasons to pull.

    While ``current_week`` is 18 or lower the list runs from ``start_year``
    through ``current_year``; once past week 18 it stops at ``current_year - 1``.
    """
    in_regular_season = current_week <= 18
    last_season = current_year if in_regular_season else current_year - 1
    return list(range(start_year, last_season + 1))

# Seasons to request from nfl_data_py, derived from the current year/week computed above
years = get_year_range(current_year, current_week)

In [9]:
# Generates a list of (year, week) combinations for web scraping.
# - 2017-2020: Weeks 1-17
# - 2021 and beyond: Weeks 1-18
def generate_year_week_combinations(start_year, end_year):
    """
    List every (year, week) pair from ``start_year`` through ``end_year`` inclusive.

    Seasons up to and including 2020 contribute weeks 1-17; later seasons
    contribute weeks 1-18. Pairs are ordered by year, then week.
    """
    return [
        (season, week_number)
        for season in range(start_year, end_year + 1)
        for week_number in range(1, (17 if season <= 2020 else 18) + 1)
    ]

In [10]:
def check_nulls(df, name=None):
    """
    Summarize the missing values of a DataFrame, one row per affected column.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        name (str): Optional label; when given, a header line is printed.

    Returns:
        pd.DataFrame: Columns 'Missing Count' and 'Missing %' (the missing
        fraction rounded to 4 decimals), limited to columns with at least one
        missing value and sorted by 'Missing %' descending.
    """
    missing_per_column = df.isnull().sum()
    missing_fraction = (missing_per_column / len(df)).round(4)

    report = pd.DataFrame({
        'Missing Count': missing_per_column,
        'Missing %': missing_fraction
    })
    has_missing = report['Missing Count'] > 0
    report = report.loc[has_missing].sort_values(by='Missing %', ascending=False)

    if name:
        print(f"\n📊 Missing Value Summary for: {name}")
    return report


In [11]:
### Begin: Python NFL Library Dataframe ###

In [12]:
# List the columns nfl_data_py exposes for weekly stats (reference for the column lists used below)
nfl.see_weekly_cols()

Index(['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr'], dtype='object')

In [13]:
# Base columns requested from the weekly data: season/week keys, player identity,
# position, team, and the two fantasy-point totals.
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team',
    'fantasy_points', 'fantasy_points_ppr'
]

In [14]:
# Columns to remove from the player-ID table
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year',
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username',
    'height', 'weight', 'college', 'db_season'
]

# Pull the ID table (no parameters) and discard those columns in one step;
# errors='ignore' skips any listed column that is absent.
ids_data = nfl.import_ids().drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [15]:
# Pull weekly player rows for every season in `years`, restricted to `base_columns`
# (signature: nfl.import_weekly_data(years, columns, downcast); downcast is left at its default)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [16]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# The ID table keys players by 'gsis_id'; rename it so both frames share 'player_id'
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})

# Inner join: only weekly rows with a matching ID record are kept
id_dataframe = weekly_data.merge(ids_data, how='inner', on='player_id')

# Second, more descriptive name for the same object (not a copy)
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [17]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
is_wide_receiver = all_players_id_data['position'].eq('WR')
wide_receiver_ids = all_players_id_data.loc[is_wide_receiver]

# Report the size of the WR-only frame
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17384, 31)


In [18]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# Receiving-related columns to attach to each WR row
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Second weekly pull: the merge keys plus the receiving columns
wr_merge_keys = ['player_id', 'season', 'week']
wr_stats = nfl.import_weekly_data(years=years, columns=wr_merge_keys + wr_columns)

# Attach the receiving stats to the WR rows (inner join on player/season/week)
wr_ids_weekly_stats_df = wide_receiver_ids.merge(wr_stats, how='inner', on=wr_merge_keys)

# Size of the merged frame
print(f"Shape of merged dataframe: {wr_ids_weekly_stats_df.shape}")

# Row integrity check: the merge should neither drop nor duplicate WR rows
rows_preserved = wr_ids_weekly_stats_df.shape[0] == wide_receiver_ids.shape[0]
print(f"Row count matches: {rows_preserved}")

# Show the frame and persist it as CSV
display(wr_ids_weekly_stats_df)
wr_ids_weekly_stats_df.to_csv('wr_ids_weekly_stats_df.csv', index=False)

Downcasting floats.
Shape of merged dataframe: (17384, 46)
Row count matches: True


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,fantasy_points,fantasy_points_ppr,stats_id,ktc_id,yahoo_id,fleaflicker_id,espn_id,fantasy_data_id,cfbref_id,sportradar_id,cbs_id,rotoworld_id,nfl_id,pfr_id,pff_id,fantasypros_id,stats_global_id,merge_name,rotowire_id,mfl_id,sleeper_id,name,swish_id,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr
0,2017,REG,1,00-0022921,L.Fitzgerald,WR,WR,ARI,7.4,13.400000,6762.0,,6762.0,1732.0,5528.0,5571.0,larry-fitzgerald-1,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,492934.0,1661.0,larryfitzgerald/2506106,FitzLa00,1724.0,9383.0,246053.0,larry fitzgerald,3730.0,7393,223.0,Larry Fitzgerald,,6,13,74.0,0,0.0,0.0,144.0,44.0,4.0,0.997088,0,0.513889,0.276596,0.342043,0.654324
1,2017,REG,2,00-0022921,L.Fitzgerald,WR,WR,ARI,2.1,5.100000,6762.0,,6762.0,1732.0,5528.0,5571.0,larry-fitzgerald-1,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,492934.0,1661.0,larryfitzgerald/2506106,FitzLa00,1724.0,9383.0,246053.0,larry fitzgerald,3730.0,7393,223.0,Larry Fitzgerald,,3,6,21.0,0,0.0,0.0,29.0,17.0,2.0,-3.455533,0,0.724138,0.166667,0.069378,0.298565
2,2017,REG,3,00-0022921,L.Fitzgerald,WR,WR,ARI,20.9,33.900002,6762.0,,6762.0,1732.0,5528.0,5571.0,larry-fitzgerald-1,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,492934.0,1661.0,larryfitzgerald/2506106,FitzLa00,1724.0,9383.0,246053.0,larry fitzgerald,3730.0,7393,223.0,Larry Fitzgerald,,13,15,149.0,1,0.0,0.0,138.0,45.0,6.0,7.632769,0,1.079710,0.312500,0.369973,0.727731
3,2017,REG,4,00-0022921,L.Fitzgerald,WR,WR,ARI,9.2,13.200000,6762.0,,6762.0,1732.0,5528.0,5571.0,larry-fitzgerald-1,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,492934.0,1661.0,larryfitzgerald/2506106,FitzLa00,1724.0,9383.0,246053.0,larry fitzgerald,3730.0,7393,223.0,Larry Fitzgerald,,4,7,32.0,1,0.0,0.0,31.0,18.0,1.0,0.162141,0,1.032258,0.137255,0.070938,0.255539
4,2017,REG,5,00-0022921,L.Fitzgerald,WR,WR,ARI,5.1,11.100000,6762.0,,6762.0,1732.0,5528.0,5571.0,larry-fitzgerald-1,b6a61b38-5cfa-46eb-b1c5-b0255d7ebaf5,492934.0,1661.0,larryfitzgerald/2506106,FitzLa00,1724.0,9383.0,246053.0,larry fitzgerald,3730.0,7393,223.0,Larry Fitzgerald,,6,10,51.0,0,0.0,0.0,44.0,29.0,5.0,2.428232,0,1.159091,0.227273,0.105516,0.414770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17379,2024,REG,2,00-0039920,M.Corley,WR,WR,NYJ,0.4,1.400000,40944.0,1607.0,40944.0,,4613104.0,,malachi-corley-1,bae59933-8b94-4837-990e-f0a4ced3cdbb,3162613.0,,,CorlMa00,,26023.0,0.0,malachi corley,17777.0,16636,11617.0,Malachi Corley,1215291.0,1,1,4.0,0,0.0,0.0,-1.0,5.0,0.0,-0.475780,0,0.000000,0.034483,-0.006579,0.047119
17380,2024,REG,9,00-0039920,M.Corley,WR,WR,NYJ,1.8,1.800000,40944.0,1607.0,40944.0,,4613104.0,,malachi-corley-1,bae59933-8b94-4837-990e-f0a4ced3cdbb,3162613.0,,,CorlMa00,,26023.0,0.0,malachi corley,17777.0,16636,11617.0,Malachi Corley,1215291.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,
17381,2024,REG,10,00-0039920,M.Corley,WR,WR,NYJ,0.2,1.200000,40944.0,1607.0,40944.0,,4613104.0,,malachi-corley-1,bae59933-8b94-4837-990e-f0a4ced3cdbb,3162613.0,,,CorlMa00,,26023.0,0.0,malachi corley,17777.0,16636,11617.0,Malachi Corley,1215291.0,1,2,2.0,0,0.0,0.0,12.0,0.0,1.0,-2.246118,0,0.166667,0.060606,0.057692,0.131294
17382,2024,REG,11,00-0039920,M.Corley,WR,WR,NYJ,1.0,2.000000,40944.0,1607.0,40944.0,,4613104.0,,malachi-corley-1,bae59933-8b94-4837-990e-f0a4ced3cdbb,3162613.0,,,CorlMa00,,26023.0,0.0,malachi corley,17777.0,16636,11617.0,Malachi Corley,1215291.0,1,1,10.0,0,0.0,0.0,10.0,0.0,0.0,0.563583,0,1.000000,0.034483,0.080645,0.108176


In [19]:
# Missing-value summary for the WR weekly frame (check_nulls also prints the label)
wr_weekly_nulls = check_nulls(wr_ids_weekly_stats_df, name="WR Weekly Stats")

# Hide rows for columns whose name contains '_id'
is_id_column = wr_weekly_nulls.index.str.contains('_id')
null_summary_wr_ids_weekly = wr_weekly_nulls.loc[~is_id_column]

display(null_summary_wr_ids_weekly)


📊 Missing Value Summary for: WR Weekly Stats


Unnamed: 0,Missing Count,Missing %
racr,327,0.0188
receiving_epa,285,0.0164
air_yards_share,285,0.0164
target_share,285,0.0164
wopr,285,0.0164


In [20]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Next Gen Stats receiving data for the selected seasons
wr_ngs_df = nfl.import_ngs_data('receiving', years)

# Keep WR rows only and discard the week-0 rows
is_weekly_row = wr_ngs_df['week'].ne(0)
is_wr_row = wr_ngs_df['player_position'].eq('WR')
wr_ngs_df = (
    wr_ngs_df
    .loc[is_weekly_row & is_wr_row]
    # Jersey number is not needed; errors='ignore' tolerates its absence
    .drop(columns=['player_jersey_number'], errors='ignore')
)

# Report size and show the frame
print(f"Shape of NGS WR DataFrame after dropping columns: {wr_ngs_df.shape}")
display(wr_ngs_df)

# ***csv file***
wr_ngs_df.to_csv('wr_ngs_df.csv', index=False)

Shape of NGS WR DataFrame after dropping columns: (8249, 22)


Unnamed: 0,season,season_type,week,player_display_name,player_position,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,receptions,targets,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,REG,1,Ryan Grant,WR,WAS,9.936667,2.894592,4.410000,7.154639,4,6,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,REG,1,Martavis Bryant,WR,PIT,8.300000,4.122054,12.688333,33.327496,2,6,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,REG,1,Jamison Crowder,WR,WAS,7.655000,3.177793,10.540000,19.949707,3,7,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,REG,1,Nelson Agholor,WR,PHI,7.423750,2.462620,10.463750,20.274656,6,8,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,REG,1,John Brown,WR,ARI,7.360000,2.751526,13.422222,28.208481,4,9,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13323,2024,POST,23,Xavier Worthy,WR,KC,8.160000,4.959113,14.276250,44.737358,8,8,100.000000,157.0,2,6.250000,6.154624,0.095376,00-0039894,Xavier,Worthy,X.Worthy
13324,2024,POST,23,DeAndre Hopkins,WR,KC,7.676000,3.446231,11.974000,23.451761,2,5,40.000000,18.0,1,0.565000,0.798474,-0.233474,00-0030564,DeAndre,Hopkins,D.Hopkins
13325,2024,POST,23,DeVonta Smith,WR,PHI,7.470000,2.221577,14.752000,40.028219,4,5,80.000000,69.0,1,0.340000,0.600076,-0.260076,00-0036912,DeVonta,Smith,D.Smith
13327,2024,POST,23,Marquise Brown,WR,KC,4.943333,3.302615,6.356667,14.939872,2,6,33.333333,15.0,0,2.450000,3.533891,-1.083891,00-0035662,Marquise,Brown,M.Brown


In [21]:
# Print the NGS column names as a plain list for reference
print(wr_ngs_df.columns.tolist())


['season', 'season_type', 'week', 'player_display_name', 'player_position', 'team_abbr', 'avg_cushion', 'avg_separation', 'avg_intended_air_yards', 'percent_share_of_intended_air_yards', 'receptions', 'targets', 'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation', 'player_gsis_id', 'player_first_name', 'player_last_name', 'player_short_name']


In [22]:
# Missing-value summary for the NGS WR frame (check_nulls also prints the label)
wr_ngs_null_summary_df = check_nulls(wr_ngs_df, name="NGS WR Stats")
display(wr_ngs_null_summary_df)



📊 Missing Value Summary for: NGS WR Stats


Unnamed: 0,Missing Count,Missing %
avg_expected_yac,42,0.0051
avg_yac_above_expectation,42,0.0051
avg_yac,33,0.004
yards,28,0.0034
avg_cushion,2,0.0002


In [23]:
### End: Python NFL Library Dataframe ###

In [24]:
### Begin:fantasypros webscraping ###

In [25]:
# helper function to define year range for scraping
def get_fp_scraping_year_range(start_year=2017):
    """Return every year from ``start_year`` through the current calendar year, inclusive."""
    final_year = datetime.now().year
    return [*range(start_year, final_year + 1)]


In [26]:
# output: a dataframe of WR basic stats, fantasy points, and % rostered, week by week for the defined years

# Scrape the weekly WR fantasy points column and the % rostered from the FantasyPros website
def wr_scrape_fp_stats(start_year, end_year, timeout=30):
    """
    Scrape FantasyPros weekly WR basic stats for every (year, week) in range.

    For each week the page's ``table#data`` is parsed with ``pd.read_html`` and
    enriched with the FantasyPros player ID (read from the ``fp-id-<id>`` CSS
    class on each player link), the link's player name, and ``Year``/``Week``.
    Weeks that fail to download or parse are reported and skipped.

    Parameters:
        start_year (int): First season to scrape (inclusive).
        end_year (int): Last season to scrape (inclusive).
        timeout (float): Seconds to wait for each HTTP response (default 30).

    Returns:
        pd.DataFrame: All scraped weeks concatenated; empty if nothing was scraped.

    Side effects:
        Writes ``wr_fantasypros_basic_stats_df.csv`` (only when data was
        scraped), prints the frame's shape and displays its first 10 rows.
    """
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # One DataFrame per successfully scraped week
    all_data = []

    for year, week in week_combinations:
        try:
            # Build the URL
            url = f"https://www.fantasypros.com/nfl/stats/wr.php?year={year}&week={week}&range=week"
            # A timeout keeps one stalled request from hanging the whole scrape
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Locate the table
            table = soup.find("table", {"id": "data"})
            if table:
                table_str = str(table)

                # header=1 uses the second header row as column names
                df = pd.read_html(io.StringIO(table_str), header=1)[0]

                # Walk the body rows to collect IDs and names in table order
                table_rows = table.find("tbody").find_all("tr")

                fantasypros_ids = []
                player_names = []

                for row in table_rows:
                    # Fallbacks for rows without a player link
                    fantasypros_id = "Unknown"
                    player_name = "Unknown"

                    player_link = row.find("a", class_="fp-player-link")
                    if player_link:
                        class_list = player_link.get("class", [])
                        for class_name in class_list:
                            if class_name.startswith("fp-id-"):
                                fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                                break
                        player_name = player_link.text.strip()  # Name taken from the <a> tag

                    fantasypros_ids.append(fantasypros_id)
                    player_names.append(player_name)

                # A length mismatch with the parsed table raises here and is
                # reported by the except clause below.
                df.insert(1, "FantasyPros_ID", fantasypros_ids)
                df["Player"] = player_names  # Replace with properly scraped names

                # Add Year and Week columns
                df["Year"] = year
                df["Week"] = week

                # Append dataframe to list
                all_data.append(df)

            # Delay to avoid bombarding the server
            time.sleep(uniform(0.3, 0.9))  # Randomized delay

        except Exception as e:
            # Best-effort scrape: report the failed week and keep going
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    if not all_data:
        # pd.concat([]) raises ValueError; return an empty frame instead and
        # leave any previously written CSV untouched.
        print("⚠️ No WR basic stats were scraped; returning an empty dataframe.")
        return pd.DataFrame()

    # Combine all data into a single dataframe
    wr_fantasypros_basic_stats_df = pd.concat(all_data, ignore_index=True)

    # *** csv file *** 
    wr_fantasypros_basic_stats_df.to_csv("wr_fantasypros_basic_stats_df.csv", index=False)

    # Display shape of the dataframe
    print(f"\n📊 **Shape of WR Basic Stats dataframe after column removal:** {wr_fantasypros_basic_stats_df.shape}")

    # Display first few rows for verification
    display(wr_fantasypros_basic_stats_df.head(10))

    return wr_fantasypros_basic_stats_df

# Scrape basic stats from the first through the last year of the FantasyPros range
fp_years = get_fp_scraping_year_range()
wr_fantasypros_basic_stats_df = wr_scrape_fp_stats(fp_years[0], fp_years[-1])


📊 **Shape of WR Basic Stats dataframe after column removal:** (33676, 20)


Unnamed: 0,Rank,FantasyPros_ID,Player,REC,TGT,YDS,Y/R,LG,20+,TD,ATT,YDS.1,TD.1,FL,G,FPTS,FPTS/G,ROST,Year,Week
0,1,13981,Stefon Diggs,7,8,93,13.3,30,1,2,1,-6,0,0,1,20.7,20.7,85.4%,2017,1
1,2,15802,Tyreek Hill,7,8,133,19.0,75,0,1,2,5,0,0,1,19.8,19.8,97.3%,2017,1
2,3,16488,Kenny Golladay,4,7,69,17.3,45,0,2,0,0,0,0,1,18.9,18.9,4.0%,2017,1
3,4,9808,Antonio Brown,11,11,182,16.5,50,0,0,0,0,0,0,1,18.2,18.2,1.0%,2017,1
4,5,13429,Adam Thielen,9,10,157,17.4,44,2,0,0,0,0,0,1,15.7,15.7,41.4%,2017,1
5,6,13969,Nelson Agholor,6,8,86,14.3,58,0,1,0,0,0,0,1,14.6,14.6,1.0%,2017,1
6,7,13081,Bennie Fowler III,3,4,21,7.0,10,0,2,0,0,0,0,1,14.1,14.1,0.0%,2017,1
7,8,9320,Jordy Nelson,7,8,79,11.3,32,0,1,0,0,0,0,1,13.9,13.9,0.1%,2017,1
8,9,16433,Cooper Kupp,4,6,76,19.0,28,2,1,0,0,0,0,1,13.6,13.6,86.6%,2017,1
9,10,13894,Amari Cooper,5,13,62,12.4,23,1,1,0,0,0,0,1,12.2,12.2,15.2%,2017,1


In [27]:
# output: a dataframe of weekly WR fantasypros advanced stats week-by-week for defined years

# Define the function to scrape weekly WR advanced stats from Fantasy Pros
def wr_scrape_fp_advanced(start_year, end_year, timeout=30):
    """
    Scrape FantasyPros weekly WR advanced stats for every (year, week) in range.

    Each page's first ``<thead>`` supplies the column headers and its first
    ``<tbody>`` the rows. Every kept row is prefixed with Year, Week, the
    FantasyPros player ID (from the ``fp-id-<id>`` CSS class on the player
    link) and the link's player name. Rows whose cell count differs from the
    header count are dropped. Weeks that fail are reported and skipped.

    Parameters:
        start_year (int): First season to scrape (inclusive).
        end_year (int): Last season to scrape (inclusive).
        timeout (float): Seconds to wait for each HTTP response (default 30).

    Returns:
        pd.DataFrame: One row per scraped player-week, without the 'Rank'
        column. Empty (Year/Week/FantasyPros_ID/Player columns only) if no
        page could be parsed.
    """
    year_week_combinations = generate_year_week_combinations(start_year, end_year)
    all_data = []
    # Start empty so the frame can still be built when no page is ever parsed;
    # without this the name was unbound after the loop (NameError).
    table_headers = []

    for year, week in year_week_combinations:
        try:
            # Construct the URL
            url = f"https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={year}&week={week}&range=week&view=pergame"
            # A timeout keeps one stalled request from hanging the whole scrape
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Header labels; the most recently parsed page's headers name the final columns
            table_headers = [header.text.strip() for header in soup.find('thead').find_all('th')]

            # Find table rows
            table_rows = soup.find('tbody').find_all('tr')

            for row in table_rows:
                # Fallbacks for rows without a player link
                fantasypros_id = "Unknown"
                player_name = "Unknown"

                player_link = row.find("a", class_="fp-player-link")
                if player_link:
                    class_list = player_link.get("class", [])
                    for class_name in class_list:
                        if class_name.startswith("fp-id-"):
                            fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                            break
                    player_name = player_link.text.strip()  # Name taken from the <a> tag

                # Extract the rest of the row data
                row_data = [cell.text.strip() for cell in row.find_all('td')]

                # Keep only rows that line up with the headers
                if len(row_data) == len(table_headers):
                    all_data.append([year, week, fantasypros_id, player_name] + row_data)

            # Random delay to avoid server overload
            time.sleep(uniform(0.3, 0.9))

        except Exception as e:
            # Best-effort scrape: report the failed week and keep going
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # NOTE(review): the scraped headers appear to contain their own 'Player'
    # column (the recorded output shows two 'Player' columns), so the frame
    # carries a duplicate label. Left unchanged to keep the output schema
    # stable; confirm how downstream code selects 'Player'.
    column_names = ['Year', 'Week', 'FantasyPros_ID', 'Player'] + table_headers
    wr_fp_advanced_stats_df = pd.DataFrame(all_data, columns=column_names)

    # Drop the redundant 'Rank' column if it exists
    wr_fp_advanced_stats_df.drop(columns=['Rank'], inplace=True, errors='ignore')

    return wr_fp_advanced_stats_df

# Scrape advanced stats from the first through the last year of the FantasyPros range
fp_years = get_fp_scraping_year_range()
wr_fp_advanced_stats_df = wr_scrape_fp_advanced(fp_years[0], fp_years[-1])


# Print (rows, columns) of the scraped frame for verification
print(f"\n📊 **Shape of FantasyPros advanced DataFrame:** {wr_fp_advanced_stats_df.shape}")

# Show the first 10 rows to confirm FantasyPros IDs were extracted
display(wr_fp_advanced_stats_df.head(10))

# *** csv file ***
# wr_fp_advanced_stats_df.to_csv("fantasypros_wr_advanced_stats.csv", index=False)


📊 **Shape of FantasyPros advanced DataFrame:** (1259, 22)


Unnamed: 0,Year,Week,FantasyPros_ID,Player,Player.1,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS
0,2017,1,13981,Stefon Diggs,Stefon Diggs (NE),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0
1,2017,1,15802,Tyreek Hill,Tyreek Hill (MIA),1,7,133,78,0,55,1,0,8,7,0,0,4,1,1,1,1
2,2017,1,16488,Kenny Golladay,Kenny Golladay (FA),1,4,69,64,0,5,0,0,7,5,1,1,2,1,1,1,0
3,2017,1,9808,Antonio Brown,Antonio Brown (FA),1,11,182,90,0,92,50,0,11,11,0,0,7,2,2,1,1
4,2017,1,13429,Adam Thielen,Adam Thielen (CAR),1,9,157,92,0,65,17,0,10,10,0,0,4,4,2,1,0
5,2017,1,13969,Nelson Agholor,Nelson Agholor (FA),1,6,86,51,0,35,18,0,8,6,0,1,3,1,1,1,1
6,2017,1,13081,Bennie Fowler III,Bennie Fowler III (FA),1,3,21,21,0,0,0,0,4,3,0,2,1,0,0,0,0
7,2017,1,9320,Jordy Nelson,Jordy Nelson (FA),1,7,79,73,0,6,1,0,8,7,0,0,3,1,1,0,0
8,2017,2,9460,Michael Crabtree,Michael Crabtree (FA),1,6,80,50,0,30,0,0,6,6,0,0,3,3,0,0,0
9,2017,2,11548,Jermaine Kearse,Jermaine Kearse (FA),1,4,64,57,0,7,0,0,5,4,0,0,3,1,1,0,0


In [28]:
# output: a dataframe of weekly WR redzone stats week-by-week for defined years

# scrape the weekly WR redzone stats from the FantasyPros
def wr_scrape_fp_redzone(start_year, end_year, timeout=30):
    """
    Scrape FantasyPros weekly WR red-zone stats for every (year, week) in range.

    For each week the page's ``table#data`` is parsed with ``pd.read_html``,
    the top header level is dropped, and each row is enriched with the
    FantasyPros player ID (from the ``fp-id-<id>`` CSS class on the player
    link), the link's player name, and ``Year``/``Week``. Every column except
    Player, FantasyPros_ID, Year and Week gets an ``_rz`` suffix. Weeks that
    fail to download or parse are reported and skipped.

    Parameters:
        start_year (int): First season to scrape (inclusive).
        end_year (int): Last season to scrape (inclusive).
        timeout (float): Seconds to wait for each HTTP response (default 30).

    Returns:
        pd.DataFrame: All scraped weeks concatenated; empty if nothing was scraped.

    Side effects:
        Prints the frame's shape and displays its first 10 rows.
    """
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # One DataFrame per successfully scraped week
    all_data = []

    for year, week in week_combinations:
        try:
            # Build the URL
            url = f"https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={year}&range=week&week={week}"
            # A timeout keeps one stalled request from hanging the whole scrape
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Locate the table
            table = soup.find("table", {"id": "data"})
            if table:
                table_str = str(table)

                # Parse with the table's own (multi-level) header
                df = pd.read_html(io.StringIO(table_str))[0]

                # Flatten the multi-level column headers
                # NOTE(review): dropping the top level can leave repeated labels
                # (the recorded output shows YDS/TD twice) — confirm downstream handling.
                df.columns = df.columns.droplevel(0)

                # Filter out repeated header rows; .copy() makes the result an
                # independent frame so the column assignments below do not
                # trigger SettingWithCopyWarning.
                df = df[df['Player'] != 'Player'].copy()

                # Walk the body rows to collect IDs and names in table order
                table_rows = table.find("tbody").find_all("tr")

                fantasypros_ids = []
                player_names = []

                for row in table_rows:
                    # Fallbacks for rows without a player link
                    fantasypros_id = "Unknown"
                    player_name = "Unknown"

                    player_link = row.find("a", class_="fp-player-link")
                    if player_link:
                        class_list = player_link.get("class", [])
                        for class_name in class_list:
                            if class_name.startswith("fp-id-"):
                                fantasypros_id = class_name.replace("fp-id-", "")  # Extract numeric ID
                                break
                        player_name = player_link.text.strip()  # Name taken from the <a> tag

                    fantasypros_ids.append(fantasypros_id)
                    player_names.append(player_name)

                # A length mismatch with the parsed table raises here and is
                # reported by the except clause below.
                df.insert(1, "FantasyPros_ID", fantasypros_ids)
                df["Player"] = player_names  # Replace with properly scraped names

                # Add Year and Week columns
                df["Year"] = year
                df["Week"] = week

                # Append dataframe to list
                all_data.append(df)

            # Delay to avoid bombarding the server
            time.sleep(uniform(0.3, 0.9))  # Randomized delay

        except Exception as e:
            # Best-effort scrape: report the failed week and keep going
            print(f"⚠️ Error occurred while scraping Year: {year}, Week: {week}: {e}")

    if not all_data:
        # pd.concat([]) raises ValueError; return an empty frame instead
        print("⚠️ No WR red zone stats were scraped; returning an empty dataframe.")
        return pd.DataFrame()

    # Combine all data into a single dataframe
    wr_redzone_stats_df = pd.concat(all_data, ignore_index=True)

    # # Drop unnecessary columns
    # columns_to_drop = ['Rank', 'ATT', 'YDS', 'TD', 'PCT', 'FL', 'G', 'FPTS', 'FPTS/G', 'ROST %']
    # wr_redzone_stats_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

    # Suffix every stat column with '_rz'; the identifier columns keep their names
    wr_redzone_stats_df.rename(
        columns={
            col: f"{col}_rz" for col in wr_redzone_stats_df.columns
            if col not in ['Player', 'FantasyPros_ID', 'Year', 'Week']
        },
        inplace=True
    )

    # *** csv file ***
    # wr_redzone_stats_df.to_csv("fantasypros_wr_redzone_stats.csv", index=False)

    # Display shape of the dataframe
    print(f"\n📊 **Shape of WR Red Zone Stats dataframe after processing:** {wr_redzone_stats_df.shape}")

    # Display first few rows for verification
    display(wr_redzone_stats_df.head(10))

    return wr_redzone_stats_df

# Scrape red-zone stats from the first through the last year of the FantasyPros range
fp_years = get_fp_scraping_year_range()
wr_redzone_stats_df = wr_scrape_fp_redzone(fp_years[0], fp_years[-1])


📊 **Shape of WR Red Zone Stats dataframe after processing:** (1120, 21)


Unnamed: 0,Rank_rz,FantasyPros_ID,Player,REC_rz,TGT_rz,REC PCT_rz,YDS_rz,Y/R_rz,TD_rz,TGT PCT_rz,ATT_rz,YDS_rz.1,TD_rz.1,PCT_rz,FL_rz,G_rz,FPTS_rz,FPTS/G_rz,ROST %_rz,Year,Week
0,1,13981,Stefon Diggs,3,3,100.0%,22,7.3,2,60.0%,0,0,0,0%,0,1,14.2,14.2,85.4%,2017,1
1,2,13081,Bennie Fowler III,2,2,100.0%,11,5.5,2,66.7%,0,0,0,0%,0,1,13.1,13.1,0.0%,2017,1
2,3,13840,Seth Roberts,1,1,100.0%,19,19.0,1,20.0%,0,0,0,0%,0,1,7.9,7.9,0.0%,2017,1
3,4,16433,Cooper Kupp,1,1,100.0%,18,18.0,1,100.0%,0,0,0,0%,0,1,7.8,7.8,86.6%,2017,1
4,5,11606,DeAndre Hopkins,2,3,66.7%,11,5.5,1,75.0%,0,0,0,0%,0,1,7.1,7.1,21.8%,2017,1
5,6,16488,Kenny Golladay,1,1,100.0%,10,10.0,1,33.3%,0,0,0,0%,0,1,7.0,7.0,4.0%,2017,1
6,7,13894,Amari Cooper,1,4,25.0%,8,8.0,1,80.0%,0,0,0,0%,0,1,6.8,6.8,15.2%,2017,1
7,8,11215,Marvin Jones Jr.,1,1,100.0%,6,6.0,1,33.3%,0,0,0,0%,0,1,6.6,6.6,0.0%,2017,1
8,1,9707,Emmanuel Sanders,3,3,100.0%,21,7.0,2,75.0%,0,0,0,0%,0,1,14.1,14.1,0.0%,2017,2
9,2,9460,Michael Crabtree,2,2,100.0%,3,1.5,2,66.7%,0,0,0,0%,0,1,12.3,12.3,0.0%,2017,2


In [29]:
# Listing columns of all three FantasyPros dataframes

advanced_stats_cols = wr_fp_advanced_stats_df.columns.to_list()
fpts_rost_cols = wr_fantasypros_basic_stats_df.columns.to_list()
redzone_stats_cols = wr_redzone_stats_df.columns.to_list()

# Side-by-side view: one column of labels per frame; wrapping each list in a
# Series lets lists of different lengths be padded with NaN.
fp_columns_by_frame = {
    "Advanced Stats": advanced_stats_cols,
    "FPTS & ROST": fpts_rost_cols,
    "Red Zone Stats": redzone_stats_cols,
}
comparison_df = pd.DataFrame(
    {label: pd.Series(cols) for label, cols in fp_columns_by_frame.items()}
)
comparison_df

Unnamed: 0,Advanced Stats,FPTS & ROST,Red Zone Stats
0,Year,Rank,Rank_rz
1,Week,FantasyPros_ID,FantasyPros_ID
2,FantasyPros_ID,Player,Player
3,Player,REC,REC_rz
4,Player,TGT,TGT_rz
5,G,YDS,REC PCT_rz
6,REC,Y/R,YDS_rz
7,YDS,LG,Y/R_rz
8,YBC,20+,TD_rz
9,AIR,TD,TGT PCT_rz


In [30]:
# Print (rows, columns) for each of the three FantasyPros frames as a pre-merge baseline
print(f"\n📊 **Shape of WR Advanced Stats DataFrame:** {wr_fp_advanced_stats_df.shape}")
print(f"📊 **Shape of WR Basic Stats DataFrame:** {wr_fantasypros_basic_stats_df.shape}")
print(f"📊 **Shape of WR Red Zone Stats DataFrame:** {wr_redzone_stats_df.shape}")


📊 **Shape of WR Advanced Stats DataFrame:** (1259, 22)
📊 **Shape of WR Basic Stats DataFrame:** (33676, 20)
📊 **Shape of WR Red Zone Stats DataFrame:** (1120, 21)


In [31]:
# Missing-value check for the three FantasyPros frames.
# check_nulls returns one summary per frame and prints the label passed to it.
advanced_stats_nulls = check_nulls(wr_fp_advanced_stats_df, "FantasyPros Advanced Stats")
fpts_rost_nulls = check_nulls(wr_fantasypros_basic_stats_df, "FantasyPros Basic Stats")
redzone_nulls = check_nulls(wr_redzone_stats_df, "FantasyPros Red Zone Stats")

# Stack the summaries; `keys` adds an outer index level naming the source frame.
# Frames without missing values contribute no rows.
combined_nulls = pd.concat(
    [advanced_stats_nulls, fpts_rost_nulls, redzone_nulls],
    keys=["Advanced Stats", "FPTS & Rostered", "Red Zone Stats"]
)
combined_nulls


📊 Missing Value Summary for: FantasyPros Advanced Stats

📊 Missing Value Summary for: FantasyPros Basic Stats

📊 Missing Value Summary for: FantasyPros Red Zone Stats


Unnamed: 0,Unnamed: 1,Missing Count,Missing %


In [32]:
# Drop 'Player' from the red-zone dataframe before the merge so the merged frame
# does not get a second player-name column (only the red-zone frame is changed here).
# errors="ignore" makes this cell safe to re-run.
wr_redzone_stats_df = wr_redzone_stats_df.drop(columns=["Player"], errors="ignore")

# Print the shape after dropping the column
print(f"\n📊 **Fantasypros redzone Shape:** {wr_redzone_stats_df.shape}")


📊 **Fantasypros redzone Shape:** (1120, 20)


In [33]:
# ** Create a new merged dataframe **
# ✅ Merge WR FantasyPros Advanced Stats with Red Zone Stats
# Inner join on player/season/week: only player-weeks present in BOTH frames survive.
# The advanced-stats frame is the left side and keeps its 'Player' column.
fp_merge_keys = ['FantasyPros_ID', 'Year', 'Week']
wr_adv_rz_merged_df = wr_fp_advanced_stats_df.merge(
    wr_redzone_stats_df,
    how='inner',
    on=fp_merge_keys,
)

In [34]:
# Print the shape of the merged FantasyPros advanced + red-zone dataframe
print(f"\n📊 **Shape of the merged fantasypros advanced and redzone df:** {wr_adv_rz_merged_df.shape}")

# Persist the merged frame as CSV, then preview the first rows
wr_adv_rz_merged_df.to_csv('wr_fp_adv_rz_merged_df.csv', index=False)
display(wr_adv_rz_merged_df.head())


📊 **Shape of the merged fantasypros advanced and redzone df:** (432, 39)


Unnamed: 0,Year,Week,FantasyPros_ID,Player,Player.1,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,Rank_rz,REC_rz,TGT_rz,REC PCT_rz,YDS_rz,Y/R_rz,TD_rz,TGT PCT_rz,ATT_rz,YDS_rz.1,TD_rz.1,PCT_rz,FL_rz,G_rz,FPTS_rz,FPTS/G_rz,ROST %_rz
0,2017,1,13981,Stefon Diggs,Stefon Diggs (NE),1,7,93,76,0,17,6,0,8,7,0,2,5,2,1,0,0,1,3,3,100.0%,22,7.3,2,60.0%,0,0,0,0%,0,1,14.2,14.2,85.4%
1,2017,1,16488,Kenny Golladay,Kenny Golladay (FA),1,4,69,64,0,5,0,0,7,5,1,1,2,1,1,1,0,6,1,1,100.0%,10,10.0,1,33.3%,0,0,0,0%,0,1,7.0,7.0,4.0%
2,2017,1,13081,Bennie Fowler III,Bennie Fowler III (FA),1,3,21,21,0,0,0,0,4,3,0,2,1,0,0,0,0,2,2,2,100.0%,11,5.5,2,66.7%,0,0,0,0%,0,1,13.1,13.1,0.0%
3,2017,2,9460,Michael Crabtree,Michael Crabtree (FA),1,6,80,50,0,30,0,0,6,6,0,0,3,3,0,0,0,2,2,2,100.0%,3,1.5,2,66.7%,0,0,0,0%,0,1,12.3,12.3,0.0%
4,2017,2,9707,Emmanuel Sanders,Emmanuel Sanders (FA),1,6,62,39,0,23,19,0,8,6,0,3,2,1,0,0,0,1,3,3,100.0%,21,7.0,2,75.0%,0,0,0,0%,0,1,14.1,14.1,0.0%


In [35]:
# List the columns of the merged (advanced + red zone) frame next to the basic-stats frame

wr_adv_rz_merged_cols = wr_adv_rz_merged_df.columns.tolist()
fpts_rost_cols = wr_fantasypros_basic_stats_df.columns.tolist()

# Combine into a dataframe for comparison.
# Fix: the first column previously showed `advanced_stats_cols` (the pre-merge
# advanced frame), so the merged frame's columns were never displayed.
comparison_df = pd.DataFrame({
    "Red Zone and Advanced Stats": pd.Series(wr_adv_rz_merged_cols),
    "Basic Stats": pd.Series(fpts_rost_cols)
})
comparison_df

Unnamed: 0,Red Zone and Advanced Stats,Basic Stats
0,Year,Rank
1,Week,FantasyPros_ID
2,FantasyPros_ID,Player
3,Player,REC
4,Player,TGT
5,G,YDS
6,REC,Y/R
7,YDS,LG
8,YBC,20+
9,AIR,TD


In [36]:
# Missing-value check for the merged (advanced + red zone) frame and the basic-stats frame.
# check_nulls returns one summary per frame and prints the label passed to it.
advanced_and_rz_stats_nulls = check_nulls(wr_adv_rz_merged_df, "FantasyPros Advanced and Redzone Stats")
basic_stats_nulls = check_nulls(wr_fantasypros_basic_stats_df, "FantasyPros Basic Stats")


# Stack the summaries; `keys` adds an outer index level naming the source frame.
# Frames without missing values contribute no rows.
combined_nulls = pd.concat(
    [advanced_and_rz_stats_nulls, basic_stats_nulls],
    keys=["Advanced and Redzone Stats", "Basic Stats"]
)
combined_nulls


📊 Missing Value Summary for: FantasyPros Advanced and Redzone Stats

📊 Missing Value Summary for: FantasyPros Basic Stats


Unnamed: 0,Unnamed: 1,Missing Count,Missing %


In [37]:
### End:fantasypros webscraping ###

In [None]:
# All dataframes produced so far
# wr_ids_weekly_stats_df
# wr_ngs_df
# wr_fantasypros_basic_stats_df
# wr_adv_rz_merged_df  (saved to disk as wr_fp_adv_rz_merged_df.csv)


In [38]:
# Begin: Feature Engineering

In [None]:
# End: Feature Engineering

In [None]:
## Begin: webscraping sportsoddshistory betting lines data ###

In [None]:
# Generate Dynamic URLs for the regular season 
# output: urls with each year to scrape

# Helper: Define year range
def get_year_range(current_year, current_week, start_year=2017):
    """
    Return the list of season years whose odds pages should be scraped.

    This definition replaces the ``get_year_range`` declared near the top of
    the notebook, so it keeps that version's optional ``start_year``
    parameter; callers written against either signature keep working.

    Parameters
    ----------
    current_year : int
        Last candidate year.
    current_week : int
        ``current_year`` is included when this is >= 1 and excluded otherwise.
    start_year : int, default 2017
        First year of the range.

    Returns
    -------
    list[int]
        Consecutive years from ``start_year`` up to the last included year
        (empty when that year is before ``start_year``).
    """
    last_year = current_year if current_week >= 1 else current_year - 1
    return list(range(start_year, last_year + 1))

# Current time context
# NOTE(review): both names below already exist as notebook globals (set in the
# setup cells at the top). After this cell runs, `current_week` holds the ISO
# calendar week of today's date (1-53), not the NFL week - confirm that no
# later cell still expects the NFL week.
current_year = datetime.now().year
current_week = datetime.now().isocalendar()[1]

# Get the range and build the season URLs
# An ISO week is always >= 1, so the current calendar year is always included.
season_list = get_year_range(current_year, current_week)
# One sportsoddshistory season page per year, selected via the `y` query parameter
season_urls = [f"https://www.sportsoddshistory.com/nfl-game-season/?y={year}" for year in season_list]

# Preview
for season, url in zip(season_list, season_urls):
    print(f"{season}: {url}")

In [None]:
# Load Soup and Extract Regular Season Week Headers
# output: regular season years

# Container for (season, soup, regular season week headers)
tm_game_odds_season_soup_headers = []

for season, url in zip(season_list, season_urls):
    print(f"🔄 Fetching {season}...")
    # Without a timeout, a stalled connection would block the notebook indefinitely
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Still parse whatever came back (it will simply yield no headers),
        # but make the failure visible instead of silently collecting nothing.
        print(f"⚠️ {season}: HTTP {response.status_code} returned for {url}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all <h3> tags
    all_headers = soup.find_all('h3')

    # Only keep regular season week headers (e.g., "2023 Regular Season - Week 1")
    reg_week_headers = [tag for tag in all_headers if f"{season} Regular Season - Week" in tag.text]

    # Store tuple for next stage (the soup is kept so later cells can reuse it)
    tm_game_odds_season_soup_headers.append((season, soup, reg_week_headers))

In [None]:
# Scrape All Weeks for Each Year
# output: none

# Container to collect team-level weekly DataFrames
tm_game_odds_regular_team_level_list = []

for season, soup, week_headers in tm_game_odds_season_soup_headers:
    for tag in week_headers:
        # Header text ends in "... Week <n>"; the number after "Week" is the week
        week = int(tag.text.strip().split("Week")[1])

        table = tag.find_next('table')
        if not table:
            continue
        # html.parser does not insert a <tbody> on its own, so fall back to
        # the table element when the page markup has none.
        tbody = table.find('tbody')
        rows = (tbody if tbody is not None else table).find_all('tr')

        # Two dicts per game: one from the favorite's side, one from the underdog's
        team_rows = []

        for row in rows:
            cells = row.find_all('td')
            # Game rows have exactly 11 cells:
            # Day, Date, Time (ET), Home_Away, Favorite, Score, Spread, <blank>,
            # Underdog, Over/Under, Notes
            if len(cells) != 11:
                continue

            texts = [cell.get_text(strip=True) for cell in cells]
            home_away, score_text = texts[3], texts[5]
            spread_text, over_under = texts[6], texts[9]

            # Team codes are read from the first query value of each team link
            fav_link = cells[4].find('a')
            und_link = cells[8].find('a')
            fav_abbr = fav_link['href'].split('=')[1].split('&')[0] if fav_link else None
            und_abbr = und_link['href'].split('=')[1].split('&')[0] if und_link else None

            # Score is expected as "<result> <points>"; tolerate an empty or
            # single-token cell instead of raising IndexError.
            score_parts = score_text.split()
            fav_result = score_parts[0] if score_parts else None
            fav_score = score_parts[1] if len(score_parts) > 1 else None

            team_rows.append({
                'season': season, 'week': week,
                'team_abbr': fav_abbr,
                'opponent_abbr': und_abbr,
                'home': home_away != '@', 'role': 'Favorite',
                'result': fav_result,
                'score': fav_score,
                'spread': spread_text, 'over_under': over_under
            })
            # NOTE(review): the underdog row carries no result/score and
            # reuses the favorite's spread unchanged - confirm that is intended.
            team_rows.append({
                'season': season, 'week': week,
                'team_abbr': und_abbr,
                'opponent_abbr': fav_abbr,
                'home': home_away == '@', 'role': 'Underdog',
                'result': None, 'score': None,
                'spread': spread_text, 'over_under': over_under
            })

        if not team_rows:
            continue

        weekly_df = pd.DataFrame(team_rows)

        # Clean the spread column but DO NOT normalize abbreviations yet:
        # values starting with "P" become 0, a leading W/L marker is removed,
        # and anything still non-numeric becomes NaN.
        weekly_df['spread'] = weekly_df['spread'].str.replace(r'^P.*', '0', regex=True)
        weekly_df['spread'] = weekly_df['spread'].str.replace(r'^[WL]\s*', '', regex=True)
        weekly_df['spread'] = pd.to_numeric(weekly_df['spread'].str.strip(), errors='coerce')

        tm_game_odds_regular_team_level_list.append(weekly_df)

In [None]:
# Combine the Regular Season Data (CSV export is optional)
# output: combined regular season dataframe; csv only if the export line is enabled

# Combine all regular season data
tm_game_odds_regular_team_level = pd.concat(
    tm_game_odds_regular_team_level_list,
    ignore_index=True
)

# Preview dimensions
print(f"📊 Total rows collected: {tm_game_odds_regular_team_level.shape[0]}")
print(f"✅ Columns: {list(tm_game_odds_regular_team_level.columns)}")

# Export for inspection (disabled by default; uncomment to write the file)
# tm_game_odds_regular_team_level.to_csv("tm_game_odds_regular_season.csv", index=False)
# The message reflects what actually happened: nothing is written while the
# line above stays commented out.
print("ℹ️ CSV export is disabled; tm_game_odds_regular_season.csv was not written.")

In [None]:
# extract playoff <h3> tags from each season and pair them with the corresponding soup object
# output: a list of tuples

# Identify playoff <h3> tags for each season
# Each season page was already downloaded and parsed when the regular-season
# headers were collected, so the stored soup is reused here instead of
# requesting every URL a second time.
playoff_h3_tags = []

for season, soup, _reg_week_headers in tm_game_odds_season_soup_headers:
    print(f"🔄 Checking {season} playoffs...")

    # Exact-text match on the header, e.g. "2023 Playoffs"
    tag = soup.find('h3', string=f"{season} Playoffs")

    if tag:
        playoff_h3_tags.append((season, soup, tag))
    else:
        print(f"⚠️ No playoff tag found for {season}")

In [None]:
# extract all <tr> rows from the playoff table corresponding to each <h3> tag
# output: a list of tuples: (season, [row1, row2, ..., rowN])

# Collect the <tr> rows of the table that follows each playoff header
playoff_table_rows = []

for season, soup, tag in playoff_h3_tags:
    print(f"📅 Extracting table rows for {season} playoffs...")

    try:
        # Any missing piece (no table, no <tbody>) surfaces as an exception
        # and is reported for that season only.
        season_rows = tag.find_next('table').find('tbody').find_all('tr')
    except Exception as e:
        print(f"⚠️ {season} - Failed to extract table rows: {e}")
    else:
        playoff_table_rows.append((season, season_rows))

In [None]:
# Parse each playoff row into game-level dictionaries
# output: dataframe of the playoff games

# Turn one playoff <tr> into a game-level dictionary
def parse_playoff_row(row):
    """
    Parse a playoff table row into a dict of game fields.

    Returns None for rows with fewer than 11 <td> cells and for rows that
    raise while being parsed (the error is printed). Team names have a
    trailing "(...)" group removed, e.g. "BUF (3)" -> "BUF".
    """
    # (output key, cell position, strip trailing "(seed)"?)
    layout = (
        ('Round', 0, False),
        ('Day', 1, False),
        ('Date', 2, False),
        ('Time (ET)', 3, False),
        ('Home_Away', 4, False),
        ('Favorite', 5, True),
        ('Score', 6, False),
        ('Spread', 7, False),
        ('Underdog', 9, True),
        ('Over/Under', 10, False),
    )

    try:
        tds = row.find_all('td')
        if len(tds) < 11:
            return None  # Skip malformed rows

        game = {}
        for key, position, drop_seed in layout:
            text = tds[position].get_text(strip=True)
            if drop_seed:
                text = text.rsplit('(', 1)[0].strip()
            game[key] = text
        return game

    except Exception as e:
        print(f"⚠️ Error parsing row: {e}")
        return None

# Run the parser over every season's rows, keeping only rows that parsed
parsed_playoff_games = []

for season, rows in playoff_table_rows:
    for row in rows:
        game = parse_playoff_row(row)
        if game is None:
            continue
        game['season'] = season  # Tag season
        parsed_playoff_games.append(game)

# Preview the first few
pd.DataFrame(parsed_playoff_games).head()

In [None]:
# Convert parsed playoff games into team-level rows
# output: dataframe of the playoff games with numbered weeks

# Helper: playoff round name -> week number continuing the regular season count
def get_playoff_week(round_name, season):
    """
    Map a playoff round label to a week number.

    The first playoff week is 18 for seasons before 2021 and 19 from 2021 on.
    Matching is case-insensitive and by substring, checked in this order:
    wild card, divisional, championship, super bowl. Returns None when no
    keyword matches.
    """
    first_playoff_week = 18 if season < 2021 else 19
    label = round_name.lower()

    round_offsets = (
        ("wild card", 0),
        ("divisional", 1),
        ("championship", 2),
        ("super bowl", 3),
    )
    for keyword, offset in round_offsets:
        if keyword in label:
            return first_playoff_week + offset
    return None  # fallback

# Convert each game into two team rows (favorite's side, then underdog's side)
team_level_columns = [
    'season', 'week', 'team_abbr', 'opponent_abbr', 'home',
    'role', 'result', 'score', 'spread', 'over_under'
]
team_level_rows = []

for game in parsed_playoff_games:
    week = get_playoff_week(game['Round'], game['season'])

    # Score is expected as "<result> <points>"; tolerate an empty or
    # single-token value instead of raising IndexError.
    score_parts = game['Score'].split()
    favorite_result = score_parts[0] if score_parts else None
    favorite_score = score_parts[1] if len(score_parts) > 1 else None

    team_level_rows.append({
        'season': game['season'],
        'week': week,
        'team_abbr': game['Favorite'],
        'opponent_abbr': game['Underdog'],
        # Any marker other than '@' counts the favorite as the home side
        'home': game['Home_Away'] != '@',
        'role': 'Favorite',
        'result': favorite_result,
        'score': favorite_score,
        'spread': game['Spread'],
        'over_under': game['Over/Under']
    })

    # NOTE(review): the underdog row carries no result/score and reuses the
    # favorite's spread unchanged - confirm that is intended.
    team_level_rows.append({
        'season': game['season'],
        'week': week,
        'team_abbr': game['Underdog'],
        'opponent_abbr': game['Favorite'],
        'home': game['Home_Away'] == '@',
        'role': 'Underdog',
        'result': None,
        'score': None,
        'spread': game['Spread'],
        'over_under': game['Over/Under']
    })

# Build team-level playoff DataFrame.
# Passing the columns explicitly keeps the frame well-formed (and the spread
# cleaning below runnable) even when no playoff games were parsed.
tm_game_odds_playoffs_team_level = pd.DataFrame(team_level_rows, columns=team_level_columns)

# Clean 'spread' values: values starting with "P" become 0, a leading W/L
# marker is removed, and anything still non-numeric becomes NaN.
tm_game_odds_playoffs_team_level['spread'] = (
    tm_game_odds_playoffs_team_level['spread']
    .str.replace(r'^P.*', '0', regex=True)
    .str.replace(r'^[WL]\s*', '', regex=True)
    .str.strip()
)
tm_game_odds_playoffs_team_level['spread'] = pd.to_numeric(
    tm_game_odds_playoffs_team_level['spread'], errors='coerce'
)

# Preview
display(tm_game_odds_playoffs_team_level.head())


In [None]:
# Merge Regular Season + Playoffs into One DataFrame
# output: combined dataframe of regular season and playoff odds and lines *(not final)
# output: csv file for inspection only *not final

# Stack regular season and playoff team-level rows, then order them by
# season and week for easier inspection (row labels keep their pre-sort values).
tm_game_odds_all_games = (
    pd.concat(
        [tm_game_odds_regular_team_level, tm_game_odds_playoffs_team_level],
        ignore_index=True,
    )
    .sort_values(by=['season', 'week'])
)

# Export final combined dataset
# tm_game_odds_all_games.to_csv("tm_game_odds_all_games.csv", index=False)

# Preview
print("✅ Combined all regular season and playoff games.")
print(f"📊 Final shape: {tm_game_odds_all_games.shape}")
display(tm_game_odds_all_games.head())

In [None]:
# standardize team abbreviations
# output: none

# Create One Unified Map
# Canonical full team name → abbreviation
# Franchise names used earlier in the 2017+ window are included so older
# seasons resolve to the same code as the current name.
full_name_to_abbr = {
    'Arizona Cardinals': 'ARI', 'Atlanta Falcons': 'ATL', 'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF', 'Carolina Panthers': 'CAR', 'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN', 'Cleveland Browns': 'CLE', 'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN', 'Detroit Lions': 'DET', 'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU', 'Indianapolis Colts': 'IND', 'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC', 'Las Vegas Raiders': 'LV', 'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA', 'Miami Dolphins': 'MIA', 'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE', 'New Orleans Saints': 'NO', 'New York Giants': 'NYG',
    'New York Jets': 'NYJ', 'Philadelphia Eagles': 'PHI', 'Pittsburgh Steelers': 'PIT',
    'San Francisco 49ers': 'SF', 'Seattle Seahawks': 'SEA', 'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN', 'Washington Commanders': 'WAS', 'Washington Football Team': 'WAS',
    # Pre-rename franchise names
    'Washington Redskins': 'WAS', 'Oakland Raiders': 'LV'
}

# Abbreviation fix-ups (source-specific codes → the codes used in this notebook)
abbr_fix_map = {
    'ARZ': 'ARI', 'TBB': 'TB', 'NEP': 'NE', 'GBP': 'GB', 'KCC': 'KC',
    'SFF': 'SF', 'LAR': 'LA', 'NOS': 'NO', 'JAC': 'JAX', 'LVR': 'LV',
    # Same OAK → LV convention the DFS salary cleaning applies
    'OAK': 'LV'
}


In [None]:
# apply the correct team abbreviations and rename the dataframe
# output: no output

# Work on a copy under the final dataframe name
tm_reg_playoffs_game_lines_odds = tm_game_odds_all_games.copy()

# Both team columns get the same two-step correction:
# full team name → abbreviation first, then the abbreviation fix-ups.
for abbr_column in ('team_abbr', 'opponent_abbr'):
    names_resolved = tm_reg_playoffs_game_lines_odds[abbr_column].replace(full_name_to_abbr)
    tm_reg_playoffs_game_lines_odds[abbr_column] = names_resolved.replace(abbr_fix_map)

In [None]:
# Compare the team abbreviations: python nfl df to the tm odds df - the sets should match!
# output: list of team abbreviations
# Canonical set from the base dataset
baseline_nfl_tm_abbr = set(wr_ids_ngs_pfr_stats_sorted['recent_team'].dropna())

# Abbreviation sets from the cleaned odds dataframe.
# NOTE(review): despite the "home"/"away" wording, these are simply the values
# of the team_abbr and opponent_abbr columns; home/away is tracked in 'home'.
home_odds_lines_abbr = set(tm_reg_playoffs_game_lines_odds['team_abbr'].dropna())
away_odds_lines_abbr = set(tm_reg_playoffs_game_lines_odds['opponent_abbr'].dropna())

# Combined odds abbreviations
odds_abbrs = home_odds_lines_abbr | away_odds_lines_abbr

# Comparison: what each side has that the other lacks
only_in_wr_ids = baseline_nfl_tm_abbr.difference(odds_abbrs)
only_in_odds = odds_abbrs.difference(baseline_nfl_tm_abbr)

# Output
print("✅ Unique team_abbr values (home):", sorted(home_odds_lines_abbr))
print()

print("✅ Unique opponent_abbr values (away):", sorted(away_odds_lines_abbr))
print()

print("❌ Abbreviations only in wr_ids_ngs_pfr_stats_sorted:", only_in_wr_ids)
print("❌ Abbreviations only in tm_reg_playoffs_game_lines_odds:", only_in_odds)

In [None]:
# **** Final Dataframe of team odds ***

# output: final dataframe of team odds and lines with the csv file
# Save the file
# tm_reg_playoffs_game_lines_odds.to_csv("tm_reg_playoffs_game_lines_odds.csv", index=False)

# Display the first 10 rows
display(tm_reg_playoffs_game_lines_odds.head(10))

In [None]:
## End: webscraping sportsoddshistory betting lines data ###

In [None]:
## Begin: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##
# ** Files must be in the local directory ** NFL-20xx-DFS-Dataset.xlsx

In [None]:
# create a helper function to clean the dfs salary data
# no output

def clean_column_dfs(col):
    """
    Clean and flatten a (possibly multi-index) column name from a DFS salary Excel file.

    - Joins the non-empty levels of a tuple with a space
    - Turns newlines and hyphens into spaces
    - Removes the characters ( ) " # $ /
    - Lowercases the result
    - Collapses every run of whitespace to a single space and trims the ends

    Parameters
    ----------
    col : str or tuple
        A single header, or the tuple of header levels pandas produces for
        ``header=[0, 1]``.

    Returns
    -------
    str
        The normalized name, suitable for exact matching.
    """
    if isinstance(col, tuple):
        col = ' '.join(str(x) for x in col if x)

    text = str(col).replace('\n', ' ').replace('-', ' ')
    for ch in '()"#$/':
        text = text.replace(ch, '')

    # split()/join collapses runs of any length; chained replace() calls on
    # fixed-width runs leave double spaces behind (e.g. "a - b" -> "a  b").
    return ' '.join(text.lower().split())

In [None]:
# Read one Excel file as a quick preview of the cleaned headers.
# NOTE(review): the path is hard-coded to the 2024 workbook, so this cell
# fails if that file is not in the working directory; the per-year helper
# below reads every file again on its own.
filepath = 'NFL-2024-DFS-Dataset.xlsx'
# The workbook has two header rows, which pandas loads as tuple column names
dfs_raw = pd.read_excel(filepath, header=[0, 1])
original_row_count = len(dfs_raw)

# Flatten each header tuple into a single cleaned string
dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]
dfs_raw.head()  # Optional preview

In [None]:
# helper function does the following:
# Fanduel and Draft Kings player salary data for all positions (QB, RB, TE, WR, DST)
# creates and combines the dataframes for years 2017 - present 
# performs data validation checks
# output: none

def process_single_year(filepath, year):
    """
    Load one BigDataBall DFS workbook and return its cleaned salary rows
    together with a validation record.

    Parameters
    ----------
    filepath : str
        Path to the Excel workbook; it is read with two header rows.
    year : str or int
        Season label (taken from the file name by the caller). Used for the
        expected week count and echoed in the validation record.

    Returns
    -------
    tuple[pd.DataFrame, dict]
        The dataframe has the columns season, week, player_id, player,
        dk_position, fd_position, dk_salary, fd_salary. Rows without a date
        or without both salaries are dropped. The dict summarises row
        counts, week coverage and the two validation outcomes.

    Raises
    ------
    KeyError
        If one of the expected cleaned column names is not in the workbook.
    """
    # Step 1: Read and clean the headers
    dfs_raw = pd.read_excel(filepath, header=[0, 1])
    original_row_count = len(dfs_raw)
    dfs_raw.columns = [clean_column_dfs(col) for col in dfs_raw.columns]

    # Step 2: Extract only relevant columns using cleaned names
    # (internal name -> cleaned workbook header)
    expected_cols = {
        'player': 'game information player dst',
        'week': 'game information week',
        'date': 'game information date',
        'player_id': 'game information player id',
        'dk_position': 'position draftkings',
        'fd_position': 'position fanduel',
        'dk_salary': 'salary for draftkings classic contests',
        'fd_salary': 'salary for fanduel full roster contests'
    }

    # Subset the dataframe using cleaned column names
    dfs_subset = dfs_raw[list(expected_cols.values())].copy()

    # Rename them to simple identifiers for internal use
    dfs_subset.columns = list(expected_cols.keys())

    dfs_subset['date'] = pd.to_datetime(dfs_subset['date'])

    # Step 3: Defense rows carry a team code in player_id; align those codes
    # with the abbreviations used elsewhere in the notebook
    team_abbreviation_mapping = {
        'NWE': 'NE',
        'SFO': 'SF',
        'OAK': 'LV',
        'KAN': 'KC',
        'TAM': 'TB',
        'NOR': 'NO',
        'LAR': 'LA',
        'GNB': 'GB'
    }
    mask_dst = dfs_subset['dk_position'] == 'DST'
    dfs_subset.loc[mask_dst, 'player_id'] = dfs_subset.loc[mask_dst, 'player_id'].replace(team_abbreviation_mapping)

    # Step 4: Derive the season from the game date.
    # Games played in January or February belong to the season that started
    # the previous calendar year, whatever the week number. Keying this on
    # the week as well mislabels late regular-season games that fall in
    # January (e.g. a week-17 game on 3 Jan 2021 belongs to season 2020).
    # Rows without a date end up with a missing season and are dropped below.
    game_dates = dfs_subset['date']
    played_in_jan_feb = game_dates.dt.month.isin([1, 2])
    dfs_subset['season'] = game_dates.dt.year - played_in_jan_feb.astype(int)

    # Track NaNs before dropping
    season_nulls_before = int(dfs_subset['season'].isna().sum())

    dfs_subset = dfs_subset.dropna(subset=['season'])
    dfs_subset['season'] = dfs_subset['season'].astype(int)

    season_nulls_after = int(dfs_subset['season'].isna().sum())

    print(f"🔎 Season NaN rows dropped: {season_nulls_before}")
    print(f"Remaining NaN rows (should be 0): {season_nulls_after}")

    dfs_subset = dfs_subset.drop(columns=['date'])

    # Step 5: Salaries must be numeric; rows missing either site's salary are dropped
    dfs_subset['dk_salary'] = pd.to_numeric(dfs_subset['dk_salary'], errors='coerce')
    dfs_subset['fd_salary'] = pd.to_numeric(dfs_subset['fd_salary'], errors='coerce')

    # Count the rows about to be dropped independently of the drop itself,
    # so the validation below compares two separately derived numbers.
    rows_before_salary_drop = len(dfs_subset)
    missing_salary_rows = int(dfs_subset[['dk_salary', 'fd_salary']].isna().any(axis=1).sum())

    dfs_subset = dfs_subset.dropna(subset=['dk_salary', 'fd_salary'])
    dfs_subset['dk_salary'] = dfs_subset['dk_salary'].astype(int)
    dfs_subset['fd_salary'] = dfs_subset['fd_salary'].astype(int)
    dfs_subset['week'] = dfs_subset['week'].astype(int)

    dfs_subset = dfs_subset[['season', 'week', 'player_id', 'player', 'dk_position', 'fd_position', 'dk_salary', 'fd_salary']]

    unique_weeks = dfs_subset['week'].nunique()
    min_week = dfs_subset['week'].min()
    max_week = dfs_subset['week'].max()
    # 17 regular-season weeks + 4 playoff rounds through 2020, 18 + 4 afterwards
    expected_weeks = 21 if int(year) <= 2020 else 22

    print(f"\nProcessing file: {filepath}")
    print(f"Original rows in xlsx file: {original_row_count}")
    print(f"Number of players with no salary data found in xlsx: {missing_salary_rows}")
    print(f"Rows in csv file after dropping NaNs: {len(dfs_subset)}")

    # Every original row must be accounted for: kept, dropped for a missing
    # date/season, or dropped for a missing salary.
    rows_accounted_for = len(dfs_subset) + missing_salary_rows + season_nulls_before
    if (rows_before_salary_drop - missing_salary_rows == len(dfs_subset)
            and rows_accounted_for == original_row_count):
        print("✅ Salary Validation passed: Counts match after dropping NaNs.")
        salary_validation = 'Passed'
    else:
        print("❌ Salary Validation failed: Counts mismatch!")
        salary_validation = 'Failed'

    print(f"Weeks detected: {min_week} to {max_week}")
    print(f"Total unique weeks found: {unique_weeks}")
    print("🔔 Reminder: Missing final playoff week (e.g., Super Bowl) is normal if no salary data exists.")

    # One missing week is tolerated (see the reminder above)
    if unique_weeks == expected_weeks or unique_weeks == expected_weeks - 1:
        print(f"✅ Week Validation passed: {unique_weeks} weeks found (expected {expected_weeks}).\n")
        week_validation = 'Passed'
    else:
        print(f"❌ Week Validation failed: {unique_weeks} weeks found, expected {expected_weeks}.\n")
        week_validation = 'Failed'

    return dfs_subset, {
        'year': int(year),
        'original_rows': original_row_count,
        # Total rows removed for any reason (missing date/season or salary)
        'nan_rows': original_row_count - len(dfs_subset),
        'rows_after_drop': len(dfs_subset),
        'min_week': min_week,
        'max_week': max_week,
        'unique_weeks': unique_weeks,
        'expected_weeks': expected_weeks,
        'salary_validation': salary_validation,
        'week_validation': week_validation
    }

In [None]:
# ** dataframe of Fanduel and Draft Kings Salaries FOR all positions ** 

# main control flow implements the helper function 
# output: combined dataframe and csv files of all seasons fanduel draft kings player salary data 
# output: data validation checks

# Find all matching files
file_list = sorted(glob.glob('NFL-*-DFS-Dataset.xlsx'))

# Set to True to (re)write the combined CSV. The backup of an existing file
# is only taken when a write is actually going to happen, so repeated runs
# with the export switched off no longer pile up backup copies.
SAVE_COMBINED_SALARY_CSV = False

# Defined up front so later cells can iterate over them even when no
# workbook was found.
all_years_dfs = []
validation_records = []
file_years = []

# Handle if no files found
if not file_list:
    print("❌ No xlsx files detected.\nPlease download and place the BigDataBall NFL DFS Excel files into the same directory as this Jupyter Notebook file.")
else:
    # Process each file
    for file in file_list:
        year = file.split('-')[1]  # Extract year from filename
        file_years.append(int(year))

        year_df, validation_info = process_single_year(file, year)

        # ** csv file ***
        # Save per-year CSV
        # year_df.to_csv(f'nfl_fd_dk_salary_{year}.csv', index=False)

        # Append to master list
        all_years_dfs.append(year_df)
        validation_records.append(validation_info)

    # Create validation summary DataFrame
    validation_summary_df = pd.DataFrame(validation_records)
    print("\n📋 Validation Summary:")
    display(validation_summary_df)

    # Combine all years into one big dataframe
    nfl_fd_dk_salary_combined = pd.concat(all_years_dfs, ignore_index=True)

    # Determine latest season dynamically
    current_season = max(file_years)

    # Name of the final combined CSV
    final_filename = f'nfl_fd_dk_salary_2017_{current_season}.csv'

    if SAVE_COMBINED_SALARY_CSV:
        # If the file already exists, create a backup before overwriting it
        if os.path.exists(final_filename):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f'nfl_fd_dk_salary_2017_{current_season}_backup_{timestamp}.csv'
            shutil.copy(final_filename, backup_filename)
            print(f"🛡️ Backup created: {backup_filename}")

        # *** csv file ***
        nfl_fd_dk_salary_combined.to_csv(final_filename, index=False)
        print(f"\n✅ Final combined CSV saved as: {final_filename}")
    else:
        print(f"\nℹ️ CSV export is disabled (SAVE_COMBINED_SALARY_CSV = False); {final_filename} was not written.")

    # Display a quick preview
    display(nfl_fd_dk_salary_combined.head())

In [None]:
# Team-defense abbreviation check against a baseline set
def validate_team_defenses(dfs_df, year_label, baseline_set):
    """
    Print whether the defense (DST) ids in ``dfs_df`` equal ``baseline_set``.

    DST rows are those whose ``dk_position`` is 'DST'; their non-null
    ``player_id`` values are compared with the baseline in both directions.
    Nothing is returned; the outcome is printed under ``year_label``.
    """
    is_dst = dfs_df['dk_position'] == 'DST'
    dst_ids = set(dfs_df.loc[is_dst, 'player_id'].dropna().unique())

    extra_in_dfs = dst_ids - baseline_set
    missing_from_dfs = baseline_set - dst_ids

    print(f"\nValidating Team Defenses for {year_label}:")
    if extra_in_dfs or missing_from_dfs:
        print(f"❌ Team mismatch detected for {year_label}")
        print(f"Teams only in DFS: {extra_in_dfs}")
        print(f"Teams only in Baseline: {missing_from_dfs}")
        return
    print(f"✅ Team defenses match for {year_label}")

# Validate each yearly dataframe
# (all_years_dfs and validation_records are parallel lists built by the
# salary-combining cell; info['year'] labels each report)
for df, info in zip(all_years_dfs, validation_records):
    validate_team_defenses(df, year_label=info['year'], baseline_set=baseline_nfl_tm_abbr)

# Validate the full combined dataframe
# NOTE(review): this needs nfl_fd_dk_salary_combined, which the
# salary-combining cell only creates when at least one workbook was found.
validate_team_defenses(nfl_fd_dk_salary_combined, year_label='Combined', baseline_set=baseline_nfl_tm_abbr)

In [None]:
# ** Final WR dataframe of Fanduel and Draft Kings player Salaries ** 

# output: 

# Latest season present in the combined salary data (used in the file name)
current_season = nfl_fd_dk_salary_combined['season'].max()

# Keep only the rows DraftKings lists as WR
is_dk_wr = nfl_fd_dk_salary_combined['dk_position'] == 'WR'
wr_fd_dk_salary_2017_current_df = nfl_fd_dk_salary_combined.loc[is_dk_wr]

# *** csv file ***
wr_csv_filename = f'wr_fd_dk_salary_2017_{current_season}.csv'
wr_fd_dk_salary_2017_current_df.to_csv(wr_csv_filename, index=False)

print(f"✅ WR DFS dataframe created and saved as {wr_csv_filename}")

# Optional: Display a quick preview
display(wr_fd_dk_salary_2017_current_df.head())


In [None]:
## End: Build the dataframe for the DFS Fanduel and Draft Kings salary data from BigDataBall ##

In [None]:
### Begin: Data Normalization and Merge Process ###

In [None]:
# helper function for normalization methods
def normalize_dataframe(df: pd.DataFrame, type_map: dict = None) -> pd.DataFrame:
    """
    Clean and normalize a dataframe for merging.

    Steps, applied to a copy (the input is never modified):
    - Strip whitespace from object columns (non-null values are converted
      to ``str``; missing values stay missing)
    - Replace the NA tokens "N/A", "NA", "-" and "" with ``np.nan``
    - Lowercase common ID/name fields that are present (non-null values
      are converted to ``str``; missing values stay missing)
    - Cast columns to the dtypes given in ``type_map``; a failed cast is
      reported with a warning and the column is left as it was

    Parameters
    ----------
    df : pd.DataFrame
        Frame to normalize.
    type_map : dict, optional
        Mapping of column name -> dtype. Columns not present are skipped.

    Returns
    -------
    pd.DataFrame
        The normalized copy.
    """
    df = df.copy()

    # Strip whitespace from string/object columns.
    # astype(str) would turn NaN/None into the literal text 'nan'/'None',
    # which later behaves like a real value (and a matching merge key), so
    # the original missing positions are restored after stripping.
    for col in df.select_dtypes(include='object').columns:
        try:
            present = df[col].notna()
            df[col] = df[col].astype(str).str.strip().where(present)
        except Exception as e:
            print(f"⚠️ Could not strip column '{col}': {e}")

    # Replace common string-based missing values with np.nan.
    # Done after stripping so padded tokens such as " N/A " are caught too.
    df = df.replace(["N/A", "NA", "-", ""], np.nan)

    # Lowercase likely ID/name fields (if present), again keeping missing values missing
    for key in ['player', 'player_name', 'player_id', 'fantasypros_id', 'FantasyPros_ID', 'merge_name']:
        if key in df.columns:
            present = df[key].notna()
            df[key] = df[key].astype(str).str.lower().where(present)

    # Apply type conversions as defined in type_map
    if type_map:
        for col, dtype in type_map.items():
            if col in df.columns:
                try:
                    df[col] = df[col].astype(dtype)
                except Exception as e:
                    print(f"⚠️ Warning: could not convert column '{col}' to {dtype}. Reason: {e}")

    return df

In [None]:
def clean_percentage_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Convert percentage-style columns (e.g. "66.7%") to floats.

    Works on a copy. Columns listed but not present are skipped. Missing
    values stay missing (NaN). If a column cannot be converted, a warning is
    printed and that column is left exactly as it was.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    columns : list
        Names of the columns to convert.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the convertible columns as float.
    """
    df = df.copy()
    for col in columns:
        if col not in df.columns:
            continue
        try:
            # astype(str) turns None into the text 'None', which float()
            # rejects; remember where values were missing and restore them.
            present = df[col].notna()
            without_sign = df[col].astype(str).str.replace('%', '', regex=False)
            # Assign only after the whole conversion succeeded so a failure
            # cannot leave the column half-converted.
            df[col] = without_sign.where(present).astype(float)
        except Exception as e:
            print(f"⚠️ Could not clean and convert '{col}': {e}")
    return df

In [None]:
def inspect_dataframe_types(df: pd.DataFrame, name: str = "DataFrame") -> None:
    """
    Print the frame's name and shape, then display a per-column table of
    dtype and missing-value count, sorted by column name.
    """
    n_rows, n_cols = df.shape
    print(f"📋 Inspecting: {name}")
    print(f"Shape: {n_rows:,} rows × {n_cols} columns")

    column_summary = (
        df.dtypes.to_frame('dtype')
        .assign(na_count=df.isna().sum())
        .sort_index()
    )
    display(column_summary)

In [None]:
# normalize data types for merging
# Each map below is a column -> dtype dictionary meant to be passed as
# `type_map` to normalize_dataframe, which skips keys that are not columns of
# the frame and prints a warning (leaving the column unchanged) when a cast fails.
# NOTE(review): the int32 targets cannot hold missing values, so a column
# containing NaN will fall into that warning path - confirm this is acceptable.
# NOTE(review): keys here are lower-case ('fantasypros_id', 'season', 'week');
# verify they match the column names of the frames they are applied to.

# WR Baseline type map
type_map_wr = {
    'season': 'int32',
    'week': 'int32',
    'player_id': 'string',
    'fantasypros_id': 'string',
    # Add others as needed...
}

# FP Advanced Stats type map
type_map_fp_adv = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'Player': 'string',
    'G': 'int32',
    'REC': 'float32',
    'YDS': 'float32',
    'YBC': 'float32',
    'AIR': 'float32',
    'YAC': 'float32',
    'YACON': 'float32',
    'BRKTKL': 'float32',
    'TGT': 'float32',
    'CATCHABLE': 'float32',
    'DROP': 'float32',
    'RZ TGT': 'float32',
    '10+ YDS': 'float32',
    '20+ YDS': 'float32',
    '30+ YDS': 'float32',
    '40+ YDS': 'float32',
    '50+ YDS': 'float32',
}

# FP Fantasy Points + Rostered type map
type_map_fp_fpts = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'FPTS': 'float32',
    'ROST': 'string',
}

# FP Redzone type map (red-zone columns carry an "_rz" suffix)
type_map_fp_rz = {
    'fantasypros_id': 'string',
    'season': 'int32',
    'week': 'int32',
    'REC PCT_rz': 'float32',
    'REC_rz': 'int32',
    'TGT PCT_rz': 'float32',
    'TGT_rz': 'int32',
    'Y/R_rz': 'float32',
}


In [None]:
### End: Data Normalization and Merge Process ###

In [None]:
### Begin: Feature Engineering ###

In [None]:
# Remove identifier / helper columns that are not needed downstream
# output: updated dataframe (optional csv file) with the reduced column set
# Columns to remove; names that are absent from the frame are ignored
cols_to_drop = [
    'player_id', 'player_name', 'position_group', 'mfl_id', 'sportradar_id',
    'fantasypros_id', 'pff_id', 'sleeper_id', 'nfl_id', 'espn_id', 'yahoo_id',
    'fleaflicker_id', 'cbs_id', 'pfr_id', 'cfbref_id', 'rotowire_id',
    'rotoworld_id', 'ktc_id', 'stats_id', 'stats_global_id', 'fantasy_data_id',
    'swish_id', 'merge_name', 'player_gsis_id', 'player_first_name',
    'player_last_name', 'player_short_name', 'game_type', 'team', 'opponent',
    'pfr_player_name', 'pfr_player_id', 'player', 'team_abbr_y', 'merge_key'
]

# Snapshot before the drop
initial_shape = wr_nfl_py_fp_odds_salary_merged.shape
initial_col_count = len(wr_nfl_py_fp_odds_salary_merged.columns)

print("Before modification:")
print(f"Shape: {initial_shape}")
print(f"Total Columns: {initial_col_count}")

# Restrict the drop list to columns that are really there
available_cols = wr_nfl_py_fp_odds_salary_merged.columns
cols_existing = [name for name in cols_to_drop if name in available_cols]
wr_nfl_py_fp_odds_salary_merged_mod_cols = wr_nfl_py_fp_odds_salary_merged.drop(columns=cols_existing)

# Snapshot after the drop and the expected arithmetic
final_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
final_col_count = len(wr_nfl_py_fp_odds_salary_merged_mod_cols.columns)
dropped_count = len(cols_existing)
expected_final_col_count = initial_col_count - dropped_count

print("\nAfter modification:")
print(f"Shape: {final_shape}")
print(f"Total Columns After Drop: {final_col_count}")
print(f"Number of Columns Dropped: {dropped_count}")
print(f"Expected Final Column Count: {expected_final_col_count}")
print(f"Column Count Validation Passed: {final_col_count == expected_final_col_count}")

# True only if no name from the drop list survives in the result
all_dropped = not any(
    name in wr_nfl_py_fp_odds_salary_merged_mod_cols.columns for name in cols_to_drop
)
print(f"All Specified Columns Successfully Dropped: {all_dropped}")

# ** csv file **
output_csv_path = "wr_nfl_py_fp_odds_salary_merged_mod_cols.csv"
# wr_nfl_py_fp_odds_salary_merged_mod_cols.to_csv(output_csv_path, index=False)
# print(f"\nModified dataframe saved to {output_csv_path}")

In [None]:
# Move the most-used columns to the front, keeping every other column in
# its existing relative order
priority_cols = [
    'season', 'season_type', 'week', 'name', 'position', 'recent_team',
    'fpts', 'dk_salary', 'fd_salary', 'rost',
    'opponent_abbr', 'home', 'role', 'result', 'score', 'spread', 'over_under'
]

# Snapshot of the frame before reordering
original_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
original_columns = list(wr_nfl_py_fp_odds_salary_merged_mod_cols.columns)

print("Original Shape:", original_shape)
print("Original First 20 Columns:", original_columns[:20])

# Priority columns first, then whatever is left
remaining_cols = [name for name in original_columns if name not in priority_cols]
final_col_order = [*priority_cols, *remaining_cols]

# Selecting by the ordered list raises KeyError if a priority column is absent
wr_nfl_py_fp_odds_salary_merged_mod_cols = wr_nfl_py_fp_odds_salary_merged_mod_cols[final_col_order]

# Snapshot after reordering
new_shape = wr_nfl_py_fp_odds_salary_merged_mod_cols.shape
new_columns = list(wr_nfl_py_fp_odds_salary_merged_mod_cols.columns)

print("\nNew Shape:", new_shape)
print("New First 20 Columns:", new_columns[:20])

# Validation Checks: reordering must neither add nor lose columns or rows
print("\n✅ Shape Integrity:", original_shape == new_shape)
print("✅ Column Count Matches:", len(original_columns) == len(new_columns))
print("✅ All Columns Preserved:", set(original_columns) == set(new_columns))

# ** csv file **
output_reordered_csv = "wr_nfl_py_fp_odds_salary_merged_mod_cols_reordered.csv"
# wr_nfl_py_fp_odds_salary_merged_mod_cols.to_csv(output_reordered_csv, index=False)
# print(f"\nReordered dataframe saved to {output_reordered_csv}")

In [None]:
# Apply Rolling Averages and Aggregates - 3, 5, and 7 week averages
# Output: updated dataframe with aggregates (optional csv file)

# Start from sorted copy of the main DF
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols
    .sort_values(by=['name', 'season', 'week'])
    .reset_index(drop=True)
)

# Feature map: full column -> short prefix
feature_map = {
    'targets': 'tgt',
    'receptions': 'rec',
    'receiving_yards': 'rec_yds',
    'receiving_air_yards': 'rec_air_yards',
    'fpts': 'fpts'
}

windows = [3, 5, 7]

# Apply rolling averages and lag features.
# NOTE(review): groups are keyed on the display name, so two different players
# sharing a name within a season would be pooled - confirm names are unique.
for full_col, short in feature_map.items():
    # Group once per feature; each (player, season) is its own window scope
    grouped = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.groupby(['name', 'season'])

    # Rolling averages of the previous `window` games (shift(1) excludes the
    # current game). transform() returns a result aligned to the frame's own
    # index, so the assignment does not depend on how the grouped output
    # happens to be ordered, nor on rows with missing group keys.
    for window in windows:
        col_name = f"{short}_{window}wk_avg"
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[col_name] = grouped[full_col].transform(
            lambda x, w=window: x.rolling(window=w, min_periods=w).mean().shift(1)
        )

    # Lag feature (1-game lookback); groupby shift is index-aligned as well
    lag_col = f"{short}_lag_1"
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg[lag_col] = grouped[full_col].shift(1)

# Final integrity check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.csv")


In [None]:
# Validation tests of aggregates
# output: there should be no aggregates prior to week 4

# Structural Check — No aggregates in first 3 weeks of a season
def check_early_aggregates(df, cols, earliest_week=4):
    """Report aggregate columns that hold values before `earliest_week`.

    Args:
        df: DataFrame with a 'week' column and the columns named in `cols`.
        cols: list of aggregate column names to inspect.
        earliest_week: first week in which an aggregate value is allowed.

    Returns:
        Series mapping each offending column to its count of non-null values
        in rows where week < earliest_week (empty when nothing is found).
    """
    early = df[df['week'] < earliest_week]
    non_null_counts = early[cols].notna().sum()
    violations = non_null_counts[non_null_counts > 0]

    # Only raise the alarm when something was actually found; previously the
    # warning header was printed on every run, even a clean one.
    if violations.empty:
        print("✅ No aggregates present before week", earliest_week)
    else:
        print("🚨 Aggregates present before week", earliest_week)
        print(violations)
    return violations

# Boundary Check — Rolling aggregates must reset per season
def check_season_boundaries(df, col_prefix):
    """Check that 3-week rolling averages restart at each season change.

    For every player ('name'), the rows where 'season' differs from that
    player's previous row are treated as season boundaries. A non-null
    `<prefix>_3wk_avg` on such a row is reported as a leak.

    Assumes `df` is ordered by name, season, week (as produced by the
    aggregation cell) - the check compares each row with the row before it.

    Args:
        df: DataFrame with 'name', 'season' and the `<prefix>_3wk_avg` columns.
        col_prefix: iterable of short feature prefixes (e.g. 'tgt', 'rec').

    Returns:
        List of `<prefix>_3wk_avg` column names that carry a value on a
        season-boundary row (empty when no leak is found).
    """
    # The boundary rows do not depend on the prefix, so find them once
    # instead of re-running the groupby/diff for every feature.
    # fillna(0) keeps each player's first row out of the boundary set.
    season_transitions = df.groupby(['name'])['season'].diff().fillna(0)
    cross_season_rows = df[season_transitions != 0]

    errors = []
    for short in col_prefix:
        col_name = f'{short}_3wk_avg'
        if cross_season_rows[col_name].notna().any():
            errors.append(col_name)

    if errors:
        print("❌ Rolling values leaked across seasons:", errors)
    else:
        print("✅ No cross-season leakage detected.")
    return errors

# Shape check
def check_shape(df, expected_cols_added):
    """Print the frame's shape and its trailing `expected_cols_added` columns.

    Args:
        df: DataFrame to describe.
        expected_cols_added: number of columns expected at the end of `df`.
            Zero (or a negative value) prints an empty column list; a value
            larger than the column count prints every column.
    """
    # An explicit start offset is used because df.columns[-0:] would return
    # every column rather than none when expected_cols_added is 0.
    n_added = max(expected_cols_added, 0)
    start = max(len(df.columns) - n_added, 0)
    print("✅ Final shape:", df.shape)
    print("✅ Final columns:", df.columns[start:])

# === Apply Checks ===
# Short prefixes shared by every derived rolling / lag column name
stat_prefixes = ['tgt', 'rec', 'rec_yds', 'rec_air_yards', 'fpts']

rolling_cols = []
for short in stat_prefixes:
    for w in [3, 5, 7]:
        rolling_cols.append(f"{short}_{w}wk_avg")
lag_cols = [f"{short}_lag_1" for short in stat_prefixes]

# Run the three checks, in order, against the aggregated dataframe
check_early_aggregates(wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg, rolling_cols)
check_season_boundaries(wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg, stat_prefixes)
check_shape(
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg,
    expected_cols_added=len(rolling_cols) + len(lag_cols),
)

In [None]:
# Trend features (deltas): how far each row's stat sits above or below its
# trailing 3wk, 5wk and 7wk averages
# output: updated dataframe with deltas (optional csv file)

# Work on a copy so the aggregate dataframe is left as it was
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg.copy()

# Raw stat column -> short prefix used in the derived column names
feature_map = dict(
    targets='tgt',
    receptions='rec',
    receiving_yards='rec_yds',
    receiving_air_yards='rec_air_yards',
    fpts='fpts',
)

windows = [3, 5, 7]

# Delta = current value minus the matching rolling average
for stat_col, prefix in feature_map.items():
    current_values = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[stat_col]
    for span in windows:
        trailing_avg = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[f"{prefix}_{span}wk_avg"]
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend[f"{prefix}_{span}wk_delta"] = current_values.sub(trailing_avg)

# Summary and export
delta_cols = []
for prefix in feature_map.values():
    delta_cols.extend(f"{prefix}_{span}wk_delta" for span in windows)
print("✅ Added delta columns:", delta_cols)
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.shape)

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.csv")


In [None]:
# Create boolean (0/1) indicator columns
# output: updated dataframe with booleans (optional csv file)

# Work on a copy of the trend-enhanced dataframe
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend.copy()

# (new flag column, source column, minimum): flag is 1 when source >= minimum.
# The list order is the order in which the columns are appended.
threshold_flags = [
    ('tgt_ge_5', 'targets', 5),
    ('tgt_ge_7', 'targets', 7),
    ('rec_ge_5', 'receptions', 5),
    ('rec_ge_7', 'receptions', 7),
    ('target_share_ge_20', 'target_share', 0.2),
    ('target_share_ge_30', 'target_share', 0.3),
    ('over_100_yds', 'receiving_yards', 100),
    ('double_digit_targets', 'targets', 10),
    ('boom_week', 'fpts', 20),
]
for flag_col, source_col, minimum in threshold_flags:
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool[flag_col] = (
        wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool[source_col].ge(minimum).astype(int)
    )

# The one "below threshold" flag: fewer than 5 fantasy points
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['bust_week'] = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['fpts'].lt(5).astype(int)
)

# 'home' is cast straight to int (assumes it is already boolean / 0-1 - TODO confirm)
wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['is_home_game'] = (
    wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool['home'].astype(int)
)

# Final shape and column check
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.shape)
print("✅ New boolean columns added.")

# ** csv file **
# wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.to_csv(
#     "wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.csv")


In [None]:
# *** Dataframe: usable as the final dataframe, although its name is long ***

# Split the 'over_under' column into two columns: 'O_U' and 'Total'
# output: updated dataframe with O_U and Total columns (optional csv file)

# Work on a copy of the boolean-enriched dataframe
wr_nfl_py_fp_odds_salary_features = wr_nfl_py_fp_odds_salary_merged_mod_cols_sort_agg_trend_bool.copy()

over_under_text = wr_nfl_py_fp_odds_salary_features['over_under']

# First character decides the side: 'O' -> "over", 'U' -> "under" (anything else -> NaN)
side_labels = {'O': 'over', 'U': 'under'}
wr_nfl_py_fp_odds_salary_features['O_U'] = over_under_text.str.get(0).map(side_labels)

# First number found in the text (integer or decimal) becomes the float total
wr_nfl_py_fp_odds_salary_features['Total'] = (
    over_under_text.str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
)

# Validation
print("✅ Final shape:", wr_nfl_py_fp_odds_salary_features.shape)
print("✅ Sample 'O_U' values:", wr_nfl_py_fp_odds_salary_features['O_U'].unique())
print("✅ Sample 'Total' values:", wr_nfl_py_fp_odds_salary_features['Total'].dropna().unique()[:5])

# ** csv file **
# wr_nfl_py_fp_odds_salary_features.to_csv(
#     "wr_nfl_py_fp_odds_salary_features.csv",
#     index=False,
#     float_format="%.2f"
# )
# print("📤 Exported to: wr_nfl_py_fp_odds_salary_features.csv")

In [None]:
### End: Feature Engineering ###

In [None]:
### Begin: Final Dataframe ###

In [None]:
### *** Final Dataframe *** ###

# Give the fully engineered dataframe its final, shorter name
wr_nfl_df_sorted_final = wr_nfl_py_fp_odds_salary_features.copy()

# ** csv file ** (no index column; floats written with two decimals)
wr_nfl_df_sorted_final.to_csv("wr_nfl_df_sorted_final.csv", index=False, float_format="%.2f")

print("✅ Final dataframe saved as 'wr_nfl_df_sorted_final.csv'")
print("✅ Final shape:", wr_nfl_df_sorted_final.shape)

In [None]:
### End: Final Dataframe ###

In [None]:
### Begin: Final Dataframe Summary Statistics ###

In [None]:
# Final Dataframe Summary Statistics
# Output: printed dtype distribution and object columns, plus a csv of the
# missing-value summary

# --- Column count and dtype distribution ---
total_cols = len(wr_nfl_df_sorted_final.columns)
dtypes_summary = wr_nfl_df_sorted_final.dtypes.value_counts().sort_values(ascending=False)
dtypes_percent = dtypes_summary.div(total_cols).mul(100).round(2)

print("🧠 Data Type Distribution (by count and %):")
# dtypes_percent shares dtypes_summary's index, so the two can be walked together
for (dtype_label, n_cols), pct in zip(dtypes_summary.items(), dtypes_percent):
    print(f"{str(dtype_label):15} {n_cols:>3} columns  ({pct:>5.1f}%)")

# --- Object-type columns ---
object_cols = list(wr_nfl_df_sorted_final.select_dtypes(include='object').columns)
print(f"\n🚨 Object-type columns found ({len(object_cols)} total / {total_cols} columns):")
print(object_cols)

# --- Missing values per column: count and % of rows, non-zero only ---
row_count = len(wr_nfl_df_sorted_final)
na_counts = wr_nfl_df_sorted_final.isna().sum()
na_percent = na_counts.div(row_count).mul(100).round(2)
na_summary = (
    pd.DataFrame({'Missing': na_counts, 'Percent': na_percent})
    .loc[lambda summary: summary['Missing'] > 0]
    .sort_values(by='Percent', ascending=False)
)

print(f"\n⚠️ Missing Value Summary (non-zero only) — Top {len(na_summary)} columns:")
display(na_summary)

# ** csv file **
na_summary.to_csv("wr_df_final_summary_stats.csv")

In [None]:
### End: Final Dataframe Summary Statistics ###