In [1]:
# This produces the dataframe for WR

In [2]:
## Notes on the NFL Library ##
# the NFL python library seem to not work on Tuesday probably due to updates (not confirmed)
# unbalanced dataframe - pfr stats start at 2018; all other stats go back to 2017

In [3]:
## REQUIRED ACTIONS - Include in a README doc ## 
# modify the season start date in the 'get_current_week' function
# modify the number of weeks if the NFL adds regular season games to the schedule

In [4]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re
import time
from random import uniform
import io

In [5]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [6]:
# Function to calculate the current week of the NFL season
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)  # Update for the season start
    current_week = ((current_date - season_start_date).days // 7) + 1
    return current_week

# Define the current NFL year, week, and season type
current_year = datetime.now().year
current_week = get_current_week()
seasontype = 2 if current_week <= 18 else 3  # Regular season or playoffs

In [7]:
# define the years to pull
# nfl.import_weekly_data(years, columns, downcast)
def get_year_range(current_year, current_week, start_year=2017):
    if current_week <= 18:  # Regular season
        return list(range(start_year, current_year + 1))
    else:  # Playoffs
        return list(range(start_year, current_year))

# Use the function
years = get_year_range(current_year, current_week)

In [8]:
# define the base columns. 
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name', 
    'position', 'position_group', 'recent_team'
]

In [9]:
# Import the player IDs from nfl.import_ids() - without parameters
ids_data = nfl.import_ids()

# Drop the unnecessary columns
columns_to_drop = [
    'position', 'team', 'birthdate', 'age', 'draft_year', 
    'draft_round', 'draft_pick', 'draft_ovr', 'twitter_username', 
    'height', 'weight', 'college', 'db_season'
]
ids_data = ids_data.drop(columns=columns_to_drop, errors='ignore')

# Display the resulting dataframe for review
# print(f"Columns after dropping unnecessary ones: {ids_data.columns.tolist()}")
# display(ids_data)

In [10]:
# import the weekly data from nfl.import_weekly_data(years, columns, downcast)
weekly_data = nfl.import_weekly_data(
    years=years,
    columns=base_columns
)

# display(weekly_data)

Downcasting floats.


In [11]:
## Output: a dataframe of ALL NFL athletes info and ids since 2017

# Merge the two dataframes on 'player_id' and 'gsis_id'
# Align column names for merging
ids_data = ids_data.rename(columns={'gsis_id': 'player_id'})  
id_dataframe = pd.merge(weekly_data, ids_data, on='player_id', how='inner')

# Assign the resulting dataframe to a variable
all_players_id_data = id_dataframe

# Display the resulting ID dataframe
# display(all_players_id_data)

In [12]:
## Output: a dataframe of NFL WR info and ids since 2017
# extract WR from the dataframe
# Create a new dataframe with only wide receivers
wide_receiver_ids = all_players_id_data[all_players_id_data['position'] == 'WR']

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wide_receiver_ids.shape}")

# Display the resulting dataframe for review
# display(wide_receiver_ids)

Shape of merged dataframe: (17379, 29)


In [13]:
## Output: a dataframe of NFL WR info, ids, and stats since 2017
# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]

# Pull WR-specific columns from weekly data
wr_stats = nfl.import_weekly_data(
    years=years,
    columns=['player_id', 'season', 'week'] + wr_columns  # Include keys for merging
)

# Merge WR-specific stats with wide_receiver_ids
wr_ids_weekly_stats = pd.merge(
    wide_receiver_ids,
    wr_stats,
    on=['player_id', 'season', 'week'],  # Ensure correct alignment
    how='inner'
)

# Display the resulting dataframe for review
print(f"Shape of merged dataframe: {wr_ids_weekly_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wide_receiver_ids.shape[0]}"
)

# display the df
# display(wr_ids_weekly_stats)

Downcasting floats.
Shape of merged dataframe: (17379, 44)
Row count matches: True


In [14]:
# Output: imports the NFL next-generation stats from the nfl python library

# import the next generation stats (NGS) from nfl.import_ngs_data()
# note: ngs starts at week 0 (previous season totals) - not needed so drop those rows

# Pull NGS receiving data for the specified years
ngs_wr_df = nfl.import_ngs_data('receiving', years)

# Exclude rows where 'week' == 0 and filter for 'WR' position in one step
ngs_wr_df = ngs_wr_df[(ngs_wr_df['week'] != 0) & (ngs_wr_df['player_position'] == 'WR')]

# Drop unnecessary columns
ngs_wr_df = ngs_wr_df.drop(columns=['season_type', 'player_position', 'receptions', 'targets','player_jersey_number'], errors='ignore')

# Display the resulting dataframe
print(f"Shape of NGS WR DataFrame after dropping columns: {ngs_wr_df.shape}")
display(ngs_wr_df)

# csv file
# ngs_wr_df.to_csv('ngs_wr_df.csv', index=False)

Shape of NGS WR DataFrame after dropping columns: (8237, 18)


Unnamed: 0,season,week,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name
1725,2017,1,Ryan Grant,WAS,9.936667,2.894592,4.410000,7.154639,66.666667,61.0,0,11.232500,10.072361,1.160139,00-0031068,Ryan,Grant,R.Grant
1726,2017,1,Martavis Bryant,PIT,8.300000,4.122054,12.688333,33.327496,33.333333,14.0,0,0.155000,4.098278,-3.943278,00-0031373,Martavis,Bryant,M.Bryant
1729,2017,1,Jamison Crowder,WAS,7.655000,3.177793,10.540000,19.949707,42.857143,14.0,0,1.450000,1.631897,-0.181897,00-0031941,Jamison,Crowder,J.Crowder
1732,2017,1,Nelson Agholor,PHI,7.423750,2.462620,10.463750,20.274656,75.000000,86.0,1,5.611667,3.262470,2.349197,00-0031549,Nelson,Agholor,N.Agholor
1733,2017,1,John Brown,ARI,7.360000,2.751526,13.422222,28.208481,44.444444,32.0,0,-0.377500,0.961993,-1.339493,00-0031051,John,Brown,J.Brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13307,2024,20,Amon-Ra St. Brown,DET,5.754000,3.568523,8.846000,24.598871,80.000000,137.0,0,7.613750,4.600767,3.012983,00-0036963,Amon-Ra,St. Brown,A.St. Brown
13308,2024,20,Khalil Shakir,BUF,5.355000,3.436815,6.344286,37.266090,85.714286,67.0,0,4.640000,5.033826,-0.393826,00-0037261,Khalil,Shakir,K.Shakir
13309,2024,20,Nico Collins,HOU,5.117500,1.892568,12.631250,39.495798,62.500000,81.0,0,0.686000,0.953086,-0.267086,00-0036554,Nico,Collins,N.Collins
13312,2024,20,Dyami Brown,WAS,4.230000,2.878581,11.838750,57.884122,75.000000,98.0,0,5.431667,5.221507,0.210160,00-0036626,Dyami,Brown,D.Brown


In [15]:
# Output: a dataframe of NFL WR info, ids, weekly stats, and next-gen stats since 2017

# Joins wr_ids_weekly_stats dataframe with ngs_wr_df using the keys 'player_id' and 'player_gsis_id'
# Merge wr_ids_weekly_stats with ngs_wr_df using a left join
wr_ids_ngs_weekly_stats = pd.merge(
    wr_ids_weekly_stats,
    ngs_wr_df,
    left_on=['player_id', 'season', 'week'],  # Keys from wr_ids_weekly_stats
    right_on=['player_gsis_id', 'season', 'week'],  # Keys from ngs_wr_df
    how='left'  # Retain all rows from wr_ids_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_weekly_stats.shape}")
print(f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0]}")

# Display a sample of the merged dataframe
# display(wr_ids_ngs_weekly_stats)

# csv file
# wr_ids_ngs_weekly_stats.to_csv('wr_ids_ngs_weekly_stats.csv', index=False)

Shape of merged dataframe: (17379, 60)
Row count matches: True


In [16]:
# Output: pro-football reference dataframe for receiving data from the python nfl library
# note: PFR data not available before 2018
# there is no position info so the data will pull WR, TE, and RB receiving data

# Define the range of years for PFR data (2018 to the current year)
pfr_years = list(range(2018, current_year))

# import pro-football reference data
pfr_rec_df = nfl.import_weekly_pfr('rec',pfr_years)

# Drop unnecessary columns
pfr_rec_df = pfr_rec_df.drop(
    columns=['game_id','pfr_game_id','receiving_int','rushing_broken_tackles', 
             'passing_drops', 'passing_drop_pct'], errors='ignore')

# display dataframe
print(f"Shape of PFR dataframe: {pfr_rec_df.shape}")
# display(pfr_rec_df)

# csv file
# pfr_rec_df.to_csv('pfr_rec_df.csv', index=False)

Shape of PFR dataframe: (31145, 11)


In [17]:
# Output: a dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-football reference data
# NOTE: unbalanced dataframe - pfr stats start at 2018

# merge the pfr_rec_df with the wr_ids_ngs_weekly_stats dataframe
# match with ids then filter out the unmatched rows as they are likely (TE)
# Step 1: Merge the dataframes with a LEFT JOIN
wr_ids_ngs_pfr_stats = pd.merge(
    wr_ids_ngs_weekly_stats,
    pfr_rec_df,  # Use the full PFR dataframe as position data is unavailable
    left_on=['pfr_id', 'season', 'week'],  # Keys from wr_ids_ngs_weekly_stats
    right_on=['pfr_player_id', 'season', 'week'],  # Keys from pfr_rec_df
    how='left'  # Retain all rows from wr_ids_ngs_weekly_stats
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0]}"
)

# Display the first few rows of the merged dataframe for review
# display(wr_ids_ngs_pfr_stats)

# csv file
# wr_ids_ngs_pfr_stats.to_csv('wr_ids_ngs_pfr_stats.csv', index=False)

Shape of merged dataframe: (17379, 69)
Row count matches: True


In [18]:
# Output: a dataframe of NFL WR info, ids, weekly stats, next-gen stats, and pro-footeball reference data
# Output: Ordered the df by year, week, and receiving yards
# NOTE: unbalanced dataframe - pfr stats start at 2018

# Order the dataframe by season (year), week, and receiving_yards
wr_ids_ngs_pfr_stats_sorted = wr_ids_ngs_pfr_stats.sort_values(
    by=['season', 'week', 'receiving_yards'], 
    ascending=[True, True, False]  # Ascending for season and week, descending for receiving_yards
)

# Display the shape of the resulting dataframe
print(f"Shape of merged dataframe: {wr_ids_ngs_pfr_stats_sorted.shape}")

# Row integrity check
print(
    f"Row count matches: {wr_ids_weekly_stats.shape[0] == wr_ids_ngs_weekly_stats.shape[0] == wr_ids_ngs_pfr_stats.shape[0] == wr_ids_ngs_pfr_stats_sorted.shape[0]}"
)

# Display the sorted dataframe
print("Dataframe sorted by season, week, and receiving_yards:")
display(wr_ids_ngs_pfr_stats_sorted)


# Save the sorted dataframe to a csv
# wr_ids_ngs_pfr_stats_sorted.to_csv('wr_ids_ngs_pfr_stats_sorted.csv', index=False)

Shape of merged dataframe: (17379, 69)
Row count matches: True
Dataframe sorted by season, week, and receiving_yards:


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,mfl_id,sportradar_id,fantasypros_id,pff_id,sleeper_id,nfl_id,espn_id,yahoo_id,fleaflicker_id,cbs_id,pfr_id,cfbref_id,rotowire_id,rotoworld_id,ktc_id,stats_id,stats_global_id,fantasy_data_id,swish_id,name,merge_name,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,player_display_name,team_abbr,avg_cushion,avg_separation,avg_intended_air_yards,percent_share_of_intended_air_yards,catch_percentage,yards,rec_touchdowns,avg_yac,avg_expected_yac,avg_yac_above_expectation,player_gsis_id,player_first_name,player_last_name,player_short_name,game_type,team,opponent,pfr_player_name,pfr_player_id,receiving_broken_tackles,receiving_drop,receiving_drop_pct,receiving_rat
214,2017,REG,1,00-0027793,A.Brown,WR,WR,PIT,9988,16e33176-b73e-49b7-b0aa-c405b47a706e,9808.0,5718.0,536.0,antoniobrown/2508061,13934.0,24171.0,,1272852.0,BrowAn04,antonio-brown-1,6454.0,5698.0,,24171.0,406214.0,11056.0,406214.0,Antonio Brown,antonio brown,11,11,182.0,0,1.0,0.0,90.0,92.0,8.0,10.870283,0,2.022222,0.305556,0.400000,0.738333,Antonio Brown,PIT,4.442727,4.311392,7.329091,35.293088,100.000000,182.0,0.0,9.137273,6.636465,2.500807,00-0027793,Antonio,Brown,A.Brown,,,,,,,,,
645,2017,REG,1,00-0030035,A.Thielen,WR,WR,MIN,11938,2fa2b2da-4aa9-44b5-b27e-56876dfe2ad4,13429.0,8288.0,1689.0,,16460.0,27277.0,,2059362.0,ThieAd00,,8986.0,9054.0,308.0,27277.0,733643.0,15534.0,733643.0,Adam Thielen,adam thielen,9,10,157.0,0,0.0,0.0,105.0,59.0,4.0,6.655833,0,1.495238,0.312500,0.439331,0.776281,Adam Thielen,MIN,3.920000,2.277252,10.673000,42.259265,90.000000,157.0,0.0,6.504444,7.789293,-1.284848,00-0030035,Adam,Thielen,A.Thielen,,,,,,,,,
1673,2017,REG,1,00-0033040,T.Hill,WR,WR,KC,12801,01d8aee3-e1c4-4988-970a-8c0c2d08bd83,15802.0,10799.0,3321.0,,3116406.0,29399.0,,2131163.0,HillTy00,,11222.0,11458.0,286.0,29399.0,823156.0,18082.0,823156.0,Tyreek Hill,tyreek hill,7,8,133.0,1,0.0,0.0,94.0,55.0,5.0,9.070634,0,1.414894,0.235294,0.361538,0.606018,Tyreek Hill,KC,7.078571,5.210156,11.551250,35.725055,87.500000,133.0,1.0,7.945714,10.144182,-2.198467,00-0033040,Tyreek,Hill,T.Hill,,,,,,,,,
245,2017,REG,1,00-0027891,G.Tate,WR,WR,DET,9831,c88d9352-b835-45ed-a909-1cfec09a58bc,9683.0,5585.0,642.0,goldentate/497326,13217.0,24035.0,,1265470.0,TateGo00,golden-tate-1,6389.0,5583.0,,24035.0,400490.0,11611.0,400490.0,Golden Tate,golden tate,10,12,107.0,0,0.0,0.0,68.0,43.0,5.0,-5.569108,0,1.573529,0.307692,0.232877,0.624552,Golden Tate,DET,5.639167,2.270264,5.055833,20.000659,83.333333,107.0,0.0,4.657000,4.412235,0.244765,00-0027891,Golden,Tate,G.Tate,,,,,,,,,
38,2017,REG,1,00-0026035,D.Amendola,WR,WR,NE,9308,973bfe3c-6d0d-4130-a79c-f860650b1da6,9146.0,4717.0,491.0,dannyamendola/2649,11674.0,9037.0,5595.0,516968.0,AmenDa00,,5813.0,4991.0,,9037.0,263758.0,9906.0,263758.0,Danny Amendola,danny amendola,6,7,100.0,0,1.0,0.0,54.0,49.0,5.0,6.142825,0,1.851852,0.194444,0.095238,0.358333,Danny Amendola,NE,2.763333,3.564135,7.581429,9.771861,85.714286,100.0,0.0,8.360000,8.952487,-0.592487,00-0026035,Daniel,Amendola,D.Amendola,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15314,2024,POST,20,00-0031544,A.Cooper,WR,WR,BUF,12175,00f88be8-45f9-4237-b2b8-3271ec790d07,13894.0,9437.0,2309.0,amaricooper/2552487,2976499.0,28392.0,,1984220.0,CoopAm00,amari-cooper-1,10055.0,10310.0,,28392.0,650914.0,16765.0,650914.0,Amari Cooper,amari cooper,0,1,0.0,0,0.0,0.0,7.0,0.0,0.0,-0.684956,0,0.000000,0.047619,0.050360,0.106680,,,,,,,,,,,,,,,,,DIV,BUF,BAL,Amari Cooper,CoopAm00,0.0,0.0,0.0,39.6
15329,2024,POST,20,00-0031549,N.Agholor,WR,WR,BAL,12181,cfb0ff68-51cb-4dad-ba81-f9e019a93a91,13969.0,9453.0,2325.0,nelsonagholor/2552600,2971618.0,28408.0,,1996508.0,AghoNe00,nelson-agholor-1,10132.0,10360.0,207.0,28408.0,691055.0,16781.0,691055.0,Nelson Agholor,nelson agholor,0,1,0.0,0,0.0,0.0,2.0,0.0,0.0,-0.546602,0,0.000000,0.040000,0.007692,0.065385,,,,,,,,,,,,,,,,,DIV,BAL,BUF,Nelson Agholor,AghoNe00,0.0,0.0,0.0,39.6
15562,2024,POST,20,00-0033857,J.Smith-Schuster,WR,WR,KC,13156,9547fbb1-0d4f-4d9e-83b9-e2fa30463bb9,16427.0,11817.0,4040.0,,3120348.0,30175.0,,2139620.0,SmitJu00,juju-smith-1,11877.0,12184.0,,30175.0,835909.0,18883.0,835909.0,JuJu Smith-Schuster,juju smith-schuster,0,1,0.0,0,0.0,0.0,7.0,0.0,0.0,-1.662924,0,0.000000,0.041667,0.045455,0.094318,,,,,,,,,,,,,,,,,DIV,KC,HOU,JuJu Smith-Schuster,SmitJu00,0.0,0.0,0.0,39.6
15819,2024,POST,20,00-0035208,O.Zaccheaus,WR,WR,WAS,14592,d8281390-f081-41e5-b55e-75779536fe94,18864.0,44920.0,6271.0,,3917914.0,32123.0,,2186266.0,ZaccOl01,,13833.0,,609.0,32123.0,883976.0,21142.0,883976.0,Olamide Zaccheaus,olamide zaccheaus,0,2,0.0,0,0.0,0.0,31.0,0.0,0.0,-2.204745,0,0.000000,0.068966,0.183432,0.231851,,,,,,,,,,,,,,,,,DIV,WAS,DET,Olamide Zaccheaus,ZaccOl01,0.0,0.0,0.0,39.6


In [19]:
### The next section scrapes data from fantasypros ###

In [20]:
# Generates a list of (year, week) combinations for web scraping.
# - 2017-2020: Weeks 1-17
# - 2021 and beyond: Weeks 1-18
def generate_year_week_combinations(start_year, end_year):
    year_week_combinations = []
    for year in range(start_year, end_year + 1):
        max_week = 17 if year <= 2020 else 18
        year_week_combinations.extend([(year, week) for week in range(1, max_week + 1)])
    return year_week_combinations

In [21]:
# output a dataframe of weekly WR fantasypros advanced stats week-by-week
# Define the function to scrape weekly WR advanced stats from Fantasy Pros
def wr_scrape_fantasypros_advanced_stats(start_year, end_year):
    
    # Generate year-week combinations
    year_week_combinations = generate_year_week_combinations(start_year, end_year)
    
    # Initialize an empty list to store data
    all_data = []

    for year, week in year_week_combinations:
        try:
            # Construct URL
            url = f"https://www.fantasypros.com/nfl/advanced-stats-wr.php?year={year}&week={week}&range=week&view=pergame"
            print(f"Scraping data for Year: {year}, Week: {week} from {url}")
            
            # Send GET request
            response = requests.get(url)
            response.raise_for_status()

            # Parse HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find table headers and rows
            table_headers = [header.text.strip() for header in soup.find('thead').find_all('th')]
            table_rows = soup.find('tbody').find_all('tr')

            # Parse each row
            for row in table_rows:
                row_data = [cell.text.strip() for cell in row.find_all('td')]
                if len(row_data) == len(table_headers):  # Ensure the row matches headers
                    all_data.append([year, week] + row_data)

            # Random delay to avoid server overload
            time.sleep(uniform(0.3, 0.9))

        except Exception as e:
            print(f"Error occurred while scraping Year: {year}, Week: {week}: {e}")

    # Convert data to DataFrame
    column_names = ['Year', 'Week'] + table_headers
    wr_fp_advanced_stats_df = pd.DataFrame(all_data, columns=column_names)

    # Drop the 'Rank' column if it exists
    if 'Rank' in wr_fp_advanced_stats_df.columns:
        wr_fp_advanced_stats_df = wr_fp_advanced_stats_df.drop(columns=['Rank'])
    
    return wr_fp_advanced_stats_df

# Test for year 2024 only
wr_fp_advanced_stats_df = wr_scrape_fantasypros_advanced_stats(2024, 2024)

# Display the resulting dataframe shape
print(f"Shape of the resulting dataframe: {wr_fp_advanced_stats_df.shape}")

# Display the resulting dataframe
display(wr_fp_advanced_stats_df)

# Save to CSV with the updated name
# wr_fp_advanced_stats_df.to_csv("fantasypros_wr_advanced_stats.csv", index=False)

Scraping data for Year: 2024, Week: 1 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=1&range=week&view=pergame
Scraping data for Year: 2024, Week: 2 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=2&range=week&view=pergame
Scraping data for Year: 2024, Week: 3 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=3&range=week&view=pergame
Scraping data for Year: 2024, Week: 4 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=4&range=week&view=pergame
Scraping data for Year: 2024, Week: 5 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=5&range=week&view=pergame
Scraping data for Year: 2024, Week: 6 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=6&range=week&view=pergame
Scraping data for Year: 2024, Week: 7 from https://www.fantasypros.com/nfl/advanced-stats-wr.php?year=2024&week=7&range=week&view=pergame
Scraping data for Year: 2024, Week

Unnamed: 0,Year,Week,Player,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS
0,2024,1,Jayden Reed (GB),1,4,138,83,104,55,34,1,6,4,0,0,3,3,2,1,1
1,2024,1,Allen Lazard (NYJ),1,6,89,60,91,29,7,0,9,6,0,3,3,2,1,0,0
2,2024,1,Jameson Williams (DET),1,5,121,58,127,63,5,1,9,5,0,1,3,3,2,1,1
3,2024,1,Tyreek Hill (MIA),1,7,130,46,143,84,13,1,12,7,0,2,5,2,1,1,1
4,2024,1,Xavier Worthy (KC),1,2,47,24,37,23,1,0,3,2,0,1,2,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4475,2024,18,Dee Williams (NYG),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4476,2024,18,Amari Cooper (BUF),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4477,2024,18,Cedric Tillman (CLE),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4478,2024,18,Jaelon Darden (SEA),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# Output: a dataframe of weekly WR fantasy points and % rostered data from FantasyPros
# Scrape the weekly WR fantasy points column and the % rostered from the FantasyPros website
def wr_scrape_fantasypros_fpts_rost(start_year, end_year):
    
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        # Build the URL
        url = f"https://www.fantasypros.com/nfl/stats/wr.php?year={year}&week={week}&range=week"
        print(f"Scraping data for Year: {year}, Week: {week} from {url}")
        
        # Request the page
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the table and extract data
        table = soup.find("table", {"id": "data"})
        if table:
            table_str = str(table)
            # Skip multi-level headers using skiprows
            df = pd.read_html(io.StringIO(table_str), header=1)[0]
            df["Year"] = year
            df["Week"] = week
            all_data.append(df)
        
        # Delay to avoid bombarding the server
        time.sleep(uniform(0.3, 0.9))  # Randomized delay

    # Combine all data into a single dataframe
    wr_fpts_perct_rost_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = [
        "Rank", "REC", "TGT", "YDS", "Y/R", "LG", "20+", "TD", 
        "ATT", "YDS.1", "TD.1", "FL", "G", "FPTS/G"
    ]
    wr_fpts_perct_rost_df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

    # Save to CSV
    # wr_fpts_perct_rost_df.to_csv("fantasypros_wr_fpts_perct_rost.csv", index=False)

    # Display shape of the dataframe
    print(f"Shape of WR FPTS and % Rostered dataframe after column removal: {wr_fpts_perct_rost_df.shape}")
    return wr_fpts_perct_rost_df

# Testing the function for the year 2024
wr_fpts_perct_rost_df = wr_scrape_fantasypros_fpts_rost(2024, 2024)
display(wr_fpts_perct_rost_df.head())

Scraping data for Year: 2024, Week: 1 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=1&range=week
Scraping data for Year: 2024, Week: 2 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=2&range=week
Scraping data for Year: 2024, Week: 3 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=3&range=week
Scraping data for Year: 2024, Week: 4 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=4&range=week
Scraping data for Year: 2024, Week: 5 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=5&range=week
Scraping data for Year: 2024, Week: 6 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=6&range=week
Scraping data for Year: 2024, Week: 7 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=7&range=week
Scraping data for Year: 2024, Week: 8 from https://www.fantasypros.com/nfl/stats/wr.php?year=2024&week=8&range=week
Scraping data for Year: 2024, Week: 9 from https://www.fantasypros.com/n

Unnamed: 0,Player,FPTS,ROST,Year,Week
0,Jayden Reed (GB),29.1,88.5%,2024,1
1,Allen Lazard (NYJ),20.9,9.6%,2024,1
2,Jameson Williams (DET),19.4,86.4%,2024,1
3,Tyreek Hill (MIA),19.0,99.2%,2024,1
4,Xavier Worthy (KC),18.8,78.4%,2024,1


In [23]:
# output: a dataframe of weekly WR redzone stats from FantasyPros
# scrape the weekly WR redzone stats from the FantasyPros
def wr_scrape_fantasypros_redzone_stats(start_year, end_year):
    
    # Generate year-week combinations
    week_combinations = generate_year_week_combinations(start_year, end_year)

    # Initialize an empty list to store data
    all_data = []

    for year, week in week_combinations:
        url = f"https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year={year}&range=week&week={week}"
        print(f"Scraping data for Year: {year}, Week: {week} from {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the table and extract data
        table = soup.find("table", {"id": "data"})
        if table:
            table_str = str(table)
            df = pd.read_html(io.StringIO(table_str))[0]  # Wrap table HTML in StringIO

            # Flatten the multi-level column headers
            df.columns = df.columns.droplevel(0)
            
            # Drop the extra header rows (if any)
            df = df[df['Player'] != 'Player']  # Filter out duplicate header rows

            # Add Year and Week columns
            df["Year"] = year
            df["Week"] = week
            all_data.append(df)
        
        # Delay to avoid bombarding the server
        time.sleep(uniform(0.3, 0.9))  # Randomized delay

    # Combine all data into a single dataframe
    wr_redzone_stats_df = pd.concat(all_data, ignore_index=True)

    # Drop unnecessary columns
    columns_to_drop = ['Rank', 'ATT', 'YDS', 'TD', 'PCT', 'FL', 'G', 'FPTS', 'FPTS/G', 'ROST %']
    wr_redzone_stats_df = wr_redzone_stats_df.drop(columns=columns_to_drop, errors='ignore')

    # Rename columns to add 'rz' prefix
    wr_redzone_stats_df = wr_redzone_stats_df.rename(
        columns={
            col: f"{col}_rz" for col in wr_redzone_stats_df.columns
            if col not in ['Player', 'Year', 'Week']
        }
    )

    # Save to CSV
    # wr_redzone_stats_df.to_csv("fantasypros_wr_redzone_stats.csv", index=False)

    # Display shape of the dataframe
    print(f"Shape of WR Red Zone Stats dataframe: {wr_redzone_stats_df.shape}")
    return wr_redzone_stats_df

# Testing the function for the year 2024
wr_redzone_stats_df = wr_scrape_fantasypros_redzone_stats(2024, 2024)
display(wr_redzone_stats_df.head())

Scraping data for Year: 2024, Week: 1 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=1
Scraping data for Year: 2024, Week: 2 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=2
Scraping data for Year: 2024, Week: 3 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=3
Scraping data for Year: 2024, Week: 4 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=4
Scraping data for Year: 2024, Week: 5 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=5
Scraping data for Year: 2024, Week: 6 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=6
Scraping data for Year: 2024, Week: 7 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=7
Scraping data for Year: 2024, Week: 8 from https://www.fantasypros.com/nfl/red-zone-stats/wr.php?year=2024&range=week&week=8


Unnamed: 0,Player,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz,Year,Week
0,Mike Evans (TB),2,3,66.7%,9.0,75.0%,2024,1
1,Stefon Diggs (HOU),3,3,100.0%,3.7,75.0%,2024,1
2,Brian Thomas Jr. (JAC),1,1,100.0%,14.0,100.0%,2024,1
3,Cooper Kupp (LAR),2,3,66.7%,6.5,60.0%,2024,1
4,Mack Hollins (BUF),1,1,100.0%,11.0,25.0%,2024,1


In [24]:
# output: a dataframe of weekly WR advanced stats, fantasy and rosterd data, and redzone from fantasyPros
# Merge WR Advanced Stats with Fantasy Points and % Rostered
wr_adv_fp_rost_merged_df = pd.merge(
    wr_fp_advanced_stats_df,
    wr_fpts_perct_rost_df,
    on=['Player', 'Year', 'Week'],
    how='outer'
)

# Merge the resulting dataframe with Red Zone Stats
wr_adv_fp_rost_rz_merged_df = pd.merge(
    wr_adv_fp_rost_merged_df,
    wr_redzone_stats_df,
    on=['Player', 'Year', 'Week'],
    how='outer'
)

# Display the shape and a sample of the merged dataframe
print(f"Shape of the merged dataframe: {wr_adv_fp_rost_rz_merged_df.shape}")
display(wr_adv_fp_rost_rz_merged_df.head())

# Save the merged dataframe to a CSV file
# wr_adv_fp_rost_rz_merged_df.to_csv('wr_fantasypros.csv', index=False)
# print("Merged dataframe saved as 'wr_fantasypros.csv'.")

Shape of the merged dataframe: (5956, 27)


Unnamed: 0,Year,Week,Player,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,FPTS,ROST,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz
0,2024,1,A.J. Brown (PHI),1,5,119,54,108,65,7,0,10,5,0,1,4,2,1,1,1,17.9,98.7%,0.0,1.0,0.0%,0.0,33.3%
1,2024,2,A.J. Brown (PHI),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,98.7%,,,,,
2,2024,3,A.J. Brown (PHI),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,98.7%,,,,,
3,2024,4,A.J. Brown (PHI),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,98.7%,,,,,
4,2024,5,A.J. Brown (PHI),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,98.7%,,,,,


In [25]:
# output: a dataframe of weekly WR advanced stats, fantasy and rosterd data, and redzone from fantasyPros
# Order the merged dataframe by Year, Week, and YDS
wr_adv_fp_rost_rz_merged_df_sorted = wr_adv_fp_rost_rz_merged_df.sort_values(
    by=['Year', 'Week', 'YDS'], 
    ascending=[True, True, False]  # Ascending for Year and Week, Descending for YDS
)

# Display the shape and a sample of the sorted dataframe
print(f"Shape of the sorted dataframe: {wr_adv_fp_rost_rz_merged_df_sorted.shape}")
display(wr_adv_fp_rost_rz_merged_df_sorted.head())

# Save the sorted dataframe to a CSV file
wr_adv_fp_rost_rz_merged_df_sorted.to_csv('wr_fantasypros_sorted.csv', index=False)
print("Sorted dataframe saved as 'wr_fantasypros_sorted.csv'.")

Shape of the sorted dataframe: (5956, 27)


Unnamed: 0,Year,Week,Player,G,REC,YDS,YBC,AIR,YAC,YACON,BRKTKL,TGT,CATCHABLE,DROP,RZ TGT,10+ YDS,20+ YDS,30+ YDS,40+ YDS,50+ YDS,FPTS,ROST,REC_rz,TGT_rz,REC PCT_rz,Y/R_rz,TGT PCT_rz
1170,2024,1,DJ Turner (LV),1,1,9,-2,-2,11,4,1,1,1,0,0,0,0,0,0,0,0.9,0.6%,,,,,
1887,2024,1,Elijah Moore (CLE),1,3,9,4,22,5,2,0,6,4,2,0,0,0,0,0,0,0.9,20.7%,1.0,1.0,100.0%,4.0,20.0%
162,2024,1,Allen Lazard (NYJ),1,6,89,60,91,29,7,0,9,6,0,3,3,2,1,0,0,20.9,9.6%,2.0,3.0,66.7%,3.5,50.0%
1962,2024,1,George Pickens (PIT),1,6,85,79,99,6,-1,0,7,6,0,0,3,2,2,1,0,7.5,93.4%,,,,,
1494,2024,1,DeVonta Smith (PHI),1,7,84,33,64,51,19,0,8,7,0,1,3,1,0,0,0,8.4,92.6%,1.0,1.0,100.0%,11.0,33.3%


Sorted dataframe saved as 'wr_fantasypros_sorted.csv'.


In [76]:
## Next tasks
# Note: you may want to push the sorting tasks after you do the integrity and split tasks
# Perform integrity checks on the fantasyPros data
# -ask chatGPT for ideas to program a simple integrity check 
# split the team abbreviation from the 'player' column into a separate column 
# merge the dataframe with the wr_ids_ngs_pfr_stats_sorted dataframe