In [1]:
# Model will produce a dataframe and csv file
# 1) at least season average 7tgts per game AND 
# 2) at least 7tgs over last three games AND 
# 3) season average at least 10yds per reception 

In [2]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re

In [3]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [4]:
# Function to get the current NFL week and year
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)  # Reset this date at the start of the NFL season
    current_week = ((current_date - season_start_date).days // 7) + 1
    return current_week

# Set the current NFL year and week
current_year = datetime.now().year
current_week = get_current_week()
seasontype = 2 if current_week <= 18 else 3  # Regular season or playoffs

In [5]:
# Base columns for all positions
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name',
    'position', 'position_group', 'recent_team', 'opponent_team',
    'fantasy_points', 'fantasy_points_ppr'
]

# WR-specific columns (receiving-related)
wr_columns = [
    'receptions', 'targets', 'receiving_yards', 'receiving_tds',
    'receiving_fumbles', 'receiving_fumbles_lost',
    'receiving_air_yards', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa',
    'receiving_2pt_conversions', 'racr', 'target_share',
    'air_yards_share', 'wopr'
]


# Define the WR-specific target-related columns
wr_target_columns = ['targets', 'target_share', 'receptions', 'receiving_yards', 'receiving_tds']

# Combine base columns with WR-specific columns
wr_all_columns = base_columns + wr_columns
wr_tgt_columns = base_columns + wr_target_columns

# Import data for all the specified years
years = list(range(2017, current_year + 1))
nfl_data_all_years = nfl.import_weekly_data(
    years=years,
    columns=wr_all_columns
)

# Filter to show only WR positions
wr_data_all_years = nfl_data_all_years[nfl_data_all_years['position'] == 'WR']

# Select only the relevant columns for WRs
wr_tgt_data_all_years = wr_data_all_years[wr_tgt_columns]

Downcasting floats.


In [6]:
# Function to return all wide receivers and add a boolean column for avg 7+ targets per game
def wr_tgts_all_per_game_current_season():
    # Step 1: Filter the data to include only the current season (current_year)
    wr_current_season = wr_tgt_data_all_years[wr_tgt_data_all_years['season'] == current_year]
    
    # Step 2: Get the most recent week (current NFL week)
    current_nfl_week = get_current_week()

    # Step 3: Group by player to sum cumulative stats (targets, receptions, receiving yards) and calculate games played
    wr_grouped = wr_current_season.groupby(['season', 'player_id', 'player_name', 'recent_team'], as_index=False).agg({
        'targets': 'sum',               # Total targets over the season
        'receptions': 'sum',            # Total receptions over the season
        'receiving_yards': 'sum',       # Total receiving yards over the season
        'target_share': 'mean',         # Average target share over the season
        'week': 'count'                 # Number of games played (count of weeks)
    }).rename(columns={'week': 'games_played'})

    # Step 4: Calculate targets per game, receptions per game, and receiving yards per game
    wr_grouped['targets_per_game'] = wr_grouped['targets'] / wr_grouped['games_played']
    wr_grouped['receptions_per_game'] = wr_grouped['receptions'] / wr_grouped['games_played']
    wr_grouped['receiving_yards_per_game'] = wr_grouped['receiving_yards'] / wr_grouped['games_played']

    # Step 5: Add a boolean column to check if the player has an average of 7+ targets per game
    wr_grouped['avg_7tgs_per_game_bool'] = wr_grouped['targets_per_game'] >= 7

    # Step 6: Add the 'week' column with the current NFL week for all rows using .loc[] to avoid SettingWithCopyWarning
    wr_grouped.loc[:, 'week'] = current_nfl_week

    # Step 7: Round the values of the specified columns
    wr_grouped['targets_per_game'] = wr_grouped['targets_per_game'].round(1)
    wr_grouped['receptions_per_game'] = wr_grouped['receptions_per_game'].round(1)
    wr_grouped['receiving_yards_per_game'] = wr_grouped['receiving_yards_per_game'].round(1)
    wr_grouped['target_share'] = wr_grouped['target_share'].round(3)
    wr_grouped['receiving_yards'] = wr_grouped['receiving_yards'].astype(int)  # Ensure no decimals for receiving_yards

    # Step 8: Reorder the columns as requested
    wr_grouped = wr_grouped[['season', 'week', 'player_id', 'player_name', 'games_played', 'recent_team',
                             'targets', 'targets_per_game', 'target_share', 'receptions', 'receptions_per_game', 
                             'receiving_yards', 'receiving_yards_per_game', 'avg_7tgs_per_game_bool']]

    # Step 9: Sort by targets per game in descending order
    wr_sorted = wr_grouped.sort_values(by='targets_per_game', ascending=False)

    # Return the dataframe with all wide receivers and the avg_7tgs_per_game_bool column
    return wr_sorted

# Run the revised function
wr_all_wr_with_avg_tgts_df = wr_tgts_all_per_game_current_season()

# Count the number of rows in the dataframe
wr_row_count = wr_all_wr_with_avg_tgts_df.shape[0]
print(f"Number of rows in the dataframe: {wr_row_count}")

# Display the first few rows to verify the data
wr_all_wr_with_avg_tgts_df.head()


Number of rows in the dataframe: 193


Unnamed: 0,season,week,player_id,player_name,games_played,recent_team,targets,targets_per_game,target_share,receptions,receptions_per_game,receiving_yards,receiving_yards_per_game,avg_7tgs_per_game_bool
166,2024,8,00-0039337,M.Nabers,5,NYG,60,12.0,0.381,39,7.8,427,85.4,True
34,2024,8,00-0033908,C.Kupp,3,LA,35,11.7,0.298,23,7.7,198,66.0,True
119,2024,8,00-0037740,G.Wilson,7,NYJ,75,10.7,0.286,46,6.6,460,65.7,True
162,2024,8,00-0039067,R.Rice,3,KC,29,9.7,0.328,24,8.0,288,96.0,True
129,2024,8,00-0038117,W.Robinson,7,NYG,67,9.6,0.286,43,6.1,303,43.3,True


In [7]:
# Function to update the baseline with additional boolean criteria and reorder columns
def update_baseline_with_criteria_boolean():
    # Baseline dataframe for the current season
    wr_current_season = wr_tgts_all_per_game_current_season()
    
    # Step 1: Filter data for players who had targets in the last three games they played
    wr_last_3_games = wr_tgt_data_all_years[(wr_tgt_data_all_years['season'] == current_year) & 
                                            (wr_tgt_data_all_years['targets'] > 0)]  # Filter only games where they had targets
    
    # Step 2: Sort the data by player and week, then get the last 3 games for each player
    wr_last_3_games_sorted = wr_last_3_games.sort_values(by=['player_id', 'week'], ascending=[True, False])
    wr_last_3_games_grouped = wr_last_3_games_sorted.groupby('player_id').head(3)  # Get the last 3 games for each player
    
    # Step 3: Group by player and sum the targets over the last 3 games
    wr_last_3_games_grouped_sum = wr_last_3_games_grouped.groupby(['player_id'], as_index=False)['targets'].sum()
    
    # Step 4: Merge last 3 games target data back into the current season dataframe
    wr_current_season = wr_current_season.merge(wr_last_3_games_grouped_sum, on=['player_id'], how='left', suffixes=('', '_last_3_games'))

    # Step 5: Rename 'targets' from the last 3 games to 'total_targets_last_3_games'
    wr_current_season.rename(columns={'targets_last_3_games': 'total_targets_last_3_games'}, inplace=True)
    
    # Step 6: Add 'avg_7tgs_last3_games_bool' (TRUE if total targets over the last 3 games >= 7)
    wr_current_season['avg_7tgs_last3_games_bool'] = wr_current_season['total_targets_last_3_games'] >= 7
    
    # Step 7: Calculate 'avg_7tgs_last3_games' = total_targets_last_3_games / 3
    wr_current_season['avg_7tgs_last3_games'] = (wr_current_season['total_targets_last_3_games'] / 3).round(1)
    
    # Step 8: Fill missing values with False for players who didn't play in the last 3 games
    wr_current_season['avg_7tgs_last3_games_bool'] = wr_current_season['avg_7tgs_last3_games_bool'].fillna(False)
    
    # Step 9: Calculate 'yds_per_reception' = 'receiving_yards' / 'receptions' 
    wr_current_season['yds_per_reception'] = (wr_current_season['receiving_yards'] / wr_current_season['receptions']).round(2)
    
    # Step 10: Add 'season_avg_10yds_per_reception_bool' (TRUE if 'yds_per_reception' >= 10)
    wr_current_season['season_avg_10yds_per_reception_bool'] = wr_current_season['yds_per_reception'] >= 10
    
    # Step 11: Fill missing values with False for players who don't have receptions
    wr_current_season['season_avg_10yds_per_reception_bool'] = wr_current_season['season_avg_10yds_per_reception_bool'].fillna(False)

    # Step 12: Convert 'total_targets_last_3_games' to integer values
    wr_current_season['total_targets_last_3_games'] = wr_current_season['total_targets_last_3_games'].fillna(0).astype(int)
    
    # Step 13: Reorder the columns as requested
    wr_current_season = wr_current_season[['season', 'week', 'player_id', 'player_name', 'games_played', 'recent_team',
                                           'targets', 'targets_per_game', 'avg_7tgs_per_game_bool', 'target_share', 'receptions', 
                                           'receptions_per_game', 'receiving_yards', 'receiving_yards_per_game', 'yds_per_reception',
                                           'season_avg_10yds_per_reception_bool', 'total_targets_last_3_games', 'avg_7tgs_last3_games',
                                           'avg_7tgs_last3_games_bool']]

    # Step 14: Return the updated dataframe with the reordered columns and new calculations
    return wr_current_season

# Call the function to update the baseline with criteria boolean flags
wr_with_criteria_flags_df = update_baseline_with_criteria_boolean()

# Print the row count for verification
print(f"Number of rows in the dataframe after adding criteria flags: {wr_with_criteria_flags_df.shape[0]}")

# Display the first few rows for verification
wr_with_criteria_flags_df.head()


Number of rows in the dataframe after adding criteria flags: 193


Unnamed: 0,season,week,player_id,player_name,games_played,recent_team,targets,targets_per_game,avg_7tgs_per_game_bool,target_share,receptions,receptions_per_game,receiving_yards,receiving_yards_per_game,yds_per_reception,season_avg_10yds_per_reception_bool,total_targets_last_3_games,avg_7tgs_last3_games,avg_7tgs_last3_games_bool
0,2024,8,00-0039337,M.Nabers,5,NYG,60,12.0,True,0.381,39,7.8,427,85.4,10.95,True,35,11.7,True
1,2024,8,00-0033908,C.Kupp,3,LA,35,11.7,True,0.298,23,7.7,198,66.0,8.61,False,35,11.7,True
2,2024,8,00-0037740,G.Wilson,7,NYJ,75,10.7,True,0.286,46,6.6,460,65.7,10.0,True,41,13.7,True
3,2024,8,00-0039067,R.Rice,3,KC,29,9.7,True,0.328,24,8.0,288,96.0,12.0,True,29,9.7,True
4,2024,8,00-0038117,W.Robinson,7,NYG,67,9.6,True,0.286,43,6.1,303,43.3,7.05,False,29,9.7,True


In [8]:
# **Note** does not handle all name variations like St.Brown
# the dataframe will have rows shift right for exceptional name variations
# Function to split the 'player_name' column and handle cases where the split doesn't result in two parts
def split_player_name_column(df):
    # Split the 'player_name' column by '.' into two columns: 'FirstName' and 'LastName'
    # Use expand=True to split into two columns, and handle cases with fewer/more parts by filling with None
    name_split = df['player_name'].str.split('.', expand=True, n=1)
    
    # Ensure that the split produces exactly two columns, and fill missing parts with empty strings
    name_split.columns = ['FirstName', 'LastName']
    name_split['FirstName'] = name_split['FirstName'].fillna('')  # Avoid inplace assignment
    name_split['LastName'] = name_split['LastName'].fillna('')    # Avoid inplace assignment
    
    # Normalize the 'FirstName' and 'LastName' columns: strip spaces and convert to lowercase
    name_split['FirstName'] = name_split['FirstName'].str.strip().str.lower()
    name_split['LastName'] = name_split['LastName'].str.strip().str.lower()

    # Add the split columns back to the dataframe and drop the original 'player_name' column
    df = pd.concat([df, name_split], axis=1)
    df.drop(columns=['player_name'], inplace=True)
    
    # Reorder the columns to match the desired order
    df = df[['season', 'week', 'player_id', 'FirstName', 'LastName', 'games_played', 'recent_team',
             'targets', 'targets_per_game', 'avg_7tgs_per_game_bool', 'target_share', 'receptions', 
             'receptions_per_game', 'receiving_yards', 'receiving_yards_per_game', 'yds_per_reception',
             'season_avg_10yds_per_reception_bool', 'total_targets_last_3_games', 'avg_7tgs_last3_games',
             'avg_7tgs_last3_games_bool']]
    
    return df

# Apply the function to split the player name
wr_with_split_name_df = split_player_name_column(wr_with_criteria_flags_df)

# Display the updated dataframe to verify the split
display(wr_with_split_name_df.head())

# wr_with_split_name_df.to_csv("name_test.csv", index=False)

# Print the row count for verification
print(f"Number of rows in the dataframe after splitting the player name: {wr_with_split_name_df.shape[0]}")


Unnamed: 0,season,week,player_id,FirstName,LastName,games_played,recent_team,targets,targets_per_game,avg_7tgs_per_game_bool,target_share,receptions,receptions_per_game,receiving_yards,receiving_yards_per_game,yds_per_reception,season_avg_10yds_per_reception_bool,total_targets_last_3_games,avg_7tgs_last3_games,avg_7tgs_last3_games_bool
0,2024,8,00-0039337,m,nabers,5,NYG,60,12.0,True,0.381,39,7.8,427,85.4,10.95,True,35,11.7,True
1,2024,8,00-0033908,c,kupp,3,LA,35,11.7,True,0.298,23,7.7,198,66.0,8.61,False,35,11.7,True
2,2024,8,00-0037740,g,wilson,7,NYJ,75,10.7,True,0.286,46,6.6,460,65.7,10.0,True,41,13.7,True
3,2024,8,00-0039067,r,rice,3,KC,29,9.7,True,0.328,24,8.0,288,96.0,12.0,True,29,9.7,True
4,2024,8,00-0038117,w,robinson,7,NYG,67,9.6,True,0.286,43,6.1,303,43.3,7.05,False,29,9.7,True


Number of rows in the dataframe after splitting the player name: 193


In [9]:
# Function to scrape salary changes from FantasyPros website
def scrape_salary_changes():
    # URL of the FantasyPros salary changes page
    url = "https://www.fantasypros.com/daily-fantasy/nfl/fanduel-salary-changes.php"
    
    # Fetch the page content
    response = requests.get(url)
    
    # Check if the page was fetched successfully
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None
    
    # Parse the page content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Locate the table containing the salary changes (assuming it's the first table)
    table = soup.find('table')  # Adjust if necessary based on the page structure
    
    # Extract the table headers
    headers = [header.text for header in table.find_all('th')]
    
    # Extract the table rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]  # Clean up whitespace
        rows.append(cols)
    
    # Create a DataFrame with the scraped data
    salary_changes_df = pd.DataFrame(rows, columns=headers)
        
    # Display the first few rows of the DataFrame
    display(salary_changes_df.head())
    
    return salary_changes_df


In [10]:
# Function to clean and standardize player names
def clean_name(name):
    # List of common suffixes
    suffixes = ['ii', 'iii', 'jr.', 'sr.', 'iv', 'v']
    
    # Handle apostrophes and split the name
    name_parts = name.replace("'", "").split()
    
    # Initialize variables for first and last name
    first_name = name_parts[0]
    last_name = ''
    
    # Handle cases where the name contains a suffix
    if len(name_parts) > 2 and name_parts[-1].lower() in suffixes:
        last_name = ' '.join(name_parts[1:])  # Last name includes the suffix
    else:
        last_name = ' '.join(name_parts[1:])  # Normal last name, possibly hyphenated
    
    # Ensure that hyphenated names stay intact
    if '-' in first_name:
        first_name = first_name.lower()
    if '-' in last_name:
        last_name = last_name.lower()
    
    # Return the cleaned name in lowercase and without spaces
    return {'FirstName': first_name.strip().lower(), 'LastName': last_name.strip().lower()}

In [11]:
# Process salary changes by position without the Suffix column
def process_salary_changes_by_position():
    # Fetch the salary changes data
    salary_changes_df = scrape_salary_changes()

    # Step 1: Extract the 'Player' column and split it into 'Name' and 'Team-Position'
    salary_changes_df[['Name', 'Team-Position']] = salary_changes_df['Player'].str.extract(r'([^\(]+)\((.*)\)')
    
    # Step 2: Further split the 'Name' into 'FirstName' and 'LastName' without handling suffix separately
    name_split = salary_changes_df['Name'].apply(clean_name)
    name_df = pd.DataFrame(name_split.tolist())  # Convert list of dictionaries to DataFrame
    
    # Merge the cleaned name columns (FirstName, LastName) back into the salary DataFrame
    salary_changes_df = pd.concat([salary_changes_df, name_df], axis=1)
    
    # Step 3: Split 'Team-Position' into 'Team' and 'Position'
    salary_changes_df[['Team', 'Position']] = salary_changes_df['Team-Position'].str.extract(r'(\w+)\s*-\s*(\w+)')
    
    # Drop the original 'Player' and 'Team-Position' columns
    salary_changes_df.drop(columns=['Player', 'Team-Position', 'Name'], inplace=True)

    # Normalize the 'FirstName' and 'LastName' columns to lowercase and trim whitespace
    salary_changes_df['FirstName'] = salary_changes_df['FirstName'].str.strip().str.lower()
    salary_changes_df['LastName'] = salary_changes_df['LastName'].str.strip().str.lower()

    # Display the processed salary DataFrame for verification
    display(salary_changes_df.head())
    
    # Step 4: Split the data by position
    df_qb = salary_changes_df[salary_changes_df['Position'] == 'QB']
    df_wr = salary_changes_df[salary_changes_df['Position'] == 'WR']
    df_rb = salary_changes_df[salary_changes_df['Position'] == 'RB']
    
    return df_qb, df_wr, df_rb

# Call the modified process_salary_changes_by_position function
df_qb_salary, df_wr_salary, df_rb_salary = process_salary_changes_by_position()

# Verify the results for wide receivers (df_wr_salary)
display(df_wr_salary.head())

# Count the number of rows in the df_wr_salary dataframe
df_wr_salary_row_count = df_wr_salary.shape[0]

# Print the row count for verification
print(f"Number of rows in the WR salary dataframe: {df_wr_salary_row_count}")

Unnamed: 0,ECR,Player,Kickoff,Opp,This Week,Last Week,Difference
0,-,Riley Sharp (BAL - TE),Sun 1:00PM,@CLE,"$4,000","$4,000",0
1,303,John Metchie III (HOU - WR),Sun 1:00PM,IND,"$4,400","$4,400",0
2,-,Snoop Conner (DAL - RB),Sun 8:20PM,@SF,"$4,000",-,-
3,-,Trey Knox (MIN - TE),Thu 8:15PM,@LAR,"$4,000","$4,000",0
4,-,Kevin Harris (NE - RB),Sun 1:00PM,NYJ,"$4,000","$4,000",0


Unnamed: 0,ECR,Kickoff,Opp,This Week,Last Week,Difference,FirstName,LastName,Team,Position
0,-,Sun 1:00PM,@CLE,"$4,000","$4,000",0,riley,sharp,BAL,TE
1,303,Sun 1:00PM,IND,"$4,400","$4,400",0,john,metchie iii,HOU,WR
2,-,Sun 8:20PM,@SF,"$4,000",-,-,snoop,conner,DAL,RB
3,-,Thu 8:15PM,@LAR,"$4,000","$4,000",0,trey,knox,MIN,TE
4,-,Sun 1:00PM,NYJ,"$4,000","$4,000",0,kevin,harris,NE,RB


Unnamed: 0,ECR,Kickoff,Opp,This Week,Last Week,Difference,FirstName,LastName,Team,Position
1,303,Sun 1:00PM,IND,"$4,400","$4,400",0,john,metchie iii,HOU,WR
6,126,Sun 8:20PM,DAL,"$5,200","$4,800",400,ricky,pearsall,SF,WR
8,-,Sun 1:00PM,@CIN,"$4,000","$4,000",0,joseph,ngata,PHI,WR
9,-,Sun 1:00PM,@HOU,"$4,000","$4,000",0,anthony,gould,IND,WR
11,146,Sun 1:00PM,ATL,"$4,000","$4,200",-200,trey,palmer,TB,WR


Number of rows in the WR salary dataframe: 424


In [12]:
# Function to handle name and team matching during the merge, with dynamic column reordering
def preprocess_for_merge(df_salary, df_wr_baseline):
    # Step 1: Rename salary columns for clarity
    df_salary.rename(columns={'This Week': 'current_week_salary', 'Last Week': 'last_week_salary', 'Difference': 'salary_diff'}, inplace=True)

    # Step 2: Standardize the first letter of the first name
    df_salary['FirstName_Initial'] = df_salary['FirstName'].str[0].str.lower()  # Get the first letter
    df_wr_baseline['FirstName_Initial'] = df_wr_baseline['FirstName'].str[0].str.lower()

    # Step 3: Merge based on LastName, FirstName initial, and Team
    merged_df = df_wr_baseline.merge(
        df_salary[['FirstName_Initial', 'FirstName', 'LastName', 'Team', 'current_week_salary', 'last_week_salary', 'salary_diff']],
        left_on=['FirstName_Initial', 'LastName', 'recent_team'],  # Merge on initials, last name, and team
        right_on=['FirstName_Initial', 'LastName', 'Team'],        # Merge with salary dataframe
        how='left'  # Keep all players from the baseline dataframe
    )

    # Step 4: Dynamically reorder columns, ensuring both 'FirstName_x' and 'FirstName_y' are included for now
    baseline_columns = list(df_wr_baseline.columns)  # Get the original columns from the baseline dataframe
    salary_columns = ['current_week_salary', 'last_week_salary', 'salary_diff']
    
    # Reorder the dataframe to include salary columns at the end
    ordered_columns = baseline_columns + ['FirstName_x', 'FirstName_y'] + salary_columns  # Include both FirstName columns for now

    # Ensure that all columns exist before reordering
    ordered_columns = [col for col in ordered_columns if col in merged_df.columns]

    # Reorder the dataframe
    merged_df = merged_df[ordered_columns]

    return merged_df


# Function to finalize the merge and clean up duplicate columns
def merge_baseline_stats_salary(merged_df):
    # Step 1: Drop 'FirstName_x' and rename 'FirstName_y' to 'FirstName'
    merged_df.drop(columns=['FirstName_x'], inplace=True)
    merged_df.rename(columns={'FirstName_y': 'FirstName'}, inplace=True)

    # Step 2: Reorder columns, placing 'FirstName_Initial' after 'FirstName' and 'Team' after 'LastName'
    key_columns = ['season', 'week', 'player_id', 'FirstName', 'FirstName_Initial', 'LastName', 'recent_team', 'current_week_salary', 'last_week_salary', 'salary_diff']
    
    # Automatically get all other columns not in key_columns
    remaining_columns = [col for col in merged_df.columns if col not in key_columns]

    # Combine the two sets of columns
    ordered_columns = key_columns + remaining_columns

    # Reorder the dataframe
    merged_df = merged_df[ordered_columns]

    return merged_df


# Call the function to perform the merge and adjust the columns dynamically
df_wr_merge_baseline_salary = preprocess_for_merge(df_wr_salary, wr_with_split_name_df)

# Apply the final clean-up to the merged dataframe
wr_merged_baseline_salary_df = merge_baseline_stats_salary(df_wr_merge_baseline_salary)

# Print count of rows in the final dataframe
print(f"Number of rows in the final merged dataframe: {wr_merged_baseline_salary_df.shape[0]}")

# Display the result of the merge to verify
display(wr_merged_baseline_salary_df.head())


Number of rows in the final merged dataframe: 193


Unnamed: 0,season,week,player_id,FirstName,FirstName_Initial,LastName,recent_team,current_week_salary,last_week_salary,salary_diff,games_played,targets,targets_per_game,avg_7tgs_per_game_bool,target_share,receptions,receptions_per_game,receiving_yards,receiving_yards_per_game,yds_per_reception,season_avg_10yds_per_reception_bool,total_targets_last_3_games,avg_7tgs_last3_games,avg_7tgs_last3_games_bool
0,2024,8,00-0039337,malik,m,nabers,NYG,"$8,300","$8,300",0.0,5,60,12.0,True,0.381,39,7.8,427,85.4,10.95,True,35,11.7,True
1,2024,8,00-0033908,,c,kupp,LA,,,,3,35,11.7,True,0.298,23,7.7,198,66.0,8.61,False,35,11.7,True
2,2024,8,00-0037740,garrett,g,wilson,NYJ,"$7,400","$7,400",0.0,7,75,10.7,True,0.286,46,6.6,460,65.7,10.0,True,41,13.7,True
3,2024,8,00-0039067,rashee,r,rice,KC,"$4,000","$4,000",0.0,3,29,9.7,True,0.328,24,8.0,288,96.0,12.0,True,29,9.7,True
4,2024,8,00-0038117,wandale,w,robinson,NYG,"$6,600","$6,600",0.0,7,67,9.6,True,0.286,43,6.1,303,43.3,7.05,False,29,9.7,True


In [13]:
# URLs for each position (cumulative redzone stats)
urls_cumulative = {
    'qb': 'https://www.fantasypros.com/nfl/red-zone-stats/qb.php',
    'rb': 'https://www.fantasypros.com/nfl/red-zone-stats/rb.php?range=full',
    'wr': 'https://www.fantasypros.com/nfl/red-zone-stats/wr.php?range=full',
    'te': 'https://www.fantasypros.com/nfl/red-zone-stats/te.php?range=full'
}

In [14]:
# Function to ensure unique column names
def ensure_unique_column_names(columns):
    seen = {}
    new_columns = []
    for col in columns:
        if col not in seen:
            seen[col] = 0
            new_columns.append(col)
        else:
            seen[col] += 1
            new_columns.append(f"{col}_dup_{seen[col]}")
    return new_columns

In [15]:
# Function to scrape and adjust redzone data for cumulative stats, with better handling of names
def scrape_redzone_stats(position, url):
    # Send request to the URL
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {position}. Status code: {response.status_code}")
        return None
    
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    
    if not table:
        print(f"No table found for {position}.")
        return None
    
    # Extract table headers
    headers = [th.text.strip() for th in table.find_all('th')]
    
    # Extract rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = [td.text.strip() for td in row.find_all('td')]
        rows.append(cols)
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    
    # Clean up column names and data as necessary
    df.columns = df.columns.str.lower().str.replace(' ', '_')  # Clean column names
    
    # Ensure unique column names (to handle duplicates)
    df.columns = ensure_unique_column_names(df.columns)
    
    # Improved regex to handle hyphenated and multi-part names (like "Amon-Ra St. Brown")
    player_info = df['player'].str.extract(r'(?P<firstName>[\w\-\'\.]+)\s+(?P<lastName>[\w\-\.\s]+)\s*\((?P<team>.*)\)')
    
    df = pd.concat([df, player_info], axis=1)
    df.drop(columns=['player'], inplace=True)  # Drop the original 'player' column
    
    # Handle position-specific logic for WR
    if position == 'wr':
        df = df.rename(columns={
            'rec': 'red_zone_rec_rec',
            'tgt': 'red_zone_rec_tgt',
            'rec_pct': 'red_zone_rec_pct',
            'yds': 'red_zone_rec_yds',
            'y/r': 'red_zone_rec_y/r',
            'td': 'red_zone_rec_td',
            'tgt_pct': 'red_zone_rec_tgt_pct'
        })
    
    # Filter the DataFrame to the relevant columns for WR
    selected_columns = ['firstName', 'lastName', 'team', 'red_zone_rec_rec', 'red_zone_rec_tgt', 'red_zone_rec_pct', 
                        'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct']
    
    df = df[selected_columns]
    
    return df

# Test with WR redzone scraping to verify improvements for hyphenated names
df_wr_redzone_cumulative = scrape_redzone_stats('wr', urls_cumulative['wr'])

# Display a few rows of the WR redzone cumulative stats
display(df_wr_redzone_cumulative.head())


Unnamed: 0,firstName,lastName,team,red_zone_rec_rec,red_zone_rec_tgt,red_zone_rec_pct,red_zone_rec_yds,red_zone_rec_y/r,red_zone_rec_td,red_zone_rec_tgt_pct
0,,,,,,,,,,
1,Drake,London,ATL,10.0,12.0,83.3%,83.0,8.3,5.0,63.2%
2,Amon-Ra,St. Brown,DET,8.0,9.0,88.9%,57.0,7.1,3.0,56.3%
3,Stefon,Diggs,HOU,5.0,5.0,100.0%,34.0,6.8,3.0,27.8%
4,Mike,Evans,TB,4.0,8.0,50.0%,22.0,5.5,4.0,40.0%


In [16]:
def merge_baseline_stats_salary_redzone(df_wr_redzone, df_wr_baseline_salary):
    # Step 1: Clean and standardize names for both dataframes
    df_wr_redzone[['FirstName', 'LastName']] = df_wr_redzone.apply(lambda row: pd.Series(clean_name(f"{row['firstName']} {row['lastName']}")), axis=1)
    df_wr_baseline_salary[['FirstName', 'LastName']] = df_wr_baseline_salary.apply(lambda row: pd.Series(clean_name(f"{row['FirstName']} {row['LastName']}")), axis=1)

    # Step 2: Merge based on cleaned FirstName, LastName, and team
    merged_df = df_wr_baseline_salary.merge(
        df_wr_redzone[['FirstName', 'LastName', 'team', 'red_zone_rec_rec', 'red_zone_rec_tgt', 'red_zone_rec_pct', 
                       'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct']],
        left_on=['FirstName', 'LastName', 'recent_team'],  # Use cleaned names and team for merging
        right_on=['FirstName', 'LastName', 'team'],        
        how='left'  # Keep all players from the baseline dataframe
    )

    # Step 3: Handle mismatched rows for cases where the merge failed (red zone stats are NaN)
    unmatched_rows = merged_df[merged_df['red_zone_rec_rec'].isna()]

    if not unmatched_rows.empty:
        print("Handling unmatched rows using fallback logic for similar names...")

        # Reattempt the merge with fallback conditions, e.g., based on name initials or similar names
        unmatched_fallback_merge = df_wr_baseline_salary.merge(
            df_wr_redzone[['FirstName', 'LastName', 'team', 'red_zone_rec_rec', 'red_zone_rec_tgt', 'red_zone_rec_pct', 
                           'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct']],
            left_on=['FirstName', 'LastName', 'recent_team'],  
            right_on=['FirstName', 'LastName', 'team'],
            how='left'
        )

        # Update the merged dataframe with fallback matches
        merged_df.update(unmatched_fallback_merge)

    # Step 4: Remove unnecessary columns like 'team' after the merge
    merged_df.drop(columns=['team'], inplace=True)

    # Step 5: Reorder the columns as requested
    ordered_columns = [
        'season', 'week', 'player_id', 'FirstName_Initial', 'FirstName', 'LastName',
        'games_played', 'current_week_salary', 'last_week_salary', 'salary_diff'
    ] + [col for col in merged_df.columns if col not in ['season', 'week', 'player_id', 'FirstName_Initial', 'FirstName', 'LastName', 'games_played', 'current_week_salary', 'last_week_salary', 'salary_diff']]

    merged_df = merged_df[ordered_columns]

    # Step 6: Save to CSV in the current directory
    merged_df.to_csv("wr_baseline_stats_salary_redzone.csv", index=False)

    # Return the merged dataframe
    return merged_df

# Call the function and assign the result to wr_baseline_stats_salary_redzone_df for future use
wr_baseline_stats_salary_redzone_df = merge_baseline_stats_salary_redzone(df_wr_redzone_cumulative, df_wr_merge_baseline_salary)

# Print confirmation count and display first 5 rows
print(f"Number of rows in the merged dataframe: {wr_baseline_stats_salary_redzone_df.shape[0]}")
display(wr_baseline_stats_salary_redzone_df.head())


Handling unmatched rows using fallback logic for similar names...
Number of rows in the merged dataframe: 193


Unnamed: 0,season,week,player_id,FirstName_Initial,FirstName,LastName,games_played,current_week_salary,last_week_salary,salary_diff,recent_team,targets,targets_per_game,avg_7tgs_per_game_bool,target_share,receptions,receptions_per_game,receiving_yards,receiving_yards_per_game,yds_per_reception,season_avg_10yds_per_reception_bool,total_targets_last_3_games,avg_7tgs_last3_games,avg_7tgs_last3_games_bool,red_zone_rec_rec,red_zone_rec_tgt,red_zone_rec_pct,red_zone_rec_yds,red_zone_rec_y/r,red_zone_rec_td,red_zone_rec_tgt_pct
0,2024,8,00-0039337,m,malik,nabers,5,"$8,300","$8,300",0.0,NYG,60,12.0,True,0.381,39,7.8,427,85.4,10.95,True,35,11.7,True,4.0,6.0,66.7%,11.0,2.8,3.0,33.3%
1,2024,8,00-0033908,c,,kupp,3,,,,LA,35,11.7,True,0.298,23,7.7,198,66.0,8.61,False,35,11.7,True,,,,,,,
2,2024,8,00-0037740,g,garrett,wilson,7,"$7,400","$7,400",0.0,NYJ,75,10.7,True,0.286,46,6.6,460,65.7,10.0,True,41,13.7,True,8.0,14.0,57.1%,41.0,5.1,3.0,51.9%
3,2024,8,00-0039067,r,rashee,rice,3,"$4,000","$4,000",0.0,KC,29,9.7,True,0.328,24,8.0,288,96.0,12.0,True,29,9.7,True,2.0,2.0,100.0%,10.0,5.0,1.0,15.4%
4,2024,8,00-0038117,w,wandale,robinson,7,"$6,600","$6,600",0.0,NYG,67,9.6,True,0.286,43,6.1,303,43.3,7.05,False,29,9.7,True,6.0,10.0,60.0%,35.0,5.8,2.0,55.6%
