In [1]:
# Model will produce a dataframe and csv file
# 1) at least season average 7tgts per game AND 
# 2) at least 7tgs over last three games AND 
# 3) season average at least 10yds per reception 

In [2]:
# import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import glob
from IPython.display import display, HTML
from datetime import datetime
import nfl_data_py as nfl
import os
import re

In [3]:
# Set Pandas options to display all columns in a single row without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [4]:
# Function to get the current NFL week and year
def get_current_week():
    current_date = datetime.now()
    season_start_date = datetime(2024, 9, 4)  # Reset this date at the start of the NFL season
    current_week = ((current_date - season_start_date).days // 7) + 1
    return current_week

# Set the current NFL year and week
current_year = datetime.now().year
current_week = get_current_week()
seasontype = 2 if current_week <= 18 else 3  # Regular season or playoffs

In [5]:
# Base columns for all positions
base_columns = [
    'season', 'season_type', 'week', 'player_id', 'player_name',
    'position', 'position_group', 'recent_team', 'opponent_team',
    'fantasy_points', 'fantasy_points_ppr'
]

# RB-specific columns (rushing-related)
rb_columns = [
    'carries', 'attempts', 'targets', 'target_share', 'receptions', 'rushing_yards', 'receiving_yards', 
    'receiving_yards_after_catch', 'rushing_first_downs', 'receiving_first_downs', 'rushing_tds', 
    'receiving_tds', 'rushing_epa', 'receiving_epa', 'rushing_2pt_conversions', 'receiving_2pt_conversions'
]

# Combine base columns with RB-specific columns
rb_tgt_columns = base_columns + rb_columns

# Import data for all the specified years
years = list(range(2017, current_year + 1))
nfl_data_all_years = nfl.import_weekly_data(
    years=years,
    columns=rb_tgt_columns
)

# Filter to show only RB positions
rb_data_all_years = nfl_data_all_years[nfl_data_all_years['position'] == 'RB']

# Select only the relevant columns for RBs 
rb_tgt_data_all_years = rb_data_all_years[rb_tgt_columns]

# test print
display(rb_tgt_data_all_years.head())

Downcasting floats.


Unnamed: 0,season,season_type,week,player_id,player_name,position,position_group,recent_team,opponent_team,fantasy_points,fantasy_points_ppr,carries,attempts,targets,target_share,receptions,rushing_yards,receiving_yards,receiving_yards_after_catch,rushing_first_downs,receiving_first_downs,rushing_tds,receiving_tds,rushing_epa,receiving_epa,rushing_2pt_conversions,receiving_2pt_conversions
192,2017,REG,1,00-0023500,F.Gore,RB,RB,IND,LA,5.2,6.2,10,0,1,0.047619,1,42.0,10.0,14.0,1.0,0.0,0,0,-0.449218,0.374096,0,0
193,2017,REG,2,00-0023500,F.Gore,RB,RB,IND,ARI,10.6,10.6,14,0,2,0.055556,0,46.0,0.0,0.0,2.0,0.0,1,0,-0.98442,-1.49643,0,0
194,2017,REG,3,00-0023500,F.Gore,RB,RB,IND,CLE,11.7,12.7,25,0,1,0.041667,1,57.0,0.0,5.0,2.0,0.0,1,0,-5.832108,-0.788598,0,0
195,2017,REG,4,00-0023500,F.Gore,RB,RB,IND,SEA,8.0,11.0,12,0,3,0.107143,3,46.0,34.0,43.0,1.0,2.0,0,0,-1.547445,1.7184,0,0
196,2017,REG,5,00-0023500,F.Gore,RB,RB,IND,SF,8.6,11.6,14,0,4,0.121212,3,48.0,38.0,40.0,2.0,1.0,0,0,-4.457448,1.223682,0,0


In [6]:
# Function to return all running backs and add a boolean column for avg 12+ carries or receptions per game
def rb_stats_all_avg12_car_or_rec_per_game_current_season():
    # Step 1: Filter the data to include only the current season (current_year)
    rb_current_season = rb_tgt_data_all_years[rb_tgt_data_all_years['season'] == current_year]
    
    # Step 2: Get the most recent week (current NFL week)
    current_nfl_week = get_current_week()

    # Step 3: Group by player to sum cumulative stats (carries, attempts, receptions) and calculate games played
    rb_grouped = rb_current_season.groupby(['season', 'player_id', 'player_name', 'recent_team'], as_index=False).agg({
        'carries': 'sum',               # Total carries over the season
        'attempts': 'sum',              # Total attempts over the season
        'receptions': 'sum',            # Total receptions over the season
        'rushing_yards': 'sum',         # Total rushing yards over the season
        'week': 'count'                 # Number of games played (count of weeks)
    }).rename(columns={'week': 'games_played'})

    # Step 4: Calculate averages per game for carries, attempts, and receptions
    rb_grouped['carries_per_game'] = rb_grouped['carries'] / rb_grouped['games_played']
    rb_grouped['attempts_per_game'] = rb_grouped['attempts'] / rb_grouped['games_played']
    rb_grouped['receptions_per_game'] = rb_grouped['receptions'] / rb_grouped['games_played']
    
    # Step 5: Calculate rushing yards per game
    rb_grouped['rush_yds_per_game'] = rb_grouped['rushing_yards'] / rb_grouped['games_played']
    
    # Step 6: Calculate average yards per carry
    rb_grouped['avg_yds_per_carry'] = rb_grouped['rushing_yards'] / rb_grouped['carries']

    # Step 7: Add a boolean column to check if the player has an average of 12+ carries OR 12+ receptions per game
    rb_grouped['avg_12_carries_or_receptions_bool'] = (rb_grouped['carries_per_game'] >= 12) | (rb_grouped['receptions_per_game'] >= 12)

    # Step 8: Add the 'week' column with the current NFL week for all rows
    rb_grouped['week'] = current_nfl_week

    # Step 9: Round the values of the specified columns
    rb_grouped['carries_per_game'] = rb_grouped['carries_per_game'].round(1)
    rb_grouped['attempts_per_game'] = rb_grouped['attempts_per_game'].round(1)
    rb_grouped['receptions_per_game'] = rb_grouped['receptions_per_game'].round(1)
    rb_grouped['rush_yds_per_game'] = rb_grouped['rush_yds_per_game'].round(1)
    rb_grouped['avg_yds_per_carry'] = rb_grouped['avg_yds_per_carry'].round(2)  # Average yards per carry rounded to 2 decimals
    rb_grouped['rushing_yards'] = rb_grouped['rushing_yards'].astype(int)  # Ensure no decimals for rushing_yards

    # Step 10: Reorder the columns as requested
    rb_grouped = rb_grouped[['season', 'week', 'player_id', 'player_name', 'games_played', 'recent_team',
                             'carries', 'carries_per_game', 'avg_yds_per_carry', 'attempts', 'attempts_per_game',
                             'receptions', 'receptions_per_game', 'rushing_yards', 'rush_yds_per_game',
                             'avg_12_carries_or_receptions_bool']]

    # Step 11: Sort by rushing_yards in descending order
    rb_sorted = rb_grouped.sort_values(by='rushing_yards', ascending=False)

    # Return the dataframe with all running backs and the avg_12_carries_or_receptions_bool column
    return rb_sorted

# Run the revised function
rb_avg_12_carries_or_receptions_per_game_df = rb_stats_all_avg12_car_or_rec_per_game_current_season()

# Count the number of rows in the dataframe
rb_row_count = rb_avg_12_carries_or_receptions_per_game_df.shape[0]
print(f"Number of rows in the dataframe: {rb_row_count}")

# Display the first few rows to verify the data
display(rb_avg_12_carries_or_receptions_per_game_df.head())


Number of rows in the dataframe: 113


Unnamed: 0,season,week,player_id,player_name,games_played,recent_team,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool
3,2024,9,00-0032764,D.Henry,8,BAL,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True
21,2024,9,00-0034844,S.Barkley,7,PHI,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True
65,2024,9,00-0037525,J.Mason,8,SF,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True
33,2024,9,00-0035700,J.Jacobs,8,GB,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True
11,2024,9,00-0033897,J.Mixon,6,HOU,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True


In [7]:
# Function to calculate average carries or receptions in the last 3 games for RBs
def rb_avg_15_carries_or_receptions_last3_games(rb_avg_12_carries_or_receptions_per_game_df):
    # Step 1: Filter the current season data and ensure non-zero games
    rb_last_3_games = rb_tgt_data_all_years[
        (rb_tgt_data_all_years['season'] == current_year) &
        (rb_tgt_data_all_years['position'] == 'RB') &
        ((rb_tgt_data_all_years['carries'] > 0) | (rb_tgt_data_all_years['receptions'] > 0))
    ]

    # Step 2: Sort data by player_id and week, then get the last 3 games played for each player
    rb_last_3_games_sorted = rb_last_3_games.sort_values(by=['player_id', 'week'], ascending=[True, False])
    rb_last_3_games_grouped = rb_last_3_games_sorted.groupby('player_id').head(3)

    # Step 3: Calculate sum and average of carries and receptions for the last 3 games
    rb_last_3_games_grouped_sum = rb_last_3_games_grouped.groupby('player_id').agg({
        'carries': 'sum',
        'receptions': 'sum'
    }).rename(columns={
        'carries': 'carries_last3_games_played',
        'receptions': 'rec_last3_games_played'
    })

    # Step 4: Calculate the averages based on the last 3 games played
    rb_last_3_games_grouped_sum['avg_carries_last3_games_played'] = rb_last_3_games_grouped_sum['carries_last3_games_played'] / 3
    rb_last_3_games_grouped_sum['avg_rec_last3_games_played'] = rb_last_3_games_grouped_sum['rec_last3_games_played'] / 3

    # Step 5: Create boolean columns based on averages
    rb_last_3_games_grouped_sum['avg_carries_last3_games_played_bool'] = rb_last_3_games_grouped_sum['avg_carries_last3_games_played'] >= 15
    rb_last_3_games_grouped_sum['avg_rec_last3_games_played_bool'] = rb_last_3_games_grouped_sum['avg_rec_last3_games_played'] >= 15
    rb_last_3_games_grouped_sum['avg_15_rec_or_carries_last3_games_bool'] = (
        rb_last_3_games_grouped_sum['avg_carries_last3_games_played_bool'] |
        rb_last_3_games_grouped_sum['avg_rec_last3_games_played_bool']
    )

    # Step 6: Merge last 3 games stats back into the main dataframe
    rb_avg_15_carries_or_receptions_last3_games_df = rb_avg_12_carries_or_receptions_per_game_df.merge(
        rb_last_3_games_grouped_sum,
        on='player_id',
        how='left'
    )

    # Return the new dataframe with the additional columns
    return rb_avg_15_carries_or_receptions_last3_games_df

# Run the function to generate the dataframe for players with averages based on last 3 games played
rb_avg_15_carries_or_receptions_last3_games_df = rb_avg_15_carries_or_receptions_last3_games(rb_avg_12_carries_or_receptions_per_game_df)

# Print the row count and display the first few rows
rb_row_count_last3 = rb_avg_15_carries_or_receptions_last3_games_df.shape[0]
print(f"Number of rows in the dataframe: {rb_row_count_last3}")
display(rb_avg_15_carries_or_receptions_last3_games_df.head())


Number of rows in the dataframe: 113


Unnamed: 0,season,week,player_id,player_name,games_played,recent_team,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool,carries_last3_games_played,rec_last3_games_played,avg_carries_last3_games_played,avg_rec_last3_games_played,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool
0,2024,9,00-0032764,D.Henry,8,BAL,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True,50.0,2.0,16.666667,0.666667,True,False,True
1,2024,9,00-0034844,S.Barkley,7,PHI,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True,57.0,5.0,19.0,1.666667,True,False,True
2,2024,9,00-0037525,J.Mason,8,SF,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True,29.0,3.0,9.666667,1.0,False,False,False
3,2024,9,00-0035700,J.Jacobs,8,GB,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True,55.0,9.0,18.333333,3.0,True,False,True
4,2024,9,00-0033897,J.Mixon,6,HOU,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True,74.0,6.0,24.666667,2.0,True,False,True


In [8]:
# Function to add a boolean column for avg 4+ yards per carry
def rb_avg_4yds_per_carry(rb_avg_15_carries_or_receptions_last3_games_df):
    # Copy the previous dataframe to retain all existing columns
    rb_df = rb_avg_15_carries_or_receptions_last3_games_df.copy()
    
    # Step 1: Add the boolean column 'avg_4yds_per_carry_bool'
    rb_df['avg_4yds_per_carry_bool'] = rb_df['avg_yds_per_carry'] >= 4

    # Step 2: Ensure all columns are in the correct order as specified
    rb_df = rb_df[['season', 'week', 'player_id', 'player_name', 'games_played', 'recent_team', 'carries', 
                   'carries_per_game', 'avg_yds_per_carry', 'attempts', 'attempts_per_game', 'receptions', 
                   'receptions_per_game', 'rushing_yards', 'rush_yds_per_game', 'avg_12_carries_or_receptions_bool', 
                   'carries_last3_games_played', 'rec_last3_games_played', 'avg_carries_last3_games_played', 
                   'avg_rec_last3_games_played', 'avg_carries_last3_games_played_bool', 
                   'avg_rec_last3_games_played_bool', 'avg_15_rec_or_carries_last3_games_bool', 'avg_4yds_per_carry_bool']]

    # Return the updated dataframe
    return rb_df

# Call the new function
rb_avg_4yds_per_carry_df = rb_avg_4yds_per_carry(rb_avg_15_carries_or_receptions_last3_games_df)

# Display the row count and a sample of the dataframe
rb_row_count = rb_avg_4yds_per_carry_df.shape[0]
print(f"Number of rows in the dataframe: {rb_row_count}")
display(rb_avg_4yds_per_carry_df.head())


Number of rows in the dataframe: 113


Unnamed: 0,season,week,player_id,player_name,games_played,recent_team,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool,carries_last3_games_played,rec_last3_games_played,avg_carries_last3_games_played,avg_rec_last3_games_played,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool,avg_4yds_per_carry_bool
0,2024,9,00-0032764,D.Henry,8,BAL,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True,50.0,2.0,16.666667,0.666667,True,False,True,True
1,2024,9,00-0034844,S.Barkley,7,PHI,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True,57.0,5.0,19.0,1.666667,True,False,True,True
2,2024,9,00-0037525,J.Mason,8,SF,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True,29.0,3.0,9.666667,1.0,False,False,False,True
3,2024,9,00-0035700,J.Jacobs,8,GB,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True,55.0,9.0,18.333333,3.0,True,False,True,True
4,2024,9,00-0033897,J.Mixon,6,HOU,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True,74.0,6.0,24.666667,2.0,True,False,True,True


In [9]:
# Function to split the 'player_name' column and handle cases where the split doesn't result in two parts
def split_player_name_column_rb(df):
    # Split the 'player_name' column by '.' into two columns: 'FirstName' and 'LastName'
    # Use expand=True to split into two columns, and handle cases with fewer/more parts by filling with None
    name_split = df['player_name'].str.split('.', expand=True, n=1)
    
    # Ensure that the split produces exactly two columns, and fill missing parts with empty strings
    name_split.columns = ['FirstName', 'LastName']
    name_split['FirstName'] = name_split['FirstName'].fillna('')  # Avoid inplace assignment
    name_split['LastName'] = name_split['LastName'].fillna('')    # Avoid inplace assignment
    
    # Normalize the 'FirstName' and 'LastName' columns: strip spaces and convert to lowercase
    name_split['FirstName'] = name_split['FirstName'].str.strip().str.lower()
    name_split['LastName'] = name_split['LastName'].str.strip().str.lower()

    # Add the split columns back to the dataframe and drop the original 'player_name' column
    df = pd.concat([df, name_split], axis=1)
    df.drop(columns=['player_name'], inplace=True)
    
    # Reorder the columns to match the desired order
    df = df[['season', 'week', 'player_id', 'FirstName', 'LastName', 'games_played', 'recent_team',
             'carries', 'carries_per_game', 'avg_yds_per_carry', 'attempts', 'attempts_per_game', 
             'receptions', 'receptions_per_game', 'rushing_yards', 'rush_yds_per_game', 
             'avg_12_carries_or_receptions_bool', 'carries_last3_games_played', 'rec_last3_games_played', 
             'avg_carries_last3_games_played', 'avg_rec_last3_games_played', 
             'avg_carries_last3_games_played_bool', 'avg_rec_last3_games_played_bool', 
             'avg_15_rec_or_carries_last3_games_bool', 'avg_4yds_per_carry_bool']]

    # Sort by 'rushing_yards' in descending order
    df = df.sort_values(by='rushing_yards', ascending=False)

    return df

# Apply the function to split the player name and sort
rb_with_split_name_df = split_player_name_column_rb(rb_avg_4yds_per_carry_df)

# Display the specified range of rows to verify
display(rb_with_split_name_df.iloc[:5])  # Adjust the row range as needed for review

# Export to CSV for manual verification
# rb_with_split_name_df.to_csv("rb_name_test.csv", index=False)

# Print the row count for verification
print(f"Number of rows in the dataframe after splitting the player name: {rb_with_split_name_df.shape[0]}")


Unnamed: 0,season,week,player_id,FirstName,LastName,games_played,recent_team,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool,carries_last3_games_played,rec_last3_games_played,avg_carries_last3_games_played,avg_rec_last3_games_played,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool,avg_4yds_per_carry_bool
0,2024,9,00-0032764,d,henry,8,BAL,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True,50.0,2.0,16.666667,0.666667,True,False,True,True
1,2024,9,00-0034844,s,barkley,7,PHI,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True,57.0,5.0,19.0,1.666667,True,False,True,True
2,2024,9,00-0037525,j,mason,8,SF,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True,29.0,3.0,9.666667,1.0,False,False,False,True
3,2024,9,00-0035700,j,jacobs,8,GB,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True,55.0,9.0,18.333333,3.0,True,False,True,True
4,2024,9,00-0033897,j,mixon,6,HOU,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True,74.0,6.0,24.666667,2.0,True,False,True,True


Number of rows in the dataframe after splitting the player name: 113


In [10]:
# Function to scrape salary changes from FantasyPros website
def scrape_salary_changes():
    # URL of the FantasyPros salary changes page
    url = "https://www.fantasypros.com/daily-fantasy/nfl/fanduel-salary-changes.php"
    
    # Fetch the page content
    response = requests.get(url)
    
    # Check if the page was fetched successfully
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None
    
    # Parse the page content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Locate the table containing the salary changes (assuming it's the first table)
    table = soup.find('table')  # Adjust if necessary based on the page structure
    
    # Extract the table headers
    headers = [header.text for header in table.find_all('th')]
    
    # Extract the table rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]  # Clean up whitespace
        rows.append(cols)
    
    # Create a DataFrame with the scraped data
    salary_changes_df = pd.DataFrame(rows, columns=headers)
        
    # Display the first few rows of the DataFrame
    display(salary_changes_df.head())
    
    return salary_changes_df


In [11]:
# Function to clean and standardize player names
def clean_name(name):
    # List of common suffixes
    suffixes = ['ii', 'iii', 'jr.', 'sr.', 'iv', 'v']
    
    # Handle apostrophes and split the name
    name_parts = name.replace("'", "").split()
    
    # Initialize variables for first and last name
    first_name = name_parts[0]
    last_name = ''
    
    # Handle cases where the name contains a suffix
    if len(name_parts) > 2 and name_parts[-1].lower() in suffixes:
        last_name = ' '.join(name_parts[1:])  # Last name includes the suffix
    else:
        last_name = ' '.join(name_parts[1:])  # Normal last name, possibly hyphenated
    
    # Ensure that hyphenated names stay intact
    if '-' in first_name:
        first_name = first_name.lower()
    if '-' in last_name:
        last_name = last_name.lower()
    
    # Return the cleaned name in lowercase and without spaces
    return {'FirstName': first_name.strip().lower(), 'LastName': last_name.strip().lower()}

In [12]:
# Process salary changes by position without the Suffix column
def process_salary_changes_by_position():
    # Fetch the salary changes data
    salary_changes_df = scrape_salary_changes()

    # Step 1: Extract the 'Player' column and split it into 'Name' and 'Team-Position'
    salary_changes_df[['Name', 'Team-Position']] = salary_changes_df['Player'].str.extract(r'([^\(]+)\((.*)\)')
    
    # Step 2: Further split the 'Name' into 'FirstName' and 'LastName' without handling suffix separately
    name_split = salary_changes_df['Name'].apply(clean_name)
    name_df = pd.DataFrame(name_split.tolist())  # Convert list of dictionaries to DataFrame
    
    # Merge the cleaned name columns (FirstName, LastName) back into the salary DataFrame
    salary_changes_df = pd.concat([salary_changes_df, name_df], axis=1)
    
    # Step 3: Split 'Team-Position' into 'Team' and 'Position'
    salary_changes_df[['Team', 'Position']] = salary_changes_df['Team-Position'].str.extract(r'(\w+)\s*-\s*(\w+)')
    
    # Drop the original 'Player' and 'Team-Position' columns
    salary_changes_df.drop(columns=['Player', 'Team-Position', 'Name'], inplace=True)

    # Normalize the 'FirstName' and 'LastName' columns to lowercase and trim whitespace
    salary_changes_df['FirstName'] = salary_changes_df['FirstName'].str.strip().str.lower()
    salary_changes_df['LastName'] = salary_changes_df['LastName'].str.strip().str.lower()

    # Display the processed salary DataFrame for verification
    display(salary_changes_df.head())
    
    # Step 4: Split the data by position
    df_qb = salary_changes_df[salary_changes_df['Position'] == 'QB']
    df_wr = salary_changes_df[salary_changes_df['Position'] == 'WR']
    df_rb = salary_changes_df[salary_changes_df['Position'] == 'RB']
    
    return df_qb, df_wr, df_rb

# Call the modified process_salary_changes_by_position function
df_qb_salary, df_wr_salary, df_rb_salary = process_salary_changes_by_position()

# Verify the results for wide receivers (df_wr_salary)
display(df_rb_salary.head())

# Count the number of rows in the df_wr_salary dataframe
df_rb_salary_row_count = df_rb_salary.shape[0]

# Print the row count for verification
print(f"Number of rows in the RB salary dataframe: {df_rb_salary_row_count}")

Unnamed: 0,ECR,Player,Kickoff,Opp,This Week,Last Week,Difference
0,-,Riley Sharp (BAL - TE),Sun 1:00PM,DEN,"$4,000","$4,000",0
1,166,John Metchie III (HOU - WR),Thu 8:15PM,@NYJ,"$5,600","$4,400",1200
2,-,Snoop Conner (DAL - RB),Sun 1:00PM,@ATL,"$4,000","$4,000",0
3,-,Trey Knox (MIN - TE),Sun 8:20PM,IND,"$4,000","$4,000",0
4,-,Kevin Harris (NE - RB),Sun 1:00PM,@TEN,"$4,000","$4,000",0


Unnamed: 0,ECR,Kickoff,Opp,This Week,Last Week,Difference,FirstName,LastName,Team,Position
0,-,Sun 1:00PM,DEN,"$4,000","$4,000",0,riley,sharp,BAL,TE
1,166,Thu 8:15PM,@NYJ,"$5,600","$4,400",1200,john,metchie iii,HOU,WR
2,-,Sun 1:00PM,@ATL,"$4,000","$4,000",0,snoop,conner,DAL,RB
3,-,Sun 8:20PM,IND,"$4,000","$4,000",0,trey,knox,MIN,TE
4,-,Sun 1:00PM,@TEN,"$4,000","$4,000",0,kevin,harris,NE,RB


Unnamed: 0,ECR,Kickoff,Opp,This Week,Last Week,Difference,FirstName,LastName,Team,Position
2,-,Sun 1:00PM,@ATL,"$4,000","$4,000",0,snoop,conner,DAL,RB
4,-,Sun 1:00PM,@TEN,"$4,000","$4,000",0,kevin,harris,NE,RB
9,-,Sun 4:05PM,JAC,"$4,000","$4,000",0,tyrion,davis-price,PHI,RB
12,229,Sun 1:00PM,WAS,"$5,000","$4,300",700,eric,gray,NYG,RB
15,-,Mon 8:15PM,@KC,"$4,000","$4,000",0,d.j.,williams,TB,RB


Number of rows in the RB salary dataframe: 205


In [13]:
# Function to handle name and team matching during the merge, with dynamic column reordering
def preprocess_for_merge(df_salary, df_wr_baseline):
    # Step 1: Rename salary columns for clarity
    df_salary.rename(columns={'This Week': 'current_week_salary', 'Last Week': 'last_week_salary', 'Difference': 'salary_diff'}, inplace=True)

    # Step 2: Standardize the first letter of the first name
    df_salary['FirstName_Initial'] = df_salary['FirstName'].str[0].str.lower()  # Get the first letter
    df_wr_baseline['FirstName_Initial'] = df_wr_baseline['FirstName'].str[0].str.lower()

    # Step 3: Merge based on LastName, FirstName initial, and Team
    merged_df = df_wr_baseline.merge(
        df_salary[['FirstName_Initial', 'FirstName', 'LastName', 'Team', 'current_week_salary', 'last_week_salary', 'salary_diff']],
        left_on=['FirstName_Initial', 'LastName', 'recent_team'],  # Merge on initials, last name, and team
        right_on=['FirstName_Initial', 'LastName', 'Team'],        # Merge with salary dataframe
        how='left'  # Keep all players from the baseline dataframe
    )

    # Step 4: Dynamically reorder columns, ensuring both 'FirstName_x' and 'FirstName_y' are included for now
    baseline_columns = list(df_wr_baseline.columns)  # Get the original columns from the baseline dataframe
    salary_columns = ['current_week_salary', 'last_week_salary', 'salary_diff']
    
    # Reorder the dataframe to include salary columns at the end
    ordered_columns = baseline_columns + ['FirstName_x', 'FirstName_y'] + salary_columns  # Include both FirstName columns for now

    # Ensure that all columns exist before reordering
    ordered_columns = [col for col in ordered_columns if col in merged_df.columns]

    # Reorder the dataframe
    merged_df = merged_df[ordered_columns]

    return merged_df


# Function to finalize the merge and clean up duplicate columns
def merge_baseline_stats_salary(merged_df):
    # Step 1: Drop 'FirstName_x' and rename 'FirstName_y' to 'FirstName'
    merged_df.drop(columns=['FirstName_x'], inplace=True)
    merged_df.rename(columns={'FirstName_y': 'FirstName'}, inplace=True)

    # Step 2: Reorder columns, placing 'FirstName_Initial' after 'FirstName' and 'Team' after 'LastName'
    key_columns = ['season', 'week', 'player_id', 'FirstName', 'FirstName_Initial', 'LastName', 'recent_team', 'current_week_salary', 'last_week_salary', 'salary_diff']
    
    # Automatically get all other columns not in key_columns
    remaining_columns = [col for col in merged_df.columns if col not in key_columns]

    # Combine the two sets of columns
    ordered_columns = key_columns + remaining_columns

    # Reorder the dataframe
    merged_df = merged_df[ordered_columns]

    return merged_df


# Call the function to perform the merge and adjust the columns dynamically for running backs
df_rb_merge_baseline_salary = preprocess_for_merge(df_rb_salary, rb_with_split_name_df)

# Apply the final clean-up to the merged dataframe
rb_merged_baseline_salary_df = merge_baseline_stats_salary(df_rb_merge_baseline_salary)

# Print count of rows in the final dataframe
print(f"Number of rows in the final merged dataframe: {rb_merged_baseline_salary_df.shape[0]}")

# Display the result of the merge to verify
display(rb_merged_baseline_salary_df.head())

# Save the final DataFrame to a CSV file for manual inspection
# rb_merged_baseline_salary_df.to_csv("rb_merged_baseline_salary.csv", index=False)


Number of rows in the final merged dataframe: 113


Unnamed: 0,season,week,player_id,FirstName,FirstName_Initial,LastName,recent_team,current_week_salary,last_week_salary,salary_diff,games_played,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool,carries_last3_games_played,rec_last3_games_played,avg_carries_last3_games_played,avg_rec_last3_games_played,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool,avg_4yds_per_carry_bool
0,2024,9,00-0032764,derrick,d,henry,BAL,"$9,000","$9,200",-200.0,8,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True,50.0,2.0,16.666667,0.666667,True,False,True,True
1,2024,9,00-0034844,saquon,s,barkley,PHI,"$9,400","$9,100",300.0,7,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True,57.0,5.0,19.0,1.666667,True,False,True,True
2,2024,9,00-0037525,,j,mason,SF,,,,8,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True,29.0,3.0,9.666667,1.0,False,False,False,True
3,2024,9,00-0035700,josh,j,jacobs,GB,"$7,400","$7,500",-100.0,8,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True,55.0,9.0,18.333333,3.0,True,False,True,True
4,2024,9,00-0033897,joe,j,mixon,HOU,"$9,200","$9,000",200.0,6,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True,74.0,6.0,24.666667,2.0,True,False,True,True


In [14]:
# URLs for each position (cumulative redzone stats)
urls_cumulative = {
    'qb': 'https://www.fantasypros.com/nfl/red-zone-stats/qb.php',
    'rb': 'https://www.fantasypros.com/nfl/red-zone-stats/rb.php?range=full',
    'wr': 'https://www.fantasypros.com/nfl/red-zone-stats/wr.php?range=full',
    'te': 'https://www.fantasypros.com/nfl/red-zone-stats/te.php?range=full'
}

In [15]:
# Function to ensure unique column names
def ensure_unique_column_names(columns):
    seen = {}
    new_columns = []
    for col in columns:
        if col not in seen:
            seen[col] = 0
            new_columns.append(col)
        else:
            seen[col] += 1
            new_columns.append(f"{col}_dup_{seen[col]}")
    return new_columns

In [16]:
# Function to scrape and adjust redzone data for cumulative stats, handling duplicate column names based on position
def scrape_redzone_stats(position, url):
    # Send request to the URL
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {position}. Status code: {response.status_code}")
        return None
    
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    
    if not table:
        print(f"No table found for {position}.")
        return None
    
    # Extract table headers
    headers = [th.text.strip() for th in table.find_all('th')]
    
    # Extract rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = [td.text.strip() for td in row.find_all('td')]
        rows.append(cols)
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    
    # Clean up column names and data as necessary
    df.columns = df.columns.str.lower().str.replace(' ', '_')  # Clean column names
    
    # Ensure unique column names (to handle duplicates)
    df.columns = ensure_unique_column_names(df.columns)
    
    # Improved regex to handle hyphenated and multi-part names (like "Amon-Ra St. Brown")
    player_info = df['player'].str.extract(r'(?P<firstName>[\w\-\'\.]+)\s+(?P<lastName>[\w\-\.\s]+)\s*\((?P<team>.*)\)')
    
    df = pd.concat([df, player_info], axis=1)
    df.drop(columns=['player'], inplace=True)  # Drop the original 'player' column
    
    # Step 1: Rename columns based on their positions in the dataframe
    if position == 'rb':
        # Assuming the order is consistent as shown in your screenshot:
        df.rename(columns={
            df.columns[1]: 'red_zone_rush_att',
            df.columns[2]: 'red_zone_rush_yds',
            df.columns[3]: 'red_zone_rush_y/a',
            df.columns[4]: 'red_zone_rush_td',
            df.columns[5]: 'red_zone_rush_pct',
            df.columns[6]: 'red_zone_rec_rec',
            df.columns[7]: 'red_zone_rec_tgt',
            df.columns[8]: 'red_zone_rec_pct',
            df.columns[9]: 'red_zone_rec_yds',
            df.columns[10]: 'red_zone_rec_y/r',
            df.columns[11]: 'red_zone_rec_td',
            df.columns[12]: 'red_zone_rec_tgt_pct'
        }, inplace=True)
    
    # Step 2: Select only the relevant columns for RB
    selected_columns = [
        'firstName', 'lastName', 'team', 'red_zone_rush_att', 'red_zone_rush_yds', 
        'red_zone_rush_y/a', 'red_zone_rush_td', 'red_zone_rush_pct', 'red_zone_rec_rec', 
        'red_zone_rec_tgt', 'red_zone_rec_pct', 'red_zone_rec_yds', 'red_zone_rec_y/r', 
        'red_zone_rec_td', 'red_zone_rec_tgt_pct'
    ]
    
    df = df[selected_columns]
    
    return df

# Test with RB redzone scraping to verify inclusion of both rushing and receiving stats
df_rb_redzone_cumulative = scrape_redzone_stats('rb', urls_cumulative['rb'])
display(df_rb_redzone_cumulative.head())


Unnamed: 0,firstName,lastName,team,red_zone_rush_att,red_zone_rush_yds,red_zone_rush_y/a,red_zone_rush_td,red_zone_rush_pct,red_zone_rec_rec,red_zone_rec_tgt,red_zone_rec_pct,red_zone_rec_yds,red_zone_rec_y/r,red_zone_rec_td,red_zone_rec_tgt_pct
0,,,,,,,,,,,,,,,
1,Kyren,Williams,LAR,37.0,104.0,2.8,8.0,90.2%,6.0,7.0,85.7%,38.0,6.3,2.0,100.0%
2,Derrick,Henry,BAL,22.0,40.0,1.8,7.0,88.0%,3.0,3.0,100.0%,22.0,7.3,2.0,33.3%
3,David,Montgomery,DET,21.0,74.0,3.5,7.0,50.0%,1.0,1.0,100.0%,3.0,3.0,0.0,14.3%
4,James,Cook,BUF,21.0,79.0,3.8,6.0,56.8%,1.0,2.0,50.0%,17.0,17.0,1.0,33.3%


In [17]:
# Function to merge baseline stats, salary, and redzone stats for running backs
def merge_baseline_stats_salary_redzone(df_rb_redzone, df_rb_baseline_salary):
    # Step 1: Clean and standardize names for both dataframes
    df_rb_redzone[['FirstName', 'LastName']] = df_rb_redzone.apply(lambda row: pd.Series(clean_name(f"{row['firstName']} {row['lastName']}")), axis=1)
    df_rb_baseline_salary[['FirstName', 'LastName']] = df_rb_baseline_salary.apply(lambda row: pd.Series(clean_name(f"{row['FirstName']} {row['LastName']}")), axis=1)

    # Step 2: Merge based on cleaned FirstName, LastName, and team
    merged_df = df_rb_baseline_salary.merge(
        df_rb_redzone[['FirstName', 'LastName', 'team', 'red_zone_rush_att', 'red_zone_rush_yds', 'red_zone_rush_y/a', 'red_zone_rush_td', 'red_zone_rush_pct', 
                       'red_zone_rec_rec', 'red_zone_rec_tgt', 'red_zone_rec_pct', 'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct']],
        left_on=['FirstName', 'LastName', 'recent_team'],  # Use cleaned names and team for merging
        right_on=['FirstName', 'LastName', 'team'],        
        how='left'  # Keep all players from the baseline dataframe
    )

    # Step 3: Handle mismatched rows for cases where the merge failed (red zone stats are NaN)
    unmatched_rows = merged_df[merged_df['red_zone_rush_att'].isna() & merged_df['red_zone_rec_rec'].isna()]

    if not unmatched_rows.empty:
        print("Handling unmatched rows using fallback logic for similar names...")

        # Reattempt the merge with fallback conditions, e.g., based on name initials or similar names
        unmatched_fallback_merge = df_rb_baseline_salary.merge(
            df_rb_redzone[['FirstName', 'LastName', 'team', 'red_zone_rush_att', 'red_zone_rush_yds', 'red_zone_rush_y/a', 'red_zone_rush_td', 'red_zone_rush_pct', 
                           'red_zone_rec_rec', 'red_zone_rec_tgt', 'red_zone_rec_pct', 'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct']],
            left_on=['FirstName', 'LastName', 'recent_team'],  
            right_on=['FirstName', 'LastName', 'team'],
            how='left'
        )

        # Update the merged dataframe with fallback matches
        merged_df.update(unmatched_fallback_merge)

    # Step 4: Remove unnecessary columns like 'team' after the merge
    merged_df.drop(columns=['team'], inplace=True)

    # Step 5: Reorder the columns as requested
    ordered_columns = [
        'season', 'week', 'player_id', 'FirstName_Initial', 'FirstName', 'LastName',
        'games_played', 'current_week_salary', 'last_week_salary', 'salary_diff'
    ] + [col for col in merged_df.columns if col not in ['season', 'week', 'player_id', 'FirstName_Initial', 'FirstName', 'LastName', 'games_played', 'current_week_salary', 'last_week_salary', 'salary_diff']]

    merged_df = merged_df[ordered_columns]

    # Step 6: Save to CSV in the current directory for verification
    # merged_df.to_csv("rb_baseline_stats_salary_redzone.csv", index=False)

    # Return the merged dataframe
    return merged_df

# Call the function and assign the result to rb_baseline_stats_salary_redzone_df for future use
rb_baseline_stats_salary_redzone_df = merge_baseline_stats_salary_redzone(df_rb_redzone_cumulative, rb_merged_baseline_salary_df)

# Print confirmation count and display first 5 rows
print(f"Number of rows in the merged dataframe: {rb_baseline_stats_salary_redzone_df.shape[0]}")
display(rb_baseline_stats_salary_redzone_df.head())


Handling unmatched rows using fallback logic for similar names...
Number of rows in the merged dataframe: 113


Unnamed: 0,season,week,player_id,FirstName_Initial,FirstName,LastName,games_played,current_week_salary,last_week_salary,salary_diff,recent_team,carries,carries_per_game,avg_yds_per_carry,attempts,attempts_per_game,receptions,receptions_per_game,rushing_yards,rush_yds_per_game,avg_12_carries_or_receptions_bool,carries_last3_games_played,rec_last3_games_played,avg_carries_last3_games_played,avg_rec_last3_games_played,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool,avg_4yds_per_carry_bool,red_zone_rush_att,red_zone_rush_yds,red_zone_rush_y/a,red_zone_rush_td,red_zone_rush_pct,red_zone_rec_rec,red_zone_rec_tgt,red_zone_rec_pct,red_zone_rec_yds,red_zone_rec_y/r,red_zone_rec_td,red_zone_rec_tgt_pct
0,2024,9,00-0032764,d,derrick,henry,8,"$9,000","$9,200",-200.0,BAL,145,18.1,6.52,0,0.0,8,1.0,946,118.2,True,50.0,2.0,16.666667,0.666667,True,False,True,True,22.0,40.0,1.8,7.0,88.0%,3.0,3.0,100.0%,22.0,7.3,2.0,33.3%
1,2024,9,00-0034844,s,saquon,barkley,7,"$9,400","$9,100",300.0,PHI,130,18.6,5.89,0,0.0,17,2.4,766,109.4,True,57.0,5.0,19.0,1.666667,True,False,True,True,26.0,94.0,3.6,4.0,72.2%,3.0,4.0,75.0%,24.0,8.0,1.0,100.0%
2,2024,9,00-0037525,j,,mason,8,,,,SF,134,16.8,5.11,0,0.0,10,1.2,685,85.6,True,29.0,3.0,9.666667,1.0,False,False,False,True,,,,,,,,,,,,
3,2024,9,00-0035700,j,josh,jacobs,8,"$7,400","$7,500",-100.0,GB,145,18.1,4.6,0,0.0,17,2.1,667,83.4,True,55.0,9.0,18.333333,3.0,True,False,True,True,20.0,35.0,1.8,2.0,76.9%,3.0,4.0,75.0%,14.0,4.7,1.0,66.7%
4,2024,9,00-0033897,j,joe,mixon,6,"$9,200","$9,000",200.0,HOU,126,21.0,4.83,1,0.2,14,2.3,609,101.5,True,74.0,6.0,24.666667,2.0,True,False,True,True,23.0,68.0,3.0,5.0,65.7%,1.0,2.0,50.0%,10.0,10.0,1.0,33.3%


In [18]:
# Function to reorder columns, round specified columns, and drop unnecessary ones
def finalize_rb_dataframe(df):
    # Drop columns 'attempts' and 'attempts_per_game'
    df = df.drop(columns=['attempts', 'attempts_per_game'])

    # Round the specified columns to one decimal place
    df['avg_carries_last3_games_played'] = df['avg_carries_last3_games_played'].round(1)
    df['avg_rec_last3_games_played'] = df['avg_rec_last3_games_played'].round(1)

    # Reorder the columns
    ordered_columns = [
        'season', 'week', 'player_id', 'FirstName_Initial', 'FirstName', 'LastName', 'recent_team',
        'games_played', 'current_week_salary', 'last_week_salary', 'salary_diff',
        'rushing_yards', 'rush_yds_per_game', 'carries', 'carries_per_game', 'avg_yds_per_carry',
        'carries_last3_games_played', 'avg_carries_last3_games_played', 'receptions', 'receptions_per_game',
        'rec_last3_games_played', 'avg_rec_last3_games_played', 'red_zone_rush_att', 'red_zone_rush_yds',
        'red_zone_rush_y/a', 'red_zone_rush_td', 'red_zone_rush_pct', 'red_zone_rec_rec', 'red_zone_rec_tgt',
        'red_zone_rec_pct', 'red_zone_rec_yds', 'red_zone_rec_y/r', 'red_zone_rec_td', 'red_zone_rec_tgt_pct',
        'avg_12_carries_or_receptions_bool', 'avg_carries_last3_games_played_bool', 'avg_rec_last3_games_played_bool',
        'avg_15_rec_or_carries_last3_games_bool', 'avg_4yds_per_carry_bool'
    ]

    # Select and reorder the columns
    df = df[ordered_columns]

    # Return the reordered DataFrame
    return df

# Apply the function to the merged dataframe
rb_final_df = finalize_rb_dataframe(rb_baseline_stats_salary_redzone_df)

# Save the final DataFrame to CSV outside the function for inspection
rb_final_df.to_csv("rb_baseline_stats_salary_redzone.csv", index=False)

# Display the first few rows of the final DataFrame for verification
display(rb_final_df.head())

# Print confirmation of the CSV file creation
print("The final CSV file 'rb_baseline_stats_salary_redzone.csv' has been created.")


Unnamed: 0,season,week,player_id,FirstName_Initial,FirstName,LastName,recent_team,games_played,current_week_salary,last_week_salary,salary_diff,rushing_yards,rush_yds_per_game,carries,carries_per_game,avg_yds_per_carry,carries_last3_games_played,avg_carries_last3_games_played,receptions,receptions_per_game,rec_last3_games_played,avg_rec_last3_games_played,red_zone_rush_att,red_zone_rush_yds,red_zone_rush_y/a,red_zone_rush_td,red_zone_rush_pct,red_zone_rec_rec,red_zone_rec_tgt,red_zone_rec_pct,red_zone_rec_yds,red_zone_rec_y/r,red_zone_rec_td,red_zone_rec_tgt_pct,avg_12_carries_or_receptions_bool,avg_carries_last3_games_played_bool,avg_rec_last3_games_played_bool,avg_15_rec_or_carries_last3_games_bool,avg_4yds_per_carry_bool
0,2024,9,00-0032764,d,derrick,henry,BAL,8,"$9,000","$9,200",-200.0,946,118.2,145,18.1,6.52,50.0,16.7,8,1.0,2.0,0.7,22.0,40.0,1.8,7.0,88.0%,3.0,3.0,100.0%,22.0,7.3,2.0,33.3%,True,True,False,True,True
1,2024,9,00-0034844,s,saquon,barkley,PHI,7,"$9,400","$9,100",300.0,766,109.4,130,18.6,5.89,57.0,19.0,17,2.4,5.0,1.7,26.0,94.0,3.6,4.0,72.2%,3.0,4.0,75.0%,24.0,8.0,1.0,100.0%,True,True,False,True,True
2,2024,9,00-0037525,j,,mason,SF,8,,,,685,85.6,134,16.8,5.11,29.0,9.7,10,1.2,3.0,1.0,,,,,,,,,,,,,True,False,False,False,True
3,2024,9,00-0035700,j,josh,jacobs,GB,8,"$7,400","$7,500",-100.0,667,83.4,145,18.1,4.6,55.0,18.3,17,2.1,9.0,3.0,20.0,35.0,1.8,2.0,76.9%,3.0,4.0,75.0%,14.0,4.7,1.0,66.7%,True,True,False,True,True
4,2024,9,00-0033897,j,joe,mixon,HOU,6,"$9,200","$9,000",200.0,609,101.5,126,21.0,4.83,74.0,24.7,14,2.3,6.0,2.0,23.0,68.0,3.0,5.0,65.7%,1.0,2.0,50.0%,10.0,10.0,1.0,33.3%,True,True,False,True,True


The final CSV file 'rb_final_stats_salary_redzone.csv' has been created.
