In [1]:
# Import packages
import pandas as pd
import numpy as np
import math
import requests
from bs4 import BeautifulSoup
import os

# Notebook Description

This notebook uses Beautiful Soup to scrape historical fantasy results and season statistics from pro-football-reference.com.

# Web Scraping Season Stats

In [2]:
def _fetch_fantasy_table(year):
    '''
    Download one season's fantasy table and return it as an unprocessed data frame
    '''
    from io import StringIO  # local import so this cell does not rely on changes to the import cell

    url = 'https://www.pro-football-reference.com/years/' + str(year) + '/fantasy.htm'
    page = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the notebook
    page.raise_for_status()               # surface HTTP errors instead of parsing an error page

    # Extract table HTML from the downloaded page
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find('table', attrs={'id': 'fantasy'})
    if table is None:
        raise ValueError("No table with id 'fantasy' found at " + url)

    # Convert table HTML to pandas df (wrapped in StringIO because passing
    # literal HTML to read_html is deprecated in pandas 2.1+)
    return pd.read_html(StringIO(str(table)))[0]


def _clean_fantasy_table(raw, year):
    '''
    Rename, filter, type-cast and reorder the raw fantasy table for one season
    '''
    column_names = [
        'Rk', 'PlayerName', 'Team', 'Position', 'Age', 'GamesPlayed', 'GamesStarted', 'PassCmp',
        'PassAtt', 'PassYds', 'PassTD', 'Int', 'RushAtt', 'RushYds', 'RushYardsPerAtt', 'RushTD',
        'Tgts', 'Receptions', 'RecYds', 'YardsPerRec', 'RecTD', 'Fmb', 'FL', 'TD', '2PM', '2PP',
        'FantPt', 'PointsPPR', 'DKPt', 'PointsHalfPPR', 'VBD', 'PosRank', 'OvRank']

    # Rename columns
    df = raw.copy()
    df.columns = df.columns.droplevel(0) # Remove multi-layer header
    if len(df.columns) != len(column_names):
        # The positional rename below is only valid for the expected layout
        raise ValueError(
            'Expected ' + str(len(column_names)) + ' columns in the fantasy table for '
            + str(year) + ', found ' + str(len(df.columns)))
    df.columns = column_names

    # Select relevant columns (.copy() so the assignments below act on an
    # independent frame and do not raise SettingWithCopyWarning)
    df = df[[
        'PlayerName', 'Team', 'Position', 'PosRank', 'Age', 'GamesPlayed', 'RushAtt', 'RushYds',
        'RushYardsPerAtt', 'RushTD', 'Tgts', 'Receptions', 'RecYds', 'YardsPerRec', 'RecTD',
        'PointsPPR', 'PointsHalfPPR']].copy()
    df['Year'] = year

    # Select relevant player positions
    df = df[df.PlayerName != 'Player']                  # drop rows whose name cell is the literal 'Player'
    df = df[df.Position.isin(['RB', 'WR', 'TE', 'QB'])]
    df = df[~df.Team.isin(['2TM', '3TM', '4TM'])]       # drop rows with a multi-team marker as Team

    # Convert data types
    df = df.astype({'Age': 'int', 'GamesPlayed': 'int', 'PointsPPR': 'float', 'PointsHalfPPR': 'float',
                    'RushAtt': 'int', 'RushYds': 'int', 'RushYardsPerAtt': 'float', 'RushTD': 'int',
                    'Tgts': 'float', 'Receptions': 'int', 'RecYds': 'int', 'YardsPerRec': 'float', 'RecTD': 'int'})
    df['PPG_PPR'] = (df.PointsPPR / df.GamesPlayed).round(2)
    df['PPG_HalfPPR'] = (df.PointsHalfPPR / df.GamesPlayed).round(2)
    df['TgtsPerGame'] = (df.Tgts / df.GamesPlayed).round(2)

    # Split name into columns and strip punctuation
    df[['FirstName', 'LastName']] = df.PlayerName.str.split(" ", n=1, expand=True)
    df['FirstName'] = df['FirstName'].str.replace(r'[^\w\s]', '', regex=True)
    df['LastName'] = df['LastName'].str.replace(r'[^\w\s]', '', regex=True)

    # Edit names for consistency with naming convention in other data sets (no suffix).
    # The pattern is anchored to the end of the name so that only a trailing
    # suffix is removed, never text in the middle of a multi-word last name.
    df['LastName'] = df['LastName'].str.replace(r'\s+(?:III|II|IV|V|Sr|Jr)\s*$', '', regex=True)

    # Reorder columns
    return df[[
        'FirstName', 'LastName', 'Team', 'Position', 'PosRank', 'Year', 'Age', 'GamesPlayed', 'PointsPPR',
        'PointsHalfPPR', 'PPG_PPR', 'PPG_HalfPPR', 'RushAtt', 'RushYds', 'RushYardsPerAtt', 'RushTD',
        'Tgts', 'TgtsPerGame', 'Receptions', 'RecYds', 'YardsPerRec', 'RecTD']]


def get_season_stats(year):
    '''
    Function to get seasonal stats for players

    Scrapes the fantasy table for the given season from pro-football-reference.com,
    keeps QB / RB / WR / TE rows, drops rows whose Team is '2TM', '3TM' or '4TM',
    adds per-game columns and splits the player name into first and last name.

    Parameters:
        year: season to scrape; it is inserted into the page URL via str(year)
              and stored unchanged in the 'Year' column

    Returns:
        pandas data frame with the columns FirstName, LastName, Team, Position,
        PosRank, Year, Age, GamesPlayed, PointsPPR, PointsHalfPPR, PPG_PPR,
        PPG_HalfPPR, RushAtt, RushYds, RushYardsPerAtt, RushTD, Tgts, TgtsPerGame,
        Receptions, RecYds, YardsPerRec, RecTD

    Raises:
        requests.exceptions.HTTPError: if the page request returns an error status
        ValueError: if the page has no table with id 'fantasy', or the table does
                    not have the expected number of columns
    '''
    return _clean_fantasy_table(_fetch_fantasy_table(year), year)


In [3]:
# Create data frame containing all stats from 2013 to 2022
years = list(np.arange(2013, 2023))
season_frames = [get_season_stats(year) for year in years] # Scrape one data frame per season
stats_df = pd.concat(season_frames)                        # Concatenate data frames from each year

# Write the combined table to the parent of the working directory. The file name
# is derived from `years` so it stays in sync with the range that was scraped,
# and os.path.join is used instead of a hard-coded '/' separator.
output_file = "season-statistics-{}-{}.csv".format(years[0], years[-1])
stats_df.to_csv(os.path.join(os.getcwd(), os.pardir, output_file), index=False)
