In [1]:
# Import packages
import pandas as pd
import numpy as np
import math
import requests
from bs4 import BeautifulSoup
import os

# Notebook Description

This notebook uses Beautiful Soup to scrape QB average draft position (ADP) data from myfantasyleague.com and season-long "top" QB information (the QB with the most pass attempts on each team) from pro-football-reference.com.

# Web Scraping QB ADP Data

In [2]:
def myleague_qb_adp_data(year: str):
    '''
    Function to scrape QB ADP (average draft position) data from https://api.myfantasyleague.com/
    This returns the top QB from each team (the first QB listed for that team in the report)
    along with ADP data

    Parameters
    ----------
    year : str
        Season to scrape. The value is placed in the URL via ``str(year)``, so an int works too.
        It is stored unchanged in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per team with columns ``QB`` ("First Last" with punctuation and a trailing
        II/III/IV/V/Sr/Jr suffix removed), ``Team``, ``Year`` and ``TopQB_ADP`` (float).

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    ValueError
        If the page does not contain a table with class ``report``.
    '''

    # Local import: wrapping the HTML in StringIO avoids pandas' deprecated literal-HTML input
    from io import StringIO

    # Base url
    base_url = 'https://api.myfantasyleague.com/'

    # All players or rookie-only url
    url_spec = '/reports?R=ADP&POS=QB&ROOKIES=0&INJURED=1&CUTOFF=0&FCOUNT=0&IS_PPR=1&IS_KEEPER=N&IS_MOCK=1&PERIOD=START'

    # Extract table HTML from input URL (fail loudly on HTTP errors instead of parsing an error page)
    page = requests.get(base_url + str(year) + url_spec + '&PAGE=ALL', timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find('table', attrs={'class': 'report'})
    if table is None:
        raise ValueError(f'No ADP report table found for year {year}')

    # Convert table HTML to pandas df and clean up
    df = pd.read_html(StringIO(str(table)))[0]
    df = df.iloc[:-1].copy()                                   # Drop last row (Page: 1 2 3 4 ALL)
    df['Year'] = year
    df['TopQB_ADP'] = df['Avg Pick'].astype(float)             # Rename column and force to float

    # Remove "team qb" rows; .copy() gives an independent frame so the column assignments
    # below do not raise SettingWithCopyWarning (no need to disable the warning globally)
    df = df[~df['Player'].str.contains('TMQB')].copy()

    # Expand player info across columns. Ex: McCaffrey, Christian CAR RB
    df[['LastName', 'Player']] = df['Player'].str.split(', ', expand=True)
    df[['FirstName', 'Team', 'Position']] = df['Player'].str.split(expand=True)

    # Keep the first row per team (top QB by ADP, assuming the report is listed in ADP order)
    df = df.drop_duplicates(subset=['Team'], keep='first').reset_index(drop=True)

    # Edit names for consistency with naming convention in other data sets:
    # remove punctuation, then remove a generational suffix
    for name_col in ('FirstName', 'LastName'):
        df[name_col] = df[name_col].str.replace(r'[^\w\s]', '', regex=True)
    # Anchored at the end of the name so that only a real suffix is removed
    # (an unanchored ' V' would also mangle a last name such as "De Vito")
    df['LastName'] = df['LastName'].str.replace(r'\s+(?:III|II|IV|V|Sr|Jr)$', '', regex=True)

    df['QB'] = df['FirstName'] + ' ' + df['LastName']
    df = df[['QB', 'Team', 'Year', 'TopQB_ADP']]               # Select and order columns

    # Convert team names for consistency
    team_conv = {'GBP': 'GNB', 'JAC': 'JAX', 'KCC': 'KAN', 'NOS': 'NOR', 'NEP': 'NWE', 'RAM': 'LAR', 'SDC': 'SDG', 'TBB': 'TAM'}
    df = df.replace({'Team': team_conv})

    return df
    

In [3]:
# Scrape the QB ADP report for every season from 2013 through 2022,
# stack the yearly frames and write them to a single CSV one level above the working directory
years = list(np.arange(2013, 2023))
adp_frames_by_year = list(map(myleague_qb_adp_data, years))  # One data frame per season
qb_df = pd.concat(adp_frames_by_year)                        # Combine all seasons
adp_csv_path = os.getcwd() + "/../myfantasyleague-qb-adp-data-2013-2022.csv"
qb_df.to_csv(adp_csv_path, index=False)

# Web Scraping Historic Top QB Data

In [4]:
def yearly_top_qb_stats(year):
    '''
    Function to get yearly top QB stats from each team
    Can use this data as additional data for performance of WRs

    The "top" QB of a team is the passer with the most attempts for that team in the
    season's passing table on pro-football-reference.com.

    Parameters
    ----------
    year : int or str
        Season to scrape. The value is placed in the URL via ``str(year)`` and stored
        unchanged in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        32 rows (one per team, ordered by team code) with columns ``Team``, ``Year``,
        ``QB``, ``QB_Age``, ``QB_QBR`` and ``AmbiguousQB``.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    ValueError
        If the page does not contain a table with id ``passing``.
    AssertionError
        If the number of teams found is not 32.
    '''

    # Local import: wrapping the HTML in StringIO avoids pandas' deprecated literal-HTML input
    from io import StringIO

    # Extract table HTML from input URL (fail loudly on HTTP errors instead of parsing an error page)
    page = requests.get('https://www.pro-football-reference.com/years/' + str(year) + '/passing.htm', timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find('table', attrs={'id': 'passing'})
    if table is None:
        raise ValueError(f'No passing table found for year {year}')

    # Convert table HTML to pandas df
    df = pd.read_html(StringIO(str(table)))[0]
    # Drop repeated header rows (Tm == 'Tm') and combined multi-team rows (2TM/3TM/4TM)
    df = df[~df['Tm'].isin(['Tm', '2TM', '3TM', '4TM'])]
    df = df.astype({'Age': 'int', 'Att': 'int', 'QBR': 'float', 'GS': 'int'})

    # Extract top QB information for each team (top QB will be the QB with the most attempts).
    # Sorting by team and then by attempts (descending) puts each team's top QB first, so keeping
    # the first row per team selects it. Unlike iterating over a set of team names, this gives
    # the same row order on every run.
    all_qb_df = (
        df.sort_values(by=['Tm', 'Att'], ascending=[True, False])
          .drop_duplicates(subset='Tm', keep='first')
          .copy()
    )

    # Confirm there are 32 QBs in the resulting df
    assert len(all_qb_df) == 32, f'Expected 32 teams for {year}, found {len(all_qb_df)}'

    # Clean up output df
    all_qb_df['Year'] = year
    all_qb_df['Player'] = all_qb_df['Player'].str.replace(r'[^\w\s]', '', regex=True) # Remove punctuation
    all_qb_df['AmbiguousQB'] = all_qb_df['GS'] <= 10  # Ambiguous QB: Top QB starts <= 10 games
    all_qb_df = all_qb_df[['Tm', 'Year', 'Player', 'Age', 'QBR', 'AmbiguousQB']]
    all_qb_df.columns = ['Team', 'Year', 'QB', 'QB_Age', 'QB_QBR', 'AmbiguousQB']

    return all_qb_df


In [5]:
# Scrape each team's top-QB stats for every season from 2013 through 2022,
# stack the yearly frames and write them to a single CSV one level above the working directory
years = list(np.arange(2013, 2023))
top_qb_frames_by_year = list(map(yearly_top_qb_stats, years))  # One data frame per season
qb_df = pd.concat(top_qb_frames_by_year)                       # Combine all seasons
top_qb_csv_path = os.getcwd() + "/../top-qb-info-2013-2022.csv"
qb_df.to_csv(top_qb_csv_path, index=False)
