In [None]:
# Import packages
import math
import os
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Notebook Description

This notebook uses Beautiful Soup to scrape draft information for QBs, RBs and WRs (2000-2022) from pro-football-reference.com and saves the result as CSV.

# Web Scraping Draft Information

In [None]:
def get_draft_info(year, positions=('QB', 'RB', 'WR')):
    '''
    Function to get draft information for one draft class from
    pro-football-reference.com (by default all QBs, RBs and WRs).
    Can use this data as information about rookie season.

    Parameters
    ----------
    year : int
        Draft year; inserted into the page URL and stored in `DraftYear`.
    positions : iterable of str, optional
        Values of the `Pos` column to keep. Defaults to QB, RB and WR.

    Returns
    -------
    pandas.DataFrame
        One row per selected player with the columns FirstName, LastName,
        DraftYear, DraftRnd, DraftPick, DraftTm, Pos, DraftAge, College/Univ.
        Punctuation is removed from names and a trailing generational suffix
        (II, III, IV, V, Sr, Jr) is removed from LastName.

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status (e.g. too many requests).
    ValueError
        If the page does not contain a table with id "drafts".
    '''

    # Extract table HTML from input URL
    url = 'https://www.pro-football-reference.com/years/' + str(year) + '/draft.htm'
    page = requests.get(url, timeout=30)   # a timeout keeps a stalled connection from hanging the notebook
    page.raise_for_status()                # fail loudly instead of parsing an error page
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.find('table', attrs={'id': 'drafts'})
    if table is None:
        raise ValueError('No table with id "drafts" found at ' + url)

    # Convert table HTML to pandas df (wrapped in StringIO: literal HTML input is deprecated)
    raw = pd.read_html(StringIO(str(table)))[0]
    raw.columns = raw.columns.droplevel(0)                       # Remove multi-layer header

    # Drop mid-table header rows and keep only the positions of interest.
    # .copy() so the column assignments below act on an independent frame.
    keep = (raw['Tm'] != 'Tm') & raw['Pos'].isin(list(positions))
    df = raw.loc[keep, ['Player', 'Rnd', 'Pick', 'Tm', 'Pos', 'Age', 'College/Univ']].copy()
    df = df.rename(columns={'Rnd': 'DraftRnd', 'Pick': 'DraftPick',
                            'Tm': 'DraftTm', 'Age': 'DraftAge'})
    df['DraftYear'] = year                                       # Set rookie year

    # Split name into columns and edit
    player = df['Player'].str.replace(r'[^\w\s]', '', regex=True)   # Remove punctuation
    name_parts = (
        player.str.split(' ', n=1, expand=True)
        .reindex(columns=[0, 1])   # always two columns, even if no name contains a space
        .astype(object)            # keeps the .str accessor usable on an all-missing column
    )
    df['FirstName'] = name_parts[0]
    # Edit names for consistency with other data sets (no suffix). The pattern is
    # anchored at the end so text inside a multi-word last name is left intact.
    df['LastName'] = name_parts[1].str.replace(r'\s+(?:III|II|IV|V|Sr|Jr)$', '', regex=True)

    # Select columns of interest
    return df[['FirstName', 'LastName', 'DraftYear', 'DraftRnd', 'DraftPick',
               'DraftTm', 'Pos', 'DraftAge', 'College/Univ']]
    

In [None]:
# Create data frame containing all draft information from 2000 to 2022
years = list(range(2000, 2023))

# Pause between page requests: the site flags bursts of requests
# (value chosen conservatively - confirm against the site's current rate limits).
REQUEST_DELAY_SECONDS = 4

draft_frames = []
for request_number, year in enumerate(years):
    if request_number > 0:
        time.sleep(REQUEST_DELAY_SECONDS)
    draft_frames.append(get_draft_info(year))     # Scrape one draft class

df = pd.concat(draft_frames, ignore_index=True)   # Concatenate data frames from each year
df.to_csv(os.path.join(os.getcwd(), "..", "draft-information-2000-2022.csv"), index=False)

# Remove Name Dupes

In [None]:
# Reload the CSV written by the scraping cell, so the cells below can be re-run
# without scraping again (pro-football-reference.com sometimes flags us for too many requests)
scraped_csv_path = os.getcwd() + "/../draft-information-2000-2022.csv"
df = pd.read_csv(scraped_csv_path)

In [None]:
# Drop the draftees who share first and last names with 2013-2022 fantasy relevant players.
# Each key is FirstName + LastName + DraftYear of the entry to remove.
name_collisions = [
    'AdrianPeterson2002',
    'SteveSmith2007',
    'MikeWilliams2005',
    'MikeThomas2009',
    'MikeWilliams2010',
]
first_last_year = df.FirstName + df.LastName + df.DraftYear.astype(str)
df = df[~first_last_year.isin(name_collisions)]

In [None]:
# Write the de-duplicated draft table to the parent directory (row index omitted)
deduped_csv_path = os.getcwd() + "/../draft-information-2000-2022.deduped.csv"
df.to_csv(deduped_csv_path, index=False)