In [None]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Scrape targets. Each group holds:
#   *Url      - page URL template; "{}" is filled with the year via str.format
#   *HtmlFile - per-year file name template passed to scrapeSite as htmlFileName
#   *Id       - HTML id attribute of the element that contains the table to parse

# Pro Bowl rosters (pro-football-reference)
probowlUrl = "https://www.pro-football-reference.com/years/{}/probowl.htm"
probowlHtmlFile = "proBowl{}.html"
probowlId = "pro_bowl" 

# College passing statistics (sports-reference)
collegeStatsUrl = "https://www.sports-reference.com/cfb/years/{}-passing.html"
collegeStatsHtmlFile = "collegeStats{}.html"
collegeStatsId = "div_passing"

# Draft combine measurements (pro-football-reference)
combineUrl = "https://www.pro-football-reference.com/draft/{}-combine.htm"
combineHtmlFile = "combineStats{}.html"
combineId = "div_combine" 

In [None]:
def scrapeSite(url, htmlFileName, idName, startYear, endYear, timeout=30):
    """Download one page per year and parse the table found under ``idName``.

    Parameters
    ----------
    url : str
        URL template; ``url.format(year)`` is requested for every year.
    htmlFileName : str
        Retained for backward compatibility only. Earlier versions wrote each
        page to this (year-formatted) file and immediately read it back; the
        page is now parsed straight from memory, so nothing touches the disk.
    idName : str
        ``id`` attribute of the HTML element that wraps the wanted table.
    startYear, endYear : int
        Inclusive range of years to fetch. An empty range performs no requests.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    (bool, list[pandas.DataFrame])
        ``rateLimit`` is True when a page did not contain a parseable table
        under ``idName`` (treated as "the site is blocking us"); the list holds
        the tables parsed before that happened, one per successful year.
    """
    # Local import: the notebook's import cell does not provide StringIO.
    from io import StringIO

    dfs = []
    rateLimit = False

    for year in range(startYear, endYear + 1):
        # A timeout prevents a stalled connection from hanging the notebook.
        response = requests.get(url.format(year), timeout=timeout)

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find(id=idName)

        try:
            if table is None:
                # Same exception type pd.read_html raises for "no tables found".
                raise ValueError("no element with id {!r} in page".format(idName))
            # Wrap in StringIO: passing literal HTML to read_html is deprecated.
            dfs.append(pd.read_html(StringIO(str(table)))[0])
        except ValueError:
            print("Rate Limit: Too many requests we are blocked")
            rateLimit = True
            # Once blocked, further requests only prolong the block.
            break

    return rateLimit, dfs

def getProbowl(rateLimit, dfs):
    """Build a lookup of quarterback names from scraped Pro Bowl tables.

    Parameters
    ----------
    rateLimit : bool
        Flag returned by ``scrapeSite``; when truthy the scrape is considered
        unusable and ``None`` is returned.
    dfs : list[pandas.DataFrame]
        One roster table per year; each must have ``Pos`` and ``Player`` columns.

    Returns
    -------
    dict[str, bool] or None
        ``{cleaned_name: True}`` for every row whose ``Pos`` is ``"QB"``, or
        ``None`` when rate limited. An empty ``dfs`` yields an empty dict.
    """
    if rateLimit:
        return None
    if not dfs:
        # pd.concat raises on an empty list; no tables simply means no names.
        return {}

    # Stack all years; a fresh index keeps row labels unique across years.
    df = pd.concat(dfs).reset_index(drop=True)

    # Keep quarterbacks only.
    qbNames = df.loc[df['Pos'] == "QB", 'Player']

    # Raw string so "\s" is a regex class rather than an invalid str escape.
    nonNameChars = re.compile(r'[^a-zA-Z\s]')

    # Removing trailing markers (e.g. "+", "%") leaves the space that preceded
    # them, so strip afterwards to keep keys matchable against other tables.
    return {nonNameChars.sub('', name).strip(): True for name in qbNames}

# need new column names to differentiate passing and rushing yards
def changeTrainColNames(college_stats_df):
    """Flatten the frame's multi-part column labels into single strings.

    The first five columns are renamed to the last element of their label;
    every later column becomes its label parts joined with "_" (so passing
    and rushing columns with the same inner name stay distinguishable).

    The frame's columns are replaced in place and the new names are returned.
    """
    flattened = [
        label[-1] if position < 5 else '_'.join(label)
        for position, label in enumerate(college_stats_df.columns.values)
    ]

    college_stats_df.columns = flattened
    return flattened

# some names have "*" next to them which needs to be removed
def removeStarFromName(df):
    """Truncate every ``Player`` value at its first "*", modifying ``df`` in place.

    Names without a "*" and non-string entries (e.g. missing values) are left
    unchanged. Returns ``None``.

    The column is reassigned as a whole instead of writing into
    ``df['Player'].values``: that array is not guaranteed to be a writable view
    of the frame (it is read-only under pandas Copy-on-Write), so element-wise
    writes through it can fail or silently not reach ``df``.
    """
    df['Player'] = df['Player'].map(
        lambda name: name.split("*")[0] if isinstance(name, str) else name
    )

def getCollegeStatsDf(rateLimit, dfs):
    """Combine per-year college stat tables into one cleaned frame.

    Returns ``(frame, column_names)`` where the frame has a fresh unique index,
    flattened column names (via ``changeTrainColNames``) and player names cut at
    "*" (via ``removeStarFromName``). Returns ``(None, None)`` unless
    ``rateLimit`` equals ``False``.
    """
    if rateLimit != False:  # noqa: E712 - keeps the original equality semantics
        return None, None

    # Stack the yearly tables and renumber rows so each has a unique label.
    combined = pd.concat(dfs).reset_index(drop=True)
    flatCols = changeTrainColNames(combined)
    removeStarFromName(combined)
    return combined, flatCols

def getCombineDf(rateLimit, dfs):
    """Combine per-year combine tables and keep only quarterback rows.

    The yearly tables are stacked and renumbered, rows whose ``Pos`` is not
    "QB" are discarded (surviving rows keep their renumbered labels), and the
    'School', 'College' and 'Drafted (tm/rnd/yr)' columns are removed.
    Returns ``None`` unless ``rateLimit`` equals ``False``.
    """
    if rateLimit != False:  # noqa: E712 - keeps the original equality semantics
        return None

    combined = pd.concat(dfs).reset_index(drop=True)
    isQb = combined['Pos'] == 'QB'
    quarterbacks = combined[isQb]
    unwanted = ['School', 'College', 'Drafted (tm/rnd/yr)']
    return quarterbacks.drop(columns=unwanted)

In [None]:
# for training 2008 to 2017 college stats
# Each scrapeSite call issues one HTTP request per year in the given range and
# returns (rateLimit flag, list of per-year tables).
trainRateLimit, trainDfsStats = scrapeSite(collegeStatsUrl, collegeStatsHtmlFile, collegeStatsId, 2008, 2017) 

# for validation 2018 to 2019 college stats
valRateLimit, valDfsStats = scrapeSite(collegeStatsUrl, collegeStatsHtmlFile, collegeStatsId, 2018, 2019)

# for testing 2020 to 2021 college stats
testRateLimit, testDfsStats = scrapeSite(collegeStatsUrl, collegeStatsHtmlFile, collegeStatsId, 2020, 2021) 

# Each result is (None, None) if the corresponding scrape was rate limited.
# newCols is overwritten by every call, so it ends up holding the column names
# from the test split.
trainDf, newCols = getCollegeStatsDf(trainRateLimit, trainDfsStats)
valDf, newCols = getCollegeStatsDf(valRateLimit, valDfsStats)
testDf, newCols = getCollegeStatsDf(testRateLimit, testDfsStats)

# Display the training frame as the cell output.
trainDf

In [None]:
# Peek at the first (index label, row Series) pair that iterrows() yields;
# nothing is printed when the frame has no rows.
firstPair = next(trainDf.iterrows(), None)
if firstPair is not None:
    x, row = firstPair
    print("x: \n", x)
    print("row: \n", row)


In [394]:
# Flattened column names from the most recent getCollegeStatsDf call.
newCols

['Rk',
 'Player',
 'School',
 'Conf',
 'G',
 'Passing_Cmp',
 'Passing_Att',
 'Passing_Pct',
 'Passing_Yds',
 'Passing_Y/A',
 'Passing_AY/A',
 'Passing_TD',
 'Passing_Int',
 'Passing_Rate',
 'Rushing_Att',
 'Rushing_Yds',
 'Rushing_Avg',
 'Rushing_TD']

In [395]:
# Scrape the combine pages for 2009 through 2018 (one request per year), then
# reduce them to quarterback rows; trainCombineDf is None if rate limited.
trainLimit, trainDfsCombine = scrapeSite(combineUrl,combineHtmlFile,combineId,2009,2018)
trainCombineDf = getCombineDf(trainLimit, trainDfsCombine)

# Display the quarterback-only combine frame.
trainCombineDf

Unnamed: 0,Player,Pos,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle
16,Jason Boltus,QB,6-3,225,4.82,31.5,26,111,7.00,4.47
17,Rhett Bomar,QB,6-2,225,4.70,,25,106,6.91,4.06
20,Tom Brandstater,QB,6-5,220,4.87,28.5,,106,6.93,4.37
29,Nathan Brown,QB,6-1,219,4.86,30.5,,106,7.43,4.44
47,Hunter Cantwell,QB,6-4,235,5.22,26.0,,99,7.40,4.59
...,...,...,...,...,...,...,...,...,...,...
3261,Josh Rosen,QB,6-4,226,4.92,31.0,,111,7.09,4.28
3263,Mason Rudolph,QB,6-5,235,4.90,26.0,,,,
3277,Nic Shimonek,QB,6-3,220,4.88,28.5,,101,7.28,4.32
3329,Mike White,QB,6-5,224,5.09,27.0,,96,7.40,4.40


In [396]:
# Append one empty (None-filled) column per college stat, i.e. every flattened
# column name after the first five, so they can be populated later.
emptyStatCols = {statCol: None for statCol in newCols[5:]}
trainCombineDf = trainCombineDf.assign(**emptyStatCols)

trainCombineDf

Unnamed: 0,Player,Pos,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,...,Passing_Yds,Passing_Y/A,Passing_AY/A,Passing_TD,Passing_Int,Passing_Rate,Rushing_Att,Rushing_Yds,Rushing_Avg,Rushing_TD
16,Jason Boltus,QB,6-3,225,4.82,31.5,26,111,7.00,4.47,...,,,,,,,,,,
17,Rhett Bomar,QB,6-2,225,4.70,,25,106,6.91,4.06,...,,,,,,,,,,
20,Tom Brandstater,QB,6-5,220,4.87,28.5,,106,6.93,4.37,...,,,,,,,,,,
29,Nathan Brown,QB,6-1,219,4.86,30.5,,106,7.43,4.44,...,,,,,,,,,,
47,Hunter Cantwell,QB,6-4,235,5.22,26.0,,99,7.40,4.59,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261,Josh Rosen,QB,6-4,226,4.92,31.0,,111,7.09,4.28,...,,,,,,,,,,
3263,Mason Rudolph,QB,6-5,235,4.90,26.0,,,,,...,,,,,,,,,,
3277,Nic Shimonek,QB,6-3,220,4.88,28.5,,101,7.28,4.32,...,,,,,,,,,,
3329,Mike White,QB,6-5,224,5.09,27.0,,96,7.40,4.40,...,,,,,,,,,,


In [397]:
# Check that the college-stat columns now follow the combine measurements.
trainCombineDf.columns

Index(['Player', 'Pos', 'Ht', 'Wt', '40yd', 'Vertical', 'Bench', 'Broad Jump',
       '3Cone', 'Shuttle', 'Passing_Cmp', 'Passing_Att', 'Passing_Pct',
       'Passing_Yds', 'Passing_Y/A', 'Passing_AY/A', 'Passing_TD',
       'Passing_Int', 'Passing_Rate', 'Rushing_Att', 'Rushing_Yds',
       'Rushing_Avg', 'Rushing_TD'],
      dtype='object')

In [400]:
# Look up one player in the college stats; the output shows a player can
# occupy more than one row of this frame.
trainDf.loc[trainDf['Player'] == 'Tim Tebow']

Unnamed: 0,Rk,Player,School,Conf,G,Passing_Cmp,Passing_Att,Passing_Pct,Passing_Yds,Passing_Y/A,Passing_AY/A,Passing_TD,Passing_Int,Passing_Rate,Rushing_Att,Rushing_Yds,Rushing_Avg,Rushing_TD
3,4,Tim Tebow,Florida,SEC,14,192,298,64.4,2746,9.2,10.6,30,4,172.4,176,673,3.8,12
110,3,Tim Tebow,Florida,SEC,14,213,314,67.8,2895,9.2,9.8,21,5,164.2,217,910,4.2,14


In [401]:
# Same player in the combine frame, for comparison with the college-stats lookup.
trainCombineDf.loc[trainCombineDf['Player'] == 'Tim Tebow']

Unnamed: 0,Player,Pos,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,...,Passing_Yds,Passing_Y/A,Passing_AY/A,Passing_TD,Passing_Int,Passing_Rate,Rushing_Att,Rushing_Yds,Rushing_Avg,Rushing_TD
609,Tim Tebow,QB,6-3,236,4.71,38.5,,115,6.66,4.17,...,,,,,,,,,,


In [None]:
# trainRateLimit, trainDfsProbowl = scrapeSite(probowlUrl, probowlHtmlFile, probowlId, 2009, 2018)
# valRateLimit, valDfsProbowl = scrapeSite(probowlUrl, probowlHtmlFile, probowlId, 2019, 2020)
# testRateLimit, testDfsProbowl =  scrapeSite(probowlUrl, probowlHtmlFile, probowlId, 2021, 2022)

# trainProbowl = getProbowl(trainRateLimit, trainDfsProbowl)
# valProbowl= getProbowl(valRateLimit, valDfsProbowl)
# testProbowl = getProbowl(testRateLimit, testDfsProbowl)

# trainProbowl

{'Peyton Manning': True,
 'Drew Brees': True,
 'Philip Rivers': True,
 'Tony Romo': True,
 'Aaron Rodgers': True,
 'Matt Schaub': True,
 'David Garrard': True,
 'Brett Favre': True,
 'Tom Brady': True,
 'Vince Young': True,
 'Donovan McNabb': True,
 'Matt Ryan': True,
 'Michael Vick': True,
 'Matt Cassel': True,
 'Cam Newton': True,
 'Andy Dalton': True,
 'Eli Manning': True,
 'Ben Roethlisberger': True,
 'Andrew Luck': True,
 'Russell Wilson': True,
 'Robert Griffin III': True,
 'Alex Smith': True,
 'Nick Foles ': True,
 'Matthew Stafford': True,
 'Carson Palmer': True,
 'Tyrod Taylor': True,
 'Teddy Bridgewater': True,
 'Derek Carr': True,
 'Jameis Winston': True,
 'Dak Prescott': True,
 'Kirk Cousins': True,
 'Carson Wentz': True,
 'Jared Goff': True,
 'Patrick Mahomes': True,
 'Mitchell Trubisky': True,
 'Deshaun Watson': True}