In [1]:
from bs4 import BeautifulSoup as bs

import requests
import re


import pandas as pd
import numpy as np
import pyodbc


import time
import random

import os

In [2]:
# example: this grabs a team's HTML page (the URL below is the `sfo` 2022 page)
#url = 'https://www.pro-football-reference.com/teams/sfo/2022.htm'

# Table Function

In [3]:
# use this function before creating tables
def tables_html(url:str):
    """Download a page and return its ``table_wrapper`` elements.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.element.ResultSet
        Every element whose CSS class is ``table_wrapper``, in document order.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    """
    # A timeout keeps one stalled connection from hanging a long scraping run.
    response = requests.get(url, timeout=30)

    # Fail here on an error response instead of parsing the error page into
    # an empty table list that only breaks later with an unrelated IndexError.
    response.raise_for_status()

    soup = bs(response.text, 'html.parser')

    return soup.find_all(class_='table_wrapper')

# DF Columns!

In [4]:
def webscrape_columns(tables: list, table_num: int):
    """Return the column labels found in the header of one scraped table.

    Parameters
    ----------
    tables : list
        Sequence of parsed table wrappers, as returned by ``tables_html``.
    table_num : int
        Position in ``tables`` of the wrapper to read.

    Returns
    -------
    list
        First child of every ``poptip`` element inside the first ``<thead>``;
        ``'n/a'`` is used for a header element that has no children.

    Raises
    ------
    IndexError
        If ``table_num`` is out of range or the markup has no ``<thead>``.
    """
    str_table = str(tables[table_num])

    # When the wrapper contains an HTML comment, parse only what follows the
    # comment opener. Without one, parse the markup whole: str.find returns -1
    # and blindly adding 4 would chop the first three characters off.
    comment_start = str_table.find('<!--')
    if comment_start != -1:
        str_table = str_table[comment_start + 4:]

    table_html = bs(str_table, 'html.parser')

    # find thead tag to isolate col names
    thead = table_html.find_all('thead')[0]

    # class_='poptip' marks the header cells that carry the column names
    columns = []
    for header in thead.find_all(class_='poptip'):
        # An empty header cell has no children; keep the 'n/a' placeholder.
        columns.append(header.contents[0] if header.contents else 'n/a')

    return columns

# DF Rows!

## Row

In [5]:
def webscrape_index(tables: list, table_num: int):
    """Return the row labels (``<th>`` cells of the body) of one scraped table.

    Parameters
    ----------
    tables : list
        Sequence of parsed table wrappers, as returned by ``tables_html``.
    table_num : int
        Position in ``tables`` of the wrapper to read.

    Returns
    -------
    list
        First child of every ``<th>`` inside the first ``<tbody>``, in
        document order; ``'n/a'`` for a ``<th>`` that has no children.

    Raises
    ------
    IndexError
        If ``table_num`` is out of range or the markup has no ``<tbody>``.
    """
    str_table = str(tables[table_num])

    # Skip past the HTML comment opener only when one is present; str.find
    # returns -1 otherwise and adding 4 would slice from the wrong offset.
    comment_start = str_table.find('<!--')
    if comment_start != -1:
        str_table = str_table[comment_start + 4:]

    table_html = bs(str_table, 'html.parser')

    rows_tbody = table_html.find_all('tbody')[0]

    # Search the body once instead of re-running find_all for every label.
    return [
        th.contents[0] if th.contents else 'n/a'
        for th in rows_tbody.find_all('th')
    ]

In [6]:
def webscrape_rows(tables: list, table_num: int):
    """Return the body of one scraped table as a list of rows.

    Each row is ``[label, value, value, ...]``: the row's ``<th>`` label
    followed by one value per ``<td>``. The number of values per row is fixed
    by the first body row; shorter rows are padded with ``'n/a'`` and longer
    rows are truncated.

    Parameters
    ----------
    tables : list
        Sequence of parsed table wrappers, as returned by ``tables_html``.
    table_num : int
        Position in ``tables`` of the wrapper to read.

    Returns
    -------
    list[list]
        One list per ``<tr>`` of the first ``<tbody>``; empty when the body
        has no rows.

    Raises
    ------
    IndexError
        If ``table_num`` is out of range or the markup has no ``<tbody>``.
    """
    str_table = str(tables[table_num])

    # Skip past the HTML comment opener only when one is present; str.find
    # returns -1 otherwise and adding 4 would slice from the wrong offset.
    comment_start = str_table.find('<!--')
    if comment_start != -1:
        str_table = str_table[comment_start + 4:]

    table_html = bs(str_table, 'html.parser')

    rows_tr = table_html.find_all('tbody')[0].find_all('tr')
    if not rows_tr:
        return []

    # The first body row decides how many data cells every row contributes.
    num_cols = len(rows_tr[0].find_all('td'))

    row_data = []
    for tr in rows_tr:
        # Read the label and the cells from the SAME <tr>, so a row with no
        # <th> (or with several) cannot shift labels onto the wrong row.
        label = tr.find('th')
        cells = tr.find_all('td')

        row = [_first_child_or_na(label) if label is not None else 'n/a']
        for c in range(num_cols):
            row.append(_cell_value(cells[c]) if c < len(cells) else 'n/a')

        row_data.append(row)

    return row_data


def _first_child_or_na(node):
    """Return the first child of ``node``, or ``'n/a'`` when it has none."""
    return node.contents[0] if node.contents else 'n/a'


def _cell_value(cell):
    """Return a cell's value: the first link's first child if the cell holds a link, else the cell's own first child."""
    link = cell.find('a')
    return _first_child_or_na(link if link is not None else cell)

# Fix Column Labels

In [7]:
#fix defense and fumbles table

In [8]:
def fix_col_labels(table_df_li: list, team: str):
    
    
    #Team Stats and Rankings
    table_df_li[0].columns = [
        'Player', 'Points Scored by Team', 'Total Yds and TO', 'Offensic Plays: Pass Attempts + Rush Attempts + Times Sacked', 
        'Yards per Offensive Play', 'Team Turnovers Lost', 'Fumbles Lost by Player or Team', '1stD', 'Passes Completed', 'Passes Attempted', 
        'Yards Gained by Passing', 'Passing Touchdowns', 'Interceptions Thrown', 'Net Yards Gained per Pass Attempt', 'First Downs by Passing',
        'Rushing Attempts', 'Rushing Yards', 'Rushin Touchdowns', 'Rushing Yards per Attempt', 'First Downs by Rushing', 'Penalties committed by Team and Accepted',
        'Penalties in Yards Committed by Team', 'First Downs by Penalty', 'Number of Drives', '% of Drives Ending in an Offensive Score',
        '% of Drives Ending in an Offensive Turnover', 'Average Starting Field Position', 'Average Time per Drive', 'Average # of Plays per Drive',
        'Net Yards per Drive', 'Average Points Scored per Drive'
    ]
    
    #Schedule and Game Results
    
        #fix col to names
    try:
        table_df_li[1].columns = ['Week', 'Day', 'Date', 'Time', 'n/a', 'Win/Loss', 'Overtime', 'Team Record', 'Home/Away',
                              'Opponent', 'Points Scored', 'Points Allowed', 'Offense 1st Down', 'Total Yards Gained of Offense', 'Total yards Gained by Passing',
                              'Total Yards Gained by Rushing', 'Offense Turnovers', 'Defense 1st Down', 'Total Yards Allowed by Defense',
                              'Total Passing Yards Allowed by Defense', 'Total Rushing Yards Allowed by Defense', 'Defense Turnovers', 
                              'Offense', 'Defense', 'Sp. Tms']

        table_df_li[1].drop(columns=['n/a'], inplace=True)

        
    except:        
        table_df_li[1].columns = ['Week', 'Day', 'Date', 'Time', 'Win/Loss', 'Overtime', 'Team Record', 'Home/Away',
                              'Opponent', 'Points Scored', 'Points Allowed', 'Offense 1st Down', 'Total Yards Gained of Offense', 'Total yards Gained by Passing',
                              'Total Yards Gained by Rushing', 'Offense Turnovers', 'Defense 1st Down', 'Total Yards Allowed by Defense',
                              'Total Passing Yards Allowed by Defense', 'Total Rushing Yards Allowed by Defense', 'Defense Turnovers', 
                              'Offense', 'Defense', 'Sp. Tms']
        # home/away games rows
    table_df_li[1].iloc[:, 7] = ['away' if r == '@' else 'home' for r in table_df_li[1].iloc[:, 7]]          

    
    #Team Conversions
    table_df_li[2].columns = [
        'Player', '3rd Down Attempts in Game', '3rd Down Conversions', '3rd Down Conversion %', '4th Down Attempts in Game',
        '4th Down Conversions in Game', '4th Down Conversion %', 'Red Zone Attempts', 'Touchdowns Scored After the Team Entered the Red Zone', 
        '% of the Time a Team Reaches the Red Zone and Scores a Touchdown', 
    ]

    
    #Passing
    table_df_li[3].columns = [
        'No.', 'Player', 'Age', 'Position', 'Games Played', 'Games Started as an Offensive or Defensive Player', 
        'Team Record in Games Stareted by This QB', 'Passes Completed', 'Passes Attemped', '% of Passes Completed', 
        'Yards Gained by Passing', 'Passing Touchdowns', '% of Touchdowns Thrown when Attempting to Pass', 'Interceptions Thrown',
        '% of Times Interceped when Attempting to Pass', 'First Downs Passing', 'Passing Success Rate', 'Longest Completed Pass Thrown',
        'Yards Gained per Pass Attempt', 'Adjusted Yards gained per Pass Attempt', 'Yards Gained per Pass Completion', 
        'Yards Gained per Game Played', 'QB Rating', 'ESPN QB Rating', 'Times Sacked', 'Yards Lost due to Sacks', 
        '% of Time Sacked when Attempting to Pass', 'Net Yards Gained per Pass Attempt', 'Adejsuted Net Yards per Pass Attempt',
        'Comebacks led by QB', 'Game-winning Drives led by QB'
    ]
    
    
    #Rushing and Receiving
    table_df_li[4].columns = [
        'No.', 'Player', 'Age', 'Position', 'Games Played', 'Games Started as an Offensive or Defensive Player', 'Rushing Attempts',
        'Rushing Yards Gained', 'Rushing Touchdown', 'First Downs Rushing', 'Rushing Success Rate', 'Longest Rushing Attempt', 
        'Rushing Yards per Attempt', 'Rushing Yards per Game', 'Rushing Attempts per Game', 'Pass Targets', 'Receptions', 'Receiving Yards',
        'Receiving Yards per Reception', 'Receiving Touchdowns', 'First Downs Receiving', 'Receiving Success Rate', 'Longest Reception', 
        'Receptions per Game', 'Receiving Yards per Game', 'Catch%', 'Receiving Yards per Target', 'Touches: Rushing Attempts and Receptions',
        'Scrimmage Yards per Touch: Rushing + Receiving Yardage per Opportunity', 'Yards from Scrimmage: Receiving and Rushing Yards', 
        'Rushing and Receiving Touchdowns', 'Lost and Recovered Fumbles'
    ]
    
    #kick and punt returns
    table_df_li[5].columns = [
        'No.', 'Player', 'Age', 'Position', 'Games Played', 'Games Started', 'Punts Returned',
        'Punts Return Yardage', 'Punts Returned for Touchdown', 'Longest Punt Return', 
        'Yards per Punt Return', 'Kickoff Returns', 'Yardage for Kickoffs Returned', 
        'Kickoffs Returned for a Touchdown', 'Longest Kickoff Return', 'Yards per Kickoff Return',
        'All-purpose Yards'
    ]
    
    #Kicking
    
    table_df_li[6].columns = [
        'No.', 'Player', 'Age', 'Position', 'Games Played', 'Games Started', 
        'FGA 0-19', 'FGM 0-19', 'FGA 20-29', 'FGM 20-29', 'FGA 30-39', 'FGM 30-39',
        'FGA 40-49', 'FGM 40-49', 'FGA 50+', 'FGM 50+', 'Field Goals Attempted', 'Field Goals Made',
        'Longest Field Goal Made', '% of Field Goals Made', 'Extra Points Attempted', 'Extra Points Made',
        'Extra Point Percentage', 'Kickoffs', 'Kickoff Yards', 'Kickoff Touchbacks', 
        '% Kickoff was a Touchback', 'Kickoff Average Yardage'
        

    ]
    
    #Punting
    table_df_li[7].columns = [
        'No.', 'Player', 'Age', 'Pos', 'Games Played', 'Games Started', 'Times Puned',
        'Total Punt Yardage', 'Yards per Punt', 'Punt Return Yardage by Opposition', 
        'Punt Net yards', 'Punt Net Yards per Punt', 'Longest Punt', 
        'Punts Resulting in a Touchback', '% of Punts Resulting in a Touchback', 
        'Punts Inside Opp. 20 Yard Line', '% of Punts Downed Inside Opp. 20 Yard Line',
        'Times Punts Blocked'
    ]
    
    #defense and fumbles
    table_df_li[8].columns = [
        'No.', 'Player', 'Age', 'Pos', 'Games Played', 'Games Started', 'Passes Intercepted on Defense',
        'Yards Interceptions were Returned', 'Interceptions Returned for Touchdowns', 
        'Longest Interception Return', 'Passes Defended by Defensive Player', '# Forced Fumble by Opp.', 
        '# Fumbled both Lost and Recovered by Own Team', 'Fumbles Recovered by Original Fumbler', 
        'Yards Recovered Fumbles were Returned', 'Fumbles Recovered for Touchdown', 'Sacks', 
        'Tackles Solo+Assisted', 'Solo Tackles', 'Assisted Tackles', 'Tackles for Loss', 
        'Quarterback Hits', 'Safeties Scored by Player/Team'
        
    ]
    
    #Scoring Summary
    table_df_li[9].columns = [
         'No.', 'Player', 'Age', 'Pos', 'Games Played', 'Games Started', 'Rush TD', 'Reception TD', 
        'Punt Return TD', 'Kick Return TD', 'Fumble Return TD', 'Interception TD', 'Other TD', 
        'All Touchdown Scored', '2-Point Conversions Made', 'Two-Point Conversions Attempted', 
        'Defensive Two-Point Conversions', 'Extra Points Made', 'Extra Points Allowed', 'Field Goals Made',
        'Field Goals Attempted', 'Safeties Scored by Player/Team', 'Total Points Scored by all Means',
        'Poins per Game'
    ]
    
    try:
        #touchdown log
        table_df_li[10].drop(columns=['n/a'], inplace=True)
    
        #opponenet touchdown log
        table_df_li[11].drop(columns=['n/a'], inplace=True)
    except:
        pass
    
    #touchdown log
    table_df_li[10].columns = [
        'Rank', 'Date', 'Opponent', 'Results', 'Quarter', 'Distance', 'Type', 'Detail'
    ]
    
        #opponent touchdown log
    table_df_li[11].columns = [
        'Rank', 'Date', 'Opponent', 'Results', 'Quarter', 'Distance', 'Type', 'Detail'
    ]
    
    for i in range(0, len(table_df_li)):
        
        num_rows = len(table_df_li[i].values)

        
        table_df_li[i].insert(1, 'Team', np.repeat(team, num_rows))
    
    return table_df_li



# DF!

In [9]:
#df = pd.DataFrame(index=index, data=row_data, columns=columns)
#df

In [10]:
def webscrape_tables(tables: list, table_num: int):
    """Build a DataFrame from one scraped table.

    This reuses ``webscrape_columns`` and ``webscrape_rows`` (defined earlier
    in this notebook) rather than repeating their parsing code, so header and
    row extraction cannot drift apart between the functions.

    Parameters
    ----------
    tables : list
        Sequence of parsed table wrappers, as returned by ``tables_html``.
    table_num : int
        Position in ``tables`` of the wrapper to convert.

    Returns
    -------
    pandas.DataFrame
        One row per table row. The row label is both the DataFrame index and
        the first data column; column names come from the table header.
    """
    columns = webscrape_columns(tables, table_num)

    # Every row already starts with its own label, so the index is read off
    # the rows and is guaranteed to line up with the data.
    row_data = webscrape_rows(tables, table_num)
    index = [row[0] for row in row_data]

    return pd.DataFrame(index=index, data=row_data, columns=columns)

# Final Save CSV Function

In [11]:
def webscrape_create_dataframes_save_csv(team: str, year: int):
    """Scrape one team's season page, label its tables, and save them to disk.

    Parameters
    ----------
    team : str
        Team name as spelled in the mapping below (e.g. ``'Cardinals'``).
    year : int
        Season to download; it is substituted into the page URL.

    Returns
    -------
    list
        The twelve labelled DataFrames, in the order of ``table_name_li``.

    Raises
    ------
    KeyError
        If ``team`` is not one of the known team names.

    Notes
    -----
    Files are written under ``NFL_Data_<year>/<team>/``. The file names carry
    no extension; they are left unchanged so that anything already reading
    these paths keeps working.
    """
    # Team name -> team id used in the page URL. A plain dict replaces the
    # one-row DataFrame that served as a lookup table.
    team_url_ids = {
        'Cardinals': 'crd',
        'Falcons': 'atl',
        'Ravens': 'rav',
        'Buffalos': 'buf',  # key spelling kept: callers pass 'Buffalos'
        'Panthers': 'car',
        'Bears': 'chi',
        'Bengals': 'cin',
        'Browns': 'cle',
        'Cowboys': 'dal',
        'Broncos': 'den',
        'Lions': 'det',
        'Packers': 'gnb',
        'Texans': 'htx',
        'Colts': 'clt',
        'Jaguars': 'jax',
        'Chiefs': 'kan',
        'Raiders': 'rai',
        'Chargers': 'sdg',
        'Rams': 'ram',
        'Dolphins': 'mia',
        'Vikings': 'min',
        'Patriots': 'nwe',
        'Saints': 'nor',
        'Giants': 'nyg',
        'Jets': 'nyj',
        'Eagles': 'phi',
        'Steelers': 'pit',
        '49ers': 'sfo',
        'Seahawks': 'sea',
        'Buccaneers': 'tam',
        'Titans': 'oti',
        'Commanders': 'was',
    }

    # string url
    str_url = 'https://www.pro-football-reference.com/teams/{}/{}.htm'

    # get team url ID (KeyError for an unknown team name)
    team_url_str = team_url_ids[team]

    # URL
    url = str_url.format(team_url_str, year)

    # request html text
    tables_html_request = tables_html(url=url)

    # delay execute so not to trigger ip address block
    time.sleep(random.randrange(7,9))

    # table name list with team and year to format; its order is the order
    # in which the tables are read from the page
    table_name_li = [
        'team_stats_and_ranking_{}_{}',
        'schedule_and_game_results_{}_{}',
        'team_conversions_{}_{}',
        'passing_{}_{}',
        'rushing_and_receiving_{}_{}',
        'kick_and_punt_returns_{}_{}',
        'kicking_{}_{}',
        'punting_{}_{}',
        'defense_and_fumbles_{}_{}',
        'scoring_summary_{}_{}',
        'touchdown_log_{}_{}',
        'opponent_touchdown_log_{}_{}'
    ]

    # file location
    file_loc = 'NFL_Data_{}/{}/'.format(year, team)

    # Derive the table count from the name list so the two cannot disagree.
    table_num = len(table_name_li)

    table_df_li = [webscrape_tables(tables=tables_html_request, table_num=i) for i in range(0, table_num)]

    table_df_li = fix_col_labels(table_df_li=table_df_li, team=team)

    # to_csv does not create missing directories, so make sure the target
    # folder exists before the first write.
    os.makedirs(file_loc, exist_ok=True)

    for table_name, table_df in zip(table_name_li, table_df_li):
        table_df.to_csv(file_loc + table_name.format(team, year))

    return table_df_li
        
        

# Download CSV Files from all teams 2010-2022

In [12]:
# Team names iterated by the download loops; each one is a key of the
# team-name -> URL-id mapping used by the save functions.
team_names = [
    'Cardinals', 'Falcons', 'Ravens', 'Buffalos',
    'Panthers', 'Bears', 'Bengals', 'Browns',
    'Cowboys', 'Broncos', 'Lions', 'Packers',
    'Texans', 'Colts', 'Jaguars', 'Chiefs',
    'Raiders', 'Chargers', 'Rams', 'Dolphins',
    'Vikings', 'Patriots', 'Saints', 'Giants',
    'Jets', 'Eagles', 'Steelers', '49ers',
    'Seahawks', 'Buccaneers', 'Titans', 'Commanders',
]

In [13]:
# Seasons for the bulk download: 2010 through 2022 inclusive.
years = range(2010, 2022 + 1)

In [26]:
# This loop scrapes every team's tables for the 2010-2022 seasons.
# Do not run it casually: a full pass takes roughly 45-60 minutes.
#for team in team_names:
#    for year in years:
#        webscrape_create_dataframes_save_csv(team=team, year=year)

# 2023

In [22]:
# Rebinds `years` to the single 2023 season; this replaces the 2010-2022
# range assigned to the same name earlier in the notebook.
years = [2023]
years

[2023]

In [23]:
def webscrape_create_dataframes_save_csv_2023(team: str, year: int):
    """Scrape one team's season page, skipping its first table wrapper, and save the tables.

    This mirrors ``webscrape_create_dataframes_save_csv`` except that the
    first scraped ``table_wrapper`` is discarded before the twelve tables are
    read. Presumably the 2023 pages carry one extra leading wrapper; confirm
    against a live page.

    Parameters
    ----------
    team : str
        Team name as spelled in the mapping below (e.g. ``'Cardinals'``).
    year : int
        Season to download; it is substituted into the page URL.

    Returns
    -------
    list
        The twelve labelled DataFrames, in the order of ``table_name_li``.

    Raises
    ------
    KeyError
        If ``team`` is not one of the known team names.

    Notes
    -----
    Files are written under ``NFL_Data_<year>/<team>/``. The file names carry
    no extension; they are left unchanged so that anything already reading
    these paths keeps working.
    """
    # Team name -> team id used in the page URL. A plain dict replaces the
    # one-row DataFrame that served as a lookup table.
    team_url_ids = {
        'Cardinals': 'crd',
        'Falcons': 'atl',
        'Ravens': 'rav',
        'Buffalos': 'buf',  # key spelling kept: callers pass 'Buffalos'
        'Panthers': 'car',
        'Bears': 'chi',
        'Bengals': 'cin',
        'Browns': 'cle',
        'Cowboys': 'dal',
        'Broncos': 'den',
        'Lions': 'det',
        'Packers': 'gnb',
        'Texans': 'htx',
        'Colts': 'clt',
        'Jaguars': 'jax',
        'Chiefs': 'kan',
        'Raiders': 'rai',
        'Chargers': 'sdg',
        'Rams': 'ram',
        'Dolphins': 'mia',
        'Vikings': 'min',
        'Patriots': 'nwe',
        'Saints': 'nor',
        'Giants': 'nyg',
        'Jets': 'nyj',
        'Eagles': 'phi',
        'Steelers': 'pit',
        '49ers': 'sfo',
        'Seahawks': 'sea',
        'Buccaneers': 'tam',
        'Titans': 'oti',
        'Commanders': 'was',
    }

    # string url
    str_url = 'https://www.pro-football-reference.com/teams/{}/{}.htm'

    # get team url ID (KeyError for an unknown team name)
    team_url_str = team_url_ids[team]

    # URL
    url = str_url.format(team_url_str, year)

    # request html text
    tables_html_request = tables_html(url=url)

    # delay execute so not to trigger ip address block
    time.sleep(random.randrange(7,9))

    # table name list with team and year to format; its order is the order
    # in which the tables are read from the page
    table_name_li = [
        'team_stats_and_ranking_{}_{}',
        'schedule_and_game_results_{}_{}',
        'team_conversions_{}_{}',
        'passing_{}_{}',
        'rushing_and_receiving_{}_{}',
        'kick_and_punt_returns_{}_{}',
        'kicking_{}_{}',
        'punting_{}_{}',
        'defense_and_fumbles_{}_{}',
        'scoring_summary_{}_{}',
        'touchdown_log_{}_{}',
        'opponent_touchdown_log_{}_{}'
    ]

    # file location
    file_loc = 'NFL_Data_{}/{}/'.format(year, team)

    # Derive the table count from the name list so the two cannot disagree.
    table_num = len(table_name_li)

    # Discard the first wrapper once, rather than re-slicing for every table.
    season_tables = tables_html_request[1:]

    table_df_li = [webscrape_tables(tables=season_tables, table_num=i) for i in range(0, table_num)]

    table_df_li = fix_col_labels(table_df_li=table_df_li, team=team)

    # to_csv does not create missing directories, so make sure the target
    # folder exists before the first write.
    os.makedirs(file_loc, exist_ok=True)

    for table_name, table_df in zip(table_name_li, table_df_li):
        table_df.to_csv(file_loc + table_name.format(team, year))

    return table_df_li
        
        

In [24]:
# Scrape and save the tables of every team in `team_names` for each season
# currently held in `years`. Each call performs a web request followed by a
# 7-8 second pause inside the save function.
for team in team_names:
    for year in years:
        webscrape_create_dataframes_save_csv_2023(team=team, year=year) 

In [18]:
#sort teams alpha

In [19]:
        data=[
            [
                'crd', #Cardinals
                'atl', #Falcons
                'rav', #Ravens
                'buf', #Buffalos
                'car', #Panthers
                'chi', #Bears
                'cin', #Bengals
                'cle', #Browns
                'dal', #Cowboys
                'den', #Broncos
                'det', #Lions
                'gnb', #Packers
                'htx', #Texans
                'clt', #Colts
                'jax', #Jaguars
                'kan', #Chiefs
                'rai', #Raiders
                'sdg', #Chargers
                'ram', #Rams
                'mia', #Dolphins
                'min', #Vikings
                'nwe', #Patriots
                'nor', #Saints
                'nyg', #Giants
                'nyj', #Jets
                'phi', #Eagles
                'pit', #Steelers
                'sfo', #49ers
                'sea', #Seahawks
                'tam', #Buccaneers
                'oti', #Titans
                'was' #Commanders
            ]
        ], 


In [20]:
        columns=[
            'Cardinals',
            'Falcons',
            'Ravens',
            'Buffalos',
            'Panthers',
            'Bears',
            'Bengals',
            'Browns',
            'Cowboys',
            'Broncos',
            'Lions',
            'Packers',
            'Texans',
            'Colts',
            'Jaguars',
            'Chiefs',
            'Raiders',
            'Chargers',
            'Rams',
            'Dolphins',
            'Vikings',
            'Patriots',
            'Saints',
            'Giants',
            'Jets',
            'Eagles',
            'Steelers',
            '49ers',
            'Seahawks',
            'Buccaneers',
            'Titans',
            'Commanders'
        ]

In [21]:
# Sorts the team-name list in place. Only `columns` is reordered: the
# abbreviation list `data` keeps its original order, so the two lists no
# longer correspond position by position after this cell runs.
columns.sort()