In [5]:
# import necessary packages
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import time
from IPython.display import display
from functools import reduce
from tqdm import tqdm

# Scrape stats page and parse html

In [6]:
# link to the standing table
table = "https://fbref.com/en/comps/20/Bundesliga-Stats"

In [7]:
# download html
table_data = requests.get(table)
time.sleep(1)

In [8]:
# initialize a soup
table_soup = BeautifulSoup(table_data.text)

In [9]:
# select the whole table and check length
len(table_soup.select("table.stats_table"))

24

There are 24 elements, we need to check the ids

In [10]:
for element in table_soup.select("table.stats_table"):
    print(element.get('id'))

results2023-2024201_overall
results2023-2024201_home_away
stats_squads_standard_for
stats_squads_standard_against
stats_squads_keeper_for
stats_squads_keeper_against
stats_squads_keeper_adv_for
stats_squads_keeper_adv_against
stats_squads_shooting_for
stats_squads_shooting_against
stats_squads_passing_for
stats_squads_passing_against
stats_squads_passing_types_for
stats_squads_passing_types_against
stats_squads_gca_for
stats_squads_gca_against
stats_squads_defense_for
stats_squads_defense_against
stats_squads_possession_for
stats_squads_possession_against
stats_squads_playing_time_for
stats_squads_playing_time_against
stats_squads_misc_for
stats_squads_misc_against


In [11]:
# since we need overall stats, we choose the first element
standings_table = table_soup.select('table.stats_table')[0]

In [12]:
# find all a-tags in standings_table
a_tags = standings_table.find_all('a')

In [13]:
# get all hrefs for each a-Tags
hrefs = [tag.get('href') for tag in a_tags]

In [14]:
# filter a link with info of squads
links = [link for link in hrefs if "/squads/" in link]

In [15]:
# show all links
links

['/en/squads/c7a9f859/Bayer-Leverkusen-Stats',
 '/en/squads/054efa67/Bayern-Munich-Stats',
 '/en/squads/598bc722/Stuttgart-Stats',
 '/en/squads/add600ae/Dortmund-Stats',
 '/en/squads/acbb6a5b/RB-Leipzig-Stats',
 '/en/squads/f0ac8ee6/Eintracht-Frankfurt-Stats',
 '/en/squads/a486e511/Freiburg-Stats',
 '/en/squads/033ea6b8/Hoffenheim-Stats',
 '/en/squads/18d9d2a7/Heidenheim-Stats',
 '/en/squads/62add3bf/Werder-Bremen-Stats',
 '/en/squads/0cdc4311/Augsburg-Stats',
 '/en/squads/4eaa11d7/Wolfsburg-Stats',
 '/en/squads/32f3ee20/Monchengladbach-Stats',
 '/en/squads/b42c6323/Bochum-Stats',
 '/en/squads/7a41008f/Union-Berlin-Stats',
 '/en/squads/bc357bf7/Koln-Stats',
 '/en/squads/a224b06a/Mainz-05-Stats',
 '/en/squads/6a6967fc/Darmstadt-98-Stats']

In [16]:
len(links)

18

In [17]:
# create full links to access
team_urls = [f"https://fbref.com{link}" for link in links]
team_urls

['https://fbref.com/en/squads/c7a9f859/Bayer-Leverkusen-Stats',
 'https://fbref.com/en/squads/054efa67/Bayern-Munich-Stats',
 'https://fbref.com/en/squads/598bc722/Stuttgart-Stats',
 'https://fbref.com/en/squads/add600ae/Dortmund-Stats',
 'https://fbref.com/en/squads/acbb6a5b/RB-Leipzig-Stats',
 'https://fbref.com/en/squads/f0ac8ee6/Eintracht-Frankfurt-Stats',
 'https://fbref.com/en/squads/a486e511/Freiburg-Stats',
 'https://fbref.com/en/squads/033ea6b8/Hoffenheim-Stats',
 'https://fbref.com/en/squads/18d9d2a7/Heidenheim-Stats',
 'https://fbref.com/en/squads/62add3bf/Werder-Bremen-Stats',
 'https://fbref.com/en/squads/0cdc4311/Augsburg-Stats',
 'https://fbref.com/en/squads/4eaa11d7/Wolfsburg-Stats',
 'https://fbref.com/en/squads/32f3ee20/Monchengladbach-Stats',
 'https://fbref.com/en/squads/b42c6323/Bochum-Stats',
 'https://fbref.com/en/squads/7a41008f/Union-Berlin-Stats',
 'https://fbref.com/en/squads/bc357bf7/Koln-Stats',
 'https://fbref.com/en/squads/a224b06a/Mainz-05-Stats',
 'http

# Extract match stats

In this section we work with an example, that is Bayer Leverkusen

In [18]:
# get Leverkusen's url
Leverkusen_url = team_urls[0]
Leverkusen_url

'https://fbref.com/en/squads/c7a9f859/Bayer-Leverkusen-Stats'

In [19]:
# get Leverkusen's html
Leverkusen_html = requests.get(Leverkusen_url)
time.sleep(1)

In [20]:
# grab all matches of Leverkusen
Leverkusen_matches_list = pd.read_html(Leverkusen_html.text, match="Scores & Fixtures")
# get all matches as a dataframe
Leverkusen_matches = Leverkusen_matches_list[0]
Leverkusen_matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,,,66.0,11035.0,Lukáš Hrádecký,4-2-3-1,Tom Bauer,Match Report,
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,1.4,1.6,44.0,29464.0,Lukáš Hrádecký,3-4-3,Felix Brych,Match Report,
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,2.7,0.9,60.0,54042.0,Lukáš Hrádecký,3-4-3,Christian Dingert,Match Report,
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,2.8,0.4,72.0,29653.0,Lukáš Hrádecký,3-4-3,Sven Jablonski,Match Report,
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,2.1,2.1,49.0,75000.0,Lukáš Hrádecký,3-4-3,Daniel Schlager,Match Report,


In [21]:
Leverkusen_matches = Leverkusen_matches.loc[:,['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss']]
Leverkusen_matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,,,66.0
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,1.4,1.6,44.0
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,2.7,0.9,60.0
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,2.8,0.4,72.0
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,2.1,2.1,49.0


- GF = goals for
- GA = goals against
- xG = expected goals
- xGA = expected goals allowed
- Poss = possession, calculated as the percentage of passess attempted

In [22]:
Leverkusen_matches.shape

(45, 13)

# Get stats

## Get shooting stats

In [23]:
# initialize a soup
Leverkusen_soup = BeautifulSoup(Leverkusen_html.text)

In [24]:
# find a-tags
Leverkusen_a_tags = Leverkusen_soup.find_all('a')

In [25]:
# get links with 'href'
Leverkusen_hrefs = [tag.get('href') for tag in Leverkusen_a_tags]

In [26]:
# filter with shooting
Leverkusen_shooting_links = [href for href in Leverkusen_hrefs if href and 'all_comps/shooting/' in href]
Leverkusen_shooting_links

['/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions']

In [27]:
# get shooting data
Leverkusen_shooting_data = requests.get(f"https://fbref.com{Leverkusen_shooting_links[0]}")
time.sleep(1)

In [28]:
# get Leverkusen's shooting stats
Leverkusen_shooting = pd.read_html(Leverkusen_shooting_data.text, match='Shooting')[0]
# drop top level
Leverkusen_shooting.columns = Leverkusen_shooting.columns.droplevel()
Leverkusen_shooting = Leverkusen_shooting.loc[:,['Date', 'Sh', 'SoT', 'SoT%']]

Leverkusen_shooting.head()

Unnamed: 0,Date,Sh,SoT,SoT%
0,2023-08-12,22,11,50.0
1,2023-08-19,11,7,63.6
2,2023-08-26,24,11,45.8
3,2023-09-02,25,13,52.0
4,2023-09-15,12,4,33.3


- Sh = shots total
- SoT = shots on target
- SoT% = shots on target %

We write a function to scrape stats data for other stats

In [29]:
def get_stats(team_hrefs, stat, match_string, indices_to_remove=None, selected_features=None):
    """
    team_hrefs = list of all hrefs associated with a team
    stat = a string, that is desired stat
    match_string = a string, used to match    
    indices_to_remove = a list of top level indices to remove. Some tables have multilevel index.
    selected_features = a list of features to select after dropping top level index
    
    """

    # filter links with desired stat
    links = [href for href in team_hrefs if href and "all_comps/" + stat +"/" in href]
    # get stats data
    stat_data = requests.get(f"https://fbref.com{links[0]}")    
    # get stats table
    stat_table = pd.read_html(stat_data.text, match=match_string)[0]    
    # Handle case where indices_to_remove is None
    if indices_to_remove is None:
        indices_to_remove = []  # Set it to an empty list    
    # top-level indices to remove
    columns_to_drop = \
        stat_table.columns[stat_table.columns.get_level_values(0).isin(indices_to_remove)]
    # remove some top-level indices
    stat_table = stat_table.drop(columns=columns_to_drop)
    # drop top-level index
    stat_table.columns = stat_table.columns.droplevel()    
    # Handle case where selected_features is None
    if selected_features is not None:
        # select features
        stat_table = stat_table.loc[:,selected_features]    
    
    return stat_table

## Get Goalkeeping stats

In [30]:
Leverkusen_goalkeeping =\
    get_stats(Leverkusen_hrefs, "keeper",'Goalkeeping', selected_features=['Date', 'Saves', 'Save%', 'Stp', 'Stp%', '#OPA'])
time.sleep(1)
Leverkusen_goalkeeping.head()

Unnamed: 0,Date,Saves,Save%,Stp,Stp%,#OPA
0,2023-08-12,1,100.0,,,
1,2023-08-19,4,66.7,2.0,18.2,1.0
2,2023-08-26,2,100.0,1.0,6.7,1.0
3,2023-09-02,0,0.0,0.0,0.0,3.0
4,2023-09-15,7,77.8,2.0,11.1,1.0


- Saves = number of saves
- Save% = save percentage
- Stp = number of crosses into penalty area which was succesfully stopped by the goalkeeper
- Stp% = crosses stopped %
- #OPA = number of defensive action outside of penalty area

## Get passing stats

In [31]:
# get passing stats
Leverkusen_passing = get_stats(Leverkusen_hrefs, "passing", "Passing", indices_to_remove=['Short', 'Medium', 'Long'], 
          selected_features=['Date', 'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist', 'Ast', 'KP', 'PPA', 'CrsPA'])
# rename features
Leverkusen_passing.columns = \
        ['Date', 'Passes_Cmp', 'Passes_Att', 'Passes_Cmp%', 'TotDist', 'PrgDist', 'Ast', 'KP', 'PPA', 'CrsPA']
time.sleep(1)
Leverkusen_passing.head()

Unnamed: 0,Date,Passes_Cmp,Passes_Att,Passes_Cmp%,TotDist,PrgDist,Ast,KP,PPA,CrsPA
0,2023-08-12,,,,,,5,,,
1,2023-08-19,380.0,475.0,80.0,6130.0,2150.0,3,8.0,8.0,3.0
2,2023-08-26,649.0,740.0,87.7,9491.0,3417.0,3,22.0,21.0,0.0
3,2023-09-02,727.0,812.0,89.5,10511.0,3807.0,5,20.0,17.0,2.0
4,2023-09-15,512.0,601.0,85.2,8322.0,3157.0,0,10.0,12.0,0.0


- Passes_Cmp = passes completed
- Passes_Att = passes attempted
- Passes_Cmp% = pass completion %
- TotDist = total distance in yards, that completed passes have traveled in any direction
- PrgDist = total distance in yards, that completed passes have traveled towards opponents's goal
- Ast = assists
- KP = key passes, directly lead to a shot
- PPA = passes into penalty area
- CrsPA = crosses into penalty area

## Get defensive actions stats

In [32]:
Leverkusen_defense = get_stats(Leverkusen_hrefs, "defense", "Defensive Actions", indices_to_remove=['Challenges'],
                               selected_features=['Date', 'Tkl', 'TklW', 'Blocks', 'Int', 'Clr'])
time.sleep(1)
Leverkusen_defense.head()

Unnamed: 0,Date,Tkl,TklW,Blocks,Int,Clr
0,2023-08-12,,10,,6,
1,2023-08-19,15.0,11,14.0,8,15.0
2,2023-08-26,16.0,11,9.0,9,11.0
3,2023-09-02,7.0,4,10.0,4,3.0
4,2023-09-15,23.0,12,9.0,21,19.0


- Tkl = tackles
- TklW = tackles won
- Blocks = number of times blocking the ball by standing in its path
- Int = interceptions
- Clr = clearances

## Get possession stats

In [33]:
Leverkusen_possession = get_stats(Leverkusen_hrefs, "possession", "Possession", indices_to_remove=['Carries', 'Receiving'],
                                 selected_features=['Date', 'Touches', 'Att', 'Succ', 'Succ%'])
# rename features
Leverkusen_possession.columns = ['Date', 'Touches', 'Take_ons_Att', 'Succ', 'Succ%']
time.sleep(1)
# first 5 rows
Leverkusen_possession.head()

Unnamed: 0,Date,Touches,Take_ons_Att,Succ,Succ%
0,2023-08-12,,,,
1,2023-08-19,574.0,22.0,11.0,50.0
2,2023-08-26,854.0,16.0,7.0,43.8
3,2023-09-02,913.0,26.0,15.0,57.7
4,2023-09-15,744.0,33.0,16.0,48.5


- Touches = number of times a player touches the ball
- Att = take-ons attempted
- Succ = succesful take-ons
- Succ% = succesful take-ons %

# Cleaning and Merge

In [34]:
# list of dataframes
dfs = [Leverkusen_matches, Leverkusen_shooting, 
       Leverkusen_goalkeeping, Leverkusen_passing, Leverkusen_defense, Leverkusen_possession]
# merge dfs
Leverkusen_data = reduce(lambda  left,right: pd.merge(left,right,on=['Date']), dfs)
Leverkusen_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,CrsPA,Tkl,TklW,Blocks,Int,Clr,Touches,Take_ons_Att,Succ,Succ%
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,,,10,,6,,,,,
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,3.0,15.0,11,14.0,8,15.0,574.0,22.0,11.0,50.0
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,0.0,16.0,11,9.0,9,11.0,854.0,16.0,7.0,43.8
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,2.0,7.0,4,10.0,4,3.0,913.0,26.0,15.0,57.7
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,0.0,23.0,12,9.0,21,19.0,744.0,33.0,16.0,48.5


In [35]:
Leverkusen_data.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Sh', 'SoT', 'SoT%', 'Saves', 'Save%',
       'Stp', 'Stp%', '#OPA', 'Passes_Cmp', 'Passes_Att', 'Passes_Cmp%',
       'TotDist', 'PrgDist', 'Ast', 'KP', 'PPA', 'CrsPA', 'Tkl', 'TklW',
       'Blocks', 'Int', 'Clr', 'Touches', 'Take_ons_Att', 'Succ', 'Succ%'],
      dtype='object')

In [36]:
Leverkusen_data.columns.duplicated(keep=False)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

# Scraping through seasons and teams

In [37]:
# a list of years to scrape
years = list(range(2023, 2018, -1))

In [38]:
years

[2023, 2022, 2021, 2020, 2019]

In [39]:
all_matches = []

In [40]:
table

'https://fbref.com/en/comps/20/Bundesliga-Stats'

In [42]:
# let's scrape!
for year in tqdm(years, desc="Scraping Progress. Please be patient!"):
    table_data = requests.get(table)
    time.sleep(1)
    table_soup = BeautifulSoup(table_data.text)
    standings_table = table_soup.select('table.stats_table')[0]
    
    hrefs = [tag.get('href') for tag in standings_table.find_all('a')]
    links = [link for link in hrefs if '/squads/' in link]
    team_urls = [f"https://fbref.com{link}" for link in links]
    
    previous_season = table_soup.select("a.prev")[0].get("href")
    table = f"https://fbref.com/{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        
        # get all matches
        team_html = requests.get(team_url)
        time.sleep(1)
        team_matches = pd.read_html(team_html.text, match="Scores & Fixtures")[0]
        
        # get stats
        team_soup = BeautifulSoup(team_html.text)
        team_hrefs = [tag.get("href") for tag in team_soup.find_all('a')]
        
        # get shooting stats
        team_shooting = get_stats(team_hrefs, "shooting", "Shooting", selected_features=['Date', 'Sh', 'SoT', 'SoT%'])
        time.sleep(1)
        
        # get goalkeeping stats
        team_goalkeeping = get_stats(team_hrefs, "keeper",'Goalkeeping', 
                                     selected_features=['Date', 'Saves', 'Save%', 'Stp', 'Stp%', '#OPA'])
        time.sleep(1)
        # get passing stats
        team_passing = get_stats(team_hrefs, "passing", "Passing", indices_to_remove=['Short', 'Medium', 'Long'], 
          selected_features=['Date', 'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist', 'Ast', 'KP', 'PPA', 'CrsPA'])
        time.sleep(1)
        
        # get defense stats
        team_defense = get_stats(team_hrefs, "defense", "Defensive Actions", indices_to_remove=['Challenges'],
                               selected_features=['Date', 'Tkl', 'TklW', 'Blocks', 'Int', 'Clr'])
        time.sleep(1)
        
        # get possession stats
        team_possession = get_stats(team_hrefs, "possession", "Possession", indices_to_remove=['Carries', 'Receiving'],
                                 selected_features=['Date', 'Touches', 'Att', 'Succ', 'Succ%'])
        time.sleep(1)
        
        # list of dataframes
        dfs = [team_matches, team_shooting, 
               team_goalkeeping, team_passing, team_defense, team_possession]
               

        try:
            team_data = reduce(lambda  left,right: pd.merge(left,right,on=['Date']), dfs)
        except ValueError:
            continue
            
        team_data = team_data[team_data['Comp']=="Bundesliga"]
        team_data["Season"] = str(year)+"-"+ str(year+1)
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(1)        

Scraping Progress. Please be patient!: 100%|████████████████████████████████████████████| 5/5 [30:36<00:00, 367.27s/it]


In [43]:
len(all_matches)

90

In [50]:
# concatenate all dataframes
match_df = pd.concat(all_matches)

In [54]:
# turn column names in lower case
match_df.columns = [column.lower() for column in match_df.columns]

In [55]:
match_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'sot%', 'saves',
       'save%', 'stp', 'stp%', '#opa', 'cmp', 'att_x', 'cmp%', 'totdist',
       'prgdist', 'ast', 'kp', 'ppa', 'crspa', 'tkl', 'tklw', 'blocks', 'int',
       'clr', 'touches', 'att_y', 'succ', 'succ%', 'season', 'team'],
      dtype='object')

In [57]:
# save datasets to csv file
match_df.to_csv("matches.csv")

In [58]:
# check the first rows
match_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,tklw,blocks,int,clr,touches,att_y,succ,succ%,season,team
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,11.0,14.0,8.0,15.0,574.0,22.0,11.0,50.0,2023-2024,Bayer Leverkusen
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,11.0,9.0,9.0,11.0,854.0,16.0,7.0,43.8,2023-2024,Bayer Leverkusen
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,4.0,10.0,4.0,3.0,913.0,26.0,15.0,57.7,2023-2024,Bayer Leverkusen
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,12.0,9.0,21.0,19.0,744.0,33.0,16.0,48.5,2023-2024,Bayer Leverkusen
6,2023-09-24,15:30,Bundesliga,Matchweek 5,Sun,Home,W,4.0,1.0,Heidenheim,...,10.0,11.0,4.0,12.0,964.0,23.0,14.0,60.9,2023-2024,Bayer Leverkusen
