In [121]:
# import necessary packages
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display

# Scrape stats page and parse html

In [122]:
# URL of the FBref Bundesliga overview page; the standings table is scraped from it below
table = "https://fbref.com/en/comps/20/Bundesliga-Stats"

In [123]:
# Download the standings page. The timeout stops a hung connection from blocking
# the kernel, and raise_for_status() fails here on an HTTP error (e.g. a
# rate-limit response) instead of letting later cells parse an error page.
table_data = requests.get(table, timeout=30)
table_data.raise_for_status()

In [124]:
# Parse the downloaded HTML. The parser is named explicitly so every environment
# builds the same tree and bs4 does not emit its "no parser specified" warning.
table_soup = BeautifulSoup(table_data.text, "html.parser")

In [125]:
# collect every <table class="stats_table"> on the page and count them
stats_tables = table_soup.select("table.stats_table")
len(stats_tables)

24

There are 24 tables, so we need to check their ids to find the one we want.

In [126]:
# print the id attribute of each stats table, one per line
for stats_table in table_soup.select("table.stats_table"):
    table_id = stats_table.get('id')
    print(table_id)

results2023-2024201_overall
results2023-2024201_home_away
stats_squads_standard_for
stats_squads_standard_against
stats_squads_keeper_for
stats_squads_keeper_against
stats_squads_keeper_adv_for
stats_squads_keeper_adv_against
stats_squads_shooting_for
stats_squads_shooting_against
stats_squads_passing_for
stats_squads_passing_against
stats_squads_passing_types_for
stats_squads_passing_types_against
stats_squads_gca_for
stats_squads_gca_against
stats_squads_defense_for
stats_squads_defense_against
stats_squads_possession_for
stats_squads_possession_against
stats_squads_playing_time_for
stats_squads_playing_time_against
stats_squads_misc_for
stats_squads_misc_against


In [127]:
# the overall standings are the first stats table on the page
stats_tables = table_soup.select('table.stats_table')
standings_table = stats_tables[0]

In [128]:
# collect every anchor element inside the standings table
a_tags = standings_table.find_all(name='a')

In [129]:
# read the href attribute of each anchor (None when an anchor has no href)
hrefs = [anchor.get('href') for anchor in a_tags]

In [130]:
# Keep only the links that point to squad pages. `tag.get('href')` yields None
# for anchors without an href, so skip those before the substring test
# (the shooting-link filter later in the notebook uses the same guard).
links = [link for link in hrefs if link and "/squads/" in link]

In [131]:
# display the squad-page paths that survived the filter
links

['/en/squads/c7a9f859/Bayer-Leverkusen-Stats',
 '/en/squads/054efa67/Bayern-Munich-Stats',
 '/en/squads/598bc722/Stuttgart-Stats',
 '/en/squads/add600ae/Dortmund-Stats',
 '/en/squads/acbb6a5b/RB-Leipzig-Stats',
 '/en/squads/f0ac8ee6/Eintracht-Frankfurt-Stats',
 '/en/squads/a486e511/Freiburg-Stats',
 '/en/squads/033ea6b8/Hoffenheim-Stats',
 '/en/squads/62add3bf/Werder-Bremen-Stats',
 '/en/squads/18d9d2a7/Heidenheim-Stats',
 '/en/squads/4eaa11d7/Wolfsburg-Stats',
 '/en/squads/0cdc4311/Augsburg-Stats',
 '/en/squads/32f3ee20/Monchengladbach-Stats',
 '/en/squads/b42c6323/Bochum-Stats',
 '/en/squads/7a41008f/Union-Berlin-Stats',
 '/en/squads/bc357bf7/Koln-Stats',
 '/en/squads/a224b06a/Mainz-05-Stats',
 '/en/squads/6a6967fc/Darmstadt-98-Stats']

In [132]:
# number of squad links found in the standings table
len(links)

18

In [133]:
# prefix each site-relative squad path with the FBref host to get absolute URLs
team_urls = []
for squad_path in links:
    team_urls.append(f"https://fbref.com{squad_path}")
team_urls

['https://fbref.com/en/squads/c7a9f859/Bayer-Leverkusen-Stats',
 'https://fbref.com/en/squads/054efa67/Bayern-Munich-Stats',
 'https://fbref.com/en/squads/598bc722/Stuttgart-Stats',
 'https://fbref.com/en/squads/add600ae/Dortmund-Stats',
 'https://fbref.com/en/squads/acbb6a5b/RB-Leipzig-Stats',
 'https://fbref.com/en/squads/f0ac8ee6/Eintracht-Frankfurt-Stats',
 'https://fbref.com/en/squads/a486e511/Freiburg-Stats',
 'https://fbref.com/en/squads/033ea6b8/Hoffenheim-Stats',
 'https://fbref.com/en/squads/62add3bf/Werder-Bremen-Stats',
 'https://fbref.com/en/squads/18d9d2a7/Heidenheim-Stats',
 'https://fbref.com/en/squads/4eaa11d7/Wolfsburg-Stats',
 'https://fbref.com/en/squads/0cdc4311/Augsburg-Stats',
 'https://fbref.com/en/squads/32f3ee20/Monchengladbach-Stats',
 'https://fbref.com/en/squads/b42c6323/Bochum-Stats',
 'https://fbref.com/en/squads/7a41008f/Union-Berlin-Stats',
 'https://fbref.com/en/squads/bc357bf7/Koln-Stats',
 'https://fbref.com/en/squads/a224b06a/Mainz-05-Stats',
 'https://fbref.com/en/squads/6a6967fc/Darmstadt-98-Stats']

# Extract match stats

In this section we work through a single example team: Bayer Leverkusen.

In [134]:
# Get Leverkusen's url. The list follows the order of the standings table, so
# position 0 is only Leverkusen while they are top; look the club up by the
# name embedded in its URL instead. Raises StopIteration if no such URL exists.
Leverkusen_url = next(url for url in team_urls if "Bayer-Leverkusen" in url)
Leverkusen_url

'https://fbref.com/en/squads/c7a9f859/Bayer-Leverkusen-Stats'

In [135]:
# Download Leverkusen's squad page; bound the wait and fail loudly on an HTTP
# error so the parsing cells below never run on an error page.
Leverkusen_data = requests.get(Leverkusen_url, timeout=30)
Leverkusen_data.raise_for_status()

In [136]:
# Grab every table whose text matches "Scores & Fixtures". The HTML is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated in
# recent pandas releases.
Leverkusen_matches_list = pd.read_html(StringIO(Leverkusen_data.text), match="Scores & Fixtures")
# the first match is the fixtures table; keep it as a DataFrame
Leverkusen_matches = Leverkusen_matches_list[0]
Leverkusen_matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,,,66.0,11035.0,Lukáš Hrádecký,4-2-3-1,Tom Bauer,Match Report,
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,1.4,1.6,44.0,29464.0,Lukáš Hrádecký,3-4-3,Felix Brych,Match Report,
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,2.7,0.9,60.0,54042.0,Lukáš Hrádecký,3-4-3,Christian Dingert,Match Report,
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,2.8,0.4,72.0,29653.0,Lukáš Hrádecký,3-4-3,Sven Jablonski,Match Report,
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,2.1,2.1,49.0,75000.0,Lukáš Hrádecký,3-4-3,Daniel Schlager,Match Report,


In [137]:
# keep only the fixture context, score, expected-goals and possession columns
match_columns = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result',
                 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss']
Leverkusen_matches = Leverkusen_matches.loc[:, match_columns]
Leverkusen_matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,,,66.0
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,1.4,1.6,44.0
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,2.7,0.9,60.0
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,2.8,0.4,72.0
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,2.1,2.1,49.0


- GF = goals for
- GA = goals against
- xG = expected goals
- xGA = expected goals allowed
- Poss = possession, calculated as the percentage of passes attempted

In [138]:
# (rows, columns) of the trimmed fixtures frame
Leverkusen_matches.shape

(44, 13)

# Get stats

## Get shooting stats

In [139]:
# Parse the squad page. The parser is named explicitly so every environment
# builds the same tree and bs4 does not emit its "no parser specified" warning.
Leverkusen_soup = BeautifulSoup(Leverkusen_data.text, "html.parser")

In [140]:
# collect every anchor element on the squad page
Leverkusen_a_tags = Leverkusen_soup.find_all(name='a')

In [141]:
# read the href attribute of each anchor (None when an anchor has no href)
Leverkusen_hrefs = [anchor.get('href') for anchor in Leverkusen_a_tags]

In [142]:
# keep the hrefs that point at the all-competitions shooting match log
Leverkusen_shooting_links = []
for href in Leverkusen_hrefs:
    if href and 'all_comps/shooting/' in href:
        Leverkusen_shooting_links.append(href)
Leverkusen_shooting_links

['/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions',
 '/en/squads/c7a9f859/2023-2024/matchlogs/all_comps/shooting/Bayer-Leverkusen-Match-Logs-All-Competitions']

In [143]:
# Download the shooting match log (the filtered links are all identical, so the
# first one is used). Bound the wait, fail on an HTTP error, then pause before
# the next request to the site.
Leverkusen_shooting_data = requests.get(f"https://fbref.com{Leverkusen_shooting_links[0]}", timeout=30)
Leverkusen_shooting_data.raise_for_status()
time.sleep(1)

In [144]:
# Get Leverkusen's shooting stats. The HTML is wrapped in StringIO because passing
# a literal HTML string to read_html is deprecated in recent pandas releases.
Leverkusen_shooting = pd.read_html(StringIO(Leverkusen_shooting_data.text), match='Shooting')[0]
# the table has a two-row header; drop the top (group) level to get flat names
Leverkusen_shooting.columns = Leverkusen_shooting.columns.droplevel()
# keep the match date (the merge key used later) plus shots and shots on target
Leverkusen_shooting = Leverkusen_shooting.loc[:, ['Date', 'Sh', 'SoT']]

Leverkusen_shooting.head()

Unnamed: 0,Date,Sh,SoT
0,2023-08-12,22,11
1,2023-08-19,11,7
2,2023-08-26,24,11
3,2023-09-02,25,13
4,2023-09-15,12,4


- Sh = shots total
- SoT = shots on target

We write a function that scrapes the match-log table for any other stat category.

In [145]:
def get_stats(stat, match_string, hrefs=None):
    """Download one all-competitions match-log table from FBref.

    Parameters
    ----------
    stat : str
        Path segment that identifies the stat category in the match-log
        link, i.e. the link must contain ``all_comps/<stat>/``.
    match_string : str
        Text handed to ``pd.read_html(match=...)`` to select the table.
    hrefs : iterable of (str or None), optional
        Candidate link paths to search. Defaults to the notebook-level
        ``Leverkusen_hrefs`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The first table on the page that matches ``match_string``, with
        its header exactly as ``pd.read_html`` parsed it.

    Raises
    ------
    IndexError
        If no href contains ``all_comps/<stat>/``.
    requests.HTTPError
        If the site answers with an error status.
    """
    if hrefs is None:
        hrefs = Leverkusen_hrefs

    # filter links with desired stat (anchors without an href come through as None)
    needle = "all_comps/" + stat + "/"
    links = [href for href in hrefs if href and needle in href]
    if not links:
        # same exception type as the bare links[0] lookup, but with a usable message
        raise IndexError(f"no match-log link containing {needle!r} was found")

    # get stats data; bound the wait and surface HTTP errors immediately
    stat_data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
    stat_data.raise_for_status()

    # get stats table; StringIO because literal-HTML input to read_html is deprecated
    stat_table = pd.read_html(StringIO(stat_data.text), match=match_string)[0]

    return stat_table

## Get Goalkeeping stats

In [146]:
Leverkusen_goalkeeping = get_stats("keeper", 'Goalkeeping')
# pause straight after the request so the preview can be the cell's last
# expression; with the sleep last, the head() result was never displayed
time.sleep(1)
# flatten the two-row header, then keep the date key and the keeper columns
Leverkusen_goalkeeping.columns = Leverkusen_goalkeeping.columns.droplevel()
Leverkusen_goalkeeping = Leverkusen_goalkeeping.loc[:, ['Date', 'Saves', 'Opp', 'Stp', '#OPA']]
Leverkusen_goalkeeping.head()

- Saves = number of saves
- Opp = opponent's attempted crosses into the penalty area
- Stp = number of crosses into the penalty area that were successfully stopped by the goalkeeper
- #OPA = number of defensive actions outside of the penalty area

## Get passing stats

In [148]:
# get passing stats
Leverkusen_passing = get_stats("passing", "Passing")
# pause after the request, as the other stat cells do
time.sleep(1)
# find the columns grouped under the top-level headers 'Short', 'Medium' and 'Long'
top_level = Leverkusen_passing.columns.get_level_values(0)
columns_to_drop = Leverkusen_passing.columns[top_level.isin(['Short', 'Medium', 'Long'])]
# drop them so that only one set of Cmp/Att columns is left
Leverkusen_passing = Leverkusen_passing.drop(columns=columns_to_drop)
Leverkusen_passing.head()

Unnamed: 0_level_0,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,For Leverkusen,...,Total,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,PrgDist,Ast,xAG,xA,KP,1/3,PPA,CrsPA,PrgP,Match Report
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,,5,,,,,,,,Match Report
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,2150.0,3,1.3,1.5,8.0,31.0,8.0,3.0,26.0,Match Report
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,3417.0,3,2.4,3.1,22.0,43.0,21.0,0.0,54.0,Match Report
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,3807.0,5,2.4,2.8,20.0,79.0,17.0,2.0,86.0,Match Report
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,3157.0,0,1.1,0.9,10.0,42.0,12.0,0.0,50.0,Match Report


In [149]:
# Drop the top header level. Guarded so the cell can be re-run: once the columns
# are flat, droplevel() would raise because an index must keep at least one level.
if Leverkusen_passing.columns.nlevels > 1:
    Leverkusen_passing.columns = Leverkusen_passing.columns.droplevel()
# select features
Leverkusen_passing = Leverkusen_passing.loc[:,['Date', 'Cmp', 'Att', 'TotDist', 'PrgDist', 'Ast', 'KP', 'PPA', 'CrsPA']]

Leverkusen_passing.head()

Unnamed: 0,Date,Cmp,Att,TotDist,PrgDist,Ast,KP,PPA,CrsPA
0,2023-08-12,,,,,5,,,
1,2023-08-19,380.0,475.0,6130.0,2150.0,3,8.0,8.0,3.0
2,2023-08-26,649.0,740.0,9491.0,3417.0,3,22.0,21.0,0.0
3,2023-09-02,727.0,812.0,10511.0,3807.0,5,20.0,17.0,2.0
4,2023-09-15,512.0,601.0,8322.0,3157.0,0,10.0,12.0,0.0


- Cmp = passes completed
- Att = passes attempted
- TotDist = total distance in yards, that completed passes have traveled in any direction
- PrgDist = total distance in yards that completed passes have traveled towards the opponent's goal
- Ast = assists
- KP = key passes (passes that directly lead to a shot)
- PPA = passes into penalty area
- CrsPA = crosses into penalty area

## Get defensive actions stats

In [37]:
Leverkusen_defense = get_stats("defense", "Defensive Actions")
# pause straight after the request so the preview stays the cell's last expression
time.sleep(1)
# The recorded preview shows flat column names, so flatten a two-row header here.
# The guard keeps the cell safe to re-run and harmless if the header is already flat.
if Leverkusen_defense.columns.nlevels > 1:
    Leverkusen_defense.columns = Leverkusen_defense.columns.droplevel()
Leverkusen_defense.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Tkl%,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Match Report
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,,,,,,6,0,,,Match Report
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,47.1,9.0,14.0,4.0,10.0,8,23,15.0,0.0,Match Report
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,100.0,0.0,9.0,2.0,7.0,9,25,11.0,0.0,Match Report
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,71.4,2.0,10.0,4.0,6.0,4,11,3.0,0.0,Match Report
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,45.0,11.0,9.0,2.0,7.0,21,44,19.0,0.0,Match Report


In [38]:
# list the available defensive-action columns
Leverkusen_defense.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'Tkl',
       'Att', 'Tkl%', 'Lost', 'Blocks', 'Sh', 'Pass', 'Int', 'Tkl+Int', 'Clr',
       'Err', 'Match Report'],
      dtype='object')

## Get possession stats

In [39]:
Leverkusen_possession = get_stats("possession", "Possession")
# pause straight after the request so the preview stays the cell's last expression
time.sleep(1)
# The recorded preview shows flat column names, so flatten a two-row header here.
# The guard keeps the cell safe to re-run and harmless if the header is already flat.
if Leverkusen_possession.columns.nlevels > 1:
    Leverkusen_possession.columns = Leverkusen_possession.columns.droplevel()
Leverkusen_possession.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,TotDist,PrgDist,PrgC,1/3,CPA,Mis,Dis,Rec,PrgR,Match Report
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,,,,,,,,,,Match Report
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,1965.0,1104.0,29.0,21.0,3.0,9.0,7.0,375.0,26.0,Match Report
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,2587.0,1487.0,31.0,24.0,7.0,21.0,13.0,644.0,53.0,Match Report
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,2980.0,1725.0,27.0,30.0,6.0,20.0,14.0,722.0,83.0,Match Report
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,2191.0,1065.0,18.0,5.0,4.0,21.0,6.0,511.0,50.0,Match Report


In [40]:
# list the available possession columns
Leverkusen_possession.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Poss', 'Touches', 'Def Pen', 'Def 3rd', 'Mid 3rd',
       'Att 3rd', 'Att Pen', 'Live', 'Att', 'Succ', 'Succ%', 'Tkld', 'Tkld%',
       'Carries', 'TotDist', 'PrgDist', 'PrgC', '1/3', 'CPA', 'Mis', 'Dis',
       'Rec', 'PrgR', 'Match Report'],
      dtype='object')

# Cleaning and Merge

**begin test**

In [46]:
# (rows, columns) of the defense match log
Leverkusen_defense.shape

(31, 27)

In [47]:
# (rows, columns) of the possession match log
Leverkusen_possession.shape

(31, 34)

In [68]:
# NOTE(review): the recorded output lists the full shooting header, but the cell
# that builds Leverkusen_shooting keeps only Date/Sh/SoT — re-check on a fresh kernel
Leverkusen_shooting.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK',
       'PK', 'PKatt', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
       'Match Report'],
      dtype='object')

In [48]:
# test merge: join the fixtures with selected shooting columns on the match date
# NOTE(review): 'Dist', 'FK', 'PK' and 'PKatt' are not kept by the cell that builds
# Leverkusen_shooting, so this presumably relied on an earlier kernel state — confirm
shooting_subset = Leverkusen_shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']]
a = Leverkusen_matches.merge(shooting_subset, on='Date')
a

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,4-2-3-1,Tom Bauer,Match Report,,22,11,,,1,1
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,3-4-3,Felix Brych,Match Report,,11,7,19.0,0.0,0,0
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,3-4-3,Christian Dingert,Match Report,,24,11,15.8,0.0,0,0
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,3-4-3,Sven Jablonski,Match Report,,25,13,17.3,1.0,0,0
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,3-4-3,Daniel Schlager,Match Report,,12,4,20.7,1.0,1,1
5,2023-09-21,18:45,Europa Lg,Group stage,Thu,Home,W,4.0,0.0,se Häcken,...,4-4-2,Manfredas Lukjančukas,Match Report,,22,10,20.2,0.0,0,0
6,2023-09-24,15:30,Bundesliga,Matchweek 5,Sun,Home,W,4.0,1.0,Heidenheim,...,3-4-3,Christian Dingert,Match Report,,20,9,16.7,0.0,1,1
7,2023-09-30,15:30,Bundesliga,Matchweek 6,Sat,Away,W,3.0,0.0,Mainz 05,...,3-4-3,Benjamin Brand,Match Report,,6,3,20.6,2.0,0,0
8,2023-10-05,21:00,Europa Lg,Group stage,Thu,Away,W,2.0,1.0,no Molde,...,3-4-3,Anastasios Papapetrou,Match Report,,12,3,16.4,1.0,0,0
9,2023-10-08,15:30,Bundesliga,Matchweek 7,Sun,Home,W,3.0,0.0,Köln,...,3-4-3,Felix Zwayer,Match Report,,23,10,16.6,0.0,0,0


In [71]:
# columns of the merged test frame
a.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'Dist', 'FK', 'PK',
       'PKatt'],
      dtype='object')

In [72]:
# NOTE(review): the recorded output still lists Attendance..Notes, which the earlier
# column-selection cell drops — looks like out-of-order execution; confirm
Leverkusen_matches.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes'],
      dtype='object')

In [73]:
# shooting columns used in the test merge: 'Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt'
Leverkusen_shooting.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK',
       'PK', 'PKatt', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
       'Match Report'],
      dtype='object')

**end test**

In [51]:
# NOTE(review): in the visible notebook `shooting` is first assigned inside the
# scraping loop further down, so this preview depends on out-of-order execution
shooting.head()

Unnamed: 0_level_0,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,For Liverpool,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,17.8,0.0,0,0,1.3,1.3,0.1,-0.3,-0.3,Match Report
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,16.8,1.0,0,1,3.0,2.1,0.09,0.0,0.9,Match Report
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,17.2,1.0,0,0,0.9,0.9,0.1,1.1,1.1,Match Report
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,14.7,0.0,0,0,2.5,2.5,0.15,-0.5,-0.5,Match Report
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,15.8,0.0,0,0,2.5,2.5,0.16,-0.5,-0.5,Match Report


In [52]:
# drop top level
# Guarded so the cell can be re-run: once the columns are flat, droplevel()
# would raise because an index must keep at least one level.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [55]:
# NOTE(review): `matches` is indexed like a list here, but the scraping loop further
# down assigns it a single DataFrame — this cell relies on an earlier kernel state
matches[0].head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,1.3,1.4,35.0,40096.0,Virgil van Dijk,4-3-3,Anthony Taylor,Match Report,
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,3.0,1.3,64.0,53145.0,Virgil van Dijk,4-3-3,Thomas Bramall,Match Report,
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,0.9,2.0,41.0,52214.0,Virgil van Dijk,4-3-3,John Brooks,Match Report,
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,2.5,0.7,63.0,50109.0,Trent Alexander-Arnold,4-3-3,Simon Hooper,Match Report,
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,2.5,0.6,65.0,31257.0,Andrew Robertson,4-3-3,Michael Oliver,Match Report,


In [53]:
# preview the shooting table after the top header level was dropped
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,17.8,0.0,0,0,1.3,1.3,0.1,-0.3,-0.3,Match Report
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,16.8,1.0,0,1,3.0,2.1,0.09,0.0,0.9,Match Report
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,17.2,1.0,0,0,0.9,0.9,0.1,1.1,1.1,Match Report
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,14.7,0.0,0,0,2.5,2.5,0.15,-0.5,-0.5,Match Report
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,15.8,0.0,0,0,2.5,2.5,0.16,-0.5,-0.5,Match Report


In [57]:
# list the flattened shooting column names
shooting.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK',
       'PK', 'PKatt', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
       'Match Report'],
      dtype='object')

In [58]:
# merge
team_data = matches[0].merge(shooting[['Date','Sh', 'SoT','Dist', 'FK',
       'PK', 'PKatt']], on='Date')

In [59]:
# display the merged fixtures + shooting frame
team_data

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,4-3-3,Anthony Taylor,Match Report,,13,1,17.8,0.0,0,0
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,4-3-3,Thomas Bramall,Match Report,,25,9,16.8,1.0,0,1
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,4-3-3,John Brooks,Match Report,,9,4,17.2,1.0,0,0
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,4-3-3,Simon Hooper,Match Report,,17,4,14.7,0.0,0,0
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,4-3-3,Michael Oliver,Match Report,,16,5,15.8,0.0,0,0
5,2023-09-21,18:45,Europa Lg,Group stage,Thu,Away,W,3.0,1.0,at LASK,...,4-3-3,Marco Di Bello,Match Report,,13,4,16.1,0.0,1,1
6,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,West Ham,...,4-3-3,Chris Kavanagh,Match Report,,21,6,19.3,2.0,1,1
7,2023-09-27,19:45,EFL Cup,Third round,Wed,Home,W,3.0,1.0,Leicester City,...,4-3-3,Tim Robinson,Match Report,,29,10,,,0,0
8,2023-09-30,17:30,Premier League,Matchweek 7,Sat,Away,L,1.0,2.0,Tottenham,...,4-3-3,Simon Hooper,Match Report,,12,4,14.9,0.0,0,0
9,2023-10-05,20:00,Europa Lg,Group stage,Thu,Home,W,2.0,0.0,be Union SG,...,4-3-3,Morten Krogh,Match Report,,19,9,17.2,0.0,0,0


# Scraping through seasons and teams

In [72]:
# seasons to scrape, newest first (the stop value 2021 is excluded)
years = [*range(2023, 2021, -1)]

In [73]:
# display the seasons that will be scraped
years

[2023, 2022]

In [74]:
# accumulator: the scraping loop appends one DataFrame per team and season
all_matches = []

In [75]:
# starting point for the scraping loop: the FBref Premier League overview page
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [80]:
# Scrape fixtures + shooting logs for every team in each season listed in `years`.
#
# The cell is written to be re-runnable:
# - `all_matches` is reset here, so a second run does not append duplicate rows;
# - the walk through previous seasons uses a local cursor (`season_url`) and leaves
#   `standings_url` untouched, so every run starts at the same season and the
#   `Season` label always matches the page that was actually scraped.
all_matches = []
season_url = standings_url

for year in years:
    season_page = requests.get(season_url, timeout=30)
    season_page.raise_for_status()  # fail here rather than parse an error page
    season_soup = BeautifulSoup(season_page.text, "html.parser")
    standings_table = season_soup.select('table.stats_table')[0]

    # anchors without an href come through as None, so guard the substring test
    squad_links = [a_tag.get('href') for a_tag in standings_table.find_all('a')]
    squad_links = [l for l in squad_links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in squad_links]

    # the "previous season" link is the page to load on the next iteration;
    # strip a leading slash so the joined URL has exactly one
    previous_season = season_soup.select("a.prev")[0].get("href")
    season_url = "https://fbref.com/" + previous_season.lstrip("/")
    time.sleep(1)

    for team_url in team_urls:
        # ".../Manchester-City-Stats" -> "Manchester City"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        team_page = requests.get(team_url, timeout=30)
        team_page.raise_for_status()
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]
        time.sleep(1)

        team_soup = BeautifulSoup(team_page.text, "html.parser")
        shooting_links = [a_tag.get("href") for a_tag in team_soup.find_all('a')]
        shooting_links = [l for l in shooting_links if l and 'all_comps/shooting/' in l]
        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}", timeout=30)
        shooting_page.raise_for_status()
        shooting = pd.read_html(StringIO(shooting_page.text), match='Shooting')[0]
        shooting.columns = shooting.columns.droplevel()
        # pause before any `continue`, so skipped teams are rate-limited too
        time.sleep(1)

        try:
            team_data = matches.merge(shooting[['Date','Sh', 'SoT','Dist', 'FK', 'PK', 'PKatt']], on='Date')
        except ValueError as exc:
            # best effort: skip a team whose tables cannot be merged, but say so
            print(f"Skipping {team_name} ({year}): {exc}")
            continue

        # .copy() so the column assignments below act on an independent frame,
        # not on a filtered view (avoids SettingWithCopyWarning)
        team_data = team_data[team_data['Comp']=="Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        

In [81]:
# stack the per-team frames row-wise into one DataFrame (original row labels are kept)
match_df = pd.concat(objs=all_matches)

In [82]:
# normalise the column names to lower case
match_df.columns = list(map(str.lower, match_df.columns))

In [83]:
# write the combined frame to matches.csv in the working directory
match_df.to_csv("matches.csv")

In [84]:
# NOTE(review): in the recorded output, matches dated 2019 carry season 2023 — verify
# that the loop cell started from the intended `standings_url` before trusting `season`
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2019-08-09,20:00,Premier League,Matchweek 1,Fri,Home,W,4,1,Norwich City,...,Match Report,,15.0,7.0,17.1,1.0,0,0,2023,Liverpool
3,2019-08-17,15:00,Premier League,Matchweek 2,Sat,Away,W,2,1,Southampton,...,Match Report,,15.0,6.0,18.6,1.0,0,0,2023,Liverpool
4,2019-08-24,17:30,Premier League,Matchweek 3,Sat,Home,W,3,1,Arsenal,...,Match Report,,24.0,4.0,18.8,0.0,1,1,2023,Liverpool
5,2019-08-31,17:30,Premier League,Matchweek 4,Sat,Away,W,3,0,Burnley,...,Match Report,,15.0,7.0,21.0,0.0,0,0,2023,Liverpool
6,2019-09-14,12:30,Premier League,Matchweek 5,Sat,Home,W,3,1,Newcastle Utd,...,Match Report,,21.0,8.0,13.6,0.0,0,0,2023,Liverpool
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0,4,Tottenham,...,Match Report,,7.0,1.0,18.9,1.0,0,0,2022,Huddersfield Town
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1,2,Watford,...,Match Report,,13.0,3.0,18.1,1.0,0,0,2022,Huddersfield Town
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0,5,Liverpool,...,Match Report,,5.0,1.0,21.6,0.0,0,0,2022,Huddersfield Town
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Manchester Utd,...,Match Report,,7.0,3.0,19.0,1.0,0,0,2022,Huddersfield Town


In [85]:
# final column names of the combined match table
match_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')