# Scraping Serie A Data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

We download the Seria A data using the Requests module, parse the data using the BeautifulSoup module and finally, we load the data in a Pandas DataFrame. This is done for the following seasons: 2022-2023, 2021-2022, 2020-2021. We then store the combined data in a csv file locally.

Example for a single team and a single season.

In [2]:
# defines the URL for the 2022-2023 Seria A season standings data
seriaAStandings = "https://fbref.com/en/comps/11/2022-2023/2022-2023-Serie-A-Stats"

In [3]:
# sends a GET request to specified URL, and returns a response object 
responseData = requests.get(seriaAStandings)

In [4]:
# construct a BeautifulSoup object based on responseData.text i.e., content of the response
# this allows us to parse the content
soup = BeautifulSoup(responseData.text)

In [5]:
# select the 1st table with class stats_table i.e., Regular Season
# from there we select all <a> elements i.e., the link elements to each squad
# we pass a css selector to the .select()
standingsTableSquadLinks = soup.select('table.stats_table')[0].select('a')

In [6]:
standingsTableSquadLinks

[<a href="/en/squads/d48ad4ff/2022-2023/Napoli-Stats">Napoli</a>,
 <a href="/en/players/8c90fd7a/Victor-Osimhen">Victor Osimhen</a>,
 <a href="/en/players/9413b19f/Alex-Meret">Alex Meret</a>,
 <a href="/en/squads/7213da33/2022-2023/Lazio-Stats">Lazio</a>,
 <a href="/en/players/4431aed2/Ciro-Immobile">Ciro Immobile</a>,
 <a href="/en/players/277da414/Ivan-Provedel">Ivan Provedel</a>,
 <a href="/en/squads/d609edc0/2022-2023/Internazionale-Stats">Inter</a>,
 <a href="/en/players/f7036e1c/Lautaro-Martinez">Lautaro Martínez</a>,
 <a href="/en/players/e9c0c1b2/Andre-Onana">André Onana</a>,
 <a href="/en/squads/dc56fe14/2022-2023/Milan-Stats">Milan</a>,
 <a href="/en/players/20730eae/Rafael-Leao">Rafael Leão</a>,
 <a href="/en/players/fcb38f57/Mike-Maignan">Mike Maignan</a>,
 <a href="/en/squads/922493f3/2022-2023/Atalanta-Stats">Atalanta</a>,
 <a href="/en/players/7c104bb7/Ademola-Lookman">Ademola Lookman</a>,
 <a href="/en/players/a111cf41/Juan-Musso">Juan Musso</a>,
 <a href="/en/squads/cf

In [7]:
# get the href value from each <a> element
squadPostfixLinks = map(lambda x: x.get('href'), standingsTableSquadLinks)

# note some property values are for players, we are only interested in the actual teams
squadPostfixLinks = list(filter(lambda x: '/squads' in x, squadPostfixLinks))

In [8]:
squadPostfixLinks

['/en/squads/d48ad4ff/2022-2023/Napoli-Stats',
 '/en/squads/7213da33/2022-2023/Lazio-Stats',
 '/en/squads/d609edc0/2022-2023/Internazionale-Stats',
 '/en/squads/dc56fe14/2022-2023/Milan-Stats',
 '/en/squads/922493f3/2022-2023/Atalanta-Stats',
 '/en/squads/cf74a709/2022-2023/Roma-Stats',
 '/en/squads/e0652b02/2022-2023/Juventus-Stats',
 '/en/squads/421387cf/2022-2023/Fiorentina-Stats',
 '/en/squads/1d8099f8/2022-2023/Bologna-Stats',
 '/en/squads/105360fe/2022-2023/Torino-Stats',
 '/en/squads/21680aa4/2022-2023/Monza-Stats',
 '/en/squads/04eea015/2022-2023/Udinese-Stats',
 '/en/squads/e2befd26/2022-2023/Sassuolo-Stats',
 '/en/squads/a3d88bd8/2022-2023/Empoli-Stats',
 '/en/squads/c5577084/2022-2023/Salernitana-Stats',
 '/en/squads/ffcbe334/2022-2023/Lecce-Stats',
 '/en/squads/68449f6d/2022-2023/Spezia-Stats',
 '/en/squads/0e72edf2/2022-2023/Hellas-Verona-Stats',
 '/en/squads/9aad3a77/2022-2023/Cremonese-Stats',
 '/en/squads/8ff9e3b3/2022-2023/Sampdoria-Stats']

In [9]:
squadURLs = list(map(lambda x: f"https://fbref.com{x}", squadPostfixLinks))

In [10]:
squadURLs

['https://fbref.com/en/squads/d48ad4ff/2022-2023/Napoli-Stats',
 'https://fbref.com/en/squads/7213da33/2022-2023/Lazio-Stats',
 'https://fbref.com/en/squads/d609edc0/2022-2023/Internazionale-Stats',
 'https://fbref.com/en/squads/dc56fe14/2022-2023/Milan-Stats',
 'https://fbref.com/en/squads/922493f3/2022-2023/Atalanta-Stats',
 'https://fbref.com/en/squads/cf74a709/2022-2023/Roma-Stats',
 'https://fbref.com/en/squads/e0652b02/2022-2023/Juventus-Stats',
 'https://fbref.com/en/squads/421387cf/2022-2023/Fiorentina-Stats',
 'https://fbref.com/en/squads/1d8099f8/2022-2023/Bologna-Stats',
 'https://fbref.com/en/squads/105360fe/2022-2023/Torino-Stats',
 'https://fbref.com/en/squads/21680aa4/2022-2023/Monza-Stats',
 'https://fbref.com/en/squads/04eea015/2022-2023/Udinese-Stats',
 'https://fbref.com/en/squads/e2befd26/2022-2023/Sassuolo-Stats',
 'https://fbref.com/en/squads/a3d88bd8/2022-2023/Empoli-Stats',
 'https://fbref.com/en/squads/c5577084/2022-2023/Salernitana-Stats',
 'https://fbref.com/

In [11]:
# for this example, we pick the AC Milan team
milanURL = squadURLs[3]

# we get the AC Milan data from its corresponding URL
milanResponseData = requests.get(milanURL)

In [12]:
# get the Scores & Fixtures table and load it into a Pandas DataFrame
# note pd.read_html returns a list of dataframes -> only one dataframe at idx=0
milanMatchesData = pd.read_html(milanResponseData.text, match="Scores & Fixtures")[0]

In [13]:
milanMatchesData.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2022-08-13,18:30,Serie A,Matchweek 1,Sat,Home,W,4,2,Udinese,2.3,0.5,59,70197.0,Davide Calabria,4-2-3-1,Livio Marinelli,Match Report,
1,2022-08-21,20:45,Serie A,Matchweek 2,Sun,Away,D,1,1,Atalanta,1.3,0.5,60,19216.0,Davide Calabria,4-2-3-1,Fabio Maresca,Match Report,
2,2022-08-27,20:45,Serie A,Matchweek 3,Sat,Home,W,2,0,Bologna,1.9,0.5,48,70430.0,Davide Calabria,4-2-3-1,Gianluca Manganiello,Match Report,
3,2022-08-30,18:30,Serie A,Matchweek 4,Tue,Away,D,0,0,Sassuolo,0.7,1.1,62,18053.0,Theo Hernández,4-2-3-1,Giovanni Ayroldi,Match Report,
4,2022-09-03,18:00,Serie A,Matchweek 5,Sat,Home,W,3,2,Inter,1.3,1.8,47,75475.0,Davide Calabria,4-2-3-1,Daniele Chiffi,Match Report,


In [14]:
# we also want to get the shooting match logs of the particular team we selected
# we first create a BeautifulSoup object based on the milanResponseData
soup = BeautifulSoup(milanResponseData.text)

In [15]:
# we get all <a> elements within the Milan data page
anchorElems = soup.select('a')

In [16]:
anchorElems

[<a class="pt" href="https://fbref.com/pt/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('pt');">Português</a>,
 <a class="en" href="https://fbref.com/en/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('en');">English</a>,
 <a class="fr" href="https://fbref.com/fr/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('fr');">Français</a>,
 <a class="it" href="https://fbref.com/it/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('it');">Italiano</a>,
 <a class="de" href="https://fbref.com/de/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('de');">Deutsch</a>,
 <a class="es" href="https://fbref.com/es/squads/dc56fe14/2022-2023/Milan-Stats" onclick="sr_set_trans_choice('es');">Español</a>,
 <a href="https://www.sports-reference.com/?utm_source=fb&amp;utm_medium=sr_xsite&amp;utm_campaign=2023_01_srnav"><svg height="15px" width="20px"><use xlink:href="#ic-sr-pennant"></use></svg> Sports Reference ®

In [17]:
# get the href value from each <a> elements
postfixLinks = map(lambda x: x.get('href'), anchorElems)

# note we are only interested in the 1st all_comps/shooting/ link
postfixLinkToShooting = list(filter(lambda x: x and 'all_comps/shooting/' in x, postfixLinks))[0]

In [18]:
postfixLinkToShooting

'/en/squads/dc56fe14/2022-2023/matchlogs/all_comps/shooting/Milan-Match-Logs-All-Competitions'

In [19]:
# get AC Milan shooting data from its corresponding URL
milanResponseShootingData = requests.get(f"https://fbref.com{postfixLinkToShooting}")

In [20]:
# get the Shooting table and load it into a Pandas DataFrame
# note pd.read_html returns a list of dataframes -> only one dataframe at idx=0
milanShootingData = pd.read_html(milanResponseShootingData.text, match="Shooting")[0]

In [21]:
milanShootingData.head()

Unnamed: 0_level_0,For Milan,For Milan,For Milan,For Milan,For Milan,For Milan,For Milan,For Milan,For Milan,For Milan,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2022-08-13,18:30,Serie A,Matchweek 1,Sat,Home,W,4,2,Udinese,...,19.6,2.0,1,1,2.3,1.5,0.12,1.7,1.5,Match Report
1,2022-08-21,20:45,Serie A,Matchweek 2,Sun,Away,D,1,1,Atalanta,...,20.3,0.0,0,0,1.3,1.3,0.07,-0.3,-0.3,Match Report
2,2022-08-27,20:45,Serie A,Matchweek 3,Sat,Home,W,2,0,Bologna,...,15.1,0.0,0,0,1.9,1.9,0.1,0.1,0.1,Match Report
3,2022-08-30,18:30,Serie A,Matchweek 4,Tue,Away,D,0,0,Sassuolo,...,18.8,1.0,0,0,0.7,0.7,0.06,-0.7,-0.7,Match Report
4,2022-09-03,18:00,Serie A,Matchweek 5,Sat,Home,W,3,2,Inter,...,16.7,1.0,0,0,1.3,1.3,0.06,1.7,1.7,Match Report


In [22]:
# drop 1st row of the milanShootingData as it corresponds to the multi-level heading which we don't need
milanShootingData.columns = milanShootingData.columns.droplevel()

In [23]:
milanShootingData.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2022-08-13,18:30,Serie A,Matchweek 1,Sat,Home,W,4,2,Udinese,...,19.6,2.0,1,1,2.3,1.5,0.12,1.7,1.5,Match Report
1,2022-08-21,20:45,Serie A,Matchweek 2,Sun,Away,D,1,1,Atalanta,...,20.3,0.0,0,0,1.3,1.3,0.07,-0.3,-0.3,Match Report
2,2022-08-27,20:45,Serie A,Matchweek 3,Sat,Home,W,2,0,Bologna,...,15.1,0.0,0,0,1.9,1.9,0.1,0.1,0.1,Match Report
3,2022-08-30,18:30,Serie A,Matchweek 4,Tue,Away,D,0,0,Sassuolo,...,18.8,1.0,0,0,0.7,0.7,0.06,-0.7,-0.7,Match Report
4,2022-09-03,18:00,Serie A,Matchweek 5,Sat,Home,W,3,2,Inter,...,16.7,1.0,0,0,1.3,1.3,0.06,1.7,1.7,Match Report


In [24]:
# merge milanMatchesData and milanShootingData on the common date field
# note for the milanShootingData we are only interested in the following metrics:
# Sh - shots total
# SoT - shots on target 
# Dist - average distance travelled by a shot
# FK - shots from free kicks 
# PK - penalty kicks made
# PKAtts - penalty kicks made
milanData = milanMatchesData.merge(milanShootingData[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [25]:
milanData.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2022-08-13,18:30,Serie A,Matchweek 1,Sat,Home,W,4,2,Udinese,...,4-2-3-1,Livio Marinelli,Match Report,,13,4,19.6,2.0,1,1
1,2022-08-21,20:45,Serie A,Matchweek 2,Sun,Away,D,1,1,Atalanta,...,4-2-3-1,Fabio Maresca,Match Report,,18,7,20.3,0.0,0,0
2,2022-08-27,20:45,Serie A,Matchweek 3,Sat,Home,W,2,0,Bologna,...,4-2-3-1,Gianluca Manganiello,Match Report,,19,7,15.1,0.0,0,0
3,2022-08-30,18:30,Serie A,Matchweek 4,Tue,Away,D,0,0,Sassuolo,...,4-2-3-1,Giovanni Ayroldi,Match Report,,11,2,18.8,1.0,0,0
4,2022-09-03,18:00,Serie A,Matchweek 5,Sat,Home,W,3,2,Inter,...,4-2-3-1,Daniele Chiffi,Match Report,,21,5,16.7,1.0,0,0


Now we need to generalise this to multiple teams and multiple seasons.

In [26]:
# we are interested in the seasons 2020-2021, 2021-2022, 2022-2023
years = [2023, 2022, 2021]

# will store all dataframes for each team and each season
allData = []

yearSerieAStandings = "https://fbref.com/en/comps/11/2022-2023/2022-2023-Serie-A-Stats"

for year in years:
    # get current year standings
    yearResponseData = requests.get(yearSerieAStandings)
    soup = BeautifulSoup(responseData.text)
    standingsTableSquadLinks = soup.select('table.stats_table')[0].select('a')

    # get links to each squad for the particular year
    squadPostfixLinks = map(lambda x: x.get('href'), standingsTableSquadLinks)
    squadPostfixLinks = list(filter(lambda x: '/squads' in x, squadPostfixLinks))
    squadURLs = list(map(lambda x: f"https://fbref.com{x}", squadPostfixLinks))

    # get past season url, and update yearSerieAStanding
    previouSeasonPostfixLink = soup.select("a.prev")[0].get("href")
    yearSerieAStandings = f"https://fbref.com{previouSeasonPostfixLink}"

    # iterate over each time
    for teamURL in squadURLs:
        # individually scrape the match logs for each team
        teamName = teamURL.split("/")[-1].replace("-Stats", "").replace("-", " ")
        teamResponseData = requests.get(teamURL)
        teamMatchesData = pd.read_html(teamResponseData.text, match="Scores & Fixtures")[0]
        
        # get shooting data
        soup = BeautifulSoup(teamResponseData.text)
        anchorElems = soup.select('a')
        postfixLinks = map(lambda x: x.get('href'), anchorElems)
        postfixLinkToShooting = list(filter(lambda x: x and 'all_comps/shooting/' in x, postfixLinks))[0]
        teamResponseShootingData = requests.get(f"https://fbref.com{postfixLinkToShooting}")
        teamShootingData = pd.read_html(teamResponseShootingData.text, match="Shooting")[0]
        # get rid of multi-level heading
        teamShootingData.columns = teamShootingData.columns.droplevel()
        
        # merge the two dataframes
        try:
            # sometimes there are no shooting stats
            teamData = teamMatchesData.merge(teamShootingData[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except:
            continue

        # we remove games which are not in Serie A i.e., remove Champions League games
        teamData = teamData[teamData["Comp"] == "Serie A"]
        # add columns for the season and team data
        teamData["Season"] = year
        teamData["Team"] = teamName
        allData.append(teamData)
        time.sleep(5)

Finally, we concatenate all the individual dataframes stored in 'allData' into a single dataframe.

In [27]:
allDataDF = pd.concat(allData)

In [31]:
allDataDF

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2022-08-15,18:30,Serie A,Matchweek 1,Mon,Away,W,5,2,Hellas Verona,...,Match Report,,25.0,8.0,15.2,0.0,0,0,2023,Napoli
1,2022-08-21,18:30,Serie A,Matchweek 2,Sun,Home,W,4,0,Monza,...,Match Report,,22.0,5.0,15.3,1.0,0,0,2023,Napoli
2,2022-08-28,20:45,Serie A,Matchweek 3,Sun,Away,D,0,0,Fiorentina,...,Match Report,,13.0,2.0,14.7,1.0,0,0,2023,Napoli
3,2022-08-31,20:45,Serie A,Matchweek 4,Wed,Home,D,1,1,Lecce,...,Match Report,,19.0,7.0,17.7,0.0,0,0,2023,Napoli
4,2022-09-03,20:45,Serie A,Matchweek 5,Sat,Away,W,2,1,Lazio,...,Match Report,,19.0,7.0,16.1,0.0,0,0,2023,Napoli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2023-05-08,18:30,Serie A,Matchweek 34,Mon,Away,L,0,2,Udinese,...,Match Report,,11.0,3.0,21.1,1.0,0,0,2021,Sampdoria
37,2023-05-15,20:45,Serie A,Matchweek 35,Mon,Home,D,1,1,Empoli,...,Match Report,,13.0,3.0,16.5,1.0,0,0,2021,Sampdoria
38,2023-05-20,20:45,Serie A,Matchweek 36,Sat,Away,L,1,5,Milan,...,Match Report,,6.0,3.0,17.5,0.0,0,0,2021,Sampdoria
39,2023-05-26,20:45,Serie A,Matchweek 37,Fri,Home,D,2,2,Sassuolo,...,Match Report,,17.0,3.0,15.0,0.0,0,0,2021,Sampdoria


Store the resulting dataframe to a csv file.

In [32]:
allDataDF.to_csv("data.csv")