# Premier League Data Scraping

### Included in this file
1. A breakdown of how the data is extracted for an individual team
2. Reading the data for multiple teams and multiple seasons per team, then exporting it to a .csv file




## 1. Breakdown of how the data is extracted for the first team.

In [1]:
import requests

# Premier League page on fbref.com; the table of team stats is scraped from it below
data_url = "https://fbref.com/en/comps/9/Premier-League-Stats"


In [2]:
# download the page; time out rather than hang forever, and raise on an HTTP
# error status instead of handing an error page to the parser
data = requests.get(data_url, timeout=30)
data.raise_for_status()

In [3]:
from bs4 import BeautifulSoup

# parse the downloaded page; the parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed
soup = BeautifulSoup(data.text, "html.parser")

# extract the table containing the team stats (first table with class "stats_table")
standings_table = soup.select('table.stats_table')[0]

In [4]:
# extract all anchor tags from the table
team_links = standings_table.find_all('a')

# extract the href attribute of each anchor (None when an anchor has no href)
all_links = [link.get('href') for link in team_links]

# extract only the squad links; the truthiness check skips missing hrefs,
# which would otherwise raise TypeError in the substring test
squad_links = [link for link in all_links if link and '/en/squads/' in link]

In [5]:
# prefix every squad path with the site root to get absolute urls
base_url = "https://fbref.com"
team_urls = [base_url + path for path in squad_links]

team_urls[0]


'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats'

In [6]:
import pandas as pd

# get team data for the first team (manchester city)
team_url = team_urls[0]

# time out rather than hang forever, and raise on an HTTP error status
# instead of handing an error page to the table parser
data = requests.get(team_url, timeout=30)
data.raise_for_status()


In [7]:
from io import StringIO

# convert to dataframe; the HTML is wrapped in StringIO because newer pandas
# versions deprecate passing a literal HTML string to read_html
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,,,57,,Fernandinho,4-3-3,Paul Tierney,Match Report,
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.9,1.3,64,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.7,0.1,67,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,3.8,0.1,80,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.9,0.8,61,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,


In [8]:
# scrape the shooting data for manchester city and store in a dataframe

# the parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed
soup = BeautifulSoup(data.text, "html.parser")

# href of every a tag on the page (None when an anchor has no href)
hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

# keep only the links that point at the all-competitions shooting match logs
links = [href for href in hrefs if href and 'all_comps/shooting/' in href]

links


['/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [9]:
# download the first shooting match-log page; time out rather than hang, and
# raise on an HTTP error status instead of parsing an error page
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [10]:
from io import StringIO

# read the "Shooting" table; the HTML is wrapped in StringIO because newer
# pandas versions deprecate passing a literal HTML string to read_html
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [11]:
# summary statistics for the numeric shooting columns
# NOTE(review): the maxima in the output (e.g. 146 Gls, 1052 Sh) look like a
# season-total row is included in the table -- confirm before relying on these
shooting.describe()

Unnamed: 0_level_0,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected
Unnamed: 0_level_1,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
count,59.0,59.0,59.0,59.0,59.0,59.0,51.0,51.0,59.0,59.0,51.0,51.0,51.0,51.0,51.0
mean,4.949153,35.661017,12.237288,33.955932,0.126949,0.372203,15.845098,1.058824,0.372881,0.474576,4.437255,4.103922,0.119216,0.856863,0.798039
std,18.772158,134.683871,46.2815,12.574211,0.08939,0.264608,1.666531,3.770474,1.472721,1.850964,15.551938,14.384352,0.040192,4.780889,4.4352
min,0.0,10.0,1.0,6.3,0.0,0.0,11.6,0.0,0.0,0.0,0.7,0.7,0.05,-2.5,-2.7
25%,1.0,15.0,4.0,25.0,0.065,0.17,14.75,0.0,0.0,0.0,1.6,1.3,0.09,-0.6,-0.6
50%,2.0,18.0,6.0,33.3,0.13,0.37,15.7,0.0,0.0,0.0,2.1,1.9,0.12,0.1,0.1
75%,4.0,21.5,8.0,40.6,0.19,0.5,16.85,1.0,0.0,0.0,3.05,2.8,0.14,1.25,1.25
max,146.0,1052.0,361.0,61.5,0.33,1.0,19.0,27.0,11.0,14.0,113.1,104.6,0.22,32.9,30.4


In [12]:
# flatten the two-row header: discard the outer level (level 0) and keep the inner stat names
shooting.columns = shooting.columns.droplevel(0)

In [13]:
# attach the per-match shooting stats to the match information, joined on the match date
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
team_data = pd.merge(matches, shooting[shooting_columns], on="Date")

team_data.describe()

Unnamed: 0,xG,xGA,Poss,Attendance,Sh,SoT,Dist,FK,PK,PKatt
count,50.0,50.0,58.0,56.0,58.0,58.0,50.0,50.0,58.0,58.0
mean,2.264,0.774,66.034483,45755.625,18.137931,6.224138,15.848,0.54,0.189655,0.241379
std,1.004025,0.568137,8.048026,14188.773408,4.868258,2.967917,1.68332,0.705951,0.437573,0.470548
min,0.7,0.0,49.0,13405.0,10.0,1.0,11.6,0.0,0.0,0.0
25%,1.6,0.325,60.25,34850.0,15.0,4.0,14.725,0.0,0.0,0.0
50%,2.05,0.75,67.0,52172.0,17.5,6.0,15.7,0.0,0.0,0.0
75%,2.975,1.0,71.75,53207.25,21.0,8.0,16.875,1.0,0.0,0.0
max,4.6,2.6,80.0,73793.0,31.0,15.0,19.0,3.0,2.0,2.0


## 2. How the data is extracted for multiple teams for multiple seasons

In [14]:
# seasons to scrape, newest first
years = [2022, 2021, 2020]
years

[2022, 2021, 2020]

In [15]:
# accumulator for the scraping loop: one DataFrame is appended per team and season
all_matches = []

In [16]:
# starting page for the crawl; the scraping loop overwrites this with the
# previous-season link after each pass
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [17]:
import time
from io import StringIO

# Scrape every team of every season in `years` and append one DataFrame per
# team and season to `all_matches`.
#
# NOTE(review): the crawl starts at standings_url and follows the page's
# "previous season" link once per pass, so the Season label is only correct if
# the starting page really is the season named by years[0] -- confirm.
for year in years:

    # get the premier league table for the season currently being scraped
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()  # stop on an error page instead of parsing it
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    # each team has a link in the table, find all the team links
    # (anchors without an href yield None and are skipped by the truthiness check)
    links = [a.get("href") for a in standings_table.find_all('a')]
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # on the page there is a button to go to the previous season;
    # its target becomes the standings page for the next pass
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # get the data for each team
    for team_url in team_urls:

        # clean the team name, e.g. ".../Manchester-City-Stats" -> "Manchester City"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # get the data for the current team page
        data = requests.get(team_url, timeout=30)
        data.raise_for_status()

        # pause once per team, ahead of every skip path below, so that skipped
        # teams are throttled too and the crawl is less likely to be blocked
        time.sleep(0.5)

        # the season data is stored in a "Scores & Fixtures" table
        # (StringIO: newer pandas deprecates passing literal HTML to read_html)
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text, "html.parser")

        # on each page there is a button to get the shooting data for each match
        links = [a.get("href") for a in soup.find_all('a')]
        links = [href for href in links if href and 'all_comps/shooting/' in href]
        if not links:
            # best effort: a team page without a shooting link is skipped, not fatal
            print(f"Skipping {team_name} ({year}): no shooting link found")
            continue
        data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
        data.raise_for_status()

        # the shooting data is stored in a "Shooting" table with a two-level
        # header; drop the outer level to get plain column names
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()

        # merge the match information with the shooting data
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError as error:
            # best effort: skip a team whose tables cannot be merged, but say so
            print(f"Skipping {team_name} ({year}): {error}")
            continue

        # filter by competition; copy so the column assignments below write to
        # an independent frame rather than a slice of the merged one
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        # add in season and team columns
        team_data["Season"] = year
        team_data["Team"] = team_name

        # add data to all_matches
        all_matches.append(team_data)
    

    

            


In [None]:
# stack every per-team frame into one table, then lower-case its column names
match_data = pd.concat(all_matches)
match_data.columns = list(map(str.lower, match_data.columns))

# export the combined table to a csv file
output_path = "premier_league_data.csv"
match_data.to_csv(output_path)
