In [92]:
!python3 -m ensurepip --default-pip
!python3 -m pip install --upgrade pip
!python3 -m pip install requests beautifulsoup4 pandas

Looking in links: /var/folders/12/5y1ph4xx3v72zr1wc64m1yrh0000gq/T/tmp0kxkrdnz


# Scraping Premier League match stats from FBref to a CSV file

In [93]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import random

# Site root; prepended to the relative match-report hrefs found in the schedule table.
BASE_URL = "https://fbref.com"

# Seasons to scrape. Each entry is interpolated into the schedule URL and into
# the output CSV file name, so it must use the "YYYY-YYYY" form shown here.
seasons = ["2021-2022"]

# Pool of browser User-Agent strings, grouped by browser/platform.
# One of them is picked at random below and sent with every request.
user_agents = [
    # Chrome (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    
    # Chrome (Mac)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",

    # Firefox (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:114.0) Gecko/20100101 Firefox/114.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:113.0) Gecko/20100101 Firefox/113.0",

    # Firefox (Mac)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:114.0) Gecko/20100101 Firefox/114.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:113.0) Gecko/20100101 Firefox/113.0",

    # Safari (Mac)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Version/15.6 Safari/537.36",

    # Edge (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.0.0",

    # Android Chrome
    "Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Mobile Safari/537.36",

    # iPhone Safari
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/15.6 Mobile/15E148 Safari/537.36"
]

# Request headers shared by every HTTP call in this cell. The User-Agent is
# chosen once, here, so the same value is reused for the whole run rather
# than being re-drawn per request.
headers = {"User-Agent": random.choice(user_agents)}

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the whole scrape.
REQUEST_TIMEOUT = 10


def _parse_score(score):
    """Split a schedule score such as '2–0' into ``(home_goals, away_goals)``.

    Returns:
        A tuple of two ints, or ``None`` when the text is not exactly two
        integers separated by an en dash.
    """
    try:
        home_goals, away_goals = map(int, score.split('–'))
    except ValueError:
        # Covers both non-numeric parts and a wrong number of parts.
        return None
    return home_goals, away_goals


def _parse_schedule_row(row):
    """Convert one ``<tr>`` of the fixtures table into a match dict.

    Returns:
        The match dict, or ``None`` when the row should be skipped: too few
        cells, a missing date/team/score, or a score that cannot be parsed.
    """
    cols = row.find_all('td')

    # The referee is read from cols[10]; a shorter row cannot be indexed safely.
    if len(cols) < 11:
        return None

    # The match-report link lives in the 12th cell when that cell exists.
    match_link_tag = cols[11].find('a') if len(cols) > 11 else None
    match_link = f"{BASE_URL}{match_link_tag['href']}" if match_link_tag else None

    date = cols[1].text.strip()
    hour = cols[2].text.strip()
    team_home = cols[3].text.strip()
    xg_home_team = cols[4].text.strip()
    score = cols[5].text.strip()
    xg_away_team = cols[6].text.strip()
    team_away = cols[7].text.strip()
    referee = cols[10].text.strip()

    # Rows without a date, both teams and a score are not usable results.
    if not (date and team_home and team_away and score):
        return None

    goals = _parse_score(score)
    if goals is None:
        print(f"Skipping {team_home} v {team_away} on {date}: unrecognised score {score!r}")
        return None
    home_goals, away_goals = goals

    # '1' = home win, '2' = away win, 'X' = draw.
    result = '1' if home_goals > away_goals else '2' if home_goals < away_goals else 'X'

    return {
        'Date': date,
        'Time': hour,
        'Home Team': team_home,
        'XGH': xg_home_team,
        'Away Team': team_away,
        'XGA': xg_away_team,
        'Score': score,
        'FTHG': home_goals,  # full time home goals
        'FTAG': away_goals,  # full time away goals
        'HTGDIFF': home_goals - away_goals,  # Home team goal difference
        'ATGDIFF': away_goals - home_goals,  # Away team goal difference
        'Result': result,
        'Referee': referee,
        'Match Link': match_link
    }


def fetch_shots_on_target(match):
    """Add shots-on-target and possession figures to ``match`` in place.

    Downloads the page at ``match['Match Link']`` and reads the
    'Shots on Target' and 'Possession' rows of its ``team_stats`` block.

    Args:
        match: Match dict; gains the keys ``STH``/``STA`` (home/away shots on
            target) and ``PH``/``PA`` (home/away possession). Values are the
            strings scraped from the page, or ``None`` if anything fails.

    Returns:
        None. Failures are reported with ``print`` rather than raised, so one
        bad match page does not stop the run.
    """
    # Start from "unknown" so every match ends up with the same four keys.
    stats = {"STH": None, "STA": None, "PH": None, "PA": None}

    match_link = match.get('Match Link')
    if not match_link:
        # Nothing to download: skip the politeness delay and the doomed request.
        print("Error fetching shots on target: no match link available")
        match.update(stats)
        return

    try:
        # Randomised pause between match pages to avoid hammering the site.
        time.sleep(random.uniform(5, 8))
        match_page = requests.get(match_link, headers=headers, timeout=REQUEST_TIMEOUT)
        print(match_link)
        print(match_page)
        # Surface HTTP errors directly instead of failing later in the parse.
        match_page.raise_for_status()

        match_soup = BeautifulSoup(match_page.text, 'html.parser')
        match_stats_div = match_soup.find('div', {'id': 'team_stats'})

        # Each stat has a header row; the values are in the row right after it.
        shots_on_target_row = match_stats_div.select_one("tr:has(th:-soup-contains('Shots on Target')) + tr")
        possession_row = match_stats_div.select_one("tr:has(th:-soup-contains('Possession')) + tr")
        cols_shots = shots_on_target_row.find_all('td')
        cols_possession = possession_row.find_all('td')

        # Home cell: keep the text before " of ".
        shots_home = cols_shots[0].text.strip().split(" of ")[0]
        # Away cell: take the first number of the "<n> of <m>" pair.
        away_numbers = re.findall(r'(\d+) of (\d+)', cols_shots[1].text.strip())
        shots_away = away_numbers[0][0] if away_numbers else None

        possesion_home = cols_possession[0].text.strip().split("%")[0]
        away_possession = cols_possession[1].text.strip().split("%")[0]

        # Only commit once every value parsed, so a partial failure leaves all four as None.
        stats = {
            "STH": shots_home,
            "STA": shots_away,
            "PH": possesion_home,
            "PA": away_possession,
        }
    except Exception as e:
        # Deliberately broad: network, HTTP and HTML-shape errors are all
        # treated as "stats unavailable" for this match.
        print(f"Error fetching shots on target: {e}")

    match.update(stats)


for season in seasons:
    print(f"Scraping season: {season}....")

    url = f"{BASE_URL}/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures"

    print(url)

    # Randomised pause before hitting the schedule page.
    time.sleep(random.uniform(3, 7))

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

    print(response)

    soup = BeautifulSoup(response.text, 'html.parser')

    matches_table_data = soup.find('table', {'class': 'stats_table'})
    matches_tbody_data = matches_table_data.find('tbody') if matches_table_data else None

    if matches_tbody_data is None:
        # Typically an error or rate-limit page with no fixtures table.
        print(f"No fixtures table found for {season} (HTTP {response.status_code}); skipping season")
        continue

    match_rows_data = matches_tbody_data.find_all('tr')

    matches = []

    for row in match_rows_data:
        parsed_match = _parse_schedule_row(row)
        if parsed_match is not None:
            matches.append(parsed_match)

    for match in matches:
        fetch_shots_on_target(match)

    # errors='ignore' keeps this working when no matches were parsed and the
    # frame therefore has no 'Match Link' column.
    df = pd.DataFrame(matches).drop(columns=['Match Link'], errors='ignore')

    # Save as CSV
    file_name = f"matches-{season}.csv"
    df.to_csv(file_name, index=False)

    print(f"Saved data for {season} to {file_name}")

    print(df.head())

Scraping season: 2021-2022....
https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures
<Response [200]>
https://fbref.com/en/matches/3adf2aa7/Brentford-Arsenal-August-13-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/e62685d4/Manchester-United-Leeds-United-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/0b346a62/Leicester-City-Wolverhampton-Wanderers-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/4eb36e37/Burnley-Brighton-and-Hove-Albion-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/6f454493/Chelsea-Crystal-Palace-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/814b563c/Watford-Aston-Villa-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/c99ebbf5/Everton-Southampton-August-14-2021-Premier-League
<Response [200]>
https://fbref.com/en/matches/c52500ad/Norwich-City-Liverpool-August-14-2021-Prem