In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Updated, more comprehensive headers
custom_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://fbref.com/en/matches/01d155b4/Southampton-Arsenal-May-25-2025-Premier-League", # The URL itself
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
    # Modern browser headers
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1"
}
url = "https://fbref.com/en/comps/9/Premier-League-Stats"
table_attrs = {"class": "stats_table"}

In [4]:
try:
    # 2. Use requests.get to fetch the content with the custom headers
    response = requests.get(url, headers=custom_headers)
    
    # Raise an exception for bad status codes (like 403, 404, 500, etc.)
    response.raise_for_status() 
    

    print("Request successful, parsing HTML...")

    # 3. Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', attrs=table_attrs)
    if table:
        df = pd.read_html(str(table))[0]
        print("Table successfully extracted into DataFrame:")
        print(df.head())
    else:
        print("Table not found in the HTML content.")
except requests.exceptions.HTTPError as e:
    print(f"HTTP Error occurred: {e}. The server is still blocking the request.")
except Exception as e:
    print(f"An error occurred: {e}")

Request successful, parsing HTML...
Table successfully extracted into DataFrame:
   Rk            Squad  MP   W  D  L  GF  GA  GD  Pts  Pts/MP    xG   xGA  \
0   1          Arsenal  16  11  3  2  30  10  20   36    2.25  26.5   9.6   
1   2  Manchester City  16  11  1  4  38  16  22   34    2.13  30.3  17.6   
2   3      Aston Villa  16  10  3  3  25  17   8   33    2.06  16.7  21.4   
3   4          Chelsea  16   8  4  4  27  15  12   28    1.75  26.9  19.7   
4   5   Crystal Palace  16   7  5  4  20  15   5   26    1.63  26.2  18.5   

    xGD  xGD/90     Last 5  Attendance  \
0  16.9    1.06  W D W L W       60177   
1  12.7    0.79  L W W W W       52373   
2  -4.7   -0.29  W W W W W       41503   
3   7.2    0.45  W D L D W       39611   
4   7.7    0.48  W L W W L       25005   

                            Top Team Scorer            Goalkeeper  Notes  
0  Leandro Trossard, Viktor Gyökeres... - 4            David Raya    NaN  
1                       Erling Haaland - 17  Gianluig

  df = pd.read_html(str(table))[0]


## Get the links of each teams

In [5]:
try:
    response = requests.get(url, headers=custom_headers)
    response.raise_for_status()
    
    print("Request successful, parsing HTML...")
    
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', attrs=table_attrs)
    
    if table:
        # Extract rows from the table
        rows = []
        for tr in table.find_all('tr')[1:]:  # Skip header row
            cells = tr.find_all(['td', 'th'])
            row_data = {}
            for i, cell in enumerate(cells):
                # Get text
                text = cell.get_text(strip=True)
                # Get link if it exists
                link = cell.find('a')
                link_href = link['href'] if link else None
                
                row_data[f'col_{i}_text'] = text
                row_data[f'col_{i}_link'] = link_href
            
            rows.append(row_data)
        
        df = pd.DataFrame(rows)
        print("Table with links extracted:")
        print(df.head())
    else:
        print("Table not found in the HTML content.")
        
except requests.exceptions.HTTPError as e:
    print(f"HTTP Error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

Request successful, parsing HTML...
Table with links extracted:
  col_0_text col_0_link       col_1_text  \
0          1       None          Arsenal   
1          2       None  Manchester City   
2          3       None      Aston Villa   
3          4       None          Chelsea   
4          5       None   Crystal Palace   

                                  col_1_link col_2_text col_2_link col_3_text  \
0          /en/squads/18bb7c10/Arsenal-Stats         16       None         11   
1  /en/squads/b8fd03ef/Manchester-City-Stats         16       None         11   
2      /en/squads/8602292d/Aston-Villa-Stats         16       None         10   
3          /en/squads/cff3d9bb/Chelsea-Stats         16       None          8   
4   /en/squads/47c64c55/Crystal-Palace-Stats         16       None          7   

  col_3_link col_4_text col_4_link  ... col_15_text  \
0       None          3       None  ...       WDWLW   
1       None          1       None  ...       LWWWW   
2       None       

In [7]:
# Extract team code and name from the links
teams_list = []

for idx, row in df.iterrows():
    # Extract code from the link path
    # Link format: /en/squads/47c64c55/Crystal-Palace-Stats
    link = row['col_1_link']
    
    if link:
        # Extract the code (it's between /squads/ and the next /)
        code = link.split('/')[3]  # Gets '47c64c55'
        # Extract the name (it's after the code and before '-Stats')
        name_with_stats = link.split('/')[4]  # Gets 'Crystal-Palace-Stats'
        name = name_with_stats.replace('-Stats', '')  # Gets 'Crystal-Palace'
        teams_list.append({
            'code': code,
            'name': name
        })

# Convert to DataFrame for better visualization
teams_df = pd.DataFrame(teams_list)
print(teams_df)

# Or as a simple list of tuples
teams_simple = [(row['code'], row['name']) for _, row in teams_df.iterrows()]
print("Teams list:", teams_simple)

        code                      name
0   18bb7c10                   Arsenal
1   b8fd03ef           Manchester-City
2   8602292d               Aston-Villa
3   cff3d9bb                   Chelsea
4   47c64c55            Crystal-Palace
5   822bd0ba                 Liverpool
6   8ef52968                Sunderland
7   19538871         Manchester-United
8   d3fd31cc                   Everton
9   d07537b9  Brighton-and-Hove-Albion
10  361ca564         Tottenham-Hotspur
11  b2b47a98          Newcastle-United
12  fd962109                    Fulham
13  cd051869                 Brentford
14  4ba7cbea               Bournemouth
15  e4a775cb         Nottingham-Forest
16  5bfb9659              Leeds-United
17  7c21e445           West-Ham-United
18  943e8050                   Burnley
19  8cec06e1   Wolverhampton-Wanderers
Teams list: [('18bb7c10', 'Arsenal'), ('b8fd03ef', 'Manchester-City'), ('8602292d', 'Aston-Villa'), ('cff3d9bb', 'Chelsea'), ('47c64c55', 'Crystal-Palace'), ('822bd0ba', 'Liverpool'

In [None]:
# Save teams data to files (uses existing teams_df and teams_simple)


teams_df.to_json('teams.json', orient='records', force_ascii=False)

print("Saved: teams.json")

Saved: teams.csv, teams.json, teams_simple.txt


In [9]:
#load json
import json
with open('teams.json', 'r', encoding='utf-8') as f:
    loaded_teams = json.load(f)
print("Loaded teams from JSON:", loaded_teams)

Loaded teams from JSON: [{'code': '18bb7c10', 'name': 'Arsenal'}, {'code': 'b8fd03ef', 'name': 'Manchester-City'}, {'code': '8602292d', 'name': 'Aston-Villa'}, {'code': 'cff3d9bb', 'name': 'Chelsea'}, {'code': '47c64c55', 'name': 'Crystal-Palace'}, {'code': '822bd0ba', 'name': 'Liverpool'}, {'code': '8ef52968', 'name': 'Sunderland'}, {'code': '19538871', 'name': 'Manchester-United'}, {'code': 'd3fd31cc', 'name': 'Everton'}, {'code': 'd07537b9', 'name': 'Brighton-and-Hove-Albion'}, {'code': '361ca564', 'name': 'Tottenham-Hotspur'}, {'code': 'b2b47a98', 'name': 'Newcastle-United'}, {'code': 'fd962109', 'name': 'Fulham'}, {'code': 'cd051869', 'name': 'Brentford'}, {'code': '4ba7cbea', 'name': 'Bournemouth'}, {'code': 'e4a775cb', 'name': 'Nottingham-Forest'}, {'code': '5bfb9659', 'name': 'Leeds-United'}, {'code': '7c21e445', 'name': 'West-Ham-United'}, {'code': '943e8050', 'name': 'Burnley'}, {'code': '8cec06e1', 'name': 'Wolverhampton-Wanderers'}]
