In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [6]:
ids = [
    {"id":"results20246761A_overall"},
    {"id":"results20246761B_overall"},
    {"id":"results20246761C_overall"},
    {"id":"results20246761D_overall"},
    {"id":"results20246761E_overall"},
    {"id":"results20246761F_overall"},
]

In [7]:
url = "https://fbref.com/en/comps/676/European-Championship-Stats"

In [8]:
dataframes = []

In [9]:
for attr in ids:
    tables = pd.read_html(url, attrs=attr)

    if tables:
        dataframes.append(tables[0])

result = pd.concat(dataframes, ignore_index=True)

In [10]:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [11]:
squad_links= [] 

In [12]:
for attr in ids:
    table = soup.find('table', attrs=attr)
    if table:
        for row in table.find_all('tr'):
            squad_cell = row.find('td', {'data-stat': 'team'})
            if squad_cell and squad_cell.find('a'):
                squad_link = squad_cell.find('a')['href']
                full_link = f"https://fbref.com{squad_link}"
                squad_links.append(full_link)

if len(squad_links) < len(result):
    squad_links.extend([None] * (len(result) - len(squad_links)))

result['Squad Link'] = squad_links

In [13]:
squad_links_pd = pd.Series(squad_links)

In [14]:
squad_links_pd

0     https://fbref.com/en/squads/c1e40422/Germany-M...
1     https://fbref.com/en/squads/b4ac5e97/Hungary-M...
2     https://fbref.com/en/squads/602d3994/Scotland-...
3     https://fbref.com/en/squads/81021a70/Switzerla...
4     https://fbref.com/en/squads/7b08e376/Croatia-M...
5     https://fbref.com/en/squads/998c5958/Italy-Men...
6     https://fbref.com/en/squads/b561dd30/Spain-Men...
7     https://fbref.com/en/squads/b44b9eb7/Albania-M...
8     https://fbref.com/en/squads/29a4e4af/Denmark-M...
9     https://fbref.com/en/squads/1862c019/England-M...
10    https://fbref.com/en/squads/1d6f5c9b/Serbia-Me...
11    https://fbref.com/en/squads/6b9f868f/Slovenia-...
12    https://fbref.com/en/squads/d5121f10/Austria-M...
13    https://fbref.com/en/squads/b1b36dcd/France-Me...
14    https://fbref.com/en/squads/5bb5024a/Netherlan...
15    https://fbref.com/en/squads/8912dcf0/Poland-Me...
16    https://fbref.com/en/squads/361422b9/Belgium-M...
17    https://fbref.com/en/squads/7def9493/Roman

In [2]:
import os

In [16]:
output_dir = 'team_html_files'
os.makedirs(output_dir, exist_ok=True)

In [17]:
for i, link in enumerate(squad_links):
    if link:
        response = requests.get(link)
        if response.status_code == 200:
            team_name = result.at[i, 'Squad']
            filename = f"{team_name.replace(' ', '-')}.html"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Saved {filename} to {filepath}")
        else:
            print(f"Failed to download {link}")
    else:
        print(f"No link found for {i}")

Saved de-Germany.html to team_html_files\de-Germany.html
Saved hu-Hungary.html to team_html_files\hu-Hungary.html
Saved sct-Scotland.html to team_html_files\sct-Scotland.html
Saved ch-Switzerland.html to team_html_files\ch-Switzerland.html
Saved hr-Croatia.html to team_html_files\hr-Croatia.html
Saved it-Italy.html to team_html_files\it-Italy.html
Saved es-Spain.html to team_html_files\es-Spain.html
Saved al-Albania.html to team_html_files\al-Albania.html
Saved dk-Denmark.html to team_html_files\dk-Denmark.html
Saved eng-England.html to team_html_files\eng-England.html
Saved rs-Serbia.html to team_html_files\rs-Serbia.html
Saved si-Slovenia.html to team_html_files\si-Slovenia.html
Saved at-Austria.html to team_html_files\at-Austria.html
Saved fr-France.html to team_html_files\fr-France.html
Saved nl-Netherlands.html to team_html_files\nl-Netherlands.html
Saved pl-Poland.html to team_html_files\pl-Poland.html
Saved be-Belgium.html to team_html_files\be-Belgium.html
Saved ro-Romania.html

In [18]:
input_dir = "team_html_files"

In [19]:
all_player_data = pd.DataFrame()
all_match_data = pd.DataFrame()

In [20]:
for filename in os.listdir(input_dir):
    if filename.endswith(".html"):
        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
        soup = BeautifulSoup(content, 'html.parser')

        player_table = soup.find('table', {'id': "stats_standard_678"})
        if player_table:
            print(f"Found player table for {filename}")
            player_df = pd.read_html(str(player_table))[0]
            player_df['Team'] = filename.replace('.html', '').replace('-', ' ')
            all_player_data = pd.concat([all_player_data, player_df], ignore_index=True)
        else:
            print(f"No player table found for {filename}")

        match_table = soup.find('table', {"id":"matchlogs_for"})
        if match_table:
            print(f"Found match table for {filename}")
            match_df = pd.read_html(str(match_table))[0]
            match_df['Team'] = filename.replace('.html', '').replace('-', ' ')
            all_match_data = pd.concat([all_match_data, match_df], ignore_index=True)
        else:
            print(f"No match table found for {filename}")


Found player table for al-Albania.html


Found match table for al-Albania.html
Found player table for at-Austria.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]


Found match table for at-Austria.html
Found player table for be-Belgium.html
Found match table for be-Belgium.html


  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ch-Switzerland.html
Found match table for ch-Switzerland.html
Found player table for cz-Czechia.html
Found match table for cz-Czechia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


No player table found for de-Germany.html
Found match table for de-Germany.html
Found player table for dk-Denmark.html


  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]


Found match table for dk-Denmark.html
Found player table for eng-England.html
Found match table for eng-England.html


  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for es-Spain.html
Found match table for es-Spain.html
Found player table for fr-France.html
Found match table for fr-France.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ge-Georgia.html
Found match table for ge-Georgia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for hr-Croatia.html
Found match table for hr-Croatia.html
Found player table for hu-Hungary.html
Found match table for hu-Hungary.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for it-Italy.html
Found match table for it-Italy.html
Found player table for nl-Netherlands.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]


Found match table for nl-Netherlands.html
Found player table for pl-Poland.html
Found match table for pl-Poland.html


  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for pt-Portugal.html
Found match table for pt-Portugal.html
Found player table for ro-Romania.html
Found match table for ro-Romania.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for rs-Serbia.html
Found match table for rs-Serbia.html
Found player table for sct-Scotland.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]


Found match table for sct-Scotland.html
Found player table for si-Slovenia.html
Found match table for si-Slovenia.html


  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for sk-Slovakia.html
Found match table for sk-Slovakia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for tr-Türkiye.html
Found match table for tr-Türkiye.html
Found player table for ua-Ukraine.html
Found match table for ua-Ukraine.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]
  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


In [21]:
all_player_data.to_csv('all_player_data.csv', index=False)
all_match_data.to_csv('all_match_data.csv', index=False)

In [3]:
input_dir = "team_html_files"
base_url = "https://fbref.com"

In [4]:
all_match_report_links = []

In [5]:
for filename in os.listdir(input_dir):
    if filename.endswith(".html"):
        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        print('--------------------------------')
        soup = BeautifulSoup(content, 'html.parser')
        print(f"Processing file: {filename}")
        
        match_report_element = soup.find_all('td', {'class': 'left group_start', 'data-stat': 'match_report'})
        print(f"Found {len(match_report_element)} match report elements in {filename}")

        for element in match_report_element:
            link_element = element.find('a')
            if link_element:
                match_report_link = base_url + link_element.get('href')
                all_match_report_links.append(match_report_link)
                print(f"Extracted link: {match_report_link}")

--------------------------------
Processing file: al-Albania.html
Found 15 match report elements in al-Albania.html
Extracted link: https://fbref.com/en/matches/b3cfd5ef/Poland-Albania-March-27-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/5d41385c/Albania-Moldova-June-17-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/ef410b8b/Faroe-Islands-Albania-June-20-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/248ee808/Czechia-Albania-September-7-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/95ef12bc/Albania-Poland-September-10-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/38548416/Albania-Czechia-October-12-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/8c0f8711/Moldova-Albania-November-17-2023-UEFA-Euro-Qualifying
Extracted link: https://fbref.com/en/matches/9532c14b/Albania-Faroe-Islands-November-20-2023-UEFA-Euro-Qualifying
Extracted link: https

In [6]:
all_match_report_links

['https://fbref.com/en/matches/b3cfd5ef/Poland-Albania-March-27-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/5d41385c/Albania-Moldova-June-17-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/ef410b8b/Faroe-Islands-Albania-June-20-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/248ee808/Czechia-Albania-September-7-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/95ef12bc/Albania-Poland-September-10-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/38548416/Albania-Czechia-October-12-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/8c0f8711/Moldova-Albania-November-17-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/9532c14b/Albania-Faroe-Islands-November-20-2023-UEFA-Euro-Qualifying',
 'https://fbref.com/en/matches/01304208/Albania-Chile-March-22-2024-Friendlies-M',
 'https://fbref.com/en/matches/a4d38035/Sweden-Albania-March-25-2024-Friendlies-M',
 'https://fbref.com/en/stathead/matchup/teams/c8f74183/b44b9eb7/Li

In [7]:
len(all_match_report_links)

362

In [8]:
output_file_path = 'all_match_report_links.txt'
with open(output_file_path, 'w') as file:
    for link in all_match_report_links:
        file.write(link + '\n')

print(f"All match report links have been saved to {output_file_path}")

All match report links have been saved to all_match_report_links.txt


In [9]:
output_dir = 'match_reports_html_files'
os.makedirs(output_dir, exist_ok=True)

In [10]:
def save_html(content, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [11]:
def download_html(link, delay=5):
    try:
        response = requests.get(link)
        if response.status_code == 200:
            file_name = link.split('/')[-1] + '.html'
            file_path = os.path.join(output_dir, file_name)
            save_html(response.text, file_path)
            print(f"Saved: {file_path}")
        else:
            print(f"Failed to download: {link} (status code: {response.status_code})")
    except requests.RequestException as e:
        print(f"Request failed: {e}")
    finally:
        time.sleep(delay)

In [13]:
for link in all_match_report_links:
    download_html(link)

Failed to download: https://fbref.com/en/matches/b3cfd5ef/Poland-Albania-March-27-2023-UEFA-Euro-Qualifying (status code: 429)
Failed to download: https://fbref.com/en/matches/5d41385c/Albania-Moldova-June-17-2023-UEFA-Euro-Qualifying (status code: 429)


KeyboardInterrupt: 