In [18]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [19]:
ids = [
    {"id":"results20246761A_overall"},
    {"id":"results20246761B_overall"},
    {"id":"results20246761C_overall"},
    {"id":"results20246761D_overall"},
    {"id":"results20246761E_overall"},
    {"id":"results20246761F_overall"},
]

In [20]:
url = "https://fbref.com/en/comps/676/European-Championship-Stats"

In [21]:
dataframes = []

In [22]:
for attr in ids:
    tables = pd.read_html(url, attrs=attr)

    if tables:
        dataframes.append(tables[0])

result = pd.concat(dataframes, ignore_index=True)

In [23]:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [24]:
squad_links= [] 

In [26]:
for attr in ids:
    table = soup.find('table', attrs=attr)
    if table:
        for row in table.find_all('tr'):
            squad_cell = row.find('td', {'data-stat': 'team'})
            if squad_cell and squad_cell.find('a'):
                squad_link = squad_cell.find('a')['href']
                full_link = f"https://fbref.com{squad_link}"
                squad_links.append(full_link)

if len(squad_links) < len(result):
    squad_links.extend([None] * (len(result) - len(squad_links)))

result['Squad Link'] = squad_links

In [28]:
squad_links_pd = pd.Series(squad_links)

In [29]:
squad_links_pd

0     https://fbref.com/en/squads/c1e40422/Germany-M...
1     https://fbref.com/en/squads/b4ac5e97/Hungary-M...
2     https://fbref.com/en/squads/602d3994/Scotland-...
3     https://fbref.com/en/squads/81021a70/Switzerla...
4     https://fbref.com/en/squads/7b08e376/Croatia-M...
5     https://fbref.com/en/squads/998c5958/Italy-Men...
6     https://fbref.com/en/squads/b561dd30/Spain-Men...
7     https://fbref.com/en/squads/b44b9eb7/Albania-M...
8     https://fbref.com/en/squads/29a4e4af/Denmark-M...
9     https://fbref.com/en/squads/1862c019/England-M...
10    https://fbref.com/en/squads/1d6f5c9b/Serbia-Me...
11    https://fbref.com/en/squads/6b9f868f/Slovenia-...
12    https://fbref.com/en/squads/d5121f10/Austria-M...
13    https://fbref.com/en/squads/b1b36dcd/France-Me...
14    https://fbref.com/en/squads/5bb5024a/Netherlan...
15    https://fbref.com/en/squads/8912dcf0/Poland-Me...
16    https://fbref.com/en/squads/361422b9/Belgium-M...
17    https://fbref.com/en/squads/7def9493/Roman

In [30]:
import os

In [31]:
output_dir = 'team_html_files'
os.makedirs(output_dir, exist_ok=True)

In [33]:
for i, link in enumerate(squad_links):
    if link:
        response = requests.get(link)
        if response.status_code == 200:
            team_name = result.at[i, 'Squad']
            filename = f"{team_name.replace(' ', '-')}.html"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Saved {filename} to {filepath}")
        else:
            print(f"Failed to download {link}")
    else:
        print(f"No link found for {i}")

Saved de-Germany.html to team_html_files\de-Germany.html
Saved hu-Hungary.html to team_html_files\hu-Hungary.html
Saved sct-Scotland.html to team_html_files\sct-Scotland.html
Saved ch-Switzerland.html to team_html_files\ch-Switzerland.html
Saved hr-Croatia.html to team_html_files\hr-Croatia.html
Saved it-Italy.html to team_html_files\it-Italy.html
Saved es-Spain.html to team_html_files\es-Spain.html
Saved al-Albania.html to team_html_files\al-Albania.html
Saved dk-Denmark.html to team_html_files\dk-Denmark.html
Saved eng-England.html to team_html_files\eng-England.html
Saved rs-Serbia.html to team_html_files\rs-Serbia.html
Saved si-Slovenia.html to team_html_files\si-Slovenia.html
Saved at-Austria.html to team_html_files\at-Austria.html
Saved fr-France.html to team_html_files\fr-France.html
Saved nl-Netherlands.html to team_html_files\nl-Netherlands.html
Saved pl-Poland.html to team_html_files\pl-Poland.html
Saved be-Belgium.html to team_html_files\be-Belgium.html
Saved ro-Romania.html

In [34]:
input_dir = "team_html_files"

In [35]:
all_player_data = pd.DataFrame()
all_match_data = pd.DataFrame()

In [42]:
for filename in os.listdir(input_dir):
    if filename.endswith(".html"):
        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
        soup = BeautifulSoup(content, 'html.parser')

        player_table = soup.find('table', {'id': "stats_standard_678"})
        if player_table:
            print(f"Found player table for {filename}")
            player_df = pd.read_html(str(player_table))[0]
            player_df['Team'] = filename.replace('.html', '').replace('-', ' ')
            all_player_data = pd.concat([all_player_data, player_df], ignore_index=True)
        else:
            print(f"No player table found for {filename}")

        match_table = soup.find('table', {"id":"matchlogs_for"})
        if match_table:
            print(f"Found match table for {filename}")
            match_df = pd.read_html(str(match_table))[0]
            match_df['Team'] = filename.replace('.html', '').replace('-', ' ')
            all_match_data = pd.concat([all_match_data, match_df], ignore_index=True)
        else:
            print(f"No match table found for {filename}")


Found player table for al-Albania.html
Found match table for al-Albania.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for at-Austria.html
Found match table for at-Austria.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for be-Belgium.html
Found match table for be-Belgium.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ch-Switzerland.html
Found match table for ch-Switzerland.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for cz-Czechia.html
Found match table for cz-Czechia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


No player table found for de-Germany.html
Found match table for de-Germany.html


  match_df = pd.read_html(str(match_table))[0]


Found player table for dk-Denmark.html
Found match table for dk-Denmark.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for eng-England.html
Found match table for eng-England.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for es-Spain.html
Found match table for es-Spain.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for fr-France.html
Found match table for fr-France.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ge-Georgia.html
Found match table for ge-Georgia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for hr-Croatia.html
Found match table for hr-Croatia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for hu-Hungary.html
Found match table for hu-Hungary.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for it-Italy.html
Found match table for it-Italy.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for nl-Netherlands.html
Found match table for nl-Netherlands.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for pl-Poland.html
Found match table for pl-Poland.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for pt-Portugal.html
Found match table for pt-Portugal.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ro-Romania.html
Found match table for ro-Romania.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for rs-Serbia.html
Found match table for rs-Serbia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for sct-Scotland.html
Found match table for sct-Scotland.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for si-Slovenia.html
Found match table for si-Slovenia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for sk-Slovakia.html
Found match table for sk-Slovakia.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for tr-Türkiye.html
Found match table for tr-Türkiye.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


Found player table for ua-Ukraine.html
Found match table for ua-Ukraine.html


  player_df = pd.read_html(str(player_table))[0]
  match_df = pd.read_html(str(match_table))[0]


In [47]:
all_player_data.to_csv('all_player_data.csv', index=False)
all_match_data.to_csv('all_match_data.csv', index=False)

In [48]:
html_element = html_element = '<td class="left group_start" data-stat="match_report"><a href="/en/matches/4c992559/Belarus-Switzerland-March-25-2023-UEFA-Euro-Qualifying">Match Report</a></td>'


In [49]:
soup = BeautifulSoup(html_element, 'html.parser')

In [50]:
link_element = soup.find('a')

In [51]:
if link_element:
    match_report_link = link_element.get('href')
    full_match_p
    print("Match report link:", match_report_link)
else:
    print("No Link found")

Match report link: /en/matches/4c992559/Belarus-Switzerland-March-25-2023-UEFA-Euro-Qualifying
