In [61]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from getpass import getuser


In [62]:
# Resolve the OS account name in this cell: every path below interpolates it,
# and relying on a later cell to define `user` breaks Restart & Run All.
user = getuser()

# Root folder shared by all eight workbooks loaded below.
wiki_men_dir = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men'


def _read_wiki_table(dataset, tournament, source):
    """Read one workbook from the shared root folder into a DataFrame.

    The file is expected at
    ``<root>\\<source>\\<tournament>\\<dataset>_<tournament>_<source>.xlsx``.

    Parameters
    ----------
    dataset : str
        File-name prefix, ``'goals'`` or ``'mbm'``.
    tournament : str
        Tournament folder / file-name infix, ``'eu'`` or ``'wc'``.
    source : str
        First sub-folder and file-name suffix, ``'uefa'`` or ``'fifa'``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel`` returns for that workbook.

    Raises
    ------
    FileNotFoundError
        If the workbook does not exist at the expected location.
    """
    file_path = rf'{wiki_men_dir}\{source}\{tournament}\{dataset}_{tournament}_{source}.xlsx'
    return pd.read_excel(file_path)


# ---- EURO datasets ----
goals_eu_uefa = _read_wiki_table('goals', 'eu', 'uefa')
goals_eu_fifa = _read_wiki_table('goals', 'eu', 'fifa')
mbm_eu_uefa = _read_wiki_table('mbm', 'eu', 'uefa')
mbm_eu_fifa = _read_wiki_table('mbm', 'eu', 'fifa')

# ---- WORLD CUP datasets ----
goals_wc_uefa = _read_wiki_table('goals', 'wc', 'uefa')
goals_wc_fifa = _read_wiki_table('goals', 'wc', 'fifa')
mbm_wc_uefa = _read_wiki_table('mbm', 'wc', 'uefa')
mbm_wc_fifa = _read_wiki_table('mbm', 'wc', 'fifa')


# World Cup

In [63]:
# Current OS account name; it is part of the local repository path.
user = getuser()

# Workbook that the World Cup scrape below writes its event rows to.
data_path = rf'C:\Users\{user}' + r'\Documents\GitHub\tiebreak_wc\data\in\cards_subs_wc.xlsx'


In [64]:
# Tournament years to visit: every fourth year from 1986 up to and including 2022.
world_cup_years = [*range(1986, 2023, 4)]

# Page-name suffixes 'Group_A' ... 'Group_H', one per candidate group page.
group_letters = [f'Group_{letter}' for letter in 'ABCDEFGH']

# URL template; the placeholders take the year and the group suffix, in that order.
base_url = 'https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_{}'

# Accumulator for the scraped event records (one dict per event).
event_rows = []

In [65]:

# Scrape bookings and substitutions from every World Cup group page and save
# them as one row per event.
#
# Reads `world_cup_years`, `group_letters`, `base_url` and `data_path` from the
# cells above; writes the workbook at `data_path`.

# Start from an empty list so that re-running this cell does not append
# duplicates to rows collected by a previous run.
event_rows = []

# Column order of the output workbook.
EVENT_COLUMNS = ['year', 'stage', 'home_team', 'away_team', 'date', 'time',
                 'event_type', 'minute', 'team_side']

# `title` attribute of the event icon -> label stored in the output.
# 'Substituted off' is deliberately absent so each substitution is counted once.
WC_EVENT_LABELS = {'Booked': 'Booked', 'Substituted in': 'Substitution'}

# Seconds to wait for a response before giving up on a page; without a
# timeout a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

for year in world_cup_years:
    print(f"\n=== {year} ===")
    for group in group_letters:
        page_url = base_url.format(year, group)
        try:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Not every candidate group page exists for every year.
                print(f"  Skipping {group} (not found)")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Scraping {group}...")

            for header in soup.find_all('h3'):
                # Match sections are <h3> headings whose id contains "vs".
                match_id = header.get('id')
                if not match_id or 'vs' not in match_id:
                    continue
                match_name = header.get_text(strip=True)
                if ' vs ' not in match_name:
                    continue
                home_team, away_team = match_name.split(' vs ', 1)

                footballbox = header.find_next('div', class_='footballbox')
                if footballbox is None:
                    # Without the score box there is no anchor for the line-up
                    # table; skip this match instead of failing the whole page.
                    continue

                date_div = footballbox.find('div', class_='fdate')
                time_div = footballbox.find('div', class_='ftime')
                match_date = date_div.get_text(strip=True) if date_div is not None else None
                match_time = (
                    time_div.get_text(strip=True).replace("\xa0", "")
                    if time_div is not None else None
                )

                # The line-up table is the next table with width="100%".
                match_table = footballbox.find_next('table', attrs={'width': '100%'})
                if match_table is None:
                    continue

                # Exactly two half-width cells are expected: home first, away second.
                side_cells = match_table.find_all('td', valign='top', width='50%')
                if len(side_cells) != 2:
                    continue

                for icon_title, event_label in WC_EVENT_LABELS.items():
                    for side_cell, team_side in zip(side_cells, ('home', 'away')):
                        for icon in side_cell.find_all('span', title=icon_title):
                            event_td = icon.find_parent('td')
                            if event_td is None:
                                continue
                            minute_span = event_td.find('span', style='vertical-align: middle;')
                            if minute_span is None:
                                continue
                            minute_text = minute_span.get_text(strip=True).replace("'", "")
                            # NOTE(review): non-numeric minutes (e.g. "90+2") are
                            # dropped here - confirm that this is intended.
                            if not minute_text.isdigit():
                                continue
                            event_rows.append({
                                'year': year,
                                'stage': group.replace('_', ' '),
                                'home_team': home_team.strip(),
                                'away_team': away_team.strip(),
                                'date': match_date,
                                'time': match_time,
                                'event_type': event_label,
                                'minute': int(minute_text),
                                'team_side': team_side
                            })

            # Pause between successfully scraped pages.
            time.sleep(1)

        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(f"  Error for {group}: {e}")

# Final DataFrame. Passing `columns` fixes the column order and still yields a
# valid (empty) frame if nothing was scraped, instead of raising KeyError.
df = pd.DataFrame(event_rows, columns=EVENT_COLUMNS)

# Save to Excel
df.to_excel(data_path, index=False)
print(f"✅ Excel file saved to: {data_path}")



=== 1986 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1990 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1994 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1998 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H...

=== 2002 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H

# Euro

In [66]:
# Workbook that the Euro scrape below writes its event rows to.
# `user` must already have been set by an earlier cell.
data_path = rf'C:\Users\{user}' + r'\Documents\GitHub\tiebreak_wc\data\in\cards_subs_eu.xlsx'

In [67]:
# Tournament years to visit: every fourth year from 1984 up to and including 2024.
euro_cup_years = list(range(1984, 2025, 4))

# Group page suffixes per edition. The group stage changed size over time:
#   1984-1992: two groups, numbered   (Group_1, Group_2)
#   1996-2012: four groups, lettered  (Group_A .. Group_D)
#   2016-2024: six groups, lettered   (Group_A .. Group_F)
# Listing only groups that existed avoids requesting pages that are not there.
group_map = {}
for year in euro_cup_years:
    if year <= 1992:
        group_map[year] = ['Group_1', 'Group_2']
    elif year <= 2012:
        group_map[year] = [f'Group_{letter}' for letter in 'ABCD']
    else:
        group_map[year] = [f'Group_{letter}' for letter in 'ABCDEF']

# URL template; the placeholders take the year and the group suffix, in that order.
base_url = 'https://en.wikipedia.org/wiki/UEFA_Euro_{}_{}'

# Accumulator for the scraped event records (one dict per event).
event_rows = []


In [68]:
# Scrape bookings and substitutions from every Euro group page and save them
# as one row per event.
#
# Reads `euro_cup_years`, `group_map`, `base_url` and `data_path` from the
# cells above; writes the workbook at `data_path`.

# Start from an empty list so that re-running this cell does not append
# duplicates to rows collected by a previous run.
event_rows = []

# Column order of the output workbook.
EVENT_COLUMNS = ['year', 'stage', 'home_team', 'away_team', 'date', 'time',
                 'event_type', 'minute', 'team_side']

# `title` attribute of the event icon -> label stored in the output.
# Any other title (including 'Substituted off') is ignored, so each
# substitution is counted once.
EURO_EVENT_LABELS = {'Substituted in': 'Substitution', 'Booked': 'Booked'}

# Seconds to wait for a response before giving up on a page; without a
# timeout a stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

for year in euro_cup_years:
    for group in group_map[year]:
        url = base_url.format(year, group)
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            if response.status_code != 200:
                # Do not parse error pages; report and move on.
                print(f"  Skipping {year} {group} (HTTP {response.status_code})")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            for header in soup.find_all('h3'):
                # Match sections are <h3> headings whose id contains "vs".
                match_id = header.get('id')
                if not match_id or 'vs' not in match_id:
                    continue
                match_name = header.get_text(strip=True)
                if ' vs ' not in match_name:
                    continue
                home_team, away_team = match_name.split(' vs ', 1)

                footballbox = header.find_next('div', class_='footballbox')
                if footballbox is None:
                    continue

                date_div = footballbox.find('div', class_='fdate')
                time_div = footballbox.find('div', class_='ftime')
                match_date = date_div.get_text(strip=True) if date_div is not None else None
                match_time = (
                    time_div.get_text(strip=True).replace("\xa0", "")
                    if time_div is not None else None
                )

                # The line-up table is the first following table that contains
                # at least one minute marker span.
                match_table = next(
                    (tbl for tbl in footballbox.find_all_next('table')
                     if tbl.find('span', style='vertical-align: middle;')),
                    None,
                )
                if match_table is None:
                    continue

                # Top-aligned cells hold the two line-ups: home first, away second.
                side_cells = [
                    td for td in match_table.find_all('td')
                    if td.get('style') and 'vertical-align:top' in td['style']
                ]
                if len(side_cells) < 2:
                    continue

                for side_cell, team_side in zip(side_cells[:2], ('home', 'away')):
                    for minute_span in side_cell.find_all('span', style='vertical-align: middle;'):
                        minute_text = minute_span.get_text(strip=True).replace("'", "")
                        try:
                            minute = int(minute_text)
                        except ValueError:
                            # Not a plain integer minute (e.g. "90+2"); skipped.
                            continue

                        event_td = minute_span.find_parent('td')
                        if event_td is None:
                            continue

                        title_span = event_td.find('span', title=True)
                        if title_span is None:
                            continue

                        event_type = EURO_EVENT_LABELS.get(title_span['title'].strip())
                        if event_type is None:
                            continue

                        event_rows.append({
                            'year': year,
                            'stage': group.replace('_', ' '),
                            'home_team': home_team.strip(),
                            'away_team': away_team.strip(),
                            'date': match_date,
                            'time': match_time,
                            'event_type': event_type,
                            'minute': minute,
                            'team_side': team_side
                        })

            # Pause between successfully scraped pages.
            time.sleep(1)

        except Exception as e:
            # Best effort: report the failure instead of hiding it, then carry
            # on. Catching Exception (not a bare except) keeps Ctrl-C /
            # kernel interrupt working.
            print(f"  Error for {year} {group}: {e}")

# Convert to DataFrame. Passing `columns` fixes the column order and still
# yields a valid (empty) frame if nothing was scraped, instead of raising KeyError.
df = pd.DataFrame(event_rows, columns=EVENT_COLUMNS)

# Save to Excel
df.to_excel(data_path, index=False)
print(f"✅ Excel file saved to: {data_path}")


✅ Excel file saved to: C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\cards_subs_eu.xlsx
