In [17]:
import re
import time
from getpass import getuser
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [18]:
# Login name of the account running this notebook.
user = getuser()


In [19]:
# Output workbook for the scraped events. The location is derived from the
# current user's home directory rather than a literal C:\Users\<login> prefix,
# so it resolves on any OS and also when the profile folder name differs from
# the login name. Kept as a plain string for compatibility with later cells.
data_path = str(
    Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc' / 'data' / 'in' / 'cards_subs_wc.xlsx'
)


In [20]:
# Scrape configuration: tournament years, per-group page suffixes, URL pattern.
# Every fourth year from 1986 up to and including 2022.
world_cup_years = [*range(1986, 2022 + 1, 4)]
# 'Group_A' .. 'Group_H'; the scraping loop skips pages that do not exist.
group_letters = [f'Group_{letter}' for letter in 'ABCDEFGH']
# Filled positionally with (year, group page suffix).
base_url = 'https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_{}'

# Accumulator for one dict per scraped event.
event_rows = []


In [21]:
# Seconds to wait for Wikipedia before giving up on a page; without a timeout
# requests.get() can block forever on a stalled connection.
_REQUEST_TIMEOUT_S = 30
# Substrings of an icon's title attribute that mark an event we record.
_EVENT_TITLE_MARKERS = ('Booked', 'Sent off', 'Substituted in')
# Column order of the final events table.
_EVENT_COLUMNS = ['year', 'stage', 'home_team', 'away_team', 'date', 'time', 'event_type', 'minute']


def _parse_event(title, cell_text):
    """Turn an icon title and its table-cell text into ``(event_type, minute)``.

    Parameters
    ----------
    title : str
        Stripped ``title`` attribute of the icon ``<span>``.
    cell_text : str
        Text of the ``<td>`` that contains the icon.

    Returns
    -------
    tuple[str, int] | None
        ``None`` when the title is not a tracked event, the cell holds no
        digits, or the event is a dismissal that is deliberately skipped.
    """
    if not any(marker in title for marker in _EVENT_TITLE_MARKERS):
        return None

    # First run of up to four digits in the cell.
    # NOTE(review): a four-digit run is presumably two two-digit minutes
    # concatenated (booking + dismissal) -- confirm against the page markup.
    minute_match = re.search(r'\d{1,4}', cell_text)
    if not minute_match:
        return None
    digits = minute_match.group()

    if title.startswith('Sent off'):
        if len(digits) == 4 and digits[:2] != digits[2:]:
            return None  # Skip double yellow that already created red
        return 'Red card', int(digits[-2:])
    if title == 'Booked':
        return 'Yellow card', int(digits[:2] if len(digits) == 4 else digits)
    # Any other tracked title is recorded as a substitution.
    return 'Substitution', int(digits)


def _ends_match_section(tag):
    """Return True when ``tag`` marks the end of the current match's content.

    The walk over the siblings of a match's football box must stop before the
    next match. Checking only for a bare ``<h3>`` sibling is not enough when
    headings are wrapped in ``<div class="mw-heading ...">``: the ``<h3>`` is
    then a child of that wrapper, never a sibling, and the walk would continue
    into later matches and attribute their events to the current one.
    """
    classes = tag.get('class') or []
    return (
        tag.name in ('h2', 'h3')
        or 'mw-heading' in classes
        or 'footballbox' in classes
    )


def _scrape_group_page(soup, year, group):
    """Collect card and substitution events for every match on one group page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the group page.
    year : int
        Tournament year, copied into each row.
    group : str
        Page suffix such as ``'Group_A'``; stored with ``_`` replaced by a space.

    Returns
    -------
    list[dict]
        One dict per event with the keys listed in ``_EVENT_COLUMNS``.
    """
    rows = []
    for header in soup.find_all('h3'):
        # Match headings carry an id containing 'vs' and text "<home> vs <away>".
        match_id = header.get('id')
        if not (match_id and 'vs' in match_id):
            continue
        match_name = header.get_text(strip=True)
        if ' vs ' not in match_name:
            continue
        home_team, away_team = match_name.split(' vs ', 1)

        footballbox = header.find_next('div', class_='footballbox')
        if footballbox is None:
            continue

        date_tag = footballbox.find('div', class_='fdate')
        time_tag = footballbox.find('div', class_='ftime')
        match_date = date_tag.get_text(strip=True) if date_tag else None
        match_time = time_tag.get_text(strip=True).replace("\xa0", "") if time_tag else None

        # Visit the elements that follow the football box until the next match
        # (or section) begins; tables found on the way are searched for icons.
        sibling = footballbox.find_next_sibling()
        while sibling is not None and not _ends_match_section(sibling):
            if sibling.name == 'table':
                for icon in sibling.find_all('span', title=True):
                    cell = icon.find_parent('td')
                    if cell is None:
                        continue
                    parsed = _parse_event(icon['title'].strip(), cell.get_text(strip=True))
                    if parsed is None:
                        continue
                    event_type, minute = parsed
                    rows.append({
                        'year': year,
                        'stage': group.replace('_', ' '),
                        'home_team': home_team.strip(),
                        'away_team': away_team.strip(),
                        'date': match_date,
                        'time': match_time,
                        'event_type': event_type,
                        'minute': minute
                    })
            sibling = sibling.find_next_sibling()
    return rows


# Start from an empty list so that re-running this cell does not append a
# second copy of every event to rows left over from a previous run.
event_rows = []

for year in world_cup_years:
    print(f"\n=== {year} ===")
    for group in group_letters:
        page_url = base_url.format(year, group)
        try:
            response = requests.get(page_url, timeout=_REQUEST_TIMEOUT_S)
            if response.status_code != 200:
                # Only a 404 means the group does not exist for this edition;
                # report any other status as what it is.
                reason = 'not found' if response.status_code == 404 else f'HTTP {response.status_code}'
                print(f"  Skipping {group} ({reason})")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Scraping {group}...")
            event_rows.extend(_scrape_group_page(soup, year, group))

        except Exception as e:
            # Best effort: one failing page must not abort the whole scrape.
            print(f"  Error for {group}: {e}")
        finally:
            # Pause after every request attempt, including skipped and failed ones.
            time.sleep(1)

# Build DataFrame and clean
# Passing the column list keeps the frame well-formed (empty, with the expected
# columns) even when nothing was scraped.
# NOTE(review): rows carry no player identifier, so drop_duplicates() also
# merges distinct events of the same type in the same minute of the same match
# -- confirm this is intended.
df = pd.DataFrame(event_rows, columns=_EVENT_COLUMNS).drop_duplicates()



=== 1986 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1990 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1994 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1998 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H...

=== 2002 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H

In [23]:
# === Save to Excel ===
# Write the events table to the workbook at data_path, without the index column,
# then report where it was written.
df.to_excel(excel_writer=data_path, index=False)
saved_message = f"\n✅ Clean Excel file saved to: {data_path}"
print(saved_message)



✅ Clean Excel file saved to: C:\Users\ALESSANDRO\Documents\GitHub\tiebreak_wc\data\in\cards_subs_wc.xlsx
