In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from getpass import getuser


In [6]:
# Destination workbook for the scraped events. The path is built under the
# current OS user's profile folder, so the login name is looked up first.
user = getuser()
data_path = (
    rf'C:\Users\{user}'
    r'\Documents\GitHub\tiebreak_wc\data\in\cards_subs_wc.xlsx'
)


In [None]:
# Scrape bookings and substitutions from the Wikipedia group-stage pages of
# each World Cup and write them to an Excel workbook (one row per event).

# Years and group stages
world_cup_years = list(range(1986, 2023, 4))
group_letters = ['Group_' + chr(i) for i in range(ord('A'), ord('H') + 1)]
base_url = 'https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_{}'

# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Output column order; also keeps the DataFrame well-formed when nothing
# was scraped.
EVENT_COLUMNS = ['year', 'stage', 'home_team', 'away_team', 'date', 'time', 'event_type', 'minute']

# Icon titles looked up in the line-up tables.
EVENT_TITLES = ['Booked', 'Substituted in']


def _starts_next_match(tag):
    """Return True if *tag* is an <h3> heading or a container wrapping one.

    Wikipedia may wrap section headings in a container <div>; in that case
    the <h3> is never a direct sibling of the match tables, so checking only
    ``tag.name == 'h3'`` would let the walk run into the following matches.
    """
    return tag.name == 'h3' or tag.find('h3') is not None


def _iter_match_events(soup, year, group):
    """Yield one row dict per booking/substitution found on a group page.

    Args:
        soup: Parsed HTML of a single group-stage page.
        year: Tournament year stored in each row.
        group: Group identifier such as ``'Group_A'``; stored with the
            underscore replaced by a space.

    Yields:
        Dicts keyed by ``EVENT_COLUMNS``.
    """
    for header in soup.find_all('h3'):
        match_id = header.get('id')
        if not match_id or 'vs' not in match_id:
            continue
        match_name = header.get_text(strip=True)
        if ' vs ' not in match_name:
            continue
        home_team, away_team = match_name.split(' vs ', 1)

        footballbox = header.find_next('div', class_='footballbox')
        if footballbox is None:
            # No match box after this heading, so there is nothing to walk.
            continue

        date_div = footballbox.find('div', class_='fdate')
        time_div = footballbox.find('div', class_='ftime')
        match_date = date_div.get_text(strip=True) if date_div else None
        match_time = time_div.get_text(strip=True).replace("\xa0", "") if time_div else None

        # Walk the elements following the match box until the next match
        # heading; the line-up tables in between carry the event icons.
        sibling = footballbox.find_next_sibling()
        while sibling is not None and not _starts_next_match(sibling):
            if sibling.name == 'table':
                for event_type in EVENT_TITLES:
                    for icon in sibling.find_all('span', title=event_type):
                        cell = icon.find_parent('td')
                        if cell is None:
                            continue
                        minute_span = cell.find('span', style='vertical-align: middle;')
                        if minute_span is None:
                            continue
                        minute_text = minute_span.get_text(strip=True).replace("'", "")
                        # Only plain integer minutes are kept; anything else
                        # is skipped.
                        if not minute_text.isdigit():
                            continue
                        yield {
                            'year': year,
                            'stage': group.replace('_', ' '),
                            'home_team': home_team.strip(),
                            'away_team': away_team.strip(),
                            'date': match_date,
                            'time': match_time,
                            'event_type': 'Substitution' if event_type == 'Substituted in' else event_type,
                            'minute': int(minute_text),
                        }
            sibling = sibling.find_next_sibling()


event_rows = []

for year in world_cup_years:
    print(f"\n=== {year} ===")
    for group in group_letters:
        page_url = base_url.format(year, group)
        try:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"  Error for {group}: {e}")
            continue

        if response.status_code != 200:
            print(f"  Skipping {group} (not found)")
            continue

        print(f"  Scraping {group}...")
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Append row by row so events parsed before a failure are kept.
            for row in _iter_match_events(soup, year, group):
                event_rows.append(row)
        except Exception as e:
            # Best-effort scrape: one malformed page must not abort the run.
            print(f"  Error for {group}: {e}")

        # Pause between page fetches.
        time.sleep(1)

# Convert to DataFrame; passing the columns explicitly fixes their order and
# avoids a KeyError when no events were collected.
df = pd.DataFrame(event_rows, columns=EVENT_COLUMNS)

# Save to Excel
df.to_excel(data_path, index=False)
print(f"✅ Excel file saved to: {data_path}")



=== 1986 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1990 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1994 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Skipping Group_G (not found)
  Skipping Group_H (not found)

=== 1998 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H...

=== 2002 ===
  Scraping Group_A...
  Scraping Group_B...
  Scraping Group_C...
  Scraping Group_D...
  Scraping Group_E...
  Scraping Group_F...
  Scraping Group_G...
  Scraping Group_H