In [85]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from getpass import getuser
import re


In [86]:
user = getuser()


In [87]:
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\cards_subs_wc.xlsx'


In [88]:
# Years and group stages
world_cup_years = list(range(1986, 2023, 4))
group_letters = ['Group_' + chr(i) for i in range(ord('A'), ord('H') + 1)]
base_url = 'https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_{}'

event_rows = []

In [89]:
# === MAIN SCRAPER ===
for year in world_cup_years:
    print(f"\n=== {year} ===")
    for group in group_letters:
        page_url = base_url.format(year, group)
        try:
            response = requests.get(page_url)
            if response.status_code != 200:
                print(f"  Skipping {group} (not found)")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"  Scraping {group}...")

            for fevent_table in soup.find_all('table', class_='fevent'):
                tbody = fevent_table.find('tbody')
                if not tbody:
                    continue

                # === HOME & AWAY TEAMS ===
                home_th = tbody.find('th', class_='fhome')
                away_th = tbody.find('th', class_='faway')
                home_span = home_th.find('span', itemprop='name') if home_th else None
                away_span = away_th.find('span', itemprop='name') if away_th else None
                home_team = home_span.get_text(strip=True) if home_span else None
                away_team = away_span.get_text(strip=True) if away_span else None
                print(f"🔍 Processing match: {home_team} vs {away_team} ({match_date})")


                # === MATCH DATE & TIME ===
                footballbox = fevent_table.find_previous('div', class_='footballbox')
                match_date = footballbox.find('div', class_='fdate').get_text(strip=True) if footballbox and footballbox.find('div', class_='fdate') else None
                match_time = footballbox.find('div', class_='ftime').get_text(strip=True).replace("\xa0", "") if footballbox and footballbox.find('div', class_='ftime') else None

                # === FIND EVENT TABLE ===
                event_table = fevent_table.find_next('table', attrs={'width': '100%'})
                if not event_table:
                    continue
                event_tbody = event_table.find('tbody')
                if not event_tbody:
                    continue

                for row in event_tbody.find_all('tr'):
                    tds = [td for td in row.find_all('td', attrs={'valign': 'top'}) if td.get('width') in ['50%', '40%']]
                    if len(tds) == 2:
                        td_pairs = [(tds[0], 'home'), (tds[1], 'away')]
                    elif len(tds) == 3:
                        td_pairs = [(tds[0], 'home'), (tds[2], 'away')]
                    else:
                        continue  # Skip if unexpected format

                    # Make sure exactly one home and one away (no repeats)
                    team_sides = [side for _, side in td_pairs]
                    if team_sides != ['home', 'away']:
                        continue

                    for td, team_side in td_pairs:
                        nested_table = td.find('table')
                        if not nested_table:
                            continue
                        nested_body = nested_table.find('tbody')
                        if not nested_body:
                            continue

                        for event_row in nested_body.find_all('tr'):
                            for event_td in event_row.find_all('td'):
                                spans = event_td.find_all('span', title=True)
                                titles = [s['title'].strip() for s in spans]

                                if not any(t in titles for t in ['Booked', 'Sent off (second booking)', 'Sent off (straight red)', 'Substituted in']):
                                    continue

                                raw_text = event_td.get_text(strip=True)
                                minute_match = re.search(r'\d{1,4}', raw_text)
                                if not minute_match:
                                    continue
                                raw_minute = minute_match.group()

                                booked_added = False

                                for title in titles:
                                    minute_texts = re.findall(r'\d{1,3}\+?\d{0,2}', raw_text)

                                    if title == 'Booked' and not booked_added:
                                        if minute_texts:
                                            raw_minute_str = minute_texts[0]
                                            if '+' in raw_minute_str:
                                                base, extra = raw_minute_str.split('+')
                                                minute = int(base) + int(extra)
                                            else:
                                                minute = int(raw_minute_str)
                                            event_type = 'Yellow card'
                                            booked_added = True

                                    elif title == 'Sent off (second booking)':
                                        if len(minute_texts) >= 2:
                                            raw_minute_str = minute_texts[1]
                                        elif minute_texts:
                                            raw_minute_str = minute_texts[-1]
                                        else:
                                            continue
                                        if '+' in raw_minute_str:
                                            base, extra = raw_minute_str.split('+')
                                            minute = int(base) + int(extra)
                                        else:
                                            minute = int(raw_minute_str)
                                        event_type = 'Red card (second yellow)'

                                    elif title == 'Sent off (straight red)':
                                        if len(minute_texts) >= 2:
                                            raw_minute_str = minute_texts[1]
                                        elif minute_texts:
                                            raw_minute_str = minute_texts[-1]
                                        else:
                                            continue
                                        if '+' in raw_minute_str:
                                            base, extra = raw_minute_str.split('+')
                                            minute = int(base) + int(extra)
                                        else:
                                            minute = int(raw_minute_str)
                                        event_type = 'Red card (straight)'

                                    elif title == 'Substituted in':
                                        if minute_texts:
                                            raw_minute_str = minute_texts[0]
                                            if '+' in raw_minute_str:
                                                base, extra = raw_minute_str.split('+')
                                                minute = int(base) + int(extra)
                                            else:
                                                minute = int(raw_minute_str)
                                            event_type = 'Substitution'

                                    else:
                                        continue

                                    event_rows.append({
                                        'year': year,
                                        'stage': group.replace('_', ' '),
                                        'home_team': home_team,
                                        'away_team': away_team,
                                        'date': match_date,
                                        'time': match_time,
                                        'event_type': event_type,
                                        'minute': minute,
                                        'team_side': team_side
                                    })

            time.sleep(1)

        except Exception as e:
            print(f"  Error for {group}: {e}")

# === SAVE TO EXCEL ===
df = pd.DataFrame(event_rows)
# df.drop_duplicates(inplace=True)

if df.empty:
    print("\n⚠️ No data collected.")
else:
    expected_cols = ['year', 'stage', 'home_team', 'away_team', 'date', 'time', 'event_type', 'minute', 'team_side']
    missing = [col for col in expected_cols if col not in df.columns]

    if missing:
        print(f"\n❌ Missing columns: {missing}")
        print(df.head())
    else:
        df = df[expected_cols]
        df.to_excel(data_path, index=False)
        print(f"\n✅ Clean Excel file saved to: {data_path}")



=== 1986 ===
  Scraping Group_A...
🔍 Processing match: Bulgaria vs Italy (2 December 2022(2022-12-02))
🔍 Processing match: Argentina vs South Korea (31 May 1986)
🔍 Processing match: Italy vs Argentina (2 June 1986)
🔍 Processing match: South Korea vs Bulgaria (5 June 1986)
🔍 Processing match: South Korea vs Italy (5 June 1986)
🔍 Processing match: Argentina vs Bulgaria (10 June 1986)
  Scraping Group_B...
🔍 Processing match: Belgium vs Mexico (10 June 1986)
🔍 Processing match: Paraguay vs Iraq (3 June 1986)
🔍 Processing match: Mexico vs Paraguay (4 June 1986)
🔍 Processing match: Iraq vs Belgium (7 June 1986)
🔍 Processing match: Paraguay vs Belgium (8 June 1986)
🔍 Processing match: Iraq vs Mexico (11 June 1986)
  Scraping Group_C...
🔍 Processing match: Canada vs France (11 June 1986)
🔍 Processing match: Soviet Union vs Hungary (1 June 1986)
🔍 Processing match: France vs Soviet Union (2 June 1986)
🔍 Processing match: Hungary vs Canada (5 June 1986)
🔍 Processing match: Hungary vs France (6