In [1]:
from getpass import getuser
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Login name of the account running this notebook, as reported by getpass.getuser()
user = getuser()

In [2]:
# Keys every match record carries before the per-goal fields are appended.
# Pre-seeding them (with None) keeps the schema identical from record to record
# even when a page omits part of a match box.
_MATCH_FIELDS = (
    'stage', 'date', 'time',
    'home_team', 'away_team', 'score',
    'stadium_name', 'stadium_city', 'stadium_attendance',
    'referee_name', 'referee_nationality',
)


def _text_or_none(parent, name, attrs=None):
    """Return the stripped text of the first matching descendant of ``parent``, or None."""
    node = parent.find(name, attrs or {})
    return node.get_text(strip=True) if node is not None else None


def _nth_link_text(parent, position):
    """Return the stripped text of the ``position``-th ``<a>`` under ``parent``, or None."""
    links = parent.find_all('a')
    return links[position].get_text(strip=True) if len(links) > position else None


def _goal_records(goal_row, cell_class, team, match_info):
    """Build one record per ``fb-goal`` span found in ``goal_row`` cells of ``cell_class``."""
    records = []
    for cell in goal_row.find_all('td', {'class': cell_class}):
        plainlist = cell.find('div', {'class': 'plainlist'})
        if plainlist is None:
            continue
        for scorer in plainlist.find_all('li'):
            # The scorer name is taken from the first link in the list item;
            # items without a link (or with empty link text) are skipped.
            scorer_name = _text_or_none(scorer, 'a')
            if not scorer_name:
                continue
            for minute in scorer.find_all('span', {'class': 'fb-goal'}):
                records.append({
                    **match_info,
                    'scorer_name': scorer_name,
                    # The side the goal is listed under, not looked up per player.
                    'scorer_nationality': team,
                    'goal_minute': minute.get_text(strip=True),
                })
    return records


def scrape_uefa_euro_data(year, timeout=30):
    """Scrape goal-level match data from the English Wikipedia page of one UEFA Euro.

    The page ``https://en.wikipedia.org/wiki/UEFA_Euro_{year}`` is downloaded and
    every ``div.footballbox`` on it is parsed.

    Parameters
    ----------
    year : int or str
        Edition year, interpolated into the page URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    list of dict
        One record per ``fb-goal`` span. Each record holds the keys in
        ``_MATCH_FIELDS`` (None where the page lacks the element) plus
        ``scorer_name``, ``scorer_nationality`` and ``goal_minute``, all as raw
        page text. A match box without any goal span contributes no record.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        parsing the error page into an empty result.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    url = f'https://en.wikipedia.org/wiki/UEFA_Euro_{year}'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    expanded_matches_data = []
    current_stage = None

    for box in soup.find_all('div', {'class': 'footballbox'}):
        # The stage label is the text of the closest <h3> preceding the box;
        # if none is found the previous box's stage is carried over.
        previous_h3 = box.find_previous('h3')
        if previous_h3:
            current_stage = previous_h3.get_text(strip=True)

        match_info = dict.fromkeys(_MATCH_FIELDS)
        match_info['stage'] = current_stage

        # Step 1: date and time from div.fleft
        fleft = box.find('div', {'class': 'fleft'})
        if fleft is not None:
            match_info['date'] = _text_or_none(fleft, 'div', {'class': 'fdate'})
            match_info['time'] = _text_or_none(fleft, 'div', {'class': 'ftime'})

        # Step 2: teams and score from table.fevent
        fevent_table = box.find('table', {'class': 'fevent'})
        if fevent_table is not None:
            match_info['home_team'] = _text_or_none(fevent_table, 'th', {'class': 'fhome'})
            match_info['away_team'] = _text_or_none(fevent_table, 'th', {'class': 'faway'})
            match_info['score'] = _text_or_none(fevent_table, 'th', {'class': 'fscore'})

        # Step 3: stadium, attendance and referee from div.fright
        fright = box.find('div', {'class': 'fright'})
        if fright is not None:
            location_div = fright.find('div', {'itemprop': 'location'})
            if location_div is not None:
                match_info['stadium_name'] = _nth_link_text(location_div, 0)
                match_info['stadium_city'] = _nth_link_text(location_div, 1)

            # Attendance and referee are picked purely by position (2nd and 3rd
            # <div> under div.fright) -- NOTE(review): this assumes a fixed div
            # order on the page; confirm against the current markup.
            fright_divs = fright.find_all('div')
            if len(fright_divs) > 1:
                match_info['stadium_attendance'] = fright_divs[1].get_text(strip=True)
            if len(fright_divs) > 2:
                referee_div = fright_divs[2]
                match_info['referee_name'] = _nth_link_text(referee_div, 0)
                match_info['referee_nationality'] = _nth_link_text(referee_div, 1)

        # Step 4: one record per goal. Without an event table there is nothing
        # to read goals from, so the box is skipped rather than dereferencing None.
        if fevent_table is None:
            continue

        for goal_row in fevent_table.find_all('tr', {'class': 'fgoals'}):
            expanded_matches_data.extend(
                _goal_records(goal_row, 'fhgoal', match_info['home_team'], match_info)
            )
            expanded_matches_data.extend(
                _goal_records(goal_row, 'fagoal', match_info['away_team'], match_info)
            )

    return expanded_matches_data


In [3]:
def clean_data(df):
    """Turn the raw scraped goal records into typed, analysis-ready columns.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with at least the text columns ``score``, ``goal_minute``,
        ``stadium_attendance`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with:

        * ``extra_time``: 1 if the score text carries an ``a.e.t.`` note, else 0.
        * ``score``: reduced to digits and the en dash (e.g. ``'2–1'``).
        * ``goals_home`` / ``goals_away``: ints split from ``score``
          (0 where a side cannot be parsed).
        * one row per goal: comma-separated ``goal_minute`` values are exploded,
          so index labels of the source rows repeat.
        * ``own_goal`` / ``penalty``: 1 if the goal text contains ``(o.g.)`` /
          ``(pen.)``, else 0.
        * ``goal_minute_et``: minutes after a ``+`` (e.g. 2 for ``45+2``), else 0.
        * ``goal_et``: 1 if the minute before any ``+`` is >= 90, else 0.
        * ``goal_minute``: int, base minute plus ``goal_minute_et``.
        * ``stadium_attendance``: nullable ``Int64``; ``<NA>`` where the text
          holds no digits or is missing.
        * ``short_date``: text inside the first parentheses of ``date``;
          ``long_date``: text before the first ``(``. ``date`` itself is dropped.

    Raises
    ------
    ValueError
        If a goal minute is not of the form ``N`` or ``N+M`` after cleaning.
    """
    # Work on a copy so the caller's frame keeps its raw columns.
    df = df.copy()

    # Match on the bare 'a.e.t.' token so combined notes such as
    # '(a.e.t./g.g.)' are flagged as well as the plain '(a.e.t.)'.
    df['extra_time'] = df['score'].str.contains('a.e.t.', regex=False, na=False).astype(int)

    # Keeping only digits and the en dash also removes any extra-time note.
    df['score'] = df['score'].str.replace(r'[^\d–]', '', regex=True).str.strip()

    # n=1 plus reindex guarantees exactly two columns whatever the text holds.
    goals = df['score'].str.split('–', n=1, expand=True).reindex(columns=[0, 1])
    df['goals_home'] = pd.to_numeric(goals[0], errors='coerce').fillna(0).astype(int)
    df['goals_away'] = pd.to_numeric(goals[1], errors='coerce').fillna(0).astype(int)

    # One row per goal when a scorer entry lists several comma-separated minutes.
    df = df.assign(goal_minute=df['goal_minute'].str.split(',')).explode('goal_minute')

    # Flags are read from the per-goal text before the markers are stripped.
    df['own_goal'] = df['goal_minute'].str.contains('(o.g.)', regex=False, na=False).astype(int)
    df['penalty'] = df['goal_minute'].str.contains('(pen.)', regex=False, na=False).astype(int)

    minute_text = (
        df['goal_minute']
        .str.replace('(o.g.)', '', regex=False)
        .str.replace('(pen.)', '', regex=False)
        .str.replace("'", '', regex=False)
        .str.strip()
    )

    # 'N' or 'N+M': group 0 is the base minute, group 1 the added minutes.
    minute_parts = minute_text.str.extract(r'^\s*(\d+)\s*(?:\+\s*(\d+)\s*)?$')
    unparsed = minute_parts[0].isna().to_numpy()
    if unparsed.any():
        offenders = minute_text[unparsed].unique().tolist()
        raise ValueError(f"Unparseable goal_minute value(s): {offenders}")

    base_minute = pd.to_numeric(minute_parts[0]).astype(int)
    added_minute = pd.to_numeric(minute_parts[1], errors='coerce').fillna(0).astype(int)

    df['goal_minute_et'] = added_minute
    df['goal_minute'] = base_minute + added_minute
    df['goal_et'] = (base_minute >= 90).astype(int)

    # Drop bracketed footnote markers (e.g. '[57]') first; otherwise their
    # digits would be glued onto the attendance figure by the digit filter.
    attendance_text = (
        df['stadium_attendance']
        .str.replace(r'\[[^\]]*\]', '', regex=True)
        .str.replace(r'\D', '', regex=True)
    )
    # Nullable integer dtype: a match without an attendance stays <NA>
    # instead of aborting the whole cleaning run.
    df['stadium_attendance'] = pd.to_numeric(attendance_text, errors='coerce').astype('Int64')

    df['short_date'] = df['date'].str.extract(r'\((.*?)\)', expand=False)
    df['long_date'] = df['date'].str.split('(').str[0].str.strip()

    return df.drop(columns=['date'])


In [4]:
# UEFA Euro editions to scrape: every fourth year from 1984 to 2024 inclusive
years = list(range(1984, 2024 + 1, 4))

# Goal-level records accumulated across all editions
all_matches_data = []

# Scrape each edition in turn and pool the records
for year in years:
    print(f"Scraping UEFA Euro {year} data...")
    year_data = scrape_uefa_euro_data(year)
    all_matches_data.extend(year_data)

# Build a single DataFrame once, from the full list of records
all_matches_df = pd.DataFrame(all_matches_data)

# Clean the data
all_matches_df_cleaned = clean_data(all_matches_df)

# Export the final frame. The target folder is resolved from the current user's
# home directory instead of a hard-coded 'C:\Users\<login>' prefix, so the cell
# also works when the profile folder differs from the login name or on a
# non-Windows machine; the folder is created if it does not exist yet.
output_dir = Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc' / 'data' / 'in'
output_dir.mkdir(parents=True, exist_ok=True)
file_path = str(output_dir / 'euro_goals.xlsx')  # kept as a plain string, as before
all_matches_df_cleaned.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")

Scraping UEFA Euro 1984 data...
Scraping UEFA Euro 1988 data...
Scraping UEFA Euro 1992 data...
Scraping UEFA Euro 1996 data...
Scraping UEFA Euro 2000 data...
Scraping UEFA Euro 2004 data...
Scraping UEFA Euro 2008 data...
Scraping UEFA Euro 2012 data...
Scraping UEFA Euro 2016 data...
Scraping UEFA Euro 2020 data...
Scraping UEFA Euro 2024 data...
Data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\euro_goals.xlsx
