In [9]:
# Re-run the code to clean the data and save the output
import pandas as pd
import requests
from bs4 import BeautifulSoup
from getpass import getuser

# Get the current user's name
user = getuser()

In [10]:
# Define the function to scrape data for a single edition
def scrape_uefa_euro_data(year):
    url = f'https://en.wikipedia.org/wiki/UEFA_Euro_{year}'
    # Send a request to fetch the HTML content of the page
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Searching for divs with the class 'footballbox'
    footballbox_divs = soup.find_all('div', {'class': 'footballbox'})

    # Initialize a new list to store the expanded match data with individual goals per row
    expanded_matches_data = []

    # Iterating through each footballbox div
    for box in footballbox_divs:
        match_info = {}

        # Step 1: Extracting date and time from div class = “fleft”
        fleft = box.find('div', {'class': 'fleft'})
        if fleft:
            match_info['date'] = fleft.find('div', {'class': 'fdate'}).get_text(strip=True) if fleft.find('div', {'class': 'fdate'}) else None
            match_info['time'] = fleft.find('div', {'class': 'ftime'}).get_text(strip=True) if fleft.find('div', {'class': 'ftime'}) else None

        # Step 2: Extracting home team, away team, and score from table class = “fevent”
        fevent_table = box.find('table', {'class': 'fevent'})
        if fevent_table:
            # Extracting home team, away team, and score
            match_info['home_team'] = fevent_table.find('th', {'class': 'fhome'}).get_text(strip=True) if fevent_table.find('th', {'class': 'fhome'}) else None
            match_info['away_team'] = fevent_table.find('th', {'class': 'faway'}).get_text(strip=True) if fevent_table.find('th', {'class': 'faway'}) else None
            match_info['score'] = fevent_table.find('th', {'class': 'fscore'}).get_text(strip=True) if fevent_table.find('th', {'class': 'fscore'}) else None

        # Step 3: Extracting stadium and referee info from div class = “fright”
        fright = box.find('div', {'class': 'fright'})
        if fright:
            # First div contains the stadium name and city
            location_div = fright.find('div', {'itemprop': 'location'})
            if location_div:
                match_info['stadium_name'] = location_div.find('a').get_text(strip=True) if location_div.find('a') else None
                match_info['stadium_city'] = location_div.find_all('a')[1].get_text(strip=True) if len(location_div.find_all('a')) > 1 else None

            # Second div contains attendance information
            stadium_attendance = fright.find_all('div')[1].get_text(strip=True) if len(fright.find_all('div')) > 1 else None
            match_info['stadium_attendance'] = stadium_attendance

            # Third div contains referee name and nationality
            referee_div = fright.find_all('div')[2] if len(fright.find_all('div')) > 2 else None
            if referee_div:
                match_info['referee_name'] = referee_div.find('a').get_text(strip=True) if referee_div.find('a') else None
                match_info['referee_nationality'] = referee_div.find_all('a')[1].get_text(strip=True) if len(referee_div.find_all('a')) > 1 else None

        # Step 4: Extracting individual goals for home team and away team
        fgoals_rows = fevent_table.find_all('tr', {'class': 'fgoals'})

        for goal_row in fgoals_rows:
            # Extracting home goals (fhgoal)
            home_goals = goal_row.find_all('td', {'class': 'fhgoal'})
            for home_goal in home_goals:
                # Find the div with class "plainlist" and loop through the li tags
                plainlist = home_goal.find('div', {'class': 'plainlist'})
                if plainlist:
                    scorers = plainlist.find_all('li')
                    for scorer in scorers:
                        scorer_name = scorer.find('a').get_text(strip=True) if scorer.find('a') else None
                        if scorer_name:
                            # Now, extract the goal minutes from span tags (if the player scored multiple goals)
                            goal_minutes = scorer.find_all('span', {'class': 'fb-goal'})
                            for minute in goal_minutes:
                                match_copy = match_info.copy()
                                match_copy['scorer_name'] = scorer_name
                                match_copy['scorer_nationality'] = match_info.get('home_team')
                                match_copy['goal_minute'] = minute.get_text(strip=True)
                                expanded_matches_data.append(match_copy)

            # Extracting away goals (fagoal)
            away_goals = goal_row.find_all('td', {'class': 'fagoal'})
            for away_goal in away_goals:
                # Find the div with class "plainlist" and loop through the li tags
                plainlist = away_goal.find('div', {'class': 'plainlist'})
                if plainlist:
                    scorers = plainlist.find_all('li')
                    for scorer in scorers:
                        scorer_name = scorer.find('a').get_text(strip=True) if scorer.find('a') else None
                        if scorer_name:
                            # Now, extract the goal minutes from span tags (if the player scored multiple goals)
                            goal_minutes = scorer.find_all('span', {'class': 'fb-goal'})
                            for minute in goal_minutes:
                                match_copy = match_info.copy()
                                match_copy['scorer_name'] = scorer_name
                                match_copy['scorer_nationality'] = match_info.get('away_team')
                                match_copy['goal_minute'] = minute.get_text(strip=True)
                                expanded_matches_data.append(match_copy)

    return expanded_matches_data

# Function to clean and modify the data as requested
def clean_data(df):
    # Create a new column 'extra_time' that takes value 1 if '(a.e.t.)' is in the score, 0 otherwise
    df['extra_time'] = df['score'].apply(lambda x: 1 if '(a.e.t.)' in str(x) else 0)
    
    # Remove "(a.e.t.)" and other non-numeric characters from the score before splitting
    df['score'] = df['score'].str.replace(r'\(a.e.t.\)', '', regex=True).str.strip()

    # Clean the score by removing any non-numeric characters except for the en-dash (–)
    df['score'] = df['score'].str.replace(r'[^\d–]', '', regex=True).str.strip()

    # Split the score column into goals_home and goals_away
    df[['goals_home', 'goals_away']] = df['score'].str.split('–', expand=True)

    # Convert the goals to integers if possible, and handle missing data
    df['goals_home'] = pd.to_numeric(df['goals_home'], errors='coerce').fillna(0).astype(int)
    df['goals_away'] = pd.to_numeric(df['goals_away'], errors='coerce').fillna(0).astype(int)

    # Remove "'" from goal_minute and convert to integer
    df['goal_minute'] = df['goal_minute'].str.replace("'", "").str.extract('(\d+)').astype(int)

    # Create a new column 'penalty' that takes value 1 if '(pen.)' is in goal_minute, 0 otherwise
    df['penalty'] = df['goal_minute'].apply(lambda x: 1 if 'pen.' in str(x) else 0)

    # Clean stadium_attendance: remove non-numeric characters, including "Attendance:", and convert to integer
    df['stadium_attendance'] = df['stadium_attendance'].str.replace(r'[^\d]', '', regex=True).astype(int)

    # Split the date into short_date and long_date
    df['short_date'] = df['date'].str.extract(r'\((.*?)\)')
    df['long_date'] = df['date'].str.split('(').str[0].str.strip()

    # Drop the 'date' column after splitting
    df = df.drop(columns=['date'])

    return df


  df['goal_minute'] = df['goal_minute'].str.replace("'", "").str.extract('(\d+)').astype(int)


In [11]:
def scrape_uefa_euro_data(year):
    url = f'https://en.wikipedia.org/wiki/UEFA_Euro_{year}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all h3 tags (which likely correspond to group or round titles)
    h3_tags = soup.find_all('h3')

    # Initialize a list to store the expanded match data with individual goals per row
    expanded_matches_data = []

    current_group = None  # To store the current group or round information

    # Iterate through h3 tags and footballbox divs
    for h3_tag in h3_tags:
        # Extract the group/round name from the h3 tag
        current_group = h3_tag.get_text(strip=True)

        # Find all the next divs with class 'footballbox' after the h3
        footballbox_divs = h3_tag.find_next_siblings('div', {'class': 'footballbox'})
        
        # Iterate through each footballbox div and scrape the data
        for box in footballbox_divs:
            match_info = {}
            match_info['group'] = current_group  # Add the group/round information to the match info

            # Step 1: Extracting date and time from div class = “fleft”
            fleft = box.find('div', {'class': 'fleft'})
            if fleft:
                match_info['date'] = fleft.find('div', {'class': 'fdate'}).get_text(strip=True) if fleft.find('div', {'class': 'fdate'}) else None
                match_info['time'] = fleft.find('div', {'class': 'ftime'}).get_text(strip=True) if fleft.find('div', {'class': 'ftime'}) else None

            # Step 2: Extracting home team, away team, and score from table class = “fevent”
            fevent_table = box.find('table', {'class': 'fevent'})
            if fevent_table:
                # Extracting home team, away team, and score
                match_info['home_team'] = fevent_table.find('th', {'class': 'fhome'}).get_text(strip=True) if fevent_table.find('th', {'class': 'fhome'}) else None
                match_info['away_team'] = fevent_table.find('th', {'class': 'faway'}).get_text(strip=True) if fevent_table.find('th', {'class': 'faway'}) else None
                match_info['score'] = fevent_table.find('th', {'class': 'fscore'}).get_text(strip=True) if fevent_table.find('th', {'class': 'fscore'}) else None
                print(f"Score found: {match_info['score']}")  # Debug print to ensure score is being extracted
            else:
                print("No score found for this match")

            # Step 3: Extracting stadium and referee info from div class = “fright”
            fright = box.find('div', {'class': 'fright'})
            if fright:
                # First div contains the stadium name and city
                location_div = fright.find('div', {'itemprop': 'location'})
                if location_div:
                    match_info['stadium_name'] = location_div.find('a').get_text(strip=True) if location_div.find('a') else None
                    match_info['stadium_city'] = location_div.find_all('a')[1].get_text(strip=True) if len(location_div.find_all('a')) > 1 else None

                # Second div contains attendance information
                stadium_attendance = fright.find_all('div')[1].get_text(strip=True) if len(fright.find_all('div')) > 1 else None
                match_info['stadium_attendance'] = stadium_attendance

                # Third div contains referee name and nationality
                referee_div = fright.find_all('div')[2] if len(fright.find_all('div')) > 2 else None
                if referee_div:
                    match_info['referee_name'] = referee_div.find('a').get_text(strip=True) if referee_div.find('a') else None
                    match_info['referee_nationality'] = referee_div.find_all('a')[1].get_text(strip=True) if len(referee_div.find_all('a')) > 1 else None

            # Step 4: Extracting individual goals for home team and away team
            fgoals_rows = fevent_table.find_all('tr', {'class': 'fgoals'})

            for goal_row in fgoals_rows:
                # Extracting home goals (fhgoal)
                home_goals = goal_row.find_all('td', {'class': 'fhgoal'})
                for home_goal in home_goals:
                    plainlist = home_goal.find('div', {'class': 'plainlist'})
                    if plainlist:
                        scorers = plainlist.find_all('li')
                        for scorer in scorers:
                            scorer_name = scorer.find('a').get_text(strip=True) if scorer.find('a') else None
                            if scorer_name:
                                goal_minutes = scorer.find_all('span', {'class': 'fb-goal'})
                                for minute in goal_minutes:
                                    match_copy = match_info.copy()
                                    match_copy['scorer_name'] = scorer_name
                                    match_copy['scorer_nationality'] = match_info.get('home_team')
                                    match_copy['goal_minute'] = minute.get_text(strip=True)
                                    expanded_matches_data.append(match_copy)

                # Extracting away goals (fagoal)
                away_goals = goal_row.find_all('td', {'class': 'fagoal'})
                for away_goal in away_goals:
                    plainlist = away_goal.find('div', {'class': 'plainlist'})
                    if plainlist:
                        scorers = plainlist.find_all('li')
                        for scorer in scorers:
                            scorer_name = scorer.find('a').get_text(strip=True) if scorer.find('a') else None
                            if scorer_name:
                                goal_minutes = scorer.find_all('span', {'class': 'fb-goal'})
                                for minute in goal_minutes:
                                    match_copy = match_info.copy()
                                    match_copy['scorer_name'] = scorer_name
                                    match_copy['scorer_nationality'] = match_info.get('away_team')
                                    match_copy['goal_minute'] = minute.get_text(strip=True)
                                    expanded_matches_data.append(match_copy)

    return expanded_matches_data


In [12]:
# List of years from 1984 to 2024, increasing by 4 each time
years = list(range(1984, 2024 + 1, 4))

# Initialize a list to store data for all editions
all_matches_data = []

# Loop through each year and scrape the data
for year in years:
    print(f"Scraping UEFA Euro {year} data...")
    year_data = scrape_uefa_euro_data(year)
    all_matches_data.extend(year_data)

# Convert the collected data to a DataFrame
all_matches_df = pd.DataFrame(all_matches_data)

# Clean the data
all_matches_df_cleaned = clean_data(all_matches_df)

# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\euro_goals.xlsx'
all_matches_df_cleaned.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")

Scraping UEFA Euro 1984 data...
Scraping UEFA Euro 1988 data...
Scraping UEFA Euro 1992 data...
Scraping UEFA Euro 1996 data...
Scraping UEFA Euro 2000 data...
Scraping UEFA Euro 2004 data...
Scraping UEFA Euro 2008 data...
Scraping UEFA Euro 2012 data...
Scraping UEFA Euro 2016 data...
Scraping UEFA Euro 2020 data...
Scraping UEFA Euro 2024 data...


KeyError: 'score'