In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL of the Wikipedia page
url = 'https://en.wikipedia.org/wiki/UEFA_Euro_1984'

# Send a request to fetch the HTML content of the page.
# - timeout: without it, requests can wait forever on a stalled connection.
# - raise_for_status: fail loudly on an HTTP error instead of silently
#   parsing an error page into an empty result.
# - User-Agent: Wikimedia's User-Agent policy asks clients to identify
#   themselves; the value below is a placeholder, replace it with one that
#   describes your own project/contact.
response = requests.get(
    url,
    headers={'User-Agent': 'euro1984-footballbox-scraper/1.0 (python-requests)'},
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Searching for divs with the class 'footballbox' (one per match)
footballbox_divs = soup.find_all('div', {'class': 'footballbox'})

# One dict per individual goal; each dict repeats the match-level fields
expanded_matches_data = []

# (goal cell class, match_info key holding that side's team name)
_GOAL_CELLS = (('fhgoal', 'home_team'), ('fagoal', 'away_team'))


def _text_of(tag):
    """Return the whitespace-stripped text of *tag*, or None if *tag* is None."""
    return tag.get_text(strip=True) if tag is not None else None


def _nth_link_text(container, position):
    """Return the text of the <a> at *position* in *container*, or None."""
    links = container.find_all('a')
    return links[position].get_text(strip=True) if len(links) > position else None


def _iter_goals(goal_cell):
    """Yield (scorer_name, goal_minute_text) for every goal in one goal cell.

    Scorers are the <li> items of the cell's 'plainlist' div; entries without
    a linked (non-empty) player name are skipped, and each 'fb-goal' span of
    a scorer yields one pair.
    """
    plainlist = goal_cell.find('div', {'class': 'plainlist'})
    if plainlist is None:
        return
    for scorer in plainlist.find_all('li'):
        scorer_name = _text_of(scorer.find('a'))
        if not scorer_name:
            continue
        for minute in scorer.find_all('span', {'class': 'fb-goal'}):
            yield scorer_name, minute.get_text(strip=True)


# Iterating through each footballbox div
for box in footballbox_divs:
    match_info = {}

    # Step 1: date and time from div class 'fleft'
    fleft = box.find('div', {'class': 'fleft'})
    if fleft is not None:
        match_info['date'] = _text_of(fleft.find('div', {'class': 'fdate'}))
        match_info['time'] = _text_of(fleft.find('div', {'class': 'ftime'}))

    # Step 2: home team, away team, and score from table class 'fevent'
    fevent_table = box.find('table', {'class': 'fevent'})
    if fevent_table is not None:
        match_info['home_team'] = _text_of(fevent_table.find('th', {'class': 'fhome'}))
        match_info['away_team'] = _text_of(fevent_table.find('th', {'class': 'faway'}))
        match_info['score'] = _text_of(fevent_table.find('th', {'class': 'fscore'}))

    # Step 3: stadium and referee info from div class 'fright'
    fright = box.find('div', {'class': 'fright'})
    if fright is not None:
        # The div marked itemprop='location' holds stadium (1st link) and
        # city (2nd link)
        location_div = fright.find('div', {'itemprop': 'location'})
        if location_div is not None:
            match_info['stadium_name'] = _nth_link_text(location_div, 0)
            match_info['stadium_city'] = _nth_link_text(location_div, 1)

        # Look the descendant divs up once; they are addressed by position
        fright_divs = fright.find_all('div')

        # Second div contains attendance information
        stadium_attendance = _text_of(fright_divs[1]) if len(fright_divs) > 1 else None
        match_info['stadium_attendance'] = stadium_attendance

        # Third div contains referee name (1st link) and nationality (2nd link)
        if len(fright_divs) > 2:
            referee_div = fright_divs[2]
            match_info['referee_name'] = _nth_link_text(referee_div, 0)
            match_info['referee_nationality'] = _nth_link_text(referee_div, 1)

    # Step 4: individual goals for home team and away team.
    # A box without an 'fevent' table has no goal rows; skip it instead of
    # calling find_all on None.
    if fevent_table is None:
        continue

    for goal_row in fevent_table.find_all('tr', {'class': 'fgoals'}):
        # Home cells first, then away cells, within each goal row
        for cell_class, team_key in _GOAL_CELLS:
            for goal_cell in goal_row.find_all('td', {'class': cell_class}):
                for scorer_name, goal_minute in _iter_goals(goal_cell):
                    # Match-level keys first so the column order is stable
                    expanded_matches_data.append({
                        **match_info,
                        'scorer_name': scorer_name,
                        'scorer_nationality': match_info.get(team_key),
                        'goal_minute': goal_minute,
                    })

# Convert the expanded data into a DataFrame
expanded_matches_df = pd.DataFrame(expanded_matches_data)

def split_goal_minutes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with one row per individual goal minute.

    Each string in the 'goal_minute' column is split on commas; the row is
    repeated once per piece, with the piece stripped of surrounding
    whitespace. Values that are not strings (e.g. None/NaN) are kept as a
    single unchanged row rather than raising. Repeated rows keep the index
    label of the row they came from, so the result's index may contain
    duplicates.

    Args:
        df: Frame with a 'goal_minute' column.

    Returns:
        A new DataFrame with the same columns as *df*.

    Raises:
        KeyError: If *df* has rows but no 'goal_minute' column.
    """
    # Nothing to split; return a copy so the columns are preserved.
    if df.shape[0] == 0:
        return df.copy()

    def _pieces(value):
        # Only strings can be split; wrap anything else so explode() keeps
        # it as a single row.
        if isinstance(value, str):
            return [piece.strip() for piece in value.split(',')]
        return [value]

    minute_lists = df['goal_minute'].map(_pieces)
    return df.assign(goal_minute=minute_lists).explode('goal_minute')

# Expand comma-separated goal minutes so that every goal gets its own row
expanded_matches_df_split = expanded_matches_df.pipe(split_goal_minutes)

# Preview the first 20 rows (shown as the notebook cell's output)
expanded_matches_df_split.head(20)


Unnamed: 0,date,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,scorer_name,scorer_nationality,goal_minute
0,12 June 1984(1984-06-12),20:30,France,Denmark,1–0,Parc des Princes,Paris,"Attendance: 47,570",Volker Roth,West Germany,Platini,France,78'
1,13 June 1984(1984-06-13),20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,"Attendance: 41,525",Erik Fredriksson,Sweden,Vandenbergh,Belgium,28'
2,13 June 1984(1984-06-13),20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,"Attendance: 41,525",Erik Fredriksson,Sweden,Grün,Belgium,45'
3,16 June 1984(1984-06-16),17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,"Attendance: 51,359",Bob Valentine,Scotland,Platini,France,4'
3,16 June 1984(1984-06-16),17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,"Attendance: 51,359",Bob Valentine,Scotland,Platini,France,74' (pen.)
3,16 June 1984(1984-06-16),17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,"Attendance: 51,359",Bob Valentine,Scotland,Platini,France,89'
4,16 June 1984(1984-06-16),17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,"Attendance: 51,359",Bob Valentine,Scotland,Giresse,France,33'
5,16 June 1984(1984-06-16),17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,"Attendance: 51,359",Bob Valentine,Scotland,Fernández,France,43'
6,16 June 1984(1984-06-16),20:30,Denmark,Yugoslavia,5–0,Stade de Gerland,Lyon,"Attendance: 34,736",Augusto Lamo Castillo,Spain,Arnesen,Denmark,8'
6,16 June 1984(1984-06-16),20:30,Denmark,Yugoslavia,5–0,Stade de Gerland,Lyon,"Attendance: 34,736",Augusto Lamo Castillo,Spain,Arnesen,Denmark,69' (pen.)
