In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from getpass import getuser


In [2]:
# Get the current user's name
user = getuser()

In [3]:
# Run the notebook to load functions
%run scraping_functions.ipynb


# FIFA World Cup Data

## men

In [4]:

# Define the list of Men's FIFA World Cup years
men_wc_year = [
    1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022
]

# Define the URL template for the Men's FIFA World Cup
url_template = 'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'

# Call the scraper function with the specified list of years and URL template
all_matches_data = scraper(men_wc_year, url_template)

# Convert the collected data to a DataFrame
all_matches_df = pd.DataFrame(all_matches_data)



Attempting to scrape https://en.wikipedia.org/wiki/1986_FIFA_World_Cup
Found 52 'footballbox' divs for year 1986.

Processing footballbox 1 for 1986.
Stage: Group A
Match: Bulgaria vs Italy (1–1)

Processing footballbox 2 for 1986.
Stage: Group A
Match: Argentina vs South Korea (3–1)

Processing footballbox 3 for 1986.
Stage: Group A
Match: Italy vs Argentina (1–1)

Processing footballbox 4 for 1986.
Stage: Group A
Match: South Korea vs Bulgaria (1–1)

Processing footballbox 5 for 1986.
Stage: Group A
Match: South Korea vs Italy (2–3)

Processing footballbox 6 for 1986.
Stage: Group A
Match: Argentina vs Bulgaria (2–0)

Processing footballbox 7 for 1986.
Stage: Group B
Match: Belgium vs Mexico (1–2)

Processing footballbox 8 for 1986.
Stage: Group B
Match: Paraguay vs Iraq (1–0)

Processing footballbox 9 for 1986.
Stage: Group B
Match: Mexico vs Paraguay (1–1)

Processing footballbox 10 for 1986.
Stage: Group B
Match: Iraq vs Belgium (1–2)

Processing footballbox 11 for 1986.
Stage: G

In [5]:
# Clean the data if a clean_data function is available
all_matches_df_cleaned = clean_data(all_matches_df)

# Wolrd Cup 1990 data retrieved from Kaggle

In [6]:
# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\fifa.csv'

# Read the dataset with a different encoding
kaggle = pd.read_csv(data_path, encoding='ISO-8859-1')

In [7]:
# Extract the year from the 'tournament_id' column in the Kaggle dataset
kaggle['year'] = kaggle['tournament_id'].str.extract(r'WC-(\d{4})').astype(int)

# Filter for observations where the year is 1990
kaggle_1990 = kaggle[kaggle['year'] == 1990]

In [8]:
# Rename and transform variables as specified, directly calculating goals_home, goals_away, and score
kaggle_1990_transformed = kaggle_1990.copy()

# Rename group_name to stage
kaggle_1990_transformed = kaggle_1990_transformed.rename(columns={'group_name': 'stage'})

# Extract home_team and away_team from match_name
kaggle_1990_transformed[['home_team', 'away_team']] = kaggle_1990_transformed['match_name'].str.split(' v ', expand=True)

# Initialize goals_home and goals_away
kaggle_1990_transformed['goals_home'] = 0
kaggle_1990_transformed['goals_away'] = 0

# Calculate goals_home and goals_away based on scorer_nationality and own_goal
for match_id, group in kaggle_1990_transformed.groupby(['year', 'stage', 'home_team', 'away_team']):
    # Initialize match-specific goals
    goals_home = 0
    goals_away = 0
    
    for _, row in group.iterrows():
        if row['own_goal'] == 0:
            # Regular goal
            if row['home_team'] == row['player_team_name']:
                goals_home += 1
            elif row['away_team'] == row['player_team_name']:
                goals_away += 1
        else:
            # Own goal
            if row['home_team'] == row['player_team_name']:
                goals_away += 1
            elif row['away_team'] == row['player_team_name']:
                goals_home += 1

    # Assign calculated values back to the DataFrame
    kaggle_1990_transformed.loc[
        (kaggle_1990_transformed['year'] == match_id[0]) &
        (kaggle_1990_transformed['stage'] == match_id[1]) &
        (kaggle_1990_transformed['home_team'] == match_id[2]) &
        (kaggle_1990_transformed['away_team'] == match_id[3]),
        ['goals_home', 'goals_away']
    ] = goals_home, goals_away

# Calculate the score column as "goals_home–goals_away"
kaggle_1990_transformed['score'] = kaggle_1990_transformed['goals_home'].astype(int).astype(str) + '–' + kaggle_1990_transformed['goals_away'].astype(int).astype(str)

# Leave certain columns empty
kaggle_1990_transformed['stadium_name'] = None
kaggle_1990_transformed['stadium_city'] = None
kaggle_1990_transformed['stadium_attendance'] = None
kaggle_1990_transformed['referee_name'] = None
kaggle_1990_transformed['referee_nationality'] = None
kaggle_1990_transformed['time'] = None

# Create scorer_name from family_name and given_name
kaggle_1990_transformed['scorer_name'] = kaggle_1990_transformed['given_name'] + ' ' + kaggle_1990_transformed['family_name']

# Rename player_team_name to scorer_nationality
kaggle_1990_transformed = kaggle_1990_transformed.rename(columns={'player_team_name': 'scorer_nationality'})

# Rename minute_regulation to goal_minute
kaggle_1990_transformed = kaggle_1990_transformed.rename(columns={'minute_regulation': 'goal_minute'})

# Rename minute_stoppage to extra_time
kaggle_1990_transformed = kaggle_1990_transformed.rename(columns={'minute_stoppage': 'extra_time'})

# Keep own_goal and penalty
# (No action needed as they are already present)

# Create goal_et and assign value 0
kaggle_1990_transformed['goal_et'] = 0

# Create goal_minute_et and assign value 0
kaggle_1990_transformed['goal_minute_et'] = 0

# Create and leave short_date empty
kaggle_1990_transformed['short_date'] = None

# Convert match_date to the format of long_date (e.g., 30 June 1990)
kaggle_1990_transformed['long_date'] = pd.to_datetime(kaggle_1990_transformed['match_date'], format='%m/%d/%Y').dt.strftime('%d %B %Y')

# Keep only the relevant columns
columns_to_keep = [
    'stage', 'year', 'home_team', 'away_team', 'score', 'stadium_name',
    'stadium_city', 'stadium_attendance', 'referee_name', 'referee_nationality',
    'time', 'scorer_name', 'scorer_nationality', 'goal_minute', 'goal_minute_et','extra_time',
    'goals_home', 'goals_away', 'own_goal', 'penalty', 'goal_et',
    'short_date', 'long_date'
]
kaggle_1990_final = kaggle_1990_transformed[columns_to_keep]

# Display the first few rows of the transformed dataset
kaggle_1990_final.head()


Unnamed: 0,stage,year,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,...,goal_minute,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date
1328,Group B,1990,Argentina,Cameroon,0–1,,,,,,...,67,0,0,0,1,0,0,0,,08 June 1990
1329,Group B,1990,Soviet Union,Romania,0–2,,,,,,...,41,0,0,0,2,0,0,0,,09 June 1990
1330,Group B,1990,Soviet Union,Romania,0–2,,,,,,...,55,0,0,0,2,0,1,0,,09 June 1990
1331,Group D,1990,United Arab Emirates,Colombia,0–2,,,,,,...,50,0,0,0,2,0,0,0,,09 June 1990
1332,Group D,1990,United Arab Emirates,Colombia,0–2,,,,,,...,85,0,0,0,2,0,0,0,,09 June 1990


In [9]:
# Create a dictionary to map matches to their respective times
match_times = {
    ('Italy', 'Austria', 'Group A'): '21:00',
    ('United States', 'Czechoslovakia', 'Group A'): '17:00',
    ('Italy', 'United States', 'Group A'): '21:00',
    ('Austria', 'Czechoslovakia', 'Group A'): '21:00',
    ('Austria', 'United States', 'Group A'): '21:00',
    ('Italy', 'Czechoslovakia', 'Group A'): '21:00',
    ('Argentina', 'Cameroon', 'Group B'): '18:00',
    ('Soviet Union', 'Romania', 'Group B'): '17:00',
    ('Cameroon', 'Romania', 'Group B'): '17:00',
    ('Argentina', 'Soviet Union', 'Group B'): '21:00',
    ('Cameroon', 'Soviet Union', 'Group B'): '21:00',
    ('Argentina', 'Romania', 'Group B'): '21:00',
    ('Brazil', 'Sweden', 'Group C'): '21:00',
    ('Costa Rica', 'Scotland', 'Group C'): '17:00',
    ('Brazil', 'Costa Rica', 'Group C'): '21:00',
    ('Sweden', 'Scotland', 'Group C'): '17:00',
    ('Brazil', 'Scotland', 'Group C'): '21:00',
    ('Sweden', 'Costa Rica', 'Group C'): '21:00',
    ('West Germany', 'Yugoslavia', 'Group D'): '21:00',
    ('United Arab Emirates', 'Colombia', 'Group D'): '17:00',
    ('Yugoslavia', 'Colombia', 'Group D'): '17:00',
    ('Yugoslavia', 'United Arab Emirates', 'Group D'): '17:00',
    ('West Germany', 'United Arab Emirates', 'Group D'): '21:00',
    ('West Germany', 'Colombia', 'Group D'): '21:00',
    ('South Korea','Spain', 'Group E'): '17:00',
    ('Belgium', 'Uruguay', 'Group E'): '21:00',
    ('South Korea', 'Uruguay', 'Group E'): '17:00',
    ('Belgium', 'Spain', 'Group E'): '21:00',
    ('Belgium', 'South Korea', 'Group E'): '21:00',
    ('England', 'Republic of Ireland', 'Group F'): '21:00',
    ('Netherlands', 'Egypt', 'Group F'): '21:00',
    ('England', 'Netherlands', 'Group F'): '21:00',
    ('Republic of Ireland', 'Egypt', 'Group F'): '17:00',
    ('Republic of Ireland', 'Netherlands', 'Group F'): '21:00',
    ('England', 'Egypt', 'Group F'): '21:00'
    }

# Update the 'time' column in the dataset based on the match_times dictionary
for index, row in kaggle_1990_final.iterrows():
    key = (row['home_team'], row['away_team'], row['stage'])
    if key in match_times:
        kaggle_1990_final.at[index, 'time'] = match_times[key]

# Display the updated dataset
kaggle_1990_final.head()


Unnamed: 0,stage,year,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,...,goal_minute,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date
1328,Group B,1990,Argentina,Cameroon,0–1,,,,,,...,67,0,0,0,1,0,0,0,,08 June 1990
1329,Group B,1990,Soviet Union,Romania,0–2,,,,,,...,41,0,0,0,2,0,0,0,,09 June 1990
1330,Group B,1990,Soviet Union,Romania,0–2,,,,,,...,55,0,0,0,2,0,1,0,,09 June 1990
1331,Group D,1990,United Arab Emirates,Colombia,0–2,,,,,,...,50,0,0,0,2,0,0,0,,09 June 1990
1332,Group D,1990,United Arab Emirates,Colombia,0–2,,,,,,...,85,0,0,0,2,0,0,0,,09 June 1990


In [10]:
# Check if all columns in kaggle_1990_final match the columns in all_matches_df_cleaned
if set(kaggle_1990_final.columns) == set(all_matches_df_cleaned.columns):
    print("All columns in kaggle_1990_final match the columns in all_matches_df_cleaned.")
else:
    unmatched_columns = set(kaggle_1990_final.columns).symmetric_difference(all_matches_df_cleaned.columns)
    print(f"The following columns do not match: {unmatched_columns}")

# Proceed with filtering and appending if columns match
matching_columns = kaggle_1990_final.columns.intersection(all_matches_df_cleaned.columns)
kaggle_1990_filtered = kaggle_1990_final[matching_columns]
all_matches_df_cleaned_filtered = all_matches_df_cleaned[matching_columns]

# Append kaggle_1990_filtered to all_matches_df_cleaned_filtered
all_matches_updated = pd.concat([all_matches_df_cleaned_filtered, kaggle_1990_filtered], ignore_index=True)

# Display the first few rows of the updated DataFrame
all_matches_updated.head()


All columns in kaggle_1990_final match the columns in all_matches_df_cleaned.


Unnamed: 0,stage,year,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,...,goal_minute,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date
0,Group A,1986,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,Sweden,...,85,0,0,1,1,0,0,0,,31 May 1986
1,Group A,1986,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,Sweden,...,44,0,0,1,1,0,0,0,,31 May 1986
2,Group A,1986,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,Spain,...,6,0,0,3,1,0,0,0,,2 June 1986
3,Group A,1986,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,Spain,...,46,0,0,3,1,0,0,0,,2 June 1986
4,Group A,1986,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,Spain,...,18,0,0,3,1,0,0,0,,2 June 1986


In [11]:

# Define the file path for saving the cleaned data specifically for the Men's World Cup
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Export the cleaned data to an Excel file
all_matches_updated.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")

Data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx


## women

In [12]:

# Define the list of Men's FIFA World Cup years
women_wc_years = [
    1991, 1995, 1999, 2003, 2007, 2011, 2015, 2019, 2023
]


# Define the URL template for the Men's FIFA World Cup
url_template = 'https://en.wikipedia.org/wiki/{year}_FIFA_Women%27s_World_Cup'

# Call the scraper function with the specified list of years and URL template
all_matches_data = scraper(women_wc_years, url_template)

# Convert the collected data to a DataFrame
all_matches_df = pd.DataFrame(all_matches_data)



Attempting to scrape https://en.wikipedia.org/wiki/1991_FIFA_Women%27s_World_Cup
Found 26 'footballbox' divs for year 1991.

Processing footballbox 1 for 1991.
Stage: Group A
Match: China vs Norway (4–0)

Processing footballbox 2 for 1991.
Stage: Group A
Match: Denmark vs New Zealand (3–0)

Processing footballbox 3 for 1991.
Stage: Group A
Match: Norway vs New Zealand (4–0)

Processing footballbox 4 for 1991.
Stage: Group A
Match: China vs Denmark (2–2)

Processing footballbox 5 for 1991.
Stage: Group A
Match: China vs New Zealand (4–1)

Processing footballbox 6 for 1991.
Stage: Group A
Match: Norway vs Denmark (2–1)

Processing footballbox 7 for 1991.
Stage: Group B
Match: Japan vs Brazil (0–1)

Processing footballbox 8 for 1991.
Stage: Group B
Match: Sweden vs United States (2–3)

Processing footballbox 9 for 1991.
Stage: Group B
Match: Japan vs Sweden (0–8)

Processing footballbox 10 for 1991.
Stage: Group B
Match: Brazil vs United States (0–5)

Processing footballbox 11 for 1991.


In [13]:
# Clean the data if a clean_data function is available
all_matches_df_cleaned = clean_data(all_matches_df)

# Define the file path for saving the cleaned data specifically for the Men's World Cup
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_women.xlsx'

# Export the cleaned data to an Excel file
all_matches_df_cleaned.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")

Data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\wc_goals_women.xlsx


# UEFA Euro Data

## men

In [14]:


# Define the list of Men's UEFA European Championship
eu_men_years = [
    1984, 1988, 1992, 1996, 
    2000, 2004, 2008, 2012, 2016, 2021, 2024
]

# Define the URL template for the Men's FIFA World Cup
url_template = 'https://en.wikipedia.org/wiki/UEFA_Euro_{year}'

# Call the scraper function with the specified list of years and URL template
all_matches_data = scraper(eu_men_years, url_template)

# Convert the collected data to a DataFrame
all_matches_df = pd.DataFrame(all_matches_data)




Attempting to scrape https://en.wikipedia.org/wiki/UEFA_Euro_1984
Found 15 'footballbox' divs for year 1984.

Processing footballbox 1 for 1984.
Stage: Group 1
Match: France vs Denmark (1–0)

Processing footballbox 2 for 1984.
Stage: Group 1
Match: Belgium vs Yugoslavia (2–0)

Processing footballbox 3 for 1984.
Stage: Group 1
Match: France vs Belgium (5–0)

Processing footballbox 4 for 1984.
Stage: Group 1
Match: Denmark vs Yugoslavia (5–0)

Processing footballbox 5 for 1984.
Stage: Group 1
Match: France vs Yugoslavia (3–2)

Processing footballbox 6 for 1984.
Stage: Group 1
Match: Denmark vs Belgium (3–2)

Processing footballbox 7 for 1984.
Stage: Group 2
Match: West Germany vs Portugal (0–0)

Processing footballbox 8 for 1984.
Stage: Group 2
Match: Romania vs Spain (1–1)

Processing footballbox 9 for 1984.
Stage: Group 2
Match: West Germany vs Romania (2–1)

Processing footballbox 10 for 1984.
Stage: Group 2
Match: Portugal vs Spain (1–1)

Processing footballbox 11 for 1984.
Stage: G

In [15]:
# Clean the data if a clean_data function is available
all_matches_df_cleaned = clean_data(all_matches_df)

# Define the file path for saving the cleaned data specifically for the Men's World Cup
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Export the cleaned data to an Excel file
all_matches_df_cleaned.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")


Data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx


## women

In [16]:


# Define the list of Men's UEFA European Championship
eu_women_years = [
    1984, 1987, 1989, 1991, 1993, 1995, 1997, 2001, 2005, 2009, 
    2013, 2017, 2022
]


# Define the URL template for the Men's FIFA World Cup
url_template = 'https://en.wikipedia.org/wiki/UEFA_Women%27s_Euro_{year}'

# Call the scraper function with the specified list of years and URL template
all_matches_data = scraper(eu_women_years, url_template)

# Convert the collected data to a DataFrame
all_matches_df = pd.DataFrame(all_matches_data)





Attempting to scrape https://en.wikipedia.org/wiki/UEFA_Women%27s_Euro_1984
Found 6 'footballbox' divs for year 1984.

Processing footballbox 1 for 1984.
Stage: First leg
Match: England vs Denmark (2–1)

Processing footballbox 2 for 1984.
Stage: First leg
Match: Italy vs Sweden (2–3)

Processing footballbox 3 for 1984.
Stage: Second leg
Match: Denmark vs England (0–1)

Processing footballbox 4 for 1984.
Stage: Second leg
Match: Sweden vs Italy (2–1)

Processing footballbox 5 for 1984.
Stage: First leg
Match: Sweden vs England (1–0)

Processing footballbox 6 for 1984.
Stage: Second leg
Match: England vs Sweden (1–0)
Year 1984 data appended. Total matches so far: 16

Attempting to scrape https://en.wikipedia.org/wiki/UEFA_Women%27s_Euro_1987
Found 4 'footballbox' divs for year 1987.

Processing footballbox 1 for 1987.
Match: Norway vs Italy (2–0)

Processing footballbox 2 for 1987.
Match: Sweden vs England (3–2 (a.e.t.))

Processing footballbox 3 for 1987.
Match: Italy vs England (2–1)


In [17]:
all_matches_data

Unnamed: 0,stage,year,date,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,scorer_name,scorer_nationality,goal_minute
0,First leg,1984,8 April 1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,"Attendance: 1,000",Republic of Ireland,,Davis,England,31'
1,First leg,1984,8 April 1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,"Attendance: 1,000",Republic of Ireland,,Davis,England,51'
2,First leg,1984,8 April 1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,"Attendance: 1,000",Republic of Ireland,,Hindkjær,Denmark,49' (pen.)
3,First leg,1984,8 April 1984,12:00,Italy,Sweden,2–3,Stadio Flaminio,Rome,"Attendance: 5,000",West Germany,[3],Morace,Italy,18'
4,First leg,1984,8 April 1984,12:00,Italy,Sweden,2–3,Stadio Flaminio,Rome,"Attendance: 5,000",West Germany,[3],Morace,Italy,31'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,Semi-finals,2022,27 July 2022(2022-07-27),20:00,Germany,France,2–1,Stadium MK,Milton Keynes,"Attendance: 27,445[69]",Cheryl Foster,Wales,Popp,Germany,"40',76'"
546,Semi-finals,2022,27 July 2022(2022-07-27),20:00,Germany,France,2–1,Stadium MK,Milton Keynes,"Attendance: 27,445[69]",Cheryl Foster,Wales,Frohms,France,44' (o.g.)
547,Final,2022,31 July 2022(2022-07-31),17:00,England,Germany,2–1(a.e.t.),Wembley Stadium,London,"Attendance: 87,192[70]",Kateryna Monzul,Ukraine,Toone,England,62'
548,Final,2022,31 July 2022(2022-07-31),17:00,England,Germany,2–1(a.e.t.),Wembley Stadium,London,"Attendance: 87,192[70]",Kateryna Monzul,Ukraine,Toone,England,110'


In [18]:
# Clean the data if a clean_data function is available
all_matches_df_cleaned = clean_data(all_matches_df)

# Define the file path for saving the cleaned data specifically for the Men's World Cup
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_women.xlsx'

# Export the cleaned data to an Excel file
all_matches_df_cleaned.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")


Data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\in\eu_goals_women.xlsx
