# Libraries

In [56]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [57]:
# Get the current user's name; it is interpolated into the per-user file paths built in later cells
user = getuser()

# Import the functions from the functions notebook.
# NOTE(review): helpers called further down (e.g. process_goals_data, uefa_before_last)
# are not defined in this notebook, so they presumably come from this %run — confirm.
%run functions_uefa.ipynb


In [58]:


# Location of the input Excel workbook inside the current user's repository checkout
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals.xlsx'

# Read the workbook into a DataFrame
df = pd.read_excel(io=data_path)

# Preview the top rows as a quick sanity check on the import
df_preview = df.head()
display(df_preview)

Unnamed: 0,stage,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group 1,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,West Germany,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
1,Group 1,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,Sweden,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
2,Group 1,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,Sweden,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
3,Group 1,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,Scotland,...,4,0,5,0,0,0,0,0,1984-06-16,16 June 1984
4,Group 1,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,Scotland,...,74,0,5,0,0,1,0,0,1984-06-16,16 June 1984


# Clean, transform, create variables

## time

In [59]:
# Step 1: drop every run of letters (e.g. EEST, PST) from the raw time text, then trim whitespace
letters_removed = df['time'].str.replace(r'[A-Za-z]+', '', regex=True)
df['time_cleaned'] = letters_removed.str.strip()

# Step 2: from the cleaned text, take the first two-digit HH:MM token as the local time
# and, separately, whatever is enclosed in parentheses as the UTC offset
cleaned_time = df['time_cleaned']
df['local_time'] = cleaned_time.str.extract(r'(\d{2}:\d{2})', expand=False)
df['utc_time_offset'] = cleaned_time.str.extract(r'\((.*?)\)', expand=False)


## date

In [60]:
# Parse the textual dates (day, full month name, four-digit year);
# values that do not match the format are coerced to NaT instead of raising
parsed_dates = pd.to_datetime(df['long_date'], format='%d %B %Y', errors='coerce')
df['long_date'] = parsed_dates

# Store the year component of the parsed date in its own column
df['year'] = parsed_dates.dt.year


## gender

In [61]:
# Years treated as Men's UEFA European Championship editions
men_years = [
    1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992,
    1996, 2000, 2004, 2008, 2012, 2016, 2021, 2024,
]

# Flag column 'men': 1 when the row's year appears in men_years, otherwise 0
df['men'] = df['year'].isin(men_years).astype('int64')


# Extract relevant columns

In [62]:
# Drop every row whose stage is one of the listed knockout rounds
knockout_stages = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
is_knockout = df['stage'].isin(knockout_stages)
df = df[~is_knockout]

# Columns describing each goal event together with its match result
goal_columns = [
    'men', 'year', 'stage', 'home_team', 'away_team',
    'scorer_nationality', 'goal_minute', 'short_date', 'local_time', 'score',
]
goals_df = df[goal_columns]

display(goals_df.head())


Unnamed: 0,men,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
1,1,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
2,1,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0
3,1,1984,Group 1,France,Belgium,France,4,1984-06-16,17:15,5–0
4,1,1984,Group 1,France,Belgium,France,74,1984-06-16,17:15,5–0


# Recreate League Table after first two matchdays

In [63]:

# Run the goal-processing helper; its two results are unpacked below.
# NOTE(review): the filtered, full-width `df` is passed here, not the `goals_df`
# column subset built in the previous cell — confirm that this is intended.
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(df)


In [64]:
# Derive separate home and away tables from the aggregates before the last matchday.
# NOTE(review): judging by the outputs shown below these are per-team points/goal totals — confirm in the functions notebook.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [65]:
# Spot check: home-side rows for year 2021, stage 'Group F'
home_is_2021 = home_games['year'].eq(2021)
home_is_group_f = home_games['stage'].eq('Group F')
home_f_2021 = home_games[home_is_2021 & home_is_group_f]
home_f_2021

Unnamed: 0,year,stage,home_team,men,goals_scored,goals_conceded,points_home,match_count_home
115,2021,Group F,France,1,2,0,3,1
116,2021,Group F,Hungary,1,2,8,1,2
117,2021,Group F,Portugal,1,4,8,0,1


In [66]:
# Spot check: away-side rows for year 2021, stage 'Group F'
away_is_2021 = away_games['year'].eq(2021)
away_is_group_f = away_games['stage'].eq('Group F')
away_f_2021 = away_games[away_is_2021 & away_is_group_f]
away_f_2021

Unnamed: 0,year,stage,away_team,men,goals_scored,goals_conceded,points_away,match_count_away
111,2021,Group F,France,1,2,2,1,1
112,2021,Group F,Germany,1,8,6,3,2
113,2021,Group F,Portugal,1,6,0,3,1


## aggregate data after first two matches following 

### UEFA criteria (first h2h, then aggregate)

In [67]:
# Apply uefa_before_last to the home/away tables and the aggregates before the last matchday.
# The call prints a log of each tie and how it was resolved (see the cell output below).
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

Tie in 1984, Stage: Group 1, Men: 1, between Belgium and Denmark (Points: 2.0)
Tie resolved by head-to-head: Denmark won in 1984, Stage: Group 1
Tie in 1984, Stage: Group 2, Men: 1, between Portugal and Spain (Points: 2.0)
Tie resolved by goals scored: Spain favored in 1984, Stage: Group 2
Tie in 1988, Stage: Group 1, Men: 1, between Italy and West Germany (Points: 3.0)
Tie resolved by goals difference: West Germany favored in 1988, Stage: Group 1
Tie in 1988, Stage: Group 2, Men: 1, between Republic of Ireland and Soviet Union (Points: 3.0)
Tie resolved by goals scored: Soviet Union favored in 1988, Stage: Group 2
Tie in 1992, Stage: Group 2, Men: 1, between Germany and Netherlands (Points: 3.0)
Tie resolved by goals difference: Germany favored in 1992, Stage: Group 2
Tie in 1996, Stage: Group A, Men: 1, between England and Netherlands (Points: 4.0)
Tie resolved by head-to-head: Netherlands won in 1996, Stage: Group A
Tie in 1996, Stage: Group A, Men: 1, between Scotland and Switzerla

  all_games_before_last['standing'] = all_games_before_last.groupby(['year', 'stage', 'men']).apply(


# Recreate league table after last match day

### uefa criteria 

In [68]:
# Accumulator for the per-group result frames
all_results = []

# Every distinct (year, stage, men) combination present in the standings table
unique_pairs = all_games_before_last[['year', 'stage', 'men']].drop_duplicates()

# Run the final-matchday analysis once per combination
for year, stage, men in unique_pairs.itertuples(index=False, name=None):
    result = uefa_final_euro(
        year, stage, men,
        all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day,
    )
    all_results.append(result)

# Stack the per-group frames into one table
changes_df_euro = pd.concat(all_results)

# Restrict the table to the columns that are exported
output_columns = ['men', 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_euro = changes_df_euro.loc[:, output_columns]



=== Initial Standings for Year 1984, Group 1, Men: 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
    France             5                   6                     0                      6                        4                          1
   Belgium             3                   2                     5                     -3                        2                          2
   Denmark             3                   5                     1                      4                        2                          3
Yugoslavia             1                   0                     7                     -7                        0                          4


Analyzing goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Updated Standings After This Goal ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_diff

In [69]:
# Write the changes table to the output folder as an Excel file, without the row index
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\tb_eu_uefa.xlsx'
changes_df_euro.to_excel(excel_writer=file_path, index=False)


# group composition tracking

In [70]:
# Accumulator for the per-group composition-change frames
all_composition_changes = []

# Every distinct (year, stage, men) combination present in the standings table
unique_pairs = all_games_before_last[['year', 'stage', 'men']].drop_duplicates()

# Track composition changes once per combination
for year, stage, men in unique_pairs.itertuples(index=False, name=None):
    composition_changes_df = track_composition_changes(
        year, stage, men,
        all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day,
    )
    all_composition_changes.append(composition_changes_df)

# Stack the per-group frames into one table with a fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Show the combined table
display(final_composition_changes_df)




=== Initial Standings for Group 1, 1984 (with 0-0 points added) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France             5                   6                     0                      6                          1
   Belgium             3                   2                     5                     -3                          2
   Denmark             3                   5                     1                      4                          3
Yugoslavia             1                   0                     7                     -7                          4



=== Initial Standings for Group 2, 1984 (with 0-0 points added) ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
West Germany             4                   2                     1                      1                          1
    Portugal             3           

Unnamed: 0,year,stage,men,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,1st,2nd,3rd
0,1984,Group 1,1,0,initial,,,,"[Belgium, France, Denmark]",France,Denmark,Belgium
1,1984,Group 1,1,1,32,France,Yugoslavia,Yugoslavia,"[Belgium, France, Yugoslavia]",Belgium,France,
2,1984,Group 1,1,2,39,Denmark,Belgium,Belgium,"[Belgium, France, Yugoslavia]",Belgium,France,
3,1984,Group 1,1,3,41,Denmark,Belgium,Denmark,"[Belgium, France, Yugoslavia]",Belgium,France,
4,1984,Group 1,1,4,59,France,Yugoslavia,France,"[Belgium, France, Denmark]",Belgium,France,
...,...,...,...,...,...,...,...,...,...,...,...,...
141,2024,Group D,1,7,80,Netherlands,Austria,Austria,"[France, Austria]",France,,
142,2024,Group E,1,0,initial,,,,"[Belgium, Romania]",Belgium,Romania,Ukraine
143,2024,Group E,1,1,24,Slovakia,Romania,Slovakia,"[Belgium, Ukraine]",Belgium,,
144,2024,Group E,1,2,37,Slovakia,Romania,Romania,"[Belgium, Romania]",Belgium,,


In [71]:
# Write the composition-change table to the output folder as an Excel file, without the row index
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\standings_eu_uefa.xlsx'
final_composition_changes_df.to_excel(excel_writer=file_path, index=False)


# best four third placed

In [72]:
# Build the third-placed-teams table with the helper loaded from the functions notebook
final_df = best_four_third_placed(
    goals_last_day_sorted,
    all_games_before_last,
    agg_goals_before_last_day,
)

# Write the table to the output folder as an Excel file, without the row index
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\third_teams_eu.xlsx'
final_df.to_excel(excel_writer=file_path, index=False)



--- Processing Year: 2016 --- Men

=== Initial Standings for Year 2016, Men Before Processing Any Goals ===

               team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
             France Group A                        6                              4                                1                      3
        Switzerland Group A                        4                              2                                1                      1
            Romania Group A                        1                              2                                3                     -1
            Albania Group A                        0                              0                                3                     -3
            England Group B                        4                              3                                2                      1
           Slovakia Group B                       

In [73]:
# Sanity check: how many rows goals_last_day_sorted holds for selected years.
# The original second message reported "year 2020" although the rows are filtered
# on year == 2021; each message now prints the year that is actually filtered on.
years_to_check = (2016, 2021, 2024)

# Row count per checked year (0 when a year has no rows)
rows_per_year = {
    checked_year: goals_last_day_sorted[goals_last_day_sorted['year'] == checked_year].shape[0]
    for checked_year in years_to_check
}

for checked_year, row_count in rows_per_year.items():
    print(f"Number of observations in goals_last_day_sorted for the year {checked_year}: {row_count}")

# Keep the original variable names available for any later cell that reads them
count_2016 = rows_per_year[2016]
count_2020 = rows_per_year[2021]  # legacy name; holds the count for year == 2021
count_2024 = rows_per_year[2024]


Number of observations in goals_last_day_sorted for the year 2016: 22
Number of observations in goals_last_day_sorted for the year 2020: 39
Number of observations in goals_last_day_sorted for the year 2024: 24
