# Libraries

In [1]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [2]:
# Get the current user's login name; it is interpolated into the
# C:\Users\<user>\... paths used for reading and writing data further down.
user = getuser()

# Execute the companion notebook so that its definitions land in this kernel.
# NOTE(review): the helpers called below (process_goals_data,
# aggregate_home_away_points, uefa_before_last, uefa_final_euro, ...) are not
# defined anywhere in this notebook, so they presumably come from
# functions_uefa.ipynb -- confirm. The path is relative, so the notebook is
# expected to sit in the kernel's working directory.
%run functions_uefa.ipynb


In [3]:


# Location of the raw goal-level workbook inside the current user's checkout
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Read the workbook with pandas' default settings into the working DataFrame
df = pd.read_excel(io=data_path)

# Preview the top rows as a quick sanity check on the import
display(df.head(n=5))

Unnamed: 0,year,stage,score,time,home_team,away_team,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,1984,Group 1,1–0,20:30,France,Denmark,Parc des Princes,Paris,47570,Volker Roth,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
1,1984,Group 1,2–0,20:30,Belgium,Yugoslavia,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
2,1984,Group 1,2–0,20:30,Belgium,Yugoslavia,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
3,1984,Group 1,5–0,17:15,France,Belgium,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,4,0,5,0,0,0,0,0,1984-06-16,16 June 1984
4,1984,Group 1,5–0,17:15,France,Belgium,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,74,0,5,0,0,1,0,0,1984-06-16,16 June 1984


# Clean, transform, create variables

## time

In [4]:
# Step 1: drop every run of letters from the raw kick-off time (e.g. time-zone
# abbreviations such as EEST or PST), then trim the surrounding whitespace
_time_without_letters = df['time'].str.replace(r'[A-Za-z]+', '', regex=True)
df['time_cleaned'] = _time_without_letters.str.strip()

# Step 2: split the cleaned value into its two parts
# - local_time: the first HH:MM pattern found in the string
# - utc_time_offset: whatever sits between the first pair of parentheses
#   (NaN when the value has no parentheses)
df['local_time'] = df['time_cleaned'].str.extract(r'(\d{2}:\d{2})')
df['utc_time_offset'] = df['time_cleaned'].str.extract(r'\((.*?)\)')


## date

In [5]:
# Parse the textual match date (e.g. "12 June 1984") into real timestamps.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
parsed_match_dates = pd.to_datetime(df['long_date'], format='%d %B %Y', errors='coerce')
df['long_date'] = parsed_match_dates

# Re-derive 'year' from the parsed date; this replaces the 'year' values that
# were loaded from the workbook (rows with an unparseable date end up as NaN).
df['year'] = parsed_match_dates.dt.year


# Extract relevant columns

In [6]:
# Keep group-stage rows only: everything whose stage is not a knockout round
knockout_stages = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
group_stage_rows = ~df['stage'].isin(knockout_stages)
df = df[group_stage_rows]

# Columns describing each goal event and the match it belongs to
goal_event_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]

# Select those columns and collapse rows that are identical across all of them.
# NOTE(review): two goals by the same side recorded in the same minute of the
# same match would be merged into one row here -- confirm this cannot happen
# in the source data.
goals_df = df[goal_event_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
1,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
2,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0
3,1984,Group 1,France,Belgium,France,4,1984-06-16,17:15,5–0
4,1984,Group 1,France,Belgium,France,74,1984-06-16,17:15,5–0


# Recreate League Table after first two matchdays

In [7]:

# Hand the de-duplicated group-stage goal events (goals_df, built above) to
# process_goals_data, a helper that is not defined in this notebook
# (presumably loaded via %run functions_uefa.ipynb).
# NOTE(review): judging by the variable names, the first return value holds the
# goals aggregated before the last matchday and the second the last-matchday
# goals in sorted order -- confirm against the helper's implementation.
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [8]:
# Split the pre-last-matchday aggregate into per-team home and away tables.
# The saved outputs of the next two cells show their columns: goals_scored,
# goals_conceded, points_home / points_away and match_count_home / match_count_away
# per (year, stage, team).
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [9]:
# Spot check: home-side aggregates for Group F of the 2021 edition
home_group_f_mask = home_games['year'].eq(2021) & home_games['stage'].eq('Group F')
home_f_2021 = home_games.loc[home_group_f_mask]
home_f_2021

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
115,2021,Group F,France,1,0,3,1
116,2021,Group F,Hungary,1,4,1,2
117,2021,Group F,Portugal,2,4,0,1


In [10]:
# Spot check: away-side aggregates for Group F of the 2021 edition
away_group_f_mask = away_games['year'].eq(2021) & away_games['stage'].eq('Group F')
away_f_2021 = away_games.loc[away_group_f_mask]
away_f_2021

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
111,2021,Group F,France,1,1,1,1
112,2021,Group F,Germany,4,3,3,2
113,2021,Group F,Portugal,3,0,3,1


## aggregate data after first two matches following 

### UEFA criteria (first h2h, then aggregate)

In [11]:
# Build the combined standings before the last matchday with uefa_before_last,
# feeding it the home/away aggregates and the aggregated goals computed above
# (the real tournament data, not a mock). As the saved output shows, the helper
# prints a "Tie in ..." line for every tie it encounters and, where applicable,
# which team the tie was resolved in favor of.
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

Tie in 1984, Stage: Group 1, between Belgium and Denmark (Points: 2.0)
Tie resolved in favor of Denmark
Tie in 1984, Stage: Group 2, between West Germany and Spain (Points: 2.0)
Tie resolved in favor of West Germany
Tie in 1984, Stage: Group 2, between Portugal and Romania (Points: 1.0)
Tie resolved in favor of Portugal
Tie in 1988, Stage: Group 1, between Italy and West Germany (Points: 3.0)
Tie resolved in favor of West Germany
Tie in 1988, Stage: Group 2, between Republic of Ireland and Soviet Union (Points: 3.0)
Tie in 1996, Stage: Group B, between Bulgaria and France (Points: 4.0)
Tie in 1996, Stage: Group C, between Czech Republic and Italy (Points: 3.0)
Tie resolved in favor of Czech Republic
Tie in 2000, Stage: Group A, between Germany and Romania (Points: 1.0)
Tie in 2000, Stage: Group B, between Turkey and Sweden (Points: 0.0)
Tie in 2000, Stage: Group C, between Norway and Spain (Points: 3.0)
Tie resolved in favor of Spain
Tie in 2000, Stage: Group D, between France and Neth

In [12]:
# Spot check: combined pre-last-matchday standings for Group F, 2021 edition
group_f_2021_mask = (
    all_games_before_last['year'].eq(2021)
    & all_games_before_last['stage'].eq('Group F')
)
groupf2021 = all_games_before_last.loc[group_f_2021_mask]

display(groupf2021)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
147,2021,Group F,France,2.0,1.0,4.0,1.0,2.0,no need,0,1
148,2021,Group F,Germany,4.0,3.0,3.0,1.0,2.0,Germany,1,2
149,2021,Group F,Portugal,5.0,4.0,3.0,1.0,2.0,Germany,0,3
150,2021,Group F,Hungary,1.0,4.0,1.0,-3.0,2.0,no need,0,4


# Recreate league table after last match day

### uefa criteria 

In [13]:
# One result frame per group is collected here
all_results = []

# Distinct (year, stage) combinations, i.e. one row per group of each edition
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run the UEFA last-matchday analysis group by group
for _, row in unique_pairs.iterrows():
    year, stage = row['year'], row['stage']
    result = uefa_final_euro(
        year,
        stage,
        all_games_before_last,
        goals_last_day_sorted,
        agg_goals_before_last_day,
    )
    all_results.append(result)

# Stack the per-group frames and keep only the reporting columns
summary_columns = ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_euro = pd.concat(all_results)[summary_columns]



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
    France           5.0                 6.0                   0.0                    6.0                      4.0                          1
   Denmark           3.0                 5.0                   1.0                    4.0                      2.0                          2
   Belgium           3.0                 2.0                   5.0                   -3.0                      2.0                          3
Yugoslavia           1.0                 0.0                   7.0                   -7.0                      0.0                          4


Analyzing goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Updated Standings After This Goal ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  

In [14]:
# Export the per-group position-change summary (changes_df_euro) to Excel,
# without writing the DataFrame index as a column.
# NOTE(review): assumes the ...\data\out\wiki directory already exists for the
# current user -- confirm.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\tb_eu_uefa_men.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [15]:
# One composition-change frame per group is collected here
all_composition_changes = []

# Distinct (year, stage) combinations, i.e. one row per group of each edition
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Track how the set of top teams changes in each group
for _, row in unique_pairs.iterrows():
    year, stage = row['year'], row['stage']
    composition_changes_df = track_composition_changes(
        year,
        stage,
        all_games_before_last,
        goals_last_day_sorted,
        agg_goals_before_last_day,
    )
    all_composition_changes.append(composition_changes_df)

# Stack the per-group frames under a fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Show the combined result
display(final_composition_changes_df)




=== Initial Standings for Group 1, 1984 (with 0-0 points added) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France           5.0                 6.0                   0.0                    6.0                          1
   Denmark           3.0                 5.0                   1.0                    4.0                          2
   Belgium           3.0                 2.0                   5.0                   -3.0                          3
Yugoslavia           1.0                 0.0                   7.0                   -7.0                          4



=== Standings after goal at minute 26 in Group 1, edition 1984 ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing  tied_won
    France           5.0                 6.0                   0.0                    6.0                   1         1
   Belgium           5.0            

Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,1st,2nd,3rd,scorer_nationality
0,1984,Group 1,0,initial,,,,"[Belgium, Denmark, France]",France,Denmark,Belgium,
1,1984,Group 1,1,32,France,Yugoslavia,,"[Yugoslavia, Belgium, France]",Belgium,France,Yugoslavia,Yugoslavia
2,1984,Group 1,2,59,France,Yugoslavia,,"[Belgium, Denmark, France]",France,Belgium,Denmark,France
3,1984,Group 2,0,initial,,,,"[West Germany, Spain, Portugal]",West Germany,Spain,Portugal,
4,1988,Group 1,0,initial,,,,"[West Germany, Italy, Spain]",West Germany,Italy,Spain,
...,...,...,...,...,...,...,...,...,...,...,...,...
82,2024,Group D,5,80,Netherlands,Austria,,"[France, Austria]",Austria,France,Netherlands,Austria
83,2024,Group E,0,initial,,,,"[Romania, Belgium]",Belgium,Romania,Ukraine,
84,2024,Group E,1,24,Slovakia,Romania,,"[Ukraine, Belgium]",Belgium,Ukraine,Romania,Slovakia
85,2024,Group E,2,37,Slovakia,Romania,,"[Romania, Belgium]",Belgium,Romania,Ukraine,Romania


In [16]:
# Export the group-composition tracking table (final_composition_changes_df)
# to Excel, without writing the DataFrame index as a column.
# NOTE(review): assumes the ...\data\out\wiki directory already exists for the
# current user -- confirm.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\standings_eu_uefa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [None]:
# Compute the third-placed-teams table with best_four_third_placed_eu, passing
# the last-matchday goals, the pre-last-matchday standings and the aggregated
# goals. The helper is not defined in this notebook (presumably loaded via
# %run functions_uefa.ipynb); as the saved output shows, it prints per-year
# standings while it runs.
final_df = best_four_third_placed_eu(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Write the result to Excel without the DataFrame index.
# NOTE(review): assumes the ...\data\out\wiki directory already exists -- confirm.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\third_teams_eu_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: (2016,) ---

=== Initial Standings for Year (2016,) Before Processing Any Goals ===

               team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
             France Group A                      6.0                            4.0                              1.0                    3.0
        Switzerland Group A                      4.0                            2.0                              1.0                    1.0
            Romania Group A                      1.0                            2.0                              3.0                   -1.0
            Albania Group A                      0.0                            0.0                              3.0                   -3.0
            England Group B                      4.0                            3.0                              2.0                    1.0
              Wales Group B                      3.0 

In [18]:
# Sanity check: how many last-matchday goal rows exist for selected 'year' values.
#
# The original cell filtered on year == 2021 but stored the result in
# `count_2020` and printed "for the year 2020", so the printed label did not
# match the value actually counted. 'year' is re-derived from the match date
# earlier in this notebook, and the other spot checks also filter on 2021, so
# the label now reports the exact value used in the filter.
years_to_check = (2016, 2021, 2024)

# One pass per year; a boolean mask summed gives the number of matching rows
counts_by_year = {
    check_year: int((goals_last_day_sorted['year'] == check_year).sum())
    for check_year in years_to_check
}

for check_year, row_count in counts_by_year.items():
    print(f"Number of observations in goals_last_day_sorted for the year {check_year}: {row_count}")

# Keep the original variable names available for any later cell that uses them
count_2016 = counts_by_year[2016]
count_2021 = counts_by_year[2021]
count_2020 = count_2021  # legacy name: it has always held the year == 2021 count
count_2024 = counts_by_year[2024]


Number of observations in goals_last_day_sorted for the year 2016: 22
Number of observations in goals_last_day_sorted for the year 2020: 38
Number of observations in goals_last_day_sorted for the year 2024: 24
