# Libraries

In [1]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [2]:
# Get the current user's name
user = getuser()

# Construct the path using the user's name
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_uefa.ipynb"

# Run the notebook
%run $function_path


In [3]:


# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(data_path)

# Now, filter the data for tournaments after 1984
df = df[df['year'] > 1984]

# Display the first few rows of the DataFrame to verify the import
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
134,Group 1,1988,20:15,West Germany,Italy,1–1,Rheinstadion,Düsseldorf,62552,Keith Hackett,...,55,0,1,1,0,0,0,0,1988-06-10,10 June 1988
135,Group 1,1988,20:15,West Germany,Italy,1–1,Rheinstadion,Düsseldorf,62552,Keith Hackett,...,52,0,1,1,0,0,0,0,1988-06-10,10 June 1988
136,Group 1,1988,15:30,Denmark,Spain,2–3,Niedersachsenstadion,Hanover,55707,Bep Thomas,...,24,0,2,3,0,0,0,0,1988-06-11,11 June 1988
137,Group 1,1988,15:30,Denmark,Spain,2–3,Niedersachsenstadion,Hanover,55707,Bep Thomas,...,82,0,2,3,0,0,0,0,1988-06-11,11 June 1988
138,Group 1,1988,15:30,Denmark,Spain,2–3,Niedersachsenstadion,Hanover,55707,Bep Thomas,...,5,0,2,3,0,0,0,0,1988-06-11,11 June 1988


# Clean, transfrom, create variables

## time

In [4]:
# Step 1: Clean time variable by removing any letters (e.g., EEST, PST) if they exist
df['time_cleaned'] = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()

# Step 2: Split time where there are parentheses into local time and UTC offset
df['local_time'] = df['time_cleaned'].str.extract(r'(\d{2}:\d{2})')  # extract the part before parentheses
df['utc_time_offset'] = df['time_cleaned'].str.extract(r'\((.*?)\)')  # extract the part inside parentheses


# Extract relevant columns

In [5]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'short_date','local_time', 'score']]

# Remove duplicates
goals_df = goals_df.drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
134,1988,Group 1,West Germany,Italy,West Germany,55,1988-06-10,20:15,1–1
135,1988,Group 1,West Germany,Italy,Italy,52,1988-06-10,20:15,1–1
136,1988,Group 1,Denmark,Spain,Denmark,24,1988-06-11,15:30,2–3
137,1988,Group 1,Denmark,Spain,Denmark,82,1988-06-11,15:30,2–3
138,1988,Group 1,Denmark,Spain,Spain,5,1988-06-11,15:30,2–3


# team counts

In [6]:

# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)



  .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())


Unnamed: 0,year,stage,team_list,team_count


# Recreate Leauge Table after first two matchdays

In [7]:

# Assuming goals_df is your original dataset
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [8]:
agg_goals_before_last_day

Unnamed: 0,year,stage,home_team,away_team,local_time,short_date,goals_home,goals_away,original_score,calculated_score,score_match,won
0,1988,Group 1,Denmark,Spain,15:30,2024-11-26,2,3,2–3,2-3,True,-1
1,1988,Group 1,Italy,Spain,20:15,2024-11-26,1,0,1–0,1-0,True,1
2,1988,Group 1,West Germany,Denmark,17:15,2024-11-26,2,0,2–0,2-0,True,1
3,1988,Group 1,West Germany,Italy,20:15,2024-11-26,1,1,1–1,1-1,True,0
4,1988,Group 2,England,Netherlands,17:15,2024-11-26,1,3,1–3,1-3,True,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
147,2024,Group E,Slovakia,Ukraine,15:00,2024-11-26,1,2,1–2,1-2,True,-1
148,2024,Group F,Georgia,Czech Republic,15:00,2024-11-26,1,1,1–1,1-1,True,0
149,2024,Group F,Portugal,Czech Republic,21:00,2024-11-26,2,1,2–1,2-1,True,1
150,2024,Group F,Turkey,Georgia,18:00,2024-11-26,3,1,3–1,3-1,True,1


In [9]:
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [10]:
# Filter for year == 2022 and stage == 'Group E'
home1992 = home_games[(home_games['year'] == 1992) & (home_games['stage'] == 'Group 1')]
home1992

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
6,1992,Group 1,Sweden,2,1,3,2


In [11]:
# Filter for year == 2021 and stage == 'Group F'
home2024d = home_games[(home_games['year'] == 2024) & (home_games['stage'] == 'Group D')]
home2024d

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
120,2024,Group D,Austria,0,1,0,1
121,2024,Group D,Poland,2,5,0,2


In [12]:
# Filter for year == 2022 and stage == 'Group E'
away1992 = away_games[(away_games['year'] == 1992) & (away_games['stage'] == 'Group 1')]
away1992

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
6,1992,Group 1,Denmark,0,1,0,1
7,1992,Group 1,France,1,1,1,1


In [13]:
# Filter for year == 2021 and stage == 'Group F'
away2024d = away_games[(away_games['year'] == 2024) & (away_games['stage'] == 'Group D')]
away2024d

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
117,2024,Group D,Austria,3,1,3,1
118,2024,Group D,France,1,0,3,1
119,2024,Group D,Netherlands,2,1,3,1


## aggregate data after first two matches 

In [14]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

In [15]:
# Filtering all_games_before_last for the year 2021 and stage 'Group F'
group2024d = all_games_before_last[
    (all_games_before_last['year'] == 2024) & 
    (all_games_before_last['stage'] == 'Group D')
]

display(group2024d)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
156,2024,Group D,Netherlands,2,1,4,1,2,Netherlands,1,1
157,2024,Group D,France,1,0,4,1,2,Netherlands,0,2
158,2024,Group D,Austria,3,2,3,1,2,no need,0,3
159,2024,Group D,Poland,2,5,0,-3,2,no need,0,4


In [16]:
# Filtering all_games_before_last for the year 2022 and stage 'Group E'
group1992 = all_games_before_last[
    (all_games_before_last['year'] == 1992) & 
    (all_games_before_last['stage'] == 'Group 1')
]

display(group1992)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
8,1992,Group 1,Sweden,2,1,3,1,2,no need,0,1
9,1992,Group 1,France,1,1,2,0,2,France,1,2
10,1992,Group 1,England,0,0,2,0,2,France,0,3
11,1992,Group 1,Denmark,0,1,1,-1,2,no need,0,4


# Recreate league table after last match day

### uefa criteria 

In [17]:
# Initialize an empty list to store the results for each pair
all_results = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[[ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1988, Group 1 Before Last Match Goals ===

        team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
West Germany             4                   3                     1                      2                        3                          1
       Italy             4                   2                     1                      1                        3                          2
       Spain             3                   3                     3                      0                        2                          3
     Denmark             1                   2                     5                     -3                        0                          4


Analyzing goal: 29 minute, Player team: West Germany, Home: West Germany, Away: Spain

=== Updated Standings After This Goal ===

        team  total_points  total_goals_scored  total_goals_conceded  tot

In [18]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\tb_eu_uefa_men.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [19]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df = track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Rename 'player_team' column to 'scorer_team'
final_composition_changes_df.rename(columns={'player_team': 'scorer_nationality'}, inplace=True)

# Dropping the "scorer_team" column
final_composition_changes_df.drop(columns=['scorer_team'], inplace=True)




=== Initial Standings for Group 1, 1988 (with 0-0 points added) ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
West Germany             4                   3                     1                      2                          1
       Italy             4                   2                     1                      1                          2
       Spain             3                   3                     3                      0                          3
     Denmark             1                   2                     5                     -3                          4



=== Standings after goal at minute 29 in Group 1, edition 1988 ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing  tied_won
West Germany             6                   4                     1                      3                   1         0
       Italy          

In [20]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\standings_eu_uefa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [21]:
# Call the function to get the DataFrame
final_df = best_four_third_placed_eu_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\third_teams_eu_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 2016 ---

=== Initial Standings for Year 2016 Before Processing Any Goals ===

               team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
             France Group A                        6                              4                                1                      3
        Switzerland Group A                        4                              2                                1                      1
            Romania Group A                        1                              2                                3                     -1
            Albania Group A                        0                              0                                3                     -3
            England Group B                        4                              3                                2                      1
              Wales Group B                        3       

In [22]:
# Count the number of observations where the year is 2016
count_2016 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2016].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2016: {count_2016}")

# Count the number of observations where the year is 2021
count_2020 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2021].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2020: {count_2020}")

# Count the number of observations where the year is 2024
count_2024 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2024].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2024: {count_2024}")


Number of observations in goals_last_day_sorted for the year 2016: 22
Number of observations in goals_last_day_sorted for the year 2020: 38
Number of observations in goals_last_day_sorted for the year 2024: 24


# suspence

## active suspence

In [23]:
# Call the active_suspense function
try:

    # Fix: Ensure we correctly loop through `goals_last_day_sorted`
    # Apply the active_suspense function
    active_suspense_results = active_suspense(all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)

    # Display the results
    print(active_suspense_results)

    # Optionally, save the results to a CSV file
    file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_results_eu.xlsx'
    active_suspense_results.to_excel(file_path, index=False)

except Exception as e:
    print(f"An error occurred: {e}")


               team    stage  year  goal_minute  active_suspense_count  \
0             Spain  Group 1  1988           29                      0   
1             Spain  Group 1  1988           51                      0   
2             Spain  Group 1  1988           67                      0   
3             Spain  Group 1  1988           87                      0   
4       Netherlands  Group 2  1988            3                      1   
..              ...      ...   ...          ...                    ...   
212  Czech Republic  Group F  2024            2                      0   
213  Czech Republic  Group F  2024           51                      0   
214  Czech Republic  Group F  2024           57                      0   
215  Czech Republic  Group F  2024           66                      0   
216  Czech Republic  Group F  2024           94                      0   

                                                reason  
0    Condition for moving to 2nd not met (losing vs...

In [25]:
# Group by year, team, and group_name to count the aggregate active suspense

summary = (

    active_suspense_results[active_suspense_results['active_suspense_count'] > 0]

    .groupby(['year', 'team', 'stage'])

    .size()

    .reset_index(name='aggregate_active_suspense')

)


In [27]:
    # Optionally, save the results to a CSV file
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_eu.xlsx'
summary.to_excel(file_path, index=False)