# Libraries

In [28]:
import pandas as pd
import numpy as np
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [29]:
# Get the current user's name (also interpolated into every data path further down)
user = getuser()

# Construct the path using the user's name
# NOTE(review): absolute, Windows-only location under C:/Users/<user>; the notebook
# only runs on machines whose checkout sits at exactly this path.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_uefa.ipynb"

# Run the notebook
# Helpers called later (before_last, uefa_before_last, uefa_final_euro, ...) are not
# defined anywhere in this notebook; presumably they are brought into the namespace
# by this %run — confirm against functions_uefa.ipynb.
%run $function_path


In [30]:


# Location of the goals workbook inside the current user's checkout
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Read the workbook and keep only rows with year >= 1984 (1984 itself is included)
df = pd.read_excel(data_path).loc[lambda frame: frame['year'] >= 1984]

# Preview the first rows as a quick check that the import worked
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
93,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
94,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
95,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
96,Group 1,1984,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,4,0,5,0,0,0,0,0,1984-06-16,16 June 1984
97,Group 1,1984,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,74,0,5,0,0,1,0,0,1984-06-16,16 June 1984


In [31]:
# Load the Elo workbook into elo_eu.
# Note: this rebinds data_path, which previously pointed at the goals workbook.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_eu.xlsx'
elo_eu = pd.read_excel(data_path)

In [32]:
# Team-name substitutions (old label -> new label); applied to the Elo table's
# team_name column in the following cell.
replacements = dict([
    ("Republic of Ireland", "Ireland"),
    ("CIS", "Commonwealth of Independent States"),
    ("FR Yugoslavia", "Yugoslavia"),
])


In [33]:
# Get the Elo table ready for merging: use the column names team_name / elo and
# apply the team-name substitutions from `replacements`.
elo_eu = (
    elo_eu
    .rename(columns={"team": "team_name", "elo_rating": "elo"})
    .assign(team_name=lambda frame: frame["team_name"].replace(replacements))
)

# Clean, transform, create variables

## time

In [34]:
# Derive three helper columns from the raw `time` strings.
#
# The columns are attached with DataFrame.assign, which builds a new frame,
# instead of writing into `df` column by column: `df` was produced earlier by
# boolean filtering, and assigning columns into such a slice can trigger
# pandas' SettingWithCopyWarning. Re-running the cell simply recomputes the
# same three columns.
df = df.assign(
    # Step 1: drop any letters (e.g. EEST, PST) and surrounding whitespace
    time_cleaned=lambda frame: frame['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip(),
    # Step 2a: first HH:MM pattern in the cleaned string (NaN when there is none);
    # expand=False returns a Series rather than a one-column DataFrame
    local_time=lambda frame: frame['time_cleaned'].str.extract(r'(\d{2}:\d{2})', expand=False),
    # Step 2b: whatever is left inside the first pair of parentheses (NaN when there is none)
    utc_time_offset=lambda frame: frame['time_cleaned'].str.extract(r'\((.*?)\)', expand=False),
)


# Extract relevant columns

In [35]:
# Keep group-stage rows only: drop every row whose stage is a knockout round
df = df.loc[~df['stage'].isin(['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final'])]

# Goal-event columns -> exact duplicate rows removed -> ordered by date,
# local kick-off time and goal minute (all ascending)
goals_df = (
    df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'short_date','local_time', 'score']]
    .drop_duplicates()
    .sort_values(by=['short_date', 'local_time', 'goal_minute'], ascending=[True, True, True])
)

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
93,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
94,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
95,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0
117,1984,Group 2,Romania,Spain,Spain,22,1984-06-14,20:30,1–1
116,1984,Group 2,Romania,Spain,Romania,35,1984-06-14,20:30,1–1


# team counts

In [36]:

# For every (year, stage) collect the distinct team names that appear as either
# home_team or away_team, then turn the resulting Series into a frame whose
# value column (labelled 0 after reset_index) is renamed to team_list.
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda grp: pd.concat([grp['home_team'], grp['away_team']]).unique())
    .reset_index()
    .rename(columns={0: 'team_list'})
)

# Number of distinct teams in each group
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Sanity check: list every group that does not contain exactly 4 teams
invalid_groups = team_counts.loc[team_counts['team_count'] != 4]
display(invalid_groups)



Unnamed: 0,year,stage,team_list,team_count


# Recreate League Table after first two matchdays

In [37]:

# Pass the sorted goal events to before_last(), which returns two objects.
# NOTE(review): before_last is not defined in this notebook; presumably it is
# loaded by the %run cell at the top. Judging by the variable names, the first
# result aggregates the matches before the last matchday and the second holds
# the last-matchday goals in sorted order — confirm against the helper.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [38]:
# Derive separate home-side and away-side tables from the pre-last-matchday aggregate.
# The spot checks in the next cells show per-team goals_scored / goals_conceded,
# points_home / points_away and match counts.
# NOTE(review): aggregate_home_away_points is defined outside this notebook.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [39]:
# Spot check: home-side rows for year 2016, stage 'Group F'
home2016f = home_games.loc[
    (home_games['stage'] == 'Group F') & (home_games['year'] == 2016)
]
home2016f

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
97,2016,Group F,Austria,0,2,0,1
98,2016,Group F,Iceland,1,1,1,1
99,2016,Group F,Portugal,1,1,1,1


In [40]:
# Spot check: away-side rows for year 2016, stage 'Group F'
away2016f = away_games.loc[
    (away_games['stage'] == 'Group F') & (away_games['year'] == 2016)
]
away2016f

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
96,2016,Group F,Hungary,3,1,4,2
97,2016,Group F,Iceland,1,1,1,1


## aggregate data after first two matches 

In [41]:
# Build the standings table from the home/away tables, the pre-last-matchday
# aggregate and the per-group team lists by calling uefa_before_last().
# The sample displayed in the next cell has one row per team with points,
# goals_difference, standing, tiebreaker and tie_won columns.
# NOTE(review): uefa_before_last is defined outside this notebook.
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

In [42]:
# Spot check: standings rows for year 2016, stage 'Group F'
group2016f = all_games_before_last.loc[
    (all_games_before_last['stage'] == 'Group F')
    & (all_games_before_last['year'] == 2016)
]

display(group2016f)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won
124,2016,Group F,Hungary,3,1,4,2,2,1,no need,0
125,2016,Group F,Iceland,2,2,2,0,2,2,Iceland,1
126,2016,Group F,Portugal,1,1,2,0,2,3,Iceland,0
127,2016,Group F,Austria,0,2,1,-2,2,4,no need,0


# Recreate league table after last match day

### uefa criteria 

In [43]:
# Every distinct (year, stage) combination present in the standings table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run uefa_final_euro once per group and collect what it returns
all_results = []
for year, stage in zip(unique_pairs['year'], unique_pairs['stage']):
    all_results.append(
        uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    )

# Stack the per-group results and keep only the columns of interest
changes_df_euro = pd.concat(all_results)[
    ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
]



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
    France             5                   6                     0                      6                        4                          1
   Denmark             3                   5                     1                      4                        2                          2
   Belgium             3                   2                     5                     -3                        2                          3
Yugoslavia             1                   0                     7                     -7                        0                          4


Analyzing goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Teams with Identical Points (Tied Teams) ===

Empty DataFrame
Columns: [team, total_points]
Index: []

=== Updated Standings After

In [44]:
# Export changes_df_euro to an Excel workbook; index=False leaves the row index
# out of the sheet.
# NOTE(review): absolute, user-specific Windows path.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\tb_eu_uefa_men.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [45]:
# Show the table returned by uefa_before_last (bare last expression -> rich display)
all_games_before_last

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won
0,1984,Group 1,France,6,0,4,6,2,1,no need,0
1,1984,Group 1,Denmark,5,1,2,4,2,2,Denmark,1
2,1984,Group 1,Belgium,2,5,2,-3,2,3,Denmark,0
3,1984,Group 1,Yugoslavia,0,7,0,-7,2,4,no need,0
4,1984,Group 2,West Germany,2,1,3,1,2,1,no need,0
...,...,...,...,...,...,...,...,...,...,...,...
171,2024,Group E,Slovakia,2,2,3,0,2,4,Ukraine,0
172,2024,Group F,Portugal,5,1,6,4,2,1,no need,0
173,2024,Group F,Turkey,3,4,3,-1,2,2,no need,0
174,2024,Group F,Czech Republic,2,3,1,-1,2,3,Czech Republic,1


In [46]:
# Every distinct (year, stage) combination present in the standings table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run track_composition_changes once per group and collect what it returns
all_composition_changes = []
for year, stage in zip(unique_pairs['year'], unique_pairs['stage']):
    composition_changes_df = track_composition_changes(
        year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day
    )
    all_composition_changes.append(composition_changes_df)

# Stack the per-group results into one frame with a fresh 0..n-1 index
# NOTE(review): the following cell rebinds final_composition_changes_df with the
# gap_composition results before anything is exported, so the frame built here
# is never written out — confirm whether it should get its own name / export.
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)




=== Initial Standings for Group 1, 1984 (Goal Time = 0) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France             5                   6                     0                      6                          1
   Denmark             3                   5                     1                      4                          2
   Belgium             3                   2                     5                     -3                          3
Yugoslavia             1                   0                     7                     -7                          4



=== Standings after goal at minute 26 in Group 1, edition 1984 ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France             5                   6                     0                      6                          1
   Belgium             4                   3      

In [47]:
# Every distinct (year, stage) combination present in the standings table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run gap_composition once per group and collect what it returns
all_composition_changes = []
for year, stage in zip(unique_pairs['year'], unique_pairs['stage']):
    composition_changes_df = gap_composition(
        year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day
    )
    all_composition_changes.append(composition_changes_df)

# Stack the per-group results into one frame with a fresh 0..n-1 index
# NOTE(review): this rebinds final_composition_changes_df, replacing the frame
# the previous cell built from track_composition_changes; only this version is
# exported below — confirm that is intended.
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)




=== Initial Standings for Group 1, 1984 (Goal Time = 0) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France             5                   6                     0                      6                          1
   Denmark             3                   5                     1                      4                          2
   Belgium             3                   2                     5                     -3                          3
Yugoslavia             1                   0                     7                     -7                          4



=== Standings after goal at minute 26 in Group 1, edition 1984 ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
    France             5                   6                     0                      6                          1
   Belgium             4                   3      

In [48]:
# Export final_composition_changes_df to an Excel workbook; index=False leaves
# the row index out of the sheet.
# NOTE(review): absolute, user-specific Windows path.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\standings_eu_uefa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [49]:
# Call best_four_third_placed_eu_men with the last-matchday goals and the
# standings table. The printed log below shows it processing the years 2016,
# 2021 and 2024.
# NOTE(review): the helper is defined outside this notebook.
final_df = best_four_third_placed_eu_men(goals_last_day_sorted, all_games_before_last)

# Define the file path and save to Excel (row index omitted via index=False)
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\third_teams_eu_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 2016 ---

--- Processing Year: 2021 ---

--- Processing Year: 2024 ---


# suspense

## active suspense

In [50]:
# Compute the active-suspense table, print it and export it to Excel.
# NOTE(review): active_suspense is defined outside this notebook.
try:
    active_suspense_results = active_suspense(all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)

    # Show the results
    print(active_suspense_results)

    # Save the results to an Excel workbook (row index omitted via index=False)
    file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_results_eu.xlsx'
    active_suspense_results.to_excel(file_path, index=False)

except Exception as e:
    # Keep the short message, but re-raise instead of swallowing the error:
    # the next cell needs active_suspense_results, and continuing after a
    # failure would either hit a NameError there or silently reuse a stale
    # result left over from an earlier run of this cell.
    print(f"An error occurred: {e}")
    raise


               team    stage  year  goal_minute  active_suspense_count  \
0           Denmark  Group 1  1984           26                      0   
1           Denmark  Group 1  1984           32                      0   
2           Denmark  Group 1  1984           39                      0   
3           Denmark  Group 1  1984           41                      0   
4           Denmark  Group 1  1984           59                      0   
..              ...      ...   ...          ...                    ...   
224  Czech Republic  Group F  2024            2                      0   
225  Czech Republic  Group F  2024           51                      0   
226  Czech Republic  Group F  2024           57                      0   
227  Czech Republic  Group F  2024           66                      0   
228  Czech Republic  Group F  2024           94                      0   

                                                reason  
0    Condition for moving to 2nd not met (losing vs...

In [51]:
# For each (year, team, stage), count the rows whose active_suspense_count is
# positive; the count is stored in the aggregate_active_suspense column.
summary = (
    active_suspense_results
    .loc[lambda frame: frame['active_suspense_count'] > 0]
    .groupby(['year', 'team', 'stage'])
    .size()
    .reset_index(name='aggregate_active_suspense')
)


In [52]:
    # Save the per-team summary to an Excel workbook (.xlsx); index=False leaves
    # the row index out of the sheet.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_eu.xlsx'
summary.to_excel(file_path, index=False)

# probabilities

## match probabilities

In [53]:
# Combine the goal events with the Elo table via integrate_elo_probabilities().
# The sample displayed below carries the extra columns elo_home, elo_away,
# P_home_win, P_draw and P_away_win.
# NOTE(review): the helper is defined outside this notebook; how the
# probabilities are derived from the ratings cannot be seen from here.
goals_df_with_probs = integrate_elo_probabilities(goals_df, elo_eu)

# Display a sample of the data with probabilities
display(goals_df_with_probs.head())

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score,elo_home,elo_away,P_home_win,P_draw,P_away_win
0,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0,1960.0,1809.0,0.626576,0.110716,0.262708
1,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0,1898.0,1890.0,0.427685,0.16388,0.408436
2,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0,1898.0,1890.0,0.427685,0.16388,0.408436
3,1984,Group 2,Romania,Spain,Spain,22,1984-06-14,20:30,1–1,1824.0,1868.0,0.370981,0.151104,0.477916
4,1984,Group 2,Romania,Spain,Romania,35,1984-06-14,20:30,1–1,1824.0,1868.0,0.370981,0.151104,0.477916


In [54]:
# Pass the frame through update_probabilities_for_following_matches().
# In the sample displayed below the columns are unchanged but the P_home_win /
# P_draw / P_away_win values differ from the previous cell's sample.
# NOTE(review): the helper is defined outside this notebook; the update rule
# itself is not visible here.
goals_df_with_updated_probs = update_probabilities_for_following_matches(goals_df_with_probs)

# Display a sample of the updated data
display(goals_df_with_updated_probs.head())

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score,elo_home,elo_away,P_home_win,P_draw,P_away_win
0,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0,1960.0,1809.0,0.643567,0.105657,0.250777
1,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0,1898.0,1890.0,0.45495,0.156967,0.388084
2,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0,1898.0,1890.0,0.481117,0.150282,0.368601
3,1984,Group 2,Romania,Spain,Spain,22,1984-06-14,20:30,1–1,1824.0,1868.0,0.373215,0.151873,0.474911
4,1984,Group 2,Romania,Spain,Romania,35,1984-06-14,20:30,1–1,1824.0,1868.0,0.37534,0.152604,0.472056
