# Libraries

In [228]:
import pandas as pd
import numpy as np
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta
import re


# Load and inspect dataset

In [229]:
# Get the current user's name (interpolated into every absolute path used in this notebook)
user = getuser()

# Construct the path using the user's name
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_uefa.ipynb"

# Run the helper notebook so the names it defines are available in this session.
# NOTE(review): presumably this is where convert_time, before_last, uefa_before_last,
# uefa_final_euro, gap_composition, etc. (called below, not defined here) come from — confirm.
%run $function_path


In [230]:
# Path to the goals workbook (one row per goal event, see the preview below)
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(data_path)

# Keep tournaments from 1984 onwards (the bound is inclusive).
# .copy() makes `df` an independent frame: later cells add and overwrite columns on it
# ('local_time', 'utc_time', 'long_date', 'short_date'), and assigning into a filtered
# slice would otherwise risk SettingWithCopyWarning / writes that do not stick.
df = df[df['year'] >= 1984].copy()

# Display the first few rows of the DataFrame to verify the import
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
1,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,0,0,1,0,0,0,0,0,1984-06-12,12 June 1984
2,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
3,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
4,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,0,2,0,0,0,0,0,1984-06-13,13 June 1984


In [231]:
# Load the Elo ratings workbook.
# Note: this rebinds `data_path`, which until now pointed at the goals workbook.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_eu.xlsx'
elo_eu = pd.read_excel(data_path)

In [232]:
# Dictionary of replacements for team names: key = spelling to look for, value = spelling to use instead.
# It is applied to the Elo table's team-name column in the next cell.
# NOTE(review): presumably the target spellings are the ones used in the goals dataset — verify.
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}


In [233]:
# Harmonise the Elo table: clearer column names, then team names mapped through `replacements`
elo_eu = (
    elo_eu
    .rename(columns={"team": "team_name", "elo_rating": "elo"})
    .assign(team_name=lambda ratings: ratings["team_name"].replace(replacements))
)

# Clean, transform, create variables

## time

In [234]:
# Apply the conversion function row by row.
# convert_time is not defined in this notebook (presumably it comes from functions_uefa.ipynb).
# Its result is wrapped in a Series and written to the two new columns below —
# assumes it returns exactly two values in (local, utc) order; TODO confirm.
df[['local_time', 'utc_time']] = df.apply(lambda row: pd.Series(convert_time(row)), axis=1)

## date

In [235]:
# Normalise 'long_date': a comma (plus optional whitespace) directly before a 4-digit year becomes a single space
comma_before_year = r',\s*(\d{4})'
df['long_date'] = df['long_date'].str.replace(comma_before_year, r' \1', regex=True)

# Parse the cleaned long dates (unparseable values become NaT) and render them as YYYY-MM-DD
long_date_as_iso = pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Only rows whose 'short_date' is missing take the value derived from 'long_date'
df['short_date'] = df['short_date'].fillna(long_date_as_iso)


# stage

In [236]:
# For every (year, stage), gather the distinct teams that appear as home or away side
teams_per_stage = df.groupby(['year', 'stage']).apply(
    lambda matches: pd.concat([matches['home_team'], matches['away_team']]).unique()
)

# Flatten to a DataFrame; the unnamed value column (0) holds the array of teams
team_counts = teams_per_stage.reset_index().rename(columns={0: 'team_list'})

# Number of distinct teams in each stage
team_counts['team_count'] = team_counts['team_list'].map(len)

# Stages that do not consist of exactly four teams
invalid_groups = team_counts.loc[team_counts['team_count'].ne(4)]

display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count
0,1984,Final,"[France, Spain]",2
4,1988,Final,"[Soviet Union, Netherlands]",2
8,1992,Final,"[Denmark, Germany]",2
12,1996,Final,"[Czech Republic, Germany]",2
17,1996,Quarter-finals,"[Spain, France, Germany, Czech Republic, Engla...",8
19,2000,Final,"[France, Italy]",2
24,2000,Quarter-finals,"[Portugal, Italy, Netherlands, Spain, Turkey, ...",8
26,2004,Final,"[Portugal, Greece]",2
31,2004,Quarter-finals,"[Portugal, France, Sweden, Czech Republic, Eng...",8
33,2008,Final,"[Germany, Spain]",2


In [237]:
# Stage labels to exclude (knockout rounds in their various spellings, plus 'not applicable')
excluded_stages = [
    'Quarterfinals', 'Quarter-finals', 'Round of 16', 'Semi-finals', 'Semifinals',
    'Final', 'Third place play-off', 'Third place playoff', 'not applicable',
]
is_excluded_stage = df['stage'].isin(excluded_stages)
df = df[~is_excluded_stage]

# Columns describing goal events and match results
goal_event_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]

# De-duplicate, then order chronologically: match date, kick-off time, minute of the goal
goals_df = (
    df[goal_event_columns]
    .drop_duplicates()
    .sort_values(by=['short_date', 'local_time', 'goal_minute'], ascending=True)
)

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
1,1984,Group 1,France,Denmark,,0,1984-06-12,20:30,1–0
0,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
4,1984,Group 1,Belgium,Yugoslavia,,0,1984-06-13,20:30,2–0
2,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
3,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0


# team counts

In [238]:
# Re-run the group-size check on the filtered data; `team_counts` is reused further down
def _distinct_teams(stage_matches):
    """Return the unique team names appearing as home or away side in `stage_matches`."""
    return pd.concat([stage_matches['home_team'], stage_matches['away_team']]).unique()


team_counts = df.groupby(['year', 'stage']).apply(lambda x: _distinct_teams(x))

# Series -> DataFrame; the value column is auto-named 0, so give it a proper name
team_counts = team_counts.reset_index().rename(columns={0: 'team_list'})

# Size of each team list
team_counts['team_count'] = team_counts['team_list'].map(len)

# Any stage that does not have exactly four teams is suspicious
has_four_teams = team_counts['team_count'] == 4
invalid_groups = team_counts[~has_four_teams]

display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count


# Recreate League Table after first two matchdays

In [239]:

# Split the goal events with before_last (not defined in this notebook).
# NOTE(review): judging by the result names, the first value aggregates everything before
# the last matchday and the second holds the last-matchday goals in sorted order — confirm.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [240]:
# Build separate home-side and away-side tables from the pre-last-matchday aggregate.
# The spot checks below show per-team goals, points and match counts for each side.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [241]:
# Spot check: home-side rows for Euro 2016, Group F
is_2016 = home_games['year'].eq(2016)
is_group_f = home_games['stage'].eq('Group F')
home2016f = home_games.loc[is_2016 & is_group_f]
home2016f

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
105,2016,Group F,Austria,0,2,0,1
106,2016,Group F,Iceland,1,1,1,1
107,2016,Group F,Portugal,1,1,2,2


In [242]:
# Spot check: away-side rows for year == 2016 and stage == 'Group F'
away2016f = away_games[(away_games['year'] == 2016) & (away_games['stage'] == 'Group F')]
away2016f

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
105,2016,Group F,Austria,0,0,1,1
106,2016,Group F,Hungary,3,1,4,2
107,2016,Group F,Iceland,1,1,1,1


## aggregate data after first two matches 

In [243]:
# Apply uefa_before_last (not defined in this notebook) to the home/away tables, the
# pre-last-matchday aggregate and the team counts. The result has one row per team with
# its standing and tiebreaker columns (see the Group F preview below).
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

No missing values in the specified columns.
No observations where total_matches == 1.
No observations where total_matches == 0.

=== Applying Tiebreaker ===
Row1: year                      1984
stage                  Group 1
team                   Belgium
home_team              Belgium
goals_scored_home          2.0
goals_conceded_home        0.0
points_home                2.0
match_count_home           1.0
away_team              Belgium
goals_scored_away          0.0
goals_conceded_away        5.0
points_away                0.0
match_count_away           1.0
goals_scored               2.0
goals_conceded             5.0
points                     2.0
total_matches              2.0
matches_flag               2.0
goals_difference          -3.0
tiebreaker             no need
tie_won                      0
Name: 1, dtype: object
Row2: year                      1984
stage                  Group 1
team                   Denmark
home_team              Denmark
goals_scored_home          5.0
go

In [244]:
# Spot check: standings before the last matchday for Euro 2016, Group F
in_2016_group_f = (
    all_games_before_last['year'].eq(2016)
    & all_games_before_last['stage'].eq('Group F')
)
group2016f = all_games_before_last.loc[in_2016_group_f]

display(group2016f)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won,matches_flag
124,2016,Group F,Hungary,3,1,4,2,2,1,no need,0,2.0
125,2016,Group F,Iceland,2,2,2,0,2,2,Iceland,1,2.0
126,2016,Group F,Portugal,1,1,2,0,2,3,Iceland,0,2.0
127,2016,Group F,Austria,0,2,1,-2,2,4,no need,0,2.0


# Recreate league table after last match day

### uefa criteria 

In [245]:
# Collect one result frame per group (year, stage)
all_results = []

# Get the unique (year, stage) pairs
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    # `year` and `stage` are notebook-level names and stay bound after the loop
    year = row['year']
    stage = row['stage']
    
    # Apply uefa_final_euro (not defined in this notebook) to the current group
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame (original indexes are kept)
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[[ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
    France             5                   6                     0                       6                        4                          1
   Denmark             3                   5                     1                       4                        2                          2
   Belgium             3                   2                     5                      -3                        2                          3
Yugoslavia             1                   0                     7                      -7                        0                          4


Analyzing goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Teams with Identical Points (Tied Teams) ===

Empty DataFrame
Columns: [team, total_points]
Index: []

=== Updated Standings 

In [246]:
# Export changes_df_euro to Excel, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\tb_eu_uefa_men.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [263]:
# Collect one composition-change frame per group (year, stage)
all_composition_changes = []

# Get the unique (year, stage) pairs
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    # `year` and `stage` are notebook-level names and stay bound after the loop
    year = row['year']
    stage = row['stage']
    
    # Apply gap_composition (not defined in this notebook) to the current group
    composition_changes_df = gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame with a fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Define the desired column order (columns not listed here are dropped by the selection below)
column_order = [
    'year', 'stage', 'date', 'time', 'change_num', 'goal_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', '1st', '2nd', '3rd', 
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]

# Reorder the columns
final_composition_changes_df = final_composition_changes_df[column_order]

# Identify rows where both 'date' and 'time' are NaN
# NOTE(review): presumably these are the initial-standings rows (goal time 0) — confirm.
mask = final_composition_changes_df['date'].isna() & final_composition_changes_df['time'].isna()

# Use .shift(-1) to get the values from the following row and fill in the NaN rows
# (relies on the next row belonging to the same group and carrying a valid date/time)
final_composition_changes_df.loc[mask, ['date', 'time']] = final_composition_changes_df.loc[mask, ['date', 'time']].fillna(
    final_composition_changes_df[['date', 'time']].shift(-1)
)

# Ensure 'date' is in datetime format and handle errors (currently disabled)
# final_composition_changes_df['date'] = pd.to_datetime(final_composition_changes_df['date'], errors='coerce')

# Ensure 'time' is in proper datetime.time format; values not matching HH:MM are coerced to missing
final_composition_changes_df['time'] = pd.to_datetime(final_composition_changes_df['time'], format='%H:%M', errors='coerce').dt.time

# Drop rows with missing or invalid 'date' or 'time'
final_composition_changes_df = final_composition_changes_df.dropna(subset=['date', 'time'])

# Combine 'date' and 'time' into a single datetime column for proper sorting
final_composition_changes_df['datetime'] = pd.to_datetime(
    final_composition_changes_df['date'].astype(str) + ' ' + final_composition_changes_df['time'].astype(str),
    errors='coerce'
)

# Drop rows with invalid datetime values
final_composition_changes_df = final_composition_changes_df.dropna(subset=['datetime'])

# Sort by year and the combined datetime column, then renumber the index
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'datetime']).reset_index(drop=True)

# Drop the helper datetime column; it was only needed for sorting
final_composition_changes_df = final_composition_changes_df.drop(columns=['datetime'])






=== STEP 1: Initial Standings for Group 1, 1984 (Goal Time = 0) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
    France             5                   6                     0                       6                          1
   Denmark             3                   5                     1                       4                          2
   Belgium             3                   2                     5                      -3                          3
Yugoslavia             1                   0                     7                      -7                          4



=== Tied after goal at minute 26 by Belgium in Group 1, edition 1984 ===
Number of tied teams: 0
Empty DataFrame
Columns: [team, total_points, total_goals_scored, total_goals_conceded, total_goals_difference]
Index: []

=== STEP 3: Standings after goal at minute 26 by Belgium in Group 1, edition 1984 ===
      team  total_points  total_

# manually resolve ties with 3 teams 

## Euro 2012, Group B, minute 19

In [248]:
# Manual correction, Euro 2012 Group B, goal at minute 19: 2nd and 3rd place trade positions.
# Not idempotent: re-running this cell swaps back and bumps change_num again.
in_2012_group_b = (
    final_composition_changes_df['year'].eq(2012)
    & final_composition_changes_df['stage'].eq('Group B')
)
mask = in_2012_group_b & final_composition_changes_df['goal_time'].eq(19)

# Read the two columns in reversed order and write them back, which swaps them
swapped_places = final_composition_changes_df.loc[mask, ['3rd', '2nd']].to_numpy()
final_composition_changes_df.loc[mask, ['2nd', '3rd']] = swapped_places

# Flag the row as a composition change
final_composition_changes_df.loc[mask, 'changed'] = 1

# Every later goal in the same group now has one more change before it
update_mask = in_2012_group_b & final_composition_changes_df['goal_time'].gt(19)
final_composition_changes_df.loc[update_mask, 'change_num'] += 1


## Euro 2012, Group C, minute 35

In [249]:
# Manual correction, Euro 2012 Group C, goal at minute 35.
# Not idempotent: re-running this cell bumps change_num of the later rows again.
in_2012_group_c = (
    final_composition_changes_df['year'].eq(2012)
    & final_composition_changes_df['stage'].eq('Group C')
)
mask = in_2012_group_c & final_composition_changes_df['goal_time'].eq(35)

# Hand-resolved placings for this row
for place, team in {'1st': 'Italy', '2nd': 'Spain', '3rd': 'Croatia'}.items():
    final_composition_changes_df.loc[mask, place] = team

# Spain (4) - Croatia (2) = 2
final_composition_changes_df.loc[mask, 'goals_diff'] = 2

# Flag the row as a composition change and set (not increment) its change counter to 1
final_composition_changes_df.loc[mask, 'changed'] = 1
final_composition_changes_df.loc[mask, 'change_num'] = 1

# 'new_top_teams' must be object dtype so a cell can hold a list
final_composition_changes_df['new_top_teams'] = final_composition_changes_df['new_top_teams'].astype(object)

# Store the list in the matched row; if the mask does not hit exactly one row, None is written instead
initial_top_teams = ['Italy', 'Spain']  # Example list for top teams
top_teams_value = initial_top_teams if mask.sum() == 1 else None
final_composition_changes_df.loc[mask, 'new_top_teams'] = (
    final_composition_changes_df.loc[mask, 'new_top_teams'].map(lambda _: top_teams_value)
)

# Later goals in the same group each get one more change before them
update_mask = in_2012_group_c & final_composition_changes_df['goal_time'].gt(35)
final_composition_changes_df.loc[update_mask, 'change_num'] += 1


## Euro 2021, Group B, minute 74

In [264]:
# Manual correction, Euro 2021 Group B, goal at minute 74.
# Not idempotent: re-running this cell bumps change_num again.
in_2021_group_b = (
    final_composition_changes_df['year'].eq(2021)
    & final_composition_changes_df['stage'].eq('Group B')
)
mask = in_2021_group_b & final_composition_changes_df['goal_time'].eq(74)

# Hand-resolved 2nd and 3rd place for this row
for place, team in {'2nd': 'Denmark', '3rd': 'Russia'}.items():
    final_composition_changes_df.loc[mask, place] = team

# Denmark (-1) - Russia (-3) = 2
final_composition_changes_df.loc[mask, 'goals_diff'] = 2

# Flag the row as a composition change
final_composition_changes_df.loc[mask, 'changed'] = 1

# 'new_top_teams' must be object dtype so a cell can hold a collection
final_composition_changes_df['new_top_teams'] = final_composition_changes_df['new_top_teams'].astype(object)

# Store the set in the matched row; if the mask does not hit exactly one row, None is written instead
initial_top_teams = {'Belgium', 'Denmark', 'Russia'}
top_teams_value = initial_top_teams if mask.sum() == 1 else None
final_composition_changes_df.loc[mask, 'new_top_teams'] = (
    final_composition_changes_df.loc[mask, 'new_top_teams'].map(lambda _: top_teams_value)
)

# This row and every later goal in the group (goal_time >= 74) get one more change
update_mask = in_2021_group_b & final_composition_changes_df['goal_time'].ge(74)
final_composition_changes_df.loc[update_mask, 'change_num'] += 1


## Euro 2021, Group B, minute 79

In [265]:

# Manual correction, Euro 2021 Group B, goal at minute 79: only the change flag and
# counters are touched here, the placings are left as computed.
# Not idempotent: re-running this cell bumps change_num again.
in_2021_group_b = (
    final_composition_changes_df['year'].eq(2021)
    & final_composition_changes_df['stage'].eq('Group B')
)
mask = in_2021_group_b & final_composition_changes_df['goal_time'].eq(79)

# Flag the row as a composition change
final_composition_changes_df.loc[mask, 'changed'] = 1

# This row and every later goal in the group (goal_time >= 79) get one more change
update_mask = in_2021_group_b & final_composition_changes_df['goal_time'].ge(79)
final_composition_changes_df.loc[update_mask, 'change_num'] += 1

## Euro 2021, Group B, minute 81

In [266]:
# Manual correction, Euro 2021 Group B, goal at minute 81.
# Not idempotent: re-running this cell bumps change_num again.
in_2021_group_b = (
    final_composition_changes_df['year'].eq(2021)
    & final_composition_changes_df['stage'].eq('Group B')
)
mask = in_2021_group_b & final_composition_changes_df['goal_time'].eq(81)

# Hand-resolved 2nd and 3rd place for this row
for place, team in {'2nd': 'Denmark', '3rd': 'Russia'}.items():
    final_composition_changes_df.loc[mask, place] = team

# Denmark (0) - Russia (-4) = 4
final_composition_changes_df.loc[mask, 'goals_diff'] = 4

# Flag the row as a composition change
final_composition_changes_df.loc[mask, 'changed'] = 1

# 'new_top_teams' must be object dtype so a cell can hold a collection
final_composition_changes_df['new_top_teams'] = final_composition_changes_df['new_top_teams'].astype(object)

# Store the set in the matched row; if the mask does not hit exactly one row, None is written instead
initial_top_teams = {'Belgium', 'Denmark', 'Russia'}
top_teams_value = initial_top_teams if mask.sum() == 1 else None
final_composition_changes_df.loc[mask, 'new_top_teams'] = (
    final_composition_changes_df.loc[mask, 'new_top_teams'].map(lambda _: top_teams_value)
)

# This row and every later goal in the group (goal_time >= 81) get one more change
update_mask = in_2021_group_b & final_composition_changes_df['goal_time'].ge(81)
final_composition_changes_df.loc[update_mask, 'change_num'] += 1


## Euro 2021, Group B, minute 82

In [267]:
# Manual correction, Euro 2021 Group B, goal at minute 82.
# Only the placings, goal-difference gap and top-team set are overwritten here.
# 'changed' and 'change_num' are deliberately left untouched (their updates were
# disabled in the original cell), so this row does not count as a new composition change.
mask = (final_composition_changes_df['year'] == 2021) & \
       (final_composition_changes_df['stage'] == 'Group B') & \
       (final_composition_changes_df['goal_time'] == 82)

# Hand-resolved 2nd and 3rd place for this row
final_composition_changes_df.loc[mask, '2nd'] = 'Denmark'
final_composition_changes_df.loc[mask, '3rd'] = 'Russia'

# Denmark (0) - Russia (-5) = 5
final_composition_changes_df.loc[mask, 'goals_diff'] = 5

# Ensure the 'new_top_teams' column is of type object so a cell can hold a collection
final_composition_changes_df['new_top_teams'] = final_composition_changes_df['new_top_teams'].astype(object)

# Store the set in the matched row; if the mask does not hit exactly one row, None is written instead
initial_top_teams = {'Belgium', 'Denmark', 'Russia'}
final_composition_changes_df.loc[mask, 'new_top_teams'] = final_composition_changes_df.loc[mask, 'new_top_teams'].map(lambda x: initial_top_teams if mask.sum() == 1 else None)

# The original cell also rebuilt `update_mask` here (with the minute-81 bound) but never
# used it; that dead computation has been removed.


In [268]:
# Export the (manually corrected) composition-change table to Excel, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\goals_eu_uefa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# minute by minute dataframe

In [255]:
# Build a minute-by-minute table: every goal event keeps its own row, and every minute
# without a goal gets a filler row carrying the most recent state forward.
#
# Fix: filler rows used to be generated up to and INCLUDING the next goal's minute, so each
# goal minute after the first appeared twice (once with the stale state, once with the new
# one). Fillers now cover only the minutes strictly between two goals, matching how the
# tail after the last goal was already handled (last goal + 1 .. 90).

# Fields that describe the goal event itself; they are blanked on filler rows
GOAL_EVENT_COLUMNS = ["goal_time", "home_team", "away_team", "scorer_team"]

# Filler rows after the last goal run up to and including this minute
FULL_TIME_MINUTE = 90


def _filler_row(state_row, match_minute):
    """Return a copy of `state_row` for a goal-less minute, with the goal-event fields blanked."""
    filler = state_row.copy()
    filler["match_minute"] = match_minute
    for column in GOAL_EVENT_COLUMNS:
        filler[column] = None
    return filler


def _expand_kickoff_slot(slot_events):
    """Expand the events sharing one (date, time) slot into one row per event plus filler minutes.

    Each event row gets match_minute == goal_time. After each event, filler rows are added
    for the minutes up to (but excluding) the next event's minute; after the last event they
    run through FULL_TIME_MINUTE. Events later than FULL_TIME_MINUTE get no fillers.
    """
    slot_events = slot_events.sort_values(by="goal_time").reset_index(drop=True)
    last_position = len(slot_events) - 1
    rows = []
    for position in range(len(slot_events)):
        event = slot_events.iloc[position].copy()
        event["match_minute"] = event["goal_time"]
        rows.append(event)

        if position == last_position:
            fill_until = FULL_TIME_MINUTE + 1
        else:
            fill_until = slot_events.iloc[position + 1]["goal_time"]

        # Empty range when the next event is in the same or the following minute
        for match_minute in range(event["goal_time"] + 1, fill_until):
            rows.append(_filler_row(event, match_minute))
    return rows


# Expand each kick-off slot (rows sharing date and time) independently
expanded_rows = []
for _, group in final_composition_changes_df.groupby(["date", "time"]):
    expanded_rows.extend(_expand_kickoff_slot(group))

# Create a new DataFrame from the expanded rows, with a clean 0..n-1 index
expanded_df = pd.DataFrame(expanded_rows).reset_index(drop=True)

# Define the desired column order
column_order = [
    'year', 'stage', 'date', 'time', 'match_minute','change_num', 'goal_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', '1st', '2nd', '3rd', 
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]

# Reorder the columns
expanded_df = expanded_df[column_order]


In [256]:
# Export the minute-by-minute table to Excel, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\mbm_eu_uefa.xlsx'
expanded_df.to_excel(file_path, index=False)


# best four third placed

In [257]:
# Call best_four_third_placed_eu_men (not defined in this notebook) to get the DataFrame.
# NOTE(review): going by its name it ranks the third-placed teams across groups — confirm.
final_df = best_four_third_placed_eu_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\third_teams_eu_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 2016 ---

=== Applying Tiebreaker ===
Row1: year                                  2016
stage                              Group B
team                                 Wales
goals_scored                             3
goals_conceded                           3
points                                   4
goals_difference                         0
total_matches                            2
standing                                 2
tiebreaker                           Wales
tie_won                                  1
matches_flag                           2.0
before_last_game_goals_scored            3
before_last_game_goals_conceded          3
before_last_game_points                  4
last_game_goals_scored                   0
last_game_goals_conceded                 0
total_goals_scored                       3
total_goals_conceded                     3
total_goals_difference                   0
last_game_points                         0
total_points                   

# suspense

## active suspense

In [258]:
# Compute the active-suspense table with active_suspense (not defined in this notebook),
# print it, and export it to Excel.
try:
    active_suspense_results = active_suspense(all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)

    # Display the results
    print(active_suspense_results)

    # Save the results to an Excel workbook, without the index column
    file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_results_eu.xlsx'
    active_suspense_results.to_excel(file_path, index=False)

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise instead of swallowing: the next cell reads `active_suspense_results`, so
    # continuing after a failure would either hit a NameError there or silently reuse
    # a stale table from an earlier run.
    raise


               team    stage  year  goal_minute  active_suspense_count  \
0           Denmark  Group 1  1984           26                      0   
1           Denmark  Group 1  1984           32                      0   
2           Denmark  Group 1  1984           39                      0   
3           Denmark  Group 1  1984           41                      0   
4           Denmark  Group 1  1984           59                      0   
..              ...      ...   ...          ...                    ...   
269  Czech Republic  Group F  2024            2                      0   
270  Czech Republic  Group F  2024           51                      0   
271  Czech Republic  Group F  2024           57                      0   
272  Czech Republic  Group F  2024           66                      0   
273  Czech Republic  Group F  2024           94                      0   

                                                reason  
0    Condition for moving to 2nd not met (losing vs...

In [259]:
# Count, per (year, team, stage), how many rows have a positive active_suspense_count
has_active_suspense = active_suspense_results['active_suspense_count'].gt(0)
suspense_rows = active_suspense_results.loc[has_active_suspense]
rows_per_team = suspense_rows.groupby(['year', 'team', 'stage']).size()
summary = rows_per_team.reset_index(name='aggregate_active_suspense')


In [260]:
    # Save the per-team active-suspense summary to an Excel workbook, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_eu.xlsx'
summary.to_excel(file_path, index=False)

# probabilities

## match probabilities

In [261]:
# Attach Elo ratings and match-outcome probabilities to the goal events using
# integrate_elo_probabilities (not defined in this notebook). The preview below shows the
# added columns: elo_home, elo_away, P_home_win, P_draw, P_away_win.
goals_df_with_probs = integrate_elo_probabilities(goals_df, elo_eu)

# Display a sample of the data with probabilities
display(goals_df_with_probs.head())

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score,elo_home,elo_away,P_home_win,P_draw,P_away_win
0,1984,Group 1,France,Denmark,,0,1984-06-12,20:30,1–0,1960.0,1809.0,0.626576,0.110716,0.262708
1,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0,1960.0,1809.0,0.626576,0.110716,0.262708
2,1984,Group 1,Belgium,Yugoslavia,,0,1984-06-13,20:30,2–0,1898.0,1890.0,0.427685,0.16388,0.408436
3,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0,1898.0,1890.0,0.427685,0.16388,0.408436
4,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0,1898.0,1890.0,0.427685,0.16388,0.408436


In [262]:
# Apply update_probabilities_for_following_matches (not defined in this notebook).
# The preview below shows the same columns as before with changed P_home_win / P_draw /
# P_away_win values; how they are recomputed is not visible here.
goals_df_with_updated_probs = update_probabilities_for_following_matches(goals_df_with_probs)

# Display a sample of the updated data
display(goals_df_with_updated_probs.head())

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score,elo_home,elo_away,P_home_win,P_draw,P_away_win
0,1984,Group 1,France,Denmark,,0,1984-06-12,20:30,1–0,1960.0,1809.0,0.643567,0.105657,0.250777
1,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0,1960.0,1809.0,0.659568,0.100773,0.239658
2,1984,Group 1,Belgium,Yugoslavia,,0,1984-06-13,20:30,2–0,1898.0,1890.0,0.45495,0.156967,0.388084
3,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0,1898.0,1890.0,0.481117,0.150282,0.368601
4,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0,1898.0,1890.0,0.506096,0.143826,0.350078
