# Libraries

In [1]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta
import re

# Load and inspect dataset

In [2]:
# Get the current user's name
user = getuser()

# Construct the path using the user's name
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Run the notebook
%run $function_path

In [3]:

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Read the dataset with a different encoding
df = pd.read_excel(data_path)

# Now, filter the data for tournaments after 1984
df = df[df['year'] >= 1984]

display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,half_time,short_date,long_date
0,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,0,1,0,0,0,0,0,2,1984-06-12,12 June 1984
1,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,0,1,0,0,0,0,0,1,1984-06-12,12 June 1984
2,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984
3,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984
4,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984


In [4]:
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_eu.xlsx'
elo_eu = pd.read_excel(data_path)

In [5]:
# Dictionary of replacements for team names
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}


In [6]:
# Prepare the Elo dataset for integration: rename columns for clarity
elo_eu = elo_eu.rename(columns={"team": "team_name", "elo_rating": "elo"})
elo_eu["team_name"] = elo_eu["team_name"].replace(replacements)

# Apply replacements to the relevant columns
df[["home_team", "away_team", "scorer_nationality"]] = df[["home_team", "away_team", "scorer_nationality"]].replace(replacements)

# Clean, transfrom, create variables

## duplicates 

In [7]:
# # Remove duplicates
# goals_df = goals_df.drop_duplicates()

In [8]:
# Columns to check for duplicates
columns_to_check = ['year', 'stage', 'home_team', 'scorer_nationality', 'long_date', 'time', 'half_time', 'goal_minute']

# Filter duplicates where both goals_home and goals_away are not 0
filtered_duplicates = df[
    (df.duplicated(subset=columns_to_check, keep=False)) &
    ~((df['goals_home'] == 0) & (df['goals_away'] == 0))
]

print("Filtered duplicates without both goals_home and goals_away being 0:")
display(filtered_duplicates)


Filtered duplicates without both goals_home and goals_away being 0:


Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,half_time,short_date,long_date


## time

In [9]:
# Apply the conversion function
df['local_time'] = df['time'].apply(convert_time_to_utc)


Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 17:15
Extracted local time: 17:15
Normalized 24-hour format: 17:15
No valid timezone found. Returning original time: 17:15

Original time string: 17:15
Extracted local time: 17:15
Normalized 24-hour format: 17:15
No valid timezone found. Returning 

## date

In [10]:
# Step 1: Remove any commas before the year in 'long_date'
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Step 2: Convert 'long_date' to 'short_date' where 'short_date' is missing
df['short_date'] = df['short_date'].fillna(pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d'))


# stage

In [11]:
# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count
0,1984,Final,"[France, Spain]",2
4,1988,Final,"[Soviet Union, Netherlands]",2
8,1992,Final,"[Denmark, Germany]",2
12,1996,Final,"[Czech Republic, Germany]",2
17,1996,Quarter-finals,"[Spain, France, Germany, Czech Republic, Engla...",8
19,2000,Final,"[France, Italy]",2
24,2000,Quarter-finals,"[Portugal, Italy, Netherlands, Spain, Turkey, ...",8
26,2004,Final,"[Portugal, Greece]",2
31,2004,Quarter-finals,"[Portugal, France, Sweden, Czech Republic, Eng...",8
33,2008,Final,"[Germany, Spain]",2


In [12]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarterfinals', 'Quarter-finals', 'Round of 16', 'Semi-finals', 'Semifinals','Final', 'Third place play-off', 'Third place playoff', 'not applicable'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time','short_date','local_time', 'score']]

#Sort goals_df by short_date, local_time, and goal_minute
goals_df = goals_df.sort_values(by=['short_date', 'local_time', 'goal_minute', 'half_time'], ascending=[True, True, True, True])


display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
1,1984,Group 1,France,Denmark,,0,1,1984-06-12,20:30,1–0
0,1984,Group 1,France,Denmark,France,78,2,1984-06-12,20:30,1–0
4,1984,Group 1,Belgium,Yugoslavia,,0,1,1984-06-13,20:30,2–0
2,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1,1984-06-13,20:30,2–0
3,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1,1984-06-13,20:30,2–0


# team counts

In [13]:
# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count


# stage counts

In [14]:
# Group by year and collect unique values of stage in a list
stage_summary = df.groupby('year')['stage'].unique().reset_index()

# Rename column for clarity
stage_summary.rename(columns={'stage': 'unique_stages'}, inplace=True)

display(stage_summary)

Unnamed: 0,year,unique_stages
0,1984,"[Group 1, Group 2]"
1,1988,"[Group 1, Group 2]"
2,1992,"[Group 1, Group 2]"
3,1996,"[Group A, Group B, Group C, Group D]"
4,2000,"[Group A, Group B, Group C, Group D]"
5,2004,"[Group A, Group B, Group C, Group D]"
6,2008,"[Group A, Group B, Group C, Group D]"
7,2012,"[Group A, Group B, Group C, Group D]"
8,2016,"[Group A, Group B, Group C, Group D, Group E, ..."
9,2021,"[Group A, Group B, Group C, Group D, Group E, ..."


# Recreate Leauge Table after first two matchdays

In [15]:

# Assuming goals_df is your original dataset
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [16]:
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [17]:
# Filter for year == 2022 and stage == 'Group E'
home1992 = home_games[(home_games['year'] == 1992) & (home_games['stage'] == 'Group 1')]
home1992

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
12,1992,Group 1,Denmark,0,0,1,1
13,1992,Group 1,France,0,0,1,1
14,1992,Group 1,Sweden,2,1,3,2


In [18]:
# Filter for year == 2022 and stage == 'Group E'
away1992 = away_games[(away_games['year'] == 1992) & (away_games['stage'] == 'Group 1')]
away1992

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
12,1992,Group 1,Denmark,0,1,0,1
13,1992,Group 1,England,0,0,2,2
14,1992,Group 1,France,1,1,1,1


## aggregate data after first two match days

In [19]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

No missing values in the specified columns.
No observations where total_matches == 1.
No observations where total_matches == 0.

=== Applying Tiebreaker ===
Row1: year                      1984
stage                  Group 1
team                   Belgium
home_team              Belgium
goals_scored_home          2.0
goals_conceded_home        0.0
points_home                2.0
match_count_home           1.0
away_team              Belgium
goals_scored_away          0.0
goals_conceded_away        5.0
points_away                0.0
match_count_away           1.0
goals_scored               2.0
goals_conceded             5.0
points                     2.0
total_matches              2.0
goals_difference          -3.0
tiebreaker             no need
tie_won                      0
Name: 1, dtype: object
Row2: year                      1984
stage                  Group 1
team                   Denmark
home_team              Denmark
goals_scored_home          5.0
goals_conceded_home        0.0
po

In [20]:
# Filtering all_games_before_last for the year 2022 and stage 'Group E'
group1992 = all_games_before_last[
    (all_games_before_last['year'] == 1992) & 
    (all_games_before_last['stage'] == 'Group 1')
]

display(group1992)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won
16,1992,Group 1,Sweden,2,1,3,1,2,1,no need,0
17,1992,Group 1,France,1,1,2,0,2,2,France,1
18,1992,Group 1,England,0,0,2,0,2,3,France,0
19,1992,Group 1,Denmark,0,1,1,-1,2,4,no need,0


# Recreate league table after last match day

In [21]:
# Initialize an empty list to store the results for each pair
all_results = []

# Define unique pairs and ensure correct order based on goals_last_day_sorted
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()
# Merge with goals_last_day_sorted to get the correct order
unique_pairs = unique_pairs.merge(
    goals_last_day_sorted[['year', 'stage', 'short_date','local_time', 'half_time']].drop_duplicates(),
    how='left',
    on=['year', 'stage']
).sort_values(by=['year', 'short_date','local_time', 'half_time', 'stage']).reset_index(drop=True)

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    group_name = row['stage']
    
    # Apply the function to the current pair
    result = fifa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_wc = pd.concat(all_results)

# Keep only the specified columns
changes_df_wc = changes_df_wc[['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]

# Display the final DataFrame
display(changes_df_wc)



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
    France             5                   6                     0                       6                        4                          1
   Denmark             3                   5                     1                       4                        2                          2
   Belgium             3                   2                     5                      -3                        2                          3
Yugoslavia             1                   0                     7                      -7                        0                          4


Analyzing goal: 26 minute, 1 half time, Player team: Belgium, Home: Denmark, Away: Belgium

=== Teams with Identical Points (Tied Teams) ===

Empty DataFrame
Columns: [team, total_points]
Index: []

=== Updat

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1984,Group 1,France,1,0,0,0,1
1,1984,Group 1,Denmark,0,2,1,0,3
2,1984,Group 1,Belgium,0,1,2,0,3
3,1984,Group 1,Yugoslavia,0,0,0,1,1
0,1984,Group 1,France,1,0,0,0,1
...,...,...,...,...,...,...,...,...
174,2024,Group F,Czech Republic,0,0,1,1,2
172,2024,Group F,Portugal,1,0,0,0,1
173,2024,Group F,Turkey,0,2,1,0,3
175,2024,Group F,Georgia,0,1,2,1,4


In [22]:
# # Exporting final df
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\tb_eu_fifa_men.xlsx'
# changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [23]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []


# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df, third_place_df = gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Define the desired column order
column_order = [
    'year', 'stage', 'date', 'time', 'change_num', 'goal_minute','half_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', '1st', '2nd', '3rd', 
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]

# Reorder the columns
final_composition_changes_df = final_composition_changes_df[column_order]

# Identify rows where both 'date' and 'time' are NaN
mask = final_composition_changes_df['date'].isna() & final_composition_changes_df['time'].isna()

# Use .shift(-1) to get the values from the following row and fill in the NaN rows
final_composition_changes_df.loc[mask, ['date', 'time']] = final_composition_changes_df.loc[mask, ['date', 'time']].fillna(
    final_composition_changes_df[['date', 'time']].shift(-1)
)

# Ensure 'date' is in datetime format and handle errors
# final_composition_changes_df['date'] = pd.to_datetime(final_composition_changes_df['date'], errors='coerce')

# Ensure 'time' is in proper datetime.time format and handle errors
final_composition_changes_df['time'] = pd.to_datetime(final_composition_changes_df['time'], format='%H:%M', errors='coerce').dt.time

# Drop rows with missing or invalid 'date' or 'time'
final_composition_changes_df = final_composition_changes_df.dropna(subset=['date', 'time'])

# Combine 'date' and 'time' into a single datetime column for proper sorting
final_composition_changes_df['datetime'] = pd.to_datetime(
    final_composition_changes_df['date'].astype(str) + ' ' + final_composition_changes_df['time'].astype(str),
    errors='coerce'
)

# Drop rows with invalid datetime values
final_composition_changes_df = final_composition_changes_df.dropna(subset=['datetime'])

# Sort by year and the combined datetime column
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'datetime']).reset_index(drop=True)

# Drop the combined datetime column if not needed
final_composition_changes_df = final_composition_changes_df.drop(columns=['datetime'])



=== STEP 1: Initial Standings for Group 1, 1984 (Goal Time = 0) ===
      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
    France             5                   6                     0                       6                          1
   Denmark             3                   5                     1                       4                          2
   Belgium             3                   2                     5                      -3                          3
Yugoslavia             1                   0                     7                      -7                          4



=== Tied after goal at minute 26 1 half time by Belgium in Group 1, edition 1984 ===
Number of tied teams: 0
Empty DataFrame
Columns: [team, total_points, total_goals_scored, total_goals_conceded, total_goals_difference]
Index: []
[DEBUG] Saving Third Team - Year: 1984, Stage: Group 1, Goal Minute: 26, Team: Denmark, Points: 2, Goal Dif

## manually modify order of tie teams by disciplinary points

In [24]:
mask = (final_composition_changes_df['year'] == 2024) & \
       (final_composition_changes_df['stage'] == "Group C") & \
       (final_composition_changes_df['goal_minute'] == 0)

final_composition_changes_df.loc[mask, ['2nd', '3rd']] = final_composition_changes_df.loc[mask, ['3rd', '2nd']].values


# minute by minute dataframe

In [25]:
# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over rows grouped by date and time
for (date, time), group in final_composition_changes_df.groupby(["date", "time"]):
    group = group.sort_values(by="goal_minute").reset_index(drop=True)
    
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]
        
        # Create the range of match_minute values between current and next row
        for match_minute in range(current_row["goal_minute"], next_row["goal_minute"] + 1):
            new_row = current_row.copy()
            new_row["match_minute"] = match_minute
            
            # Leave specific fields empty for interpolated rows
            if match_minute != current_row["goal_minute"]:
                new_row["goal_minute"] = None
                new_row["home_team"] = None
                new_row["away_team"] = None
                new_row["scorer_team"] = None
            
            expanded_rows.append(new_row)
    
    # Add the last row of the group as it is
    last_row = group.iloc[-1].copy()
    last_row["match_minute"] = last_row["goal_minute"]
    expanded_rows.append(last_row)
    
    # Ensure the match_minute reaches 90, only if goal_minute is less than 90
    if last_row["goal_minute"] < 90:
        for match_minute in range(last_row["goal_minute"] + 1, 91):
            new_row = last_row.copy()
            new_row["match_minute"] = match_minute
            new_row["goal_minute"] = None
            new_row["home_team"] = None
            new_row["away_team"] = None
            new_row["scorer_team"] = None
            expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Reset the index for clarity
expanded_df = expanded_df.reset_index(drop=True)

# Define the desired column order
column_order = [
    'year', 'stage', 'date', 'time', 'match_minute','change_num', 'goal_minute', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', '1st', '2nd', '3rd', 
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]

# Reorder the columns
expanded_df = expanded_df[column_order]


In [26]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\mbm_eu_fifa.xlsx'
expanded_df.to_excel(file_path, index=False)


# best four third placed

In [27]:
# Filter the dataset to include only years >= 2016
third_place_df = third_place_df[third_place_df['year'] >= 2016].copy()

third_place_df = ensure_goal_minute_zero(third_place_df, all_games_before_last)

# Remove duplicates
third_place_df = third_place_df.drop_duplicates()

# Define file path for saving
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_place_eu_fifa.xlsx'

# Export to Excel
# third_place_df.to_excel(file_path, index=False)

print(f"Filtered and deduplicated third-place data saved to {file_path}")


[INFO] Checking missing `goal_minute = 0` for each year and stage...
[INFO] Adding 3 missing `goal_minute = 0` entries.
[INFO] `goal_minute = 0` check complete. DataFrame updated.
Filtered and deduplicated third-place data saved to C:\Users\ALESSANDRO\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_place_eu_fifa.xlsx


### resolving ties using yellow cards

In [28]:
team_priority = {
    2016: ['Czech Republic', 'Portugal', 'Romania', 'Northern Ireland'],
    2021: ['Hungary', 'Denmark', 'Switzerland', 'Slovakia'],
    2024: ['Slovakia','Netherlands', 'Georgia']
}


In [29]:
best_four_third_df = track_third_place_teams_with_top_four(third_place_df)

# Define output path and save
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\best_four_third_teams_eu_fifa.xlsx'
best_four_third_df.to_excel(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:\Users\ALESSANDRO\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\best_four_third_teams_eu_fifa.xlsx


### variable to track whether third teams qualifies or not

In [30]:
# Initialize third_qualify to 0
final_composition_changes_df['third_qualify'] = 0

# Process only years >= 2016
for index, row in final_composition_changes_df.iterrows():
    if row['year'] >= 2016:
        # Find matching row in best_four_third_df
        match = best_four_third_df[
            (best_four_third_df['year'] == row['year']) &
            (best_four_third_df['goal_minute'] == row['goal_minute'])
        ]

        if not match.empty:
            # Check if the 3rd place team is in the top four third-placed teams
            top_four_teams = match.iloc[0]['top_four_third_teams']
            if isinstance(top_four_teams, list) and row['3rd'] in top_four_teams:
                final_composition_changes_df.at[index, 'third_qualify'] = 1

# Create qualified_teams list: always includes 1st and 2nd, includes 3rd only if third_qualify == 1
final_composition_changes_df['qualified_teams'] = final_composition_changes_df.apply(
    lambda row: [row['1st'], row['2nd']] + ([row['3rd']] if row['third_qualify'] == 1 else []),
    axis=1
)

# Remove the columns 'change_num' and 'changed' from the DataFrame
final_composition_changes_df = final_composition_changes_df.drop(columns=['change_num', 'changed'], errors='ignore')


### variables to track the composition of the teams qualifying

In [31]:

# Ensure goal_minute is numeric for sorting
final_composition_changes_df['goal_minute'] = pd.to_numeric(final_composition_changes_df['goal_minute'], errors='coerce')

# Sort by year, stage, and goal_minute to process in correct order
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'stage', 'goal_minute']).reset_index(drop=True)

# Initialize qual_changed and qual_count columns
final_composition_changes_df['qual_changed'] = 0
final_composition_changes_df['qual_count'] = 0

# Track changes per stage
qual_count_tracker = {}

# Loop through each year and stage to check for changes
for (year, stage), group in final_composition_changes_df.groupby(['year', 'stage']):
    previous_qualified = None
    qual_count = 0

    for index, row in group.iterrows():
        # Extract current qualified teams
        current_qualified = sorted(row['qualified_teams'])  # Sort to ignore order changes

        # Check if qualified_teams has changed from the previous row
        if previous_qualified is not None and current_qualified != previous_qualified:
            final_composition_changes_df.at[index, 'qual_changed'] = 1
            qual_count += 1

        # Update tracking
        previous_qualified = current_qualified
        final_composition_changes_df.at[index, 'qual_count'] = qual_count

In [32]:
final_composition_changes_df = integrate_elo_probabilities(final_composition_changes_df, elo_eu)

In [33]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\goals_eu_fifa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)
