# Libraries

In [2]:
import ast
import re
from collections import defaultdict
from datetime import datetime, timedelta
from getpass import getuser

import numpy as np
import pandas as pd

# Load and inspect dataset

In [3]:
# Get the current user's name
user = getuser()

# Construct the path using the user's name
# NOTE(review): absolute, Windows-specific path; only resolves on machines with this folder layout
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Run the notebook
# The helpers called in later cells (convert_time_to_utc, before_last, fifa_before_last, ...)
# are not defined in this notebook, so they presumably come from the notebook executed here.
%run $function_path

In [4]:

# Path to the goal-level input workbook
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Read the Excel workbook with pandas' default options
df = pd.read_excel(data_path)

# Keep tournaments from 1984 onward (1984 itself is included).
# .copy() detaches the filtered rows from the frame returned by read_excel, so the
# column assignments made on `df` in later cells act on an independent frame and
# do not raise SettingWithCopyWarning.
df = df[df['year'] >= 1984].copy()

display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,half_time,short_date,long_date
0,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,0,1,0,0,0,0,0,2,1984-06-12,12 June 1984
1,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,0,1,0,0,0,0,0,1,1984-06-12,12 June 1984
2,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984
3,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984
4,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,0,2,0,0,0,0,0,1,1984-06-13,13 June 1984


In [5]:
# Load the Elo ratings workbook (note: `data_path` is reused and now points to this file)
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_eu.xlsx'
elo_eu = pd.read_excel(data_path)

In [6]:
# Dictionary of replacements for team names (old label -> label used in this notebook).
# Applied below to the Elo team names and to the team columns of `df`.
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}


In [7]:
# Prepare the Elo dataset for integration: rename columns for clarity
# ("team" -> "team_name", "elo_rating" -> "elo"), then harmonise its team names
elo_eu = elo_eu.rename(columns={"team": "team_name", "elo_rating": "elo"})
elo_eu["team_name"] = elo_eu["team_name"].replace(replacements)

# Apply the same replacements to the team columns of the goals dataset
df[["home_team", "away_team", "scorer_nationality"]] = df[["home_team", "away_team", "scorer_nationality"]].replace(replacements)

# Clean, transform, create variables

## duplicates 

In [8]:
# # Remove duplicates
# goals_df = goals_df.drop_duplicates()

In [9]:
# Columns that together identify one goal record
columns_to_check = ['year', 'stage', 'home_team', 'scorer_nationality', 'long_date', 'time', 'half_time', 'goal_minute']

# Rows that share their key with at least one other row (every copy is flagged)
is_repeated = df.duplicated(subset=columns_to_check, keep=False)

# Rows belonging to 0-0 matches are excluded from the report
is_goalless = df['goals_home'].eq(0) & df['goals_away'].eq(0)

filtered_duplicates = df[is_repeated & ~is_goalless]

print("Filtered duplicates without both goals_home and goals_away being 0:")
display(filtered_duplicates)


Filtered duplicates without both goals_home and goals_away being 0:


Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,half_time,short_date,long_date


## time

In [10]:
# Apply the conversion function to every kick-off time string.
# convert_time_to_utc is not defined in this notebook (presumably provided by the helper notebook).
# NOTE(review): the helper is named "..._to_utc" but its result is stored as `local_time`, and
# the printed log shows the original time being returned when no timezone is found - confirm intent.
df['local_time'] = df['time'].apply(convert_time_to_utc)


Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 20:30
Extracted local time: 20:30
Normalized 24-hour format: 20:30
No valid timezone found. Returning original time: 20:30

Original time string: 17:15
Extracted local time: 17:15
Normalized 24-hour format: 17:15
No valid timezone found. Returning original time: 17:15

Original time string: 17:15
Extracted local time: 17:15
Normalized 24-hour format: 17:15
No valid timezone found. Returning 

### manually setting local time for specific observations

In [11]:
# Manual override: Euro 2021, Group B, Russia v Denmark -> local kick-off 18:00
df.loc[
    df['year'].eq(2021)
    & df['stage'].eq("Group B")
    & df['home_team'].eq("Russia")
    & df['away_team'].eq("Denmark"),
    'local_time',
] = "18:00"


In [12]:
# Manual override: Euro 2021, Group A, Italy v Wales -> local kick-off 14:00
df.loc[
    df['year'].eq(2021)
    & df['stage'].eq("Group A")
    & df['home_team'].eq("Italy")
    & df['away_team'].eq("Wales"),
    'local_time',
] = "14:00"


In [13]:
# Manual override: Euro 2021, Group C, North Macedonia v Netherlands -> local kick-off 15:00
df.loc[
    df['year'].eq(2021)
    & df['stage'].eq("Group C")
    & df['home_team'].eq("North Macedonia")
    & df['away_team'].eq("Netherlands"),
    'local_time',
] = "15:00"


In [14]:
# Manual override: Euro 2021, Group E, Slovakia v Spain -> local kick-off 15:00
df.loc[
    df['year'].eq(2021)
    & df['stage'].eq("Group E")
    & df['home_team'].eq("Slovakia")
    & df['away_team'].eq("Spain"),
    'local_time',
] = "15:00"


## date

In [15]:
# Step 1: turn "12 June, 1984"-style values into "12 June 1984" (drop the comma before the year)
comma_before_year = r',\s*(\d{4})'
df['long_date'] = df['long_date'].str.replace(comma_before_year, r' \1', regex=True)

# Step 2: where 'short_date' is missing, derive it from the parsed 'long_date'
# (values that cannot be parsed are coerced to missing and therefore fill nothing)
parsed_long_date = pd.to_datetime(df['long_date'], errors='coerce')
df['short_date'] = df['short_date'].fillna(parsed_long_date.dt.strftime('%Y-%m-%d'))


## manually setting local time for specific observations

# stage

In [16]:
# For every (year, stage), collect the distinct teams appearing as home or away side
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda matches: pd.concat([matches['home_team'], matches['away_team']]).unique())
    .reset_index()                       # unnamed Series -> value column is labelled 0
    .rename(columns={0: 'team_list'})
)

# Number of distinct teams per stage
team_counts['team_count'] = team_counts['team_list'].map(len)

# Any stage that does not contain exactly four teams is not a regular group
invalid_groups = team_counts[team_counts['team_count'] != 4]

display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count
0,1984,Final,"[France, Spain]",2
4,1988,Final,"[Soviet Union, Netherlands]",2
8,1992,Final,"[Denmark, Germany]",2
12,1996,Final,"[Czech Republic, Germany]",2
17,1996,Quarter-finals,"[Spain, France, Germany, Czech Republic, Engla...",8
19,2000,Final,"[France, Italy]",2
24,2000,Quarter-finals,"[Portugal, Italy, Netherlands, Spain, Turkey, ...",8
26,2004,Final,"[Portugal, Greece]",2
31,2004,Quarter-finals,"[Portugal, France, Sweden, Czech Republic, Eng...",8
33,2008,Final,"[Germany, Spain]",2


In [17]:
# Stage labels (in the spellings listed here) that are dropped so only group matches remain
knockout_stages = ['Quarterfinals', 'Quarter-finals', 'Round of 16', 'Semi-finals', 'Semifinals','Final', 'Third place play-off', 'Third place playoff', 'not applicable']
df = df[~df['stage'].isin(knockout_stages)]

# Columns describing goal events and match results
goal_columns = ['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time','short_date','local_time', 'score']
goals_df = df[goal_columns]

# Chronological order: match date, kick-off time, then minute and half (all ascending)
goals_df = goals_df.sort_values(by=['short_date', 'local_time', 'goal_minute', 'half_time'])


display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
1,1984,Group 1,France,Denmark,,0,1,1984-06-12,20:30,1–0
0,1984,Group 1,France,Denmark,France,78,2,1984-06-12,20:30,1–0
4,1984,Group 1,Belgium,Yugoslavia,,0,1,1984-06-13,20:30,2–0
2,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1,1984-06-13,20:30,2–0
3,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1,1984-06-13,20:30,2–0


# team counts

In [18]:
# Recompute the per-stage team lists now that the knockout rows have been removed from `df`;
# this refreshed `team_counts` is the one passed to fifa_before_last further down.
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda fixtures: pd.concat([fixtures['home_team'], fixtures['away_team']]).unique())
    .reset_index()                       # unnamed Series -> value column is labelled 0
    .rename(columns={0: 'team_list'})
)

# Number of distinct teams per stage
team_counts['team_count'] = team_counts['team_list'].map(len)

# Sanity check: stages that do not contain exactly four teams
invalid_groups = team_counts[team_counts['team_count'] != 4]

display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count


# stage counts

In [19]:
# One row per tournament year with the array of distinct stage labels found in `df`
stage_summary = (
    df.groupby('year')['stage']
    .unique()
    .reset_index()
    .rename(columns={'stage': 'unique_stages'})
)

# How many distinct stages each year has
stage_summary['num_unique_stages'] = stage_summary['unique_stages'].map(len)

display(stage_summary)


Unnamed: 0,year,unique_stages,num_unique_stages
0,1984,"[Group 1, Group 2]",2
1,1988,"[Group 1, Group 2]",2
2,1992,"[Group 1, Group 2]",2
3,1996,"[Group A, Group B, Group C, Group D]",4
4,2000,"[Group A, Group B, Group C, Group D]",4
5,2004,"[Group A, Group B, Group C, Group D]",4
6,2008,"[Group A, Group B, Group C, Group D]",4
7,2012,"[Group A, Group B, Group C, Group D]",4
8,2016,"[Group A, Group B, Group C, Group D, Group E, ...",6
9,2021,"[Group A, Group B, Group C, Group D, Group E, ...",6


# Recreate League Table after first two matchdays

In [20]:

# Assuming goals_df is your original dataset
# before_last is not defined in this notebook (presumably from the helper notebook).
# Judging by the result names it splits the goals into an aggregate covering the matches before
# the last matchday and the sorted goals of the last matchday - TODO confirm against the helper.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [21]:
# Split the pre-last-matchday aggregate into per-team home and away tables
# (helper not defined in this notebook; the displayed frames below show goals, points and match counts)
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [22]:
# Spot check: home-side aggregates for year == 1992 and stage == 'Group 1'
home1992 = home_games[(home_games['year'] == 1992) & (home_games['stage'] == 'Group 1')]
home1992

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
12,1992,Group 1,Denmark,0,0,1,1
13,1992,Group 1,France,0,0,1,1
14,1992,Group 1,Sweden,2,1,3,2


In [23]:
# Spot check: away-side aggregates for year == 1992 and stage == 'Group 1'
away1992 = away_games[(away_games['year'] == 1992) & (away_games['stage'] == 'Group 1')]
away1992

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
12,1992,Group 1,Denmark,0,1,0,1
13,1992,Group 1,England,0,0,2,2
14,1992,Group 1,France,1,1,1,1


## aggregate data after first two match days

In [24]:
# Apply fifa_before_last (helper not defined in this notebook) to the tournament data prepared
# above: home/away aggregates, the pre-last-matchday goal aggregate and the per-group team lists.
# The frame displayed two cells below shows the result carries standing, points, goal columns
# and tiebreaker fields per team.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

No missing values in the specified columns.
No observations where total_matches == 1.
No observations where total_matches == 0.

=== Applying Tiebreaker ===
Row1: year                      1984
stage                  Group 1
team                   Belgium
home_team              Belgium
goals_scored_home          2.0
goals_conceded_home        0.0
points_home                2.0
match_count_home           1.0
away_team              Belgium
goals_scored_away          0.0
goals_conceded_away        5.0
points_away                0.0
match_count_away           1.0
goals_scored               2.0
goals_conceded             5.0
points                     2.0
total_matches              2.0
matches_flag               2.0
goals_difference          -3.0
tiebreaker             no need
tie_won                      0
Name: 1, dtype: object
Row2: year                      1984
stage                  Group 1
team                   Denmark
home_team              Denmark
goals_scored_home          5.0
go

In [25]:
# Spot check: standings before the last matchday for the year 1992 and stage 'Group 1'
group1992 = all_games_before_last[
    (all_games_before_last['year'] == 1992) & 
    (all_games_before_last['stage'] == 'Group 1')
]

display(group1992)

Unnamed: 0,year,stage,team,standing,points,goals_scored,goals_conceded,goals_difference,total_matches,tiebreaker,tie_won
16,1992,Group 1,Sweden,1,3,2,1,1,2,no need,0
17,1992,Group 1,France,2,2,1,1,0,2,France,1
18,1992,Group 1,England,3,2,0,0,0,2,France,0
19,1992,Group 1,Denmark,4,1,0,1,-1,2,no need,0


### manually assign standing where disciplinary points were the tiebreaker

In [26]:
# Euro 2024, Group C: rows of that group in the pre-last-matchday table
in_group_c_2024 = (
    all_games_before_last['year'].eq(2024)
    & all_games_before_last['stage'].eq("Group C")
)

# Rows currently holding 2nd and 3rd place in that group
mask_denmark = in_group_c_2024 & all_games_before_last['standing'].eq(2)
mask_slovenia = in_group_c_2024 & all_games_before_last['standing'].eq(3)

# Overwrite only the team label on those rows; every other column keeps its computed value
all_games_before_last.loc[mask_denmark, 'team'] = "Denmark"
all_games_before_last.loc[mask_slovenia, 'team'] = "Slovenia"


In [27]:
# Euro 2024, Group E: rows of that group in the pre-last-matchday table
in_group_e_2024 = (
    all_games_before_last['year'].eq(2024)
    & all_games_before_last['stage'].eq("Group E")
)

# One mask per standing (1st to 4th) inside that group
mask_romania = in_group_e_2024 & all_games_before_last['standing'].eq(1)
mask_belgium = in_group_e_2024 & all_games_before_last['standing'].eq(2)
mask_slovakia = in_group_e_2024 & all_games_before_last['standing'].eq(3)
mask_ukraine = in_group_e_2024 & all_games_before_last['standing'].eq(4)

# Overwrite only the team label on those rows; every other column keeps its computed value
all_games_before_last.loc[mask_romania, 'team'] = "Romania"
all_games_before_last.loc[mask_belgium, 'team'] = "Belgium"
all_games_before_last.loc[mask_slovakia, 'team'] = "Slovakia"
all_games_before_last.loc[mask_ukraine, 'team'] = "Ukraine"


In [28]:
# Export the standings before the last matchday (after the manual relabelling above) to Excel.
# index=False keeps the DataFrame index out of the sheet.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\before_last_eu_fifa.xlsx'
all_games_before_last.to_excel(file_path, index=False)

# Recreate league table after last match day

In [29]:
# Define unique pairs and ensure correct order based on goals_last_day_sorted
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Merge with goals_last_day_sorted to get the correct order
# (left join: every group is kept; a group with several distinct last-matchday date/time
# combinations yields several rows here and is then processed once per row below)
unique_pairs = unique_pairs.merge(
    goals_last_day_sorted[['year', 'stage', 'short_date','local_time']].drop_duplicates(),
    how='left',
    on=['year', 'stage']
).sort_values(by=['year', 'short_date','local_time', 'stage']).reset_index(drop=True)

# Initialize an empty list to store the results for each pair
all_results = []

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    # (fifa_final_wc is not defined in this notebook; it prints the verbose log shown below)
    result = fifa_final_wc(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[[ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
    France             5                   6                     0                       6                        4                          1
   Denmark             3                   5                     1                       4                        2                          2
   Belgium             3                   2                     5                      -3                        2                          3
Yugoslavia             1                   0                     7                      -7                        0                          4


Analyzing goal: 26 minute, 1 half time, Player team: Belgium, Home: Denmark, Away: Belgium

=== Teams with Identical Points (Tied Teams) ===

Empty DataFrame
Columns: [team, total_points]
Index: []

=== Updat

In [30]:
# # Exporting final df
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\tb_eu_fifa_men.xlsx'
# changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [31]:
# Inspect the (year, stage) pairs with their last-matchday date and kick-off time
unique_pairs

Unnamed: 0,year,stage,short_date,local_time
0,1984,Group 1,1984-06-19,20:30
1,1984,Group 2,1984-06-20,20:30
2,1988,Group 1,1988-06-17,20:15
3,1988,Group 2,1988-06-18,15:30
4,1992,Group 1,1992-06-17,20:15
5,1992,Group 2,1992-06-18,20:15
6,1996,Group B,1996-06-18,16:30
7,1996,Group A,1996-06-18,19:30
8,1996,Group D,1996-06-19,16:30
9,1996,Group C,1996-06-19,19:30


In [32]:
# Count the distinct kick-off times per (year, stage, date) of the last matchday
unique_pairs_grouped = (
    unique_pairs.groupby(['year', 'stage', 'short_date'])['local_time']
    .nunique()
    .reset_index()
    .rename(columns={'local_time': 'unique_local_times'})
)

# Groups whose matches on one date have more than one kick-off time
highlighted_rows = unique_pairs_grouped[unique_pairs_grouped['unique_local_times'] > 1]
highlighted_rows



Unnamed: 0,year,stage,short_date,unique_local_times


In [33]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []


# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    # NOTE(review): `third_place_df` is rebound on every iteration and never appended to a list,
    # so after the loop it holds only what the LAST call returned. Later cells rely on it covering
    # all groups, so gap_composition presumably accumulates rows internally - confirm in the helper.
    composition_changes_df, third_place_df = gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# # Drop duplicates based on the specified columns
# final_composition_changes_df = final_composition_changes_df.drop_duplicates(
#     subset=['year', 'stage', 'date', 'time', 'goal_minute', 'half_time', 'home_team', 'away_team', 'scorer_team'],
#     keep='first'
# )




=== STEP 1: Initial Standings for Group 1, 1984 (Goal Time = 0) ===


      team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
    France             5                   6                     0                       6                          1
   Denmark             3                   5                     1                       4                          2
   Belgium             3                   2                     5                      -3                          3
Yugoslavia             1                   0                     7                      -7                          4



=== Tied after goal at minute 26 1 half time by Belgium in Group 1, edition 1984 ===
Number of tied teams: 0
Empty DataFrame
Columns: [team, total_points, total_goals_scored, total_goals_conceded, total_goals_difference]
Index: []
[DEBUG] Saving Third Team - Year: 1984, Stage: Group 1, Goal Minute: 26, Team: Denmark, Points: 2, Goal Difference: 3, Goals Scored: 5, Date: 1984-06-19, Time: 20:30
[DEBUG] T

## manually modify order of tied teams by disciplinary points

In [34]:
# Baseline rows (goal_minute == 0) of Euro 2024, Group C
mask = (
    final_composition_changes_df['year'].eq(2024)
    & final_composition_changes_df['stage'].eq("Group C")
    & final_composition_changes_df['goal_minute'].eq(0)
)

# Set 2nd and 3rd place on those rows by hand
final_composition_changes_df.loc[mask, ['2nd', '3rd']] = ["Denmark", "Slovenia"]


# best four third placed

In [35]:
# Keep only tournaments with year >= 2016; .copy() so the later column assignment
# works on an independent frame
third_place_df = third_place_df[third_place_df['year'] >= 2016].copy()

# Rebuild the goal_minute == 0 rows from the pre-last-matchday standings
# (helper not defined in this notebook; its log below reports removing and re-adding them)
third_place_df = ensure_goal_minute_zero(third_place_df, all_games_before_last)

# Remove duplicates
third_place_df = third_place_df.drop_duplicates()

# Fill missing 'date' and 'time' with the next row's value (backward fill).
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling; result is identical.
third_place_df[['date', 'time']] = third_place_df[['date', 'time']].bfill()



[INFO] Removing existing `goal_minute = 0` entries...
[INFO] Creating `goal_minute = 0` entries from `all_games_before_last`
[INFO] Adding 18 new `goal_minute = 0` entries.
[INFO] `goal_minute = 0` entries updated successfully.


### manually update groups with zero goal in matchday 3

In [36]:
# Set date and time by hand for every Euro 2024 Group C row (this group needs a manual value,
# see the heading above)
third_place_df.loc[(third_place_df["year"] == 2024) & (third_place_df["stage"] == "Group C"), ["date", "time"]] = ["2024-06-25", "21:00"]


In [37]:
# Chronological order: year, match date, kick-off time, half, minute; stage breaks remaining ties
third_place_df = third_place_df.sort_values(by=['year', 'date', 'time', 'half_time', 'goal_minute', 'stage'])


In [38]:
# # Define file path for saving
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_place_eu_fifa.xlsx'

# # Export to Excel
# third_place_df.to_excel(file_path, index=False)

# print(f"Filtered and deduplicated third-place data saved to {file_path}")

### resolving ties using yellow cards

In [39]:
# Per-year team ordering passed to third_teams() below.
# NOTE(review): given the heading above, this presumably encodes the order in which otherwise
# tied third-placed teams are ranked (disciplinary record) - confirm against the helper.
team_priority = {
    2016: ['Czech Republic', 'Portugal', 'Romania', 'Northern Ireland'],
    2024: ['Netherlands', 'Georgia']
}


In [40]:
# Build the starting table of third-placed teams (helper not defined in this notebook);
# the display below shows one row per year at goal_minute 0
initial_third_teams_df = initial_third_teams(third_place_df)

In [41]:
# Inspect the starting table of third-placed teams
initial_third_teams_df

Unnamed: 0,year,goal_minute,date,time,change_flag,change_count,top_four,last_two,tied_team,Group C,Group B,Group F,Group A,Group D,Group E
0,2016,0,2016-06-19,21:00,0,0,"[[Northern Ireland, 4, 1, 2], [Slovakia, 4, 0,...","[[Czech Republic, 1, -1, 2], [Sweden, 1, -1, 1]]","[Romania, Czech Republic]","[Northern Ireland, 3, 1, 2]","[Slovakia, 3, 0, 3]","[Portugal, 2, 0, 1]","[Romania, 1, -1, 2]","[Czech Republic, 1, -1, 2]","[Sweden, 1, -1, 1]"
1,2021,0,2021-06-20,14:00,0,0,"[[Austria, 4, 0, 3], [Russia, 4, -2, 1], [Germ...","[[Croatia, 1, -1, 1], [Switzerland, 1, -3, 1]]",[],"[Austria, 3, 0, 3]","[Russia, 3, -2, 1]","[Germany, 3, 1, 4]","[Switzerland, 1, -3, 1]","[Croatia, 1, -1, 1]","[Spain, 2, 0, 1]"
2,2024,0,2024-06-23,21:00,0,0,"[[Slovakia, 4, 0, 2], [Austria, 3, 1, 3], [Slo...","[[Czech Republic, 1, -1, 2], [Scotland, 1, -4,...","[Slovakia, Slovenia]","[Slovenia, 2, 0, 2]","[Albania, 1, -1, 3]","[Czech Republic, 1, -1, 2]","[Scotland, 1, -4, 2]","[Austria, 3, 1, 3]","[Slovakia, 3, 0, 2]"


In [42]:
# Track the third-placed teams goal by goal (helper not defined in this notebook;
# it prints a "Before Update / After Update" log for every processed row)
third_teams_df = third_teams(third_place_df, initial_third_teams_df, team_priority)


Processing: Year=2016, Stage=Group A, Goal Minute=0
--- YEAR CHANGED: 2016 --- Resetting group values to first row of structured_df
Before Update: {'Group A': ['Romania', 1, -1, 2], 'Group B': ['Slovakia', 3, 0, 3], 'Group C': ['Northern Ireland', 3, 1, 2], 'Group D': ['Czech Republic', 1, -1, 2], 'Group E': ['Sweden', 1, -1, 1], 'Group F': ['Portugal', 2, 0, 1]}
After Update: {'Group A': ['Romania', 3, -1, 2], 'Group B': ['Slovakia', 3, 0, 3], 'Group C': ['Northern Ireland', 3, 1, 2], 'Group D': ['Czech Republic', 1, -1, 2], 'Group E': ['Sweden', 1, -1, 1], 'Group F': ['Portugal', 2, 0, 1]}

Processing: Year=2016, Stage=Group A, Goal Minute=0
Before Update: {'Group A': ['Romania', 3, -1, 2], 'Group B': ['Slovakia', 3, 0, 3], 'Group C': ['Northern Ireland', 3, 1, 2], 'Group D': ['Czech Republic', 1, -1, 2], 'Group E': ['Sweden', 1, -1, 1], 'Group F': ['Portugal', 2, 0, 1]}
After Update: {'Group A': ['Romania', 2, -1, 2], 'Group B': ['Slovakia', 3, 0, 3], 'Group C': ['Northern Ireland'

In [43]:
# Post-process the tracking table with third_track (helper not defined in this notebook;
# its effect cannot be read from here)
third_teams_df = third_track(third_teams_df)

In [44]:
# Define output path and save the third-placed-teams tracking table (index left out of the sheet)
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_teams_df_eu_fifa.xlsx'
third_teams_df.to_excel(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:\Users\ALESSANDRO\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_teams_df_eu_fifa.xlsx


### variable to track whether the third-placed team qualifies or not

In [45]:
def format_time_column(df, column_name):
    """
    Normalise a time column to H:MM / HH:MM text, dropping a trailing seconds part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[column_name]`` rewritten.

    Notes
    -----
    * Values that start with ``H:MM:SS`` or ``HH:MM:SS`` are cut down to the
      hour and minute part.
    * Every other non-missing value is converted with ``str()`` and otherwise
      left as it is.
    * Missing values (None / NaN / NaT) stay missing. They used to be turned
      into the literal text ``'nan'`` because the whole column was cast to
      ``str`` before cleaning.
    """
    def clean_time(value):
        # Keep missing scalars untouched so they remain detectable with isna()
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value

        # str() also covers non-string cells such as datetime.time ("HH:MM:SS")
        text = str(value)
        with_seconds = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', text)  # Matches HH:MM:SS
        if with_seconds:
            return f"{with_seconds.group(1)}:{with_seconds.group(2)}"  # Keep only HH:MM
        return text  # Already HH:MM, or not a recognised time: leave the text unchanged

    df[column_name] = df[column_name].apply(clean_time)
    return df

# Apply to final_composition_changes_df: clean its 'time' column
# (the function edits the frame in place and returns the same object)
final_composition_changes_df = format_time_column(final_composition_changes_df, 'time')


In [46]:
# Convert 'time' to zero-padded HH:MM text; values that cannot be parsed become missing
final_composition_changes_df["time"] = pd.to_datetime(final_composition_changes_df["time"], errors='coerce').dt.strftime('%H:%M')

# Step 1: Order the dataset by 'year', 'stage', 'half_time', and 'goal_minute'
# (all ascending), so that within a group the goal_minute == 0 rows come before the goals
final_composition_changes_df = final_composition_changes_df.sort_values(
    by=["year", "stage", "half_time", "goal_minute"]
).reset_index(drop=True)

# Step 2: Fill 'time' and 'date' forward within each tournament group.
# The group key must include 'year': stage labels such as "Group A" repeat in every
# edition, so grouping by 'stage' alone let a missing value at the start of one
# tournament inherit the date/time of the previous tournament's group of the same name.
# GroupBy.ffill() also replaces the deprecated groupby(...).fillna(method="ffill").
group_keys = ["year", "stage"]
final_composition_changes_df["time"] = final_composition_changes_df.groupby(group_keys)["time"].ffill()
final_composition_changes_df["date"] = final_composition_changes_df.groupby(group_keys)["date"].ffill()


  final_composition_changes_df["time"] = pd.to_datetime(final_composition_changes_df["time"], errors='coerce').dt.strftime('%H:%M')


In [47]:
# Initialize third_qualify to 0 (third-placed team treated as not qualifying by default)
final_composition_changes_df['third_qualify'] = 0

# Process only years >= 2016; rows of earlier years keep third_qualify == 0.
# `ast` is imported once in the notebook's import cell instead of on every loop iteration.
for index, row in final_composition_changes_df.iterrows():
    if row['year'] >= 2016:
        # One boolean mask over third_teams_df per key that has to agree with this row
        year_match = third_teams_df['year'] == row['year']
        goal_minute_match = third_teams_df['goal_minute'] == row['goal_minute']
        date_match = third_teams_df['date'] == row['date']
        time_match = third_teams_df['time'] == row['time']
        stage_match = third_teams_df['stage'] == row['stage']

        # Rows of third_teams_df that agree on all five keys
        match = third_teams_df[year_match & goal_minute_match & date_match & time_match & stage_match]

        # No joint match: report every key whose value does not occur anywhere in third_teams_df
        if match.empty:
            print(f"\n⚠️ No match for index {index} | year: {row['year']} | goal_minute: {row['goal_minute']}")

            if not year_match.any():
                print(f" ❌ Year {row['year']} not found in third_teams_df")

            if not goal_minute_match.any():
                print(f" ❌ Goal_minute {row['goal_minute']} not found in third_teams_df | Unique values: {third_teams_df['goal_minute'].unique()}")

            if not date_match.any():
                print(f" ❌ Date {row['date']} not found in third_teams_df | Unique values: {third_teams_df['date'].unique()[:10]}")

            if not time_match.any():
                print(f" ❌ Time {row['time']} not found in third_teams_df | Unique values: {third_teams_df['time'].unique()[:10]}")

            if not stage_match.any():
                print(f" ❌ Stage {row['stage']} not found in third_teams_df | Unique values: {third_teams_df['stage'].unique()[:10]}")

        else:
            print(f"\n✅ Match found for index {index}:\n{match}")

            # When several rows match, only the first one is consulted
            top_four_teams = match.iloc[0]['top_four']
            print(f"Index {index} | top_four_teams: {top_four_teams} (type: {type(top_four_teams)})")

            # Convert top_four_teams if stored as a string (e.g. after an Excel round trip)
            if isinstance(top_four_teams, str):
                top_four_teams = ast.literal_eval(top_four_teams)

            # Flag the row when its 3rd-placed team is listed in top_four
            if row['3rd'] not in top_four_teams:
                print(f"⚠️ Third-place team {row['3rd']} NOT in top_four_teams {top_four_teams}")
            else:
                final_composition_changes_df.at[index, 'third_qualify'] = 1

# Create qualified_teams list: always includes 1st and 2nd, includes 3rd only if third_qualify == 1
final_composition_changes_df['qualified_teams'] = final_composition_changes_df.apply(
    lambda row: [row['1st'], row['2nd']] + ([row['3rd']] if row['third_qualify'] == 1 else []),
    axis=1
)

# Remove the columns 'change_num' and 'changed' from the DataFrame (ignored when absent)
final_composition_changes_df = final_composition_changes_df.drop(columns=['change_num', 'changed'], errors='ignore')



✅ Match found for index 199:
   year        date   time  goal_minute  half_time    stage  \
0  2016  2016-06-19  21:00            0          1  Group A   
1  2016  2016-06-19  21:00            0          1  Group A   

               Group A              Group B                      Group C  \
0  [Romania, 3, -1, 2]  [Slovakia, 3, 0, 3]  [Northern Ireland, 3, 1, 2]   
1  [Romania, 2, -1, 2]  [Slovakia, 3, 0, 3]  [Northern Ireland, 3, 1, 2]   

                      Group D             Group E              Group F  \
0  [Czech Republic, 1, -1, 2]  [Sweden, 1, -1, 1]  [Portugal, 2, 0, 1]   
1  [Czech Republic, 1, -1, 2]  [Sweden, 1, -1, 1]  [Portugal, 2, 0, 1]   

                                          top_four                  last_two  \
0  [Northern Ireland, Slovakia, Romania, Portugal]  [Czech Republic, Sweden]   
1  [Northern Ireland, Slovakia, Portugal, Romania]  [Czech Republic, Sweden]   

  tied_teams  change_flag  change_count  
0         []            0             0  
1  

### variables to track the composition of the teams qualifying

In [48]:
# Sort the dataset by 'year', 'date', 'time', 'half_time', 'goal_minute' and 'stage';
# rows with missing sort values are placed first
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'date', 'time', 'half_time', 'goal_minute', 'stage'], na_position='first').reset_index(drop=True)


In [49]:

# Ensure goal_minute is numeric for sorting (non-numeric values become NaN)
final_composition_changes_df['goal_minute'] = pd.to_numeric(final_composition_changes_df['goal_minute'], errors='coerce')

# Sort by year, stage, and goal_minute to process in correct order
# NOTE(review): unlike the other sorts in this notebook, 'half_time' is not part of the key here;
# confirm that goal_minute alone orders first-half stoppage-time goals before second-half goals.
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'stage', 'goal_minute']).reset_index(drop=True)

# Initialize qual_changed and qual_count columns
final_composition_changes_df['qual_changed'] = 0
final_composition_changes_df['qual_count'] = 0

# Track changes per stage
# NOTE(review): this dict is never read or written in this cell
qual_count_tracker = {}

# Loop through each year and stage to check for changes
for (year, stage), group in final_composition_changes_df.groupby(['year', 'stage']):
    # State is reset for every group: nothing to compare against yet, no changes counted
    previous_qualified = None
    qual_count = 0

    for index, row in group.iterrows():
        # Extract current qualified teams
        current_qualified = sorted(row['qualified_teams'])  # Sort to ignore order changes

        # Check if qualified_teams has changed from the previous row
        # (the first row of a group is never flagged because there is no previous row)
        if previous_qualified is not None and current_qualified != previous_qualified:
            final_composition_changes_df.at[index, 'qual_changed'] = 1
            qual_count += 1

        # Update tracking: qual_count is the running number of changes within the group so far
        previous_qualified = current_qualified
        final_composition_changes_df.at[index, 'qual_count'] = qual_count

In [50]:
# Merge the Elo information into the goal-level table (helper not defined in this notebook);
# the column order defined below expects 'elo_home' and 'elo_away' to exist afterwards
final_composition_changes_df = integrate_elo_probabilities(final_composition_changes_df, elo_eu)

In [51]:
def _margin_at_cutoff(row, second_col, third_col, fourth_col):
    """Return 3rd minus 2nd when third_qualify == 0, otherwise 4th minus 3rd, for one row."""
    if row['third_qualify'] == 0:
        return row[third_col] - row[second_col]
    return row[fourth_col] - row[third_col]


# pts_diff: difference in points between the two places selected by third_qualify
final_composition_changes_df['pts_diff'] = final_composition_changes_df.apply(
    _margin_at_cutoff,
    axis=1,
    second_col='2nd_points',
    third_col='3rd_points',
    fourth_col='4th_points',
)

# gls_diff: same comparison on goal difference
final_composition_changes_df['gls_diff'] = final_composition_changes_df.apply(
    _margin_at_cutoff,
    axis=1,
    second_col='2nd_goal_diff',
    third_col='3rd_goal_diff',
    fourth_col='4th_goal_diff',
)


In [52]:
# Remove columns that are no longer needed; names that are absent are skipped silently
obsolete_columns = ['points_diff', 'goals_diff', 'tiebreak_result']
final_composition_changes_df = final_composition_changes_df.drop(
    columns=obsolete_columns,
    errors='ignore',
)


In [53]:
# Column layout of the event-level table, assembled from three thematic groups
match_columns = [
    'year', 'stage', 'date', 'time', 'goal_minute', 'half_time', 'home_team',
    'away_team', 'scorer_team', 'new_top_teams',
]
standings_columns = [
    '1st', '1st_points', '1st_goal_diff', '1st_last_game_points',
    '2nd', '2nd_points', '2nd_goal_diff', '2nd_last_game_points',
    '3rd', '3rd_points', '3rd_goal_diff', '3rd_last_game_points',
    '4th', '4th_points', '4th_goal_diff', '4th_last_game_points',
]
qualification_columns = [
    'third_qualify', 'qualified_teams', 'qual_changed', 'qual_count',
    'pts_diff', 'gls_diff', 'elo_home', 'elo_away',
]
column_order = match_columns + standings_columns + qualification_columns

# Keep exactly these columns, in this order
final_composition_changes_df = final_composition_changes_df[column_order]



In [54]:
# Discard rows that have goal_minute == 0 while both home_team and away_team are filled in
minute_is_zero = final_composition_changes_df["goal_minute"] == 0
has_both_teams = (
    final_composition_changes_df["home_team"].notna()
    & final_composition_changes_df["away_team"].notna()
)

# Keep everything else and renumber the index from 0
filtered_df = final_composition_changes_df[~(minute_is_zero & has_both_teams)].reset_index(drop=True)

# Continue working with the filtered table under the original name
final_composition_changes_df = filtered_df


In [55]:
# Restore the chronological ordering: year, date and time first, then half_time,
# goal_minute and stage. Missing values in any sort key are placed first.
chronological_keys = ['year', 'date', 'time', 'half_time', 'goal_minute', 'stage']
final_composition_changes_df = (
    final_composition_changes_df
    .sort_values(by=chronological_keys, na_position='first')
    .reset_index(drop=True)
)


In [56]:
# Write the event-level table to Excel, leaving out the DataFrame index
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\goals_eu_fifa_men.xlsx'
final_composition_changes_df.to_excel(
    excel_writer=file_path,
    index=False,
)


# Minute-by-minute dataframe

In [57]:
# Expand the goal-event table into a minute-by-minute table: each event row is
# repeated for every minute from its own goal_minute up to (but not including) the
# next event's goal_minute, and the last event row is carried through to the final
# minute of the timeline.
expanded_rows = []

# Rows sharing the same date and time are expanded together as one timeline
for (date, time), group in final_composition_changes_df.groupby(["date", "time"]):
    group = group.sort_values(by="goal_minute").reset_index(drop=True)

    # The timeline ends at minute 90, or at the latest goal_minute if that is later.
    # The maximum is cast to int because goal_minute can be a float column and
    # range() below only accepts integer bounds.
    if group["goal_minute"].isna().all():
        max_minute = 90
    else:
        max_minute = max(90, int(group["goal_minute"].max(skipna=True)))

    # Fill the minutes between consecutive events with the state of the earlier event.
    # When two events share a minute the range is empty, so only the later row
    # is emitted for that minute.
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]

        for match_minute in range(int(current_row["goal_minute"]), int(next_row["goal_minute"])):
            new_row = current_row.copy()
            new_row["match_minute"] = match_minute

            # goal_minute and scorer_team are kept only on the minute of the event itself
            if match_minute != current_row["goal_minute"]:
                new_row["goal_minute"] = None
                new_row["scorer_team"] = None

            expanded_rows.append(new_row)

    # Carry the last event of the group through to the end of the timeline
    last_row = group.iloc[-1].copy()
    last_goal_minute = int(last_row["goal_minute"])

    for match_minute in range(last_goal_minute, max_minute + 1):
        new_row = last_row.copy()
        new_row["match_minute"] = match_minute
        new_row["goal_minute"] = None if match_minute > last_goal_minute else last_row["goal_minute"]
        new_row["scorer_team"] = None if match_minute > last_goal_minute else last_row["scorer_team"]
        expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Same layout as the event-level table, with match_minute inserted before goal_minute
column_order = [
    'year', 'stage', 'date', 'time', 'match_minute', 'goal_minute', 'half_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', 
    '1st', '1st_points', '1st_goal_diff', '1st_last_game_points',
    '2nd', '2nd_points', '2nd_goal_diff', '2nd_last_game_points',
    '3rd', '3rd_points', '3rd_goal_diff', '3rd_last_game_points',
    '4th', '4th_points', '4th_goal_diff', '4th_last_game_points',
    'third_qualify', 'qualified_teams', 'qual_changed', 'qual_count', 
    'pts_diff', 'gls_diff', 'elo_home', 'elo_away'
]
# Ensure the expanded dataframe follows the defined column order
expanded_df = expanded_df[column_order]


In [58]:
# Write the minute-by-minute table to Excel, leaving out the DataFrame index
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\mbm_eu_fifa.xlsx'
expanded_df.to_excel(
    excel_writer=file_path,
    index=False,
)
