# Libraries

In [44]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta
import re

# Load and inspect dataset

In [45]:
# Get the current user's name
user = getuser()

# Construct the path using the user's name
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Run the notebook
%run $function_path

In [46]:
# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Read the dataset with a different encoding
df = pd.read_excel(data_path)

# Now, filter the data for tournaments after 1984
df = df[df['year'] >= 1984]

display(df.head())

Unnamed: 0,stage,year,home_team,away_team,score,half_time,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date,referee_natinality
0,Group A,1986,Bulgaria,Italy,1–1,2,Estadio Azteca,Mexico City,96000.0,Erik Fredriksson,...,0,0,1,1,0,0,0,,31 May 1986,
1,Group A,1986,Bulgaria,Italy,1–1,1,Estadio Azteca,Mexico City,96000.0,Erik Fredriksson,...,0,0,1,1,0,0,0,,31 May 1986,
2,Group A,1986,Argentina,South Korea,3–1,1,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,
3,Group A,1986,Argentina,South Korea,3–1,2,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,
4,Group A,1986,Argentina,South Korea,3–1,1,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,


In [47]:
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_wc.xlsx'
elo_wc = pd.read_excel(data_path)

In [48]:
# Dictionary of replacements for team names
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}

In [49]:
# Prepare the Elo dataset for integration: rename columns for clarity
elo_wc = elo_wc.rename(columns={"team": "team_name", "elo_rating": "elo"})
elo_wc["team_name"] = elo_wc["team_name"].replace(replacements)

# Apply replacements to the relevant columns
df[["home_team", "away_team", "scorer_nationality"]] = df[["home_team", "away_team", "scorer_nationality"]].replace(replacements)

# Clean, transfrom, create variables

## duplicates 

In [50]:
# # Remove duplicates
# goals_df = goals_df.drop_duplicates()

In [51]:
# Columns to check for duplicates
columns_to_check = ['year', 'stage', 'home_team', 'scorer_nationality', 'long_date', 'half_time','time', 'goal_minute']

# Filter duplicates where both goals_home and goals_away are not 0
filtered_duplicates = df[
    (df.duplicated(subset=columns_to_check, keep=False)) &
    ~((df['goals_home'] == 0) & (df['goals_away'] == 0))
]

print("Filtered duplicates without both goals_home and goals_away being 0:")
display(filtered_duplicates)


Filtered duplicates without both goals_home and goals_away being 0:


Unnamed: 0,stage,year,home_team,away_team,score,half_time,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date,referee_natinality


## time

In [52]:
# Apply the conversion function
df['local_time'] = df['time'].apply(convert_time_to_utc)


Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00C

## date

In [53]:
# Step 1: Remove any commas before the year in 'long_date'
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Step 2: Convert 'long_date' to 'short_date' where 'short_date' is missing
df['short_date'] = df['short_date'].fillna(pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d'))


In [54]:
# Step 1: Remove comma only for year 1994
df.loc[df['year'] == 1994, 'long_date'] = df.loc[df['year'] == 1994, 'long_date'].str.replace(',', '', regex=True)

# Step 2: Convert 'long_date' to 'short_date' where 'short_date' is missing
df.loc[df['year'] == 1994, 'short_date'] = df.loc[df['year'] == 1994, 'short_date'].fillna(
    pd.to_datetime(df.loc[df['year'] == 1994, 'long_date'], errors='coerce', format='%B %d %Y').dt.strftime('%Y-%m-%d')
)

# Step 3: Convert all other years normally
df.loc[df['year'] != 1994, 'short_date'] = df.loc[df['year'] != 1994, 'short_date'].fillna(
    pd.to_datetime(df.loc[df['year'] != 1994, 'long_date'], errors='coerce').dt.strftime('%Y-%m-%d')
)

# Step 4: Identify missing conversions again
missing_dates = df[df['short_date'].isna()]

if not missing_dates.empty:
    print("\n⚠️ WARNING: Some dates are still missing after format correction.")
    print(missing_dates[['year', 'long_date', 'short_date']])

# Step 5: Check specific cases for year 1994
missing_1994 = df[(df['year'] == 1994) & df['short_date'].isna()]
if not missing_1994.empty:
    print("\n🔍 DEBUG: Still missing short_date for year 1994:")
    print(missing_1994[['year', 'long_date', 'short_date']])


# stage

In [55]:
# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)

  .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())


Unnamed: 0,year,stage,team_list,team_count
0,1986,Final,"[Argentina, West Germany]",2
7,1986,Quarter-finals,"[Brazil, West Germany, Argentina, Spain, Franc...",8
8,1986,Round of 16,"[Mexico, Soviet Union, Brazil, Argentina, Ital...",16
10,1986,Third place play-off,"[Belgium, France]",2
17,1990,Quarter-finals,"[Argentina, Yugoslavia]",2
18,1990,Round of 16,"[Ireland, Romania]",2
19,1990,not applicable,"[Cameroon, Czechoslovakia, Brazil, West German...",15
20,1994,Final,"[Brazil, Italy]",2
27,1994,Quarterfinals,"[Italy, Netherlands, Bulgaria, Romania, Spain,...",8
28,1994,Round of 16,"[Germany, Spain, Saudi Arabia, Romania, Nether...",16


In [56]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarterfinals', 'Quarter-finals', 'Round of 16', 'Semi-finals', 'Semifinals','Final', 'Third place play-off', 'Third place playoff', 'not applicable'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time','short_date','local_time', 'score']]

#Sort goals_df by short_date, local_time, and goal_minute
goals_df = goals_df.sort_values(by=['short_date', 'local_time', 'goal_minute', 'half_time'], ascending=[True, True, True, True])

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
1,1986,Group A,Bulgaria,Italy,Italy,44,1,1986-05-31,18:00,1–1
0,1986,Group A,Bulgaria,Italy,Bulgaria,85,2,1986-05-31,18:00,1–1
55,1986,Group D,Spain,Brazil,,0,1,1986-06-01,18:00,0–1
56,1986,Group D,Spain,Brazil,Brazil,62,2,1986-06-01,18:00,0–1
34,1986,Group C,Canada,France,,0,1,1986-06-01,22:00,0–1


In [57]:
df_missing = goals_df[(goals_df['year'] == 2006) & (goals_df['stage'] == 'Group C')]
df_missing

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
769,2006,Group C,Argentina,Ivory Coast,Argentina,24,1,2006-06-10,21:00,2–1
770,2006,Group C,Argentina,Ivory Coast,Argentina,38,1,2006-06-10,21:00,2–1
771,2006,Group C,Argentina,Ivory Coast,Ivory Coast,82,2,2006-06-10,21:00,2–1
772,2006,Group C,Serbia and Montenegro,Netherlands,,0,1,2006-06-11,15:00,0–1
773,2006,Group C,Serbia and Montenegro,Netherlands,Netherlands,18,1,2006-06-11,15:00,0–1
780,2006,Group C,Argentina,Serbia and Montenegro,,0,1,2006-06-16,15:00,6–0
774,2006,Group C,Argentina,Serbia and Montenegro,Argentina,6,1,2006-06-16,15:00,6–0
776,2006,Group C,Argentina,Serbia and Montenegro,Argentina,31,1,2006-06-16,15:00,6–0
775,2006,Group C,Argentina,Serbia and Montenegro,Argentina,41,1,2006-06-16,15:00,6–0
777,2006,Group C,Argentina,Serbia and Montenegro,Argentina,78,2,2006-06-16,15:00,6–0


# team counts

In [58]:
# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)

  .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())


Unnamed: 0,year,stage,team_list,team_count


# stage counts

In [59]:
# Group by year and collect unique values of stage in a list
stage_summary = df.groupby('year')['stage'].unique().reset_index()

# Rename column for clarity
stage_summary.rename(columns={'stage': 'unique_stages'}, inplace=True)

# Add a column for the number of unique stages
stage_summary['num_unique_stages'] = stage_summary['unique_stages'].apply(len)

# Display the summary
display(stage_summary)


Unnamed: 0,year,unique_stages,num_unique_stages
0,1986,"[Group A, Group B, Group C, Group D, Group E, ...",6
1,1990,"[Group B, Group D, Group A, Group C, Group F, ...",6
2,1994,"[Group A, Group B, Group C, Group D, Group E, ...",6
3,1998,"[Group A, Group B, Group C, Group D, Group E, ...",8
4,2002,"[Group A, Group B, Group C, Group D, Group E, ...",8
5,2006,"[Group A, Group B, Group C, Group D, Group E, ...",8
6,2010,"[Group A, Group B, Group C, Group D, Group E, ...",8
7,2014,"[Group A, Group B, Group C, Group D, Group E, ...",8
8,2018,"[Group A, Group B, Group C, Group D, Group E, ...",8
9,2022,"[Group A, Group B, Group C, Group D, Group E, ...",8


# Recreate Leauge Table after first two matchdays

In [60]:

# Assuming goals_df is your original dataset
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [61]:
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [62]:
# Filter for year == 2022 and stage == 'Group E'
home2022e = home_games[(home_games['year'] == 2022) & (home_games['stage'] == 'Group E')]
home2022e

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
216,2022,Group E,Germany,1,2,0,1
217,2022,Group E,Japan,0,1,0,1
218,2022,Group E,Spain,8,1,4,2


In [63]:
# Filter for year == 2022 and stage == 'Group E'
away2022e = away_games[(away_games['year'] == 2022) & (away_games['stage'] == 'Group E')]
away2022e

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
216,2022,Group E,Costa Rica,1,7,3,2
217,2022,Group E,Germany,1,1,1,1
218,2022,Group E,Japan,2,1,3,1


## aggregate data after first two match days

In [64]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

No missing values in the specified columns.
No observations where total_matches == 1.
No observations where total_matches == 0.

=== Applying Tiebreaker ===
Row1: year                       1986
stage                   Group A
team                   Bulgaria
home_team              Bulgaria
goals_scored_home           1.0
goals_conceded_home         1.0
points_home                 1.0
match_count_home            1.0
away_team              Bulgaria
goals_scored_away           1.0
goals_conceded_away         1.0
points_away                 1.0
match_count_away            1.0
goals_scored                2.0
goals_conceded              2.0
points                      2.0
total_matches               2.0
goals_difference            0.0
tiebreaker              no need
tie_won                       0
Name: 1, dtype: object
Row2: year                      1986
stage                  Group A
team                     Italy
home_team                Italy
goals_scored_home          1.0
goals_concede

In [65]:
# Filtering all_games_before_last for the year 2022 and stage 'Group E'
group2022e = all_games_before_last[
    (all_games_before_last['year'] == 2022) & 
    (all_games_before_last['stage'] == 'Group E')
]

display(group2022e)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won
280,2022,Group E,Spain,8,1,4,7,2,1,no need,0
281,2022,Group E,Japan,2,2,3,0,2,2,Japan,1
282,2022,Group E,Costa Rica,1,7,3,-6,2,3,Japan,0
283,2022,Group E,Germany,2,3,1,-1,2,4,no need,0


# Recreate league table after last match day

In [66]:
# Initialize an empty list to store the results for each pair
all_results = []

# Define unique pairs and ensure correct order based on goals_last_day_sorted
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()
# Merge with goals_last_day_sorted to get the correct order
unique_pairs = unique_pairs.merge(
    goals_last_day_sorted[['year', 'stage', 'short_date','local_time', 'half_time']].drop_duplicates(),
    how='left',
    on=['year', 'stage']
).sort_values(by=['year', 'short_date','local_time', 'half_time', 'stage']).reset_index(drop=True)

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    group_name = row['stage']
    
    # Apply the function to the current pair
    result = fifa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_wc = pd.concat(all_results)


# Keep only the specified columns
changes_df_wc = changes_df_wc[['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1986, Group C Before Last Match Goals ===

        team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
Soviet Union             4                   7                     1                       6                        3                          1
      France             4                   2                     1                       1                        3                          2
     Hungary             3                   2                     6                      -4                        2                          3
      Canada             1                   0                     3                      -3                        0                          4


Analyzing goal: 0 minute, 1 half time, Player team: nan, Home: Hungary, Away: France

=== Teams with Identical Points (Tied Teams) ===

           team  total_points
8  Soviet Union             4
9 

In [67]:
# # Exporting final df
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tb_wc_fifa_men.xlsx'
# changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [68]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df, third_place_df = gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Define the desired column order

column_order = [
    'year', 'stage', 'date', 'time', 'change_num', 'goal_minute', 'half_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', 
    '1st', '1st_points', '1st_goal_diff', '1st_last_game_points',
    '2nd', '2nd_points', '2nd_goal_diff', '2nd_last_game_points',
    '3rd', '3rd_points', '3rd_goal_diff', '3rd_last_game_points',
    '4th', '4th_points', '4th_goal_diff', '4th_last_game_points',
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]



# Reorder the columns
final_composition_changes_df = final_composition_changes_df[column_order]

# Identify rows where both 'date' and 'time' are NaN
mask = final_composition_changes_df['date'].isna() & final_composition_changes_df['time'].isna()

# Use .shift(-1) to get the values from the following row and fill in the NaN rows
final_composition_changes_df.loc[mask, ['date', 'time']] = final_composition_changes_df.loc[mask, ['date', 'time']].fillna(
    final_composition_changes_df[['date', 'time']].shift(-1)
)

# Ensure 'date' is in datetime format and handle errors
# final_composition_changes_df['date'] = pd.to_datetime(final_composition_changes_df['date'], errors='coerce')

# Ensure 'time' is in proper datetime.time format and handle errors
final_composition_changes_df['time'] = pd.to_datetime(final_composition_changes_df['time'], format='%H:%M', errors='coerce').dt.time

# Drop rows with missing or invalid 'date' or 'time'
final_composition_changes_df = final_composition_changes_df.dropna(subset=['date', 'time'])

# Combine 'date' and 'time' into a single datetime column for proper sorting
final_composition_changes_df['datetime'] = pd.to_datetime(
    final_composition_changes_df['date'].astype(str) + ' ' + final_composition_changes_df['time'].astype(str),
    errors='coerce'
)

# Drop rows with invalid datetime values
final_composition_changes_df = final_composition_changes_df.dropna(subset=['datetime'])

# Sort by year and the combined datetime column
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'datetime']).reset_index(drop=True)

# Drop the combined datetime column if not needed
final_composition_changes_df = final_composition_changes_df.drop(columns=['datetime'])



=== STEP 1: Initial Standings for Group C, 1986 (Goal Time = 0) ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
Soviet Union             4                   7                     1                       6                          1
      France             4                   2                     1                       1                          2
     Hungary             3                   2                     6                      -4                          3
      Canada             1                   0                     3                      -3                          4



=== Tied after goal at minute 0 1 half time by nan in Group C, edition 1986 ===
Number of tied teams: 2
           team  total_points  total_goals_scored  total_goals_conceded  \
8  Soviet Union             4                   7                     1   
9        France             4                   2                     1   

## manually modify order of tie teams by drawing of lots

In [69]:
mask = (final_composition_changes_df['year'] == 1990) & \
       (final_composition_changes_df['stage'] == "Group F") & \
       (final_composition_changes_df['goal_minute'] == 71)

final_composition_changes_df.loc[mask, ['2nd', '3rd']] = final_composition_changes_df.loc[mask, ['3rd', '2nd']].values


## manually modify order of tie teams by fair play points

In [70]:
mask = (final_composition_changes_df['year'] == 2018) & \
       (final_composition_changes_df['stage'] == "Group H") & \
       (final_composition_changes_df['goal_minute'] == 74)

final_composition_changes_df.loc[mask, ['2nd', '3rd']] = final_composition_changes_df.loc[mask, ['3rd', '2nd']].values


# minute by minute dataframe

In [71]:
# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over rows grouped by date and time
for (date, time), group in final_composition_changes_df.groupby(["date", "time"]):
    group = group.sort_values(by="goal_minute").reset_index(drop=True)
    
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]
        
        # Create the range of match_minute values between current and next row
        for match_minute in range(current_row["goal_minute"], next_row["goal_minute"] + 1):
            new_row = current_row.copy()
            new_row["match_minute"] = match_minute
            
            # Leave specific fields empty for interpolated rows
            if match_minute != current_row["goal_minute"]:
                new_row["goal_minute"] = None
                new_row["home_team"] = None
                new_row["away_team"] = None
                new_row["scorer_team"] = None
            
            expanded_rows.append(new_row)
    
    # Add the last row of the group as it is
    last_row = group.iloc[-1].copy()
    last_row["match_minute"] = last_row["goal_minute"]
    expanded_rows.append(last_row)
    
    # Ensure the match_minute reaches 90
    if last_row["goal_minute"] < 90:
        for match_minute in range(last_row["goal_minute"] + 1, 91):
            new_row = last_row.copy()
            new_row["match_minute"] = match_minute
            new_row["goal_minute"] = None
            new_row["home_team"] = None
            new_row["away_team"] = None
            new_row["scorer_team"] = None
            expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Reset the index for clarity
expanded_df = expanded_df.reset_index(drop=True)

# Define the desired column order

column_order = [
    'year', 'stage', 'date', 'time', 'change_num', 'goal_minute', 'half_time', 'home_team', 
    'away_team', 'scorer_team', 'new_top_teams', 
    '1st', '1st_points', '1st_goal_diff', '1st_last_game_points',
    '2nd', '2nd_points', '2nd_goal_diff', '2nd_last_game_points',
    '3rd', '3rd_points', '3rd_goal_diff', '3rd_last_game_points',
    '4th', '4th_points', '4th_goal_diff', '4th_last_game_points',
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]


# Reorder the columns
expanded_df = expanded_df[column_order]

# Remove the columns 'change_num' and 'changed' from the DataFrame
expanded_df = expanded_df.drop(columns=['change_num', 'changed'], errors='ignore')


In [72]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\mbm_wc_fifa.xlsx'
expanded_df.to_excel(file_path, index=False)


# best four third placed

In [73]:
# Filter the dataset to include only years <= 1994
third_place_df = third_place_df[third_place_df['year'] <= 1994].copy()

third_place_df = ensure_goal_minute_zero(third_place_df, all_games_before_last)

# Remove duplicates
third_place_df = third_place_df.drop_duplicates()

# Fill missing 'date' and 'time' with the next row's value (backward fill)
third_place_df[['date', 'time']] = third_place_df[['date', 'time']].fillna(method='bfill')


# Define file path for saving
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_place_wc_fifa.xlsx'

# Export to Excel
# third_place_df.to_excel(file_path, index=False)

print(f"Filtered and deduplicated third-place data saved to {file_path}")


[INFO] Removing existing `goal_minute = 0` entries...
[INFO] Creating `goal_minute = 0` entries from `all_games_before_last`
[INFO] Adding 18 new `goal_minute = 0` entries.
[INFO] `goal_minute = 0` entries updated successfully.
Filtered and deduplicated third-place data saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_place_wc_fifa.xlsx


  third_place_df[['date', 'time']] = third_place_df[['date', 'time']].fillna(method='bfill')


### resolving ties using yellow cards

In [74]:
team_priority = {
  
}


In [75]:
initial_third_teams_df = initial_third_teams(third_place_df)

In [76]:
initial_third_teams_df

Unnamed: 0,year,goal_minute,date,time,change_flag,change_count,top_four,last_two,tied_team,Group B,Group A,Group F,Group C,Group D,Group E
0,1986,0,1986-06-09,18:00,0,0,"[[Belgium, 3, 0, 3], [Italy, 3, 0, 2], [Morocc...","[[Northern Ireland, 2, -1, 2], [Uruguay, 2, -5...",[],"[Belgium, 3, 0, 3]","[Italy, 3, 0, 2]","[Morocco, 3, 0, 0]","[Hungary, 3, -4, 2]","[Northern Ireland, 2, -1, 2]","[Uruguay, 2, -5, 2]"
1,1990,0,1990-06-18,21:00,0,0,"[[Argentina, 3, 1, 2], [Costa Rica, 3, 0, 1], ...","[[Uruguay, 2, -2, 1], [Austria, 1, -2, 0]]","[Ireland, Costa Rica]","[Argentina, 3, 1, 2]","[Austria, 1, -2, 0]","[Ireland, 3, 0, 1]","[Costa Rica, 3, 0, 1]","[Yugoslavia, 3, -2, 2]","[Uruguay, 2, -2, 1]"
2,1994,0,1994-06-26,20:00,0,0,"[[Bulgaria, 4, 1, 4], [Netherlands, 4, 0, 2], ...","[[South Korea, 3, 0, 2], [Cameroon, 2, -3, 2]]",[],"[Cameroon, 2, -3, 2]","[Romania, 4, -1, 4]","[Netherlands, 4, 0, 2]","[South Korea, 3, 0, 2]","[Bulgaria, 4, 1, 4]","[Italy, 4, 0, 1]"


In [77]:
third_teams_df = third_teams(third_place_df, initial_third_teams_df, team_priority)


Processing: Year=1986, Stage=Group C, Goal Minute=0
--- YEAR CHANGED: 1986 --- Resetting group values to first row of structured_df
Before Update: {'Group A': ['Italy', 3, 0, 2], 'Group B': ['Belgium', 3, 0, 3], 'Group C': ['Hungary', 3, -4, 2], 'Group D': ['Northern Ireland', 2, -1, 2], 'Group E': ['Uruguay', 2, -5, 2], 'Group F': ['Morocco', 3, 0, 0]}
After Update: {'Group A': ['Italy', 3, 0, 2], 'Group B': ['Belgium', 3, 0, 3], 'Group C': ['Hungary', 3, -4, 2], 'Group D': ['Northern Ireland', 2, -1, 2], 'Group E': ['Uruguay', 2, -5, 2], 'Group F': ['Morocco', 3, 0, 0]}

Processing: Year=1986, Stage=Group C, Goal Minute=29
Before Update: {'Group A': ['Italy', 3, 0, 2], 'Group B': ['Belgium', 3, 0, 3], 'Group C': ['Hungary', 3, -4, 2], 'Group D': ['Northern Ireland', 2, -1, 2], 'Group E': ['Uruguay', 2, -5, 2], 'Group F': ['Morocco', 3, 0, 0]}
After Update: {'Group A': ['Italy', 3, 0, 2], 'Group B': ['Belgium', 3, 0, 3], 'Group C': ['Hungary', 2, -5, 2], 'Group D': ['Northern Ireland

In [78]:
third_teams_df = third_track(third_teams_df)

In [79]:
# Define output path and save
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa.xlsx'
third_teams_df.to_excel(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa.xlsx


### variable to track whether third teams qualifies or not

In [80]:
def format_time_column(df, column_name):
    """
    Ensures the time column is in HH:MM format without losing data.
    It removes seconds if present and keeps values unchanged if already correct.
    """
    def clean_time(value):
        if isinstance(value, str):
            match = re.match(r'(\d{1,2}):(\d{2}):(\d{2})', value)  # Matches HH:MM:SS
            if match:
                return f"{match.group(1)}:{match.group(2)}"  # Keep only HH:MM
            match = re.match(r'(\d{1,2}):(\d{2})', value)  # Matches HH:MM
            if match:
                return value  # Already correct
        return value  # Keep original if not a string

    df[column_name] = df[column_name].astype(str).apply(clean_time)
    return df

# Apply to final_composition_changes_df
final_composition_changes_df = format_time_column(final_composition_changes_df, 'time')


In [81]:
# Initialize third_qualify to 0
final_composition_changes_df['third_qualify'] = 0

# Process only years <= 1994
for index, row in final_composition_changes_df.iterrows():
    if row['year'] <= 1994:
        # Find matching row in third_teams_df with additional conditions
        match = third_teams_df[
            (third_teams_df['year'] == row['year']) &
            (third_teams_df['goal_minute'] == row['goal_minute']) &
            (third_teams_df['date'] == row['date']) &
            (third_teams_df['time'] == row['time']) &
            (third_teams_df['stage'] == row['stage'])
        ]

        if not match.empty:
            # Check if the 3rd place team is in the top four third-placed teams
            top_four_teams = match.iloc[0]['top_four']
            if isinstance(top_four_teams, list) and row['3rd'] in top_four_teams:
                final_composition_changes_df.at[index, 'third_qualify'] = 1

# Create qualified_teams list: always includes 1st and 2nd, includes 3rd only if third_qualify == 1
final_composition_changes_df['qualified_teams'] = final_composition_changes_df.apply(
    lambda row: [row['1st'], row['2nd']] + ([row['3rd']] if row['third_qualify'] == 1 else []),
    axis=1
)

# Remove the columns 'change_num' and 'changed' from the DataFrame
final_composition_changes_df = final_composition_changes_df.drop(columns=['change_num', 'changed'], errors='ignore')


### variables to track the composition of the teams qualifying

In [82]:

# Ensure goal_minute is numeric for sorting
final_composition_changes_df['goal_minute'] = pd.to_numeric(final_composition_changes_df['goal_minute'], errors='coerce')

# Sort by year, stage, and goal_minute to process in correct order
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'stage', 'goal_minute']).reset_index(drop=True)

# Initialize qual_changed and qual_count columns
final_composition_changes_df['qual_changed'] = 0
final_composition_changes_df['qual_count'] = 0

# Track changes per stage
qual_count_tracker = {}

# Loop through each year and stage to check for changes
for (year, stage), group in final_composition_changes_df.groupby(['year', 'stage']):
    previous_qualified = None
    qual_count = 0

    for index, row in group.iterrows():
        # Extract current qualified teams
        current_qualified = sorted(row['qualified_teams'])  # Sort to ignore order changes

        # Check if qualified_teams has changed from the previous row
        if previous_qualified is not None and current_qualified != previous_qualified:
            final_composition_changes_df.at[index, 'qual_changed'] = 1
            qual_count += 1

        # Update tracking
        previous_qualified = current_qualified
        final_composition_changes_df.at[index, 'qual_count'] = qual_count

In [83]:
final_composition_changes_df = integrate_elo_probabilities(final_composition_changes_df, elo_wc)

In [84]:
# Define pt_diff and gl_diff based on third_qualify condition
final_composition_changes_df['pts_diff'] = final_composition_changes_df.apply(
    lambda row: row['3rd_points'] - row['2nd_points']
    if row['third_qualify'] == 0 else row['4th_points'] - row['3rd_points'], axis=1
)

final_composition_changes_df['gls_diff'] = final_composition_changes_df.apply(
    lambda row: row['3rd_goal_diff'] - row['2nd_goal_diff']
    if row['third_qualify'] == 0 else row['4th_goal_diff'] - row['3rd_goal_diff'], axis=1
)


In [85]:
# Drop unnecessary columns
final_composition_changes_df = final_composition_changes_df.drop(columns=['points_diff', 'goals_diff', 'tiebreak_result'], errors='ignore')


In [86]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\goals_wc_fifa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)
