# Libraries

In [1]:
import ast
import math
import re
from collections import defaultdict
from datetime import datetime, timedelta
from getpass import getuser

import numpy as np
import pandas as pd
from scipy.stats import skellam

# Load and inspect dataset

In [2]:
# Get the current user's name; it is interpolated into every file path used
# in this notebook.
user = getuser()

# Construct the path to the helper notebook using the user's name.
# NOTE(review): the path assumes a Windows profile laid out as
# C:/Users/<user>/Documents/GitHub/tiebreak_wc - adjust on other machines.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Run the helper notebook inside this kernel's namespace. Helpers called
# further down (e.g. convert_time_to_utc, before_last, fifa_before_last) are
# not defined in this notebook and are presumably provided by this step.
%run $function_path

In [3]:
# Path to the men's World Cup goals workbook
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Load the workbook into a DataFrame
df = pd.read_excel(data_path)

# Keep tournaments from 1984 onwards. The explicit .copy() turns the filtered
# result into an independent frame, so the column assignments made in later
# cells (team-name replacements, local_time, long_date/short_date) write to
# `df` itself instead of to a slice of the full workbook, which would trigger
# pandas' SettingWithCopyWarning.
df = df[df['year'] >= 1984].copy()

display(df.head())

Unnamed: 0,stage,year,home_team,away_team,score,half_time,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date,referee_natinality
0,Group A,1986,Bulgaria,Italy,1–1,2,Estadio Azteca,Mexico City,96000.0,Erik Fredriksson,...,0,0,1,1,0,0,0,,31 May 1986,
1,Group A,1986,Bulgaria,Italy,1–1,1,Estadio Azteca,Mexico City,96000.0,Erik Fredriksson,...,0,0,1,1,0,0,0,,31 May 1986,
2,Group A,1986,Argentina,South Korea,3–1,1,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,
3,Group A,1986,Argentina,South Korea,3–1,2,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,
4,Group A,1986,Argentina,South Korea,3–1,1,Estadio Olímpico Universitario,Mexico City,60000.0,Victoriano Sánchez Arminio,...,0,0,3,1,0,0,0,,2 June 1986,


In [4]:
# Load the Elo ratings workbook. Note that `data_path` is reused here and now
# points at the Elo file rather than the goals file.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_wc.xlsx'
elo_wc = pd.read_excel(data_path)

In [5]:
# Load the cards/substitutions workbook (`data_path` is reused once more).
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\cards_subs_wc.xlsx'
cardsub_wc = pd.read_excel(data_path)

In [6]:
# Team-name harmonisation: each pair maps a spelling found in the raw data
# (left) to the spelling used throughout this analysis (right).
replacements = dict([
    ("Republic of Ireland", "Ireland"),
    ("CIS", "Commonwealth of Independent States"),
    ("FR Yugoslavia", "Yugoslavia"),
])

In [7]:
# Elo ratings: rename the columns for clarity and harmonise the team names
elo_wc = (
    elo_wc
    .rename(columns={"team": "team_name", "elo_rating": "elo"})
    .assign(team_name=lambda ratings: ratings["team_name"].replace(replacements))
)

# Goals data: apply the same name replacements to every team-valued column
team_name_columns = ["home_team", "away_team", "scorer_nationality"]
df[team_name_columns] = df[team_name_columns].replace(replacements)

# Clean, transform, create variables

In [8]:
# Columns whose combined values are compared when looking for repeated rows
columns_to_check = ['year', 'stage', 'home_team', 'scorer_nationality', 'long_date', 'half_time','time', 'goal_minute']

# Every row whose key columns occur more than once (all copies are flagged)
is_repeated = df.duplicated(subset=columns_to_check, keep=False)

# Rows where at least one of goals_home / goals_away is different from 0
home_is_zero = df['goals_home'] == 0
away_is_zero = df['goals_away'] == 0
has_goals = ~home_is_zero | ~away_is_zero

filtered_duplicates = df[is_repeated & has_goals]

print("Filtered duplicates without both goals_home and goals_away being 0:")
display(filtered_duplicates)


Filtered duplicates without both goals_home and goals_away being 0:


Unnamed: 0,stage,year,home_team,away_team,score,half_time,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute_et,extra_time,goals_home,goals_away,own_goal,penalty,goal_et,short_date,long_date,referee_natinality


## time

In [9]:
# Apply the conversion function to every raw kick-off time string.
# convert_time_to_utc is not defined in this notebook (presumably it comes
# from the helper notebook executed via %run); it prints a multi-line trace
# for every row it processes.
# NOTE(review): the helper's name and its "Final UTC time" log line suggest
# the stored value is UTC, although the column is called 'local_time' - confirm.
df['local_time'] = df['time'].apply(convert_time_to_utc)


Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00CST
Extracted local time: 12:00
Normalized 24-hour format: 12:00
Detected timezone acronym: CST, UTC Offset: -6
Final UTC time: 18:00

Original time string: 12:00C

In [10]:
# Spot-check the converted kick-off times for one group: 2014, Group C
columns_to_display = ['year', 'stage', 'short_date','time', 'local_time', 'home_team', 'away_team']

is_2014_group_c = (df['year'] == 2014) & (df['stage'] == "Group C")
df_filtered = df[is_2014_group_c]

# Show only the time-related and team columns
display(df_filtered[columns_to_display])


Unnamed: 0,year,stage,short_date,time,local_time,home_team,away_team
1161,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Greece
1162,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Greece
1163,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Greece
1164,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Greece
1165,2014,Group C,,22:00BRT(UTC−3),01:00,Ivory Coast,Japan
1166,2014,Group C,,22:00BRT(UTC−3),01:00,Ivory Coast,Japan
1167,2014,Group C,,22:00BRT(UTC−3),01:00,Ivory Coast,Japan
1168,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Ivory Coast
1169,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Ivory Coast
1170,2014,Group C,,13:00BRT(UTC−3),16:00,Colombia,Ivory Coast


## date

In [11]:
# Step 1: drop the comma that precedes a four-digit year in 'long_date'
# (", 1994" becomes " 1994")
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Step 2: parse 'long_date' (unparseable values become NaT) and use the
# resulting YYYY-MM-DD string wherever 'short_date' is missing
parsed_long_date = pd.to_datetime(df['long_date'], errors='coerce')
df['short_date'] = df['short_date'].fillna(parsed_long_date.dt.strftime('%Y-%m-%d'))


In [12]:
# Row selector for the 1994 tournament and the columns shown in the reports below
is_1994 = df['year'] == 1994
date_report_columns = ['year', 'long_date', 'short_date']

# Step 1: strip every comma from the 1994 'long_date' values
df.loc[is_1994, 'long_date'] = df.loc[is_1994, 'long_date'].str.replace(',', '', regex=True)

# Step 2: 1994 rows - parse with the explicit "<Month> <day> <year>" format
# and fill 'short_date' where it is still missing
parsed_1994 = pd.to_datetime(df.loc[is_1994, 'long_date'], errors='coerce', format='%B %d %Y')
df.loc[is_1994, 'short_date'] = df.loc[is_1994, 'short_date'].fillna(
    parsed_1994.dt.strftime('%Y-%m-%d')
)

# Step 3: all other years - let pandas infer the date format
parsed_other_years = pd.to_datetime(df.loc[~is_1994, 'long_date'], errors='coerce')
df.loc[~is_1994, 'short_date'] = df.loc[~is_1994, 'short_date'].fillna(
    parsed_other_years.dt.strftime('%Y-%m-%d')
)

# Step 4: report any row that still has no 'short_date'
missing_dates = df[df['short_date'].isna()]
if len(missing_dates) > 0:
    print("\n⚠️ WARNING: Some dates are still missing after format correction.")
    print(missing_dates[date_report_columns])

# Step 5: the same report restricted to 1994
missing_1994 = df[is_1994 & df['short_date'].isna()]
if len(missing_1994) > 0:
    print("\n🔍 DEBUG: Still missing short_date for year 1994:")
    print(missing_1994[date_report_columns])


# stage

In [13]:
# List the distinct teams per (year, stage). Stacking home_team above
# away_team and then grouping yields, for each stage, the unique teams in
# order of first appearance (home sides first, then away sides).
team_counts = (
    df.melt(id_vars=['year', 'stage'], value_vars=['home_team', 'away_team'], value_name='team_list')
    .groupby(['year', 'stage'])['team_list']
    .unique()
    .reset_index()
)

# Number of distinct teams in each stage
team_counts['team_count'] = team_counts['team_list'].map(len)

# A group stage has exactly 4 teams; everything else is listed for inspection
has_four_teams = team_counts['team_count'] == 4
invalid_groups = team_counts[~has_four_teams]

# Display the invalid groups
display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count
0,1986,Final,"[Argentina, West Germany]",2
7,1986,Quarter-finals,"[Brazil, West Germany, Argentina, Spain, Franc...",8
8,1986,Round of 16,"[Mexico, Soviet Union, Brazil, Argentina, Ital...",16
10,1986,Third place play-off,"[Belgium, France]",2
17,1990,Quarter-finals,"[Argentina, Yugoslavia]",2
18,1990,Round of 16,"[Ireland, Romania]",2
19,1990,not applicable,"[Cameroon, Czechoslovakia, Brazil, West German...",15
20,1994,Final,"[Brazil, Italy]",2
27,1994,Quarterfinals,"[Italy, Netherlands, Bulgaria, Romania, Spain,...",8
28,1994,Round of 16,"[Germany, Spain, Saudi Arabia, Romania, Nether...",16


In [14]:
# Stage labels (including spelling variants) that are not four-team groups
non_group_stages = ['Quarterfinals', 'Quarter-finals', 'Round of 16', 'Semi-finals', 'Semifinals','Final', 'Third place play-off', 'Third place playoff', 'not applicable']

# Keep only the group-stage rows
df = df[~df['stage'].isin(non_group_stages)]

# Columns describing each goal event and the match it belongs to
goal_event_columns = ['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time','short_date','local_time', 'score']

# Order the events by date, kick-off time, minute and half (all ascending)
goals_df = df[goal_event_columns].sort_values(
    by=['short_date', 'local_time', 'goal_minute', 'half_time']
)

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
1,1986,Group A,Bulgaria,Italy,Italy,44,1,1986-05-31,18:00,1–1
0,1986,Group A,Bulgaria,Italy,Bulgaria,85,2,1986-05-31,18:00,1–1
55,1986,Group D,Spain,Brazil,,0,1,1986-06-01,18:00,0–1
56,1986,Group D,Spain,Brazil,Brazil,62,2,1986-06-01,18:00,0–1
34,1986,Group C,Canada,France,,0,1,1986-06-01,22:00,0–1


In [15]:
# Inspect the goal events recorded for 2006, Group C
is_2006_group_c = (goals_df['year'] == 2006) & (goals_df['stage'] == 'Group C')
df_missing = goals_df[is_2006_group_c]
df_missing

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,half_time,short_date,local_time,score
769,2006,Group C,Argentina,Ivory Coast,Argentina,24,1,2006-06-10,21:00,2–1
770,2006,Group C,Argentina,Ivory Coast,Argentina,38,1,2006-06-10,21:00,2–1
771,2006,Group C,Argentina,Ivory Coast,Ivory Coast,82,2,2006-06-10,21:00,2–1
772,2006,Group C,Serbia and Montenegro,Netherlands,,0,1,2006-06-11,15:00,0–1
773,2006,Group C,Serbia and Montenegro,Netherlands,Netherlands,18,1,2006-06-11,15:00,0–1
780,2006,Group C,Argentina,Serbia and Montenegro,,0,1,2006-06-16,15:00,6–0
774,2006,Group C,Argentina,Serbia and Montenegro,Argentina,6,1,2006-06-16,15:00,6–0
776,2006,Group C,Argentina,Serbia and Montenegro,Argentina,31,1,2006-06-16,15:00,6–0
775,2006,Group C,Argentina,Serbia and Montenegro,Argentina,41,1,2006-06-16,15:00,6–0
777,2006,Group C,Argentina,Serbia and Montenegro,Argentina,78,2,2006-06-16,15:00,6–0


# team counts

In [16]:
# Re-run the team-count check now that `df` holds group-stage rows only.
# For every (year, stage), collect the distinct teams that appear as home or
# away side. The resulting `team_counts` frame is passed to fifa_before_last
# in a later cell.
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame; the unnamed values column is
# labelled 0 after reset_index and is renamed to 'team_list'.
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups (the recorded output below the cell has no rows)
display(invalid_groups)

Unnamed: 0,year,stage,team_list,team_count


# stage counts

In [17]:
# For each tournament year, list the distinct stage labels that remain
stage_summary = (
    df.groupby('year')['stage']
    .unique()
    .reset_index()
    .rename(columns={'stage': 'unique_stages'})
)

# Count the stages per year
stage_summary['num_unique_stages'] = stage_summary['unique_stages'].map(len)

# Display the summary
display(stage_summary)


Unnamed: 0,year,unique_stages,num_unique_stages
0,1986,"[Group A, Group B, Group C, Group D, Group E, ...",6
1,1990,"[Group B, Group D, Group A, Group C, Group F, ...",6
2,1994,"[Group A, Group B, Group C, Group D, Group E, ...",6
3,1998,"[Group A, Group B, Group C, Group D, Group E, ...",8
4,2002,"[Group A, Group B, Group C, Group D, Group E, ...",8
5,2006,"[Group A, Group B, Group C, Group D, Group E, ...",8
6,2010,"[Group A, Group B, Group C, Group D, Group E, ...",8
7,2014,"[Group A, Group B, Group C, Group D, Group E, ...",8
8,2018,"[Group A, Group B, Group C, Group D, Group E, ...",8
9,2022,"[Group A, Group B, Group C, Group D, Group E, ...",8


# Recreate league table after first two matchdays

In [18]:

# Split the group-stage goal events with the before_last helper (not defined
# in this notebook). It returns two objects, stored here as
# `agg_goals_before_last_day` and `goals_last_day_sorted`.
# NOTE(review): judging by the names, the first covers the matches before the
# final matchday and the second the final-matchday goals - confirm in the helper.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [19]:
# Aggregate goals, points and match counts per team, separately for matches
# played as the home side and as the away side (helper not defined here).
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [20]:
# Spot check: home-side aggregates for 2022, Group E
is_2022_group_e_home = (home_games['year'] == 2022) & (home_games['stage'] == 'Group E')
home2022e = home_games.loc[is_2022_group_e_home]
home2022e

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
216,2022,Group E,Germany,1,2,0,1
217,2022,Group E,Japan,0,1,0,1
218,2022,Group E,Spain,8,1,4,2


In [21]:
# Spot check: away-side aggregates for 2022, Group E
is_2022_group_e_away = (away_games['year'] == 2022) & (away_games['stage'] == 'Group E')
away2022e = away_games.loc[is_2022_group_e_away]
away2022e

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
216,2022,Group E,Costa Rica,1,7,3,2
217,2022,Group E,Germany,1,1,1,1
218,2022,Group E,Japan,2,1,3,1


## aggregate data after first two match days

In [22]:
# Build the standings after the first two matchdays by applying
# fifa_before_last (helper not defined in this notebook) to the home/away
# aggregates, the aggregated goals and the per-stage team lists.
# The helper prints a detailed tiebreaker trace while it runs.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

No missing values in the specified columns.
No observations where total_matches == 1.
No observations where total_matches == 0.

=== Applying Tiebreaker ===
Row1: year                       1986
stage                   Group A
team                   Bulgaria
home_team              Bulgaria
goals_scored_home           1.0
goals_conceded_home         1.0
points_home                 1.0
match_count_home            1.0
away_team              Bulgaria
goals_scored_away           1.0
goals_conceded_away         1.0
points_away                 1.0
match_count_away            1.0
goals_scored                2.0
goals_conceded              2.0
points                      2.0
total_matches               2.0
matches_flag                2.0
goals_difference            0.0
tiebreaker              no need
tie_won                       0
Name: 1, dtype: object
Row2: year                      1986
stage                  Group A
team                     Italy
home_team                Italy
goals_scored

In [23]:
# Spot check: standings after two matchdays for 2022, Group E
is_2022 = all_games_before_last['year'] == 2022
is_group_e = all_games_before_last['stage'] == 'Group E'
group2022e = all_games_before_last[is_2022 & is_group_e]

display(group2022e)

Unnamed: 0,year,stage,team,standing,points,goals_scored,goals_conceded,goals_difference,total_matches,tiebreaker,tie_won
280,2022,Group E,Spain,1,4,8,1,7,2,no need,0
281,2022,Group E,Japan,2,3,2,2,0,2,Japan,1
282,2022,Group E,Costa Rica,3,3,1,7,-6,2,Japan,0
283,2022,Group E,Germany,4,1,2,3,-1,2,no need,0


In [24]:
# Export the standings after the first two matchdays to Excel (without the
# DataFrame index). The target folder must already exist.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\before_last_wc_fifa.xlsx'
all_games_before_last.to_excel(file_path, index=False)


# Recreate league table after last match day

In [25]:
# Date and kick-off time of the final-matchday events of every group
last_day_schedule = goals_last_day_sorted[['year', 'stage', 'short_date','local_time']].drop_duplicates()

# One row per (year, stage) found in the standings, ordered by when the
# group's final matchday was played
unique_pairs = (
    all_games_before_last[['year', 'stage']]
    .drop_duplicates()
    .merge(last_day_schedule, how='left', on=['year', 'stage'])
    .sort_values(by=['year', 'short_date','local_time', 'stage'])
    .reset_index(drop=True)
)

# Run the final-matchday analysis for each pair and collect the outputs
all_results = []
for _, row in unique_pairs.iterrows():
    year, stage = row['year'], row['stage']
    result = fifa_final_wc(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    all_results.append(result)

# Stack the per-group results and keep only the standings/changes columns
output_columns = [ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_wc = pd.concat(all_results)[output_columns]



=== Initial Standings for Year 1986, Group C Before Last Match Goals ===

        team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
Soviet Union             4                   7                     1                       6                        3                          1
      France             4                   2                     1                       1                        3                          2
     Hungary             3                   2                     6                      -4                        2                          3
      Canada             1                   0                     3                      -3                        0                          4


Analyzing goal: 0 minute, 1 half time, Player team: nan, Home: Hungary, Away: France

=== Teams with Identical Points (Tied Teams) ===

           team  total_points
8  Soviet Union             4
9 

In [26]:
# # Exporting final df
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tb_wc_fifa_men.xlsx'
# changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [27]:
# Sanity check: number of rows in unique_pairs per tournament year (non-null
# counts for each remaining column).
unique_pairs.groupby('year').count()

Unnamed: 0_level_0,stage,short_date,local_time
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1986,6,6,6
1990,6,6,6
1994,6,6,6
1998,8,8,8
2002,8,8,8
2006,8,8,8
2010,8,8,8
2014,8,8,8
2018,8,8,8
2022,8,8,8


In [28]:
# Count the distinct kick-off times per (year, stage, date) on the last matchday
unique_pairs_grouped = (
    unique_pairs
    .groupby(['year', 'stage', 'short_date'])['local_time']
    .nunique()
    .reset_index()
    .rename(columns={'local_time': 'unique_local_times'})
)

# Rows with more than one distinct kick-off time on the same date
has_several_times = unique_pairs_grouped['unique_local_times'] > 1
highlighted_rows = unique_pairs_grouped[has_several_times]
highlighted_rows



Unnamed: 0,year,stage,short_date,unique_local_times


In [29]:
# Initialize an empty list to store the composition-change frame of each pair
all_composition_changes = []

# Loop through each unique (year, stage) pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply gap_composition (helper not defined in this notebook) to the
    # current pair; it returns a composition-change frame and a third-place frame.
    # NOTE(review): `third_place_df` is reassigned on every iteration, so after
    # the loop it holds only what the LAST call returned. A later cell filters
    # it for year <= 1994, which only works if the helper returns third-place
    # data accumulated across calls - confirm in the helper.
    composition_changes_df, third_place_df = gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day, competition = "WC")
    
    # Append the composition-change frame to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame with a fresh index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)


[INFO] Using competition filter: WC

=== STEP 1: Initial Standings for Group C, 1986 (Goal Time = 0) ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
Soviet Union             4                   7                     1                       6                          1
      France             4                   2                     1                       1                          2
     Hungary             3                   2                     6                      -4                          3
      Canada             1                   0                     3                      -3                          4



=== Tied after goal at minute 0 1 half time by nan in Group C, edition 1986 ===
Number of tied teams: 0
Empty DataFrame
Columns: [team, total_points, total_goals_scored, total_goals_conceded, total_goals_difference]
Index: []
[DEBUG] Saving Third Team - Year: 1986, Stage: Group C, Goal Minut


[DEBUG] Third Place Tracking Updated: {'year': 1986, 'stage': 'Group C', 'goal_minute': 58, 'half_time': 2, 'third_team': 'Hungary', 'total_points': 2, 'total_goals_difference': -5, 'total_goals_scored': 2, 'date': '1986-06-09', 'time': '18:00'}

=== STEP 3: Standings after goal at minute 58 2 half time by Soviet Union in Group C, edition 1986 ===
        team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
Soviet Union             5                   8                     1                       7                          1
      France             5                   3                     1                       2                          2
     Hungary             2                   2                     7                      -5                          3
      Canada             0                   0                     4                      -4                          4



=== Tied after goal at minute 62 2 half time b

## Manually modify the order of tied teams that was decided by drawing of lots

In [30]:
# Rows for 1990, Group F at goal minute 71
mask = (
    (final_composition_changes_df['year'] == 1990)
    & (final_composition_changes_df['stage'] == "Group F")
    & (final_composition_changes_df['goal_minute'] == 71)
)

# Exchange the teams listed in the '2nd' and '3rd' columns for those rows.
# Only these two columns are swapped; no other column is touched.
swapped_teams = final_composition_changes_df.loc[mask, ['3rd', '2nd']].to_numpy()
final_composition_changes_df.loc[mask, ['2nd', '3rd']] = swapped_teams


## Manually modify the order of tied teams that was decided by fair play points

In [31]:
# Rows for 2018, Group H at goal minute 74
is_2018 = final_composition_changes_df['year'] == 2018
is_group_h = final_composition_changes_df['stage'] == "Group H"
is_minute_74 = final_composition_changes_df['goal_minute'] == 74
mask = is_2018 & is_group_h & is_minute_74

# Exchange the teams listed in the '2nd' and '3rd' columns for those rows.
# Only these two columns are swapped; no other column is touched.
final_composition_changes_df.loc[mask, ['2nd', '3rd']] = (
    final_composition_changes_df.loc[mask, ['3rd', '2nd']].to_numpy()
)


# best four third placed

In [32]:
# Keep only tournaments up to and including 1994. `third_place_df` is the
# frame left behind by the gap_composition loop in an earlier cell.
third_place_df = third_place_df[third_place_df['year'] <= 1994].copy()

# Remove all observations with goal_minute == 0; fresh minute-0 rows are
# rebuilt by ensure_goal_minute_zero below
third_place_df = third_place_df[third_place_df['goal_minute'] != 0]

# Remove exact duplicate rows
third_place_df = third_place_df.drop_duplicates()


# Re-create the goal_minute == 0 rows from the standings after two matchdays
# (helper not defined in this notebook)
third_place_df = ensure_goal_minute_zero(third_place_df, all_games_before_last, add_initial_draw_point=True, competition="WC")


# Fill missing 'date' and 'time' with the next row's value (backward fill).
# DataFrame.bfill() is the supported spelling; fillna(method='bfill') is
# deprecated in current pandas releases and emits a FutureWarning.
third_place_df[['date', 'time']] = third_place_df[['date', 'time']].bfill()



[INFO] Removing existing `goal_minute = 0` entries...
[INFO] Using competition filter: WC
[INFO] Years included: [1986, 1990, 1994]
[INFO] Creating `goal_minute = 0` entries from `all_games_before_last`
[INFO] Adding 18 new `goal_minute = 0` entries.
[INFO] `goal_minute = 0` entries updated successfully.


In [33]:
# # Exporting final df
# file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_place_df.xlsx'
# third_place_df.to_excel(file_path, index=False)


### resolving ties using yellow cards

In [34]:
# Manual tie-resolution priorities per team; intentionally left empty here
team_priority = dict()


In [35]:
# Build the starting ranking of third-placed teams with the initial_third_teams
# helper (not defined in this notebook), without the initial draw point.
initial_third_teams_df = initial_third_teams(third_place_df, add_initial_draw_point=False)

In [36]:
# Display the starting ranking (the recorded output has one goal_minute = 0 row per year)
initial_third_teams_df

Unnamed: 0,year,goal_minute,date,time,change_flag,change_count,top_four,last_two,tied_team,Group B,Group A,Group F,Group C,Group D,Group E
0,1986,0,1986-06-09,18:00,0,0,"[[Belgium, 3, 0, 3], [Italy, 3, 0, 2], [Morocc...","[[Northern Ireland, 2, -1, 2], [Uruguay, 2, -5...",[],"[Belgium, 3, 0, 3]","[Italy, 3, 0, 2]","[Morocco, 3, 0, 0]","[Hungary, 3, -4, 2]","[Northern Ireland, 2, -1, 2]","[Uruguay, 2, -5, 2]"
1,1990,0,1990-06-18,21:00,0,0,"[[Argentina, 3, 1, 2], [Costa Rica, 3, 0, 1], ...","[[Uruguay, 2, -2, 1], [Austria, 1, -2, 0]]","[Ireland, Costa Rica]","[Argentina, 3, 1, 2]","[Austria, 1, -2, 0]","[Ireland, 3, 0, 1]","[Costa Rica, 3, 0, 1]","[Yugoslavia, 3, -2, 2]","[Uruguay, 2, -2, 1]"
2,1994,0,1994-06-26,20:00,0,0,"[[Bulgaria, 4, 1, 4], [Netherlands, 4, 0, 2], ...","[[South Korea, 3, 0, 2], [Cameroon, 2, -3, 2]]",[],"[Cameroon, 2, -3, 2]","[Romania, 4, -1, 4]","[Netherlands, 4, 0, 2]","[South Korea, 3, 0, 2]","[Bulgaria, 4, 1, 4]","[Italy, 4, 0, 1]"


In [37]:
# Track the third-placed teams across the goal events with the third_teams
# helper (not defined in this notebook); `team_priority` is passed as defined
# above, i.e. empty.
third_teams_df = third_teams(third_place_df, initial_third_teams_df, team_priority)

In [38]:
# Post-process the tracking frame with the third_track helper (not defined in
# this notebook); the result replaces `third_teams_df`.
third_teams_df = third_track(third_teams_df)

In [39]:
# Define the output path and save the third-placed-teams tracking frame to
# Excel (without the DataFrame index). The target folder must already exist.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa.xlsx'
third_teams_df.to_excel(file_path, index=False)

# Confirm where the file was written
print(f"Data successfully saved to {file_path}")


Data successfully saved to C:\Users\aldi\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa.xlsx


### Variable to track whether the third-placed team qualifies

In [40]:
# Normalise the 'time' column of final_composition_changes_df with the
# format_time_column helper (not defined in this notebook).
final_composition_changes_df = format_time_column(final_composition_changes_df, 'time')


In [41]:
# Convert 'time' to HH:MM strings; values that cannot be parsed become NaN
final_composition_changes_df["time"] = pd.to_datetime(final_composition_changes_df["time"], errors='coerce').dt.strftime('%H:%M')

# Step 1: Order the dataset by 'year', 'stage', 'half_time', and 'goal_minute'
# (all ascending), so that goal_minute == 0 is ranked at the top of its group
final_composition_changes_df = final_composition_changes_df.sort_values(
    by=["year", "stage", "half_time", "goal_minute"]
).reset_index(drop=True)

# Step 2: Fill 'time' and 'date' forward within each group of each tournament.
# The grouping must include 'year': stage labels such as "Group A" recur in
# every tournament, so grouping by 'stage' alone would let the last value of
# one year's group flow into missing leading rows of the next year's group.
# GroupBy.ffill() replaces the deprecated fillna(method="ffill").
group_keys = ["year", "stage"]
final_composition_changes_df["time"] = final_composition_changes_df.groupby(group_keys)["time"].ffill()
final_composition_changes_df["date"] = final_composition_changes_df.groupby(group_keys)["date"].ffill()


In [42]:
# Flag whether the team currently in 3rd place is among the best four
# third-placed teams at that moment (third_qualify = 1) or not (0).
# The flag starts at 0 for every row.
final_composition_changes_df['third_qualify'] = 0

# Only tournaments up to 1994 are checked; rows of later years keep the
# flag at 0. Selecting these rows once avoids testing the year inside the loop.
early_rows = final_composition_changes_df[final_composition_changes_df['year'] <= 1994]

for index, row in early_rows.iterrows():
    # One boolean selector per matching key, kept separate so that a failed
    # lookup can report exactly which key has no counterpart
    year_match = third_teams_df['year'] == row['year']
    goal_minute_match = third_teams_df['goal_minute'] == row['goal_minute']
    date_match = third_teams_df['date'] == row['date']
    time_match = third_teams_df['time'] == row['time']
    stage_match = third_teams_df['stage'] == row['stage']

    # Rows of third_teams_df that agree on all five keys
    match = third_teams_df[year_match & goal_minute_match & date_match & time_match & stage_match]

    if match.empty:
        # Report the failed lookup and every key that never occurs in third_teams_df
        print(f"\n⚠️ No match for index {index} | year: {row['year']} | goal_minute: {row['goal_minute']}")

        if not year_match.any():
            print(f" ❌ Year {row['year']} not found in third_teams_df")

        if not goal_minute_match.any():
            print(f" ❌ Goal_minute {row['goal_minute']} not found in third_teams_df | Unique values: {third_teams_df['goal_minute'].unique()}")

        if not date_match.any():
            print(f" ❌ Date {row['date']} not found in third_teams_df | Unique values: {third_teams_df['date'].unique()[:10]}")

        if not time_match.any():
            print(f" ❌ Time {row['time']} not found in third_teams_df | Unique values: {third_teams_df['time'].unique()[:10]}")

        if not stage_match.any():
            print(f" ❌ Stage {row['stage']} not found in third_teams_df | Unique values: {third_teams_df['stage'].unique()[:10]}")

        continue

    print(f"\n✅ Match found for index {index}:\n{match}")

    # The first matching row supplies the current best four third-placed teams
    top_four_teams = match.iloc[0]['top_four']
    print(f"Index {index} | top_four_teams: {top_four_teams} (type: {type(top_four_teams)})")

    # The list may be stored as its string representation; parse it back.
    # `ast` is imported with the other modules at the top of the notebook.
    if isinstance(top_four_teams, str):
        top_four_teams = ast.literal_eval(top_four_teams)

    # Set the flag only when the 3rd-placed team is in the top four
    if row['3rd'] not in top_four_teams:
        print(f"⚠️ Third-place team {row['3rd']} NOT in top_four_teams {top_four_teams}")
    else:
        final_composition_changes_df.at[index, 'third_qualify'] = 1

# Create qualified_teams list: always includes 1st and 2nd, includes 3rd only if third_qualify == 1
final_composition_changes_df['qualified_teams'] = final_composition_changes_df.apply(
    lambda row: [row['1st'], row['2nd']] + ([row['3rd']] if row['third_qualify'] == 1 else []),
    axis=1
)

# Remove the columns 'change_num' and 'changed' from the DataFrame (ignored if absent)
final_composition_changes_df = final_composition_changes_df.drop(columns=['change_num', 'changed'], errors='ignore')



✅ Match found for index 0:
   year        date   time  goal_minute  half_time    stage           Group A  \
6  1986  1986-06-10  18:00            0          1  Group A  [Italy, 3, 0, 2]   

              Group B              Group C                       Group D  \
6  [Belgium, 3, 0, 3]  [Hungary, 2, -7, 2]  [Northern Ireland, 2, -1, 2]   

               Group E             Group F  \
6  [Uruguay, 2, -5, 2]  [Morocco, 3, 0, 0]   

                                      top_four            last_two tied_teams  \
6  [Belgium, Italy, Morocco, Northern Ireland]  [Uruguay, Hungary]         []   

   change_flag  change_count  
6            0             1  
Index 0 | top_four_teams: ['Belgium', 'Italy', 'Morocco', 'Northern Ireland'] (type: <class 'list'>)

✅ Match found for index 1:
   year        date   time  goal_minute  half_time    stage  \
7  1986  1986-06-10  18:00            4          1  Group A   

                Group A             Group B              Group C  \
7  [Bulgaria, 


Index 11 | top_four_teams: ['Belgium', 'Morocco', 'Northern Ireland', 'Bulgaria'] (type: <class 'list'>)

✅ Match found for index 12:
    year        date   time  goal_minute  half_time    stage  \
17  1986  1986-06-11  18:00           54          2  Group B   

                 Group A             Group B              Group C  \
17  [Bulgaria, 2, -2, 2]  [Belgium, 3, 0, 4]  [Hungary, 2, -7, 2]   

                         Group D              Group E             Group F  \
17  [Northern Ireland, 2, -1, 2]  [Uruguay, 2, -5, 2]  [Morocco, 3, 0, 0]   

                                          top_four            last_two  \
17  [Belgium, Morocco, Northern Ireland, Bulgaria]  [Uruguay, Hungary]   

   tied_teams  change_flag  change_count  
17         []            0             4  
Index 12 | top_four_teams: ['Belgium', 'Morocco', 'Northern Ireland', 'Bulgaria'] (type: <class 'list'>)

✅ Match found for index 13:
    year        date   time  goal_minute  half_time    stage  \
18  1986 

### variables to track the composition of the teams qualifying

In [43]:
# Sort the dataset by 'year', 'date', 'time', then half, minute and stage;
# rows with missing values in a sort key are placed first. The index is reset
# afterwards. Note that the next cell sorts the frame again by other keys.
final_composition_changes_df = final_composition_changes_df.sort_values(by=['year', 'date', 'time', 'half_time', 'goal_minute', 'stage'], na_position='first').reset_index(drop=True)


In [44]:

# Ensure goal_minute is numeric for sorting (non-numeric values become NaN)
final_composition_changes_df['goal_minute'] = pd.to_numeric(final_composition_changes_df['goal_minute'], errors='coerce')

# Put the rows of every group in chronological order before comparing
# consecutive rows. 'half_time' is part of the key, as in the earlier sorts of
# this frame, so that an event of the first half can never be placed after an
# event of the second half that carries a smaller minute value.
final_composition_changes_df = final_composition_changes_df.sort_values(
    by=['year', 'stage', 'half_time', 'goal_minute']
).reset_index(drop=True)

# qual_changed: 1 on rows where the set of qualified teams differs from the
# previous row of the same group. qual_count: running number of such changes.
final_composition_changes_df['qual_changed'] = 0
final_composition_changes_df['qual_count'] = 0

# Kept for compatibility; nothing in this cell writes to it
qual_count_tracker = {}

# Walk through every (year, stage) group in row order
for (year, stage), group in final_composition_changes_df.groupby(['year', 'stage']):
    previous_qualified = None
    qual_count = 0

    for index, row in group.iterrows():
        # Sort the team names so a mere change of order is not counted
        current_qualified = sorted(row['qualified_teams'])

        # The first row of a group has nothing to be compared with
        if previous_qualified is not None and current_qualified != previous_qualified:
            final_composition_changes_df.at[index, 'qual_changed'] = 1
            qual_count += 1

        # Remember this row's teams and store the running count
        previous_qualified = current_qualified
        final_composition_changes_df.at[index, 'qual_count'] = qual_count

In [45]:
# Pass the frame and the Elo table (elo_wc) through integrate_elo_probabilities.
# NOTE(review): presumably this is where elo_home / elo_away are added — confirm in functions_fifa.
final_composition_changes_df = final_composition_changes_df.pipe(
    integrate_elo_probabilities, elo_wc
)

In [46]:
# pts_diff / gls_diff are gaps between two adjacent ranks:
#   third_qualify == 0  -> 3rd minus 2nd
#   any other value     -> 4th minus 3rd
final_composition_changes_df['pts_diff'] = final_composition_changes_df.apply(
    lambda standing: (
        standing['4th_points'] - standing['3rd_points']
        if standing['third_qualify'] != 0
        else standing['3rd_points'] - standing['2nd_points']
    ),
    axis=1,
)

final_composition_changes_df['gls_diff'] = final_composition_changes_df.apply(
    lambda standing: (
        standing['4th_goals_diff'] - standing['3rd_goals_diff']
        if standing['third_qualify'] != 0
        else standing['3rd_goals_diff'] - standing['2nd_goals_diff']
    ),
    axis=1,
)


In [47]:
# Remove columns that are no longer needed; errors='ignore' skips any that are absent.
final_composition_changes_df = final_composition_changes_df.drop(
    columns=['points_diff', 'goals_diff', 'tiebreak_result'],
    errors='ignore',
)


In [48]:
# Target column layout: match identifiers, one block of statistics per group rank,
# then the qualification / difference / Elo columns.
column_order = (
    ['year', 'stage', 'date', 'time', 'goal_minute', 'half_time', 'home_team',
     'away_team', 'scorer_team', 'new_top_teams']
    + ['1st', '1st_points', '1st_goals_diff', '1st_goals_scored',
       '1st_last_game_points', '1st_last_game_goals_diff']
    + ['2nd', '2nd_points', '2nd_goals_diff', '2nd_goals_scored',
       '2nd_last_game_points', '2nd_last_game_goals_diff']
    + ['3rd', '3rd_points', '3rd_goals_diff', '3rd_goals_scored',
       '3rd_last_game_points', '3rd_last_game_goals_diff']
    + ['4th', '4th_points', '4th_goals_diff', '4th_goals_scored',
       '4th_last_game_points', '4th_last_game_goals_diff']
    + ['third_qualify', 'qualified_teams', 'qual_changed', 'qual_count',
       'pts_diff', 'gls_diff', 'elo_home', 'elo_away']
)

# Keep exactly these columns, in this order (a missing column raises KeyError).
final_composition_changes_df = final_composition_changes_df.loc[:, column_order]



In [49]:
# Discard minute-0 rows that name both teams. Equivalently, keep a row when its
# goal_minute is not 0 or when either team is missing. The index is renumbered.
filtered_df = final_composition_changes_df.loc[
    (final_composition_changes_df["goal_minute"] != 0)
    | final_composition_changes_df["home_team"].isna()
    | final_composition_changes_df["away_team"].isna()
].reset_index(drop=True)

# Continue with the filtered frame under the usual name.
final_composition_changes_df = filtered_df


# suspense

## h2h variable

In [50]:
# Run assign_h2h_per_row once per (year, stage) group; group_keys=False keeps the
# group labels out of the resulting index.
final_composition_changes_df = final_composition_changes_df.groupby(
    by=["year", "stage"], group_keys=False
).apply(assign_h2h_per_row)


In [51]:
# Pass the frame together with goals_df through assign_lagging_won.
# NOTE(review): judging by the later output this adds lagging_won (and likely
# lagging_team / leading_team) — confirm in functions_fifa.
final_composition_changes_df = final_composition_changes_df.pipe(
    assign_lagging_won, goals_df
)


In [52]:
# Row-wise evaluation of calculate_suspense; the result is stored in 'suspense'.
final_composition_changes_df['suspense'] = final_composition_changes_df.apply(
    calculate_suspense,
    axis='columns',
)

In [53]:
# Spot check: show the row(s) for 2002, Group G, goal minute 48.
final_composition_changes_df.loc[
    final_composition_changes_df["year"].eq(2002)
    & final_composition_changes_df["stage"].eq("Group G")
    & final_composition_changes_df["goal_minute"].eq(48)
]

Unnamed: 0,year,stage,date,time,goal_minute,half_time,home_team,away_team,scorer_team,new_top_teams,...,qual_count,pts_diff,gls_diff,elo_home,elo_away,h2h,lagging_team,leading_team,lagging_won,suspense
2,2002,Group G,2002-06-13,11:30,48,2.0,Ecuador,Croatia,Ecuador,"[Mexico, Italy]",...,2,0,-1,1727.0,1882.0,0,Croatia,Italy,1.0,1


In [54]:
# Run update_suspense over the frame; debug=True produces the per-case trace printed below.
final_composition_changes_df = final_composition_changes_df.pipe(
    update_suspense, debug=True
)


Year 1990 | Stage Group F | Minute 58
Lagging: Egypt | Leading: England | h2h=1
BASE: pts_diff=-2 | gd_diff=-2 | gs_diff=-1

Scenario S:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_S=True

Scenario C:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_C=True

=> h2h_suspense = True

Year 2002 | Stage Group A | Minute 0
Lagging: Uruguay | Leading: Senegal | h2h=1
BASE: pts_diff=-3 | gd_diff=-2 | gs_diff=-1

Scenario S:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_S=True

Scenario C:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_C=True

=> h2h_suspense = True

Year 2002 | Stage Group A | Minute 88
Lagging: Uruguay | Leading: Senegal | h2h=1
BASE: pts_diff=-3 | gd_diff=-2 | gs_diff=-1

Scenario S:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_S=True

Scenario C:
  pts_diff=0.0 | gd_diff=0 | gs_diff=0 | full_tie_C=True

=> h2h_suspense = True

Year 1994 | Stage Group F | Minute 0
Lagging: Morocco | Leading: Netherlands | h2h=1
BASE: pts_diff=-3 | gd_diff=-2 | gs_diff=-1


In [55]:
# Pass the frame together with goals_df through
# evaluate_lagging_losses_and_update_suspense, which logs each h2h_suspense case it checks.
final_composition_changes_df = final_composition_changes_df.pipe(
    evaluate_lagging_losses_and_update_suspense, goals_df
)

🔍 Checking 6 h2h_suspense observations...


🔎 Year: 1990, Stage: Group F, Minute: 58
   Pair under tie-break check: Egypt (lagging) vs England (leading) | h2h=1
   ⏭️ Skipping: h2h==1 indicates current match. Not using final score to update suspense.

🔎 Year: 1994, Stage: Group F, Minute: 0
   Pair under tie-break check: Morocco (lagging) vs Netherlands (leading) | h2h=1
   ⏭️ Skipping: h2h==1 indicates current match. Not using final score to update suspense.

🔎 Year: 1994, Stage: Group F, Minute: 5
   Pair under tie-break check: Morocco (lagging) vs Netherlands (leading) | h2h=1
   ⏭️ Skipping: h2h==1 indicates current match. Not using final score to update suspense.

🔎 Year: 1994, Stage: Group F, Minute: 47
   Pair under tie-break check: Morocco (lagging) vs Netherlands (leading) | h2h=1
   ⏭️ Skipping: h2h==1 indicates current match. Not using final score to update suspense.

🔎 Year: 2002, Stage: Group A, Minute: 0
   Pair under tie-break check: Uruguay (lagging) vs Senegal (leading

In [56]:
# Sanity check: list the rows where h2h_suspense is 1 while suspense is 0.
display(
    final_composition_changes_df.loc[
        final_composition_changes_df['h2h_suspense'].eq(1)
        & final_composition_changes_df['suspense'].eq(0),
        ['year', 'stage', 'goal_minute', 'lagging_team', 'leading_team', 'suspense'],
    ]
)

Unnamed: 0,year,stage,goal_minute,lagging_team,leading_team,suspense


In [57]:
# Show the dtype of every column as a quick schema check before the next update step.
final_composition_changes_df.dtypes


year                          int64
stage                        object
date                         object
time                         object
goal_minute                   int64
half_time                   float64
home_team                    object
away_team                    object
scorer_team                  object
new_top_teams                object
1st                          object
1st_points                    int64
1st_goals_diff                int32
1st_goals_scored              int32
1st_last_game_points          int64
1st_last_game_goals_diff      int64
2nd                          object
2nd_points                    int64
2nd_goals_diff                int32
2nd_goals_scored              int32
2nd_last_game_points          int64
2nd_last_game_goals_diff      int64
3rd                          object
3rd_points                    int64
3rd_goals_diff                int32
3rd_goals_scored              int32
3rd_last_game_points          int64
3rd_last_game_goals_diff    

In [58]:
# Run update_suspense_draw_lots over the frame.
# NOTE(review): by its name this adjusts 'suspense' for ties resolved by drawing of lots — confirm in functions_fifa.
final_composition_changes_df = final_composition_changes_df.pipe(
    update_suspense_draw_lots
)


In [59]:
 # Drop helper columns
# Removed in place; errors='ignore' skips any of these columns that are already absent.
final_composition_changes_df.drop(
    columns=['lagging_team', 'leading_team', 'h2h_suspense', 'lagging_won'],
    errors='ignore',
    inplace=True,
)

In [60]:
# Restore the export order: year, date and time, then half_time, goal_minute and stage;
# rows with missing values in a sort key are placed first.
final_composition_changes_df = (
    final_composition_changes_df
    .sort_values(
        by=['year', 'date', 'time', 'half_time', 'goal_minute', 'stage'],
        na_position='first',
    )
    .reset_index(drop=True)
)


In [61]:
# Write the goal-level dataset to Excel, without the row index.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\goals_wc_fifa.xlsx'
final_composition_changes_df.to_excel(
    excel_writer=file_path,
    index=False,
)


# Minute-by-minute dataframe

In [62]:
# Expand the goal-level frame into one row per match minute.
#
# Rows are processed per (date, time) slot in goal_minute order. Each row is repeated
# for every minute from its own goal_minute up to (not including) the next row's
# goal_minute; the last row is repeated through max(90, latest goal_minute).
# Only the repetition at the goal minute itself keeps goal_minute / scorer_team;
# all other repetitions carry None in those two fields.
expanded_rows = []

# Iterate over rows grouped by date and time
for (date, time), group in final_composition_changes_df.groupby(["date", "time"]):
    # Stable sort: rows that share a goal_minute keep the order they already have in
    # final_composition_changes_df. The default single-key sort (quicksort) does not
    # guarantee that, and the order matters here because the earlier of two same-minute
    # rows yields an empty minute range below, so only the later row's state is kept.
    group = group.sort_values(by="goal_minute", kind="mergesort").reset_index(drop=True)

    # Expand through minute 90 at least, or further when a later goal minute is recorded.
    if group["goal_minute"].isna().all():
        max_minute = 90
    else:
        max_minute = max(90, group["goal_minute"].max(skipna=True))

    # Every row except the last covers the minutes up to the next row's goal minute.
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]

        for match_minute in range(int(current_row["goal_minute"]), int(next_row["goal_minute"])):
            new_row = current_row.copy()
            new_row["match_minute"] = match_minute

            # Only set goal_minute when it's an actual goal event
            if match_minute != current_row["goal_minute"]:
                new_row["goal_minute"] = None
                new_row["scorer_team"] = None  # Clear scorer team for non-goal minutes

            expanded_rows.append(new_row)

    # The last row of the slot covers its own goal minute through max_minute (inclusive).
    last_row = group.iloc[-1].copy()
    last_goal_minute = int(last_row["goal_minute"])

    for match_minute in range(last_goal_minute, max_minute + 1):
        new_row = last_row.copy()
        new_row["match_minute"] = match_minute
        new_row["goal_minute"] = None if match_minute > last_goal_minute else last_row["goal_minute"]
        new_row["scorer_team"] = None if match_minute > last_goal_minute else last_row["scorer_team"]
        expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Column layout of the minute-by-minute export: the goal-level layout plus
# match_minute, suspense and h2h.
column_order = [
    'year', 'stage', 'date', 'time', 'match_minute', 'goal_minute', 'half_time', 'home_team',
    'away_team', 'scorer_team', 'new_top_teams',
    '1st', '1st_points', '1st_goals_diff', '1st_goals_scored', '1st_last_game_points', '1st_last_game_goals_diff',
    '2nd', '2nd_points', '2nd_goals_diff', '2nd_goals_scored', '2nd_last_game_points', '2nd_last_game_goals_diff',
    '3rd', '3rd_points', '3rd_goals_diff', '3rd_goals_scored', '3rd_last_game_points', '3rd_last_game_goals_diff',
    '4th', '4th_points', '4th_goals_diff', '4th_goals_scored', '4th_last_game_points', '4th_last_game_goals_diff',
    'third_qualify', 'qualified_teams', 'qual_changed', 'qual_count',
    'pts_diff', 'gls_diff', 'elo_home', 'elo_away', 'suspense', 'h2h'
]
# Ensure the expanded dataframe follows the defined column order
expanded_df = expanded_df[column_order]


In [63]:
# Write the minute-by-minute dataset to Excel, without the row index.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\mbm_wc_fifa.xlsx'
expanded_df.to_excel(
    excel_writer=file_path,
    index=False,
)


# Add booking and substitution information

In [64]:
# Attach cumulative booking / substitution counts to every goal-level row.
#
# For each row of goals_df, the events of the same match (same year and stage, same two
# teams in either home/away order) with minute <= goal_minute are counted per side and
# per event type. Rows that cannot be matched keep zero counts, match_found == False
# and an explanation in reason_no_match.

# Work on copies so final_composition_changes_df and cardsub_wc are not modified.
# NOTE(review): goals_df is also passed to assign_lagging_won and
# evaluate_lagging_losses_and_update_suspense in earlier cells. It is rebound here, so
# re-running those cells after this one hands them this frame instead — confirm intended.
goals_df = final_composition_changes_df.copy()
events_df = cardsub_wc.copy()

# Ensure numeric types; unparseable values become NaN and never satisfy minute <= goal_minute.
events_df['minute'] = pd.to_numeric(events_df['minute'], errors='coerce')
goals_df['goal_minute'] = pd.to_numeric(goals_df['goal_minute'], errors='coerce')

# Output column suffix -> event_type values counted under it.
# 'bookings' is the total of all card types.
event_types_by_metric = {
    'bookings': ['Yellow card', 'Red card (straight)', 'Red card (second yellow)'],
    'yellow_cards': ['Yellow card'],
    'red_cards_straight': ['Red card (straight)'],
    'red_cards_second_yellow': ['Red card (second yellow)'],
    'substitutions': ['Substitution'],
}
# Used to flip team_side for events recorded with home/away swapped.
opposite_side = {'home': 'away', 'away': 'home'}

# Add result columns (home_<metric>, away_<metric> for every metric, all starting at 0).
for metric in event_types_by_metric:
    for side in ('home', 'away'):
        goals_df[f'{side}_{metric}'] = 0
goals_df['match_found'] = False
goals_df['reason_no_match'] = ""

for idx, row in goals_df.iterrows():
    year, stage, ht, at, goal_min = row['year'], row['stage'], row['home_team'], row['away_team'], row['goal_minute']

    match_mask = (events_df['year'] == year) & (events_df['stage'] == stage)
    if not match_mask.any():
        goals_df.at[idx, 'reason_no_match'] = "No matching year/stage"
        continue

    # The two frames may list the same match with home and away swapped.
    same_order = (events_df['home_team'] == ht) & (events_df['away_team'] == at)
    reversed_order = (events_df['home_team'] == at) & (events_df['away_team'] == ht)
    match_mask &= (same_order | reversed_order)
    if not match_mask.any():
        goals_df.at[idx, 'reason_no_match'] = "Teams not found in correct or reversed order"
        continue

    # Only events up to and including the goal minute are counted.
    match_mask &= (events_df['minute'] <= goal_min)
    match_events = events_df[match_mask]
    if match_events.empty:
        goals_df.at[idx, 'reason_no_match'] = "No events before goal minute"
        continue

    # Match found (reason_no_match keeps its initial empty string).
    goals_df.at[idx, 'match_found'] = True

    # team_side is taken to be relative to events_df's own home_team / away_team columns.
    # For events recorded in reversed order it is flipped, so that 'home' / 'away' always
    # refer to this row's home_team / away_team. Without the flip, the counts of a
    # reversed match would be credited to the opposite team.
    recorded_side = match_events['team_side']
    is_reversed = reversed_order[match_mask]
    row_side = recorded_side.where(~is_reversed, recorded_side.map(opposite_side))

    for metric, event_types in event_types_by_metric.items():
        is_metric_event = match_events['event_type'].isin(event_types)
        for side in ('home', 'away'):
            goals_df.at[idx, f'{side}_{metric}'] = (is_metric_event & (row_side == side)).sum()


# Surprise, shock, and suspense according to the literature

In [65]:
# Cast 'year' to int in both frames.
# NOTE(review): presumably so the two year columns compare equal in the probability step — confirm.
for frame in (goals_df, elo_wc):
    frame['year'] = frame['year'].astype(int)


In [66]:
# Per-row value returned by get_team_score for the home and the away team.
# NOTE(review): the >= 2 and == 1 thresholds below suggest this "score" is match points
# rather than goals — confirm against get_team_score.
goals_df['home_score'] = goals_df.apply(
    lambda match: get_team_score(match['home_team'], match),
    axis='columns',
)
goals_df['away_score'] = goals_df.apply(
    lambda match: get_team_score(match['away_team'], match),
    axis='columns',
)

# 0/1 indicators derived from the two scores.
goals_df['homeahead'] = goals_df['home_score'].ge(2).astype(int)
goals_df['awayahead'] = goals_df['away_score'].ge(2).astype(int)
goals_df['scoreeq'] = (
    goals_df['home_score'].eq(1) & goals_df['away_score'].eq(1)
).astype(int)


In [67]:
# Probability trajectory computed with the Elo-based method.
prob_elo = compute_probabilities_on_goals_new(goals_df, method="elo")

# Probability trajectory computed with the Csató Poisson–Skellam method.
# NOTE(review): full_duration=100 and home_adv=100.0 are passed through unchanged;
# their units and meaning are defined by compute_probabilities_on_goals_new — confirm there.
csato_options = {
    "use_remaining_time_scaling": True,  # optional
    "full_duration": 100,
    "home_adv": 100.0,
}
prob_csato = compute_probabilities_on_goals_new(
    goals_df,
    method="csato",
    **csato_options,
)


In [68]:
# Keep only the export columns of the Elo trajectory, in this order:
# match identifiers and score, outcome probabilities, their changes, the three
# emotional-cue measures, and the event label.
prob_elo = prob_elo.loc[
    :,
    ['year', 'stage', 'home_team', 'away_team', 'minute']
    + ['score_home', 'score_away']
    + ['P_home', 'P_draw', 'P_away']
    + ['dP_home', 'dP_draw', 'dP_away']
    + ['shock', 'surprise', 'suspense']
    + ['event'],
]


In [69]:
# Keep only the export columns of the Csató trajectory, in this order:
# match identifiers and score, outcome probabilities, their changes, the three
# emotional-cue measures, and the event label.
prob_csato = prob_csato.loc[
    :,
    ['year', 'stage', 'home_team', 'away_team', 'minute']
    + ['score_home', 'score_away']
    + ['P_home', 'P_draw', 'P_away']
    + ['dP_home', 'dP_draw', 'dP_away']
    + ['shock', 'surprise', 'suspense']
    + ['event'],
]


In [70]:
# Write the Elo-based probability table to Excel, without the row index.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tab_prob_fifa_wc.xlsx'
prob_elo.to_excel(
    excel_writer=file_path,
    index=False,
)


In [71]:
# Write the Csató probability table to Excel, without the row index.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tab_prob_fifa_wc_csato.xlsx'
prob_csato.to_excel(
    excel_writer=file_path,
    index=False,
)
