# Libraries

In [121]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta
import re

# Load and inspect dataset

In [122]:
# Current OS login name; it is interpolated into every absolute path below.
user = getuser()

# Absolute location of the helper notebook inside this user's local checkout.
# NOTE(review): the path is hard-coded to C:/Users/<user>/Documents/GitHub, so
# the notebook only runs from a checkout at exactly that location.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Execute the helper notebook in this kernel (IPython expands $function_path).
# Presumably this is what defines convert_time, before_last,
# aggregate_home_away_points, fifa_before_last, fifa_final_wc, gap_composition
# and best_four_third_placed_wc_men, which are called below but never defined
# in this notebook — confirm.
%run $function_path

In [123]:
# Path to the men's World Cup goals workbook (the preview below suggests one
# row per goal, with the match details repeated on each row).
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Read the workbook with pandas' default Excel settings (no encoding or sheet
# is specified, so the first sheet is loaded).
df = pd.read_excel(data_path)

# Keep tournaments from 1984 onwards (the comparison is inclusive).
# The explicit .copy() makes df an independent frame: later cells add and
# overwrite columns on it (local_time/utc_time, long_date, short_date), and
# doing that on a bare boolean-filtered selection raises
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df = df[df['year'] >= 1984].copy()

display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,85,0,1,1,0,0,0,0,,31 May 1986
1,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,44,0,1,1,0,0,0,0,,31 May 1986
2,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,6,0,3,1,0,0,0,0,,2 June 1986
3,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,46,0,3,1,0,0,0,0,,2 June 1986
4,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,18,0,3,1,0,0,0,0,,2 June 1986


In [124]:
# Load the Elo ratings workbook with pandas' default Excel settings.
# Note: this overwrites data_path, which until now pointed at the goals workbook.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_wc.xlsx'
elo_wc = pd.read_excel(data_path)

In [125]:
# Team-name spellings to normalise, keyed by the spelling to be replaced.
# Applied to the Elo table's team names in a later cell.
replacements = {
    "Republic of Ireland": "Ireland",             # drop the "Republic of" prefix
    "CIS": "Commonwealth of Independent States",  # abbreviation -> full name
    "FR Yugoslavia": "Yugoslavia",                # drop the "FR" prefix
}

In [126]:
# Prepare the Elo table for merging: give the team and rating columns their
# working names, then normalise team-name spellings via `replacements`.
elo_wc = (
    elo_wc
    .rename(columns={"team": "team_name", "elo_rating": "elo"})
    .assign(team_name=lambda elo_frame: elo_frame["team_name"].replace(replacements))
)

# Clean, transform, create variables

## time

In [127]:
# Derive the kick-off time columns by calling convert_time once per row
# (axis=1); each result is wrapped in a Series and spread over the two new
# columns. convert_time is defined outside this notebook — presumably it
# returns a (local time, UTC time) pair; confirm against its definition.
df[['local_time', 'utc_time']] = df.apply(lambda row: pd.Series(convert_time(row)), axis=1)

## date

In [128]:
# Normalise 'long_date': replace a comma (plus any whitespace) that directly
# precedes a four-digit year with a single space.
comma_before_year = r',\s*(\d{4})'
df['long_date'] = df['long_date'].str.replace(comma_before_year, r' \1', regex=True)

# Parse the cleaned text dates (unparseable values become NaT) and use their
# ISO form to fill only the rows where 'short_date' is missing.
parsed_long_date = pd.to_datetime(df['long_date'], errors='coerce')
iso_long_date = parsed_long_date.dt.strftime('%Y-%m-%d')
df['short_date'] = df['short_date'].fillna(iso_long_date)


# Extract relevant columns

In [129]:
# Stage labels (including spelling variants) that mark knockout matches.
knockout_stages = [
    'Quarterfinals',
    'Quarter-finals',
    'Round of 16',
    'Semi-finals',
    'Semifinals',
    'Final',
    'Third place play-off',
    'Third place playoff',
]

# Keep only the rows whose stage is not one of the knockout labels.
is_knockout = df['stage'].isin(knockout_stages)
df = df[~is_knockout]

# Columns needed to describe each goal event and its match.
goal_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]

# Select those columns and drop exact duplicate rows.
# NOTE(review): this would also collapse two goals by the same side recorded
# in the same minute of the same match — confirm that cannot occur in the data.
goals_df = df[goal_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1986,Group A,Bulgaria,Italy,Bulgaria,85,1986-05-31,18:00,1–1
1,1986,Group A,Bulgaria,Italy,Italy,44,1986-05-31,18:00,1–1
2,1986,Group A,Argentina,South Korea,Argentina,6,1986-06-02,18:00,3–1
3,1986,Group A,Argentina,South Korea,Argentina,46,1986-06-02,18:00,3–1
4,1986,Group A,Argentina,South Korea,Argentina,18,1986-06-02,18:00,3–1


# team counts

In [130]:

# Sanity check on group sizes: for every (year, stage) collect the distinct
# teams appearing as home or away side, count them, and list any group that
# does not have exactly four teams.
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda matches: pd.concat([matches['home_team'], matches['away_team']]).unique())
    .reset_index()                      # the array column comes back labelled 0
    .rename(columns={0: 'team_list'})
    .assign(team_count=lambda groups: groups['team_list'].apply(len))
)

# Groups whose team count differs from 4 (an empty frame means all are valid).
invalid_groups = team_counts.loc[team_counts['team_count'].ne(4)]

display(invalid_groups)



Unnamed: 0,year,stage,team_list,team_count


# Recreate League Table after first two matchdays

In [131]:

# Split the group-stage goals with the external before_last helper, which
# returns two objects. Going by their names and later use, the first holds the
# goals before the final matchday and the second the final-matchday goals —
# confirm against the helper's definition.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [132]:
# Build separate home and away tables from the pre-final-matchday goals.
# Judging by the previews below, each has one row per (year, stage, team) with
# goals scored/conceded, points and match count.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [133]:
# Spot check: home-side rows for the 2022 tournament, Group E.
home_in_2022_group_e = home_games['year'].eq(2022) & home_games['stage'].eq('Group E')
home2022e = home_games.loc[home_in_2022_group_e]
home2022e

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
198,2022,Group E,Germany,1,2,0,1
199,2022,Group E,Japan,0,1,0,1
200,2022,Group E,Spain,8,1,4,2


In [134]:
# Spot check: away-side rows for the 2022 tournament, Group E.
away_in_2022_group_e = away_games['year'].eq(2022) & away_games['stage'].eq('Group E')
away2022e = away_games.loc[away_in_2022_group_e]
away2022e

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
198,2022,Group E,Costa Rica,1,7,3,2
199,2022,Group E,Germany,1,1,1,1
200,2022,Group E,Japan,2,1,3,1


## aggregate data after first two match days

In [135]:
# Build the standings after the first two matchdays with the external
# fifa_before_last helper, using the home/away tables, the pre-final-matchday
# goals and the group team lists. The helper also prints a detailed
# tiebreaker log (recorded below). How it uses team_counts is not visible here.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)


=== Applying Tiebreaker ===
Row1: year                       1986
stage                   Group A
team                   Bulgaria
home_team              Bulgaria
goals_scored_home           1.0
goals_conceded_home         1.0
points_home                 1.0
match_count_home            1.0
away_team              Bulgaria
goals_scored_away           1.0
goals_conceded_away         1.0
points_away                 1.0
match_count_away            1.0
goals_scored                2.0
goals_conceded              2.0
points                      2.0
total_matches               2.0
goals_difference            0.0
tiebreaker              no need
tie_won                       0
Name: 1, dtype: object
Row2: year                      1986
stage                  Group A
team                     Italy
home_team                Italy
goals_scored_home          1.0
goals_conceded_home        1.0
points_home                1.0
match_count_home           1.0
away_team                Italy
goals_scored_away

In [136]:
# Spot check: standings before the final matchday for 2022, Group E.
in_2022_group_e = (
    all_games_before_last['year'].eq(2022)
    & all_games_before_last['stage'].eq('Group E')
)
group2022e = all_games_before_last.loc[in_2022_group_e]

display(group2022e)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,standing,tiebreaker,tie_won
256,2022,Group E,Spain,8,1,4,7,2,1,no need,0
257,2022,Group E,Japan,2,2,3,0,2,2,Costa Rica,0
258,2022,Group E,Costa Rica,1,7,3,-6,2,3,Costa Rica,1
259,2022,Group E,Germany,2,3,1,-1,2,4,no need,0


# Recreate league table after last match day

In [137]:
# Run fifa_final_wc once for every (year, stage) group and gather the
# per-group result frames.
all_results = []
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Each iterrows() row holds exactly the two values (year, stage), so it can be
# unpacked directly.
for _, (year, group_name) in unique_pairs.iterrows():
    all_results.append(
        fifa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    )

# Stack the per-group frames and keep only the columns used in the export.
kept_columns = ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_wc = pd.concat(all_results)[kept_columns]



=== Initial Standings for Year 1986, Group A Before Last Match Goals ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_points  before_last_game_standing
  Argentina             4                   4                     2                       2                        3                          1
   Bulgaria             3                   2                     2                       0                        2                          2
      Italy             3                   2                     2                       0                        2                          3
South Korea             2                   2                     4                      -2                        1                          4


Analyzing goal: 0 minute, Player team: nan, Home: Argentina, Away: Bulgaria

=== Teams with Identical Points (Tied Teams) ===

       team  total_points
1  Bulgaria             3
2     Italy             

In [138]:
# Export the per-team standings-change table to Excel, without the index.
# NOTE(review): absolute, user-specific path; the target folder must already
# exist because to_excel does not create directories.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tb_wc_fifa_men.xlsx'
changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [139]:
# Run gap_composition once for every (year, stage) group and gather the
# per-group frames.
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()
all_composition_changes = []

# Each iterrows() row holds exactly the two values (year, stage).
for _, (year, stage) in unique_pairs.iterrows():
    all_composition_changes.append(
        gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    )

# Column layout of the final table.
column_order = [
    'year', 'stage', 'date', 'time', 'change_num', 'goal_time', 'home_team',
    'away_team', 'scorer_team', 'new_top_teams', '1st', '2nd', '3rd',
    'changed', 'points_diff', 'goals_diff', 'tiebreak_result'
]

# Stack all groups with a fresh 0..n-1 index and apply the column layout.
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)[column_order]

# Rows with neither a date nor a time take both values from the row that
# immediately follows them (index-aligned fill from the frame shifted up by one).
kickoff_columns = ['date', 'time']
mask = final_composition_changes_df[kickoff_columns].isna().all(axis=1)
next_row_kickoff = final_composition_changes_df[kickoff_columns].shift(-1)
final_composition_changes_df.loc[mask, kickoff_columns] = (
    final_composition_changes_df.loc[mask, kickoff_columns].fillna(next_row_kickoff)
)

# 'date' is deliberately left in its incoming form; only 'time' is parsed.
# Values that do not match HH:MM become missing.
parsed_time = pd.to_datetime(final_composition_changes_df['time'], format='%H:%M', errors='coerce')
final_composition_changes_df['time'] = parsed_time.dt.time

# Discard rows that still lack a date or a valid time.
final_composition_changes_df = final_composition_changes_df.dropna(subset=kickoff_columns)

# Build a temporary combined timestamp, drop rows where it cannot be parsed,
# sort chronologically within each year, renumber, and remove the helper column.
kickoff_text = (
    final_composition_changes_df['date'].astype(str)
    + ' '
    + final_composition_changes_df['time'].astype(str)
)
final_composition_changes_df = (
    final_composition_changes_df
    .assign(datetime=pd.to_datetime(kickoff_text, errors='coerce'))
    .dropna(subset=['datetime'])
    .sort_values(by=['year', 'datetime'])
    .reset_index(drop=True)
    .drop(columns=['datetime'])
)






=== STEP 1: Initial Standings for Group A, 1986 (Goal Time = 0) ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goals_difference  before_last_game_standing
  Argentina             4                   4                     2                       2                          1
   Bulgaria             3                   2                     2                       0                          2
      Italy             3                   2                     2                       0                          3
South Korea             2                   2                     4                      -2                          4



=== Tied after goal at minute 0 by nan in Group A, edition 1986 ===
       team  total_points  total_goals_scored  total_goals_conceded  \
1  Bulgaria             3                   2                     2   
2     Italy             3                   2                     2   

   total_goals_difference  
1                      

In [140]:
# Export the goal-by-goal composition-change table to Excel, without the index.
# NOTE(review): absolute, user-specific path; the target folder must already
# exist because to_excel does not create directories.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\goals_wc_fifa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# minute by minute dataframe

In [141]:
# Expand the goal-by-goal table into one row per match minute.
#
# Rows are grouped by kick-off ("date", "time"). Within a group every goal row
# is emitted once at its own minute, and the state it carries is then repeated,
# one row per minute, up to the minute before the next goal (or through the
# final minute after the last goal).
#
# Fix: the carry-forward previously ran up to AND INCLUDING the next goal's
# minute, so every goal minute after the first appeared twice — once as a stale
# carried-forward row and once as the real goal row. The carry-forward now
# stops one minute earlier. Two goals in the same minute are still both kept.

# Columns that describe a single goal event; blanked on carried-forward rows.
goal_event_fields = ["goal_time", "home_team", "away_team", "scorer_team"]

# Every kick-off group is padded out to this match minute.
final_minute = 90

expanded_rows = []

for _, group in final_composition_changes_df.groupby(["date", "time"]):
    group = group.sort_values(by="goal_time").reset_index(drop=True)

    for position in range(len(group)):
        # The goal row itself, stamped with its own minute.
        goal_row = group.iloc[position].copy()
        goal_minute = goal_row["goal_time"]
        goal_row["match_minute"] = goal_minute
        expanded_rows.append(goal_row)

        # Exclusive upper bound for the carry-forward: the next goal's minute,
        # or one past the final minute when this is the group's last goal.
        if position + 1 < len(group):
            carry_stop = group.iloc[position + 1]["goal_time"]
        else:
            carry_stop = final_minute + 1

        # Minutes with no goal repeat the current state with the goal-event
        # fields left empty. The range is empty when the next goal falls in
        # the same or the following minute, or when goal_minute >= final_minute.
        for match_minute in range(goal_minute + 1, carry_stop):
            carried_row = goal_row.copy()
            carried_row["match_minute"] = match_minute
            for field in goal_event_fields:
                carried_row[field] = None
            expanded_rows.append(carried_row)

# Assemble the expanded rows into a frame with a clean 0..n-1 index.
expanded_df = pd.DataFrame(expanded_rows).reset_index(drop=True)


In [142]:
# Export the minute-by-minute table to Excel, without the index.
# NOTE(review): absolute, user-specific path; the target folder must already
# exist because to_excel does not create directories.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\mbm_wc_fifa.xlsx'
expanded_df.to_excel(file_path, index=False)


# best four third placed

In [143]:
# Build the third-placed-teams table with the external helper, passing the
# final-matchday goals, the standings before the final matchday and the
# pre-final-matchday goals. The helper also prints a processing log
# (recorded below).
final_df = best_four_third_placed_wc_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Export the table to Excel, without the index.
# NOTE(review): absolute, user-specific path; the target folder must already
# exist because to_excel does not create directories.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa_men.xlsx'
final_df.to_excel(file_path, index=False)


--- Processing Year: 2018 ---

=== Applying Tiebreaker ===
Row1: year                                  2018
stage                              Group A
team                                 Egypt
goals_scored                             1
goals_conceded                           4
points                                   1
goals_difference                        -3
total_matches                            2
standing                                 3
tiebreaker                           Egypt
tie_won                                  1
before_last_game_goals_scored            1
before_last_game_goals_conceded          4
before_last_game_points                  1
last_game_goals_scored                   0
last_game_goals_conceded                 0
total_goals_scored                       1
total_goals_conceded                     4
total_goals_difference                  -3
last_game_points                         0
total_points                             1
tied_won                       

In [144]:
# Sanity check: how many rows goals_last_day_sorted holds for three tournaments.
# NOTE(review): the recorded output reports 0 rows for 1990 — confirm whether
# that is expected or points to missing final-matchday data for that year.
last_day_year = goals_last_day_sorted['year']

# 1986 tournament
count_1986 = len(goals_last_day_sorted[last_day_year == 1986])
print(f"Number of observations in goals_last_day_sorted for the year 1986: {count_1986}")

# 1990 tournament
count_1990 = len(goals_last_day_sorted[last_day_year == 1990])
print(f"Number of observations in goals_last_day_sorted for the year 1990: {count_1990}")

# 1994 tournament
count_1994 = len(goals_last_day_sorted[last_day_year == 1994])
print(f"Number of observations in goals_last_day_sorted for the year 1994: {count_1994}")


Number of observations in goals_last_day_sorted for the year 1986: 41
Number of observations in goals_last_day_sorted for the year 1990: 0
Number of observations in goals_last_day_sorted for the year 1994: 37
