# Libraries

In [55]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta
import re

# Load and inspect dataset

In [56]:
# Get the current user's name; it is interpolated into every project path in this notebook
user = getuser()

# Path of the helper notebook inside the current user's checkout of the project.
# NOTE(review): presumably this notebook defines the helpers called further down
# (convert_time, before_last, fifa_before_last, ...), since none of them are
# defined in this file — confirm.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Execute the helper notebook with IPython's %run magic ($function_path expands the variable)
%run $function_path

In [57]:
# Path to the goals workbook inside the current user's checkout of the project
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Read the Excel workbook (first sheet, default options)
df = pd.read_excel(data_path)

# Keep only tournaments after 1984.
# Take an explicit copy: later cells add and overwrite columns on this frame
# ('local_time', 'utc_time', 'long_date', 'short_date'), and assigning into a
# filtered slice of the raw frame is what triggers pandas' SettingWithCopyWarning.
df = df[df['year'] > 1984].copy()

# Preview the first rows of the filtered data
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,85,0,1,1,0,0,0,0,,31 May 1986
1,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,44,0,1,1,0,0,0,0,,31 May 1986
2,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,6,0,3,1,0,0,0,0,,2 June 1986
3,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,46,0,3,1,0,0,0,0,,2 June 1986
4,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,18,0,3,1,0,0,0,0,,2 June 1986


# Clean, transform, create variables

## time

In [58]:
# convert_time is called once per row; wrapping its result in a Series lets
# apply() spread the returned values over separate columns, which are then
# stored as 'local_time' and 'utc_time'.
converted_times = df.apply(lambda match_row: pd.Series(convert_time(match_row)), axis=1)
df[['local_time', 'utc_time']] = converted_times

## date

In [59]:
# Step 1: drop a comma (plus any following whitespace) that sits directly before
# a four-digit year in 'long_date', leaving a single space in its place
comma_before_year = r',\s*(\d{4})'
df['long_date'] = df['long_date'].str.replace(comma_before_year, r' \1', regex=True)

# Step 2: parse 'long_date' (unparseable values become NaT) and use its
# YYYY-MM-DD rendering only where 'short_date' is missing
long_date_as_iso = pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d')
df['short_date'] = df['short_date'].fillna(long_date_as_iso)


# Extract relevant columns

In [60]:
# Drop rows whose stage is one of the listed knockout rounds.
# NOTE(review): any other stage label that is not in this list passes the
# filter unchanged — confirm the list against the stage values in the data.
knockout_stages = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
is_knockout = df['stage'].isin(knockout_stages)
df = df[~is_knockout]

# Columns describing each goal event together with its match result
goal_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]

# Select those columns and collapse rows that are identical across all of them
goals_df = df[goal_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1986,Group A,Bulgaria,Italy,Bulgaria,85,1986-05-31,18:00,1–1
1,1986,Group A,Bulgaria,Italy,Italy,44,1986-05-31,18:00,1–1
2,1986,Group A,Argentina,South Korea,Argentina,6,1986-06-02,18:00,3–1
3,1986,Group A,Argentina,South Korea,Argentina,46,1986-06-02,18:00,3–1
4,1986,Group A,Argentina,South Korea,Argentina,18,1986-06-02,18:00,3–1


# manually insert missing 0-0

In [None]:
# Matches to add by hand, one dictionary per row, using the same columns as goals_df.
# The score is written with an en dash ('0–0') because every score in the
# dataset uses that separator (e.g. '1–1', '3–1'); an ASCII hyphen would make
# this row inconsistent for any parsing or comparison of 'score'.
# NOTE(review): only this one fixture is listed; other goalless group matches
# missing from the source file, if any, would need their own entries.
new_observations = [
    {
        'year': 1986,
        'stage': 'Group E',
        'home_team': 'Scotland',
        'away_team': 'Uruguay',
        'scorer_nationality': 'none',  # placeholder: no goal was scored
        'goal_minute': 0,              # placeholder minute for the goalless match
        'short_date': '1986-06-13',
        'local_time': '12:00',
        'score': '0–0',
    },
]

# Convert the new observations into a DataFrame
new_observations_df = pd.DataFrame(new_observations)

# Append them to goals_df, renumbering the index from 0
goals_df = pd.concat([goals_df, new_observations_df], ignore_index=True)



# Recreate League Table after first two matchdays

In [61]:

# Split the goal-level data with before_last (not defined in this notebook).
# NOTE(review): judging by the variable names, the first result aggregates the
# goals from matches before each group's final matchday and the second holds
# the final-matchday goals in sorted order — confirm against the helper.
agg_goals_before_last_day, goals_last_day_sorted = before_last(goals_df)


In [62]:
# Build separate home-side and away-side aggregates from the pre-final-matchday data.
# NOTE(review): aggregate_home_away_points is not defined in this notebook; the
# column layout of its two results can be seen in the 2022 Group E previews below.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [63]:
# Spot check: home-side aggregates for 2022, Group E
home_is_2022_group_e = home_games['year'].eq(2022) & home_games['stage'].eq('Group E')
home2022e = home_games.loc[home_is_2022_group_e]
home2022e

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
171,2022,Group E,Germany,1,2,0,1
172,2022,Group E,Japan,0,1,0,1
173,2022,Group E,Spain,8,1,4,2


In [64]:
# Spot check: away-side aggregates for 2022, Group E
away_is_2022_group_e = away_games['year'].eq(2022) & away_games['stage'].eq('Group E')
away2022e = away_games.loc[away_is_2022_group_e]
away2022e

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
167,2022,Group E,Costa Rica,1,7,3,2
168,2022,Group E,Germany,1,1,1,1
169,2022,Group E,Japan,2,1,3,1


## aggregate data after first two match days

In [65]:
# Build the per-team table for the first two matchdays from the home and away aggregates.
# NOTE(review): `team_counts` is not assigned anywhere in the visible notebook, and the
# recorded run of this cell failed with "NameError: name 'team_counts' is not defined".
# It has to be defined (or provided by the helper notebook) before this call —
# TODO confirm what fifa_before_last expects for this argument.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

NameError: name 'team_counts' is not defined

In [266]:
# Spot check: standings before the last matchday for 2022, Group E
in_2022_group_e = (
    all_games_before_last['year'].eq(2022)
    & all_games_before_last['stage'].eq('Group E')
)
group2022e = all_games_before_last.loc[in_2022_group_e]

display(group2022e)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,standing
234,2022,Group E,Spain,8,1,4,7,2,no need,1
235,2022,Group E,Japan,2,2,3,0,2,no need,2
236,2022,Group E,Costa Rica,1,7,3,-6,2,no need,3
237,2022,Group E,Germany,2,3,1,-1,2,no need,4


# Recreate league table after last match day

In [267]:
# Every distinct (year, stage) combination in the pre-final-matchday table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Call fifa_final_wc once per combination and collect the frames it returns
all_results = [
    fifa_final_wc(
        pair['year'],
        pair['stage'],
        all_games_before_last,
        goals_last_day_sorted,
        agg_goals_before_last_day,
    )
    for _, pair in unique_pairs.iterrows()
]

# Stack the per-group frames and keep only the columns that are reported
reported_columns = ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_wc = pd.concat(all_results)[reported_columns]

# Show the combined table
display(changes_df_wc)



=== Initial Standings for Year 1986, Group Group A Before Last Match Goals ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
  Argentina             3                   4                     2                      2                        3                          1
   Bulgaria             2                   2                     2                      0                        2                          2
      Italy             2                   2                     2                      0                        2                          3
South Korea             1                   2                     4                     -2                        1                          4


Analyzing Group A, year 1986, goal: 4 minute, Player team: Argentina, Home: Argentina, Away: Bulgaria

=== Updated Standings After This Goal ===

       team  total_points  total_goals_scored  total_goa

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1986,Group A,Argentina,1,0,0,0,1
2,1986,Group A,Italy,0,1,1,0,2
1,1986,Group A,Bulgaria,0,1,1,0,2
3,1986,Group A,South Korea,0,0,0,2,2
4,1986,Group B,Mexico,2,1,0,0,3
...,...,...,...,...,...,...,...,...
245,2022,Group G,Serbia,0,1,0,2,3
246,2022,Group H,Portugal,1,0,0,0,1
248,2022,Group H,South Korea,0,1,1,1,3
249,2022,Group H,Uruguay,0,1,2,1,4


In [268]:
# Write the position-change table to the project's output folder, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\tb_wc_fifa_men.xlsx'
changes_df_wc.to_excel(excel_writer=file_path, index=False)


# group composition tracking

In [269]:
# Every distinct (year, stage) combination in the pre-final-matchday table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Call track_composition_changes once per combination and collect the frames it returns
all_composition_changes = [
    track_composition_changes(
        pair['year'],
        pair['stage'],
        all_games_before_last,
        goals_last_day_sorted,
        agg_goals_before_last_day,
    )
    for _, pair in unique_pairs.iterrows()
]

# Stack the per-group frames into one table with a fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Show the combined table
display(final_composition_changes_df)



Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_nationality,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd
0,1986,Group A,0,initial,,,,"[Argentina, Bulgaria, Italy]",[Italy],1,Argentina,Bulgaria,Italy
1,1986,Group B,0,initial,,,,"[Belgium, Mexico, Paraguay]",[Belgium],1,Mexico,Paraguay,Belgium
2,1986,Group C,0,initial,,,,"[Hungary, Soviet Union, France]",[Hungary],1,Soviet Union,France,Hungary
3,1986,Group D,0,initial,,,,"[Spain, Northern Ireland, Brazil]",[Northern Ireland],1,Brazil,Spain,Northern Ireland
4,1986,Group D,1,15,Northern Ireland,Brazil,Brazil,"[Spain, Algeria, Brazil]",[Algeria],1,Brazil,Spain,Algeria
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,2022,Group G,1,35,Serbia,Switzerland,Serbia,"[Serbia, Brazil]",[Switzerland],1,Brazil,Serbia,Switzerland
113,2022,Group G,2,44,Serbia,Switzerland,Switzerland,"[Brazil, Switzerland]",[Cameroon],1,Brazil,Switzerland,Cameroon
114,2022,Group H,0,initial,,,,"[Ghana, Portugal]",[South Korea],1,Portugal,Ghana,South Korea
115,2022,Group H,1,26,Ghana,Uruguay,Uruguay,"[Portugal, Uruguay]",[Ghana],1,Portugal,Uruguay,Ghana


In [270]:
# The previous cell already built final_composition_changes_df by calling
# track_composition_changes for every (year, stage) pair with these same inputs,
# so that result is reused here instead of repeating the identical loop.

# Rename 'player_team' to 'scorer_team'. rename() skips labels that are not
# present, and the recorded output has no 'player_team' column (it shows
# 'scorer_nationality'), so this is currently a no-op kept for compatibility.
# Assigning the result avoids mutating the frame in place.
final_composition_changes_df = final_composition_changes_df.rename(
    columns={'player_team': 'scorer_team'}
)

# Display the final DataFrame
display(final_composition_changes_df)


Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_nationality,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd
0,1986,Group A,0,initial,,,,"[Argentina, Bulgaria, Italy]",[Italy],1,Argentina,Bulgaria,Italy
1,1986,Group B,0,initial,,,,"[Belgium, Mexico, Paraguay]",[Belgium],1,Mexico,Paraguay,Belgium
2,1986,Group C,0,initial,,,,"[Hungary, Soviet Union, France]",[Hungary],1,Soviet Union,France,Hungary
3,1986,Group D,0,initial,,,,"[Spain, Northern Ireland, Brazil]",[Northern Ireland],1,Brazil,Spain,Northern Ireland
4,1986,Group D,1,15,Northern Ireland,Brazil,Brazil,"[Spain, Algeria, Brazil]",[Algeria],1,Brazil,Spain,Algeria
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,2022,Group G,1,35,Serbia,Switzerland,Serbia,"[Serbia, Brazil]",[Switzerland],1,Brazil,Serbia,Switzerland
113,2022,Group G,2,44,Serbia,Switzerland,Switzerland,"[Brazil, Switzerland]",[Cameroon],1,Brazil,Switzerland,Cameroon
114,2022,Group H,0,initial,,,,"[Ghana, Portugal]",[South Korea],1,Portugal,Ghana,South Korea
115,2022,Group H,1,26,Ghana,Uruguay,Uruguay,"[Portugal, Uruguay]",[Ghana],1,Portugal,Uruguay,Ghana


In [271]:
# Write the composition-change table to the project's output folder, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\standings_wc_fifa_men.xlsx'
final_composition_changes_df.to_excel(excel_writer=file_path, index=False)


# best four third placed

In [272]:
# Build the third-placed-teams table with best_four_third_placed_wc_men
final_df = best_four_third_placed_wc_men(
    goals_last_day_sorted,
    all_games_before_last,
    agg_goals_before_last_day,
)

# Save it to the project's output folder as an Excel file, without the index column
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\wc\third_teams_wc_fifa_men.xlsx'
final_df.to_excel(excel_writer=file_path, index=False)


--- Processing Year: 1986 ---

=== Initial Standings for Year 1986 Before Processing Any Goals ===

            team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
       Argentina Group A                        3                              4                                2                      2
        Bulgaria Group A                        2                              2                                2                      0
           Italy Group A                        2                              2                                2                      0
     South Korea Group A                        1                              2                                4                     -2
          Mexico Group B                        3                              3                                2                      1
        Paraguay Group B                        3                            

In [273]:
# Sanity check: number of rows in goals_last_day_sorted for selected tournaments.
# One loop replaces three copy-pasted count/print stanzas.
# NOTE(review): the recorded output shows far fewer rows for 1990 (11) than for
# 1986 (38) or 1994 (116) — worth confirming that the 1990 data is complete.
years_to_check = (1986, 1990, 1994)
rows_per_year = {
    checked_year: goals_last_day_sorted[goals_last_day_sorted['year'] == checked_year].shape[0]
    for checked_year in years_to_check
}

# Keep the per-year variable names available for any later cell that uses them
count_1986, count_1990, count_1994 = (rows_per_year[y] for y in years_to_check)

for checked_year, row_count in rows_per_year.items():
    print(f"Number of observations in goals_last_day_sorted for the year {checked_year}: {row_count}")


Number of observations in goals_last_day_sorted for the year 1986: 38
Number of observations in goals_last_day_sorted for the year 1990: 11
Number of observations in goals_last_day_sorted for the year 1994: 116
