# Libraries

In [41]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [42]:
# Look up the OS login name; it is interpolated into the file paths used in this notebook.
user = getuser()

# Absolute path to the helper notebook inside the local GitHub checkout.
# NOTE(review): the path assumes a Windows profile under C:/Users/<login> with
# the repository cloned to Documents/GitHub — confirm before running elsewhere.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Execute the helper notebook with IPython's %run magic; `$function_path` is
# expanded to the value of the `function_path` variable.
# NOTE(review): the helpers called later (process_goals_data, fifa_before_last,
# fifa_final_wc, track_composition_changes, ...) are not defined in this
# notebook, so they presumably come from this run — confirm.
%run $function_path

In [None]:

# Location of the goal-level Excel workbook (eu_goals_men.xlsx) in the local checkout.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_men.xlsx'

# Load the workbook with pandas' default read_excel options.
df = pd.read_excel(data_path)

# Keep tournaments from 1984 onwards (1984 itself is included). The explicit
# .copy() makes the filtered frame independent of the full table, so the column
# assignments in the cleaning cells do not raise SettingWithCopyWarning.
df = df[df['year'] >= 1984].copy()

# Preview the first rows as a sanity check.
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
93,Group 1,1984,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
94,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
95,Group 1,1984,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
96,Group 1,1984,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,4,0,5,0,0,0,0,0,1984-06-16,16 June 1984
97,Group 1,1984,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,...,74,0,5,0,0,1,0,0,1984-06-16,16 June 1984


# Clean, transform, create variables

## time

In [44]:
# Kick-off strings may carry a time-zone abbreviation (e.g. EEST, PST): strip
# every run of letters and trim the surrounding whitespace.
kickoff_text = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()
df['time_cleaned'] = kickoff_text

# From the cleaned text keep the first HH:MM match as the local kick-off time
# and whatever sits inside the first pair of parentheses as the UTC offset.
df['local_time'] = kickoff_text.str.extract(r'(\d{2}:\d{2})')
df['utc_time_offset'] = kickoff_text.str.extract(r'\((.*?)\)')


## date

In [45]:
# Normalise 'long_date' by replacing a comma (plus any whitespace) that directly
# precedes a four-digit year with a single space, e.g. "June 12, 1984" -> "June 12 1984".
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Parse the long form (unparseable entries become NaT), render it as YYYY-MM-DD
# and use it only for the rows whose 'short_date' is missing.
iso_from_long = pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d')
df['short_date'] = df['short_date'].fillna(iso_from_long)


# Extract relevant columns

In [46]:
# Drop the knockout rounds so that only the remaining (group) stages are kept.
knockout_stages = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
is_knockout = df['stage'].isin(knockout_stages)
df = df[~is_knockout]

# One row per goal event, restricted to the match and goal fields listed here,
# with exact duplicate rows removed.
goal_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]
goals_df = df[goal_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
93,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
94,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
95,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0
96,1984,Group 1,France,Belgium,France,4,1984-06-16,17:15,5–0
97,1984,Group 1,France,Belgium,France,74,1984-06-16,17:15,5–0


# team counts

In [47]:

def _teams_in_group(group_matches):
    """Return the distinct team names that appear as home or away side."""
    return pd.concat([group_matches['home_team'], group_matches['away_team']]).unique()


# One row per (year, stage) holding the array of teams seen in that group;
# reset_index() labels the unnamed value column 0, which is renamed here.
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(_teams_in_group)
    .reset_index()
    .rename(columns={0: 'team_list'})
)

# Group size = number of distinct teams in the list.
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Sanity check: list every group that does not contain exactly four teams.
invalid_groups = team_counts[team_counts['team_count'] != 4]
display(invalid_groups)



Unnamed: 0,year,stage,team_list,team_count


# Recreate League Table after first two matchdays

In [48]:

# Pass the de-duplicated goal events to the external helper process_goals_data,
# which returns two objects.
# NOTE(review): judging by the names, the first holds goals aggregated over the
# matches before the final matchday and the second the final-matchday goals in
# sorted order — confirm against the helper's definition.
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [49]:
# Derive the home-side and away-side tables from agg_goals_before_last_day.
# NOTE(review): aggregate_home_away_points is defined outside this notebook;
# its aggregation and points rules should be confirmed there.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [50]:
# Spot check: home-side rows for the 1992 tournament, 'Group 1'.
home_mask_1992 = (home_games['year'] == 1992) & (home_games['stage'] == 'Group 1')
home1992 = home_games[home_mask_1992]
home1992

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
12,1992,Group 1,Sweden,2,1,3,2


In [51]:
# Spot check: away-side rows for the 1992 tournament, 'Group 1'.
away_mask_1992 = (away_games['year'] == 1992) & (away_games['stage'] == 'Group 1')
away1992 = away_games[away_mask_1992]
away1992

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
11,1992,Group 1,Denmark,0,1,0,1
12,1992,Group 1,France,1,1,1,1


## aggregate data after first two match days

In [52]:
# Build the table of all teams before the last matchday with the external
# helper fifa_before_last, from the home/away tables, the aggregated goals and
# the per-group team lists.
# NOTE(review): ranking and tiebreak rules live in the helper — confirm there.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

In [53]:
# Collect the distinct team names of every (year, stage) group.
teams_by_group = all_games_before_last.groupby(['year', 'stage'])['team']
unique_teams_per_stage = teams_by_group.apply(lambda names: names.unique()).reset_index()
unique_teams_per_stage = unique_teams_per_stage.rename(columns={'team': 'unique_teams'})

# Size of each group = length of its team array.
unique_teams_per_stage['team_count'] = unique_teams_per_stage['unique_teams'].apply(len)

# Sanity check: list every group that does not contain exactly four teams.
invalid_groups = unique_teams_per_stage[unique_teams_per_stage['team_count'] != 4]
display(invalid_groups)

Unnamed: 0,year,stage,unique_teams,team_count


In [54]:
# Spot check: rows of all_games_before_last for the 1992 tournament, 'Group 1'.
in_1992 = all_games_before_last['year'] == 1992
in_group_1 = all_games_before_last['stage'] == 'Group 1'
group1992 = all_games_before_last[in_1992 & in_group_1]

display(group1992)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,standing
16,1992,Group 1,Sweden,2,1,3,1,2,no need,1
17,1992,Group 1,France,1,1,2,0,2,no need,2
18,1992,Group 1,England,0,0,2,0,2,no need,3
19,1992,Group 1,Denmark,0,1,1,-1,2,no need,4


# Recreate league table after last match day

In [55]:
# Every (year, stage) combination present in all_games_before_last.
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run the external helper fifa_final_wc once per group and gather its outputs.
all_results = []
for _, row in unique_pairs.iterrows():
    year, group_name = row['year'], row['stage']
    result = fifa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    all_results.append(result)

# Stack the per-group frames and keep only the reporting columns.
report_columns = ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_wc = pd.concat(all_results)[report_columns]

display(changes_df_wc)



=== Initial Standings for Year 1984, Group Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
    France             4                   6                     0                      6                        4                          1
   Denmark             2                   5                     1                      4                        2                          2
   Belgium             2                   2                     5                     -3                        2                          3
Yugoslavia             0                   0                     7                     -7                        0                          4


Analyzing Group 1, year 1984, goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Updated Standings After This Goal ===

      team  total_points  total_goals_scored  total_goals_concede

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1984,Group 1,France,1,0,0,0,1
1,1984,Group 1,Denmark,0,2,1,0,3
2,1984,Group 1,Belgium,0,1,2,0,3
3,1984,Group 1,Yugoslavia,0,0,0,2,2
5,1984,Group 2,Spain,1,1,1,0,3
...,...,...,...,...,...,...,...,...
171,2024,Group E,Ukraine,0,0,1,2,3
172,2024,Group F,Portugal,1,0,0,0,1
173,2024,Group F,Turkey,0,2,1,0,3
175,2024,Group F,Georgia,0,1,2,1,4


In [56]:
# Write changes_df_wc to an Excel workbook in the project's output folder,
# leaving out the DataFrame index.
# NOTE(review): the target directory presumably has to exist already — confirm.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\tb_eu_fifa_men.xlsx'
changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [57]:
# Every (year, stage) combination present in all_games_before_last.
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Call the external helper track_composition_changes once per group.
all_composition_changes = []
for _, row in unique_pairs.iterrows():
    year, stage = row['year'], row['stage']
    composition_changes_df = track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    all_composition_changes.append(composition_changes_df)

# Stack the per-group frames under a fresh 0..n-1 index and expose the
# 'player_team' column as 'scorer_nationality'.
final_composition_changes_df = (
    pd.concat(all_composition_changes, ignore_index=True)
    .rename(columns={'player_team': 'scorer_nationality'})
)



In [58]:
# final_composition_changes_df is already built by the preceding cell (per-group
# track_composition_changes calls, concatenation, 'player_team' ->
# 'scorer_nationality' rename). This cell used to repeat that whole computation
# verbatim; it now only shows the existing table.
# NOTE(review): assumes track_composition_changes does not modify its inputs,
# so a second run would have produced the same frame — confirm.
display(final_composition_changes_df)


Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_nationality,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd
0,1984,Group 1,0,initial,,,,"[France, Denmark, Belgium]",[Belgium],1,France,Denmark,Belgium
1,1984,Group 2,0,initial,,,,"[Spain, West Germany, Portugal]",[Portugal],1,West Germany,Spain,Portugal
2,1988,Group 1,0,initial,,,,"[Spain, Italy, West Germany]",[Spain],1,West Germany,Italy,Spain
3,1988,Group 2,0,initial,,,,"[Netherlands, Republic of Ireland, Soviet Union]",[Netherlands],1,Republic of Ireland,Soviet Union,Netherlands
4,1992,Group 1,0,initial,,,,"[France, England, Sweden]",[England],1,Sweden,France,England
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,2024,Group E,1,24,Slovakia,Romania,Slovakia,"[Slovakia, Belgium]",[Ukraine],1,Slovakia,Belgium,Ukraine
97,2024,Group E,2,37,Slovakia,Romania,Romania,"[Romania, Belgium]",[Slovakia],1,Romania,Belgium,Slovakia
98,2024,Group F,0,initial,,,,"[Turkey, Portugal]",[Czech Republic],1,Portugal,Turkey,Czech Republic
99,2024,Group F,1,66,Czech Republic,Turkey,Czech Republic,"[Georgia, Portugal]",[Turkey],1,Portugal,Georgia,Turkey


In [59]:
# Write final_composition_changes_df to an Excel workbook in the project's
# output folder, leaving out the DataFrame index.
# NOTE(review): the target directory presumably has to exist already — confirm.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\standings_eu_fifa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [60]:
# Build the third-placed-teams table with the external helper
# best_four_third_placed_eu_men, from the final-matchday goals and the table
# before the last matchday.
# NOTE(review): the saved output shows "Processing Year" lines only for 2016,
# 2021 and 2024, so the helper presumably restricts itself to those
# tournaments — confirm in its definition.
final_df = best_four_third_placed_eu_men(goals_last_day_sorted, all_games_before_last)

# Save the result as an Excel workbook without the DataFrame index.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\fifa\eu\third_teams_eu_fifa_men.xlsx'
final_df.to_excel(file_path, index=False)


--- Processing Year: 2016 ---

--- Processing Year: 2021 ---

--- Processing Year: 2024 ---


In [61]:
# Row counts of goals_last_day_sorted for the 2016, 2021 and 2024 tournaments.
last_day_years = goals_last_day_sorted['year']

# 2016
count_2016 = int((last_day_years == 2016).sum())
print(f"Number of observations in goals_last_day_sorted for the year 2016: {count_2016}")

# 2021
count_2021 = int((last_day_years == 2021).sum())
print(f"Number of observations in goals_last_day_sorted for the year 2021: {count_2021}")

# 2024
count_2024 = int((last_day_years == 2024).sum())
print(f"Number of observations in goals_last_day_sorted for the year 2024: {count_2024}")


Number of observations in goals_last_day_sorted for the year 2016: 22
Number of observations in goals_last_day_sorted for the year 2021: 38
Number of observations in goals_last_day_sorted for the year 2024: 24
