# Libraries

In [118]:
from collections import defaultdict
from datetime import datetime, timedelta
from getpass import getuser
from pathlib import Path

import pandas as pd

# Load and inspect dataset

In [119]:
# Resolve the login name of the current OS user; it is interpolated into the path below
user = getuser()

# Absolute, per-user path to the helper notebook
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_fifa.ipynb"

# Execute the helper notebook through IPython's %run magic.
# NOTE(review): presumably this is what defines the helper functions called in later cells
# (process_goals_data, fifa_before_last, ...) — confirm against functions_fifa.ipynb.
%run $function_path

In [120]:
# Get the current user's name (repeated so this cell can be run on its own)
user = getuser()

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_women.xlsx'

# Read the goal-level dataset from the Excel workbook
df = pd.read_excel(data_path)

# Keep only tournaments after 1984.
# An explicit copy is taken because later cells add and overwrite columns on `df`;
# assigning into a boolean-filtered slice is the pattern that triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df = df[df['year'] > 1984].copy()

display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
14,,1987,,Norway,Italy,2–0,Ullevaal Stadion,Oslo,5154,Iceland,...,40,0,2,0,0,0,0,0,,11 June 1987
15,,1987,,Norway,Italy,2–0,Ullevaal Stadion,Oslo,5154,Iceland,...,73,0,2,0,0,0,0,0,,11 June 1987
16,,1987,,Sweden,England,3–2,Melløs Stadion,Moss,300,Michał Listkiewicz,...,32,1,3,2,0,0,0,0,,11 June 1987
17,,1987,,Sweden,England,3–2,Melløs Stadion,Moss,300,Michał Listkiewicz,...,50,1,3,2,0,0,0,0,,11 June 1987
18,,1987,,Sweden,England,3–2,Melløs Stadion,Moss,300,Michał Listkiewicz,...,100,1,3,2,0,0,0,1,,11 June 1987


# Clean, transform, create variables

## time

In [121]:
# Strip every run of letters (time-zone labels such as EEST or PST) and trim whitespace
time_no_letters = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()
df['time_cleaned'] = time_no_letters

# First HH:MM token found in the cleaned text -> local kick-off time
df['local_time'] = time_no_letters.str.extract(r'(\d{2}:\d{2})', expand=False)

# Whatever is enclosed in the first pair of parentheses -> UTC offset text
df['utc_time_offset'] = time_no_letters.str.extract(r'\((.*?)\)', expand=False)


## date

In [122]:
# Drop the comma that precedes a four-digit year ("11 June, 1987" -> "11 June 1987")
long_date_clean = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)
df['long_date'] = long_date_clean

# Parse the cleaned text (values that cannot be parsed become NaT) and format as YYYY-MM-DD
iso_dates = pd.to_datetime(long_date_clean, errors='coerce').dt.strftime('%Y-%m-%d')

# Existing 'short_date' values are kept; only the missing ones receive the derived date
df['short_date'] = df['short_date'].fillna(iso_dates)


# Extract relevant columns

In [123]:
# Drop every row whose stage is one of the listed knockout rounds
knockout_stage_labels = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
is_knockout_row = df['stage'].isin(knockout_stage_labels)
df = df[~is_knockout_row]

# Columns describing each goal event and the match it belongs to
goal_event_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]

# Select those columns and collapse rows that are exact duplicates
goals_df = df[goal_event_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
14,1987,,Norway,Italy,Norway,40,1987-06-11,,2–0
15,1987,,Norway,Italy,Norway,73,1987-06-11,,2–0
16,1987,,Sweden,England,Sweden,32,1987-06-11,,3–2
17,1987,,Sweden,England,Sweden,50,1987-06-11,,3–2
18,1987,,Sweden,England,Sweden,100,1987-06-11,,3–2


# Recreate League Table after first two matchdays

In [124]:

# Pass the de-duplicated goal events to process_goals_data (not defined in this view —
# presumably loaded by the %run of functions_fifa.ipynb; confirm).
# Its two return values are unpacked as agg_goals_before_last_day and goals_last_day_sorted.
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [125]:
# aggregate_home_away_points is defined outside this view; its two return values
# are kept as home_games and away_games and inspected in the following cells.
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [126]:
# Spot check: rows of home_games for year == 2017 and stage == 'Group D'
home_is_2017_group_d = (home_games['year'] == 2017) & (home_games['stage'] == 'Group D')
home2017d = home_games[home_is_2017_group_d]
home2017d

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
48,2017,Group D,England,8,0,6,2
49,2017,Group D,Scotland,1,2,0,1
50,2017,Group D,Spain,2,0,3,1


In [127]:
# Spot check: rows of away_games for year == 2017 and stage == 'Group D'
away_is_2017_group_d = (away_games['year'] == 2017) & (away_games['stage'] == 'Group D')
away2017d = away_games[away_is_2017_group_d]
away2017d

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
47,2017,Group D,Portugal,2,3,3,2
48,2017,Group D,Scotland,0,6,0,1
49,2017,Group D,Spain,0,2,0,1


## aggregate data after first two match days

In [128]:
# Pass the home/away aggregates and the pre-last-day goals to fifa_before_last
# (defined outside this view). The inputs are derived from the dataset loaded above,
# not from mock data. The result is filtered on 'year' and 'stage' in the next cell.
all_games_before_last = fifa_before_last(home_games, away_games, agg_goals_before_last_day)

  all_games_before_last['standing'] = all_games_before_last.groupby(['year', 'stage']).apply(


In [129]:
# Spot check: rows of all_games_before_last for year == 2017 and stage == 'Group D'
in_year_2017 = all_games_before_last['year'] == 2017
in_group_d = all_games_before_last['stage'] == 'Group D'
group2017d = all_games_before_last[in_year_2017 & in_group_d]

display(group2017d)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,standing
66,2017,Group D,England,8,0,6,8,2,no need,1
67,2017,Group D,Spain,2,2,3,0,2,no need,2
68,2017,Group D,Portugal,2,3,3,-1,2,no need,3
69,2017,Group D,Scotland,1,8,0,-7,2,no need,4


# Recreate league table after last match day

In [130]:
# Distinct (year, stage) combinations present in the pre-last-day table
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Call fifa_final_wc once per combination, in table order, and collect what it returns
all_results = []
for year, group_name in unique_pairs.itertuples(index=False, name=None):
    all_results.append(
        fifa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    )

# Stack the per-combination frames and keep only the reporting columns
report_columns = ['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']
changes_df_wc = pd.concat(all_results)[report_columns]

# Show the combined result
display(changes_df_wc)



=== Initial Standings for Year 1993, Group Semifinals Before Last Match Goals ===

   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
 Norway             4                   1                     0                      1                        4                          1
Denmark             1                   0                     1                     -1                        1                          2


Analyzing Semifinals, year 1993, goal: 56 minute, Player team: Germany, Home: Italy, Away: Germany

=== Updated Standings After This Goal ===

   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_points  last_game_standing  changes  1st  2nd  3rd  4th  tied  tied_won
 Norway             5                   1                     0                      1                 1                   1        1    1    0    0    0 False         0
Denmark       

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1993,Semifinals,Norway,1,0,0,0,1
1,1993,Semifinals,Denmark,0,1,0,0,1
2,1995,Semifinals,Germany,1,0,0,0,1
4,1995,Semifinals,Sweden,0,1,1,0,2
3,1995,Semifinals,Norway,0,1,1,0,2
...,...,...,...,...,...,...,...,...
80,2022,Group C,Portugal,0,0,1,1,2
82,2022,Group D,France,1,0,0,0,1
84,2022,Group D,Belgium,0,1,1,0,2
83,2022,Group D,Iceland,0,1,1,0,2


In [131]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\women\fifa\eu\tb_eu_fifa_women.xlsx'

# to_excel does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there)
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
changes_df_wc.to_excel(file_path, index=False)


# group composition tracking

In [132]:
# Distinct (year, stage) combinations to process
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Collect one frame per combination from track_composition_changes
all_composition_changes = []
for year, stage in unique_pairs.itertuples(index=False, name=None):
    composition_changes_df = track_composition_changes(
        year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day
    )
    all_composition_changes.append(composition_changes_df)

# Stack everything into one frame with a fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Show the combined result
display(final_composition_changes_df)



Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_nationality,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd
0,1993,Semifinals,0,initial,,,,"[Denmark, Norway]",[],0,Norway,Denmark,
1,1995,Semifinals,0,initial,,,,"[Germany, Norway]",[Sweden],1,Germany,Norway,Sweden
2,1995,Semifinals,1,61,Sweden,Norway,Sweden,"[Germany, Sweden]",[Norway],1,Germany,Sweden,Norway
3,1997,Group A,0,initial,,,,"[France, Sweden]",[Spain],1,Sweden,France,Spain
4,1997,Group A,1,67,Russia,Spain,Spain,"[Sweden, Spain]",[France],1,Sweden,Spain,France
5,1997,Group B,0,initial,,,,"[Norway, Italy]",[Germany],1,Norway,Italy,Germany
6,1997,Group B,1,82,Denmark,Germany,Germany,"[Germany, Italy]",[Norway],1,Italy,Germany,Norway
7,2001,Group A,0,initial,,,,"[Germany, Sweden]",[England],1,Germany,Sweden,England
8,2001,Group B,0,initial,,,,"[Norway, Italy]",[Denmark],1,Norway,Italy,Denmark
9,2001,Group B,1,37,France,Italy,France,"[Denmark, Norway]",[Italy],1,Norway,Denmark,Italy


In [133]:
# Distinct (year, stage) combinations to process
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Collect one frame per combination from track_composition_changes
all_composition_changes = []
for year, group_name in zip(unique_pairs['year'], unique_pairs['stage']):
    composition_changes_df = track_composition_changes(
        year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day
    )
    all_composition_changes.append(composition_changes_df)

# Stack into one frame with a fresh index; a 'player_team' column, when present,
# is exposed under the name 'scorer_team'
final_composition_changes_df = (
    pd.concat(all_composition_changes, ignore_index=True)
    .rename(columns={'player_team': 'scorer_team'})
)

# Show the combined result
display(final_composition_changes_df)


Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_nationality,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd
0,1993,Semifinals,0,initial,,,,"[Denmark, Norway]",[],0,Norway,Denmark,
1,1995,Semifinals,0,initial,,,,"[Germany, Norway]",[Sweden],1,Germany,Norway,Sweden
2,1995,Semifinals,1,61,Sweden,Norway,Sweden,"[Germany, Sweden]",[Norway],1,Germany,Sweden,Norway
3,1997,Group A,0,initial,,,,"[France, Sweden]",[Spain],1,Sweden,France,Spain
4,1997,Group A,1,67,Russia,Spain,Spain,"[Sweden, Spain]",[France],1,Sweden,Spain,France
5,1997,Group B,0,initial,,,,"[Norway, Italy]",[Germany],1,Norway,Italy,Germany
6,1997,Group B,1,82,Denmark,Germany,Germany,"[Germany, Italy]",[Norway],1,Italy,Germany,Norway
7,2001,Group A,0,initial,,,,"[Germany, Sweden]",[England],1,Germany,Sweden,England
8,2001,Group B,0,initial,,,,"[Norway, Italy]",[Denmark],1,Norway,Italy,Denmark
9,2001,Group B,1,37,France,Italy,France,"[Denmark, Norway]",[Italy],1,Norway,Denmark,Italy


In [134]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\women\fifa\eu\standings_eu_fifa_women.xlsx'

# to_excel does not create missing folders, so make sure the output directory
# exists first (no-op when it is already there)
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
final_composition_changes_df.to_excel(file_path, index=False)


# Best two third-placed teams

In [135]:
# Build the table to export with best_two_third_placed_eu_women (defined outside this view;
# presumably the ranking of third-placed teams — confirm against functions_fifa.ipynb)
final_df = best_two_third_placed_eu_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel.
# to_excel does not create missing folders, so the output directory is created
# first when needed (no-op when it is already there).
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\women\fifa\eu\third_teams_eu_fifa_women.xlsx'
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
final_df.to_excel(file_path, index=False)


--- Processing Year: 2009 ---

=== Initial Standings for Year 2009 Before Processing Any Goals ===

       team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
    Finland Group A                        6                              3                                1                      2
Netherlands Group A                        3                              3                                2                      1
    Denmark Group A                        3                              2                                2                      0
    Ukraine Group A                        0                              1                                4                     -3
    Germany Group B                        6                              9                                1                      8
     France Group B                        3                              4                                

In [136]:
# Number of rows in goals_last_day_sorted where the year is 2009
rows_2009 = goals_last_day_sorted.loc[goals_last_day_sorted['year'] == 2009]
count_2009 = len(rows_2009)
print(f"Number of observations in goals_last_day_sorted for the year 2009: {count_2009}")

# Number of rows in goals_last_day_sorted where the year is 2013
rows_2013 = goals_last_day_sorted.loc[goals_last_day_sorted['year'] == 2013]
count_2013 = len(rows_2013)
print(f"Number of observations in goals_last_day_sorted for the year 2013: {count_2013}")

Number of observations in goals_last_day_sorted for the year 2009: 11
Number of observations in goals_last_day_sorted for the year 2013: 13
