# Libraries

In [19]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [20]:
# Setup: resolve the OS login name; it is interpolated into every absolute
# path used by this notebook (helper notebook, input workbook, output files).
user = getuser()

# Absolute location of the helper notebook inside the user's local clone.
# NOTE(review): the path is Windows-specific and assumes the repository lives
# under C:/Users/<login>/Documents/GitHub — confirm before running elsewhere.
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_uefa.ipynb"

# Execute the helper notebook via the IPython %run magic ($name expands the
# Python variable). Later cells call process_goals_data,
# aggregate_home_away_points, uefa_before_last, uefa_final_euro,
# track_composition_changes and best_two_third_placed_wc_women, none of which
# are defined in this notebook — presumably they are loaded here; verify.
%run $function_path


In [21]:


# Absolute Windows path of the input workbook, built from the login name in `user`.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_women.xlsx'

# Read the workbook into the raw DataFrame that the cleaning cells below modify.
df = pd.read_excel(data_path)

# Preview the first rows to verify the import. In the preview, consecutive rows
# repeat the same match with different goal_minute values, so the data appear
# to hold one row per goal — confirm against the full file.
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group A,1991,20:45,China,Norway,4–0,Tianhe Stadium,Guangzhou,65000,Salvador Imperatore,...,22,0,4,0,0,0,0,0,1991-11-16,16 November 1991
1,Group A,1991,20:45,China,Norway,4–0,Tianhe Stadium,Guangzhou,65000,Salvador Imperatore,...,45,0,4,0,0,0,0,0,1991-11-16,16 November 1991
2,Group A,1991,20:45,China,Norway,4–0,Tianhe Stadium,Guangzhou,65000,Salvador Imperatore,...,50,0,4,0,0,0,0,0,1991-11-16,16 November 1991
3,Group A,1991,20:45,China,Norway,4–0,Tianhe Stadium,Guangzhou,65000,Salvador Imperatore,...,75,0,4,0,0,0,0,0,1991-11-16,16 November 1991
4,Group A,1991,19:45,Denmark,New Zealand,3–0,Tianhe Stadium,Guangzhou,14000,Omer Yengo,...,15,0,3,0,0,0,0,0,1991-11-17,17 November 1991


# Clean, transform, create variables

## time

In [22]:
# Strip every run of ASCII letters (time-zone abbreviations such as EEST or PST)
# and trim the surrounding whitespace. Letters inside parentheses are removed as
# well, so a suffix like "(UTC+8)" is reduced to "(+8)".
time_without_letters = df['time'].str.replace(r'[A-Za-z]+', '', regex=True)
df['time_cleaned'] = time_without_letters.str.strip()

# From the cleaned text keep:
#   local_time      -> the first two-digit HH:MM occurrence
#   utc_time_offset -> whatever sits between the first pair of parentheses
# Rows without a match get NaN in the corresponding column.
cleaned_time = df['time_cleaned']
df['local_time'] = cleaned_time.str.extract(r'(\d{2}:\d{2})')
df['utc_time_offset'] = cleaned_time.str.extract(r'\((.*?)\)')


## date

In [23]:
# Normalise 'long_date': a comma (plus optional whitespace) directly before a
# four-digit year is replaced by a single space, e.g. "16 November, 1991"
# becomes "16 November 1991".
comma_before_year = r',\s*(\d{4})'
df['long_date'] = df['long_date'].str.replace(comma_before_year, r' \1', regex=True)

# Back-fill missing 'short_date' values with the ISO (YYYY-MM-DD) rendering of
# 'long_date'. Values that cannot be parsed are coerced to NaT and therefore
# leave the gap unfilled.
parsed_long_date = pd.to_datetime(df['long_date'], errors='coerce')
iso_from_long_date = parsed_long_date.dt.strftime('%Y-%m-%d')
df['short_date'] = df['short_date'].fillna(iso_from_long_date)


# Extract relevant columns

In [24]:
# Stage labels that mark knockout rounds; rows carrying them are dropped so
# that only the remaining stages are analysed.
# NOTE(review): no third-place-match label is listed here — confirm whether the
# 'stage' column contains one, since it would pass through this filter.
knockout_stages = ['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final']
is_knockout = df['stage'].isin(knockout_stages)
df = df[~is_knockout]

# Keep only the match and goal fields needed downstream, then collapse rows
# that are exact duplicates across those fields.
goal_columns = [
    'year', 'stage', 'home_team', 'away_team', 'scorer_nationality',
    'goal_minute', 'short_date', 'local_time', 'score',
]
goals_df = df[goal_columns].drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1991,Group A,China,Norway,China,22,1991-11-16,20:45,4–0
1,1991,Group A,China,Norway,China,45,1991-11-16,20:45,4–0
2,1991,Group A,China,Norway,China,50,1991-11-16,20:45,4–0
3,1991,Group A,China,Norway,China,75,1991-11-16,20:45,4–0
4,1991,Group A,Denmark,New Zealand,Denmark,15,1991-11-17,19:45,3–0


# Recreate League Table after first two matchdays

In [25]:

# Split the cleaned goal table (goals_df, built in the previous cell) with the
# external helper process_goals_data, which is not defined in this notebook.
# It returns two objects; judging by the names they are the goals aggregated
# before the last matchday and the sorted goal events of the last matchday —
# TODO confirm against the helper's definition.
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [26]:
# Build separate home and away aggregates with the external helper
# aggregate_home_away_points (not defined in this notebook). The spot checks
# below show one row per year/stage/team with goals_scored, goals_conceded and
# points_home / match_count_home (or points_away / match_count_away).
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [27]:
# Spot check: home-side aggregates for the 2007 edition, Group C.
is_2007_group_c_home = home_games['year'].eq(2007) & home_games['stage'].eq('Group C')
home2007c = home_games[is_2007_group_c_home]
home2007c

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
47,2007,Group C,Australia,1,1,1,1
48,2007,Group C,Canada,4,0,3,1
49,2007,Group C,Ghana,1,4,0,1
50,2007,Group C,Norway,2,1,3,1


In [28]:
# Spot check: away-side aggregates for the 2007 edition, Group C.
is_2007_group_c_away = away_games['year'].eq(2007) & away_games['stage'].eq('Group C')
away2007c = away_games[is_2007_group_c_away]
away2007c

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
47,2007,Group C,Australia,4,1,3,1
48,2007,Group C,Canada,1,2,0,1
49,2007,Group C,Ghana,0,4,0,1
50,2007,Group C,Norway,1,1,1,1


## aggregate data after first two matches 

In [29]:
# Combine the home and away aggregates into a single table with the external
# helper uefa_before_last (not defined in this notebook). The spot check below
# shows one row per year/stage/team with points, goals_difference,
# total_matches, tiebreaker, tie_won and standing columns.
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

In [30]:
# Spot check: combined table restricted to the 2007 edition, Group C.
in_2007 = all_games_before_last['year'].eq(2007)
in_group_c = all_games_before_last['stage'].eq('Group C')
group2007c = all_games_before_last[in_2007 & in_group_c]

display(group2007c)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
68,2007,Group C,Australia,5,2,4,3,2,Australia,1,1
69,2007,Group C,Norway,3,2,4,1,2,Australia,0,2
70,2007,Group C,Canada,5,2,3,3,2,no need,0,3
71,2007,Group C,Ghana,1,8,0,-7,2,no need,0,4


# Recreate league table after last match day

### uefa criteria 

In [31]:
# Run the external helper uefa_final_euro once per group and stack the results.
# The helper is not defined in this notebook; it prints the standings before the
# last match and after each analysed goal (see the log output of this cell).

# Accumulates one result frame per (year, stage) group
all_results = []

# Every distinct (year, stage) combination identifies one group to process
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Process the groups one at a time
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']

    # Apply the helper to the current group
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)

    # Keep the per-group frame for the final concatenation
    all_results.append(result)

# Stack all groups into a single DataFrame. ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each per-group frame's own
# labels, matching how the composition-tracking cell concatenates its results.
changes_df_euro = pd.concat(all_results, ignore_index=True)

# Keep only the columns that are exported
changes_df_euro = changes_df_euro[['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1991, Group A Before Last Match Goals ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
      China             4                   6                     2                      4                        3                          1
    Denmark             4                   5                     2                      3                        3                          2
     Norway             3                   4                     4                      0                        2                          3
New Zealand             1                   0                     7                     -7                        0                          4


Analyzing goal: 14 minute, Player team: Norway, Home: Norway, Away: Denmark

=== Updated Standings After This Goal ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_differen

In [32]:
# Export changes_df_euro to an Excel workbook in the local clone's
# data\out\wiki folder; index=False leaves the DataFrame index out of the sheet.
# NOTE(review): the path is absolute and Windows-specific, built from `user`.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\tb_wc_uefa_women.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [33]:
# Every distinct (year, stage) combination identifies one group to process
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Run the external helper track_composition_changes once per group and collect
# the frame it returns; the helper also prints the standings it evaluates.
all_composition_changes = []
for year, stage in unique_pairs.itertuples(index=False, name=None):
    all_composition_changes.append(
        track_composition_changes(
            year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day
        )
    )

# Stack the per-group frames under one fresh 0..n-1 index
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Show the combined table
display(final_composition_changes_df)




=== Initial Standings for Group A, 1991 (with 0-0 points added) ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
      China             4                   6                     2                      4                          1
    Denmark             4                   5                     2                      3                          2
     Norway             3                   4                     4                      0                          3
New Zealand             1                   0                     7                     -7                          4



=== Standings after goal at minute 14 in Group A, edition 1991 ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing  tied_won
     Norway             5                   5                     4                      1                   1         0
      China             4    

Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,1st,2nd,3rd,scorer_nationality
0,1991,Group A,0,initial,,,,"[Denmark, Norway, China]",China,Denmark,Norway,
1,1991,Group B,0,initial,,,,"[Brazil, United States, Sweden]",United States,Sweden,Brazil,
2,1991,Group C,0,initial,,,,"[Germany, Nigeria, Italy]",Germany,Italy,Nigeria,
3,1991,Group C,1,38,Chinese Taipei,Nigeria,,"[Germany, Italy, Chinese Taipei]",Germany,Italy,Chinese Taipei,Chinese Taipei
4,1995,Group A,0,initial,,,,"[Germany, Sweden]",Germany,Sweden,Brazil,
...,...,...,...,...,...,...,...,...,...,...,...,...
69,2023,Group G,3,92,South Africa,Italy,,"[South Africa, Sweden]",Sweden,South Africa,Italy,South Africa
70,2023,Group H,0,initial,,,,"[Germany, Colombia]",Colombia,Germany,Morocco,
71,2023,Group H,1,6,South Korea,Germany,,"[Colombia, Morocco]",Colombia,Morocco,Germany,South Korea
72,2023,Group H,2,42,South Korea,Germany,,"[Germany, Colombia]",Colombia,Germany,Morocco,Germany


In [34]:
# Export the combined composition-change table to an Excel workbook in the
# local clone's data\out\wiki folder; index=False leaves the index out.
# NOTE(review): the path is absolute and Windows-specific, built from `user`.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\standings_wc_uefa_women.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# Best two third-placed teams in 1991

In [35]:
# Build the third-placed-teams table with the external helper
# best_two_third_placed_wc_women (not defined in this notebook). The helper
# prints its progress per year, starting with the initial standings.
final_df = best_two_third_placed_wc_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Export the result to an Excel workbook in the local clone's data\out\wiki
# folder; index=False leaves the DataFrame index out of the sheet.
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\third_teams_wc_uefa_women.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 1991 ---

=== Initial Standings for Year 1991 Before Processing Any Goals ===

          team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
         China Group A                        3                              6                                2                      4
       Denmark Group A                        3                              5                                2                      3
        Norway Group A                        2                              4                                4                      0
   New Zealand Group A                        0                              0                                7                     -7
 United States Group B                        4                              8                                2                      6
        Sweden Group B                        2                             10           

In [36]:
# Sanity check: number of rows in goals_last_day_sorted whose year is 1991
rows_1991 = goals_last_day_sorted[goals_last_day_sorted['year'].eq(1991)]
count_1991 = len(rows_1991)
print(f"Number of observations in goals_last_day_sorted for the year 1991: {count_1991}")




Number of observations in goals_last_day_sorted for the year 1991: 21
