# Libraries

In [116]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [117]:
# Get the current user's name
user = getuser()

# Import the functions from the functions notebook
%run functions_uefa.ipynb


In [118]:


# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\eu_goals_women.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(data_path)

# Display the first few rows of the DataFrame to verify the import
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,First leg,1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,1000,Republic of Ireland,...,31,0,2,1,0,0,0,0,,8 April 1984
1,First leg,1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,1000,Republic of Ireland,...,51,0,2,1,0,0,0,0,,8 April 1984
2,First leg,1984,14:30,England,Denmark,2–1,Gresty Road,Crewe,1000,Republic of Ireland,...,49,0,2,1,0,1,0,0,,8 April 1984
3,First leg,1984,12:00,Italy,Sweden,2–3,Stadio Flaminio,Rome,5000,West Germany,...,18,0,2,3,0,0,0,0,,8 April 1984
4,First leg,1984,12:00,Italy,Sweden,2–3,Stadio Flaminio,Rome,5000,West Germany,...,31,0,2,3,0,0,0,0,,8 April 1984


# Clean, transfrom, create variables

## time

In [119]:
# Step 1: Clean time variable by removing any letters (e.g., EEST, PST) if they exist
df['time_cleaned'] = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()

# Step 2: Split time where there are parentheses into local time and UTC offset
df['local_time'] = df['time_cleaned'].str.extract(r'(\d{2}:\d{2})')  # extract the part before parentheses
df['utc_time_offset'] = df['time_cleaned'].str.extract(r'\((.*?)\)')  # extract the part inside parentheses


## Date

In [None]:
# Remove comma before the year in 'long_date' column
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Convert 'long_date' to datetime format and extract the date in "YYYY-MM-DD" format for missing 'short_date' values
df['short_date'] = df['short_date'].fillna(pd.to_datetime(df['long_date'], format="%d %B %Y").dt.strftime('%Y-%m-%d'))


# Extract relevant columns

In [121]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final', 'Knockout stage'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'short_date','local_time', 'score']]

# Remove duplicates
goals_df = goals_df.drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1984,First leg,England,Denmark,England,31,1984-04-08,14:30,2–1
1,1984,First leg,England,Denmark,England,51,1984-04-08,14:30,2–1
2,1984,First leg,England,Denmark,Denmark,49,1984-04-08,14:30,2–1
3,1984,First leg,Italy,Sweden,Italy,18,1984-04-08,12:00,2–3
4,1984,First leg,Italy,Sweden,Italy,31,1984-04-08,12:00,2–3


# Recreate Leauge Table after first two matchdays

In [122]:

# Assuming goals_df is your original dataset
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [123]:
agg_goals_before_last_day

Unnamed: 0,year,stage,home_team,away_team,local_time,short_date,goals_home,goals_away,original_score,calculated_score,score_match,won
0,1984,First leg,England,Denmark,14:30,2024-11-06,2,1,2–1,2-1,True,1
1,1984,First leg,Italy,Sweden,12:00,2024-11-06,2,3,2–3,2-3,True,-1
2,1984,Second leg,Denmark,England,14:00,2024-11-06,0,1,0–1,0-1,True,-1
3,1984,Second leg,Sweden,Italy,,NaT,2,1,2–1,2-1,True,1
4,1993,Semifinals,Norway,Denmark,,NaT,1,0,1–0,1-0,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...
90,2022,Group C,Sweden,Switzerland,17:00,2024-11-06,2,1,2–1,2-1,True,1
91,2022,Group D,Belgium,Iceland,17:00,2024-11-06,1,1,1–1,1-1,True,0
92,2022,Group D,France,Belgium,20:00,2024-11-06,2,1,2–1,2-1,True,1
93,2022,Group D,France,Italy,20:00,2024-11-06,5,1,5–1,5-1,True,1


In [124]:
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [125]:
# Filter for year == 2017 and stage == 'Group D'
home2017d = home_games[(home_games['year'] == 2017) & (home_games['stage'] == 'Group D')]
home2017d

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
59,2017,Group D,England,8,0,6,2
60,2017,Group D,Scotland,1,2,0,1
61,2017,Group D,Spain,2,0,3,1


In [126]:
# Filter for year == 2021 and stage == 'Group F'
away2017d = away_games[(away_games['year'] == 2017) & (away_games['stage'] == 'Group D')]
away2017d

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
58,2017,Group D,Portugal,2,3,3,2
59,2017,Group D,Scotland,0,6,0,1
60,2017,Group D,Spain,0,2,0,1


## aggregate data after first two matches following 

### UEFA criteria (first h2h, then aggregate)

In [127]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

In [128]:
# Filtering all_games_before_last for the year 2021 and stage 'Group F'
group2017d = all_games_before_last[
    (all_games_before_last['year'] == 2017) & 
    (all_games_before_last['stage'] == 'Group D')
]

display(group2017d)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
74,2017,Group D,England,8.0,0.0,6.0,8.0,2.0,no need,0,1
75,2017,Group D,Portugal,2.0,3.0,3.0,-1.0,2.0,Portugal,1,2
76,2017,Group D,Spain,2.0,2.0,3.0,0.0,2.0,Portugal,0,3
77,2017,Group D,Scotland,1.0,8.0,0.0,-7.0,2.0,no need,0,4


# Recreate league table after last match day

### uefa criteria 

In [129]:
# Initialize an empty list to store the results for each pair
all_results = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[[ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1984, First leg Before Last Match Goals ===

   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
 Sweden           4.0                 3.0                   2.0                    1.0                      3.0                          1
England           4.0                 2.0                   1.0                    1.0                      3.0                          2
  Italy           2.0                 2.0                   3.0                   -1.0                      1.0                          3
Denmark           2.0                 1.0                   2.0                   -1.0                      1.0                          4


Analyzing goal: 57 minute, Player team: Sweden, Home: Sweden, Away: England

=== Updated Standings After This Goal ===

   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_points  

In [130]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\tb_eu_uefa_women.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [131]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df = track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Display the final DataFrame
display(final_composition_changes_df)




=== Initial Standings for First leg, 1984 (with 0-0 points added) ===
   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
 Sweden           4.0                 3.0                   2.0                    1.0                          1
England           4.0                 2.0                   1.0                    1.0                          2
  Italy           2.0                 2.0                   3.0                   -1.0                          3
Denmark           2.0                 1.0                   2.0                   -1.0                          4



=== Standings after goal at minute 57 in First leg, edition 1984 ===
   team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing  tied_won
 Sweden           6.0                 4.0                   2.0                    2.0                   1         0
England           3.0                 2.0            

Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,1st,2nd,3rd,scorer_nationality
0,1984,First leg,0,initial,,,,"[Sweden, Italy, England]",Sweden,England,Italy,
1,1984,Second leg,0,initial,,,,"[Sweden, Italy, England]",Sweden,England,Italy,
2,1993,Semifinals,0,initial,,,,"[Norway, Denmark]",Norway,Denmark,,
3,1995,Semifinals,0,initial,,,,"[Germany, Norway]",Germany,Norway,Sweden,
4,1997,Group A,0,initial,,,,"[Sweden, France]",Sweden,France,Spain,
5,1997,Group A,1,67,Russia,Spain,,"[Sweden, Spain]",Sweden,Spain,France,Spain
6,1997,Group B,0,initial,,,,"[Italy, Norway]",Norway,Italy,Germany,
7,1997,Group B,1,82,Denmark,Germany,,"[Italy, Germany]",Italy,Germany,Norway,Germany
8,2001,Group stage,0,initial,,,,"[Sweden, Germany]",Germany,Sweden,Norway,
9,2005,First round,0,initial,,,,"[France, Germany]",Germany,France,Sweden,


In [132]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\standings_eu_uefa_women.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [133]:
# Call the function to get the DataFrame
final_df = best_four_third_placed_eu(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\third_teams_eu_uefa_women.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: (2017,) ---

=== Initial Standings for Year (2017,) Before Processing Any Goals ===

       team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
Netherlands Group A                      6.0                            2.0                              0.0                    2.0
    Belgium Group A                      3.0                            2.0                              1.0                    1.0
    Denmark Group A                      3.0                            1.0                              1.0                    0.0
     Norway Group A                      0.0                            0.0                              3.0                   -3.0
     Sweden Group B                      3.0                            2.0                              0.0                    2.0
    Germany Group B                      3.0                            2.0                          

In [134]:
# Count the number of observations where the year is 2017
count_2017 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2017].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2017: {count_2017}")

# Count the number of observations where the year is 2022
count_2022 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2022].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2022: {count_2022}")



Number of observations in goals_last_day_sorted for the year 2017: 20
Number of observations in goals_last_day_sorted for the year 2022: 23
