# Libraries

In [1]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [2]:
# Get the current user's name
user = getuser()

# Construct the path using the user's name
function_path = f"C:/Users/{user}/Documents/GitHub/tiebreak_wc/code/wiki/functions_uefa.ipynb"

# Run the notebook
%run $function_path

In [3]:


# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\wc_goals_men.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(data_path)

# Display the first few rows of the DataFrame to verify the import
display(df.head())

Unnamed: 0,stage,year,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,85,0,1,1,0,0,0,0,,31 May 1986
1,Group A,1986,12:00CST,Bulgaria,Italy,1–1,Estadio Azteca,Mexico City,96000,Erik Fredriksson,...,44,0,1,1,0,0,0,0,,31 May 1986
2,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,6,0,3,1,0,0,0,0,,2 June 1986
3,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,46,0,3,1,0,0,0,0,,2 June 1986
4,Group A,1986,12:00CST,Argentina,South Korea,3–1,Estadio Olímpico Universitario,Mexico City,60000,Victoriano Sánchez Arminio,...,18,0,3,1,0,0,0,0,,2 June 1986


# Clean, transfrom, create variables

## time

In [4]:
# Step 1: Clean time variable by removing any letters (e.g., EEST, PST) if they exist
df['time_cleaned'] = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()

# Step 2: Split time where there are parentheses into local time and UTC offset
df['local_time'] = df['time_cleaned'].str.extract(r'(\d{2}:\d{2})')  # extract the part before parentheses
df['utc_time_offset'] = df['time_cleaned'].str.extract(r'\((.*?)\)')  # extract the part inside parentheses


## date

In [5]:
# Step 1: Remove any commas before the year in 'long_date'
df['long_date'] = df['long_date'].str.replace(r',\s*(\d{4})', r' \1', regex=True)

# Step 2: Convert 'long_date' to 'short_date' where 'short_date' is missing
df['short_date'] = df['short_date'].fillna(pd.to_datetime(df['long_date'], errors='coerce').dt.strftime('%Y-%m-%d'))


# Extract relevant columns

In [6]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'short_date','local_time', 'score']]

# Remove duplicates
goals_df = goals_df.drop_duplicates()

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1986,Group A,Bulgaria,Italy,Bulgaria,85,1986-05-31,12:00,1–1
1,1986,Group A,Bulgaria,Italy,Italy,44,1986-05-31,12:00,1–1
2,1986,Group A,Argentina,South Korea,Argentina,6,1986-06-02,12:00,3–1
3,1986,Group A,Argentina,South Korea,Argentina,46,1986-06-02,12:00,3–1
4,1986,Group A,Argentina,South Korea,Argentina,18,1986-06-02,12:00,3–1


# team counts

In [7]:

# Calculate the number of unique teams per stage and year using home_team and away_team
team_counts = (
    df.groupby(['year', 'stage'])
    .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())
)

# Convert the resulting series to a DataFrame
team_counts = team_counts.reset_index()  # Reset index without 'name' argument
team_counts = team_counts.rename(columns={0: 'team_list'})  # Rename the column appropriately

# Convert team list into counts
team_counts['team_count'] = team_counts['team_list'].apply(len)

# Filter stages where the number of teams is not equal to 4
invalid_groups = team_counts[team_counts['team_count'] != 4]

# Display the invalid groups
display(invalid_groups)



  .apply(lambda x: pd.concat([x['home_team'], x['away_team']]).unique())


Unnamed: 0,year,stage,team_list,team_count
6,1986,Third place play-off,"[Belgium, France]",2
7,1990,Quarterfinals,"[Republic of Ireland, Czechoslovakia, Cameroon...",6
9,1990,Third place play-off,"[Italy, England]",2
16,1994,Quarterfinals,"[Italy, Netherlands, Bulgaria, Romania, Spain,...",8
18,1994,Third place playoff,"[Sweden, Bulgaria]",2
27,1998,Third place play-off,"[Netherlands, Croatia]",2
36,2002,Third place play-off,"[South Korea, Turkey]",2
45,2006,Third place play-off,"[Germany, Portugal]",2
54,2010,Third place play-off,"[Uruguay, Germany]",2
63,2014,Third place play-off,"[Brazil, Netherlands]",2


# Recreate Leauge Table after first two matchdays

In [8]:

# Assuming goals_df is your original dataset
agg_goals_before_last_day, goals_last_day_sorted = process_goals_data(goals_df)


In [9]:
home_games, away_games = aggregate_home_away_points(agg_goals_before_last_day)

In [10]:
# Filter for year == 2022 and stage == 'Group E'
home2022e = home_games[(home_games['year'] == 2022) & (home_games['stage'] == 'Group E')]
home2022e

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
171,2022,Group E,Germany,1,2,0,1
172,2022,Group E,Japan,0,1,0,1
173,2022,Group E,Spain,8,1,4,2


In [11]:
# Filter for year == 2022 and stage == 'Group E'
away2022e = away_games[(away_games['year'] == 2022) & (away_games['stage'] == 'Group E')]
away2022e

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
167,2022,Group E,Costa Rica,1,7,3,2
168,2022,Group E,Germany,1,1,1,1
169,2022,Group E,Japan,2,1,3,1


## aggregate data after first two matches 

In [12]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts)

In [13]:
# Filtering all_games_before_last for the year 2022 and stage 'Group E'
group2022e = all_games_before_last[
    (all_games_before_last['year'] == 2022) & 
    (all_games_before_last['stage'] == 'Group E')
]

display(group2022e)

Unnamed: 0,year,stage,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tie_won,standing
296,2022,Group E,Spain,8,1,4,7,2,no need,0,1
297,2022,Group E,Costa Rica,1,7,3,-6,2,Costa Rica,1,2
298,2022,Group E,Japan,2,2,3,0,2,Costa Rica,0,3
299,2022,Group E,Germany,2,3,1,-1,2,no need,0,4


# Recreate league table after last match day

### uefa criteria 

In [14]:
# Initialize an empty list to store the results for each pair
all_results = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[[ 'year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1986, Group A Before Last Match Goals ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
  Argentina             4                   4                     2                      2                        3                          1
   Bulgaria             3                   2                     2                      0                        2                          2
      Italy             3                   2                     2                      0                        2                          3
South Korea             2                   2                     4                     -2                        1                          4


Analyzing goal: 4 minute, Player team: Argentina, Home: Argentina, Away: Bulgaria

=== Updated Standings After This Goal ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_di

In [15]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\wc\tb_wc_uefa_men.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [16]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Get unique pairs of year, stage, and men
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df = track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Display the final DataFrame
display(final_composition_changes_df)




=== Initial Standings for Group A, 1986 (Goal Time = 0) ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
  Argentina             4                   4                     2                      2                          1
   Bulgaria             3                   2                     2                      0                          2
      Italy             3                   2                     2                      0                          3
South Korea             2                   2                     4                     -2                          4



=== Standings after goal at minute 4 in Group A, edition 1986 ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing
  Argentina             5                   5                     2                      3                   1
      Italy             3                   2             

Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,1st,2nd,3rd,changed
0,1986,Group A,0,0,,,,"[Bulgaria, Argentina, Italy]",Argentina,Bulgaria,Italy,0
1,1986,Group A,0,4,Argentina,Bulgaria,Argentina,"[Bulgaria, Argentina, Italy]",Argentina,Italy,Bulgaria,0
2,1986,Group A,0,17,South Korea,Italy,Italy,"[Bulgaria, Argentina, Italy]",Argentina,Italy,Bulgaria,0
3,1986,Group A,0,62,South Korea,Italy,South Korea,"[Bulgaria, Argentina, Italy]",Argentina,Italy,Bulgaria,0
4,1986,Group A,0,73,South Korea,Italy,Italy,"[Bulgaria, Argentina, Italy]",Argentina,Italy,Bulgaria,0
...,...,...,...,...,...,...,...,...,...,...,...,...
556,2022,Group H,2,91,South Korea,Portugal,South Korea,"[Portugal, South Korea]",Portugal,South Korea,Uruguay,1
557,2022,Third place play-off,0,0,,,,"[Croatia, Morocco]",Croatia,Morocco,,0
558,2022,Third place play-off,0,7,Croatia,Morocco,Croatia,"[Croatia, Morocco]",Croatia,Morocco,,0
559,2022,Third place play-off,0,9,Croatia,Morocco,Morocco,"[Croatia, Morocco]",Croatia,Morocco,,0


In [17]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\wc\standings_wc_uefa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [18]:
# Call the function to get the DataFrame
final_df = best_four_third_placed_wc_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\wc\third_teams_wc_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 1986 ---

=== Initial Standings for Year 1986 Before Processing Any Goals ===

            team                stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
       Argentina              Group A                        3                              4                                2                      2
        Bulgaria              Group A                        2                              2                                2                      0
           Italy              Group A                        2                              2                                2                      0
     South Korea              Group A                        1                              2                                4                     -2
          Mexico              Group B                        3                              3                                2                      1

In [19]:
# Count the number of observations where the year is 1986
count_1986 = goals_last_day_sorted[goals_last_day_sorted['year'] == 1986].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 1986: {count_1986}")

# Count the number of observations where the year is 1990
count_1990 = goals_last_day_sorted[goals_last_day_sorted['year'] == 1990].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 1990: {count_1990}")

# Count the number of observations where the year is 1994
count_1994 = goals_last_day_sorted[goals_last_day_sorted['year'] == 1994].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 1994: {count_1994}")


Number of observations in goals_last_day_sorted for the year 1986: 38
Number of observations in goals_last_day_sorted for the year 1990: 11
Number of observations in goals_last_day_sorted for the year 1994: 116
