# Libraries

In [72]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from datetime import datetime, timedelta

# Load and inspect dataset

In [73]:
# Get the current user's name
user = getuser()

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\euro_goals.xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(data_path)

# Display the first few rows of the DataFrame to verify the import
display(df.head())

Unnamed: 0,stage,time,home_team,away_team,score,stadium_name,stadium_city,stadium_attendance,referee_name,referee_nationality,...,goal_minute,extra_time,goals_home,goals_away,own_goal,penalty,goal_minute_et,goal_et,short_date,long_date
0,Group 1,20:30,France,Denmark,1–0,Parc des Princes,Paris,47570,Volker Roth,West Germany,...,78,0,1,0,0,0,0,0,1984-06-12,12 June 1984
1,Group 1,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,Sweden,...,28,0,2,0,0,0,0,0,1984-06-13,13 June 1984
2,Group 1,20:30,Belgium,Yugoslavia,2–0,Stade Félix-Bollaert,Lens,41525,Erik Fredriksson,Sweden,...,45,0,2,0,0,0,0,0,1984-06-13,13 June 1984
3,Group 1,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,Scotland,...,4,0,5,0,0,0,0,0,1984-06-16,16 June 1984
4,Group 1,17:15,France,Belgium,5–0,Stade de la Beaujoire,Nantes,51359,Bob Valentine,Scotland,...,74,0,5,0,0,1,0,0,1984-06-16,16 June 1984


# Clean and transfrom variables

## time

In [74]:
# Step 1: Clean time variable by removing any letters (e.g., EEST, PST) if they exist
df['time_cleaned'] = df['time'].str.replace(r'[A-Za-z]+', '', regex=True).str.strip()

# Step 2: Split time where there are parentheses into local time and UTC offset
df['local_time'] = df['time_cleaned'].str.extract(r'(\d{2}:\d{2})')  # extract the part before parentheses
df['utc_time_offset'] = df['time_cleaned'].str.extract(r'\((.*?)\)')  # extract the part inside parentheses


## date

In [75]:
# Convert 'short_date' column to datetime if it's not already
df['short_date'] = pd.to_datetime(df['short_date'])

# Extract the year from 'short_date' and create a new column 'year'
df['year'] = df['short_date'].dt.year

# Extract relevant columns

In [76]:
# Filter out the knockout stages
df = df[~df['stage'].isin(['Quarter-finals', 'Round of 16', 'Semi-finals', 'Final'])]

# Extract relevant columns for goal events and match results
goals_df = df[['year', 'stage', 'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'short_date','local_time', 'score']]

display(goals_df.head())


Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
0,1984,Group 1,France,Denmark,France,78,1984-06-12,20:30,1–0
1,1984,Group 1,Belgium,Yugoslavia,Belgium,28,1984-06-13,20:30,2–0
2,1984,Group 1,Belgium,Yugoslavia,Belgium,45,1984-06-13,20:30,2–0
3,1984,Group 1,France,Belgium,France,4,1984-06-16,17:15,5–0
4,1984,Group 1,France,Belgium,France,74,1984-06-16,17:15,5–0


# Recreate Leauge Table after first two matchdays

### Filter out the matches from the last match day in the goals_df DataFrame.

In [77]:
# Step 1: Find the last match date for each tournament and group
last_dates = goals_df.groupby(['year', 'stage'])['short_date'].max().reset_index()

# Step 2: Create the dataset with all games excluding the last match day (goals_before_last_day)
# Performing a left merge and keeping only rows not present in the last_dates
goals_before_last_day = goals_df.merge(last_dates, on=['year', 'stage', 'short_date'], how='left', indicator=True)
goals_before_last_day = goals_before_last_day[goals_before_last_day['_merge'] == 'left_only'].drop(columns=['_merge'])

# Step 3: Create the dataset with only the last match day games (goals_last_day)
goals_last_day = goals_df.merge(last_dates, on=['year', 'stage', 'short_date'], how='inner')

# Step 4: Sort the DataFrame by 'goal_minute'
goals_last_day_sorted = goals_last_day.sort_values(by=['short_date','goal_minute'], ascending=True)

# Ensure that goals_last_day_sorted has no duplicates
goals_last_day_sorted = goals_last_day_sorted.drop_duplicates()

In [78]:
goals_last_day_sorted

Unnamed: 0,year,stage,home_team,away_team,scorer_nationality,goal_minute,short_date,local_time,score
8,1984,Group 1,Denmark,Belgium,Belgium,26,1984-06-19,20:30,3–2
3,1984,Group 1,France,Yugoslavia,Yugoslavia,32,1984-06-19,20:30,3–2
9,1984,Group 1,Denmark,Belgium,Belgium,39,1984-06-19,20:30,3–2
5,1984,Group 1,Denmark,Belgium,Denmark,41,1984-06-19,20:30,3–2
0,1984,Group 1,France,Yugoslavia,France,59,1984-06-19,20:30,3–2
...,...,...,...,...,...,...,...,...,...
224,2024,Group E,Slovakia,Romania,Romania,37,2024-06-26,18:00,1–1
228,2024,Group F,Czech Republic,Turkey,Turkey,51,2024-06-26,21:00,1–2
226,2024,Group F,Georgia,Portugal,Georgia,57,2024-06-26,21:00,2–0
227,2024,Group F,Czech Republic,Turkey,Czech Republic,66,2024-06-26,21:00,1–2


## create df  of aggregate data for each match

In [79]:
# Initialize an empty list to store the results
results = []

# Iterate over each match in the dataset
for match_id, group in goals_before_last_day.groupby(['year', 'stage', 'home_team', 'away_team']):
    # Initialize goals_home and goals_away for each match
    goals_home = 0
    goals_away = 0
    
    # Extract the local_time and the score from the first row in the group
    local_time = group.iloc[0]['local_time']
    score = group.iloc[0]['score']  # Assuming score is stored with a longer dash like '2–0'
    
    # Extract short_date from local_time (assuming local_time is a datetime object or string convertible to date)
    short_date = pd.to_datetime(local_time).date()  # Convert local_time to a short date (YYYY-MM-DD)
    
    # Loop through each row in the group to count goals for home and away teams
    for _, row in group.iterrows():
        # Check if the scorer's nationality matches the home team
        if row['home_team'] == row['scorer_nationality']:
            goals_home += 1
        # If the scorer's nationality matches the away team, increment goals_away
        elif row['away_team'] == row['scorer_nationality']:
            goals_away += 1

    # Calculate the expected score from goals_home and goals_away (using a standard hyphen)
    calculated_score = f"{goals_home}-{goals_away}"
    
    # Normalize both scores by replacing any long dash (en-dash or em-dash) with a standard hyphen
    normalized_score = score.replace("–", "-").replace("—", "-")
    
    # Check if the normalized score matches the calculated score
    score_match = normalized_score == calculated_score
    
    # Append the results to the list, including local_time, short_date, and score match flag
    results.append({
        'year': match_id[0],
        'stage': match_id[1],
        'home_team': match_id[2],
        'away_team': match_id[3],
        'local_time': local_time,
        'short_date': short_date,  # New column for the short date
        'goals_home': goals_home,
        'goals_away': goals_away,
        'original_score': score,  # Keep the original score with long dash
        'calculated_score': calculated_score,
        'score_match': score_match  # True if score matches, False otherwise
    })

# Convert the list into a DataFrame
agg_goals_before_last_day = pd.DataFrame(results)

# Add a new column 'won' based on the comparison of goals_home and goals_away
agg_goals_before_last_day['won'] = agg_goals_before_last_day.apply(
    lambda row: 1 if row['goals_home'] > row['goals_away'] else (-1 if row['goals_home'] < row['goals_away'] else 0), 
    axis=1
)

In [80]:
agg_goals_before_last_day

Unnamed: 0,year,stage,home_team,away_team,local_time,short_date,goals_home,goals_away,original_score,calculated_score,score_match,won
0,1984,Group 1,Belgium,Yugoslavia,20:30,2024-10-31,2,0,2–0,2-0,True,1
1,1984,Group 1,Denmark,Yugoslavia,20:30,2024-10-31,5,0,5–0,5-0,True,1
2,1984,Group 1,France,Belgium,17:15,2024-10-31,5,0,5–0,5-0,True,1
3,1984,Group 1,France,Denmark,20:30,2024-10-31,1,0,1–0,1-0,True,1
4,1984,Group 2,Portugal,Spain,20:30,2024-10-31,1,1,1–1,1-1,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...
153,2024,Group E,Slovakia,Ukraine,15:00,2024-10-31,1,2,1–2,1-2,True,-1
154,2024,Group F,Georgia,Czech Republic,15:00,2024-10-31,1,1,1–1,1-1,True,0
155,2024,Group F,Portugal,Czech Republic,21:00,2024-10-31,2,1,2–1,2-1,True,1
156,2024,Group F,Turkey,Georgia,18:00,2024-10-31,3,1,3–1,3-1,True,1


## aggregate data for home and away games

In [81]:
def calculate_points(results, years, win_result):
    points = []
    for result, year in zip(results, years):
        if result == win_result:  # Win condition (1 for home, -1 for away)
            points.append(2 if year <= 1992 else 3)
        elif result == 0:  # Draw condition
            points.append(1)
        else:  # Loss condition
            points.append(0)
    return sum(points)

In [82]:
# Step 1: Home games aggregation with match count and conditional points based on year
home_games = agg_goals_before_last_day.groupby(['year', 'stage', 'home_team']).agg(
    goals_scored=('goals_home', 'sum'),
    goals_conceded=('goals_away', 'sum'),
    points_home=('won', lambda x: calculate_points(x, agg_goals_before_last_day.loc[x.index, 'year'], 1)),
    match_count_home=('home_team', 'count')
).reset_index()


In [83]:
display(home_games.head())

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
0,1984,Group 1,Belgium,2,0,2,1
1,1984,Group 1,Denmark,5,0,2,1
2,1984,Group 1,France,6,0,4,2
3,1984,Group 2,Portugal,1,1,1,1
4,1984,Group 2,Romania,1,1,1,1


In [84]:
# Filter for year == 2021 and stage == 'Group F'
group_f_2021 = home_games[(home_games['year'] == 2021) & (home_games['stage'] == 'Group F')]
group_f_2021

Unnamed: 0,year,stage,home_team,goals_scored,goals_conceded,points_home,match_count_home
115,2021,Group F,France,1,0,3,1
116,2021,Group F,Hungary,1,4,1,2
117,2021,Group F,Portugal,2,4,0,1


In [85]:
# Step 2: Away games aggregation with match count and conditional points based on year
away_games = agg_goals_before_last_day.groupby(['year', 'stage', 'away_team']).agg(
    goals_scored=('goals_away', 'sum'),
    goals_conceded=('goals_home', 'sum'),
    points_away=('won', lambda x: calculate_points(x, agg_goals_before_last_day.loc[x.index, 'year'], -1)),
    match_count_away=('away_team', 'count')
).reset_index()

In [86]:
display(away_games.head())

Unnamed: 0,year,stage,away_team,goals_scored,goals_conceded,points_away,match_count_away
0,1984,Group 1,Belgium,0,5,0,1
1,1984,Group 1,Denmark,0,1,0,1
2,1984,Group 1,Yugoslavia,0,7,0,2
3,1984,Group 2,Romania,1,2,0,1
4,1984,Group 2,Spain,2,2,2,2


## aggregate data after first two matches following 

### UEFA criteria (first h2h, then aggregate)

In [87]:
def uefa_before_last(home_games, away_games, agg_goals_before_last_day):
    # Step 1: Merge home_games and away_games on year, stage, and home_team with away_team
    all_games_before_last = pd.merge(
        home_games,
        away_games,
        left_on=['year', 'stage', 'home_team'],
        right_on=['year', 'stage', 'away_team'],
        how='outer',
        suffixes=('_home', '_away')
    )

    # Step 2: For teams that only appear in home_games, assign home_team to team and copy relevant columns
    all_games_before_last['team'] = all_games_before_last['home_team'].fillna(all_games_before_last['away_team'])

    # Step 3: For goals_scored, goals_conceded, and points, handle missing values
    all_games_before_last['goals_scored'] = all_games_before_last['goals_scored_home'].fillna(0) + all_games_before_last['goals_scored_away'].fillna(0)
    all_games_before_last['goals_conceded'] = all_games_before_last['goals_conceded_home'].fillna(0) + all_games_before_last['goals_conceded_away'].fillna(0)
    all_games_before_last['points'] = all_games_before_last['points_home'].fillna(0) + all_games_before_last['points_away'].fillna(0)

    # Step 4: Add goals_difference column
    all_games_before_last['goals_difference'] = all_games_before_last['goals_scored'] - all_games_before_last['goals_conceded']

    # Step 5: Sum match_count_home and match_count_away to get total match count for each team
    all_games_before_last['total_matches'] = all_games_before_last['match_count_home'].fillna(0) + all_games_before_last['match_count_away'].fillna(0)

    # Step 6: Adjust points if total_matches is 1 (indicating a likely 0-0 draw)
    all_games_before_last.loc[all_games_before_last['total_matches'] == 1, 'points'] += 1

    # Step 7: Drop unnecessary columns used in the merge process
    all_games_before_last = all_games_before_last[['year', 'stage', 'team', 'goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']]

    # Step 8: Initial sort by year, stage, and points
    all_games_before_last = all_games_before_last.sort_values(
        by=['year', 'stage', 'points'],
        ascending=[True, True, False]
    ).reset_index(drop=True)

    # Step 9: Initialize a new column for tie-break results, which will store either the tie-break result or 'no need'
    all_games_before_last['tiebreaker'] = 'no need'

    # Step 10: Define the function to check the tie-breaker
    def check_tiebreaker(row1, row2, agg_data):
        """
        Check the tie-breaker based on the head-to-head match result from agg_goals_before_last_day.
        Return the team that won (if any) or 'tie'.
        """
        match = agg_data[((agg_data['home_team'] == row1['team']) & (agg_data['away_team'] == row2['team'])) |
                         ((agg_data['home_team'] == row2['team']) & (agg_data['away_team'] == row1['team']))]

        if not match.empty:
            if match.iloc[0]['won'] == 1:
                return row1['team']  # Home team won
            elif match.iloc[0]['won'] == -1:
                return row2['team']  # Away team won
            else:
                return 'tie'  # It's a draw
        else:
            return 'no result'  # No match found

    # Step 11: Apply the tie-breaker and if still tied, check goals_difference and goals_scored
    for i in range(len(all_games_before_last) - 1):
        row1 = all_games_before_last.iloc[i]
        row2 = all_games_before_last.iloc[i + 1]

        # Get the year and stage for the current comparison
        year = row1['year']
        stage = row1['stage']

        # Check if the two rows have identical values for points
        if row1['points'] == row2['points']:
            print(f"Tie in {year}, Stage: {stage}, between {row1['team']} and {row2['team']} (Points: {row1['points']})")

            # Apply the tie-breaker by checking the head-to-head match result
            tiebreak_result = check_tiebreaker(row1, row2, agg_goals_before_last_day)

            if tiebreak_result != 'tie' and tiebreak_result != 'no result':
                print(f"Tie resolved by head-to-head: {tiebreak_result} won in {year}, Stage: {stage}")
                all_games_before_last.at[i, 'tiebreaker'] = tiebreak_result
                all_games_before_last.at[i + 1, 'tiebreaker'] = tiebreak_result
            else:
                # If still tied after the tiebreaker, check goals_difference and goals_scored
                if row1['goals_difference'] > row2['goals_difference']:
                    print(f"Tie resolved by goals difference: {row1['team']} favored in {year}, Stage: {stage}")
                    all_games_before_last.at[i, 'tiebreaker'] = row1['team']
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row1['team']
                elif row1['goals_difference'] < row2['goals_difference']:
                    print(f"Tie resolved by goals difference: {row2['team']} favored in {year}, Stage: {stage}")
                    all_games_before_last.at[i, 'tiebreaker'] = row2['team']
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row2['team']
                elif row1['goals_scored'] > row2['goals_scored']:
                    print(f"Tie resolved by goals scored: {row1['team']} favored in {year}, Stage: {stage}")
                    all_games_before_last.at[i, 'tiebreaker'] = row1['team']
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row1['team']
                else:
                    print(f"Tie resolved by goals scored: {row2['team']} favored in {year}, Stage: {stage}")
                    all_games_before_last.at[i, 'tiebreaker'] = row2['team']
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row2['team']

    # Step 12: Add group standing by ranking teams within each stage based on points, goals_difference, and goals_scored
    all_games_before_last['standing'] = all_games_before_last.groupby(['year', 'stage']).apply(
        lambda x: x.sort_values(by=['points', 'goals_difference', 'goals_scored'], ascending=[False, False, False])
    ).reset_index(drop=True).groupby(['year', 'stage']).cumcount() + 1

    # Convert goals_scored, goals_conceded, points, goals_difference, and total_matches to integers
    all_games_before_last[['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']] = all_games_before_last[
        ['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']].astype(int)

    return all_games_before_last


In [88]:
# Applying the fifa_before_last function to the mock data
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

Tie in 1984, Stage: Group 1, between Belgium and Denmark (Points: 2.0)
Tie resolved by head-to-head: Denmark won in 1984, Stage: Group 1
Tie in 1984, Stage: Group 2, between Portugal and Spain (Points: 2.0)
Tie resolved by goals scored: Spain favored in 1984, Stage: Group 2
Tie in 1988, Stage: Group 1, between Italy and West Germany (Points: 3.0)
Tie resolved by goals difference: West Germany favored in 1988, Stage: Group 1
Tie in 1988, Stage: Group 2, between Republic of Ireland and Soviet Union (Points: 3.0)
Tie resolved by goals scored: Soviet Union favored in 1988, Stage: Group 2
Tie in 1992, Stage: Group 2, between Germany and Netherlands (Points: 3.0)
Tie resolved by goals difference: Germany favored in 1992, Stage: Group 2
Tie in 1996, Stage: Group A, between England and Netherlands (Points: 4.0)
Tie resolved by head-to-head: Netherlands won in 1996, Stage: Group A
Tie in 1996, Stage: Group A, between Scotland and Switzerland (Points: 1.0)
Tie resolved by goals scored: Switzerla

  all_games_before_last['standing'] = all_games_before_last.groupby(['year', 'stage']).apply(


# Recreate league table after last match day

### uefa criteria 

In [89]:
def uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) & 
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) & 
        (goals_last_day_sorted['stage'] == stage)
    ]

    # Step 2: Initialize columns for tracking team performance
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']

    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['last_game_standing'] = 0
    group_goals_tracking['1st'] = 0
    group_goals_tracking['2nd'] = 0
    group_goals_tracking['3rd'] = 0
    group_goals_tracking['4th'] = 0
    group_goals_tracking['changes'] = 0  # Initialize this but will be redefined later as the sum of 1st, 2nd, 3rd, 4th
    group_goals_tracking['tied'] = False  # Initialize a flag to track tied teams
    group_goals_tracking['tied_won'] = 0  # Initialize to track if the team won a tie-breaker match

    # Step 3: Sort group_goals_last_day by goal_minute in ascending order
    group_goals_last_day = group_goals_last_day.sort_values(by='goal_minute', ascending=True)

    # Print the year, stage, and standings before starting the loop for last match goals
    print(f"\n=== Initial Standings for Year {year}, {stage} Before Last Match Goals ===\n")
    display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 
                       'total_goal_difference', 'before_last_game_points', 'before_last_game_standing']
    print(group_goals_tracking[display_columns].to_string(index=False))
    print("\n====================================================\n")

    # Step 4: Iterate through the sorted and filtered last match goals and update the goals_tracking table
    previous_standings = group_goals_tracking['last_game_standing'].copy()
    first_iteration = True  # Variable to track the first iteration

    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        player_team = goal['scorer_nationality']

        # Print goal information for each goal
        print(f"Analyzing goal: {goal['goal_minute']} minute, Player team: {player_team}, Home: {home_team}, Away: {away_team}")

        # Update the goals based on who scored the goal
        if player_team == home_team:
            # Home team scored, update home scored and away conceded
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
        elif player_team == away_team:
            # Away team scored, update away scored and home conceded
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1

        # Step 5: Update total_goals_scored, total_goals_conceded, and total_goal_difference
        group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored'] + group_goals_tracking['last_game_goals_scored']
        group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded'] + group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Step 6: Assign points for the last game dynamically after each goal
        for i, row in group_goals_tracking.iterrows():
            if row['last_game_goals_scored'] > row['last_game_goals_conceded']:
                # Check if the year is <= 1992, award 2 points for a win, else award 3 points
                if row['year'] <= 1992:
                    group_goals_tracking.loc[i, 'last_game_points'] = 2  # Win before or during 1992
                else:
                    group_goals_tracking.loc[i, 'last_game_points'] = 3  # Win after 1992
            elif row['last_game_goals_scored'] == row['last_game_goals_conceded']:
                group_goals_tracking.loc[i, 'last_game_points'] = 1  # Draw
            else:
                group_goals_tracking.loc[i, 'last_game_points'] = 0  # Loss

        # Step 7: Update total points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Step 8: Mark teams that are tied
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)

        # Reset `tied_won` to 0 for all teams
        group_goals_tracking['tied_won'] = 0

        # Step 8b: Calculate tied_won only for tied teams
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        if not tied_teams.empty:
            # Iterate over tied teams to resolve standings using head-to-head results
            for index, row in tied_teams.iterrows():
                team1 = row['team']
                team1_index = index

                # Look for other teams tied with this team
                for other_index, other_row in tied_teams[tied_teams.index != index].iterrows():
                    team2 = other_row['team']
                    team2_index = other_index

                    # Check if these two teams played against each other in agg_goals_before_last_day
                    match = agg_goals_before_last_day[
                        ((agg_goals_before_last_day['home_team'] == team1) & (agg_goals_before_last_day['away_team'] == team2)) |
                        ((agg_goals_before_last_day['home_team'] == team2) & (agg_goals_before_last_day['away_team'] == team1))
                    ]

                    if not match.empty:
                        match_result = match.iloc[0]['won']  # Assume the 'won' column holds 1 for home win, -1 for away win, 0 for draw

                        # Resolve the tie using the match result
                        if match_result == 1:
                            # Home team won
                            if match.iloc[0]['home_team'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == -1:
                            # Away team won
                            if match.iloc[0]['away_team'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == 0:
                            # Draw, both teams get the same standing
                            group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 0
                            group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 0

        # Step 8c: Sort teams by total points, tied_won, goal difference, and goals scored
        group_goals_tracking = group_goals_tracking.sort_values(by=['total_points', 'tied_won', 'total_goal_difference', 'total_goals_scored'],
                                                                ascending=[False, False, False, False])

        # Step 9: Assign standings based on the sorting and tie resolution
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Step 10: Track changes and update standing positions after each goal is processed
        for i, row in group_goals_tracking.iterrows():
            team = row['team']

            if first_iteration:
                # Skip the update if the standing did not change
                if row['before_last_game_standing'] == row['last_game_standing']:
                    continue  # Skip updating the counters
                else:
                    # Update the position counters since standing has changed
                    if row['last_game_standing'] == 1:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '1st'] += 1
                    elif row['last_game_standing'] == 2:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '2nd'] += 1
                    elif row['last_game_standing'] == 3:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '3rd'] += 1
                    elif row['last_game_standing'] == 4:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '4th'] += 1

                # Disable first iteration flag after the first goal
                first_iteration = False
            else:
                # Track the standing position counters only if the position differs from the previous state
                if row['last_game_standing'] != previous_standings[i]:  
                    if row['last_game_standing'] == 1:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '1st'] += 1
                    elif row['last_game_standing'] == 2:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '2nd'] += 1
                    elif row['last_game_standing'] == 3:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '3rd'] += 1
                    elif row['last_game_standing'] == 4:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '4th'] += 1

        # Update previous standings after each goal
        previous_standings = group_goals_tracking['last_game_standing'].copy()

        # Step 11: Calculate changes as the sum of 1st, 2nd, 3rd, and 4th
        group_goals_tracking['changes'] = group_goals_tracking[['1st', '2nd', '3rd', '4th']].sum(axis=1)

        # Step 12: Print the updated group_goals_tracking after processing each goal
        print("\n=== Updated Standings After This Goal ===\n")
        display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 
                           'total_goal_difference', 'last_game_points', 'last_game_standing', 
                           'changes', '1st', '2nd', '3rd', '4th', 'tied', 'tied_won']
        print(group_goals_tracking[display_columns].to_string(index=False))
        print("\n========================================\n")

    # Step 13: Return the final DataFrame
    return group_goals_tracking


In [90]:
# Initialize an empty list to store the results for each pair
all_results = []

# Get unique pairs of year and stage
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    result = uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)

    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_euro = pd.concat(all_results)

# Keep only the specified columns
changes_df_euro = changes_df_euro[['year', 'stage', 'team', '1st', '2nd', '3rd', '4th', 'changes']]



=== Initial Standings for Year 1984, Group 1 Before Last Match Goals ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
    France             4                   6                     0                      6                        4                          1
   Belgium             2                   2                     5                     -3                        2                          2
   Denmark             2                   5                     1                      4                        2                          3
Yugoslavia             0                   0                     7                     -7                        0                          4


Analyzing goal: 26 minute, Player team: Belgium, Home: Denmark, Away: Belgium

=== Updated Standings After This Goal ===

      team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  

In [91]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\tb_euro_uefa.xlsx'
changes_df_euro.to_excel(file_path, index=False)


# group composition tracking

In [92]:
def track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) & 
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) & 
        (goals_last_day_sorted['stage'] == stage)
    ]

    # Initialize columns for team performance and standings
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['last_game_standing'] = 0
    group_goals_tracking['tied_won'] = 0  # Initialize tied_won for tiebreak resolution

    # Define top standings limit based on the year
    top_standings_limit = 3 if year <= 1994 else 2

    # Determine the initial standings for 1st, 2nd, 3rd, and third-placed teams
    sorted_initial = group_goals_tracking.sort_values(by=['before_last_game_points', 'total_goal_difference', 'before_last_game_goals_scored'], ascending=[False, False, False])
    first_place_team = sorted_initial.iloc[0]['team'] if len(sorted_initial) > 0 else None
    second_place_team = sorted_initial.iloc[1]['team'] if len(sorted_initial) > 1 else None
    third_place_team = sorted_initial.iloc[2]['team'] if len(sorted_initial) > 2 else None

    # Identify third-place teams by group
    third_place_teams = []
    for group, group_data in group_goals_tracking.groupby('stage'):
        sorted_group = group_data.sort_values(by=['before_last_game_points', 'total_goal_difference', 'before_last_game_goals_scored'], ascending=[False, False, False])
        if len(sorted_group) >= 3:
            third_place_teams.append(sorted_group.iloc[2]['team'])

    # Determine if the third-placed team is among the top 4 third-placed teams initially
    third_place_df = group_goals_tracking[group_goals_tracking['team'].isin(third_place_teams)]
    top4_third_place_teams = third_place_df.sort_values(
        by=['before_last_game_points', 'total_goal_difference', 'before_last_game_goals_scored'], 
        ascending=[False, False, False]
    ).head(4)['team'].tolist()
    is_top4_third_place_initial = 1 if third_place_team in top4_third_place_teams else 0

    # Step 2: Initialize composition tracking with initial composition (change_num = 0)
    initial_top_teams = set(sorted_initial.nsmallest(top_standings_limit, 'before_last_game_standing')['team'])
    composition_changes = [{
        'year': year,
        'stage': stage,
        'change_num': 0,
        'goal_time': 'initial',
        'home_team': None,
        'away_team': None,
        'scorer_team': None,
        'new_top_teams': list(initial_top_teams),
        'third_place_teams_list': third_place_teams,
        'top4_third_place': is_top4_third_place_initial,
        '1st': first_place_team,
        '2nd': second_place_team,
        '3rd': third_place_team
    }]
    change_counter = 0  # Counter for the number of composition changes

    # Step 3: Sort goals by regulation time
    group_goals_last_day = group_goals_last_day.sort_values(by=['goal_minute'])

    # Step 4: Iterate through each goal and track changes in composition
    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        scorer_team = goal['scorer_nationality']

        # Update scores based on who scored the goal
        if scorer_team == home_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
        elif scorer_team == away_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1

        # Update total goals, goal difference, and total points
        group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored'] + group_goals_tracking['last_game_goals_scored']
        group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded'] + group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Step 5: Calculate tied_won for tied teams based on head-to-head results
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)
        group_goals_tracking['tied_won'] = 0  # Reset tied_won for all teams

        tied_teams = group_goals_tracking[group_goals_tracking['tied']]
        if not tied_teams.empty:
            for index, row in tied_teams.iterrows():
                team1 = row['team']
                for _, other_row in tied_teams[tied_teams.index != index].iterrows():
                    team2 = other_row['team']
                    match = agg_goals_before_last_day[
                        ((agg_goals_before_last_day['home_team'] == team1) & (agg_goals_before_last_day['away_team'] == team2)) |
                        ((agg_goals_before_last_day['home_team'] == team2) & (agg_goals_before_last_day['away_team'] == team1))
                    ]

                    if not match.empty:
                        match_result = match.iloc[0]['won']
                        if match_result == 1:
                            if match.iloc[0]['home_team'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == -1:
                            if match.iloc[0]['away_team'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1

        # Calculate points and update standings as needed
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']
        group_goals_tracking = group_goals_tracking.sort_values(by=['total_points', 'tied_won', 'total_goal_difference', 'total_goals_scored'], ascending=[False, False, False, False])
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Track 1st, 2nd, and 3rd teams
        top_teams = group_goals_tracking[group_goals_tracking['last_game_standing'] <= 3]
        first_place_team = top_teams.iloc[0]['team'] if len(top_teams) > 0 else None
        second_place_team = top_teams.iloc[1]['team'] if len(top_teams) > 1 else None
        third_place_team = top_teams.iloc[2]['team'] if len(top_teams) > 2 else None

        # Determine changes
        current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
        if current_top_teams != initial_top_teams:
            change_counter += 1
            initial_top_teams = current_top_teams
            composition_changes.append({
                'year': year,
                'stage': stage,
                'change_num': change_counter,
                'goal_time': goal['goal_minute'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_nationality': scorer_team,
                'new_top_teams': list(current_top_teams),
                'top4_third_place': 1 if third_place_team in top4_third_place_teams else 0,
                '1st': first_place_team,
                '2nd': second_place_team,
                '3rd': third_place_team
            })

    return pd.DataFrame(composition_changes)


In [93]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Get unique pairs of year and stage
unique_pairs = all_games_before_last[['year', 'stage']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    stage = row['stage']
    
    # Apply the function to the current pair
    composition_changes_df = track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Display the final DataFrame
display(final_composition_changes_df)


Unnamed: 0,year,stage,change_num,goal_time,home_team,away_team,scorer_team,new_top_teams,third_place_teams_list,top4_third_place,1st,2nd,3rd,scorer_nationality
0,1984,Group 1,0,initial,,,,"[France, Belgium, Denmark]",[Belgium],1,France,Denmark,Belgium,
1,1984,Group 2,0,initial,,,,"[Spain, West Germany, Portugal]",[Portugal],1,West Germany,Spain,Portugal,
2,1988,Group 1,0,initial,,,,"[Italy, West Germany, Spain]",[Spain],1,West Germany,Italy,Spain,
3,1988,Group 2,0,initial,,,,"[Netherlands, Soviet Union, Republic of Ireland]",[Netherlands],1,Republic of Ireland,Soviet Union,Netherlands,
4,1992,Group 1,0,initial,,,,"[Sweden, France, Denmark]",[Denmark],1,Sweden,France,Denmark,
5,1992,Group 2,0,initial,,,,"[Netherlands, CIS, Germany]",[CIS],1,Germany,Netherlands,CIS,
6,1996,Group A,0,initial,,,,"[Netherlands, England]",[Switzerland],1,England,Netherlands,Switzerland,
7,1996,Group B,0,initial,,,,"[France, Bulgaria]",[Spain],1,Bulgaria,France,Spain,
8,1996,Group C,0,initial,,,,"[Czech Republic, Germany]",[Czech Republic],1,Germany,Italy,Czech Republic,
9,1996,Group D,0,initial,,,,"[Croatia, Portugal]",[Denmark],1,Croatia,Portugal,Denmark,


In [94]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\standings_euro_eufa.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [95]:
def best_four_third_placed(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    # Apply filter to process only years <= 2024
    all_games_before_last = all_games_before_last[all_games_before_last['year'] >= 2016].copy()

    # Initialize a list to store data for each year that will be appended into the final DataFrame
    all_years_data = []

    # Group data by year and process year-by-year
    for year, year_data in all_games_before_last.groupby('year'):
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Initialize dictionaries to track third-placed teams and top 4 counts for the current year
        third_teams_count = {}
        top4_count = {}

        # Step 1: Initialize columns for tracking team performance before and after goals
        year_data.loc[:, 'before_last_game_goals_scored'] = year_data['goals_scored']
        year_data.loc[:, 'before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data.loc[:, 'before_last_game_points'] = year_data['points']

        year_data.loc[:, 'last_game_goals_scored'] = 0
        year_data.loc[:, 'last_game_goals_conceded'] = 0
        year_data.loc[:, 'last_game_points'] = 0
        year_data.loc[:, 'total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
        year_data.loc[:, 'total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
        year_data.loc[:, 'total_goal_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

        
        # Print the standings before any goal is processed
        print(f"\n=== Initial Standings for Year {year} Before Processing Any Goals ===\n")
        display_columns = ['team', 'stage', 'before_last_game_points', 'before_last_game_goals_scored', 'before_last_game_goals_conceded', 'total_goal_difference']
        print(year_data[display_columns].sort_values(by=['stage', 'before_last_game_points', 'total_goal_difference', 'before_last_game_goals_scored'], ascending=[True, False, False, False]).to_string(index=False))
        print("\n===============================================================\n")

        # Process each goal from the sorted last-day goals for the current year
        goals_last_day_year = goals_last_day_sorted[goals_last_day_sorted['year'] == year]

        for i, goal in goals_last_day_year.iterrows():
            print(f"\n--- Analyzing Goal {i + 1} in {goal['stage']}: On date {goal['short_date']}, Minute {goal['goal_minute']} (Player Team: {goal['scorer_nationality']}, Home: {goal['home_team']}, Away: {goal['away_team']}) ---")

            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Identify the opponent team
            opponent_team = home_team if player_team == away_team else away_team

            year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
            year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

            # Update total goals scored, goals conceded, and goal difference
            year_data.loc[:, 'total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
            year_data.loc[:, 'total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
            year_data.loc[:, 'total_goal_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

            # Assign points for the last game dynamically
            year_data.loc[:, 'last_game_points'] = (year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']).astype(int) * 3 + \
                                                   (year_data['last_game_goals_scored'] == year_data['last_game_goals_conceded']).astype(int)

            # Update total points
            year_data.loc[:, 'total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Reset third-placed teams after every goal
            third_teams_per_year = []

            # Recalculate standings after the goal
            for group, group_data in year_data.groupby('stage'):
                sorted_standings = group_data.sort_values(by=['total_points', 'total_goal_difference', 'total_goals_scored'], ascending=[False, False, False])

                # Identify the third-placed teams after each goal, **by group**
                if len(sorted_standings) >= 3:
                    third_placed_team = sorted_standings.iloc[2]
                    third_teams_per_year.append(third_placed_team['team'])

            # Update the count for third-placed teams after each goal
            for team in third_teams_per_year:
                if team not in third_teams_count:
                    third_teams_count[team] = 0
                third_teams_count[team] += 1

            # Create DataFrame of third-placed teams with additional stats
            third_teams_df = year_data[year_data['team'].isin(third_teams_per_year)][
                ['team', 'total_points', 'last_game_points', 'total_goal_difference', 'total_goals_scored']
            ]

            # Identify the best four third-placed teams after each goal
            top4_teams = third_teams_df['team'].value_counts().nlargest(4).index.tolist()

            # Print third-placed teams' additional statistics after each goal
            print(f"Top 4 third-placed teams after this goal: {top4_teams}")
            print(f"Third-placed teams DataFrame after this goal:\n{third_teams_df}\n")

            # Update the count for top 4 third-placed teams after each goal
            for team in top4_teams:
                if team not in top4_count:
                    top4_count[team] = 0
                top4_count[team] += 1

        # After processing all goals for the year, store the results in the final list
        for team, third_count in third_teams_count.items():
            top4_count_for_team = top4_count.get(team, 0)
            all_years_data.append({
                'team': team,
                'year': year,
                'third_place_count': third_count,
                'top4_third_place_count': top4_count_for_team
            })

    # Create a DataFrame from the list of all years' data
    final_df = pd.DataFrame(all_years_data)

    file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\third_teams_euro.xlsx'
    final_df.to_excel(file_path, index=False)
    
    return final_df


In [96]:
best_four_third_placed(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)


--- Processing Year: 2016 ---

=== Initial Standings for Year 2016 Before Processing Any Goals ===

               team   stage  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
             France Group A                        6                              4                                1                      3
        Switzerland Group A                        4                              2                                1                      1
            Romania Group A                        1                              2                                3                     -1
            Albania Group A                        0                              0                                3                     -3
            England Group B                        4                              3                                2                      1
           Slovakia Group B                        3       

Unnamed: 0,team,year,third_place_count,top4_third_place_count
0,Albania,2016,22,22
1,Wales,2016,1,1
2,Northern Ireland,2016,22,22
3,Czech Republic,2016,5,5
4,Sweden,2016,19,0
5,Portugal,2016,19,0
6,Slovakia,2016,21,21
7,Turkey,2016,17,17
8,Iceland,2016,3,0
9,Republic of Ireland,2016,3,0


In [97]:
# Count the number of observations where the year is 2016
count_2016 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2016].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2016: {count_2016}")

# Count the number of observations where the year is 2021
count_2020 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2021].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2020: {count_2020}")

# Count the number of observations where the year is 2024
count_2024 = goals_last_day_sorted[goals_last_day_sorted['year'] == 2024].shape[0]
print(f"Number of observations in goals_last_day_sorted for the year 2024: {count_2024}")


Number of observations in goals_last_day_sorted for the year 2016: 22
Number of observations in goals_last_day_sorted for the year 2020: 38
Number of observations in goals_last_day_sorted for the year 2024: 24
