# Libraries

In [62]:
import pandas as pd
from getpass import getuser
from collections import defaultdict

# Load and inspect dataset

In [63]:
# Get the current user's name
user = getuser()

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\fifa.csv'

# Read the dataset with a different encoding
df = pd.read_csv(data_path, encoding='ISO-8859-1')

# Extract relevant columns

In [64]:
# First, let's extract the year from the 'tournament_id' column and create a new column 'year'
df['year'] = df['tournament_id'].str.extract(r'WC-(\d{4})').astype(int)

# Now, filter the data for tournaments after 1986
filtered_df = df[df['year'] > 1984]

# Extract relevant columns for goal events and match results
goals_df = filtered_df[['year', 'group_name','match_name', 'match_id', 'player_team_name','match_date', 'minute_regulation', 'minute_stoppage', 
               'team_id', 'own_goal']]

# Remove all observations where group_name is "not applicable"
goals_df = goals_df[goals_df['group_name'] != 'not applicable']

# Convert 'match_date' to datetime format
goals_df['match_date'] = pd.to_datetime(goals_df['match_date'], format='%m/%d/%Y')

# Sort the dataset by 'match_date' in ascending order (oldest first) and 'minute_regulation'
goals_df = goals_df.sort_values(by=['match_date', 'minute_regulation'], ascending=[True, True])

# Display the first few rows to confirm the sorting
goals_df.head()


Unnamed: 0,year,group_name,match_name,match_id,player_team_name,match_date,minute_regulation,minute_stoppage,team_id,own_goal
1196,1986,Group A,Bulgaria v Italy,M-1986-01,Italy,1986-05-31,44,0,T-39,0
1197,1986,Group A,Bulgaria v Italy,M-1986-01,Bulgaria,1986-05-31,85,0,T-10,0
1198,1986,Group D,Spain v Brazil,M-1986-02,Brazil,1986-06-01,62,0,T-09,0
1199,1986,Group C,Canada v France,M-1986-03,France,1986-06-01,79,0,T-28,0
1204,1986,Group C,Soviet Union v Hungary,M-1986-05,Soviet Union,1986-06-02,2,0,T-70,0


# Recreate Leauge Table after first two matchdays

### Filter out the matches from the last match day in the goals_df DataFrame.

In [65]:
# Step 1: Find the last match date for each tournament and group
last_dates = goals_df.groupby(['year', 'group_name'])['match_date'].max().reset_index()

# Step 2: Create the dataset with all games excluding the last match day (goals_before_last_day)
goals_before_last_day = goals_df.merge(last_dates, on=['year', 'group_name', 'match_date'], how='outer', indicator=True)
goals_before_last_day = goals_before_last_day[goals_before_last_day['_merge'] == 'left_only'].drop(columns=['_merge'])

# Step 3: Create the dataset with only the last match day games (goals_last_day)
goals_last_day = goals_df.merge(last_dates, on=['year', 'group_name', 'match_date'])

# Step 4: Split match_name into home and away teams and remove match_name column for both datasets

# For goals_before_last_day
goals_before_last_day[['home', 'away']] = goals_before_last_day['match_name'].str.split(' v ', expand=True)
goals_before_last_day = goals_before_last_day.drop(columns=['match_name'])

# For goals_last_day
goals_last_day[['home', 'away']] = goals_last_day['match_name'].str.split(' v ', expand=True)
goals_last_day = goals_last_day.drop(columns=['match_name'])

# Sort the DataFrame by 'minute_regulation' and 'minute_stoppage'
goals_last_day_sorted = goals_last_day.sort_values(by=['match_date','minute_regulation', 'minute_stoppage'], ascending=[True, True, True])

## create df  of aggregate data for each match

In [66]:
# Initialize an empty list to store the results
results = []

# Iterate over each match in the dataset
for match_id, group in goals_before_last_day.groupby(['year', 'group_name', 'match_id', 'home', 'away']):
    # Initialize goals_home and goals_away for each match
    goals_home = 0
    goals_away = 0
    
    # Loop through each row in the group
    for _, row in group.iterrows():
        # Check if the team is the home team and no own goal
        if row['home'] == row['player_team_name'] and row['own_goal'] == 0:
            goals_home += 1
        # If the team is the away team and no own goal, increment goals_away
        elif row['away'] == row['player_team_name'] and row['own_goal'] == 0:
            goals_away += 1

    # Append the results to the list, keeping home and away columns
    results.append({
        'year': match_id[0],
        'group_name': match_id[1],
        'match_id': match_id[2],
        'home': match_id[3],
        'away': match_id[4],
        'goals_home': goals_home,
        'goals_away': goals_away
    })

# Convert the list into a DataFrame
agg_goals_before_last_day = pd.DataFrame(results)

# Add a new column 'won' based on the comparison of goals_home and goals_away
agg_goals_before_last_day['won'] = agg_goals_before_last_day.apply(
    lambda row: 1 if row['goals_home'] > row['goals_away'] else (-1 if row['goals_home'] < row['goals_away'] else 0), 
    axis=1
)



In [67]:
spain2022 = agg_goals_before_last_day[(agg_goals_before_last_day['home'] == 'Spain') & (agg_goals_before_last_day['year'] == 2022)]
spain2022

Unnamed: 0,year,group_name,match_id,home,away,goals_home,goals_away,won
258,2022,Group E,M-2022-11,Spain,Costa Rica,7,0,1
260,2022,Group E,M-2022-28,Spain,Germany,1,1,0


## aggregate data for home and away games

In [68]:
# Remove duplicates to ensure each match is counted only once per home-away combination
unique_matches = agg_goals_before_last_day.drop_duplicates(subset=['year', 'group_name', 'home', 'away'])

# Define points calculation directly within the groupby to ensure correct year alignment
home_games = unique_matches.groupby(['year', 'group_name', 'home']).apply(
    lambda df: pd.Series({
        'goals_scored': df['goals_home'].sum(),
        'goals_conceded': df['goals_away'].sum(),
        'points_home': sum(3 if (result == 1 and year > 1994) else 
                           (2 if (result == 1 and year <= 1994) else 
                            (1 if result == 0 else 0)) 
                           for result, year in zip(df['won'], df['year'])),
        'match_count_home': df['match_id'].count()
    })
).reset_index()

# Group away games and calculate stats similarly
away_games = unique_matches.groupby(['year', 'group_name', 'away']).apply(
    lambda df: pd.Series({
        'goals_scored': df['goals_away'].sum(),
        'goals_conceded': df['goals_home'].sum(),
        'points_away': sum(3 if (result == -1 and year > 1994) else 
                           (2 if (result == -1 and year <= 1994) else 
                            (1 if result == 0 else 0)) 
                           for result, year in zip(df['won'], df['year'])),
        'match_count_away': df['match_id'].count()
    })
).reset_index()


In [69]:
# Filter for year == 2022 and group_name == 'Group E'
home2022e = home_games[(home_games['year'] == 2022) & (home_games['group_name'] == 'Group E')]
home2022e

Unnamed: 0,year,group_name,home,goals_scored,goals_conceded,points_home,match_count_home
202,2022,Group E,Germany,1,2,0,1
203,2022,Group E,Japan,0,1,0,1
204,2022,Group E,Spain,8,1,4,2


In [70]:
# Filter for year == 2022 and group_name == 'Group E'
away2022e = away_games[(away_games['year'] == 2022) & (away_games['group_name'] == 'Group E')]
away2022e

Unnamed: 0,year,group_name,away,goals_scored,goals_conceded,points_away,match_count_away
200,2022,Group E,Costa Rica,1,7,3,2
201,2022,Group E,Germany,1,1,1,1
202,2022,Group E,Japan,2,1,3,1


## aggregate data after first two matches

In [71]:
def uefa_before_last(home_games, away_games, agg_goals_before_last_day):
    # Step 1: Merge home_games and away_games on year, group_name, and home with away
    all_games_before_last = pd.merge(
        home_games,
        away_games,
        left_on=['year', 'group_name', 'home'],
        right_on=['year', 'group_name', 'away'],
        how='outer',
        suffixes=('_home', '_away')
    )

    # Step 2: For teams that only appear in home_games, assign home to team and copy relevant columns
    all_games_before_last['team'] = all_games_before_last['home'].fillna(all_games_before_last['away'])

    # Step 3: For goals_scored, goals_conceded, and points, handle missing values
    all_games_before_last['goals_scored'] = all_games_before_last['goals_scored_home'].fillna(0) + all_games_before_last['goals_scored_away'].fillna(0)
    all_games_before_last['goals_conceded'] = all_games_before_last['goals_conceded_home'].fillna(0) + all_games_before_last['goals_conceded_away'].fillna(0)
    all_games_before_last['points'] = all_games_before_last['points_home'].fillna(0) + all_games_before_last['points_away'].fillna(0)

    # Step 4: Add goals_difference column
    all_games_before_last['goals_difference'] = all_games_before_last['goals_scored'] - all_games_before_last['goals_conceded']

    # Step 5: Sum match_count_home and match_count_away to get total match count for each team
    all_games_before_last['total_matches'] = all_games_before_last['match_count_home'].fillna(0) + all_games_before_last['match_count_away'].fillna(0)

    # Step 6: Adjust points and matches if total_matches is 1 or 0 (indicating possible 0-0 draws)
    for i in range(len(all_games_before_last) - 1):
        row1 = all_games_before_last.iloc[i]
        row2 = all_games_before_last.iloc[i + 1]

        if row1['year'] == row2['year'] and row1['group_name'] == row2['group_name'] and row1['team'] != row2['team']:
            if row1['total_matches'] + row2['total_matches'] == 1:
                all_games_before_last.at[i, 'total_matches'] += 1
                all_games_before_last.at[i, 'points'] += 1
                all_games_before_last.at[i + 1, 'total_matches'] += 1
                all_games_before_last.at[i + 1, 'points'] += 1
            elif row1['total_matches'] + row2['total_matches'] == 0:
                all_games_before_last.at[i, 'total_matches'] += 2
                all_games_before_last.at[i, 'points'] += 2
                all_games_before_last.at[i + 1, 'total_matches'] += 2
                all_games_before_last.at[i + 1, 'points'] += 2

    # Step 7: Drop unnecessary columns used in the merge process
    all_games_before_last = all_games_before_last[['year', 'group_name', 'team', 'goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']]

    # Step 8: Initial sort by points, goals_difference, and goals_scored within each tournament and group
    all_games_before_last = all_games_before_last.sort_values(
        by=['year', 'group_name', 'points', 'goals_difference', 'goals_scored'],
        ascending=[True, True, False, False, False]
    ).reset_index(drop=True)

    # Step 9: Initialize a new column for tie-break results and `tied_won` to track winners
    all_games_before_last['tiebreaker'] = 'no need'
    all_games_before_last['tied_won'] = None  # This will be set to 1 for winner, 0 for loser in tie-breaks

    # Step 10: Define the function to check the tie-breaker
    def check_tiebreaker(row1, row2, agg_data):
        """
        Check the tie-breaker based on the head-to-head match result from agg_goals_before_last_day.
        Return the team that won (if any) or 'tie'.
        """
        match = agg_data[((agg_data['home'] == row1['team']) & (agg_data['away'] == row2['team'])) |
                         ((agg_data['home'] == row2['team']) & (agg_data['away'] == row1['team']))]
        
        if not match.empty:
            if match.iloc[0]['won'] == 1:
                return row1['team']  # Home team won
            elif match.iloc[0]['won'] == -1:
                return row2['team']  # Away team won
            else:
                return 'tie'  # It's a draw
        else:
            return 'no result'  # No match found

    # Step 11: Loop through the sorted dataframe and apply the tie-breaker if needed
    for i in range(len(all_games_before_last) - 1):
        row1 = all_games_before_last.iloc[i]
        row2 = all_games_before_last.iloc[i + 1]

        if row1['points'] == row2['points']:
            tiebreak_result = check_tiebreaker(row1, row2, agg_goals_before_last_day)

            if tiebreak_result != 'tie' and tiebreak_result != 'no result':
                # Assign the tiebreak winner and update tied_won
                if tiebreak_result == row1['team']:
                    all_games_before_last.at[i, 'tiebreaker'] = tiebreak_result
                    all_games_before_last.at[i, 'tied_won'] = 1
                    all_games_before_last.at[i + 1, 'tied_won'] = 0
                else:
                    all_games_before_last.at[i + 1, 'tiebreaker'] = tiebreak_result
                    all_games_before_last.at[i, 'tied_won'] = 0
                    all_games_before_last.at[i + 1, 'tied_won'] = 1
            else:
                if row1['goals_difference'] > row2['goals_difference']:
                    all_games_before_last.at[i, 'tiebreaker'] = row1['team']
                    all_games_before_last.at[i, 'tied_won'] = 1
                    all_games_before_last.at[i + 1, 'tied_won'] = 0
                elif row1['goals_difference'] < row2['goals_difference']:
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row2['team']
                    all_games_before_last.at[i, 'tied_won'] = 0
                    all_games_before_last.at[i + 1, 'tied_won'] = 1
                elif row1['goals_scored'] > row2['goals_scored']:
                    all_games_before_last.at[i, 'tiebreaker'] = row1['team']
                    all_games_before_last.at[i, 'tied_won'] = 1
                    all_games_before_last.at[i + 1, 'tied_won'] = 0
                else:
                    all_games_before_last.at[i + 1, 'tiebreaker'] = row2['team']
                    all_games_before_last.at[i, 'tied_won'] = 0
                    all_games_before_last.at[i + 1, 'tied_won'] = 1

    # Step 12: Sort to ensure winning team is ranked above losing team
    all_games_before_last['tiebreaker_rank'] = all_games_before_last['tied_won'].fillna(0).astype(int)
    all_games_before_last = all_games_before_last.sort_values(
        by=['year', 'group_name', 'points', 'tiebreaker_rank', 'goals_difference', 'goals_scored'],
        ascending=[True, True, False, False, False, False]
    ).reset_index(drop=True)
    
    # Step 13: Add group standing by ranking teams within each group based on the updated sort order
    all_games_before_last['standing'] = all_games_before_last.groupby(['year', 'group_name']).cumcount() + 1

    # Convert relevant columns to integers
    all_games_before_last[['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']] = all_games_before_last[
        ['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches']].astype(int)

    # Drop the helper column 'tiebreaker_rank'
    all_games_before_last = all_games_before_last.drop(columns=['tiebreaker_rank'])

    return all_games_before_last


In [72]:
all_games_before_last = uefa_before_last(home_games, away_games, agg_goals_before_last_day)

In [73]:
# Filtering all_games_before_last for the year 2022 and group_name 'Group E'
group2022e = all_games_before_last[
    (all_games_before_last['year'] == 2022) & 
    (all_games_before_last['group_name'] == 'Group E')
]

display(group2022e)

Unnamed: 0,year,group_name,team,goals_scored,goals_conceded,points,goals_difference,total_matches,tiebreaker,tied_won,standing
278,2022,Group E,Spain,8,1,4,7,2,no need,,1
279,2022,Group E,Costa Rica,1,7,3,-6,2,Costa Rica,1.0,2
280,2022,Group E,Japan,2,2,3,0,2,no need,0.0,3
281,2022,Group E,Germany,2,3,1,-1,2,no need,,4


# Recreate league table after last match day

In [74]:
def uefa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    # Step 1: Filter the data for the specific year and group_name
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) & 
        (all_games_before_last['group_name'] == group_name)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) & 
        (goals_last_day_sorted['group_name'] == group_name)
    ]

    # Step 2: Initialize columns for tracking team performance
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']

    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['last_game_standing'] = 0
 
    # Initialize position counters based on initial standings
    group_goals_tracking['1st'] = group_goals_tracking['before_last_game_standing'].apply(lambda x: 1 if x == 1 else 0)
    group_goals_tracking['2nd'] = group_goals_tracking['before_last_game_standing'].apply(lambda x: 1 if x == 2 else 0)
    group_goals_tracking['3rd'] = group_goals_tracking['before_last_game_standing'].apply(lambda x: 1 if x == 3 else 0)
    group_goals_tracking['4th'] = group_goals_tracking['before_last_game_standing'].apply(lambda x: 1 if x == 4 else 0)

    group_goals_tracking['changes'] = 0  # Initialize this but will be redefined later as the sum of 1st, 2nd, 3rd, 4th
    group_goals_tracking['tied'] = False  # Initialize a flag to track tied teams
    group_goals_tracking['tied_won'] = 0  # Initialize to track if the team won a tie-breaker match

    # Step 3: Sort group_goals_last_day by minute_regulation and minute_stoppage in ascending order
    group_goals_last_day = group_goals_last_day.sort_values(by=['minute_regulation', 'minute_stoppage'], ascending=[True, True])

    # Print the year, group_name, and standings before starting the loop for last match goals
    print(f"\n=== Initial Standings for Year {year}, Group {group_name} Before Last Match Goals ===\n")
    display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 
                       'total_goal_difference', 'before_last_game_points', 'before_last_game_standing']
    print(group_goals_tracking[display_columns].to_string(index=False))
    print("\n====================================================\n")

    # Step 4: Iterate through the sorted and filtered last match goals and update the goals_tracking table
    previous_standings = group_goals_tracking['last_game_standing'].copy()
    first_iteration = True  # Variable to track the first iteration

    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home']
        away_team = goal['away']
        player_team = goal['player_team_name']
        own_goal = goal['own_goal']

        # Print goal information for each goal
        print(f"Analyzing goal: {goal['minute_regulation']} minute, {goal['minute_stoppage']} extra time, Player team: {player_team}, Home: {home_team}, Away: {away_team}")

        # Update the goals based on who scored the goal
        if own_goal == 0:  # Normal goal
            if player_team == home_team:
                # Home team scored, update home scored and away conceded
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
            elif player_team == away_team:
                # Away team scored, update away scored and home conceded
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
        elif own_goal == 1:  # Own goal
            if player_team == home_team:
                # Own goal by home team, away team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
            elif player_team == away_team:
                # Own goal by away team, home team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1

        # Step 5: Update total_goals_scored, total_goals_conceded, and total_goal_difference
        group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored'] + group_goals_tracking['last_game_goals_scored']
        group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded'] + group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Step 6: Assign points for the last game dynamically after each goal
        for i, row in group_goals_tracking.iterrows():
            if row['last_game_goals_scored'] > row['last_game_goals_conceded']:
                # Check the year and assign 3 or 2 points based on the year
                if year > 1994:
                    group_goals_tracking.loc[i, 'last_game_points'] = 3  # Win after 1994
                else:
                    group_goals_tracking.loc[i, 'last_game_points'] = 2  # Win before or during 1994
            elif row['last_game_goals_scored'] == row['last_game_goals_conceded']:
                group_goals_tracking.loc[i, 'last_game_points'] = 1  # Draw
            else:
                group_goals_tracking.loc[i, 'last_game_points'] = 0  # Loss



        # Step 7: Update total points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Step 8: Mark teams that are tied
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)

        # Reset `tied_won` to 0 for all teams
        group_goals_tracking['tied_won'] = 0

        # Step 8b: Calculate tied_won only for tied teams
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        if not tied_teams.empty:
            # Iterate over tied teams to resolve standings using head-to-head results
            for index, row in tied_teams.iterrows():
                team1 = row['team']
                team1_index = index

                # Look for other teams tied with this team
                for other_index, other_row in tied_teams[tied_teams.index != index].iterrows():
                    team2 = other_row['team']
                    team2_index = other_index

                    # Check if these two teams played against each other in agg_goals_before_last_day
                    match = agg_goals_before_last_day[
                        ((agg_goals_before_last_day['home'] == team1) & (agg_goals_before_last_day['away'] == team2)) |
                        ((agg_goals_before_last_day['home'] == team2) & (agg_goals_before_last_day['away'] == team1))
                    ]

                    if not match.empty:
                        match_result = match.iloc[0]['won']  # Assume the 'won' column holds 1 for home win, -1 for away win, 0 for draw

                        # Resolve the tie using the match result
                        if match_result == 1:
                            # Home team won
                            if match.iloc[0]['home'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == -1:
                            # Away team won
                            if match.iloc[0]['away'] == team1:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                            else:
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == 0:
                            # Draw, both teams get the same standing
                            group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 0
                            group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 0

        # Step 8c: Sort teams by total points, tied_won, goal difference, and goals scored
        group_goals_tracking = group_goals_tracking.sort_values(by=['total_points', 'tied_won', 'total_goal_difference', 'total_goals_scored'],
                                                                ascending=[False, False, False, False])

        # Step 9: Assign standings based on the sorting and tie resolution
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Step 10: Track changes and update standing positions after each goal is processed
        for i, row in group_goals_tracking.iterrows():
            team = row['team']

            if first_iteration:
                # Skip the update if the standing did not change
                if row['before_last_game_standing'] == row['last_game_standing']:
                    continue  # Skip updating the counters
                else:
                    # Update the position counters since standing has changed
                    if row['last_game_standing'] == 1:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '1st'] += 1
                    elif row['last_game_standing'] == 2:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '2nd'] += 1
                    elif row['last_game_standing'] == 3:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '3rd'] += 1
                    elif row['last_game_standing'] == 4:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '4th'] += 1

                # Disable first iteration flag after the first goal
                first_iteration = False
            else:
                # Track the standing position counters only if the position differs from the previous state
                if row['last_game_standing'] != previous_standings[i]:  
                    if row['last_game_standing'] == 1:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '1st'] += 1
                    elif row['last_game_standing'] == 2:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '2nd'] += 1
                    elif row['last_game_standing'] == 3:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '3rd'] += 1
                    elif row['last_game_standing'] == 4:
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, '4th'] += 1

        # Update previous standings after each goal
        previous_standings = group_goals_tracking['last_game_standing'].copy()

        # Step 11: Calculate changes as the sum of 1st, 2nd, 3rd, and 4th
        group_goals_tracking['changes'] = group_goals_tracking[['1st', '2nd', '3rd', '4th']].sum(axis=1)

        # Step 12: Print the updated group_goals_tracking after processing each goal
        print("\n=== Updated Standings After This Goal ===\n")
        display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 
                           'total_goal_difference', 'last_game_points', 'last_game_standing', 
                           'changes', '1st', '2nd', '3rd', '4th', 'tied', 'tied_won']
        print(group_goals_tracking[display_columns].to_string(index=False))
        print("\n========================================\n")

    # Step 13: Return the final DataFrame
    return group_goals_tracking


In [75]:
# Initialize an empty list to store the results for each pair
all_results = []

# Get unique pairs of year and group_name
unique_pairs = all_games_before_last[['year', 'group_name']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    group_name = row['group_name']
    
    # Apply the function to the current pair
    result = uefa_final_wc(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_results.append(result)

# Concatenate all the results into a single DataFrame
changes_df_wc = pd.concat(all_results)

# Keep only the specified columns
changes_df_wc = changes_df_wc[['year', 'group_name', 'team', '1st', '2nd', '3rd', '4th', 'changes']]

# Display the final DataFrame
display(changes_df_wc)



=== Initial Standings for Year 1986, Group Group A Before Last Match Goals ===

       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_points  before_last_game_standing
  Argentina             3                   4                     2                      2                        3                          1
      Italy             2                   2                     2                      0                        2                          2
   Bulgaria             2                   2                     2                      0                        2                          3
South Korea             1                   2                     4                     -2                        1                          4


Analyzing goal: 4 minute, 0 extra time, Player team: Argentina, Home: Argentina, Away: Bulgaria

=== Updated Standings After This Goal ===

       team  total_points  total_goals_scored  total_goals_con

Unnamed: 0,year,group_name,team,1st,2nd,3rd,4th,changes
0,1986,Group A,Argentina,1,0,0,0,1
1,1986,Group A,Italy,0,1,0,0,1
2,1986,Group A,Bulgaria,0,0,1,0,1
3,1986,Group A,South Korea,0,0,0,1,1
4,1986,Group B,Mexico,1,0,0,0,1
...,...,...,...,...,...,...,...,...
289,2022,Group G,Serbia,0,1,0,2,3
290,2022,Group H,Portugal,1,0,0,0,1
291,2022,Group H,Ghana,0,2,1,0,3
292,2022,Group H,South Korea,0,0,2,1,3


In [76]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\uefa\tb_wc_uefa_men.xlsx'
changes_df_wc.to_excel(file_path, index=False)

# group composition tracking

In [77]:
import pandas as pd

def track_composition_changes(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    # Step 1: Filter the data for the specific year and group_name
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) & 
        (all_games_before_last['group_name'] == group_name)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) & 
        (goals_last_day_sorted['group_name'] == group_name)
    ]

    # Initialize columns for team performance and standings
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['tied_won'] = 0  # Initialize tied_won for tiebreak resolution

    # Add one point to each team for a 0-0 starting score
    group_goals_tracking['total_points'] += 1

    # Print initial standings after adding the 0-0 points
    print(f"\n=== Initial Standings for {group_name}, {year} (with 0-0 points added) ===")
    print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 
                                'total_goals_conceded', 'total_goal_difference', 'before_last_game_standing']].to_string(index=False))
    print("\n====================================================\n")

    # Define top standings limit based on the year
    top_standings_limit = 3 if year <= 1994 else 2

    # Step 2: Initialize composition tracking with initial composition (change_num = 0)
    sorted_initial = group_goals_tracking.sort_values(by=['total_points', 'total_goal_difference', 'total_goals_scored'], ascending=[False, False, False])
    initial_top_teams = set(sorted_initial.nsmallest(top_standings_limit, 'before_last_game_standing')['team'])
    composition_changes = [{
        'year': year,
        'group_name': group_name,
        'change_num': 0,
        'goal_time': 'initial',
        'home_team': None,
        'away_team': None,
        'scorer_team': None,
        'new_top_teams': list(initial_top_teams),
        '1st': sorted_initial.iloc[0]['team'] if len(sorted_initial) > 0 else None,
        '2nd': sorted_initial.iloc[1]['team'] if len(sorted_initial) > 1 else None,
        '3rd': sorted_initial.iloc[2]['team'] if len(sorted_initial) > 2 else None
    }]
    change_counter = 0  # Counter for the number of composition changes

    # Step 3: Sort goals by regulation time
    group_goals_last_day = group_goals_last_day.sort_values(by=['minute_regulation', 'minute_stoppage'], ascending=[True, True])

    # Step 4: Iterate through each goal and track changes in composition
    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home']
        away_team = goal['away']
        scorer_team = goal['player_team_name']
        own_goal = goal.get('own_goal', 0)  # Check if the goal is an own goal; default to 0 if column not present

        # Update scores based on who scored the goal or if it was an own goal
        if own_goal == 1:
            # Own goal case
            if scorer_team == home_team:
                # Home team concedes an own goal, so away team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1
            elif scorer_team == away_team:
                # Away team concedes an own goal, so home team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
        else:
            # Normal goal case
            if scorer_team == home_team:
                # Home team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
            elif scorer_team == away_team:
                # Away team scores
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
                group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1

        # Update goal difference
        group_goals_tracking['total_goal_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Step 5: Update last_game_points based on the current game state
        group_goals_tracking['last_game_points'] = group_goals_tracking.apply(
            lambda row: 3 if row['last_game_goals_scored'] > row['last_game_goals_conceded'] else (1 if row['last_game_goals_scored'] == row['last_game_goals_conceded'] else 0), 
            axis=1
        )

        # Calculate total points by adding last game points to before_last_game_points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Step 6: Resolve ties based on head-to-head results
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)
        group_goals_tracking['tied_won'] = 0  # Reset tied_won for all teams

        # Only resolve ties for tied teams with the same total points
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]
        if not tied_teams.empty:
            for _, row in tied_teams.iterrows():
                team1 = row['team']
                for _, other_row in tied_teams[tied_teams['team'] != team1].iterrows():
                    team2 = other_row['team']
                    match = agg_goals_before_last_day[
                        ((agg_goals_before_last_day['home'] == team1) & (agg_goals_before_last_day['away'] == team2)) |
                        ((agg_goals_before_last_day['home'] == team2) & (agg_goals_before_last_day['away'] == team1))
                    ]

                    if not match.empty:
                        match_result = match.iloc[0]['won']
                        if match_result == 1 and match.iloc[0]['home'] == team1:
                            group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                        elif match_result == -1 and match.iloc[0]['away'] == team1:
                            group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tied_won'] = 1
                        elif match_result == 1 and match.iloc[0]['home'] == team2:
                            group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1
                        elif match_result == -1 and match.iloc[0]['away'] == team2:
                            group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tied_won'] = 1

        # Step 7: Sort teams by updated points and tie-breaking criteria
        group_goals_tracking = group_goals_tracking.sort_values(
            by=['total_points', 'tied_won', 'total_goal_difference', 'total_goals_scored'], 
            ascending=[False, False, False, False]
        )
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Print standings after each goal
        print(f"\n=== Standings after goal at minute {goal['minute_regulation']} in {group_name}, edition {year} ===")
        print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goal_difference', 'last_game_standing', 'tied_won']].to_string(index=False))
        print("\n========================================\n")

        # Track top teams and composition changes if needed
        top_teams = group_goals_tracking[group_goals_tracking['last_game_standing'] <= 3]
        first_place_team = top_teams.iloc[0]['team'] if len(top_teams) > 0 else None
        second_place_team = top_teams.iloc[1]['team'] if len(top_teams) > 1 else None
        third_place_team = top_teams.iloc[2]['team'] if len(top_teams) > 2 else None

        # Check and record any composition changes
        current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
        if current_top_teams != initial_top_teams:
            change_counter += 1
            initial_top_teams = current_top_teams
            composition_changes.append({
                'year': year,
                'group_name': group_name,
                'change_num': change_counter,
                'goal_time': goal['minute_regulation'],
                'extra_time': goal['minute_stoppage'],
                'home_team': home_team,
                'away_team': away_team,
                'player_team_name': scorer_team,
                'new_top_teams': list(current_top_teams),
                '1st': first_place_team,
                '2nd': second_place_team,
                '3rd': third_place_team
            })

    return pd.DataFrame(composition_changes)


In [78]:
# Initialize an empty list to store the results for each pair
all_composition_changes = []

# Get unique pairs of year and group_name
unique_pairs = all_games_before_last[['year', 'group_name']].drop_duplicates()

# Loop through each unique pair
for _, row in unique_pairs.iterrows():
    year = row['year']
    group_name = row['group_name']
    
    # Apply the function to the current pair
    composition_changes_df = track_composition_changes(year, group_name, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day)
    
    # Append the result to the list
    all_composition_changes.append(composition_changes_df)

# Concatenate all the results into a single DataFrame
final_composition_changes_df = pd.concat(all_composition_changes, ignore_index=True)

# Rename 'player_team' column to 'scorer_team'
final_composition_changes_df.rename(columns={'scorer_team_name': 'scorer_nationality'}, inplace=True)

# Dropping the "scorer_team" column
final_composition_changes_df.drop(columns=['scorer_team'], inplace=True)


=== Initial Standings for Group A, 1986 (with 0-0 points added) ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  before_last_game_standing
  Argentina             4                   4                     2                      2                          1
      Italy             3                   2                     2                      0                          2
   Bulgaria             3                   2                     2                      0                          3
South Korea             2                   2                     4                     -2                          4



=== Standings after goal at minute 4 in Group A, edition 1986 ===
       team  total_points  total_goals_scored  total_goals_conceded  total_goal_difference  last_game_standing  tied_won
  Argentina             6                   5                     2                      3                   1         0
      Italy             3     

In [79]:
# Filtering all_games_before_last for the year 2022 and group_name 'Group E'
group2022e = final_composition_changes_df[
    (final_composition_changes_df['year'] == 2022) & 
    (final_composition_changes_df['group_name'] == 'Group E')
]

display(group2022e)

Unnamed: 0,year,group_name,change_num,goal_time,home_team,away_team,new_top_teams,1st,2nd,3rd,extra_time,player_team_name
124,2022,Group E,0,initial,,,"[Costa Rica, Spain]",Spain,Japan,Costa Rica,,
125,2022,Group E,1,10,Costa Rica,Germany,"[Japan, Spain]",Spain,Japan,Germany,0.0,Germany
126,2022,Group E,2,11,Japan,Spain,"[Germany, Spain]",Spain,Germany,Costa Rica,0.0,Spain
127,2022,Group E,3,48,Japan,Spain,"[Japan, Spain]",Spain,Japan,Germany,0.0,Japan
128,2022,Group E,4,70,Costa Rica,Germany,"[Costa Rica, Japan]",Costa Rica,Japan,Spain,0.0,Costa Rica
129,2022,Group E,5,73,Costa Rica,Germany,"[Japan, Spain]",Japan,Spain,Costa Rica,0.0,Germany


In [80]:
# Exporting final df
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\uefa\standings_wc_eufa_men.xlsx'
final_composition_changes_df.to_excel(file_path, index=False)


# best four third placed

In [81]:
def best_four_third_placed_wc_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    # Apply filter to process only years <= 1994
    all_games_before_last = all_games_before_last[(all_games_before_last['year'] <= 1994)].copy()
    
    # Ensure 'year' column is integer in case it has been stored as float or other format
    all_games_before_last['year'] = all_games_before_last['year'].astype(int)

    # Initialize a list to store data for each year that will be appended into the final DataFrame
    all_years_data = []

    # Group data by year and process year-by-year
    for (year,), year_data in all_games_before_last.groupby(['year']):  # Unpacking year from a tuple
        # Ensure 'year' is an integer
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Initialize dictionaries to track third-placed teams and top 4 counts for the current year
        third_teams_count = {}
        top4_count = {}

        # Step 1: Initialize columns for tracking team performance before and after goals
        year_data.loc[:, 'before_last_game_goals_scored'] = year_data['goals_scored']
        year_data.loc[:, 'before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data.loc[:, 'before_last_game_points'] = year_data['points']

        year_data.loc[:, 'last_game_goals_scored'] = 0
        year_data.loc[:, 'last_game_goals_conceded'] = 0
        year_data.loc[:, 'last_game_points'] = 0
        year_data.loc[:, 'total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
        year_data.loc[:, 'total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
        year_data.loc[:, 'total_goal_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

        # Print the standings before any goal is processed
        print(f"\n=== Initial Standings for Year {year} Before Processing Any Goals ===\n")
        display_columns = ['team', 'group_name', 'before_last_game_points', 'before_last_game_goals_scored', 'before_last_game_goals_conceded', 'total_goal_difference']
        print(year_data[display_columns].sort_values(by=['group_name', 'before_last_game_points', 'total_goal_difference', 'before_last_game_goals_scored'], ascending=[True, False, False, False]).to_string(index=False))
        print("\n===============================================================\n")

        # Process each goal from the sorted last-day goals for the current year
        goals_last_day_year = goals_last_day_sorted[(goals_last_day_sorted['year'] == year)]

        for i, goal in goals_last_day_year.iterrows():
            print(f"\n--- Analyzing Goal {i + 1} in {goal['group_name']}: On date {goal['match_date']}, Minute {goal['minute_regulation']} (Player Team: {goal['player_team_name']}, Home: {goal['home']}, Away: {goal['away']}) ---")

            home_team = goal['home']
            away_team = goal['away']
            player_team = goal['player_team_name']

            # Identify the opponent team
            opponent_team = home_team if player_team == away_team else away_team

            year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
            year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

            # Update total goals scored, goals conceded, and goal difference
            year_data.loc[:, 'total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
            year_data.loc[:, 'total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
            year_data.loc[:, 'total_goal_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

            # Assign points for the last game dynamically
            year_data.loc[:, 'last_game_points'] = (year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']).astype(int) * 3 + \
                                                   (year_data['last_game_goals_scored'] == year_data['last_game_goals_conceded']).astype(int)

            # Update total points
            year_data.loc[:, 'total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Reset third-placed teams after every goal
            third_teams_per_year = []

            # Recalculate standings after the goal
            for group, group_data in year_data.groupby('group_name'):
                sorted_standings = group_data.sort_values(by=['total_points', 'total_goal_difference', 'total_goals_scored'], ascending=[False, False, False])

                # Identify the third-placed teams after each goal, **by group**
                if len(sorted_standings) >= 3:
                    third_placed_team = sorted_standings.iloc[2]
                    third_teams_per_year.append(third_placed_team['team'])

            # Update the count for third-placed teams after each goal
            for team in third_teams_per_year:
                if team not in third_teams_count:
                    third_teams_count[team] = 0
                third_teams_count[team] += 1

            # Create DataFrame of third-placed teams with additional stats
            third_teams_df = year_data[year_data['team'].isin(third_teams_per_year)][
                ['team', 'total_points', 'last_game_points', 'total_goal_difference', 'total_goals_scored']
            ]

            # Identify the best four third-placed teams after each goal
            top4_teams = third_teams_df['team'].value_counts().nlargest(4).index.tolist()

            # Print third-placed teams' additional statistics after each goal
            print(f"Top 4 third-placed teams after this goal: {top4_teams}")
            print(f"Third-placed teams DataFrame after this goal:\n{third_teams_df}\n")

            # Update the count for top 4 third-placed teams after each goal
            for team in top4_teams:
                if team not in top4_count:
                    top4_count[team] = 0
                top4_count[team] += 1

        # After processing all goals for the year, store the results in the final list
        for team, third_count in third_teams_count.items():
            top4_count_for_team = top4_count.get(team, 0)
            all_years_data.append({
                'team': team,
                'year': year,
                'third_place_count': third_count,
                'top4_third_place_count': top4_count_for_team
            })

    # Create a DataFrame from the list of all years' data
    final_df = pd.DataFrame(all_years_data)
    
    return final_df


In [82]:
# Call the function to get the DataFrame
final_df = best_four_third_placed_wc_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day)

# Define the file path and save to Excel
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\uefa\third_teams_wc_uefa_men.xlsx'
final_df.to_excel(file_path, index=False)



--- Processing Year: 1986 ---

=== Initial Standings for Year 1986 Before Processing Any Goals ===

            team group_name  before_last_game_points  before_last_game_goals_scored  before_last_game_goals_conceded  total_goal_difference
       Argentina    Group A                        3                              4                                2                      2
           Italy    Group A                        2                              2                                2                      0
        Bulgaria    Group A                        2                              2                                2                      0
     South Korea    Group A                        1                              2                                4                     -2
          Mexico    Group B                        3                              3                                2                      1
        Paraguay    Group B                        3       