# Convert time strings to UTC

In [363]:
def convert_time_to_utc(time_str):
    """
    Convert a free-form local time string into a UTC "HH:MM" string.

    The first HH:MM found in the text is taken as the local time. A 12-hour
    marker ("a.m.", "p.m.", "AM", "pm", ...) is folded into 24-hour form, then
    the UTC offset is read from an explicit "UTC+H" / "UTC-H:MM" notation or,
    failing that, from a known timezone acronym (CET, EST, ...).

    Args:
        time_str: Text containing a time, e.g. "21:00CEST[note 1]" or
            "7:30 PM EST".

    Returns:
        "" if the input is not a non-empty string or contains no HH:MM time;
        the normalized local "HH:MM" if no timezone could be detected;
        otherwise the time shifted to UTC as "HH:MM". Only the clock time is
        returned, so results wrap around midnight.

    Raises:
        ValueError: If a timezone is detected but the extracted hour/minute
            is not a valid clock time (e.g. "25:00 CET").
    """
    print(f"Original time string: {time_str}")

    if not isinstance(time_str, str) or not time_str.strip():
        print("Invalid input (not a string or empty). Returning empty string.")
        return ""

    # Remove annotations inside brackets (e.g., [note 1])
    time_str = re.sub(r'\[.*?\]', '', time_str).strip()

    # Ensure spaces between digits and letters (e.g., "21:00CET" → "21:00 CET")
    time_str = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', time_str)

    # Extract the first valid HH:MM time format from the string
    match_time = re.search(r'(\d{1,2}):(\d{2})', time_str)
    if not match_time:
        print("No valid time found. Returning empty string.\n")
        return ""

    hour, minute = map(int, match_time.groups())
    print(f"Extracted local time: {hour:02}:{minute:02}")

    # Convert AM/PM to 24-hour format if present. The markers are accepted
    # with or without dots and in any case, but only as stand-alone tokens so
    # that letters inside other words are not mistaken for them.
    lowered = time_str.lower()
    has_am = re.search(r'(?<![a-z])a\.?m\.?(?![a-z])', lowered) is not None
    has_pm = re.search(r'(?<![a-z])p\.?m\.?(?![a-z])', lowered) is not None
    if has_am and hour == 12:
        hour = 0  # Midnight case
    elif has_pm and hour < 12:
        # hour < 12 (not != 12) so an already 24-hour value such as
        # "13:00 p.m." is not pushed past 23.
        hour += 12
    local_time = f"{hour:02}:{minute:02}"
    print(f"Normalized 24-hour format: {local_time}")

    # Step 2: Extract UTC offset from explicit notation (e.g., UTC+1, UTC−3,
    # UTC+5:30). The sign may be an ASCII hyphen, a Unicode minus (U+2212) or
    # an en dash (U+2013), which scraped text commonly uses.
    offset_minutes = None
    match_utc = re.search(r'UTC\s*([+\-\u2212\u2013]?)(\d+)(?::(\d{2}))?', time_str)
    if match_utc:
        sign_text, offset_hours, offset_mins = match_utc.groups()
        sign = -1 if sign_text in ('-', '\u2212', '\u2013') else 1
        offset_minutes = sign * (int(offset_hours) * 60 + int(offset_mins or 0))

    # Step 3: Handle specific timezone acronyms if no explicit UTC offset
    timezone_offsets = {
        "CEST": 2, "CET": 1, "EEST": 3, "EDT": -4, "EST": -5,
        "PDT": -7, "PST": -8, "CDT": -5, "CST": -6,
        "JST": 9, "KST": 9, "BRT": -3, "MSK": 3
    }

    if offset_minutes is None:
        for tz, offset in timezone_offsets.items():
            # Whole-word match: "EST" must not fire inside "CEST" or "AEST".
            if re.search(rf'\b{tz}\b', time_str):
                offset_minutes = offset * 60
                print(f"Detected timezone acronym: {tz}, UTC Offset: {offset}")
                break

    # If no timezone is found, return the original time
    if offset_minutes is None:
        print(f"No valid timezone found. Returning original time: {local_time}\n")
        return local_time

    # Step 4: Convert local time to UTC
    local_time_obj = datetime.strptime(local_time, '%H:%M')
    utc_time_obj = local_time_obj - timedelta(minutes=offset_minutes)

    utc_time = utc_time_obj.strftime('%H:%M')
    print(f"Final UTC time: {utc_time}\n")

    return utc_time


# League standings and points after the first matchday

In [364]:
def after_first(goals_df):
    """
    Summarise the first match date of every (year, stage) and collect the goals
    scored on the last two match dates.

    Args:
        goals_df: DataFrame with one row per goal. The columns read here are
            'year', 'stage', 'short_date', 'local_time', 'home_team',
            'away_team', 'score', 'scorer_nationality', 'half_time' and
            'goal_minute'.

    Returns:
        A tuple ``(agg_goals_after_first_day, goals_last_two_matchdays_sorted)``:

        * ``agg_goals_after_first_day`` - one row per match played on the first
          date of its (year, stage), with the goals counted per side, the
          reported and recomputed score, whether they agree ('score_match'),
          and 'won' (1 home win, -1 away win, 0 draw).
        * ``goals_last_two_matchdays_sorted`` - the goal rows that fall on the
          two latest distinct dates of each (year, stage), first-date rows
          excluded, sorted by date, half and minute.
    """
    date_keys = ['year', 'stage', 'short_date']

    # Step 1: Find the first match date for each tournament and group
    first_dates = goals_df.groupby(['year', 'stage'])['short_date'].min().reset_index()

    # Step 2: All goals scored on the first match date
    goals_first_matchday = goals_df.merge(first_dates, on=date_keys, how='inner')

    # Step 3: All goals scored after the first match date (anti-join)
    flagged = goals_df.merge(first_dates, on=date_keys, how='left', indicator=True)
    goals_after_first_matchday = (
        flagged[flagged['_merge'] == 'left_only']
        .drop(columns=['_merge'])
        # Step 4: real datetimes so dates order chronologically
        .assign(short_date=lambda df: pd.to_datetime(df['short_date']))
    )

    # Find the last two *distinct* match dates per tournament and group.
    # The frame has one row per goal, so the dates must be de-duplicated first:
    # picking the two largest rows would usually yield the same final date
    # twice, dropping the second-last date and duplicating rows in the merge.
    last_two_dates = (
        goals_after_first_matchday[date_keys]
        .drop_duplicates()
        .sort_values('short_date')
        .groupby(['year', 'stage'])
        .tail(2)
    )

    # Step 5: Goals scored on those last two dates, in chronological order
    goals_last_two_matchdays = goals_after_first_matchday.merge(
        last_two_dates, on=date_keys, how='inner'
    )
    goals_last_two_matchdays_sorted = goals_last_two_matchdays.sort_values(
        by=['short_date', 'half_time', 'goal_minute']
    )

    # One summary row per first-date match
    results = []
    for match_id, group in goals_first_matchday.groupby(['year', 'stage', 'home_team', 'away_team']):
        # local_time and score are taken from the first row of the match
        first_row = group.iloc[0]
        local_time = first_row['local_time']
        score = first_row['score']
        short_date = pd.to_datetime(local_time).date()

        # A goal counts for the side whose name equals the scorer's nationality;
        # rows matching neither side are not counted.
        scorer = group['scorer_nationality']
        home_scored = scorer == group['home_team']
        away_scored = ~home_scored & (scorer == group['away_team'])
        goals_home = int(home_scored.sum())
        goals_away = int(away_scored.sum())

        # Compare against the reported score, normalizing en/em dashes.
        calculated_score = f"{goals_home}-{goals_away}"
        if isinstance(score, str):
            normalized_score = score.replace("–", "-").replace("—", "-")
        else:
            # Missing or non-text score: cannot match, but must not crash.
            normalized_score = None
        score_match = normalized_score == calculated_score

        results.append({
            'year': match_id[0],
            'stage': match_id[1],
            'home_team': match_id[2],
            'away_team': match_id[3],
            'local_time': local_time,
            'short_date': short_date,
            'goals_home': goals_home,
            'goals_away': goals_away,
            'original_score': score,
            'calculated_score': calculated_score,
            'score_match': score_match
        })

    # Explicit columns keep the frame well-formed even when there are no matches
    agg_goals_after_first_day = pd.DataFrame(results, columns=[
        'year', 'stage', 'home_team', 'away_team', 'local_time', 'short_date',
        'goals_home', 'goals_away', 'original_score', 'calculated_score', 'score_match'
    ])

    # 'won': 1 home win, -1 away win, 0 draw
    goal_margin = agg_goals_after_first_day['goals_home'] - agg_goals_after_first_day['goals_away']
    agg_goals_after_first_day['won'] = (goal_margin > 0).astype(int) - (goal_margin < 0).astype(int)

    return agg_goals_after_first_day, goals_last_two_matchdays_sorted


# League standings and points before the last matchday

In [365]:
def before_last(goals_df):
    """
    Split the goals data at the last match date of every (year, stage).

    Returns a tuple of two DataFrames:
      * one summary row per match played before the last date, holding the
        goals counted for each side, the reported and recomputed score, a
        'score_match' flag and 'won' (1 home win, -1 away win, 0 draw);
      * the de-duplicated goal rows of the last date, sorted by date, kick-off
        time, half and minute.
    """
    date_keys = ['year', 'stage', 'short_date']

    # Latest date on which each (year, stage) has a goal row
    final_dates = goals_df.groupby(['year', 'stage'])['short_date'].max().reset_index()

    # Anti-join: keep only rows that are NOT on the final date
    tagged = goals_df.merge(final_dates, on=date_keys, how='left', indicator=True)
    earlier_goals = tagged[tagged['_merge'] == 'left_only'].drop(columns=['_merge'])

    # Rows on the final date, ordered chronologically and without duplicates
    goals_last_day_sorted = (
        goals_df.merge(final_dates, on=date_keys, how='inner')
        .sort_values(
            by=['short_date', 'local_time', 'half_time', 'goal_minute'],
            ascending=[True, True, True, True],
        )
        .drop_duplicates()
    )

    match_rows = []
    for key, goals in earlier_goals.groupby(['year', 'stage', 'home_team', 'away_team']):
        # Kick-off time and reported score come from the match's first row
        opener = goals.iloc[0]
        kickoff = opener['local_time']
        reported = opener['score']
        match_day = pd.to_datetime(kickoff).date()

        # Credit each goal to the side whose name equals the scorer's nationality
        home_tally = 0
        away_tally = 0
        for home, away, nation in zip(
            goals['home_team'], goals['away_team'], goals['scorer_nationality']
        ):
            if home == nation:
                home_tally += 1
            elif away == nation:
                away_tally += 1

        # Recomputed score versus the reported one (en/em dashes normalized)
        tallied = f"{home_tally}-{away_tally}"
        agrees = reported.replace("–", "-").replace("—", "-") == tallied

        match_rows.append({
            'year': key[0],
            'stage': key[1],
            'home_team': key[2],
            'away_team': key[3],
            'local_time': kickoff,
            'short_date': match_day,
            'goals_home': home_tally,
            'goals_away': away_tally,
            'original_score': reported,
            'calculated_score': tallied,
            'score_match': agrees
        })

    agg_goals_before_last_day = pd.DataFrame(match_rows)

    def outcome(match):
        """Return 1 for a home win, -1 for an away win, 0 for a draw."""
        if match['goals_home'] > match['goals_away']:
            return 1
        if match['goals_home'] < match['goals_away']:
            return -1
        return 0

    agg_goals_before_last_day['won'] = agg_goals_before_last_day.apply(outcome, axis=1)

    return agg_goals_before_last_day, goals_last_day_sorted


In [366]:
def calculate_points(results, years, win_result):
    """
    Total the points earned over a sequence of match results.

    ``results`` and ``years`` are walked in lockstep. A result equal to
    ``win_result`` (1 when scoring the home side, -1 for the away side) is a
    win worth 2 points for years up to and including 1992 and 3 points
    afterwards; a result of 0 is a draw worth 1 point; anything else is a
    loss worth 0.
    """
    def award(result, year):
        # The win test comes first, exactly as the draw test follows it.
        if result == win_result:
            return 2 if year <= 1992 else 3
        return 1 if result == 0 else 0

    return sum(award(result, year) for result, year in zip(results, years))

In [367]:
def aggregate_home_away_points(agg_goals_before_last_day):
    """
    Aggregate goals scored, goals conceded, points, and match count
    for both home and away games based on the historical point system.

    Each (year, stage, home_team, away_team) combination is counted once:
    duplicate rows for the same fixture are dropped before aggregating.

    Args:
        agg_goals_before_last_day: One row per match with the columns 'year',
            'stage', 'home_team', 'away_team', 'goals_home', 'goals_away' and
            'won' (1 home win, -1 away win, 0 draw).

    Returns:
        A tuple ``(home_games, away_games)``. ``home_games`` has one row per
        (year, stage, home_team) with 'goals_scored', 'goals_conceded',
        'points_home' and 'match_count_home'; ``away_games`` is the mirror
        image per (year, stage, away_team) with 'points_away' and
        'match_count_away'. Points are computed by ``calculate_points``.
    """
    # Remove duplicates to ensure each match is counted only once per home-away combination
    unique_matches = agg_goals_before_last_day.drop_duplicates(
        subset=['year', 'stage', 'home_team', 'away_team']
    )

    def points_for(win_result):
        # Builds the per-group aggregator: the group's 'won' values keep their
        # original index, which is used to look up the matching years.
        def total(won):
            return calculate_points(won, unique_matches.loc[won.index, 'year'], win_result)
        return total

    # Step 1: Home games aggregation with match count and conditional points based on year
    home_games = unique_matches.groupby(['year', 'stage', 'home_team']).agg(
        goals_scored=('goals_home', 'sum'),
        goals_conceded=('goals_away', 'sum'),
        points_home=('won', points_for(1)),
        match_count_home=('home_team', 'count')
    ).reset_index()

    # Step 2: Away games aggregation with match count and conditional points based on year
    away_games = unique_matches.groupby(['year', 'stage', 'away_team']).agg(
        goals_scored=('goals_away', 'sum'),
        goals_conceded=('goals_home', 'sum'),
        points_away=('won', points_for(-1)),
        match_count_away=('away_team', 'count')
    ).reset_index()

    return home_games, away_games


## tie-break function 

In [368]:
def tiebreaker_before(row1, row2, agg_data):
    """
    Decide which of two level teams ranks higher.

    Criteria, in order: a decisive head-to-head match found in ``agg_data``
    (same year and stage as ``row1``), then 'goals_difference', then
    'goals_scored'. A drawn or missing head-to-head falls through to the
    next criterion.

    Returns ``(winner, row1_flag, row2_flag)``: the winning team's name with
    1/0 flags marking which row won, or ``('tie', 0, 0)`` when nothing
    separates the two.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    def verdict(criterion, row1_wins):
        # Announce the deciding criterion and build the return tuple.
        if row1_wins:
            print(f"{criterion} result: {row1['team']} wins")
            return row1['team'], 1, 0
        print(f"{criterion} result: {row2['team']} wins")
        return row2['team'], 0, 1

    # Head-to-head fixture between the two teams, in either home/away order
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    row1_at_home = (agg_data['home_team'] == row1['team']) & (agg_data['away_team'] == row2['team'])
    row2_at_home = (agg_data['home_team'] == row2['team']) & (agg_data['away_team'] == row1['team'])
    match = agg_data[same_group & (row1_at_home | row2_at_home)]

    # First criterion: head-to-head result
    if match.empty:
        print("No head-to-head match found.")
    else:
        fixture = match.iloc[0]
        print("Head-to-head match found:")
        print(fixture)

        if fixture['won'] == 1:  # home side won
            return verdict("Head-to-head", fixture['home_team'] == row1['team'])
        if fixture['won'] == -1:  # away side won
            return verdict("Head-to-head", fixture['away_team'] == row1['team'])

    # Second and third criteria: goal difference, then goals scored
    for criterion, column in (
        ("Goal difference", 'goals_difference'),
        ("Goals scored", 'goals_scored'),
    ):
        print(f"{criterion}: {row1['team']} = {row1[column]}, {row2['team']} = {row2[column]}")
        if row1[column] > row2[column]:
            return verdict(criterion, True)
        if row1[column] < row2[column]:
            return verdict(criterion, False)

    # Nothing separates the two teams
    print("All criteria tied. Result: tie")
    return 'tie', 0, 0


In [369]:
# This function does NOT consider head-to-head results in the last (third) matchday as a tiebreaker
def tiebreaker_after_old(row1, row2, agg_data):
    """
    Decide which of two level teams ranks higher, using overall totals.

    Criteria, in order: a decisive head-to-head match found in ``agg_data``
    (same year and stage as ``row1``), then 'total_goals_difference', then
    'total_goals_scored'. A drawn or missing head-to-head falls through to
    the next criterion.

    Returns ``(winner, row1_flag, row2_flag)``: the winning team's name with
    1/0 flags marking which row won, or ``('tie', 0, 0)`` when nothing
    separates the two.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    def verdict(criterion, row1_wins):
        # Announce the deciding criterion and build the return tuple.
        if row1_wins:
            print(f"{criterion} result: {row1['team']} wins")
            return row1['team'], 1, 0
        print(f"{criterion} result: {row2['team']} wins")
        return row2['team'], 0, 1

    # Head-to-head fixture between the two teams, in either home/away order
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    row1_at_home = (agg_data['home_team'] == row1['team']) & (agg_data['away_team'] == row2['team'])
    row2_at_home = (agg_data['home_team'] == row2['team']) & (agg_data['away_team'] == row1['team'])
    match = agg_data[same_group & (row1_at_home | row2_at_home)]

    # First criterion: head-to-head result
    if match.empty:
        print("No head-to-head match found.")
    else:
        fixture = match.iloc[0]
        print("Head-to-head match found:")
        print(fixture)

        if fixture['won'] == 1:  # home side won
            return verdict("Head-to-head", fixture['home_team'] == row1['team'])
        if fixture['won'] == -1:  # away side won
            return verdict("Head-to-head", fixture['away_team'] == row1['team'])

    # Second and third criteria: total goal difference, then total goals scored
    for criterion, column in (
        ("Goal difference", 'total_goals_difference'),
        ("Goals scored", 'total_goals_scored'),
    ):
        print(f"{criterion}: {row1['team']} = {row1[column]}, {row2['team']} = {row2[column]}")
        if row1[column] > row2[column]:
            return verdict(criterion, True)
        if row1[column] < row2[column]:
            return verdict(criterion, False)

    # Nothing separates the two teams
    print("All criteria tied. Result: tie")
    return 'tie', 0, 0

In [370]:
# This function does consider head-to-head results in the last (third) matchday as a tiebreaker
def tiebreaker_after(row1, row2, agg_data):
    """
    Decide which of two level teams ranks higher, using overall totals and,
    when no head-to-head row exists in ``agg_data``, their last-game margins.

    Criteria, in order:
      1. A decisive head-to-head match found in ``agg_data`` (same year and
         stage as ``row1``).
      2. Only if no head-to-head row was found: the better last-game margin
         ('last_game_goals_scored' minus 'last_game_goals_conceded').
      3. 'total_goals_difference'.
      4. 'total_goals_scored'.

    Returns ``(winner, row1_flag, row2_flag)``: the winning team's name with
    1/0 flags marking which row won, or ``('tie', 0, 0)`` when nothing
    separates the two.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    def verdict(criterion, row1_wins):
        # Announce the deciding criterion and build the return tuple.
        if row1_wins:
            print(f"{criterion} result: {row1['team']} wins")
            return row1['team'], 1, 0
        print(f"{criterion} result: {row2['team']} wins")
        return row2['team'], 0, 1

    # Head-to-head fixture between the two teams, in either home/away order
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    row1_at_home = (agg_data['home_team'] == row1['team']) & (agg_data['away_team'] == row2['team'])
    row2_at_home = (agg_data['home_team'] == row2['team']) & (agg_data['away_team'] == row1['team'])
    match = agg_data[same_group & (row1_at_home | row2_at_home)]

    if match.empty:
        print("No head-to-head match found.")

        # No recorded meeting: compare the goal margins of each team's last game
        row1_margin = row1["last_game_goals_scored"] - row1["last_game_goals_conceded"]
        row2_margin = row2["last_game_goals_scored"] - row2["last_game_goals_conceded"]
        print(f"Last game performance: {row1['team']} = {row1_margin}, {row2['team']} = {row2_margin}")

        if row1_margin > row2_margin:
            return verdict("Last game", True)
        if row1_margin < row2_margin:
            return verdict("Last game", False)
    else:
        # First criterion: head-to-head result
        fixture = match.iloc[0]
        print("Head-to-head match found:")
        print(fixture)

        if fixture['won'] == 1:  # home side won
            return verdict("Head-to-head", fixture['home_team'] == row1['team'])
        if fixture['won'] == -1:  # away side won
            return verdict("Head-to-head", fixture['away_team'] == row1['team'])

    # Remaining criteria: total goal difference, then total goals scored
    for criterion, column in (
        ("Goal difference", 'total_goals_difference'),
        ("Goals scored", 'total_goals_scored'),
    ):
        print(f"{criterion}: {row1['team']} = {row1[column]}, {row2['team']} = {row2[column]}")
        if row1[column] > row2[column]:
            return verdict(criterion, True)
        if row1[column] < row2[column]:
            return verdict(criterion, False)

    # Nothing separates the two teams
    print("All criteria tied. Result: tie")
    return 'tie', 0, 0


## three teams tied

In [371]:
def resolve_three_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking):
    """
    Rank three tied teams and record the ranking in ``group_goals_tracking``.

    Steps:
    1. Collect the earlier matches played among the tied teams (same year and stage as the
       first row of ``group_goals_tracking``) and accumulate each team's goal difference and
       goals scored in those matches.
    2. The two teams that appear in exactly one of those matches are taken to be the pair
       meeting in the current match; their ``last_game_goals_scored`` /
       ``last_game_goals_conceded`` values are added on top.
    3. Sort the teams by (goal difference, goals scored, total goals scored in the group),
       highest first. If the top two are identical on all three values, a decisive past
       head-to-head between them decides first place.
    4. Order the remaining two teams the same way; if they are identical on all three values,
       use their past head-to-head, or - when they have none - the goals each scored in the
       current match.
    5. Assign ``three_tie`` values: 3 for first place, 2 for second, 1 for third.

    Args:
        tied_group: DataFrame whose 'team' column lists the tied teams.
        agg_goals_before_last_day: One row per earlier match with 'year', 'stage',
            'home_team', 'away_team', 'goals_home' and 'goals_away'.
        group_goals_tracking: Per-team frame with 'year', 'stage', 'team',
            'total_goals_scored', 'last_game_goals_scored' and 'last_game_goals_conceded'.
            It is modified in place: the 'three_tie' column is written for the three teams.

    Returns:
        ``[first, second, third]`` team names.

    Raises:
        ValueError: If the number of tied teams with exactly one earlier match among them
            is not 2, so the current-match pair cannot be identified.
    """
    # Extract the tied teams
    tied_teams = tied_group['team'].tolist()
    print(f"\n=== Resolving Three-Way Tie for Teams: {tied_teams} ===")

    year = group_goals_tracking['year'].iloc[0]
    stage = group_goals_tracking['stage'].iloc[0]

    def tracked(team, column):
        # Value of `column` on the team's row of group_goals_tracking
        return group_goals_tracking.loc[group_goals_tracking['team'] == team, column].values[0]

    # 1️⃣ **FILTER HEAD-TO-HEAD DATA FROM PAST MATCHES**
    past_matches = agg_goals_before_last_day[
        (agg_goals_before_last_day['year'] == year) &
        (agg_goals_before_last_day['stage'] == stage) &
        (agg_goals_before_last_day['home_team'].isin(tied_teams)) &
        (agg_goals_before_last_day['away_team'].isin(tied_teams))
    ]

    def head_to_head(team_a, team_b):
        # Past matches between the two teams, in either home/away order
        return past_matches[
            ((past_matches['home_team'] == team_a) & (past_matches['away_team'] == team_b)) |
            ((past_matches['home_team'] == team_b) & (past_matches['away_team'] == team_a))
        ]

    print("\n🔹 Past Matches Between Tied Teams:")
    print(past_matches[['year', 'stage', 'home_team', 'away_team', 'goals_home', 'goals_away']])

    # Count the number of past matches per team
    team_match_counts = pd.concat([past_matches['home_team'], past_matches['away_team']]).value_counts()

    # Select teams that have **only one past match** (these two teams must be in the current match)
    teams_in_current_match = team_match_counts[team_match_counts == 1].index.tolist()

    if len(teams_in_current_match) != 2:
        raise ValueError(f"Error: More or less than 2 teams found for the current match: {teams_in_current_match}")

    print(f"\n🔹 Teams in Current Match: {teams_in_current_match}")

    # Initialize tracking dictionaries
    goal_differences = {team: 0 for team in tied_teams}
    goals_scored = {team: 0 for team in tied_teams}
    total_goals_group = {team: tracked(team, 'total_goals_scored') for team in tied_teams}

    def ranking_key(team):
        return (goal_differences[team], goals_scored[team], total_goals_group[team])

    def level(team_a, team_b):
        # True when the two teams are identical on every ranking criterion
        return all(
            metric[team_a] == metric[team_b]
            for metric in (goal_differences, goals_scored, total_goals_group)
        )

    # 2️⃣ **UPDATE GOAL DIFFERENCE & GOALS SCORED FROM PAST MATCHES**
    for _, row in past_matches.iterrows():
        home_team, away_team = row['home_team'], row['away_team']
        goals_home, goals_away = row['goals_home'], row['goals_away']

        goal_differences[home_team] += goals_home - goals_away
        goal_differences[away_team] += goals_away - goals_home

        goals_scored[home_team] += goals_home
        goals_scored[away_team] += goals_away

    print("\n🔹 Goal Differences After Past Matches:")
    print(goal_differences)
    print("\n🔹 Goals Scored After Past Matches:")
    print(goals_scored)

    # 3️⃣ **ADD CURRENT MATCH IMPACT FROM `group_goals_tracking`**
    for team in teams_in_current_match:  # Only consider the two teams identified
        last_game_goals = tracked(team, 'last_game_goals_scored')
        last_game_conceded = tracked(team, 'last_game_goals_conceded')

        goal_differences[team] += last_game_goals - last_game_conceded
        goals_scored[team] += last_game_goals

        print(f"\n🔹 Current Match Impact for {team}:")
        print(f"   Goals Scored in Current Match: {last_game_goals}")
        print(f"   Goals Conceded in Current Match: {last_game_conceded}")
        print(f"   Updated Goal Difference: {goal_differences[team]}")
        print(f"   Updated Goals Scored: {goals_scored[team]}")

    # 4️⃣ **SELECT THE FIRST TEAM BASED ON ALL TIEBREAKERS**
    sorted_teams = sorted(tied_teams, key=ranking_key, reverse=True)

    # 🔍 Debugging Output: Print Sorted Teams Based on Tiebreakers
    print("\n🔹 Sorted Teams After Applying Tiebreakers:")
    for rank, team in enumerate(sorted_teams, start=1):
        print(f"   {rank}. {team} (GD: {goal_differences[team]}, GS: {goals_scored[team]}, Total GS in Group: {total_goals_group[team]})")

    # Take the first team as the top-ranked team (done once, outside the print loop)
    first_team = sorted_teams[0]
    last_two_teams = sorted_teams[1:]

    # **⚠️ CHECK IF FIRST TWO TEAMS HAVE IDENTICAL VALUES**
    h2h_match = pd.DataFrame()  # Ensure it's always defined

    team1, team2 = sorted_teams[:2]  # First two teams
    if level(team1, team2):
        print("\n🔍 **First two teams have identical tiebreaker values, evaluating head-to-head...**")

        # Retrieve their head-to-head match
        h2h_match = head_to_head(team1, team2)

        print("\n🔹 Head-to-Head Match Between First Two Teams:")
        print(h2h_match)

    # If a head-to-head match exists, determine the winner
    if not h2h_match.empty:
        row = h2h_match.iloc[0]
        home_team, away_team = row['home_team'], row['away_team']
        goals_home, goals_away = row['goals_home'], row['goals_away']

        print(f"\n🔹 Head-to-Head Result Before Current Match:")
        print(f"   {home_team}: {goals_home} goals")
        print(f"   {away_team}: {goals_away} goals")

        # The winner of the head-to-head match becomes the first team;
        # the loser joins the third-sorted team in the remaining pair.
        if goals_home > goals_away:
            first_team, last_two_teams = home_team, [away_team] + last_two_teams[1:]
        elif goals_away > goals_home:
            first_team, last_two_teams = away_team, [home_team] + last_two_teams[1:]

        print(f"\n🔹 First Team Identified: {first_team}")
        print(f"🔹 Last Two Teams (Before Head-to-Head Check): {last_two_teams}")

    # 5️⃣ **SORT THE LAST TWO TEAMS BASED ON TIEBREAKER VALUES**
    last_two_teams = sorted(last_two_teams, key=ranking_key, reverse=True)

    # Print sorted last two teams for debugging
    print("\n🔹 Sorted Last Two Teams After Applying Tiebreakers:")
    for rank, team in enumerate(last_two_teams, start=1):
        print(f"   {rank}. {team} (GD: {goal_differences[team]}, GS: {goals_scored[team]}, Total GS in Group: {total_goals_group[team]})")

    # Check if the last two teams are tied in all criteria
    team1, team2 = last_two_teams[:2]
    if level(team1, team2):
        print("\n🔍 **Last two teams are tied in all criteria, evaluating head-to-head...**")

        # Retrieve their past head-to-head match, if any
        h2h_match = head_to_head(team1, team2)

        # If no past match is found, check the current match
        if h2h_match.empty:
            print("\n🔍 **No past head-to-head match found, evaluating current match...**")
            goals_team1 = tracked(team1, 'last_game_goals_scored')
            goals_team2 = tracked(team2, 'last_game_goals_scored')

            if goals_team1 > goals_team2:
                last_two_teams = [team1, team2]
            elif goals_team2 > goals_team1:
                last_two_teams = [team2, team1]
        else:
            # If a past head-to-head match exists, evaluate the result
            row = h2h_match.iloc[0]
            home_team, away_team = row['home_team'], row['away_team']
            goals_home, goals_away = row['goals_home'], row['goals_away']

            print(f"\n🔹 Head-to-Head Result:")
            print(f"   {home_team}: {goals_home} goals")
            print(f"   {away_team}: {goals_away} goals")

            if goals_home > goals_away:
                last_two_teams = [home_team, away_team]
            elif goals_away > goals_home:
                last_two_teams = [away_team, home_team]

    print(f"\n✅ **Final Ranking After Resolving Tie:**")
    print(f"   🥇 1st: {first_team}")
    print(f"   🥈 2nd: {last_two_teams[0]}")
    print(f"   🥉 3rd: {last_two_teams[1]}")

    # 6️⃣ **ASSIGN `three_tie` VALUES TO FINAL RANKING** (higher value = better place)
    group_goals_tracking.loc[group_goals_tracking['team'] == first_team, 'three_tie'] = 3
    group_goals_tracking.loc[group_goals_tracking['team'] == last_two_teams[0], 'three_tie'] = 2
    group_goals_tracking.loc[group_goals_tracking['team'] == last_two_teams[1], 'three_tie'] = 1

    return [first_team, last_two_teams[0], last_two_teams[1]]


# before last match day

In [372]:
def uefa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts):
    """
    Build the group table as it stands before the last match day.

    Home and away aggregates are combined per team, every team listed in
    `team_counts` gets a row, teams with fewer than two recorded matches are
    credited one point per missing match and set to two matches, and neighbouring
    teams level on points are handed to `tiebreaker_before` before the final
    ranking is assigned.

    Parameters:
    - home_games: DataFrame with 'year', 'stage', 'home_team', 'goals_scored',
      'goals_conceded', 'points' and 'match_count'.
    - away_games: Same layout as `home_games`, keyed by 'away_team'.
    - agg_goals_before_last_day: Passed through unchanged to `tiebreaker_before`.
    - team_counts: DataFrame with 'year', 'stage' and a list-valued 'team_list' column.

    Returns:
    - DataFrame with one row per team holding 'year', 'stage', 'team', 'goals_scored',
      'goals_conceded', 'points', 'goals_difference', 'total_matches', 'standing',
      'tiebreaker', 'tie_won' and 'matches_flag'.
    """
    group_keys = ['year', 'stage']
    stat_cols = ['goals_scored', 'goals_conceded', 'points', 'total_matches']

    # Step 1: Line up each team's home aggregate with its away aggregate
    standings = pd.merge(
        home_games,
        away_games,
        left_on=group_keys + ['home_team'],
        right_on=group_keys + ['away_team'],
        how='outer',
        suffixes=('_home', '_away'),
    )

    # A team may appear on one side only, so take whichever name is present
    standings['team'] = standings['home_team'].fillna(standings['away_team'])

    # Add the home and away halves of every statistic (a missing half counts as 0)
    source_of = dict(zip(stat_cols, ['goals_scored', 'goals_conceded', 'points', 'match_count']))
    for target, source in source_of.items():
        home_part = standings[f'{source}_home'].fillna(0)
        away_part = standings[f'{source}_away'].fillna(0)
        standings[target] = home_part + away_part

    # Keep the match count as recorded, before any adjustment below
    standings['matches_flag'] = standings['total_matches']

    # Make sure every team from team_counts has a row, even without any game
    roster = (
        team_counts.explode('team_list')
        .rename(columns={'team_list': 'team'})[group_keys + ['team']]
        .drop_duplicates()
    )
    standings = roster.merge(standings, on=group_keys + ['team'], how='left')

    # Report rows that have no statistics yet
    without_stats = standings[standings[stat_cols].isnull().any(axis=1)]
    if without_stats.empty:
        print("No missing values in the specified columns.")
    else:
        print("Observations with missing values before filling:")
        print(without_stats)

    # Teams without activity start from zero in every statistic
    standings = standings.fillna(dict.fromkeys(stat_cols, 0))
    standings['goals_difference'] = standings['goals_scored'] - standings['goals_conceded']

    # Teams with a single match: one extra point, counted as two matches
    played_one = standings['total_matches'] == 1
    if played_one.any():
        print("Observations where total_matches == 1:")
        print(standings[played_one])
    else:
        print("No observations where total_matches == 1.")
    standings.loc[played_one, 'points'] += 1
    standings.loc[played_one, 'total_matches'] = 2

    # Teams with no match at all: two extra points, counted as two matches
    played_none = standings['total_matches'] == 0
    if played_none.any():
        print("Observations where total_matches == 0:")
        print(standings[played_none])
    else:
        print("No observations where total_matches == 0.")
    standings.loc[played_none, 'points'] += 2
    standings.loc[played_none, 'total_matches'] = 2

    # First ordering: points only, so level teams end up next to each other
    standings = standings.sort_values(
        by=group_keys + ['points'],
        ascending=[True, True, False],
    ).reset_index(drop=True)

    standings['tiebreaker'] = 'no need'
    standings['tie_won'] = 0

    # Compare each row with its successor; rows are re-read every pass so that
    # values written for an earlier pair are visible to the next one
    for pos in range(len(standings) - 1):
        upper = standings.iloc[pos]
        lower = standings.iloc[pos + 1]

        level = (
            upper['year'] == lower['year']
            and upper['stage'] == lower['stage']
            and upper['points'] == lower['points']
        )
        if not level:
            continue

        outcome, upper_won, lower_won = tiebreaker_before(upper, lower, agg_goals_before_last_day)
        if outcome == 'tie':
            continue

        standings.at[pos, 'tiebreaker'] = outcome
        standings.at[pos, 'tie_won'] = upper_won
        standings.at[pos + 1, 'tiebreaker'] = outcome
        standings.at[pos + 1, 'tie_won'] = lower_won

    # Final ordering: points, tie-break result, goal difference, goals scored
    standings = standings.sort_values(
        by=group_keys + ['points', 'tie_won', 'goals_difference', 'goals_scored'],
        ascending=[True, True, False, False, False, False],
    ).reset_index(drop=True)

    # Rank inside each (year, stage) group
    standings['standing'] = standings.groupby(group_keys).cumcount() + 1

    # Keep the reporting columns and store the counting columns as integers
    int_cols = ['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches', 'standing']
    kept_cols = ['year', 'stage', 'team', 'goals_scored', 'goals_conceded', 'points', 'goals_difference',
                 'total_matches', 'standing', 'tiebreaker', 'tie_won', 'matches_flag']
    standings = standings[kept_cols].astype({col: int for col in int_cols})

    return standings


# last day league standing and changes

In [373]:
def uefa_final_euro(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    """
    Replay the last match day of one group goal by goal and track the live standings.

    The standings before the last match day are the starting point. Every team is
    first credited with one point, because its last game starts as a 0-0 draw.
    The goals of the last match day are then applied in (half_time, goal_minute)
    order; after each goal the last-game points, total points, tie-breaks and
    standings are recomputed, and the position counters '1st'..'4th' are
    incremented for every team whose position changed.

    Parameters:
    - year: Tournament year. A win is worth 2 points when year <= 1992, otherwise 3.
    - stage: Stage label used, together with `year`, to filter both input tables.
    - all_games_before_last: DataFrame with 'year', 'stage', 'team', 'goals_scored',
      'goals_conceded', 'goals_difference', 'points', 'total_matches' and 'standing'.
    - goals_last_day_sorted: DataFrame with one row per goal holding 'year', 'stage',
      'home_team', 'away_team', 'scorer_nationality', 'half_time', 'goal_minute',
      'short_date' and 'local_time'.
    - agg_goals_before_last_day: Passed through unchanged to `tiebreaker_after`.

    Returns:
    - DataFrame with one row per team of the group, ordered by the standings after
      the last processed goal (input order if the group has no last-day goals).
    """
    position_columns = {1: '1st', 2: '2nd', 3: '3rd', 4: '4th'}
    win_points = 2 if year <= 1992 else 3  # Win value depends on the edition

    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) &
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) &
        (goals_last_day_sorted['stage'] == stage)
    ]

    # Step 2: Initialize columns for tracking team performance
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']

    group_goals_tracking['date'] = None
    group_goals_tracking['time'] = None

    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goals_difference'] = (
        group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    )

    # Every last game starts at 0-0, i.e. as a draw worth one point. Recording that
    # point in last_game_points as well keeps total_points equal to
    # before_last_game_points + last_game_points even when no goal is scored.
    group_goals_tracking['last_game_points'] = 1
    group_goals_tracking['total_points'] = (
        group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']
    )

    # Remove unnecessary columns
    group_goals_tracking = group_goals_tracking.drop(
        columns=['goals_difference', 'goals_scored', 'goals_conceded', 'standing', 'points', 'total_matches']
    )

    # Initialize last_game_standing to the initial standings
    group_goals_tracking['last_game_standing'] = group_goals_tracking['before_last_game_standing']

    # Position counters start at 1 for the position held before the last game
    for position, column in position_columns.items():
        group_goals_tracking[column] = (
            group_goals_tracking['before_last_game_standing'] == position
        ).astype(int)

    # 'changes' is the sum of the position counters; set it here too so that a
    # group without last-day goals reports the same value as a group whose goals
    # never changed the order
    group_goals_tracking['changes'] = group_goals_tracking[list(position_columns.values())].sum(axis=1)
    group_goals_tracking['tied'] = False  # Flag for teams sharing their points total

    # Step 3: Sort by 'half_time' first and then by 'goal_minute'
    group_goals_last_day = group_goals_last_day.sort_values(by=['half_time', 'goal_minute'], ascending=[True, True])

    # Print the year, stage, and standings before starting the loop for last match goals
    print(f"\n=== Initial Standings for Year {year}, {stage} Before Last Match Goals ===\n")
    display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded',
                       'total_goals_difference', 'before_last_game_points', 'before_last_game_standing']
    print(group_goals_tracking[display_columns].to_string(index=False))
    print("\n====================================================\n")

    goal_display_columns = ['team', 'date', 'time', 'total_points', 'total_goals_scored', 'total_goals_conceded',
                            'total_goals_difference', 'last_game_points', 'last_game_standing',
                            'changes', '1st', '2nd', '3rd', '4th', 'tied', 'tie_won']

    # Step 4: Apply the last-day goals one by one
    previous_standings = group_goals_tracking['last_game_standing'].copy()

    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        player_team = goal['scorer_nationality']

        # NOTE(review): date/time are stored for the scoring team only; teams that
        # never score keep None - confirm this is intended.
        group_goals_tracking.loc[group_goals_tracking['team'] == player_team, 'date'] = goal['short_date']
        group_goals_tracking.loc[group_goals_tracking['team'] == player_team, 'time'] = goal['local_time']

        # Print goal information for each goal
        print(f"Analyzing goal: {goal['goal_minute']} minute, {goal['half_time']} half time, Player team: {player_team}, Home: {home_team}, Away: {away_team}")

        # Credit the goal to the scorer's side and charge it to the opponent;
        # a scorer matching neither side leaves the score untouched
        if player_team == home_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
        elif player_team == away_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1

        # Step 5: Update total_goals_scored, total_goals_conceded, and total_goals_difference
        group_goals_tracking['total_goals_scored'] = (
            group_goals_tracking['before_last_game_goals_scored'] + group_goals_tracking['last_game_goals_scored']
        )
        group_goals_tracking['total_goals_conceded'] = (
            group_goals_tracking['before_last_game_goals_conceded'] + group_goals_tracking['last_game_goals_conceded']
        )
        group_goals_tracking['total_goals_difference'] = (
            group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
        )

        # Step 6: Last-game points from the live score (win / draw 1 / loss 0)
        scored = group_goals_tracking['last_game_goals_scored']
        conceded = group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['last_game_points'] = (
            (scored > conceded).astype('int64') * win_points + (scored == conceded).astype('int64')
        )

        # Step 7: Update total points
        group_goals_tracking['total_points'] = (
            group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']
        )

        # Step 8a: Mark teams that share their points total with another team
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)
        print("\n=== Teams with Identical Points (Tied Teams) ===\n")
        print(group_goals_tracking[group_goals_tracking['tied']][['team', 'total_points']])

        # Reset `tie_won` to 0 for all teams
        group_goals_tracking['tie_won'] = 0

        # Step 8b: Process ties only if tied teams exist
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        if not tied_teams.empty:
            print("\n=== Evaluating Head-to-Head for Tied Teams ===\n")

            for i, row1 in tied_teams.iterrows():
                for j, row2 in tied_teams.iterrows():
                    if i >= j:  # Compare each pair only once
                        continue
                    # 'tied' only says a team shares its total with *someone*. Two
                    # separate tie groups (e.g. 4-4 and 1-1) must not be compared
                    # across groups, or an unrelated result could reorder a real tie.
                    if row1['total_points'] != row2['total_points']:
                        continue

                    print(f"Checking tie between: {row1['team']} and {row2['team']}")

                    # Apply tiebreaker function
                    tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                    if tiebreak_result != 'tie':
                        print(f"Tiebreak Result: Winner is {tiebreak_result}")
                    else:
                        print(f"No winner in tiebreak between {row1['team']} and {row2['team']}")

                    # Update the `tie_won` column based on the results
                    group_goals_tracking.loc[group_goals_tracking['team'] == row1['team'], 'tie_won'] += tie_won_row1
                    group_goals_tracking.loc[group_goals_tracking['team'] == row2['team'], 'tie_won'] += tie_won_row2

        # Step 8c: Sort teams by tie-breaking criteria
        group_goals_tracking = group_goals_tracking.sort_values(
            by=['total_points', 'tie_won', 'total_goals_difference', 'total_goals_scored'],
            ascending=[False, False, False, False]
        )

        # Step 9: Update standings (row order after sorting is the ranking)
        group_goals_tracking['last_game_standing'] = range(1, len(group_goals_tracking) + 1)
        print("\n=== Updated Standings After Sorting and Tie Break ===\n")
        print(group_goals_tracking[['team', 'last_game_standing', 'total_points', 'total_goals_difference', 'tie_won']])

        # Step 10: Count the new position of every team whose position changed.
        # previous_standings starts as the standings before the last game, so the
        # first goal is compared against those.
        for label, row in group_goals_tracking.iterrows():
            new_position = row['last_game_standing']
            if new_position == previous_standings[label]:
                continue
            counter = position_columns.get(new_position)
            if counter is not None:
                group_goals_tracking.loc[group_goals_tracking['team'] == row['team'], counter] += 1

        # Update previous standings after each goal
        previous_standings = group_goals_tracking['last_game_standing'].copy()

        # Step 11: Calculate changes as the sum of 1st, 2nd, 3rd, and 4th
        group_goals_tracking['changes'] = group_goals_tracking[['1st', '2nd', '3rd', '4th']].sum(axis=1)

        # Step 12: Print the updated group_goals_tracking after processing each goal
        print("\n=== Updated Standings After This Goal ===\n")
        print(group_goals_tracking[goal_display_columns].to_string(index=False))
        print("\n========================================\n")

    # Step 13: Return the final DataFrame
    return group_goals_tracking


In [374]:
# def track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
#     # Step 1: Filter the data for the specific year and stage
#     group_goals_tracking = all_games_before_last[
#         (all_games_before_last['year'] == year) & 
#         (all_games_before_last['stage'] == stage)
#     ].copy()

#     group_goals_last_day = goals_last_day_sorted[
#         (goals_last_day_sorted['year'] == year) & 
#         (goals_last_day_sorted['stage'] == stage)
#     ]

#     # Initialize columns for team performance and standings
#     group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
#     group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
#     group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
#     group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
#     group_goals_tracking['last_game_goals_scored'] = 0
#     group_goals_tracking['last_game_goals_conceded'] = 0
#     group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
#     group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
#     group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
#     group_goals_tracking['last_game_points'] = 0
#     group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
#     group_goals_tracking['last_game_standing'] = 0
#     group_goals_tracking['tie_won'] = 0  # Initialize tie_won for tiebreak resolution

#     # Add one point to each team for a 0-0 starting score
#     group_goals_tracking['total_points'] += 1

#     # Remove unnecessary columns
#     group_goals_tracking = group_goals_tracking.drop(columns=['goals_difference','goals_scored', 'goals_conceded', 'standing', 'points',
#                                                              'total_matches'])


#     # Define top standings limit based on the year (3 for 1994 and earlier, 2 for 1998 and later) for World Cup  *************---------------------------------CHANGE----------------------------------------------*******************
#     # top_standings_limit = 3 if year <= 1994 else 2 

#     # Define top standings limit based on the year (3 for 2016 and later, 2 for 2012 and earlier) for Euros
#     top_standings_limit = 3 if year >= 2016 else 2

#     # Print initial standings
#     print(f"\n=== Initial Standings for {stage}, {year} (Goal Time = 0) ===")
#     print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 
#                                 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
#     print("\n========================================\n")

#     # Step 2: Initialize composition tracking with initial composition (change_num = 0)
#     initial_top_teams = set(
#         group_goals_tracking[group_goals_tracking['before_last_game_standing'] <= top_standings_limit]['team']
#     )

#     composition_changes = [{
#         'year': year,
#         'stage': stage,
#         'change_num': 0,
#         'goal_time': 0,
#         'home_team': None,
#         'away_team': None,
#         'scorer_team': None,
#         'new_top_teams': list(initial_top_teams),
#         '1st': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 1, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 1).any() else None,
#         '2nd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 2, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 2).any() else None,
#         '3rd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 3, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 3).any() else None,
#         'changed': 0
#     }]

#     change_counter = 0  # Counter for the number of composition changes

#     # Step 3: Sort goals by regulation time
#     group_goals_last_day = group_goals_last_day.sort_values(by=['goal_minute'])

#     # Step 4: Iterate through each goal and track changes in composition
#     for _, goal in group_goals_last_day.iterrows():
#         home_team = goal['home_team']
#         away_team = goal['away_team']
#         scorer_team = goal['scorer_nationality']

#         # Update scores based on who scored the goal
#         if scorer_team == home_team:
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
#         elif scorer_team == away_team:
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1

#         # Update goal difference
#         group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

#         # Step 5: Update last_game_points based on the current game state
#         group_goals_tracking['last_game_points'] = group_goals_tracking.apply(
#             lambda row: (2 if year <= 1992 else 3) if row['last_game_goals_scored'] > row['last_game_goals_conceded'] 
#             else (1 if row['last_game_goals_scored'] == row['last_game_goals_conceded'] else 0), 
#             axis=1
#         )

#         # Calculate total points by adding last game points to before_last_game_points
#         group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

#         # Step 5b: Evaluate ties based on total points
#         group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)

#         # Select the tied teams
#         tied_teams = group_goals_tracking[group_goals_tracking['tied']]

#         # Reset `tie_won` column to 0 for all teams
#         group_goals_tracking['tie_won'] = 0

#         if not tied_teams.empty:
#             print("\n=== Evaluating Head-to-Head for Tied Teams ===\n")

#             # Iterate through all pairs of tied teams
#             for i, row1 in tied_teams.iterrows():
#                 for j, row2 in tied_teams.iterrows():
#                     if i < j:  # Compare each pair only once
#                         team1 = row1['team']
#                         team2 = row2['team']

#                         print(f"Checking tie between: {team1} and {team2}")

#                         # Apply tiebreaker function
#                         tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

#                         if tiebreak_result != 'tie':
#                             print(f"Tiebreak Result: Winner is {tiebreak_result}")
#                         else:
#                             print(f"No winner in tiebreak between {team1} and {team2}")

#                         # Update the `tie_won` column based on the results
#                         group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tie_won'] += tie_won_row1
#                         group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tie_won'] += tie_won_row2

#         # Sort teams by updated points and tie-breaking criteria
#         group_goals_tracking = group_goals_tracking.sort_values(
#             by=['total_points', 'tie_won', 'total_goals_difference', 'total_goals_scored'],
#             ascending=[False, False, False, False]
#         )
        
#         # Update standings after sorting
#         group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

#         # Print standings after each goal
#         print(f"\n=== Standings after goal at minute {goal['goal_minute']} in {stage}, edition {year} ===")
#         print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
#         print("\n========================================\n")

#         # Track top teams and composition changes
#         current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
#         changed = int(current_top_teams != initial_top_teams)

#         if changed:
#             change_counter += 1
#             initial_top_teams = current_top_teams

#         composition_changes.append({
#             'year': year,
#             'stage': stage,
#             'change_num': change_counter,
#             'goal_time': goal['goal_minute'],
#             'home_team': home_team,
#             'away_team': away_team,
#             'scorer_team': scorer_team,
#             'new_top_teams': list(current_top_teams),
#             '1st': group_goals_tracking.iloc[0]['team'],
#             '2nd': group_goals_tracking.iloc[1]['team'] if len(group_goals_tracking) > 1 else None,
#             '3rd': group_goals_tracking.iloc[2]['team'] if len(group_goals_tracking) > 2 else None,
#             'changed': changed,
#         })

#     return pd.DataFrame(composition_changes)


# gap between qualifying and not qualifying 

## third team tracking

In [375]:
third_place_tracking = []  # Module-level list that collects third-place snapshots

def track_third_place_team(group_goals_tracking, year, stage, goal_minute, half_time, date, time):
    """
    Describe the team in the third row of the group table at the time of a goal.

    The third row (position index 2) of `group_goals_tracking` is taken as the
    third-placed team. When the table has fewer than three rows, the
    team-specific fields of the result are None.

    Returns:
    - A dictionary with the keys 'year', 'stage', 'goal_minute', 'half_time',
      'third_team', 'total_points', 'total_goals_difference', 'total_goals_scored',
      'date' and 'time'. 'date' and 'time' are the values passed in.
    """
    if len(group_goals_tracking) >= 3:
        third_row = group_goals_tracking.iloc[2]
        team_name = third_row['team']
        points = third_row['total_points']
        goal_difference = third_row['total_goals_difference']
        goals_scored = third_row['total_goals_scored']

        print(f"[DEBUG] Saving Third Team - Year: {year}, Stage: {stage}, Goal Minute: {goal_minute}, "
              f"Team: {team_name}, Points: {points}, "
              f"Goal Difference: {goal_difference}, Goals Scored: {goals_scored}, "
              f"Date: {date}, Time: {time}"
        )
    else:
        # Not enough teams: keep the record shape but leave the team fields empty
        team_name = points = goal_difference = goals_scored = None
        print(f"[DEBUG] Year: {year}, Stage: {stage}, Goal Minute: {goal_minute}, No third team available.")

    return {
        'year': year,
        'stage': stage,
        'goal_minute': goal_minute,
        'half_time': half_time,
        'third_team': team_name,
        'total_points': points,
        'total_goals_difference': goal_difference,
        'total_goals_scored': goals_scored,
        'date': date,
        'time': time,
    }



In [376]:
def gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    """
    Replay the last-matchday goals of one group and record, after every goal, the
    live standings, the set of teams in the qualifying places, and the gap between
    the last qualifying place and the first non-qualifying place.

    Parameters:
    - year, stage: select the group to process in both input frames.
    - all_games_before_last: DataFrame of standings before the last matchday. Columns
      used here: year, stage, team, goals_scored, goals_conceded, goals_difference,
      standing, points, total_matches, tiebreaker.
    - goals_last_day_sorted: DataFrame of last-matchday goals, iterated in the order
      given. Columns used here: year, stage, goal_minute, half_time, home_team,
      away_team, scorer_nationality, short_date, local_time.
    - agg_goals_before_last_day: passed through unchanged to `tiebreaker_after` and
      `resolve_three_way_tie`.

    Returns:
    - (composition_changes_df, third_place_df). The first frame has one row for the
      initial state plus one row per goal. The second frame is built from the
      module-level `third_place_tracking` list, so it also contains every snapshot
      appended by earlier calls.

    A group without any last-matchday goal yields only the initial-state row
    (previously this case raised UnboundLocalError because the result frames were
    only created inside the goal loop).
    """
    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) &
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) &
        (goals_last_day_sorted['stage'] == stage)
    ].copy()

    # Debugging: Print the filtered goals before sorting
    print(f"\n=== Goals for {stage} , {year} BEFORE Sorting ===")
    print(group_goals_last_day[['goal_minute', 'half_time', 'home_team', 'away_team', 'scorer_nationality', 'short_date', 'local_time']])
    print("\n========================================\n")

    # Initialize columns for team performance and standings
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['last_game_standing'] = 0
    group_goals_tracking['tie_won'] = 0  # Initialize tie_won for tiebreak resolution
    group_goals_tracking['three_tie'] = 0  # Initialize `three_tie` column to 0 for all teams

    # Add one point to each team for a 0-0 starting score
    group_goals_tracking['total_points'] += 1

    # Remove unnecessary columns
    group_goals_tracking = group_goals_tracking.drop(columns=['goals_difference', 'goals_scored', 'goals_conceded', 'standing', 'points',
                                                             'total_matches', 'tiebreaker'])

    # Define top standings limit based on the year (3 for 1992 and earlier, 2 for 1994 and later) for World Cup  *************---------------------------------CHANGE----------------------------------------------*******************
    # top_standings_limit = 3 if year <= 1994 else 2

    # Define top standings limit based on the year (3 for 2016 and later, 2 for 2014 and earlier) for Euros
    top_standings_limit = 3 if year >= 2016 else 2

    # Print initial standings
    print(f"\n=== STEP 1: Initial Standings for {stage}, {year} (Goal Time = 0) ===")
    print(group_goals_tracking[['team', 'total_points', 'total_goals_scored',
                                'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
    print("\n========================================\n")

    # Initialize composition tracking with initial composition (change_num = 0)
    initial_top_teams = set(
        group_goals_tracking[group_goals_tracking['before_last_game_standing'] <= top_standings_limit]['team']
    )

    composition_changes = [{
        'year': year,
        'stage': stage,
        'change_num': 0,
        'goal_minute': 0,
        'home_team': None,
        'away_team': None,
        'scorer_team': None,
        'new_top_teams': list(initial_top_teams),
        '1st': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 1, 'team'].values[0]
                if (group_goals_tracking['before_last_game_standing'] == 1).any() else None,
        '2nd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 2, 'team'].values[0]
                if (group_goals_tracking['before_last_game_standing'] == 2).any() else None,
        '3rd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 3, 'team'].values[0]
                if (group_goals_tracking['before_last_game_standing'] == 3).any() else None,
        'changed': 0,
        'points_diff': 0,
        'goals_diff': 0

    }]

    change_counter = 0  # Counter for the number of composition changes

    # Sort by 'half_time' first and then by 'goal_minute'
    # group_goals_last_day = group_goals_last_day.sort_values(by=['half_time', 'goal_minute'], ascending=[True, True])

    # Iterate through each goal and track changes in composition
    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        scorer_team = goal['scorer_nationality']

        # Update scores based on who scored the goal
        # (a scorer matching neither team leaves the tallies untouched)
        if scorer_team == home_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
        elif scorer_team == away_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1

        # Update goal difference
        group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Update last_game_points based on the current game state
        # (win = 2 points up to 1992, 3 points afterwards; draw = 1; loss = 0)
        group_goals_tracking['last_game_points'] = group_goals_tracking.apply(
            lambda row: (2 if year <= 1992 else 3) if row['last_game_goals_scored'] > row['last_game_goals_conceded']
            else (1 if row['last_game_goals_scored'] == row['last_game_goals_conceded'] else 0),
            axis=1
        )

        # Calculate total points by adding last game points to before_last_game_points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Evaluate ties based on total points
        group_goals_tracking['tied'] = group_goals_tracking.groupby('total_points')['team'].transform('size') > 1

        # Select the tied teams
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        # Count the number of tied teams
        num_tied_teams = tied_teams['team'].nunique()

        print(f"\n=== Tied after goal at minute {goal['goal_minute']} {goal['half_time']} half time by {goal['scorer_nationality']} in {stage}, edition {year} ===")
        print(f"Number of tied teams: {num_tied_teams}")
        print(tied_teams[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference']])

        # Reset `tie_won` column to 0 for all teams
        # NOTE(review): `three_tie` is NOT reset here, so values assigned during an
        # earlier three-way tie keep influencing the sort below after that tie has
        # dissolved. Left unchanged; confirm whether this is intended.
        group_goals_tracking['tie_won'] = 0

        if not tied_teams.empty:
            print("\n=== STEP 2: Evaluating Head-to-Head for Tied Teams ===\n")

            # Group tied teams by total_points and process each group separately
            for points, tied_group in tied_teams.groupby('total_points'):

                if len(tied_group) == 3:  # Handle three-way tie
                    print(f"\n=== Resolving Three-Way Tie for {points} points ===")
                    resolved_ranking = resolve_three_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking)

                    # Assign `three_tie` values for three-way tie teams
                    for rank, team in enumerate(resolved_ranking, start=1):
                        # First in the resolved ranking gets 3, second 2, third 1,
                        # so a descending sort keeps the resolved order.
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, 'three_tie'] = 4 - rank

                else:  # Tie groups of any other size: compare the teams pairwise
                    print(f"\n=== Checking ties for teams with {points} points ===")

                    # Sort tied_group by additional criteria to ensure proper order
                    tied_group = tied_group.sort_values(by=['total_goals_difference', 'total_goals_scored'], ascending=False)

                    # Iterate through all pairs of tied teams in the current group
                    for i, row1 in tied_group.iterrows():
                        for j, row2 in tied_group.iterrows():
                            if i < j:  # Compare each pair only once (ordered by index label)
                                team1 = row1['team']
                                team2 = row2['team']

                                print(f"Checking tie between: {team1} and {team2}")

                                # Apply tiebreaker function
                                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                                if tiebreak_result != 'tie':
                                    print(f"Tiebreak Result: Winner is {tiebreak_result}")
                                else:
                                    print(f"No winner in tiebreak between {team1} and {team2}")

                                # Update the `tie_won` column based on the results
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tie_won'] += tie_won_row1
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tie_won'] += tie_won_row2

        # Sort teams by updated points and tie-breaking criteria
        group_goals_tracking = group_goals_tracking.sort_values(
            by=['total_points', 'tie_won', 'three_tie', 'total_goals_difference', 'total_goals_scored'],
            ascending=[False, False, False, False, False]
        )

        # Update standings after sorting (1-based position in the sorted table)
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Track third-place team
        third_place_info = track_third_place_team(group_goals_tracking, year, stage, goal['goal_minute'], goal['half_time'],
                                                  goal['short_date'], goal['local_time'])
        # Debugging: Check if third_place_info contains date and time
        print(f"[DEBUG] Third place info: {third_place_info}")

        # Append third-place info to a global list for separate tracking
        third_place_tracking.append(third_place_info)

        # Debugging: Print the last saved entry
        print(f"[DEBUG] Third Place Tracking Updated: {third_place_tracking[-1]}")

        # Print standings after each goal
        print(f"\n=== STEP 3: Standings after goal at minute {goal['goal_minute']} {goal['half_time']} half time by {goal['scorer_nationality']} in {stage}, edition {year} ===")
        print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
        print("\n========================================\n")

        # Initialize tiebreak_result and other variables before the conditions
        tiebreak_result = None
        original_goals_scored_row2 = None
        original_goals_difference_row2 = None
        tie_won_row1 = 0
        tie_won_row2 = 0

        # Calculate points_diff and goals_diff, then apply the tiebreaker if conditions are met
        if top_standings_limit == 3:
            # Calculate points_diff for top_standings_limit == 3 (3rd place minus 4th place)
            points_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_points'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'total_points'].values[0]

            # Calculate goals_diff for top_standings_limit == 3
            goals_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_goals_difference'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'total_goals_difference'].values[0]

            # Add points_last variable to check if the fourth team is drawing its match
            points_last = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'last_game_points'].values[0]

            # Apply tiebreaker for three-points-win system after 2012
            if (points_diff == 2 and goals_diff <= 1 and points_last == 1):
                print("\n=== STEP 4: Applying Potential Tiebreaker for 3nd and 4th Place ===\n")
                team_3rd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'team'].values[0]
                team_4th = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'team'].values[0]

                # Get rows for the 3rd and 4th teams
                row1 = group_goals_tracking[group_goals_tracking['team'] == team_3rd].iloc[0]
                row2_index = group_goals_tracking[group_goals_tracking['team'] == team_4th].index[0]  # Get index for row2

                # Save original values for resetting later
                original_goals_scored_row2 = group_goals_tracking.loc[row2_index, 'total_goals_scored']
                original_goals_difference_row2 = group_goals_tracking.loc[row2_index, 'total_goals_difference']

                # Temporarily credit the chasing team with one extra goal
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] += 1
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] += 1

                # Retrieve updated row2 after incrementing values
                row2 = group_goals_tracking.loc[row2_index]

                # Apply tiebreaker
                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                # Reset total_goals_scored and total_goals_difference to their original values
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] = original_goals_scored_row2
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] = original_goals_difference_row2

        elif top_standings_limit == 2:
            # Calculate points_diff for top_standings_limit == 2 (2nd place minus 3rd place)
            points_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'total_points'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_points'].values[0]

            # Calculate goals_diff for top_standings_limit == 2
            goals_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'total_goals_difference'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_goals_difference'].values[0]

            # Add points_last variable to check if the third team is drawing its match
            points_last = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'last_game_points'].values[0]

            # Apply tiebreaker for two-points-win based system before 1992 OR three-points-win system after 1992
            if (points_diff == 1 and goals_diff <= 1 and points_last == 1 and year <= 1992) or \
                    (points_diff == 2 and goals_diff <= 1 and points_last == 1 and year > 1992):
                print("\n=== STEP 4: Applying Potential Tiebreaker for 2nd and 3rd Place ===\n")
                team_2nd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'team'].values[0]
                team_3rd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'team'].values[0]

                # Get rows for the 2nd and 3rd teams
                row1 = group_goals_tracking[group_goals_tracking['team'] == team_2nd].iloc[0]
                row2_index = group_goals_tracking[group_goals_tracking['team'] == team_3rd].index[0]  # Get index for row2

                # Save original values for resetting later
                original_goals_scored_row2 = group_goals_tracking.loc[row2_index, 'total_goals_scored']
                original_goals_difference_row2 = group_goals_tracking.loc[row2_index, 'total_goals_difference']

                # Temporarily credit the chasing team with one extra goal
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] += 1
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] += 1

                # Retrieve updated row2 after incrementing values
                row2 = group_goals_tracking.loc[row2_index]

                # Apply tiebreaker
                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                # Reset total_goals_scored and total_goals_difference to their original values
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] = original_goals_scored_row2
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] = original_goals_difference_row2

        # Track top teams and composition changes
        current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
        changed = int(current_top_teams != initial_top_teams)

        if changed:
            change_counter += 1
            initial_top_teams = current_top_teams

        composition_changes.append({
            'year': year,
            'stage': stage,
            'date': goal['short_date'],
            'time': goal['local_time'],
            'change_num': change_counter,
            'goal_minute': goal['goal_minute'],
            'half_time': goal['half_time'],
            'home_team': home_team,
            'away_team': away_team,
            'scorer_team': scorer_team,
            'new_top_teams': list(current_top_teams),
            '1st': group_goals_tracking.iloc[0]['team'],
            '2nd': group_goals_tracking.iloc[1]['team'] if len(group_goals_tracking) > 1 else None,
            '3rd': group_goals_tracking.iloc[2]['team'] if len(group_goals_tracking) > 2 else None,
            'changed': changed,
            'points_diff': points_diff,
            'goals_diff': goals_diff,
            'tiebreak_result': tiebreak_result,
        })

    # Build the result frames once, after the loop. Doing this outside the loop
    # also covers groups with no last-matchday goals (initial row only).
    composition_changes_df = pd.DataFrame(composition_changes)

    # Convert third-place tracking into a DataFrame
    third_place_df = pd.DataFrame(third_place_tracking)

    return composition_changes_df, third_place_df


# best four third_placed

## men


In [377]:
def ensure_goal_minute_zero(third_place_df, all_games_before_last):
    """
    Rebuild the `goal_minute = 0` baseline rows of the third-place tracking table.

    Every existing row with `goal_minute == 0` is dropped, then one fresh baseline
    row is created per (year, stage) found in `all_games_before_last` for years
    >= 2016, using the team whose `standing` is 3 in that group. Groups without a
    third-placed team are skipped with a warning.

    Parameters:
    - third_place_df: DataFrame tracking third-placed teams after each goal
      (needs the columns `year`, `stage` and `goal_minute`).
    - all_games_before_last: DataFrame of pre-last-matchday standings (columns used:
      `year`, `stage`, `standing`, `team`, `points`, `goals_scored`, `goals_conceded`).

    Returns:
    - A new DataFrame, sorted by year, stage and goal_minute with a fresh index.
      The baseline rows carry no `date`/`time` values.
    """
    print("[INFO] Removing existing `goal_minute = 0` entries...")
    is_scoring_row = third_place_df['goal_minute'] != 0
    third_place_df = third_place_df[is_scoring_row].copy()

    # Only editions from 2016 onwards contribute baseline rows
    is_recent = all_games_before_last['year'] >= 2016
    filtered_games = all_games_before_last[is_recent].copy()

    # Alternative filter: only editions up to 1994
    # filtered_games = all_games_before_last[all_games_before_last['year'] <= 1994].copy()

    print("[INFO] Creating `goal_minute = 0` entries from `all_games_before_last`")
    baseline_rows = []

    # One pass per distinct (year, stage), in order of first appearance
    group_keys = filtered_games[['year', 'stage']].drop_duplicates()

    for _, key in group_keys.iterrows():
        year = key['year']
        stage = key['stage']

        in_group = (filtered_games['year'] == year) & (filtered_games['stage'] == stage)
        third_team_data = filtered_games[in_group & (filtered_games['standing'] == 3)]

        if third_team_data.empty:
            print(f"[WARNING] No third-placed team found for Year {year}, Stage {stage} in `all_games_before_last`.")
            continue

        standing_row = third_team_data.iloc[0]  # first match wins if several rows qualify
        baseline_rows.append({
            'year': year,
            'stage': stage,
            'goal_minute': 0,
            'half_time': 1,  # baseline is filed under the first half
            'third_team': standing_row['team'],
            'total_points': standing_row['points'] + 1,  # one point for the assumed 0-0 start
            'total_goals_difference': standing_row['goals_scored'] - standing_row['goals_conceded'],
            'total_goals_scored': standing_row['goals_scored'],
        })

    if baseline_rows:
        print(f"[INFO] Adding {len(baseline_rows)} new `goal_minute = 0` entries.")
        third_place_df = pd.concat([third_place_df, pd.DataFrame(baseline_rows)], ignore_index=True)

    # Final ordering: year, then stage, then goal_minute
    ordered = third_place_df.sort_values(by=['year', 'stage', 'goal_minute'])
    third_place_df = ordered.reset_index(drop=True)

    print("[INFO] `goal_minute = 0` entries updated successfully.")

    return third_place_df


In [378]:
def track_third_place_teams_with_top_four(third_place_df):
    """
    Follow the ranking of third-placed teams across all groups, goal minute by goal minute.

    For each year one baseline row (`goal_minute = 0`) is produced, followed by one
    row per distinct positive `goal_minute` value of that year. Every row carries,
    for each stage, a list `[third_team, total_points, total_goals_difference,
    total_goals_scored]` (or None if nothing is known yet) plus:
        - `top_four_third_teams`: names of the best 4 third-placed teams (alphabetical).
        - `last_two_third_teams`: names of the last 2 teams in the ranking.
        - `tied_teams`: teams sharing identical points, goal difference and goals scored.
        - `change_flag`: 1 if the top-4 set differs from the previous row, else 0.
        - `change_count`: running number of top-4 changes within the year.

    Ranking order (all descending): `total_points`, `total_goals_difference`,
    `total_goals_scored`, then position in the module-level `team_priority[year]`
    list (earlier position ranks higher; teams not listed rank last).

    Note on granularity: rows of a year that share the same `goal_minute` value are
    handled in a single step, and only entries belonging to the stage of the first
    such row (in date/time order) are applied.

    Parameters:
    - third_place_df: DataFrame with the columns year, stage, goal_minute, half_time,
      third_team, total_points, total_goals_difference, total_goals_scored, date, time.

    Returns:
    - DataFrame of tracking rows sorted by year, date, time, goal_minute, half_time, stage.
    """
    sort_columns = ['year', 'date', 'time', 'goal_minute', 'half_time', 'stage']

    def stage_record(source_row):
        # [team, points, goal difference, goals scored]; non-numeric values become NaN
        return [
            source_row['third_team'],
            pd.to_numeric(source_row['total_points'], errors='coerce'),
            pd.to_numeric(source_row['total_goals_difference'], errors='coerce'),
            pd.to_numeric(source_row['total_goals_scored'], errors='coerce'),
        ]

    def is_stage_record(value):
        # Per-stage entries are the only 4-item lists whose second item is numeric
        return isinstance(value, list) and len(value) == 4 and isinstance(value[1], (int, float))

    def rank_third_teams(candidates, season):
        # Returns (alphabetical top-four names, last-two names, tied team names)
        records = [value for value in candidates if is_stage_record(value)]

        # Ties are detected on the unsorted records: identical (points, GD, GS) triples
        tied = [
            record[0] for record in records
            if sum(1 for other in records if other[1:] == record[1:]) > 1
        ]

        def sort_key(record):
            priorities = team_priority.get(season, [])
            if record[0] in priorities:
                position = priorities.index(record[0])
            else:
                position = float('inf')  # unlisted teams rank below listed ones
            # Negated so that, under reverse sorting, a lower index ranks higher
            return (record[1], record[2], record[3], -position)

        records.sort(key=sort_key, reverse=True)
        names = [record[0] for record in records]
        return sorted(names[:4]), names[-2:], tied

    # Normalise the input order once
    ordered_input = third_place_df.sort_values(
        by=sort_columns, ascending=[True] * len(sort_columns)
    ).reset_index(drop=True)

    all_stages = sorted(ordered_input['stage'].unique())
    observations = []

    for year in sorted(ordered_input['year'].unique()):
        year_data = ordered_input[ordered_input['year'] == year].copy()
        year_data = year_data.sort_values(by=['date', 'time', 'goal_minute'], ascending=[True, True, True])

        # Distinct positive goal minutes, in order of first appearance
        scoring_minutes = year_data[year_data['goal_minute'] > 0]['goal_minute'].unique()

        # Baseline: every stage starts unknown and is filled from the minute-0 rows
        stage_state = {stage_name: None for stage_name in all_stages}
        kickoff_rows = year_data[year_data['goal_minute'] == 0]
        for _, row in kickoff_rows.iterrows():
            stage_name = row['stage']
            stage_state[stage_name] = stage_record(row)

        top_four, last_two, tied = rank_third_teams(stage_state.values(), year)

        if kickoff_rows.empty:
            first_date = first_time = first_stage = None
        else:
            first_date = kickoff_rows.iloc[0]['date']
            first_time = kickoff_rows.iloc[0]['time']
            first_stage = kickoff_rows.iloc[0]['stage']

        baseline = {
            'year': year, 'goal_minute': 0,
            'date': first_date,
            'time': first_time,
            'stage': first_stage,
            'top_four_third_teams': top_four,
            'last_two_third_teams': last_two,
            'tied_teams': tied,
            'change_flag': 0, 'change_count': 0,
        }
        baseline.update(stage_state)
        observations.append(baseline)

        previous = baseline.copy()
        changes_so_far = 0

        for minute in scoring_minutes:
            minute_rows = year_data[year_data['goal_minute'] == minute]

            # Carry every stage forward, then overwrite what this minute changes
            current = previous.copy()
            current['goal_minute'] = minute

            if not minute_rows.empty:
                lead_row = minute_rows.iloc[0]
                affected_stage = lead_row['stage']

                current['date'] = lead_row['date']
                current['time'] = lead_row['time']
                current['half_time'] = lead_row['half_time']
                current['stage'] = affected_stage

                for _, row in minute_rows.iterrows():
                    stage_name = row['stage']
                    if stage_name == affected_stage:
                        current[stage_name] = stage_record(row)

            top_four, last_two, tied = rank_third_teams(current.values(), year)

            # Membership comparison only; order inside the top four is irrelevant
            change_flag = 1 if set(top_four) != set(previous['top_four_third_teams']) else 0
            changes_so_far += change_flag

            current['top_four_third_teams'] = top_four
            current['last_two_third_teams'] = last_two
            current['tied_teams'] = tied
            current['change_flag'] = change_flag
            current['change_count'] = changes_so_far

            observations.append(current)
            previous = current.copy()

    result = pd.DataFrame(observations).sort_values(
        by=sort_columns, ascending=[True] * len(sort_columns)
    ).reset_index(drop=True)

    return pd.DataFrame(result)


### women

In [379]:
def best_two_third_placed_eu_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    """
    Track the two best third-placed teams goal by goal on the last group match day.

    Only the years 2009 and 2013 are processed. For every year one initial row is
    emitted (``goal_time`` 0, third place read from the ``standing`` column),
    followed by one row per goal of that year in ``goals_last_day_sorted``.

    Parameters:
    - goals_last_day_sorted: DataFrame of goals. Columns read: 'year', 'stage',
      'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time'.
      Rows are processed in the order given; no sorting is done here.
    - all_games_before_last: DataFrame of team standings before the last match.
      Columns read: 'year', 'stage', 'team', 'points', 'goals_scored',
      'goals_conceded', 'standing'. The caller's frame is not modified.
    - agg_goals_before_last_day: forwarded unchanged to `tiebreaker_after` when
      teams of one group are level on points.

    Returns:
    - DataFrame with one row per state: 'year', 'stage', 'goal_time',
      'home_team', 'away_team', 'scorer_team', one 'third_team_info_<stage>'
      column per group (dict with team/points/goals_difference/goals_scored, or
      None for groups with fewer than three teams), 'top2_third_teams',
      'excluded_teams', 'changes' and, for goal rows, 'half_time'.

    'changes' counts how often the *composition* of the top two changes; a swap
    of rank between the same two teams is not counted.
    """
    def third_team_entry(team_row):
        # Stats used to compare third-placed teams across groups.
        return {
            'team': team_row['team'],
            'points': int(team_row['total_points']),
            'goals_difference': int(team_row['total_goals_difference']),
            'goals_scored': int(team_row['total_goals_scored'])
        }

    def split_top2(third_infos):
        # Rank thirds by points, goal difference, goals scored (all descending).
        ranked = sorted(
            [(info['team'], info['points'], info['goals_difference'], info['goals_scored'])
             for info in third_infos.values() if info],
            key=lambda x: (x[1], x[2], x[3]),
            reverse=True
        )
        return [entry[0] for entry in ranked[:2]], [entry[0] for entry in ranked[2:]]

    # Apply filter to process only years 2009 and 2013
    all_games_before_last = all_games_before_last[(all_games_before_last['year'].isin([2009, 2013]))].copy()
    all_games_before_last['year'] = all_games_before_last['year'].astype(int)

    # Add 1 point to each team for the assumed 0-0 starting score
    all_games_before_last['points'] += 1

    results = []

    # Group by year and process each year separately
    for year, year_data in all_games_before_last.groupby('year'):
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Initialize columns for tracking stats
        year_data['before_last_game_goals_scored'] = year_data['goals_scored']
        year_data['before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data['before_last_game_points'] = year_data['points']
        year_data['last_game_goals_scored'] = 0
        year_data['last_game_goals_conceded'] = 0
        year_data['total_goals_scored'] = year_data['before_last_game_goals_scored']
        year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded']
        year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']
        year_data['last_game_points'] = 0
        year_data['total_points'] = year_data['before_last_game_points']
        year_data['tied_won'] = 0  # Initialize tied_won column

        # Goals of the current year, in the order supplied by the caller
        goals_last_day_year = goals_last_day_sorted[(goals_last_day_sorted['year'] == year)]

        # Stages whose stats are frozen
        locked_stages = set()

        # Initial standings for goal_time = 0: third place comes from `standing`
        third_teams_info = {}
        sorted_standings = year_data.sort_values(by='standing', ascending=True)
        for group_stage, group_data in sorted_standings.groupby('stage'):
            group_key = group_stage.replace(" ", "_")  # Ensure valid keys
            if len(group_data) >= 3:
                third_teams_info[f'third_team_info_{group_key}'] = third_team_entry(group_data.iloc[2])
            else:
                third_teams_info[f'third_team_info_{group_key}'] = None

        # Determine the top 2 third-placed teams initially
        top2_third_teams, excluded_teams = split_top2(third_teams_info)
        previous_top2 = top2_third_teams
        changes = 0

        # Add the initial state row for the year
        results.append({
            'year': year,
            'stage': None,
            'goal_time': 0,
            'home_team': None,
            'away_team': None,
            'scorer_team': None,
            **third_teams_info,
            'top2_third_teams': top2_third_teams,
            'excluded_teams': excluded_teams,
            'changes': changes
        })

        # Process goals
        for _, goal in goals_last_day_year.iterrows():
            stage = goal['stage']
            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Lock every other stage as soon as a goal of an unlocked stage is seen.
            # NOTE(review): in effect only goals of the first stage encountered ever
            # update the table; goals of the other stages only re-emit the current
            # state. Confirm this is the intended "lock earlier stages" behaviour.
            if stage not in locked_stages:
                locked_stages.update(
                    other for other in year_data['stage'].unique() if other != stage
                )

            # Update scores for the current stage
            if stage not in locked_stages:
                opponent_team = home_team if player_team == away_team else away_team
                year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
                year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

                # Update total goals scored, goals conceded, and goal difference
                year_data['total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
                year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
                year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

                # Points relative to the draw point already credited above:
                # leading +2, level 0, trailing -1.
                year_data['last_game_points'] = (
                    (year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']).astype(int) * 2 +
                    (year_data['last_game_goals_scored'] == year_data['last_game_goals_conceded']).astype(int) * 0 +
                    (year_data['last_game_goals_scored'] < year_data['last_game_goals_conceded']).astype(int) * -1
                )

                # Update total points
                year_data['total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Recalculate third-placed team info for each group
            third_teams_info = {}

            for group_stage, group_data in year_data.groupby('stage'):
                group_key = group_stage.replace(" ", "_")  # Ensure valid keys

                # Sort teams by total points, goal difference, and goals scored
                sorted_standings = group_data.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False]
                )

                # Check for ties in total points
                tied_teams = sorted_standings[sorted_standings.duplicated(subset=['total_points'], keep=False)]

                if not tied_teams.empty:
                    # Count, per team, the pairwise comparisons it wins; the counter
                    # lives on the per-goal copy, so it restarts from 0 for every goal.
                    for i, row1 in tied_teams.iterrows():
                        for j, row2 in tied_teams.iterrows():
                            if i != j:  # Ensure no self-comparison
                                winner, _, _ = tiebreaker_after(row1, row2, agg_goals_before_last_day)
                                if winner == row1['team']:
                                    sorted_standings.loc[sorted_standings['team'] == row1['team'], 'tied_won'] += 1

                # Final sorting with tie-breaking criteria
                sorted_standings = sorted_standings.sort_values(
                    by=['total_points', 'tied_won', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False, False]
                )

                # Extract third-placed team
                if len(sorted_standings) >= 3:
                    third_teams_info[f'third_team_info_{group_key}'] = third_team_entry(sorted_standings.iloc[2])
                else:
                    third_teams_info[f'third_team_info_{group_key}'] = None

            # Sort third-placed teams across groups and get top 2
            top2_third_teams, excluded_teams = split_top2(third_teams_info)

            # Count a change only when membership differs (order is ignored)
            if set(top2_third_teams) != set(previous_top2):
                changes += 1
            previous_top2 = top2_third_teams

            results.append({
                'year': year,
                'stage': stage,
                'goal_time': goal['goal_minute'],
                'half_time': goal['half_time'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_team': player_team,
                **third_teams_info,
                'top2_third_teams': top2_third_teams,
                'excluded_teams': excluded_teams,
                'changes': changes
            })

    return pd.DataFrame(results)

## World Cup

### women

In [380]:
def best_two_third_placed_wc_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    """
    Track the two best third-placed teams goal by goal on the last group match day.

    Only the year 1991 is processed. One initial row is emitted (``goal_time`` 0,
    third place read from the ``standing`` column), followed by one row per goal
    of that year in ``goals_last_day_sorted``.

    Parameters:
    - goals_last_day_sorted: DataFrame of goals. Columns read: 'year', 'stage',
      'home_team', 'away_team', 'scorer_nationality', 'goal_minute', 'half_time'.
      Rows are processed in the order given; no sorting is done here.
    - all_games_before_last: DataFrame of team standings before the last match.
      Columns read: 'year', 'stage', 'team', 'points', 'goals_scored',
      'goals_conceded', 'standing'. The caller's frame is not modified.
    - agg_goals_before_last_day: forwarded unchanged to `tiebreaker_after` when
      teams of one group are level on points.

    Returns:
    - DataFrame with one row per state: 'year', 'stage', 'goal_time',
      'home_team', 'away_team', 'scorer_team', one 'third_team_info_<stage>'
      column per group (dict with team/points/goals_difference/goals_scored, or
      None for groups with fewer than three teams), 'top2_third_teams',
      'excluded_teams', 'changes' and, for goal rows, 'half_time'.

    'changes' counts how often the *composition* of the top two changes; a swap
    of rank between the same two teams is not counted.
    """
    def third_team_entry(team_row):
        # Stats used to compare third-placed teams across groups.
        return {
            'team': team_row['team'],
            'points': int(team_row['total_points']),
            'goals_difference': int(team_row['total_goals_difference']),
            'goals_scored': int(team_row['total_goals_scored'])
        }

    def split_top2(third_infos):
        # Rank thirds by points, goal difference, goals scored (all descending).
        ranked = sorted(
            [(info['team'], info['points'], info['goals_difference'], info['goals_scored'])
             for info in third_infos.values() if info],
            key=lambda x: (x[1], x[2], x[3]),
            reverse=True
        )
        return [entry[0] for entry in ranked[:2]], [entry[0] for entry in ranked[2:]]

    # Apply filter to process only years == 1991
    all_games_before_last = all_games_before_last[(all_games_before_last['year'] == 1991)].copy()
    all_games_before_last['year'] = all_games_before_last['year'].astype(int)

    # Add 1 point to each team for the assumed 0-0 starting score
    all_games_before_last['points'] += 1

    results = []

    # Group by year and process each year separately
    for year, year_data in all_games_before_last.groupby('year'):
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Initialize columns for tracking stats
        year_data['before_last_game_goals_scored'] = year_data['goals_scored']
        year_data['before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data['before_last_game_points'] = year_data['points']
        year_data['last_game_goals_scored'] = 0
        year_data['last_game_goals_conceded'] = 0
        year_data['total_goals_scored'] = year_data['before_last_game_goals_scored']
        year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded']
        year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']
        year_data['last_game_points'] = 0
        year_data['total_points'] = year_data['before_last_game_points']
        year_data['tied_won'] = 0  # Initialize tied_won column

        # Goals of the current year, in the order supplied by the caller
        goals_last_day_year = goals_last_day_sorted[(goals_last_day_sorted['year'] == year)]

        # Stages whose stats are frozen
        locked_stages = set()

        # Initial standings for goal_time = 0: third place comes from `standing`
        third_teams_info = {}
        sorted_standings = year_data.sort_values(by='standing', ascending=True)
        for group_stage, group_data in sorted_standings.groupby('stage'):
            group_key = group_stage.replace(" ", "_")  # Ensure valid keys
            if len(group_data) >= 3:
                third_teams_info[f'third_team_info_{group_key}'] = third_team_entry(group_data.iloc[2])
            else:
                third_teams_info[f'third_team_info_{group_key}'] = None

        # Determine the top 2 third-placed teams initially
        top2_third_teams, excluded_teams = split_top2(third_teams_info)
        previous_top2 = top2_third_teams
        changes = 0

        # Add the initial state row for the year
        results.append({
            'year': year,
            'stage': None,
            'goal_time': 0,
            'home_team': None,
            'away_team': None,
            'scorer_team': None,
            **third_teams_info,
            'top2_third_teams': top2_third_teams,
            'excluded_teams': excluded_teams,
            'changes': changes
        })

        # Process goals
        for _, goal in goals_last_day_year.iterrows():
            stage = goal['stage']
            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Lock every other stage as soon as a goal of an unlocked stage is seen.
            # NOTE(review): in effect only goals of the first stage encountered ever
            # update the table; goals of the other stages only re-emit the current
            # state. Confirm this is the intended "lock earlier stages" behaviour.
            if stage not in locked_stages:
                locked_stages.update(
                    other for other in year_data['stage'].unique() if other != stage
                )

            # Update scores for the current stage
            if stage not in locked_stages:
                opponent_team = home_team if player_team == away_team else away_team
                year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
                year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

                # Update total goals scored, goals conceded, and goal difference
                year_data['total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
                year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
                year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

                # Points relative to the draw point already credited above:
                # leading +2, level 0, trailing -1.
                year_data['last_game_points'] = (
                    (year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']).astype(int) * 2 +
                    (year_data['last_game_goals_scored'] == year_data['last_game_goals_conceded']).astype(int) * 0 +
                    (year_data['last_game_goals_scored'] < year_data['last_game_goals_conceded']).astype(int) * -1
                )

                # Update total points
                year_data['total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Recalculate third-placed team info for each group
            third_teams_info = {}

            for group_stage, group_data in year_data.groupby('stage'):
                group_key = group_stage.replace(" ", "_")  # Ensure valid keys

                # Sort teams by total points, goal difference, and goals scored
                sorted_standings = group_data.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False]
                )

                # Check for ties in total points
                tied_teams = sorted_standings[sorted_standings.duplicated(subset=['total_points'], keep=False)]

                if not tied_teams.empty:
                    # Count, per team, the pairwise comparisons it wins; the counter
                    # lives on the per-goal copy, so it restarts from 0 for every goal.
                    for i, row1 in tied_teams.iterrows():
                        for j, row2 in tied_teams.iterrows():
                            if i != j:  # Ensure no self-comparison
                                winner, _, _ = tiebreaker_after(row1, row2, agg_goals_before_last_day)
                                if winner == row1['team']:
                                    sorted_standings.loc[sorted_standings['team'] == row1['team'], 'tied_won'] += 1

                # Final sorting with tie-breaking criteria
                sorted_standings = sorted_standings.sort_values(
                    by=['total_points', 'tied_won', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False, False]
                )

                # Extract third-placed team
                if len(sorted_standings) >= 3:
                    third_teams_info[f'third_team_info_{group_key}'] = third_team_entry(sorted_standings.iloc[2])
                else:
                    third_teams_info[f'third_team_info_{group_key}'] = None

            # Sort third-placed teams across groups and get top 2
            top2_third_teams, excluded_teams = split_top2(third_teams_info)

            # Count a change only when membership differs (order is ignored)
            if set(top2_third_teams) != set(previous_top2):
                changes += 1
            previous_top2 = top2_third_teams

            results.append({
                'year': year,
                'stage': stage,
                'goal_time': goal['goal_minute'],
                'half_time': goal['half_time'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_team': player_team,
                **third_teams_info,
                'top2_third_teams': top2_third_teams,
                'excluded_teams': excluded_teams,
                'changes': changes
            })

    return pd.DataFrame(results)

# suspense

In [381]:
def _live_match_status(team_row):
    """Describe a team's running last-match state as 'winning', 'drawing' or 'losing'."""
    scored = team_row['last_game_goals_scored']
    conceded = team_row['last_game_goals_conceded']
    if scored == conceded:
        return "drawing"
    return "winning" if scored > conceded else "losing"


def active_suspense(all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    """
    Evaluate, after every last-match-day goal, whether the team just below the
    cut-off position of its group is within reach of the team just above it.

    For years before 2016 the team in 3rd place is compared with the team in
    2nd place; from 2016 on the team in 4th place is compared with the team in
    3rd. Suspense is flagged when the higher team is at most 1 point ahead and
    the lower team's goal difference exceeds the higher team's by at most 1.

    Parameters:
    - all_games_before_last: DataFrame of team standings before the last match.
      Columns read: 'year', 'stage', 'team', 'points', 'goals_scored',
      'goals_conceded'.
    - goals_last_day_sorted: DataFrame of goals. Columns read: 'year', 'stage',
      'home_team', 'away_team', 'scorer_nationality', 'goal_minute',
      'half_time'. Goals are re-sorted per group by half_time, then goal_minute.
    - agg_goals_before_last_day: DataFrame of earlier matches used for
      head-to-head tie resolution. Columns read: 'home_team', 'away_team',
      'won' (1 is treated as a home win, -1 as an away win).

    Returns:
    - DataFrame with at most one row per goal and group: 'team', 'stage',
      'year', 'goal_minute', 'half_time', 'active_suspense_count' (1 or 0) and
      'reason'. Groups without any goal are skipped with a printed notice.
    """
    results = []

    # Step 1: Loop through each unique combination of year and stage
    unique_groups = all_games_before_last[['year', 'stage']].drop_duplicates()
    for _, group in unique_groups.iterrows():
        year = group['year']
        stage = group['stage']

        # Filter the data for the specific year and stage
        standings = all_games_before_last[
            (all_games_before_last['year'] == year) &
            (all_games_before_last['stage'] == stage)
        ].copy()

        group_goals = goals_last_day_sorted[
            (goals_last_day_sorted['year'] == year) &
            (goals_last_day_sorted['stage'] == stage)
        ]

        if group_goals.empty:
            print(f"No goals recorded for stage {stage} in year {year}. Skipping...")
            continue

        # Initialize tracking columns
        standings['before_last_game_goals_scored'] = standings['goals_scored']
        standings['before_last_game_goals_conceded'] = standings['goals_conceded']
        standings['before_last_game_points'] = standings['points']
        standings['last_game_goals_scored'] = 0
        standings['last_game_goals_conceded'] = 0
        standings['total_goals_scored'] = standings['before_last_game_goals_scored']
        standings['total_goals_conceded'] = standings['before_last_game_goals_conceded']
        standings['total_goals_difference'] = standings['total_goals_scored'] - standings['total_goals_conceded']
        standings['last_game_points'] = 0
        standings['total_points'] = standings['before_last_game_points']

        # Year-dependent rules: points for a win and which places are compared
        win_points = 3 if year > 1994 else 2
        if year < 2016:
            chaser_rank, target_rank, target_label = 3, 2, "2nd"
        else:
            chaser_rank, target_rank, target_label = 4, 3, "3rd"

        # Sort by 'half_time' first and then by 'goal_minute'
        group_goals = group_goals.sort_values(by=['half_time', 'goal_minute'], ascending=[True, True])

        # Process each goal and calculate active suspense
        for _, goal in group_goals.iterrows():
            home_team = goal['home_team']
            away_team = goal['away_team']
            scorer_team = goal['scorer_nationality']
            goal_minute = goal['goal_minute']
            half_time = goal['half_time']

            # Update scores based on who scored the goal; a scorer matching
            # neither side leaves the tallies untouched.
            if scorer_team == home_team:
                standings.loc[standings['team'] == home_team, 'last_game_goals_scored'] += 1
                standings.loc[standings['team'] == away_team, 'last_game_goals_conceded'] += 1
            elif scorer_team == away_team:
                standings.loc[standings['team'] == away_team, 'last_game_goals_scored'] += 1
                standings.loc[standings['team'] == home_team, 'last_game_goals_conceded'] += 1

            # Update totals dynamically
            standings['total_goals_scored'] = standings['before_last_game_goals_scored'] + standings['last_game_goals_scored']
            standings['total_goals_conceded'] = standings['before_last_game_goals_conceded'] + standings['last_game_goals_conceded']
            standings['total_goals_difference'] = standings['total_goals_scored'] - standings['total_goals_conceded']

            # Update points: win_points when ahead, 1 when level, 0 when behind
            live_scored = standings['last_game_goals_scored']
            live_conceded = standings['last_game_goals_conceded']
            standings['last_game_points'] = (
                (live_scored > live_conceded).astype(int) * win_points +
                (live_scored == live_conceded).astype(int)
            )
            standings['total_points'] = standings['before_last_game_points'] + standings['last_game_points']

            # Mark teams that are level on points, goal difference and goals scored
            standings['tied'] = standings.duplicated(
                subset=['total_points', 'total_goals_difference', 'total_goals_scored'], keep=False
            )

            # Reset `tied_won` to 0 for all teams
            standings['tied_won'] = 0

            # Resolve ties using head-to-head results. Every pair is visited from
            # both sides, so a head-to-head winner is credited twice per pair.
            tied_teams = standings[standings['tied']]
            if not tied_teams.empty:
                for index, row in tied_teams.iterrows():
                    team1 = row['team']
                    for _, other_row in tied_teams[tied_teams.index != index].iterrows():
                        team2 = other_row['team']
                        match = agg_goals_before_last_day[
                            ((agg_goals_before_last_day['home_team'] == team1) & (agg_goals_before_last_day['away_team'] == team2)) |
                            ((agg_goals_before_last_day['home_team'] == team2) & (agg_goals_before_last_day['away_team'] == team1))
                        ]
                        if match.empty:
                            continue
                        first_meeting = match.iloc[0]
                        if first_meeting['won'] == 1:  # Home win
                            winner = first_meeting['home_team']
                        elif first_meeting['won'] == -1:  # Away win
                            winner = first_meeting['away_team']
                        else:
                            continue
                        standings.loc[standings['team'] == winner, 'tied_won'] += 1

            # Sort teams by total points, goal difference, goals scored, and tied_won
            standings = standings.sort_values(
                by=['total_points', 'total_goals_difference', 'total_goals_scored', 'tied_won'],
                ascending=[False, False, False, False]
            ).reset_index(drop=True)
            standings['last_game_standing'] = standings.index + 1

            # Evaluate active suspense for the team sitting on the chaser rank
            chasers = standings[standings['last_game_standing'] == chaser_rank]
            if chasers.empty:
                continue
            chaser = chasers.iloc[0]

            targets = standings[standings['last_game_standing'] == target_rank]
            if targets.empty:
                suspense_flag = 0
                reason = f"No team found in {target_label} position for evaluation"
            else:
                target = targets.iloc[0]
                statuses = f"({_live_match_status(chaser)} vs {_live_match_status(target)})"
                within_reach = (
                    target['total_points'] - chaser['total_points'] <= 1 and
                    chaser['total_goals_difference'] - target['total_goals_difference'] <= 1
                )
                if within_reach:
                    suspense_flag = 1
                    reason = f"Active suspense met {statuses}"
                else:
                    suspense_flag = 0
                    reason = f"Condition for moving to {target_label} not met {statuses}"

            results.append({
                'team': chaser['team'],
                'stage': stage,
                'year': year,
                'goal_minute': goal_minute,
                'half_time': half_time,
                'active_suspense_count': suspense_flag,
                'reason': reason
            })

    # Convert results to a DataFrame
    return pd.DataFrame(results)


# probabilities

## single match probabilities

In [382]:
def calculate_elo_probabilities(elo_home, elo_away, base_draw_factor=0.2):
    """
    Derive home-win, draw and away-win probabilities from two Elo ratings.

    The draw share shrinks linearly with the absolute rating gap and reaches
    zero once the gap is 400 points or more.

    Returns:
    - Tuple (prob_home_win, prob_draw, prob_away_win), normalized to sum to 1.
    """
    # Draw weight: full base factor for equal ratings, none from a 400-point gap
    rating_gap = abs(elo_home - elo_away)
    draw_weight = base_draw_factor * (1 - min(rating_gap / 400, 1))

    # Logistic Elo expectations for either side
    raw_home = 1 / (1 + 10 ** ((elo_away - elo_home) / 400))
    raw_away = 1 / (1 + 10 ** ((elo_home - elo_away) / 400))
    raw_draw = draw_weight * (raw_home + raw_away)

    # Rescale so the three outcomes sum to 1
    normalizer = raw_home + raw_away + raw_draw
    return raw_home / normalizer, raw_draw / normalizer, raw_away / normalizer



In [383]:
def integrate_elo_probabilities(goals_df, elo_data):
    """
    Attach the home and away Elo rating of the matching year to every row.

    Ratings are looked up in `elo_data` by ('year', 'team_name') with left
    joins, so rows without a rating keep NaN in 'elo_home' / 'elo_away'.
    Outcome probabilities are not computed here.
    """
    rating_lookup = elo_data[['year', 'team_name', 'elo']]

    # Same left join once per side; only the team column and target name differ
    for side in ('home', 'away'):
        goals_df = goals_df.merge(
            rating_lookup,
            left_on=['year', f'{side}_team'],
            right_on=['year', 'team_name'],
            how='left'
        ).rename(columns={'elo': f'elo_{side}'})

    # The two joins leave the lookup key behind as team_name_x / team_name_y
    return goals_df.drop(columns=['team_name_x', 'team_name_y'])

In [384]:
def update_elo_after_match(home_elo, away_elo, score, k_factor=20):
    """
    Update Elo ratings for both teams after a match based on the score.

    The actual result is scored 1 / 0.5 / 0 (win / draw / loss). It is
    compared with each team's *expected score*, i.e. its win probability plus
    half of the draw probability. Using the bare win probability instead would
    hand both teams points for a draw and inflate the rating pool on every
    match; with the expected score the two adjustments always cancel out.

    Parameters:
    - home_elo: Current Elo rating of the home team.
    - away_elo: Current Elo rating of the away team.
    - score: Match score in the format "home_goals-away_goals"
             (an en dash "–" is accepted as the separator as well).
    - k_factor: Elo adjustment factor (default is 20).

    Returns:
    - Updated Elo ratings for the home and away teams. If the score cannot be
      processed, a message is printed and the original ratings are returned.
    """
    try:
        # Parse the score
        score = score.replace("–", "-")  # Normalize dashes
        home_goals, away_goals = map(int, score.split("-"))

        # Determine match outcome (actual score on the 1 / 0.5 / 0 scale)
        if home_goals > away_goals:
            outcome_home, outcome_away = 1, 0  # Home team wins
        elif home_goals < away_goals:
            outcome_home, outcome_away = 0, 1  # Away team wins
        else:
            outcome_home, outcome_away = 0.5, 0.5  # Draw

        # Calculate probabilities
        prob_home_win, prob_draw, prob_away_win = calculate_elo_probabilities(home_elo, away_elo)

        # Expected score on the same scale as the outcome: a draw is worth
        # half a win, so half of the draw probability goes to each side.
        # expected_home + expected_away == 1, which keeps the update zero-sum.
        expected_home = prob_home_win + 0.5 * prob_draw
        expected_away = prob_away_win + 0.5 * prob_draw

        # Update Elo ratings
        home_elo_new = home_elo + k_factor * (outcome_home - expected_home)
        away_elo_new = away_elo + k_factor * (outcome_away - expected_away)

        return home_elo_new, away_elo_new
    except Exception as e:
        # Deliberate best-effort: malformed scores (e.g. non-string values or
        # extra annotations) must not abort the processing of a whole season.
        print(f"Error processing score '{score}': {e}")
        return home_elo, away_elo  # Return original ratings if an error occurs


In [385]:
def update_probabilities_for_following_matches(goals_df):
    """
    Walk through the matches chronologically, store each match's outcome
    probabilities and carry the Elo changes forward to later matches.

    For every row the probabilities are computed from the ratings the two
    teams hold *before* that match; only afterwards is the row's score used
    to update the ratings. A match's own result therefore never influences
    its own probabilities, only those of the matches that follow it.

    Ratings are tracked per (year, team). The first time a team appears in a
    given year its rating is taken from the row's 'elo_home' / 'elo_away'.

    Parameters:
    - goals_df: DataFrame with the columns 'short_date', 'local_time', 'year',
                'home_team', 'away_team', 'elo_home', 'elo_away' and 'score'.

    Returns:
    - A copy of `goals_df`, sorted by 'short_date' and 'local_time', with the
      columns 'P_home_win', 'P_draw' and 'P_away_win' filled in.
    """
    # NOTE(review): every row is treated as one match. If goals_df holds one
    # row per goal, a match's result is applied once per row - confirm that
    # the caller passes match-level rows.

    # Sort matches by date and time (returns a new frame, input is untouched)
    goals_df = goals_df.sort_values(by=['short_date', 'local_time'])

    # Latest known rating per (year, team)
    updated_elos = {}

    # Iterate through matches
    for index, row in goals_df.iterrows():
        home_key = (row['year'], row['home_team'])
        away_key = (row['year'], row['away_team'])

        # Get current (pre-match) Elo ratings
        home_elo = updated_elos.get(home_key, row['elo_home'])
        away_elo = updated_elos.get(away_key, row['elo_away'])

        # Store the pre-match probabilities for this match
        prob_home_win, prob_draw, prob_away_win = calculate_elo_probabilities(home_elo, away_elo)
        goals_df.at[index, 'P_home_win'] = prob_home_win
        goals_df.at[index, 'P_draw'] = prob_draw
        goals_df.at[index, 'P_away_win'] = prob_away_win

        # Apply the result to the ratings used by later matches. Skip the
        # update when either rating is missing: otherwise the NaN would be
        # written into the opponent's tracked rating and poison all of that
        # team's remaining matches in the year.
        if pd.notna(row['score']) and pd.notna(home_elo) and pd.notna(away_elo):
            home_elo, away_elo = update_elo_after_match(home_elo, away_elo, row['score'])
            updated_elos[home_key] = home_elo
            updated_elos[away_key] = away_elo

    return goals_df

## qualification probabilities

In [386]:
def calculate_position_probabilities(
    initial_standings,
    remaining_matches,
    match_probs_df,
    elo_ratings,
    tiebreaker_before,
    total_simulations=1000,
):
    """
    Estimate by Monte Carlo simulation how likely each team is to finish in
    each table position once the remaining matches have been played.

    Every simulation draws an outcome for each remaining match from its
    'P_home_win' / 'P_draw' / 'P_away_win' probabilities. A win is booked as
    3 points and a 1-0 score line; a draw gives 1 point to each side and
    leaves the goal columns untouched. Teams level on points are then
    compared pairwise with `tiebreaker_before`, and the table is ordered by
    points, tie-break wins, goal difference and goals scored.

    Parameters:
    - initial_standings: DataFrame of standings before the remaining matches, with the
                         columns 'team', 'points', 'goals_scored' and 'goals_conceded'.
                         A 'tie_won' column is used as the starting value if present
                         and treated as 0 otherwise. The frame is not modified.
    - remaining_matches: DataFrame of matches still to be played, with the columns
                         'home_team', 'away_team', 'P_home_win', 'P_draw' and 'P_away_win'.
    - match_probs_df: Passed through unchanged as third argument to `tiebreaker_before`.
    - elo_ratings: Not used by this function; kept for interface compatibility.
    - tiebreaker_before: Callable invoked as tiebreaker_before(row1, row2, match_probs_df)
                         for two teams level on points. Must return a tuple
                         (winning_team_name, tie_won_row1, tie_won_row2); the winner's
                         'tie_won' is increased by its corresponding value.
    - total_simulations: Number of simulated seasons (default is 1000).

    Returns:
    - position_probabilities: Dictionary where keys are team names and values are lists of
                              probabilities, one per final position (index 0 = 1st place),
                              with as many entries as there are teams.

    Raises:
    - ValueError: If `total_simulations` is smaller than 1.
    """
    if total_simulations < 1:
        raise ValueError("total_simulations must be a positive integer")

    # One counter per possible final position, sized to the actual table
    teams = initial_standings['team'].tolist()
    position_counts = {team: [0] * len(teams) for team in teams}

    # The fixtures and their probabilities do not change between simulations,
    # so extract them once instead of re-reading the frame every time.
    outcomes = ['home_win', 'draw', 'away_win']
    fixtures = [
        (
            match['home_team'],
            match['away_team'],
            [match['P_home_win'], match['P_draw'], match['P_away_win']],
        )
        for _, match in remaining_matches.iterrows()
    ]

    for _ in range(total_simulations):
        # Create a copy of standings for the simulation
        simulated_standings = initial_standings.copy()
        if 'tie_won' not in simulated_standings.columns:
            simulated_standings['tie_won'] = 0

        # Simulate the results of remaining matches
        for home_team, away_team, probabilities in fixtures:
            is_home = simulated_standings['team'] == home_team
            is_away = simulated_standings['team'] == away_team

            outcome = np.random.choice(outcomes, p=probabilities)

            # Update points and goals based on the simulated outcome
            if outcome == 'home_win':
                simulated_standings.loc[is_home, 'points'] += 3
                simulated_standings.loc[is_home, 'goals_scored'] += 1
                simulated_standings.loc[is_away, 'goals_conceded'] += 1
            elif outcome == 'away_win':
                simulated_standings.loc[is_away, 'points'] += 3
                simulated_standings.loc[is_away, 'goals_scored'] += 1
                simulated_standings.loc[is_home, 'goals_conceded'] += 1
            else:  # Draw
                simulated_standings.loc[is_home, 'points'] += 1
                simulated_standings.loc[is_away, 'points'] += 1

        # Update goal difference
        simulated_standings['goals_difference'] = (
            simulated_standings['goals_scored'] - simulated_standings['goals_conceded']
        )

        # Flag teams that share their points total with at least one other team
        simulated_standings['tied'] = simulated_standings.duplicated(subset=['points'], keep=False)
        tied_teams = simulated_standings[simulated_standings['tied']]

        if not tied_teams.empty:
            # Only teams on the SAME points total are actually tied with each
            # other. Comparing across different totals would credit tie-break
            # wins against teams that are not rivals for the position and
            # distort the order inside the real tie.
            for _, level_group in tied_teams.groupby('points'):
                for i, row1 in level_group.iterrows():
                    for j, row2 in level_group.iterrows():
                        if i == j:
                            continue

                        tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_before(
                            row1, row2, match_probs_df
                        )

                        if tiebreak_result == row1['team']:
                            simulated_standings.loc[
                                simulated_standings['team'] == row1['team'], 'tie_won'
                            ] += tie_won_row1
                        elif tiebreak_result == row2['team']:
                            simulated_standings.loc[
                                simulated_standings['team'] == row2['team'], 'tie_won'
                            ] += tie_won_row2

        # Sort by updated standings; the fresh 0..n-1 index is the final position
        simulated_standings = simulated_standings.sort_values(
            by=['points', 'tie_won', 'goals_difference', 'goals_scored'],
            ascending=[False, False, False, False]
        ).reset_index(drop=True)

        # Count positions for each team
        for position, row in simulated_standings.iterrows():
            position_counts[row['team']][position] += 1

    # Convert counts to probabilities
    position_probabilities = {
        team: [count / total_simulations for count in counts]
        for team, counts in position_counts.items()
    }

    return position_probabilities
