# convert time

In [1]:
def convert_time_to_utc(time_str):
    """
    Convert a free-form kick-off time string into a UTC "HH:MM" string.

    Processing steps:
      1. Strip bracketed annotations such as "[note 1]".
      2. Take the first H:MM / HH:MM occurrence as the local time.
      3. Apply an AM/PM marker: "a.m."/"p.m." anywhere in the string, or
         "am"/"pm"/"AM"/"PM" (with or without dots) directly after the time.
      4. Determine the UTC offset from an explicit "UTC+1" / "UTC-3" /
         "UTC+5:30" notation or, failing that, from a known timezone acronym
         that appears as a standalone token.
      5. Subtract the offset from the local time.

    Args:
        time_str: Raw time text, e.g. "21:00CET[note 1]" or "9:30 PM EST".

    Returns:
        "" if the input is not a non-empty string or holds no H:MM time;
        the normalised 24-hour local time ("HH:MM") if no timezone is found;
        otherwise the UTC time as "HH:MM". Only the clock time is returned,
        so results wrap around midnight.

    Raises:
        ValueError: if a timezone is found but the extracted value is not a
            valid clock time (e.g. "25:00"), because it is parsed with
            ``datetime.strptime``.

    Side effects:
        Prints a trace of each parsing step to stdout.
    """
    print(f"Original time string: {time_str}")

    if not isinstance(time_str, str) or not time_str.strip():
        print("Invalid input (not a string or empty). Returning empty string.")
        return ""

    # Remove annotations inside brackets (e.g., [note 1])
    time_str = re.sub(r'\[.*?\]', '', time_str).strip()

    # Offsets are frequently typeset with the true minus sign (U+2212), which
    # the offset regex below would not recognise; map it to ASCII '-'.
    time_str = time_str.replace('\u2212', '-')

    # Ensure spaces between digits and letters (e.g., "21:00CET" -> "21:00 CET")
    time_str = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', time_str)

    # Extract the first valid HH:MM time format from the string
    match_time = re.search(r'(\d{1,2}):(\d{2})', time_str)

    if not match_time:
        print("No valid time found. Returning empty string.\n")
        return ""

    hour, minute = map(int, match_time.groups())
    print(f"Extracted local time: {hour:02}:{minute:02}")

    # Detect an AM/PM marker. The dotted spellings are accepted anywhere in
    # the string; the undotted ones only right after the time, and never when
    # followed by another letter, so that e.g. "AMT" is not read as "AM".
    lowered = time_str.lower()
    trailing_marker = re.match(
        r'\s*([ap])\.?m\.?(?![a-z])',
        time_str[match_time.end():],
        re.IGNORECASE,
    )
    marker = trailing_marker.group(1).lower() if trailing_marker else None
    is_am = marker == 'a' or "a.m." in lowered
    is_pm = marker == 'p' or "p.m." in lowered

    if is_am and hour == 12:
        hour = 0  # Midnight case
    elif is_pm and hour < 12:
        # Hours that are already in 24-hour form (13-23) are left untouched.
        hour += 12
    local_time = f"{hour:02}:{minute:02}"
    print(f"Normalized 24-hour format: {local_time}")

    # Step 2: Extract UTC offset from explicit notation (e.g., UTC+1, UTC+5:30)
    utc_offset = None
    match_utc = re.search(r'UTC\s*([+-]?)(\d{1,2})(?::(\d{2}))?', time_str)
    if match_utc:
        sign = -1 if match_utc.group(1) == '-' else 1
        utc_offset = timedelta(
            hours=sign * int(match_utc.group(2)),
            minutes=sign * int(match_utc.group(3) or 0),
        )

    # Step 3: Handle specific timezone acronyms if no explicit UTC offset.
    # Longer acronyms are listed before the ones they contain (CEST before
    # CET, EEST before EST); iteration follows this insertion order.
    timezone_offsets = {
        "CEST": 2, "CET": 1, "EEST": 3, "EDT": -4, "EST": -5,
        "PDT": -7, "PST": -8, "CDT": -5, "CST": -6,
        "JST": 9, "KST": 9, "BRT": -3, "MSK": 3, "AMT": -4
    }

    if utc_offset is None:
        for tz, offset in timezone_offsets.items():
            # Require a standalone token so "WEST" is not mistaken for "EST".
            if re.search(rf'(?<![A-Za-z]){tz}(?![A-Za-z])', time_str):
                utc_offset = timedelta(hours=offset)
                print(f"Detected timezone acronym: {tz}, UTC Offset: {offset}")
                break

    # If no timezone is found, return the original time
    if utc_offset is None:
        print(f"No valid timezone found. Returning original time: {local_time}\n")
        return local_time

    # Step 4: Convert local time to UTC
    local_time_obj = datetime.strptime(local_time, '%H:%M')
    utc_time_obj = local_time_obj - utc_offset

    utc_time = utc_time_obj.strftime('%H:%M')
    print(f"Final UTC time: {utc_time}\n")

    return utc_time


In [2]:
def format_time_column(df, column_name):
    """
    Normalise a time column to "H:MM"/"HH:MM" text, dropping any seconds.

    The column is first cast to ``str``. Values that start with an
    ``H:MM:SS`` / ``HH:MM:SS`` pattern are cut down to the hour and minute
    part (anything after the seconds is discarded as well); every other value
    is kept as its string form.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column_name: Name of the column to clean.

    Returns:
        The same DataFrame object, with ``column_name`` replaced.
    """
    seconds_pattern = r'(\d{1,2}):(\d{2}):(\d{2})'

    def strip_seconds(value):
        # Non-strings pass through untouched.
        if not isinstance(value, str):
            return value
        with_seconds = re.match(seconds_pattern, value)
        if with_seconds is None:
            # Already HH:MM, or not a time at all: leave as-is.
            return value
        hours, minutes, _seconds = with_seconds.groups()
        return f"{hours}:{minutes}"

    as_text = df[column_name].astype(str)
    df[column_name] = as_text.apply(strip_seconds)
    return df

# after first day league standings and points

In [3]:
def after_first(goals_df):
    """
    Summarise the first matchday and collect the goals of the last two matchdays.

    ``goals_df`` holds one row per goal. The columns read here are ``year``,
    ``stage``, ``short_date``, ``home_team``, ``away_team``, ``local_time``,
    ``score``, ``scorer_nationality``, ``half_time`` and ``goal_minute``.

    For every (year, stage) the earliest ``short_date`` is treated as the
    first matchday. Goals on that date are aggregated into one row per match.
    From the remaining goals, those on the two latest *distinct* dates of each
    (year, stage) are returned, with ``short_date`` converted to datetime.

    Args:
        goals_df: Goal-level DataFrame as described above.

    Returns:
        A tuple ``(agg_goals_after_first_day, goals_last_two_matchdays_sorted)``:
        * one row per first-matchday match with goal counts, the original and
          recomputed score, a ``score_match`` flag and ``won``
          (1 home win, -1 away win, 0 draw);
        * the goals of the last two match dates, sorted by ``short_date``,
          ``half_time`` and ``goal_minute``.
    """
    group_keys = ['year', 'stage']
    date_keys = group_keys + ['short_date']
    result_columns = [
        'year', 'stage', 'home_team', 'away_team', 'local_time', 'short_date',
        'goals_home', 'goals_away', 'original_score', 'calculated_score',
        'score_match', 'won',
    ]

    # Step 1: Find the first match date for each tournament and group
    first_dates = goals_df.groupby(group_keys)['short_date'].min().reset_index()

    # Step 2: All goals scored on the first matchday
    goals_first_matchday = goals_df.merge(first_dates, on=date_keys, how='inner')

    # Step 3: All goals NOT scored on the first matchday (anti-join)
    goals_after_first_matchday = goals_df.merge(
        first_dates, on=date_keys, how='left', indicator=True
    )
    goals_after_first_matchday = goals_after_first_matchday[
        goals_after_first_matchday['_merge'] == 'left_only'
    ].drop(columns=['_merge'])

    # Step 4: Convert short_date to datetime so the dates can be ordered
    goals_after_first_matchday['short_date'] = pd.to_datetime(
        goals_after_first_matchday['short_date']
    )

    # Find the last two match dates per tournament and group. The dates are
    # de-duplicated first: the frame has one row per goal, so picking the two
    # largest rows directly would return the same date twice whenever the
    # final day saw more than one goal, and the merge below would then
    # duplicate those goals and miss the second-to-last day.
    last_two_dates = (
        goals_after_first_matchday[date_keys]
        .dropna(subset=['short_date'])
        .drop_duplicates()
        .sort_values('short_date', ascending=False)
        .groupby(group_keys)
        .head(2)
    )

    # Step 5: Keep only the goals scored on those dates
    goals_last_two_matchdays = goals_after_first_matchday.merge(
        last_two_dates, on=date_keys, how='inner'
    )
    goals_last_two_matchdays_sorted = goals_last_two_matchdays.sort_values(
        by=['short_date', 'half_time', 'goal_minute'], ascending=[True, True, True]
    )

    # Aggregate the first-matchday goals into one record per match
    results = []
    match_keys = ['year', 'stage', 'home_team', 'away_team']
    for match_id, group in goals_first_matchday.groupby(match_keys):
        # local_time and score are match-level values; read them from the first goal
        local_time = group.iloc[0]['local_time']
        score = group.iloc[0]['score']
        short_date = pd.to_datetime(local_time).date()

        # A goal counts for the side whose team name equals the scorer's
        # nationality; the home side is checked first.
        scored_by_home = group['home_team'] == group['scorer_nationality']
        scored_by_away = (group['away_team'] == group['scorer_nationality']) & ~scored_by_home
        goals_home = int(scored_by_home.sum())
        goals_away = int(scored_by_away.sum())

        # Compare the recomputed score with the recorded one (dashes normalised).
        # str() keeps a missing score from raising; it simply will not match.
        calculated_score = f"{goals_home}-{goals_away}"
        normalized_score = str(score).replace("–", "-").replace("—", "-")

        results.append({
            'year': match_id[0],
            'stage': match_id[1],
            'home_team': match_id[2],
            'away_team': match_id[3],
            'local_time': local_time,
            'short_date': short_date,
            'goals_home': goals_home,
            'goals_away': goals_away,
            'original_score': score,
            'calculated_score': calculated_score,
            'score_match': normalized_score == calculated_score,
            # 1 = home win, -1 = away win, 0 = draw
            'won': (goals_home > goals_away) - (goals_home < goals_away),
        })

    # Explicit columns keep the frame well-formed even when there are no matches
    agg_goals_after_first_day = pd.DataFrame(results, columns=result_columns)

    return agg_goals_after_first_day, goals_last_two_matchdays_sorted


# before last match day league standings and points

In [4]:
def before_last(goals_df):
    """
    Split goal data at the last match date and summarise the earlier matches.

    ``goals_df`` holds one row per goal. The columns read here are ``year``,
    ``stage``, ``short_date``, ``home_team``, ``away_team``, ``local_time``,
    ``score``, ``scorer_nationality``, ``half_time`` and ``goal_minute``.

    For every (year, stage) the latest ``short_date`` is treated as the last
    match day. Goals before that date are aggregated into one row per match;
    goals on that date are returned as-is (sorted and de-duplicated).

    Args:
        goals_df: Goal-level DataFrame as described above.

    Returns:
        A tuple ``(agg_goals_before_last_day, goals_last_day_sorted)``:
        * one row per earlier match with goal counts, the original and
          recomputed score, a ``score_match`` flag and ``won``
          (1 home win, -1 away win, 0 draw); empty, but with all columns
          present, when every goal falls on the last date;
        * the goals of the last match date, sorted by ``short_date``,
          ``local_time``, ``half_time`` and ``goal_minute``.
    """
    date_keys = ['year', 'stage', 'short_date']
    result_columns = [
        'year', 'stage', 'home_team', 'away_team', 'local_time', 'short_date',
        'goals_home', 'goals_away', 'original_score', 'calculated_score',
        'score_match', 'won',
    ]

    # Step 1: Find the last match date for each tournament and group
    last_dates = goals_df.groupby(['year', 'stage'])['short_date'].max().reset_index()

    # Step 2: All goals NOT scored on the last match day (anti-join)
    goals_before_last_day = goals_df.merge(
        last_dates, on=date_keys, how='left', indicator=True
    )
    goals_before_last_day = goals_before_last_day[
        goals_before_last_day['_merge'] == 'left_only'
    ].drop(columns=['_merge'])

    # Step 3: Only the goals scored on the last match day
    goals_last_day = goals_df.merge(last_dates, on=date_keys, how='inner')

    # Step 4: Chronological order within the last day, without duplicate rows
    goals_last_day_sorted = goals_last_day.sort_values(
        by=['short_date', 'local_time', 'half_time', 'goal_minute'], ascending=True
    ).drop_duplicates()

    # Aggregate the earlier goals into one record per match
    results = []
    match_keys = ['year', 'stage', 'home_team', 'away_team']
    for match_id, group in goals_before_last_day.groupby(match_keys):
        # local_time and score are match-level values; read them from the first goal
        local_time = group.iloc[0]['local_time']
        score = group.iloc[0]['score']
        short_date = pd.to_datetime(local_time).date()

        # A goal counts for the side whose team name equals the scorer's
        # nationality; the home side is checked first.
        scored_by_home = group['home_team'] == group['scorer_nationality']
        scored_by_away = (group['away_team'] == group['scorer_nationality']) & ~scored_by_home
        goals_home = int(scored_by_home.sum())
        goals_away = int(scored_by_away.sum())

        # Compare the recomputed score with the recorded one (dashes normalised).
        # str() keeps a missing score from raising; it simply will not match.
        calculated_score = f"{goals_home}-{goals_away}"
        normalized_score = str(score).replace("–", "-").replace("—", "-")

        results.append({
            'year': match_id[0],
            'stage': match_id[1],
            'home_team': match_id[2],
            'away_team': match_id[3],
            'local_time': local_time,
            'short_date': short_date,
            'goals_home': goals_home,
            'goals_away': goals_away,
            'original_score': score,
            'calculated_score': calculated_score,
            'score_match': normalized_score == calculated_score,
            # 1 = home win, -1 = away win, 0 = draw
            'won': (goals_home > goals_away) - (goals_home < goals_away),
        })

    # Explicit columns keep the frame usable when no match precedes the last
    # day (building 'won' afterwards via apply would fail on an empty frame).
    agg_goals_before_last_day = pd.DataFrame(results, columns=result_columns)

    return agg_goals_before_last_day, goals_last_day_sorted


In [5]:
def calculate_points(results, years, win_result):
    """
    Sum the points earned over a sequence of match results.

    Args:
        results: Iterable of match outcomes (1 home win, -1 away win, 0 draw).
        years: Iterable of tournament years, aligned with ``results``.
        win_result: The outcome value that counts as a win for the side being
            scored (1 for the home team, -1 for the away team).

    Returns:
        Total points: a win is worth 2 points for years up to and including
        1992 and 3 points afterwards, a draw 1 point, anything else 0.
    """
    def points_for(outcome, season):
        if outcome == win_result:
            return 2 if season <= 1992 else 3
        if outcome == 0:
            return 1
        return 0

    return sum(points_for(outcome, season) for outcome, season in zip(results, years))

In [6]:
def aggregate_home_away_points(agg_goals_before_last_day):
    """
    Aggregate goals, points and match counts per team, split by home and away.

    Each (year, stage, home_team, away_team) combination is counted once:
    duplicate rows for the same fixture are dropped before aggregating.
    Points are computed by ``calculate_points`` from the ``won`` column
    (1 home win, -1 away win, 0 draw) and the tournament year.

    Args:
        agg_goals_before_last_day: One row per match with the columns
            ``year``, ``stage``, ``home_team``, ``away_team``, ``goals_home``,
            ``goals_away`` and ``won``.

    Returns:
        A tuple ``(home_games, away_games)``:
        * ``home_games`` - per (year, stage, home_team): ``goals_scored``,
          ``goals_conceded``, ``points_home``, ``match_count_home``;
        * ``away_games`` - per (year, stage, away_team): ``goals_scored``,
          ``goals_conceded``, ``points_away``, ``match_count_away``.
    """
    # Remove duplicates so each fixture is counted only once
    unique_matches = agg_goals_before_last_day.drop_duplicates(
        subset=['year', 'stage', 'home_team', 'away_team']
    )

    def points_for(win_result):
        # The aggregated 'won' values keep their original index labels, which
        # are used to look up the matching tournament years.
        return lambda won: calculate_points(
            won, unique_matches.loc[won.index, 'year'], win_result
        )

    # Step 1: Home games - a home win is encoded as won == 1
    home_games = unique_matches.groupby(['year', 'stage', 'home_team']).agg(
        goals_scored=('goals_home', 'sum'),
        goals_conceded=('goals_away', 'sum'),
        points_home=('won', points_for(1)),
        match_count_home=('home_team', 'count')
    ).reset_index()

    # Step 2: Away games - an away win is encoded as won == -1
    away_games = unique_matches.groupby(['year', 'stage', 'away_team']).agg(
        goals_scored=('goals_away', 'sum'),
        goals_conceded=('goals_home', 'sum'),
        points_away=('won', points_for(-1)),
        match_count_away=('away_team', 'count')
    ).reset_index()

    return home_games, away_games


## tie-break function 

In [7]:
def tiebreaker_before(row1, row2, agg_data):
    """
    Decide which of two tied teams ranks higher.

    Criteria, applied in this order until one separates the teams:
      1. ``goals_difference``
      2. ``goals_scored``
      3. the result of their head-to-head match found in ``agg_data``
         (same ``year`` and ``stage`` as ``row1``)

    Args:
        row1, row2: Mapping-like standings rows providing ``team``, ``year``,
            ``stage``, ``goals_difference`` and ``goals_scored``.
        agg_data: Match-level DataFrame with ``year``, ``stage``,
            ``home_team``, ``away_team`` and ``won`` (1 home win, -1 away win).

    Returns:
        ``(winner, 1, 0)`` if row1's team wins, ``(winner, 0, 1)`` if row2's
        team wins, or ``('tie', 0, 0)`` when nothing separates them.

    Side effects:
        Prints both rows and every comparison to stdout.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    first_team, second_team = row1['team'], row2['team']

    # Look up the direct encounter, whichever side hosted it
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    first_hosted = (agg_data['home_team'] == first_team) & (agg_data['away_team'] == second_team)
    second_hosted = (agg_data['home_team'] == second_team) & (agg_data['away_team'] == first_team)
    match = agg_data[same_group & (first_hosted | second_hosted)]

    # Criteria 1 and 2: compare the standings columns one after the other
    criteria = (
        ("Goal difference", 'goals_difference'),
        ("Goals scored", 'goals_scored'),
    )
    for label, column in criteria:
        print(f"{label}: {first_team} = {row1[column]}, {second_team} = {row2[column]}")
        if row1[column] > row2[column]:
            print(f"{label} result: {first_team} wins")
            return first_team, 1, 0
        if row1[column] < row2[column]:
            print(f"{label} result: {second_team} wins")
            return second_team, 0, 1

    # Criterion 3: head-to-head result
    if match.empty:
        print("No head-to-head match found.")
    else:
        match_result = match.iloc[0]
        print("Head-to-head match found:")
        print(match_result)

        outcome = match_result['won']
        winning_side = None
        if outcome == 1:
            winning_side = 'home_team'
        elif outcome == -1:
            winning_side = 'away_team'

        # A drawn encounter (no winning side) falls through to the tie below
        if winning_side is not None:
            if match_result[winning_side] == first_team:
                print(f"Head-to-head result: {first_team} wins")
                return first_team, 1, 0
            print(f"Head-to-head result: {second_team} wins")
            return second_team, 0, 1

    print("All criteria tied. Result: tie")
    return 'tie', 0, 0


In [8]:
# This function does NOT consider head-to-head results in the last (third) matchday as a tiebreaker
def tiebreaker_after_old(row1, row2, agg_data):
    """
    Decide which of two tied teams ranks higher, using the running totals.

    Criteria, applied in this order until one separates the teams:
      1. ``total_goals_difference``
      2. ``total_goals_scored``
      3. the result of their head-to-head match found in ``agg_data``
         (same ``year`` and ``stage`` as ``row1``)

    Args:
        row1, row2: Mapping-like standings rows providing ``team``, ``year``,
            ``stage``, ``total_goals_difference`` and ``total_goals_scored``.
        agg_data: Match-level DataFrame with ``year``, ``stage``,
            ``home_team``, ``away_team`` and ``won`` (1 home win, -1 away win).

    Returns:
        ``(winner, 1, 0)`` if row1's team wins, ``(winner, 0, 1)`` if row2's
        team wins, or ``('tie', 0, 0)`` when nothing separates them.

    Side effects:
        Prints both rows and every comparison to stdout.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    first_team, second_team = row1['team'], row2['team']

    # Look up the direct encounter, whichever side hosted it
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    first_hosted = (agg_data['home_team'] == first_team) & (agg_data['away_team'] == second_team)
    second_hosted = (agg_data['home_team'] == second_team) & (agg_data['away_team'] == first_team)
    match = agg_data[same_group & (first_hosted | second_hosted)]

    # Criteria 1 and 2: compare the standings columns one after the other
    criteria = (
        ("Goal difference", 'total_goals_difference'),
        ("Goals scored", 'total_goals_scored'),
    )
    for label, column in criteria:
        print(f"{label}: {first_team} = {row1[column]}, {second_team} = {row2[column]}")
        if row1[column] > row2[column]:
            print(f"{label} result: {first_team} wins")
            return first_team, 1, 0
        if row1[column] < row2[column]:
            print(f"{label} result: {second_team} wins")
            return second_team, 0, 1

    # Criterion 3: head-to-head result
    if match.empty:
        print("No head-to-head match found.")
    else:
        match_result = match.iloc[0]
        print("Head-to-head match found:")
        print(match_result)

        outcome = match_result['won']
        winning_side = None
        if outcome == 1:
            winning_side = 'home_team'
        elif outcome == -1:
            winning_side = 'away_team'

        # A drawn encounter (no winning side) falls through to the tie below
        if winning_side is not None:
            if match_result[winning_side] == first_team:
                print(f"Head-to-head result: {first_team} wins")
                return first_team, 1, 0
            print(f"Head-to-head result: {second_team} wins")
            return second_team, 0, 1

    print("All criteria tied. Result: tie")
    return 'tie', 0, 0


In [9]:
# This function does consider head-to-head results in the last (third) matchday as a tiebreaker

def tiebreaker_after(row1, row2, agg_data):
    """
    Decide which of two tied teams ranks higher, using the running totals.

    Criteria, applied in this order until one separates the teams:
      1. ``total_goals_difference``
      2. ``total_goals_scored``
      3. the result of their head-to-head match found in ``agg_data``
         (same ``year`` and ``stage`` as ``row1``)
      4. only when no such match exists in ``agg_data``: the goal difference
         of each team's last game (``last_game_goals_scored`` minus
         ``last_game_goals_conceded``)

    Args:
        row1, row2: Mapping-like standings rows providing ``team``, ``year``,
            ``stage``, ``total_goals_difference``, ``total_goals_scored``,
            ``last_game_goals_scored`` and ``last_game_goals_conceded``.
        agg_data: Match-level DataFrame with ``year``, ``stage``,
            ``home_team``, ``away_team`` and ``won`` (1 home win, -1 away win).

    Returns:
        ``(winner, 1, 0)`` if row1's team wins, ``(winner, 0, 1)`` if row2's
        team wins, or ``('tie', 0, 0)`` when nothing separates them.

    Side effects:
        Prints both rows and every comparison to stdout.
    """
    print("\n=== Applying Tiebreaker ===")
    print(f"Row1: {row1}")
    print(f"Row2: {row2}\n")

    first_team, second_team = row1['team'], row2['team']

    # Look up the direct encounter, whichever side hosted it
    same_group = (agg_data['year'] == row1['year']) & (agg_data['stage'] == row1['stage'])
    first_hosted = (agg_data['home_team'] == first_team) & (agg_data['away_team'] == second_team)
    second_hosted = (agg_data['home_team'] == second_team) & (agg_data['away_team'] == first_team)
    match = agg_data[same_group & (first_hosted | second_hosted)]

    # Criteria 1 and 2: compare the standings columns one after the other
    criteria = (
        ("Goal difference", 'total_goals_difference'),
        ("Goals scored", 'total_goals_scored'),
    )
    for label, column in criteria:
        print(f"{label}: {first_team} = {row1[column]}, {second_team} = {row2[column]}")
        if row1[column] > row2[column]:
            print(f"{label} result: {first_team} wins")
            return first_team, 1, 0
        if row1[column] < row2[column]:
            print(f"{label} result: {second_team} wins")
            return second_team, 0, 1

    if match.empty:
        print("No head-to-head match found.")

        # Criterion 4: fall back to each team's last-game goal difference
        first_margin = row1["last_game_goals_scored"] - row1["last_game_goals_conceded"]
        second_margin = row2["last_game_goals_scored"] - row2["last_game_goals_conceded"]
        print(f"Last game performance: {first_team} = {first_margin}, {second_team} = {second_margin}")

        if first_margin > second_margin:
            print(f"Last game result: {first_team} wins")
            return first_team, 1, 0
        if first_margin < second_margin:
            print(f"Last game result: {second_team} wins")
            return second_team, 0, 1
    else:
        # Criterion 3: head-to-head result
        match_result = match.iloc[0]
        print("Head-to-head match found:")
        print(match_result)

        outcome = match_result['won']
        winning_side = None
        if outcome == 1:
            winning_side = 'home_team'
        elif outcome == -1:
            winning_side = 'away_team'

        # A drawn encounter (no winning side) falls through to the tie below
        if winning_side is not None:
            if match_result[winning_side] == first_team:
                print(f"Head-to-head result: {first_team} wins")
                return first_team, 1, 0
            print(f"Head-to-head result: {second_team} wins")
            return second_team, 0, 1

    print("All criteria tied. Result: tie")
    return 'tie', 0, 0


## three teams tied

In [10]:
def resolve_three_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking):
    """
    Rank three tied teams and record the ranking in ``group_goals_tracking``.

    Procedure:
      1. Build a mini-table from the earlier matches played between the tied
         teams (same year and stage as the first row of
         ``group_goals_tracking``). A team that appears in exactly one of
         those matches also gets its last-game goals added.
      2. Sort by (goal difference, goals scored) of that mini-table, best first.
      3. For every pair that is still level: use their head-to-head match;
         if it was drawn or does not exist, compare last-game goal difference
         and finally the total goals scored in the group.
      4. Write ``three_tie`` = 3 / 2 / 1 for the 1st / 2nd / 3rd team.

    Args:
        tied_group: DataFrame whose ``team`` column lists the tied teams
            (exactly three are expected; fewer raises IndexError in step 4).
        agg_goals_before_last_day: Match-level DataFrame with ``year``,
            ``stage``, ``home_team``, ``away_team``, ``goals_home`` and
            ``goals_away``.
        group_goals_tracking: Standings DataFrame with ``year``, ``stage``,
            ``team``, ``total_goals_scored``, ``last_game_goals_scored`` and
            ``last_game_goals_conceded``. Modified in place (``three_tie``).

    Returns:
        The tied team names ordered from best to worst.

    Side effects:
        Prints the intermediate table and the final ranking to stdout.
    """
    tied_teams = tied_group['team'].tolist()
    print(f"\n=== Resolving Three-Way Tie for Teams: {tied_teams} ===")

    year = group_goals_tracking['year'].iloc[0]
    stage = group_goals_tracking['stage'].iloc[0]

    # Earlier matches in which both sides belong to the tied set
    past_matches = agg_goals_before_last_day[
        (agg_goals_before_last_day['year'] == year) &
        (agg_goals_before_last_day['stage'] == stage) &
        (agg_goals_before_last_day['home_team'].isin(tied_teams)) &
        (agg_goals_before_last_day['away_team'].isin(tied_teams))
    ]

    # How often each team appears in those matches
    team_match_counts = pd.concat([
        past_matches['home_team'],
        past_matches['away_team']
    ]).value_counts()

    def tracking_value(team, column):
        # First tracking row of the team; raises IndexError if the team is absent
        return group_goals_tracking.loc[group_goals_tracking['team'] == team, column].iloc[0]

    goal_differences = {team: 0 for team in tied_teams}
    goals_scored = {team: 0 for team in tied_teams}
    total_goals_group = {
        team: tracking_value(team, 'total_goals_scored') for team in tied_teams
    }
    # (scored, conceded) of each team's last game
    current_stats = {
        team: (
            int(tracking_value(team, 'last_game_goals_scored')),
            int(tracking_value(team, 'last_game_goals_conceded')),
        )
        for team in tied_teams
    }

    # Add stats from past head-to-head matches
    for _, row in past_matches.iterrows():
        home, away = row['home_team'], row['away_team']
        gh, ga = row['goals_home'], row['goals_away']

        goal_differences[home] += gh - ga
        goal_differences[away] += ga - gh
        goals_scored[home] += gh
        goals_scored[away] += ga

    # Add last-game stats only for teams with exactly one earlier appearance
    for team in tied_teams:
        if team_match_counts.get(team, 0) == 1:
            last_scored, last_conceded = current_stats[team]
            goal_differences[team] += last_scored - last_conceded
            goals_scored[team] += last_scored

    for team in tied_teams:
        print(f"   {team} => GD: {goal_differences[team]}, GS: {goals_scored[team]}")

    def team_sort_key(team):
        return (goal_differences[team], goals_scored[team])

    def challenger_wins_fallback(challenger, holder):
        # Last-game goal difference first, then total goals scored in the group
        challenger_perf = current_stats[challenger][0] - current_stats[challenger][1]
        holder_perf = current_stats[holder][0] - current_stats[holder][1]
        if challenger_perf != holder_perf:
            return challenger_perf > holder_perf
        return total_goals_group[challenger] > total_goals_group[holder]

    # Initial sort, best first
    sorted_teams = sorted(tied_teams, key=team_sort_key, reverse=True)

    # Resolve pairs that are still level. Positions are re-read on every
    # iteration because an earlier swap may have moved the teams.
    for i in range(len(sorted_teams)):
        for j in range(i + 1, len(sorted_teams)):
            t1, t2 = sorted_teams[i], sorted_teams[j]
            if team_sort_key(t1) != team_sort_key(t2):
                continue

            print(f"\n🔹 Resolving Tie Between {t1} and {t2}:")
            h2h = past_matches[
                ((past_matches['home_team'] == t1) & (past_matches['away_team'] == t2)) |
                ((past_matches['home_team'] == t2) & (past_matches['away_team'] == t1))
            ]

            swap = False
            if h2h.empty:
                swap = challenger_wins_fallback(t2, t1)
            else:
                row = h2h.iloc[0]
                gh, ga = row['goals_home'], row['goals_away']
                if gh > ga:
                    swap = t2 == row['home_team']
                elif ga > gh:
                    swap = t2 == row['away_team']
                elif gh == ga:
                    # Head-to-head was a draw
                    swap = challenger_wins_fallback(t2, t1)

            if swap:
                sorted_teams[i], sorted_teams[j] = sorted_teams[j], sorted_teams[i]

    # Final output, best team listed as rank 1 (same order as the return value)
    print("\n✅ Final Resolved Three-Way Ranking:")
    for rank, team in enumerate(sorted_teams, start=1):
        print(f"   {rank}. {team} (GD: {goal_differences[team]}, GS: {goals_scored[team]}, Total GS in Group: {total_goals_group[team]})")

    # Assign `three_tie` values: higher means better placed
    group_goals_tracking.loc[group_goals_tracking['team'] == sorted_teams[0], 'three_tie'] = 3
    group_goals_tracking.loc[group_goals_tracking['team'] == sorted_teams[1], 'three_tie'] = 2
    group_goals_tracking.loc[group_goals_tracking['team'] == sorted_teams[2], 'three_tie'] = 1

    return sorted_teams


## four teams tied

In [11]:
def resolve_four_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking):
    """
    Rank four tied teams and record the ranking in ``group_goals_tracking``.

    Procedure:
      1. For every tied team, add up goal difference and goals scored from
         its last game (taken from ``group_goals_tracking``) and from the
         earlier matches between tied teams (same year and stage as the first
         row of ``group_goals_tracking``).
      2. For each pair with identical totals, put the winner of their earlier
         head-to-head match ahead of the loser in the working list.
      3. Sort by (goal difference, goals scored), best first. The sort is
         stable, so the order set in step 2 decides between level teams.
      4. Write ``four_tie`` = 4 for the best team down to 1 for the last.

    Args:
        tied_group: DataFrame whose ``team`` column lists the tied teams.
        agg_goals_before_last_day: Match-level DataFrame with ``year``,
            ``stage``, ``home_team``, ``away_team``, ``goals_home`` and
            ``goals_away``.
        group_goals_tracking: Standings DataFrame with ``year``, ``stage``,
            ``team``, ``last_game_goals_scored`` and
            ``last_game_goals_conceded``. Modified in place (``four_tie``).

    Returns:
        The tied team names ordered from best to worst.

    Side effects:
        Prints the matches considered, the totals and the final ranking.
    """
    tied_teams = tied_group['team'].tolist()
    print(f"\n=== Resolving Four-Way Tie for Teams: {tied_teams} ===")

    # Earlier matches of this group in which both sides are tied teams
    matches = agg_goals_before_last_day
    in_group = (
        (matches['year'] == group_goals_tracking['year'].iloc[0]) &
        (matches['stage'] == group_goals_tracking['stage'].iloc[0])
    )
    both_tied = matches['home_team'].isin(tied_teams) & matches['away_team'].isin(tied_teams)
    past_matches = matches[in_group & both_tied]

    print("\n🔹 Past Matches Between Tied Teams:")
    print(past_matches[['year', 'stage', 'home_team', 'away_team', 'goals_home', 'goals_away']])

    goal_differences = dict.fromkeys(tied_teams, 0)
    goals_scored = dict.fromkeys(tied_teams, 0)

    # Contribution of each team's last game
    for team in tied_teams:
        team_rows = group_goals_tracking.loc[group_goals_tracking['team'] == team]
        scored_last = team_rows['last_game_goals_scored'].values[0]
        conceded_last = team_rows['last_game_goals_conceded'].values[0]

        goal_differences[team] += scored_last - conceded_last
        goals_scored[team] += scored_last

    # Contribution of the earlier matches between tied teams
    for _, fixture in past_matches.iterrows():
        host, guest = fixture['home_team'], fixture['away_team']
        host_goals, guest_goals = fixture['goals_home'], fixture['goals_away']

        goal_differences[host] += host_goals - guest_goals
        goal_differences[guest] += guest_goals - host_goals
        goals_scored[host] += host_goals
        goals_scored[guest] += guest_goals

    # Pairs with identical totals: order them by their head-to-head result.
    # Positions are re-read each time because an earlier pair may have
    # rewritten the list.
    team_count = len(tied_teams)
    for i in range(team_count):
        for j in range(i + 1, team_count):
            team1, team2 = tied_teams[i], tied_teams[j]
            if (
                goal_differences[team1] != goal_differences[team2] or
                goals_scored[team1] != goals_scored[team2]
            ):
                continue

            print(f"\n🔍 **Teams {team1} and {team2} have identical tiebreaker values, evaluating head-to-head...**")

            team1_hosted = (past_matches['home_team'] == team1) & (past_matches['away_team'] == team2)
            team2_hosted = (past_matches['home_team'] == team2) & (past_matches['away_team'] == team1)
            h2h_match = past_matches[team1_hosted | team2_hosted]
            if h2h_match.empty:
                continue

            row = h2h_match.iloc[0]
            home_team, away_team = row['home_team'], row['away_team']
            goals_home, goals_away = row['goals_home'], row['goals_away']

            print(f"\n🔹 Head-to-Head Result: {home_team} {goals_home} - {goals_away} {away_team}")

            # Winner takes the earlier slot; a draw leaves the order alone
            if goals_home > goals_away:
                tied_teams[i], tied_teams[j] = home_team, away_team
            elif goals_away > goals_home:
                tied_teams[i], tied_teams[j] = away_team, home_team

    print("\nSorting tied teams based on:")
    print("- Goal difference")
    print("- Goals scored")

    for t in tied_teams:
        print(f"{t}: Goal Diff = {goal_differences[t]}, Goals Scored = {goals_scored[t]}")

    def ranking_key(team):
        return (goal_differences[team], goals_scored[team])

    sorted_teams = list(tied_teams)
    sorted_teams.sort(key=ranking_key, reverse=True)

    # Best team gets 4, last team gets 1
    for position, team in enumerate(sorted_teams):
        group_goals_tracking.loc[group_goals_tracking['team'] == team, 'four_tie'] = 4 - position

    print("\n✅ **Final Adjusted Ranking After Head-to-Head Evaluation:**")
    for position, team in enumerate(sorted_teams):
        print(f"   {position + 1}. {team}")

    return sorted_teams


# before last match day

In [12]:
def fifa_before_last(home_games, away_games, agg_goals_before_last_day, team_counts):
    """
    Build the group standings as they stood before the last match day.

    Home and away aggregates are merged into one row per team, every team listed in
    `team_counts` is guaranteed a row, teams with fewer than two recorded matches are
    topped up with one point per missing match, and the teams are ranked inside each
    (year, stage) group.

    Parameters:
    - home_games: per-team home aggregates. Columns read: 'year', 'stage', 'home_team',
      'goals_scored', 'goals_conceded', 'points', 'match_count'.
    - away_games: per-team away aggregates with the same columns, keyed by 'away_team'.
    - agg_goals_before_last_day: forwarded unchanged to `tiebreaker_before`.
    - team_counts: frame with 'year', 'stage' and a list-like 'team_list' column.

    Returns:
    - DataFrame with columns year, stage, team, standing, points, goals_scored,
      goals_conceded, goals_difference, total_matches, tiebreaker, tie_won, sorted by
      year, stage and standing. The numeric columns are cast to int.

    Progress/diagnostic frames are printed along the way.
    """
    group_keys = ['year', 'stage']
    stat_cols = ['goals_scored', 'goals_conceded', 'points', 'total_matches']

    # Step 1: Pair each team's home aggregate with its own away aggregate
    # (home_team on the left is matched against away_team on the right).
    all_games_before_last = pd.merge(
        home_games,
        away_games,
        left_on=group_keys + ['home_team'],
        right_on=group_keys + ['away_team'],
        how='outer',
        suffixes=('_home', '_away')
    )

    # A team present on only one side has NaN on the other side: take whichever
    # name exists and count the absent side's numbers as zero.
    all_games_before_last['team'] = all_games_before_last['home_team'].fillna(all_games_before_last['away_team'])
    for total_col, source_col in (
        ('goals_scored', 'goals_scored'),
        ('goals_conceded', 'goals_conceded'),
        ('points', 'points'),
        ('total_matches', 'match_count'),
    ):
        all_games_before_last[total_col] = (
            all_games_before_last[f'{source_col}_home'].fillna(0)
            + all_games_before_last[f'{source_col}_away'].fillna(0)
        )

    # Save the original total_matches as matches_flag (visible in the debug prints below)
    all_games_before_last['matches_flag'] = all_games_before_last['total_matches']

    # Ensure all teams from team_counts (one row per entry of team_list) are included
    team_counts = team_counts.explode('team_list').rename(columns={'team_list': 'team'})
    all_teams = team_counts[group_keys + ['team']].drop_duplicates()
    all_games_before_last = all_teams.merge(
        all_games_before_last,
        on=group_keys + ['team'],
        how='left'
    )

    # Report teams that had no row in the merged home/away data
    missing_values = all_games_before_last[all_games_before_last[stat_cols].isnull().any(axis=1)]
    if not missing_values.empty:
        print("Observations with missing values before filling:")
        print(missing_values)
    else:
        print("No missing values in the specified columns.")

    # Teams with no activity get zeros; goal difference is derived afterwards
    all_games_before_last = all_games_before_last.fillna({col: 0 for col in stat_cols})
    all_games_before_last['goals_difference'] = all_games_before_last['goals_scored'] - all_games_before_last['goals_conceded']

    # Top up teams with fewer than two recorded matches: one point per missing match.
    # Order matters: the one-match teams are promoted to 2 before the zero-match check.
    for played, bonus_points in ((1, 1), (0, 2)):
        short_of_matches = all_games_before_last['total_matches'] == played
        if short_of_matches.any():
            print(f"Observations where total_matches == {played}:")
            print(all_games_before_last[short_of_matches])
        else:
            print(f"No observations where total_matches == {played}.")
        all_games_before_last.loc[short_of_matches, 'points'] += bonus_points
        all_games_before_last.loc[short_of_matches, 'total_matches'] = 2

    # Initial sorting by points; the index is reset so that position i == label i below
    all_games_before_last = all_games_before_last.sort_values(
        by=['year', 'stage', 'points'],
        ascending=[True, True, False]
    ).reset_index(drop=True)

    # Apply tie-breaking for each pair of neighbouring rows level on points
    all_games_before_last['tiebreaker'] = 'no need'
    all_games_before_last['tie_won'] = 0

    # NOTE(review): only adjacent rows are compared, so in a three-way tie the middle
    # team's tiebreaker/tie_won is overwritten by its second comparison - confirm intended.
    for i in range(len(all_games_before_last) - 1):
        row1 = all_games_before_last.iloc[i]
        row2 = all_games_before_last.iloc[i + 1]

        same_group = row1['year'] == row2['year'] and row1['stage'] == row2['stage']
        if not (same_group and row1['points'] == row2['points']):
            continue

        tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_before(row1, row2, agg_goals_before_last_day)

        # 'tie' means the helper could not separate the two teams: keep the defaults
        if tiebreak_result != 'tie':
            all_games_before_last.at[i, 'tiebreaker'] = tiebreak_result
            all_games_before_last.at[i, 'tie_won'] = tie_won_row1
            all_games_before_last.at[i + 1, 'tiebreaker'] = tiebreak_result
            all_games_before_last.at[i + 1, 'tie_won'] = tie_won_row2

    # Final sorting by all criteria
    all_games_before_last = all_games_before_last.sort_values(
        by=['year', 'stage', 'points', 'goals_difference', 'goals_scored', 'tie_won'],
        ascending=[True, True, False, False, False, False]
    ).reset_index(drop=True)

    # Assign standings within each group
    all_games_before_last['standing'] = all_games_before_last.groupby(group_keys).cumcount() + 1

    # Retain only relevant columns and cast the numeric ones to int. astype() returns a
    # new frame, so nothing is written into a column-subset of the working frame
    # (which is what triggers pandas' SettingWithCopyWarning).
    output_cols = ['year', 'stage', 'team', 'standing', 'points',
                   'goals_scored', 'goals_conceded', 'goals_difference',
                   'total_matches', 'tiebreaker', 'tie_won']
    int_cols = ['goals_scored', 'goals_conceded', 'points', 'goals_difference', 'total_matches', 'standing']

    return all_games_before_last[output_cols].astype({col: int for col in int_cols})



# last day league standing and changes

In [13]:
def fifa_final_wc(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    """
    Replay the last match day of one group goal by goal and track how the standings move.

    Starting from the standings before the last match day (every last-day match counted
    as a 0-0 draw), each goal of the group is applied in (half_time, goal_minute) order.
    After every goal the live points, totals, tie-breaks and standings are recomputed and
    the per-position counters '1st'..'4th' are incremented for every team whose position
    changed.

    Parameters:
    - year, stage: select the group in both input frames.
    - all_games_before_last: standings before the last match day. Columns read: 'year',
      'stage', 'team', 'standing', 'points', 'goals_scored', 'goals_conceded',
      'goals_difference', 'total_matches', 'tie_won'.
    - goals_last_day_sorted: one row per last-day goal. Columns read: 'year', 'stage',
      'half_time', 'goal_minute', 'home_team', 'away_team', 'scorer_nationality',
      'short_date', 'local_time'.
    - agg_goals_before_last_day: forwarded unchanged to `tiebreaker_after`.

    Returns:
    - DataFrame with one row per team of the group, ordered by final live standing,
      holding the before/last-game/total figures, 'last_game_standing', the position
      counters, 'changes' (sum of the counters), 'tied' and 'tie_won'.

    Every intermediate state is printed.
    """
    # NOTE(review): the previous docstring described this as a UEFA Euro routine while the
    # function name says FIFA World Cup - confirm which competitions call it.
    win_points = 2 if year <= 1992 else 3  # 2 points per win up to 1992, 3 afterwards
    position_columns = {1: '1st', 2: '2nd', 3: '3rd', 4: '4th'}

    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) &
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) &
        (goals_last_day_sorted['stage'] == stage)
    ]

    # Step 2: Initialize columns for tracking team performance
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']

    # Filled in only for teams that score on the last day
    group_goals_tracking['date'] = None
    group_goals_tracking['time'] = None

    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

    # Every match starts 0-0, i.e. as a draw worth one point. Recording that point in
    # last_game_points keeps total = before + last true even when no goal is scored.
    group_goals_tracking['last_game_points'] = 1
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

    # Remove unnecessary columns
    group_goals_tracking = group_goals_tracking.drop(columns=['goals_difference', 'goals_scored', 'goals_conceded', 'standing', 'points',
                                                             'total_matches'])

    # Initialize last_game_standing to the initial standings
    group_goals_tracking['last_game_standing'] = group_goals_tracking['before_last_game_standing']

    # Position counters start at 1 for the position each team holds before kick-off
    for position, column in position_columns.items():
        group_goals_tracking[column] = (group_goals_tracking['before_last_game_standing'] == position).astype(int)

    group_goals_tracking['changes'] = 0  # Recomputed after every goal as the sum of 1st..4th
    group_goals_tracking['tied'] = False  # Flag for teams sharing their points total with another team

    # Sort by 'half_time' first and then by 'goal_minute'
    group_goals_last_day = group_goals_last_day.sort_values(by=['half_time', 'goal_minute'], ascending=[True, True])

    # Print the year, stage, and standings before starting the loop for last match goals
    print(f"\n=== Initial Standings for Year {year}, {stage} Before Last Match Goals ===\n")
    display_columns = ['team', 'total_points', 'total_goals_scored', 'total_goals_conceded',
                       'total_goals_difference', 'before_last_game_points', 'before_last_game_standing']
    print(group_goals_tracking[display_columns].to_string(index=False))
    print("\n====================================================\n")

    # Step 4: Replay the goals. previous_standings is keyed by the frame's index labels,
    # which survive the re-sorting done after every goal.
    previous_standings = group_goals_tracking['last_game_standing'].copy()

    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        player_team = goal['scorer_nationality']

        is_scorer = group_goals_tracking['team'] == player_team
        group_goals_tracking.loc[is_scorer, 'date'] = goal['short_date']
        group_goals_tracking.loc[is_scorer, 'time'] = goal['local_time']

        # Print goal information for each goal
        print(f"Analyzing goal: {goal['goal_minute']} minute, {goal['half_time']} half time, Player team: {player_team}, Home: {home_team}, Away: {away_team}")

        # Credit the goal; a scorer team matching neither side leaves the score untouched
        if player_team == home_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
        elif player_team == away_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1

        # Step 5: Update total_goals_scored, total_goals_conceded, and total_goals_difference
        group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored'] + group_goals_tracking['last_game_goals_scored']
        group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded'] + group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Step 6: Live points of the last game from the current score (loss = 0)
        live_scored = group_goals_tracking['last_game_goals_scored']
        live_conceded = group_goals_tracking['last_game_goals_conceded']
        group_goals_tracking['last_game_points'] = 0
        group_goals_tracking.loc[live_scored == live_conceded, 'last_game_points'] = 1
        group_goals_tracking.loc[live_scored > live_conceded, 'last_game_points'] = win_points

        # Step 7: Update total points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Step 8a: Mark teams that are tied based on total points
        group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)
        print("\n=== Teams with Identical Points (Tied Teams) ===\n")
        print(group_goals_tracking[group_goals_tracking['tied']][['team', 'total_points']])

        # Reset `tie_won` to 0 for all teams
        group_goals_tracking['tie_won'] = 0

        # Step 8b: Process ties only if tied teams exist
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        if not tied_teams.empty:
            print("\n=== Evaluating Head-to-Head for Tied Teams ===\n")

            # Iterate through all pairs of tied teams
            for i, row1 in tied_teams.iterrows():
                for j, row2 in tied_teams.iterrows():
                    if i >= j:  # Compare each pair only once
                        continue
                    # 'tied' also flags teams from a different tie (e.g. two teams on 4
                    # points and two on 1). Only teams level with each other may earn
                    # head-to-head credit, otherwise tie_won is skewed by unrelated results.
                    if row1['total_points'] != row2['total_points']:
                        continue

                    print(f"Checking tie between: {row1['team']} and {row2['team']}")

                    # Apply tiebreaker function
                    tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                    if tiebreak_result != 'tie':
                        print(f"Tiebreak Result: Winner is {tiebreak_result}")
                    else:
                        print(f"No winner in tiebreak between {row1['team']} and {row2['team']}")

                    # Update the `tie_won` column based on the results
                    group_goals_tracking.loc[group_goals_tracking['team'] == row1['team'], 'tie_won'] += tie_won_row1
                    group_goals_tracking.loc[group_goals_tracking['team'] == row2['team'], 'tie_won'] += tie_won_row2

        # Step 8c: Sort teams by tie-breaking criteria
        group_goals_tracking = group_goals_tracking.sort_values(
            by=['total_points', 'total_goals_difference', 'total_goals_scored', 'tie_won'],
            ascending=[False, False, False, False]
        )

        # Step 9: Update standings (row order after sorting is the ranking)
        group_goals_tracking['last_game_standing'] = range(1, len(group_goals_tracking) + 1)
        print("\n=== Updated Standings After Sorting and Tie Break ===\n")
        print(group_goals_tracking[['team', 'last_game_standing', 'total_points', 'total_goals_difference', 'tie_won']])

        # Step 10: Count a visit to a position for every team whose standing just changed.
        # Before the first change previous_standings equals before_last_game_standing,
        # so a single comparison covers the first goal as well.
        for idx, row in group_goals_tracking.iterrows():
            new_standing = row['last_game_standing']
            if new_standing == previous_standings.loc[idx]:
                continue
            counter_column = position_columns.get(new_standing)  # positions beyond 4th are not counted
            if counter_column is not None:
                group_goals_tracking.loc[group_goals_tracking['team'] == row['team'], counter_column] += 1

        # Update previous standings after each goal
        previous_standings = group_goals_tracking['last_game_standing'].copy()

        # Step 11: Calculate changes as the sum of 1st, 2nd, 3rd, and 4th
        group_goals_tracking['changes'] = group_goals_tracking[['1st', '2nd', '3rd', '4th']].sum(axis=1)

        # Step 12: Print the updated group_goals_tracking after processing each goal
        print("\n=== Updated Standings After This Goal ===\n")
        display_columns = ['team', 'date', 'time', 'total_points', 'total_goals_scored', 'total_goals_conceded',
                           'total_goals_difference', 'last_game_points', 'last_game_standing',
                           'changes', '1st', '2nd', '3rd', '4th', 'tied', 'tie_won']
        print(group_goals_tracking[display_columns].to_string(index=False))
        print("\n========================================\n")

    # Step 13: Return the final DataFrame
    return group_goals_tracking


In [14]:
# def track_composition_changes(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
#     # Step 1: Filter the data for the specific year and stage
#     group_goals_tracking = all_games_before_last[
#         (all_games_before_last['year'] == year) & 
#         (all_games_before_last['stage'] == stage)
#     ].copy()

#     group_goals_last_day = goals_last_day_sorted[
#         (goals_last_day_sorted['year'] == year) & 
#         (goals_last_day_sorted['stage'] == stage)
#     ]

#     # Initialize columns for team performance and standings
#     group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
#     group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
#     group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
#     group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
#     group_goals_tracking['last_game_goals_scored'] = 0
#     group_goals_tracking['last_game_goals_conceded'] = 0
#     group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
#     group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
#     group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
#     group_goals_tracking['last_game_points'] = 0
#     group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
#     group_goals_tracking['last_game_standing'] = 0
#     group_goals_tracking['tie_won'] = 0  # Initialize tie_won for tiebreak resolution

#     # Add one point to each team for a 0-0 starting score
#     group_goals_tracking['total_points'] += 1

#     # Remove unecessary columns
#     group_goals_tracking = group_goals_tracking.drop(columns=['goals_difference','goals_scored', 'goals_conceded', 'standing', 'points',
#                                                              'total_matches'])


#     # Define top standings limit based on the year (3 for 1992 and earlier, 2 for 1994 and later) for World Cup  *************---------------------------------CHANGE----------------------------------------------*******************
#     top_standings_limit = 3 if year <= 1994 else 2 

#     # Define top standings limit based on the year (3 for 2016 and later, 2 for 2014 and earlier) for Euros
#     # top_standings_limit = 3 if year >= 2016 else 2

#     # Print initial standings
#     print(f"\n=== Initial Standings for {stage}, {year} (Goal Time = 0) ===")
#     print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 
#                                 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
#     print("\n========================================\n")

#     # Step 2: Initialize composition tracking with initial composition (change_num = 0)
#     initial_top_teams = set(
#         group_goals_tracking[group_goals_tracking['before_last_game_standing'] <= top_standings_limit]['team']
#     )

#     composition_changes = [{
#         'year': year,
#         'stage': stage,
#         'change_num': 0,
#         'goal_time': 0,
#         'home_team': None,
#         'away_team': None,
#         'scorer_team': None,
#         'new_top_teams': list(initial_top_teams),
#         '1st': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 1, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 1).any() else None,
#         '2nd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 2, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 2).any() else None,
#         '3rd': group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 3, 'team'].values[0] 
#                 if (group_goals_tracking['before_last_game_standing'] == 3).any() else None,
#         'changed': 0
#     }]

#     change_counter = 0  # Counter for the number of composition changes

#     # Step 3: Sort goals by regulation time
#     group_goals_last_day = group_goals_last_day.sort_values(by=['goal_minute'])

#     # Step 4: Iterate through each goal and track changes in composition
#     for _, goal in group_goals_last_day.iterrows():
#         home_team = goal['home_team']
#         away_team = goal['away_team']
#         scorer_team = goal['scorer_nationality']

#         # Update scores based on who scored the goal
#         if scorer_team == home_team:
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
#         elif scorer_team == away_team:
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
#             group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1

#         # Update goal difference
#         group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

#         # Step 5: Update last_game_points based on the current game state
#         group_goals_tracking['last_game_points'] = group_goals_tracking.apply(
#             lambda row: (2 if year <= 1992 else 3) if row['last_game_goals_scored'] > row['last_game_goals_conceded'] 
#             else (1 if row['last_game_goals_scored'] == row['last_game_goals_conceded'] else 0), 
#             axis=1
#         )

#         # Calculate total points by adding last game points to before_last_game_points
#         group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

#         # Step 5b: Evaluate ties based on total points
#         group_goals_tracking['tied'] = group_goals_tracking.duplicated(subset=['total_points'], keep=False)

#         # Select the tied teams
#         tied_teams = group_goals_tracking[group_goals_tracking['tied']]

#         # Reset `tie_won` column to 0 for all teams
#         group_goals_tracking['tie_won'] = 0

#         if not tied_teams.empty:
#             print("\n=== Evaluating Head-to-Head for Tied Teams ===\n")

#             # Iterate through all pairs of tied teams
#             for i, row1 in tied_teams.iterrows():
#                 for j, row2 in tied_teams.iterrows():
#                     if i < j:  # Compare each pair only once
#                         team1 = row1['team']
#                         team2 = row2['team']

#                         print(f"Checking tie between: {team1} and {team2}")

#                         # Apply tiebreaker function
#                         tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

#                         if tiebreak_result != 'tie':
#                             print(f"Tiebreak Result: Winner is {tiebreak_result}")
#                         else:
#                             print(f"No winner in tiebreak between {team1} and {team2}")

#                         # Update the `tie_won` column based on the results
#                         group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tie_won'] += tie_won_row1
#                         group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tie_won'] += tie_won_row2

#         # Sort teams by updated points and tie-breaking criteria
#         group_goals_tracking = group_goals_tracking.sort_values(
#             by=['total_points', 'total_goals_difference', 'total_goals_scored', 'tie_won'],
#             ascending=[False, False, False, False]
#         )
        
#         # Update standings after sorting
#         group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

#         # Print standings after each goal
#         print(f"\n=== Standings after goal at minute {goal['goal_minute']} in {stage}, edition {year} ===")
#         print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
#         print("\n========================================\n")

#         # Track top teams and composition changes
#         current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
#         changed = int(current_top_teams != initial_top_teams)

#         if changed:
#             change_counter += 1
#             initial_top_teams = current_top_teams

#         composition_changes.append({
#             'year': year,
#             'stage': stage,
#             'change_num': change_counter,
#             'goal_time': goal['goal_minute'],
#             'home_team': home_team,
#             'away_team': away_team,
#             'scorer_team': scorer_team,
#             'new_top_teams': list(current_top_teams),
#             '1st': group_goals_tracking.iloc[0]['team'],
#             '2nd': group_goals_tracking.iloc[1]['team'] if len(group_goals_tracking) > 1 else None,
#             '3rd': group_goals_tracking.iloc[2]['team'] if len(group_goals_tracking) > 2 else None,
#             'changed': changed,
#         })

#     return pd.DataFrame(composition_changes)


# gap between qualifying and not qualifying 

## third team tracking

In [15]:
third_place_tracking = []  # Module-level accumulator for third-place snapshots


def track_third_place_team(group_goals_tracking, year, stage, goal_minute, half_time, date, time):
    """
    Snapshot the team currently in third position of a group table.

    The third row (position 2) of `group_goals_tracking` is taken as the third-placed
    team, so the frame is expected to be ordered by standing already. The caller supplies
    the match date and time, which are copied into the result unchanged.

    Returns:
    - A dictionary with the keys year, stage, goal_minute, half_time, third_team,
      total_points, total_goals_difference, total_goals_scored, date and time.
      The four team-related entries are None when the table has fewer than three rows.
    """
    # Start from the "no third team" record and fill in the team figures when available
    snapshot = {
        'year': year,
        'stage': stage,
        'goal_minute': goal_minute,
        'half_time': half_time,
        'third_team': None,
        'total_points': None,
        'total_goals_difference': None,
        'total_goals_scored': None,
        'date': date,
        'time': time,
    }

    if len(group_goals_tracking) >= 3:
        bronze_row = group_goals_tracking.iloc[2]
        team_name = bronze_row['team']
        points = bronze_row['total_points']
        goal_diff = bronze_row['total_goals_difference']
        goals_for = bronze_row['total_goals_scored']

        print(
            f"[DEBUG] Saving Third Team - Year: {year}, Stage: {stage}, Goal Minute: {goal_minute}, "
            f"Team: {team_name}, Points: {points}, "
            f"Goal Difference: {goal_diff}, Goals Scored: {goals_for}, "
            f"Date: {date}, Time: {time}"
        )

        snapshot['third_team'] = team_name
        snapshot['total_points'] = points
        snapshot['total_goals_difference'] = goal_diff
        snapshot['total_goals_scored'] = goals_for
    else:
        print(f"[DEBUG] Year: {year}, Stage: {stage}, Goal Minute: {goal_minute}, No third team available.")

    return snapshot



In [16]:
def gap_composition(year, stage, all_games_before_last, goals_last_day_sorted, agg_goals_before_last_day):
    """
    Replays the last matchday of one group goal by goal and records, after every goal,
    the live standings, the set of qualifying teams and the gap to the first
    non-qualifying team.

    Parameters:
    - year, stage: identify the group to process (matched against the 'year' and 'stage'
      columns of both input frames).
    - all_games_before_last: standings before the last matchday. Columns read here:
      'year', 'stage', 'team', 'goals_scored', 'goals_conceded', 'standing', 'points';
      'goals_difference', 'total_matches' and 'tiebreaker' must exist because they are dropped.
    - goals_last_day_sorted: one row per goal of the last matchday. Columns read here:
      'year', 'stage', 'half_time', 'goal_minute', 'home_team', 'away_team',
      'scorer_nationality', 'short_date', 'local_time'.
    - agg_goals_before_last_day: passed through unchanged to the tie-break helpers
      (`tiebreaker_after`, `resolve_three_way_tie`, `resolve_four_way_tie`).

    Returns:
    - (composition_changes_df, third_place_df): the per-goal composition log (first row is
      the pre-kick-off state with goal_minute = 0) and a DataFrame built from the global
      `third_place_tracking` list.

    Side effects:
    - Appends one record per goal to the global `third_place_tracking` list; the list is
      not cleared here, so `third_place_df` also contains records from earlier calls.
    - Prints verbose progress/debug output.

    Notes:
    - The number of qualifying places is hard-coded for the Euros (3 from 2016 on, else 2);
      see the TOGGLE comment below for the World Cup variant.
    - A group without any last-day goal still returns the initial row (the DataFrames are
      built after the loop, not inside it).
    """
    # Step 1: Filter the data for the specific year and stage
    group_goals_tracking = all_games_before_last[
        (all_games_before_last['year'] == year) &
        (all_games_before_last['stage'] == stage)
    ].copy()

    group_goals_last_day = goals_last_day_sorted[
        (goals_last_day_sorted['year'] == year) &
        (goals_last_day_sorted['stage'] == stage)
    ]

    # Initialize columns for team performance and standings
    group_goals_tracking['before_last_game_goals_scored'] = group_goals_tracking['goals_scored']
    group_goals_tracking['before_last_game_goals_conceded'] = group_goals_tracking['goals_conceded']
    group_goals_tracking['before_last_game_standing'] = group_goals_tracking['standing']
    group_goals_tracking['before_last_game_points'] = group_goals_tracking['points']
    group_goals_tracking['last_game_goals_scored'] = 0
    group_goals_tracking['last_game_goals_conceded'] = 0
    group_goals_tracking['total_goals_scored'] = group_goals_tracking['before_last_game_goals_scored']
    group_goals_tracking['total_goals_conceded'] = group_goals_tracking['before_last_game_goals_conceded']
    group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']
    group_goals_tracking['last_game_points'] = 0
    group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points']
    group_goals_tracking['last_game_standing'] = 0
    group_goals_tracking['tie_won'] = 0  # Pairwise tie-break wins (two-team ties)
    group_goals_tracking['three_tie'] = 0  # Rank score inside a three-way tie
    group_goals_tracking['four_tie'] = 0  # Rank score inside a four-way tie

    # Add one point to each team for a 0-0 starting score
    group_goals_tracking['total_points'] += 1

    # Remove unnecessary columns
    group_goals_tracking = group_goals_tracking.drop(columns=['goals_difference', 'goals_scored', 'goals_conceded', 'standing', 'points',
                                                             'total_matches', 'tiebreaker'])

    # Define top standings limit based on the year (3 for 1992 and earlier, 2 for 1994 and later) for World Cup  *************---------------------------------TOGGLE----------------------------------------------*******************
    # top_standings_limit = 3 if year <= 1994 else 2

    # Define top standings limit based on the year (3 for 2016 and later, 2 for 2014 and earlier) for Euros
    top_standings_limit = 3 if year >= 2016 else 2

    # Print initial standings
    print(f"\n=== STEP 1: Initial Standings for {stage}, {year} (Goal Time = 0) ===")
    print(group_goals_tracking[['team','total_points', 'total_goals_scored', 
                                'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
    print("\n========================================\n")

    # Initialize composition tracking with initial composition (change_num = 0)
    initial_top_teams = set(
        group_goals_tracking[group_goals_tracking['before_last_game_standing'] <= top_standings_limit]['team']
    )

    # Get safe values for each rank (1st to 4th), fallback to None if not present
    first_team = group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 1]
    second_team = group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 2]
    third_team = group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 3]
    fourth_team = group_goals_tracking.loc[group_goals_tracking['before_last_game_standing'] == 4]

    # NOTE: the key order below defines the column order of the returned DataFrame.
    composition_changes = [{
        'year': year,
        'stage': stage,
        'change_num': 0,
        'goal_minute': 0,
        'home_team': None,
        'away_team': None,
        'scorer_team': None,
        'new_top_teams': list(initial_top_teams),

        '1st': first_team['team'].values[0] if not first_team.empty else None,
        '1st_points': first_team['total_points'].values[0] if not first_team.empty else None,
        '1st_goals_diff': first_team['total_goals_difference'].values[0] if not first_team.empty else None,
        '1st_goals_scored': first_team['total_goals_scored'].values[0] if not first_team.empty else None,
        '1st_last_game_points': first_team['last_game_points'].values[0] if not first_team.empty else None,
        '1st_last_game_goals_diff': (
            (first_team['last_game_goals_scored'].values[0] - first_team['last_game_goals_conceded'].values[0])
            if not first_team.empty else None
        ),

        '2nd': second_team['team'].values[0] if not second_team.empty else None,
        '2nd_points': second_team['total_points'].values[0] if not second_team.empty else None,
        '2nd_goals_diff': second_team['total_goals_difference'].values[0] if not second_team.empty else None,
        '2nd_last_game_points': second_team['last_game_points'].values[0] if not second_team.empty else None,
        '2nd_goals_scored': second_team['total_goals_scored'].values[0] if not second_team.empty else None,
        '2nd_last_game_goals_diff': (
            (second_team['last_game_goals_scored'].values[0] - second_team['last_game_goals_conceded'].values[0])
            if not second_team.empty else None
        ),

        '3rd': third_team['team'].values[0] if not third_team.empty else None,
        '3rd_points': third_team['total_points'].values[0] if not third_team.empty else None,
        '3rd_goals_diff': third_team['total_goals_difference'].values[0] if not third_team.empty else None,
        '3rd_goals_scored': third_team['total_goals_scored'].values[0] if not third_team.empty else None,
        '3rd_last_game_points': third_team['last_game_points'].values[0] if not third_team.empty else None,
        '3rd_last_game_goals_diff': (
            (third_team['last_game_goals_scored'].values[0] - third_team['last_game_goals_conceded'].values[0])
            if not third_team.empty else None
        ),

        '4th': fourth_team['team'].values[0] if not fourth_team.empty else None,
        '4th_points': fourth_team['total_points'].values[0] if not fourth_team.empty else None,
        '4th_goals_diff': fourth_team['total_goals_difference'].values[0] if not fourth_team.empty else None,
        '4th_goals_scored': fourth_team['total_goals_scored'].values[0] if not fourth_team.empty else None,
        '4th_last_game_points': fourth_team['last_game_points'].values[0] if not fourth_team.empty else None,
        '4th_last_game_goals_diff': (
            (fourth_team['last_game_goals_scored'].values[0] - fourth_team['last_game_goals_conceded'].values[0])
            if not fourth_team.empty else None
        ),

        'changed': 0,
        'points_diff': 0,
        'goals_diff': 0
    }]

    change_counter = 0  # Counter for the number of composition changes

    # Sort by 'half_time' first and then by 'goal_minute'
    group_goals_last_day = group_goals_last_day.sort_values(by=['half_time', 'goal_minute'], ascending=[True, True])

    # Iterate through each goal and track changes in composition
    for _, goal in group_goals_last_day.iterrows():
        home_team = goal['home_team']
        away_team = goal['away_team']
        scorer_team = goal['scorer_nationality']

        # Update scores based on who scored the goal
        if scorer_team == home_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_conceded'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_conceded'] += 1
        elif scorer_team == away_team:
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'last_game_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == away_team, 'total_goals_scored'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'last_game_goals_conceded'] += 1
            group_goals_tracking.loc[group_goals_tracking['team'] == home_team, 'total_goals_conceded'] += 1

        # Update goal difference
        group_goals_tracking['total_goals_difference'] = group_goals_tracking['total_goals_scored'] - group_goals_tracking['total_goals_conceded']

        # Update last_game_points based on the current game state
        # (a win is worth 2 points up to 1992 and 3 points afterwards)
        group_goals_tracking['last_game_points'] = group_goals_tracking.apply(
            lambda row: (2 if year <= 1992 else 3) if row['last_game_goals_scored'] > row['last_game_goals_conceded'] 
            else (1 if row['last_game_goals_scored'] == row['last_game_goals_conceded'] else 0), 
            axis=1
        )

        # Calculate total points by adding last game points to before_last_game_points
        group_goals_tracking['total_points'] = group_goals_tracking['before_last_game_points'] + group_goals_tracking['last_game_points']

        # Evaluate ties based on total points
        group_goals_tracking['tied'] = group_goals_tracking.groupby('total_points')['team'].transform('size') > 1

        # Select the tied teams
        tied_teams = group_goals_tracking[group_goals_tracking['tied']]

        # Count the number of tied teams
        num_tied_teams = tied_teams['team'].nunique()

        print(f"\n=== Tied after goal at minute {goal['goal_minute']} {goal['half_time']} half time by {goal['scorer_nationality']} in {stage}, edition {year} ===")
        print(f"Number of tied teams: {num_tied_teams}")
        print(tied_teams[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference']])

        # Reset `tie` columns to 0 for all teams
        group_goals_tracking['tie_won'] = 0
        group_goals_tracking['three_tie'] = 0
        group_goals_tracking['four_tie'] = 0

        if not tied_teams.empty:
            print("\n=== STEP 2: Evaluating Head-to-Head for Tied Teams ===\n")

            # Group tied teams by total_points and process each group separately
            for points, tied_group in tied_teams.groupby('total_points'):

                if len(tied_group) == 4:  # Handle four-way tie
                    print(f"\n=== Resolving Four-Way Tie for {points} points ===")
                    resolved_ranking = resolve_four_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking)

                    # Assign `four_tie` values: best-ranked team gets the highest value
                    for rank, team in enumerate(resolved_ranking, start=1):
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, 'four_tie'] = 5 - rank

                elif len(tied_group) == 3:  # Handle three-way tie
                    print(f"\n=== Resolving Three-Way Tie for {points} points ===")
                    resolved_ranking = resolve_three_way_tie(tied_group, agg_goals_before_last_day, group_goals_tracking)

                    # Assign `three_tie` values: best-ranked team gets the highest value
                    for rank, team in enumerate(resolved_ranking, start=1):
                        group_goals_tracking.loc[group_goals_tracking['team'] == team, 'three_tie'] = 4 - rank

                else:  # Remaining tie sizes (two teams in a four-team group): pairwise tie-break
                    print(f"\n=== Checking ties for teams with {points} points ===")

                    # Sort tied_group by additional criteria to ensure proper order
                    tied_group = tied_group.sort_values(by=['total_goals_difference', 'total_goals_scored'], ascending=False)

                    # Iterate through all pairs of tied teams in the current group
                    for i, row1 in tied_group.iterrows():
                        for j, row2 in tied_group.iterrows():
                            if i < j:  # Compare each pair only once (ordered by index label)
                                team1 = row1['team']
                                team2 = row2['team']

                                print(f"Checking tie between: {team1} and {team2}")

                                # Apply tiebreaker function
                                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                                if tiebreak_result != 'tie':
                                    print(f"Tiebreak Result: Winner is {tiebreak_result}")
                                else:
                                    print(f"No winner in tiebreak between {team1} and {team2}")

                                # Update the `tie_won` column based on the results
                                group_goals_tracking.loc[group_goals_tracking['team'] == team1, 'tie_won'] += tie_won_row1
                                group_goals_tracking.loc[group_goals_tracking['team'] == team2, 'tie_won'] += tie_won_row2

        # Sort teams by updated points and tie-breaking criteria
        group_goals_tracking = group_goals_tracking.sort_values(
            by=['total_points','four_tie','three_tie','total_goals_difference','total_goals_scored','tie_won'],
            ascending=[False, False, False, False, False, False]
        )

        # Update standings after sorting (1 = top of the sorted table)
        group_goals_tracking['last_game_standing'] = group_goals_tracking.reset_index(drop=True).index + 1

        # Track third-place team
        third_place_info = track_third_place_team(group_goals_tracking, year, stage, goal['goal_minute'], goal['half_time'],
                                                  goal['short_date'], goal['local_time'])
        # Append third-place info to a global list for separate tracking
        third_place_tracking.append(third_place_info)

        # Debugging: Print the last saved entry
        print(f"[DEBUG] Third Place Tracking Updated: {third_place_tracking[-1]}")

        # Print standings after each goal
        print(f"\n=== STEP 3: Standings after goal at minute {goal['goal_minute']} {goal['half_time']} half time by {goal['scorer_nationality']} in {stage}, edition {year} ===")
        print(group_goals_tracking[['team', 'total_points', 'total_goals_scored', 'total_goals_conceded', 'total_goals_difference', 'before_last_game_standing']].to_string(index=False))
        print("\n========================================\n")

        # Stays None unless the "one more goal" tie-break of STEP 4 is evaluated for this goal
        tiebreak_result = None

        # Calculate points_diff and goals_diff, then apply the tiebreaker if conditions are met
        if top_standings_limit == 3:
            # Calculate points_diff for top_standings_limit == 3
            points_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_points'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'total_points'].values[0]

            # Calculate goals_diff for top_standings_limit == 3
            goals_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_goals_difference'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'total_goals_difference'].values[0]

            # points_last == 1 means the fourth-placed team is currently drawing its match
            points_last = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'last_game_points'].values[0]

            # Three-points-win system: one more goal by the drawing 4th team would level the points
            if (points_diff == 2 and goals_diff <= 1 and points_last == 1):
                print("\n=== STEP 4: Applying Potential Tiebreaker for 3nd and 4th Place ===\n")
                team_3rd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'team'].values[0]
                team_4th = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 4, 'team'].values[0]

                # Get rows for the 3rd and 4th teams
                row1 = group_goals_tracking[group_goals_tracking['team'] == team_3rd].iloc[0]
                row2_index = group_goals_tracking[group_goals_tracking['team'] == team_4th].index[0]  # Get index for row2

                # Save original values for resetting later
                original_goals_scored_row2 = group_goals_tracking.loc[row2_index, 'total_goals_scored']
                original_goals_difference_row2 = group_goals_tracking.loc[row2_index, 'total_goals_difference']

                # Temporarily give row2 one extra goal to simulate the hypothetical next goal
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] += 1
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] += 1

                # Retrieve updated row2 after incrementing values
                row2 = group_goals_tracking.loc[row2_index]

                # Apply tiebreaker
                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                # Reset total_goals_scored and total_goals_difference to their original values
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] = original_goals_scored_row2
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] = original_goals_difference_row2

        elif top_standings_limit == 2:
            # Calculate points_diff for top_standings_limit == 2
            points_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'total_points'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_points'].values[0]

            # Calculate goals_diff for top_standings_limit == 2
            goals_diff = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'total_goals_difference'].values[0] - \
                        group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'total_goals_difference'].values[0]

            # points_last == 1 means the third-placed team is currently drawing its match
            points_last = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'last_game_points'].values[0]

            # Two-points-win system (up to 1992): a 1-point gap closes with one goal;
            # three-points-win system (after 1992): a 2-point gap closes with one goal
            if (points_diff == 1 and goals_diff <= 1 and points_last == 1 and year <= 1992) or \
                    (points_diff == 2 and goals_diff <= 1 and points_last == 1 and year > 1992):
                print("\n=== STEP 4: Applying Potential Tiebreaker for 2nd and 3rd Place ===\n")
                team_2nd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 2, 'team'].values[0]
                team_3rd = group_goals_tracking.loc[group_goals_tracking['last_game_standing'] == 3, 'team'].values[0]

                # Get rows for the 2nd and 3rd teams
                row1 = group_goals_tracking[group_goals_tracking['team'] == team_2nd].iloc[0]
                row2_index = group_goals_tracking[group_goals_tracking['team'] == team_3rd].index[0]  # Get index for row2

                # Save original values for resetting later
                original_goals_scored_row2 = group_goals_tracking.loc[row2_index, 'total_goals_scored']
                original_goals_difference_row2 = group_goals_tracking.loc[row2_index, 'total_goals_difference']

                # Temporarily give row2 one extra goal to simulate the hypothetical next goal
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] += 1
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] += 1

                # Retrieve updated row2 after incrementing values
                row2 = group_goals_tracking.loc[row2_index]

                # Apply tiebreaker
                tiebreak_result, tie_won_row1, tie_won_row2 = tiebreaker_after(row1, row2, agg_goals_before_last_day)

                # Reset total_goals_scored and total_goals_difference to their original values
                group_goals_tracking.loc[row2_index, 'total_goals_scored'] = original_goals_scored_row2
                group_goals_tracking.loc[row2_index, 'total_goals_difference'] = original_goals_difference_row2

        # Track top teams and composition changes (compared with the last recorded composition)
        current_top_teams = set(group_goals_tracking.nsmallest(top_standings_limit, 'last_game_standing')['team'])
        changed = int(current_top_teams != initial_top_teams)

        if changed:
            change_counter += 1
            initial_top_teams = current_top_teams

        composition_changes.append({
            'year': year,
            'stage': stage,
            'date': goal['short_date'],
            'time': goal['local_time'],
            'change_num': change_counter,
            'goal_minute': goal['goal_minute'],
            'half_time': goal['half_time'],
            'home_team': home_team,
            'away_team': away_team,
            'scorer_team': scorer_team,
            'new_top_teams': list(current_top_teams),

            '1st': group_goals_tracking.iloc[0]['team'],
            '1st_points': group_goals_tracking.iloc[0]['total_points'],
            '1st_goals_diff': group_goals_tracking.iloc[0]['total_goals_difference'],
            '1st_goals_scored': group_goals_tracking.iloc[0]['total_goals_scored'],
            '1st_last_game_points': group_goals_tracking.iloc[0]['last_game_points'],
            '1st_last_game_goals_diff': (
                (group_goals_tracking.iloc[0]['last_game_goals_scored'] - group_goals_tracking.iloc[0]['last_game_goals_conceded'])
                if group_goals_tracking.shape[0] > 0 else None
            ),

            '2nd': group_goals_tracking.iloc[1]['team'] if group_goals_tracking.shape[0] > 1 else None,
            '2nd_points': group_goals_tracking.iloc[1]['total_points'] if group_goals_tracking.shape[0] > 1 else None,
            '2nd_goals_diff': group_goals_tracking.iloc[1]['total_goals_difference'] if group_goals_tracking.shape[0] > 1 else None,
            '2nd_goals_scored': group_goals_tracking.iloc[1]['total_goals_scored'] if group_goals_tracking.shape[0] > 1 else None,
            '2nd_last_game_points': group_goals_tracking.iloc[1]['last_game_points'] if group_goals_tracking.shape[0] > 1 else None,
            '2nd_last_game_goals_diff': (
                (group_goals_tracking.iloc[1]['last_game_goals_scored'] - group_goals_tracking.iloc[1]['last_game_goals_conceded'])
                if group_goals_tracking.shape[0] > 1 else None
            ),

            '3rd': group_goals_tracking.iloc[2]['team'] if group_goals_tracking.shape[0] > 2 else None,
            '3rd_points': group_goals_tracking.iloc[2]['total_points'] if group_goals_tracking.shape[0] > 2 else None,
            '3rd_goals_diff': group_goals_tracking.iloc[2]['total_goals_difference'] if group_goals_tracking.shape[0] > 2 else None,
            '3rd_goals_scored': group_goals_tracking.iloc[2]['total_goals_scored'] if group_goals_tracking.shape[0] > 2 else None,
            '3rd_last_game_points': group_goals_tracking.iloc[2]['last_game_points'] if group_goals_tracking.shape[0] > 2 else None,
            '3rd_last_game_goals_diff': (
                (group_goals_tracking.iloc[2]['last_game_goals_scored'] - group_goals_tracking.iloc[2]['last_game_goals_conceded'])
                if group_goals_tracking.shape[0] > 2 else None
            ),

            '4th': group_goals_tracking.iloc[3]['team'] if group_goals_tracking.shape[0] > 3 else None,
            '4th_points': group_goals_tracking.iloc[3]['total_points'] if group_goals_tracking.shape[0] > 3 else None,
            '4th_goals_diff': group_goals_tracking.iloc[3]['total_goals_difference'] if group_goals_tracking.shape[0] > 3 else None,
            '4th_goals_scored': group_goals_tracking.iloc[3]['total_goals_scored'] if group_goals_tracking.shape[0] > 3 else None,
            '4th_last_game_points': group_goals_tracking.iloc[3]['last_game_points'] if group_goals_tracking.shape[0] > 3 else None,
            '4th_last_game_goals_diff': (
                (group_goals_tracking.iloc[3]['last_game_goals_scored'] - group_goals_tracking.iloc[3]['last_game_goals_conceded'])
                if group_goals_tracking.shape[0] > 3 else None
            ),

            'changed': changed,
            'points_diff': points_diff,
            'goals_diff': goals_diff,
            'tiebreak_result': tiebreak_result
        })

    # Build the result frames once, AFTER the loop: doing it inside the loop left both
    # names undefined (UnboundLocalError) whenever the group had no last-day goals.
    composition_changes_df = pd.DataFrame(composition_changes)

    # Convert third-place tracking (global, accumulated across calls) into a DataFrame
    third_place_df = pd.DataFrame(third_place_tracking)

    return composition_changes_df, third_place_df


# best four third_placed

## European Championship

### men

In [17]:
def ensure_goal_minute_zero(third_place_df, all_games_before_last, min_year=2016, max_year=None):
    """
    Adds one `goal_minute = 0` baseline row per (year, stage) to `third_place_df`,
    taken from the team with `standing == 3` in `all_games_before_last`.

    Existing rows of `third_place_df` (including any `goal_minute = 0` rows already
    present) are kept as they are; calling the function twice on its own output
    therefore adds the baseline rows twice.

    Each baseline row copies the date and time of the first row found for the same
    (year, stage) in `third_place_df`; if there is none, both are set to None.

    Parameters:
    - third_place_df: DataFrame tracking third-placed teams after each goal.
    - all_games_before_last: DataFrame containing pre-last-matchday standings
      (columns used: 'year', 'stage', 'standing', 'team', 'points', 'goals_scored',
      'goals_conceded').
    - min_year: only editions with year >= min_year get a baseline row (None = no lower
      bound). The default 2016 reproduces the Euros setting.
    - max_year: only editions with year <= max_year get a baseline row (None = no upper
      bound). Use min_year=None, max_year=1994 for the World Cup setting.

    Returns:
    - third_place_df with the baseline rows appended, sorted by
      year, date, time, goal_minute and stage.
    """

    print("[INFO] Existing `goal_minute = 0` entries are kept; baseline rows are appended.")

    # Restrict the baseline standings to the requested range of editions
    year_column = all_games_before_last['year']
    in_range = pd.Series(True, index=all_games_before_last.index, dtype=bool)
    if min_year is not None:
        in_range &= year_column >= min_year
    if max_year is not None:
        in_range &= year_column <= max_year
    filtered_games = all_games_before_last[in_range].copy()

    print("[INFO] Creating `goal_minute = 0` entries from `all_games_before_last`")
    new_entries = []

    # A frame without the key columns (e.g. an empty tracking frame) has no date/time to copy
    can_lookup_group = {'year', 'stage'}.issubset(third_place_df.columns)

    # Get unique (year, stage) combinations from the filtered data
    unique_combinations = filtered_games[['year', 'stage']].drop_duplicates()

    for _, row in unique_combinations.iterrows():
        year, stage = row['year'], row['stage']

        # Retrieve third-placed team from the filtered data
        third_team_data = filtered_games[
            (filtered_games['year'] == year) &
            (filtered_games['stage'] == stage) &
            (filtered_games['standing'] == 3)
        ]

        if third_team_data.empty:
            print(f"[WARNING] No third-placed team found for Year {year}, Stage {stage} in `all_games_before_last`.")
            continue  # Skip if no data available

        # Extract relevant data
        third_team = third_team_data.iloc[0]  # Select first matching row

        # Get the common date and time from existing third_place_df for (year, stage)
        common_date = None
        common_time = None
        if can_lookup_group:
            group_data = third_place_df[(third_place_df['year'] == year) & (third_place_df['stage'] == stage)]
            if not group_data.empty:
                common_date = group_data['date'].iloc[0]
                common_time = group_data['time'].iloc[0]

        new_entries.append({
            'year': year,
            'stage': stage,
            'goal_minute': 0,
            'half_time': 1,  # Assume first half
            'third_team': third_team['team'],
            'total_points': third_team['points'],  # Assuming they do not gain 1 point at start
            'total_goals_difference': third_team['goals_scored'] - third_team['goals_conceded'],
            'total_goals_scored': third_team['goals_scored'],
            'date': common_date,  # Assign common date
            'time': common_time  # Assign common time
        })

    # Append new rows if any were created
    if new_entries:
        print(f"[INFO] Adding {len(new_entries)} new `goal_minute = 0` entries.")
        additions = pd.DataFrame(new_entries)
        if third_place_df.columns.empty:
            # Nothing to append to: the baseline rows are the whole result
            third_place_df = additions
        else:
            third_place_df = pd.concat([third_place_df, additions], ignore_index=True)

    # Ensure sorting by year, date, time, goal_minute and stage
    # (skipped only when there is no column at all to sort by)
    if not third_place_df.columns.empty:
        third_place_df = third_place_df.sort_values(
            by=['year', 'date', 'time', 'goal_minute', 'stage'], ascending=True
        ).reset_index(drop=True)

    print("[INFO] `goal_minute = 0` entries updated successfully.")

    return third_place_df


In [18]:
def initial_third_teams(third_place_df):
    """
    Builds one summary row per year from the `goal_minute = 0` rows of third_place_df.

    For each year the third-placed teams of all stages are ranked by total_points,
    total_goals_difference and total_goals_scored (all descending). The row contains:
    - 'year', 'goal_minute' (0), 'date' and 'time' (taken from the first row of the year),
    - 'change_flag' and 'change_count', both initialised to 0,
    - 'top_four': the first four ranked teams, 'last_two': the last two ranked teams,
    - 'tied_team': teams that are equal to a neighbour in the ranking on all three
      criteria, listed once each in ranking order,
    - one column per stage holding
      [third_team, total_points, total_goals_difference, total_goals_scored].

    Parameters:
    - third_place_df: DataFrame containing third-placed teams after each goal.

    Returns:
    - A DataFrame with a single row per year, sorted by year. It is empty (no columns)
      when third_place_df has no `goal_minute = 0` rows.
    """
    ranking_fields = ('total_points', 'total_goals_difference', 'total_goals_scored')

    def as_list(entry):
        # Fresh list per call so the stage columns and top_four/last_two never share objects
        return [entry['team']] + [entry[field] for field in ranking_fields]

    # Filter for goal_minute == 0
    filtered_df = third_place_df[third_place_df['goal_minute'] == 0].copy()

    structured_data = []
    for year, group in filtered_df.groupby('year'):
        stage_data = [
            {
                'stage': row['stage'],
                'team': row['third_team'],
                'total_points': row['total_points'],
                'total_goals_difference': row['total_goals_difference'],
                'total_goals_scored': row['total_goals_scored']
            }
            for _, row in group.iterrows()
        ]

        # Rank the third-placed teams (stable sort: full ties keep their input order)
        sorted_teams = sorted(
            stage_data,
            key=lambda x: (-x['total_points'], -x['total_goals_difference'], -x['total_goals_scored'])
        )

        # Neighbours in the ranking that are equal on every criterion are tied
        tied_teams = []
        for upper, lower in zip(sorted_teams, sorted_teams[1:]):
            if all(upper[field] == lower[field] for field in ranking_fields):
                tied_teams.append(upper['team'])
                tied_teams.append(lower['team'])

        # NOTE: the key order below defines the column order of the returned DataFrame.
        year_entry = {
            'year': year,
            'goal_minute': 0,
            'date': group['date'].iloc[0],  # Assuming same date for goal_minute = 0
            'time': group['time'].iloc[0],  # Assuming same time for goal_minute = 0
            'change_flag': 0,  # Default initialization
            'change_count': 0,  # Default initialization
            'top_four': [as_list(team) for team in sorted_teams[:4]],
            'last_two': [as_list(team) for team in sorted_teams[-2:]],
            # dict.fromkeys de-duplicates while keeping ranking order; a plain set()
            # would give an order that can differ between interpreter runs
            'tied_team': list(dict.fromkeys(tied_teams))
        }

        # One column per stage, added in ranking order
        for team in sorted_teams:
            year_entry[team['stage']] = as_list(team)

        structured_data.append(year_entry)

    # Convert to DataFrame
    structured_df = pd.DataFrame(structured_data)

    # Without any goal_minute = 0 row there is no 'year' column to sort by
    if structured_df.empty:
        return structured_df

    # Ensure sorted by year
    return structured_df.sort_values(by='year').reset_index(drop=True)


In [19]:
def third_teams(third_place_df, structured_df, team_priority):
    """
    Replay third-place events row by row and snapshot all groups after each one.

    For every row of `third_place_df` a record is produced that starts from the
    first `structured_df` row of the same year, carries forward the most recent
    value of each group column seen so far in that year, and overwrites the
    group named by the row's `stage` with
    `[third_team, total_points, total_goals_difference, total_goals_scored]`.
    Rows whose `goal_minute` equals 0 get one extra point added to that entry.
    The six groups are then ranked to fill `top_four`, `last_two` and
    `tied_teams` (team names only).

    Rows whose year has no match in `structured_df` are skipped with a warning
    and leave the state of the year being tracked untouched.

    Parameters:
    - third_place_df: DataFrame with one row per event; must provide `year`,
      `stage`, `goal_minute`, `half_time`, `date`, `time`, `third_team`,
      `total_points`, `total_goals_difference` and `total_goals_scored`.
    - structured_df: DataFrame with a `year` column and one column per group
      ('Group A' .. 'Group F') holding 4-item lists.
    - team_priority: Sequence of team names; earlier names win when points,
      goal difference and goals scored are all equal. Unlisted teams rank last.

    Returns:
    - A DataFrame with one row per processed event and the columns
      year, date, time, goal_minute, half_time, stage, Group A..Group F,
      top_four, last_two, tied_teams. Row labels are the `structured_df`
      label of the year's first row. `tied_teams` lists each tied team once,
      in ranking order.
    """
    group_columns = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E', 'Group F']
    column_order = [
        'year', 'date', 'time', 'goal_minute', 'half_time', 'stage',
        'Group A', 'Group B', 'Group C', 'Group D', 'Group E', 'Group F',
        'top_four', 'last_two', 'tied_teams'
    ]
    ranking_fields = ('total_points', 'total_goals_difference', 'total_goals_scored')

    # Build the priority lookup once; setdefault keeps the first position of a
    # repeated name, which is what list.index() would have returned.
    priority_rank = {}
    for position, name in enumerate(team_priority):
        priority_rank.setdefault(name, position)

    def ranking_key(entry):
        # Higher points / goal difference / goals scored first, then priority.
        return (
            -entry['total_points'],
            -entry['total_goals_difference'],
            -entry['total_goals_scored'],
            priority_rank.get(entry['team'], float('inf')),
        )

    history_records = []
    row_labels = []

    # State of the year currently being tracked.
    last_known_values = {}
    baseline = None
    baseline_label = None
    prev_year = None

    for _, row in third_place_df.iterrows():
        year = row['year']
        stage = row['stage']
        goal_minute = row['goal_minute']

        print(f"\nProcessing: Year={year}, Stage={stage}, Goal Minute={goal_minute}")

        # When the year changes, restart from the first structured_df row of that year
        if prev_year is None or year != prev_year:
            print(f"--- YEAR CHANGED: {year} --- Resetting group values to first row of structured_df")

            year_labels = structured_df[structured_df['year'] == year].index

            if len(year_labels) == 0:
                print(f"WARNING: No data found for year {year}. Skipping...")
                # The failed lookup is kept in its own variable so the baseline
                # of the year already being tracked is not clobbered.
                continue

            baseline_label = year_labels[0]
            baseline = structured_df.loc[baseline_label].to_dict()
            last_known_values = {col: baseline.get(col) for col in group_columns}

        # Every snapshot starts from the year's baseline row
        record = dict(baseline)
        record['stage'] = stage

        # Debugging print: Before updating groups
        print(f"Before Update: {dict(last_known_values)}")

        # Carry forward the latest value of each group
        for col in group_columns:
            if last_known_values.get(col) is not None:
                record[col] = last_known_values[col]

        # Overwrite the group this event belongs to
        if stage in record:
            stage_entry = [
                row['third_team'],
                row['total_points'],
                row['total_goals_difference'],
                row['total_goals_scored']
            ]

            # If goal_minute == 0, add +1 to total_points for the respective team
            if goal_minute == 0:
                stage_entry[1] += 1

            record[stage] = stage_entry
            last_known_values[stage] = stage_entry

        # Copy the event's own timing information
        record['goal_minute'] = row['goal_minute']
        record['half_time'] = row['half_time']
        record['date'] = row['date']
        record['time'] = row['time']

        # Rank the groups that currently hold a complete 4-item entry
        standings = []
        for col in group_columns:
            cell = record.get(col)
            if isinstance(cell, list) and len(cell) == 4:
                standings.append({
                    'stage': col,
                    'team': cell[0],
                    'total_points': cell[1],
                    'total_goals_difference': cell[2],
                    'total_goals_scored': cell[3]
                })
        standings.sort(key=ranking_key)

        # top_four and last_two hold **only team names**
        record['top_four'] = [entry['team'] for entry in standings[:4]]
        record['last_two'] = [entry['team'] for entry in standings[-2:]]

        # Neighbours in the ranking that are level on all three criteria are tied
        tied = []
        for upper, lower in zip(standings, standings[1:]):
            if all(upper[field] == lower[field] for field in ranking_fields):
                tied.append(upper['team'])
                tied.append(lower['team'])

        # dict.fromkeys de-duplicates while keeping ranking order, so the result
        # does not depend on set iteration order.
        record['tied_teams'] = list(dict.fromkeys(tied))

        # Debugging print: After updating groups
        print(f"After Update: { {col: record.get(col) for col in group_columns} }")

        history_records.append(record)
        row_labels.append(baseline_label)

        prev_year = year

    # One row per processed event, labelled like the structured_df baseline row
    if history_records:
        history_df = pd.DataFrame(history_records, index=row_labels)
    else:
        history_df = pd.DataFrame(history_records)

    # Retain only necessary columns and ensure correct order
    history_df = history_df.reindex(columns=column_order, fill_value=None)

    return history_df


In [20]:
def third_track(third_teams_df):
    """
    Flag and count changes in the membership of `top_four` within each year.

    The input is sorted by `year, date, time, half_time, goal_minute, stage`
    and re-indexed from 0. Rows are then compared with their predecessor:

    - `change_flag` is 1 when the row belongs to the same year as the previous
      row and its `top_four`, taken as a set (order ignored), differs from the
      previous one; otherwise it is 0.
    - `change_count` is the running number of flagged rows in the current
      year; it restarts at 0 whenever the year differs from the previous row.

    Parameters:
    - third_teams_df: DataFrame containing third-place standings over time;
      must provide the sort columns above and an iterable `top_four` per row.

    Returns:
    - A sorted copy of the input with `change_flag` and `change_count` appended.
    """
    sort_keys = ['year', 'date', 'time', 'half_time', 'goal_minute', 'stage']
    ordered = third_teams_df.sort_values(by=sort_keys).reset_index(drop=True)

    flags = []
    counts = []
    running_count = 0
    last_year = None
    last_members = None

    for year, names in zip(ordered['year'], ordered['top_four']):
        members = set(names)  # membership only; ordering inside top_four is irrelevant

        # A new year restarts the counter
        if last_year is not None and year != last_year:
            running_count = 0

        # Only rows with a predecessor in the same year can register a change
        changed = last_members is not None and last_year == year and last_members != members
        if changed:
            running_count += 1

        flags.append(1 if changed else 0)
        counts.append(running_count)

        last_members = members
        last_year = year

    # Integer defaults first so the columns exist (as integers) even for empty input
    ordered['change_flag'] = 0
    ordered['change_count'] = 0
    if flags:
        ordered['change_flag'] = flags
        ordered['change_count'] = counts

    return ordered


### women

In [21]:
def best_two_third_placed_eu_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    """
    Track the best two third-placed teams goal by goal (years 2009 and 2013).

    Every team first receives one provisional point for an assumed 0-0 result
    in its last game. For each year an initial row (goal_time 0) is emitted
    from the pre-match `standing`, followed by one row per goal. After each
    goal the scoring and conceding teams' last-game tallies are updated, each
    group is re-ranked (points, `tied_won`, goal difference, goals scored) and
    the third-placed teams are compared across groups by points, goal
    difference and goals scored.

    Every goal is applied to the teams involved, whichever group it belongs to.
    `changes` increments only when the membership of the best two changes; a
    mere reordering of the same two teams does not count.

    Parameters:
    - goals_last_day_sorted: DataFrame of last-day goals in processing order;
      must provide `year`, `stage`, `home_team`, `away_team`,
      `scorer_nationality`, `goal_minute` and `half_time`.
    - all_games_before_last: DataFrame with one row per team; must provide
      `year`, `stage`, `team`, `standing`, `points`, `goals_scored` and
      `goals_conceded`. It is not modified.
    - agg_goals_before_last_day: Passed through unchanged to `tiebreaker_after`.

    Returns:
    - A DataFrame with one row per year start and per goal, holding one
      `third_team_info_<group>` dict column per group plus `top2_third_teams`,
      `excluded_teams` and `changes`.
    """
    best_n = 2

    def summarise(team_row):
        # Fields reported for a third-placed team, as plain Python values.
        return {
            'team': team_row['team'],
            'points': int(team_row['total_points']),
            'goals_difference': int(team_row['total_goals_difference']),
            'goals_scored': int(team_row['total_goals_scored'])
        }

    def split_best(third_info):
        # Rank third-placed teams across groups; return (qualifying, excluded) names.
        ranked = sorted(
            (info for info in third_info.values() if info),
            key=lambda info: (info['points'], info['goals_difference'], info['goals_scored']),
            reverse=True
        )
        names = [info['team'] for info in ranked]
        return names[:best_n], names[best_n:]

    # Apply filter to process only years 2009 and 2013
    games = all_games_before_last[all_games_before_last['year'].isin([2009, 2013])].copy()
    games['year'] = games['year'].astype(int)

    # Add 1 point to each team for the assumed 0-0 starting score
    games['points'] += 1

    results = []

    # Group by year and process each year separately
    for year, year_data in games.groupby('year'):
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Split every statistic into "before last game" and "last game" parts
        year_data['before_last_game_goals_scored'] = year_data['goals_scored']
        year_data['before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data['before_last_game_points'] = year_data['points']
        year_data['last_game_goals_scored'] = 0
        year_data['last_game_goals_conceded'] = 0
        year_data['total_goals_scored'] = year_data['before_last_game_goals_scored']
        year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded']
        year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']
        year_data['last_game_points'] = 0
        year_data['total_points'] = year_data['before_last_game_points']
        year_data['tied_won'] = 0

        goals_last_day_year = goals_last_day_sorted[goals_last_day_sorted['year'] == year]

        # Initial standings: third row of each group when ordered by `standing`
        third_teams_info = {}
        by_standing = year_data.sort_values(by='standing', ascending=True)
        for group_stage, group_data in by_standing.groupby('stage'):
            group_key = group_stage.replace(" ", "_")  # Ensure valid keys
            third_teams_info[f'third_team_info_{group_key}'] = (
                summarise(group_data.iloc[2]) if len(group_data) >= 3 else None
            )

        top2_third_teams, excluded_teams = split_best(third_teams_info)
        previous_top2 = top2_third_teams
        changes = 0

        # Add the initial state row for the year
        results.append({
            'year': year,
            'stage': None,
            'goal_time': 0,
            'home_team': None,
            'away_team': None,
            'scorer_team': None,
            **third_teams_info,
            'top2_third_teams': top2_third_teams,
            'excluded_teams': excluded_teams,
            'changes': changes
        })

        # Process goals
        for _, goal in goals_last_day_year.iterrows():
            stage = goal['stage']
            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Credit the goal. No group is frozen here: a freeze triggered by the
            # first goal of the year would discard the goals of every other group.
            opponent_team = home_team if player_team == away_team else away_team
            year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
            year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

            # Update total goals scored, goals conceded, and goal difference
            year_data['total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
            year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
            year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

            # Relative to the provisional draw point: leading adds 2, trailing
            # removes 1, level leaves the total unchanged.
            leading = year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']
            trailing = year_data['last_game_goals_scored'] < year_data['last_game_goals_conceded']
            year_data['last_game_points'] = leading.astype(int) * 2 - trailing.astype(int)

            # Update total points
            year_data['total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Recalculate third-placed team info for each group
            third_teams_info = {}

            for group_stage, group_data in year_data.groupby('stage'):
                group_key = group_stage.replace(" ", "_")  # Ensure valid keys

                # Provisional order by total points, goal difference, and goals scored
                standings = group_data.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False]
                )

                # Teams sharing their points total with at least one other team
                level_teams = standings[standings.duplicated(subset=['total_points'], keep=False)]

                # NOTE(review): every ordered pair of these teams is compared, including
                # pairs on different point totals — confirm tiebreaker_after expects that.
                for i, row1 in level_teams.iterrows():
                    for j, row2 in level_teams.iterrows():
                        if i == j:  # Ensure no self-comparison
                            continue
                        winner, _, _ = tiebreaker_after(row1, row2, agg_goals_before_last_day)
                        if winner == row1['team']:
                            standings.loc[standings['team'] == row1['team'], 'tied_won'] += 1

                # Final sorting with tie-breaking criteria
                standings = standings.sort_values(
                    by=['total_points', 'tied_won', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False, False]
                )

                # Extract third-placed team
                third_teams_info[f'third_team_info_{group_key}'] = (
                    summarise(standings.iloc[2]) if len(standings) >= 3 else None
                )

            # Sort third-placed teams across groups and get top 2
            top2_third_teams, excluded_teams = split_best(third_teams_info)

            # Count a change only when the membership differs, not the order
            if set(top2_third_teams) != set(previous_top2):
                changes += 1
            previous_top2 = top2_third_teams

            results.append({
                'year': year,
                'stage': stage,
                'goal_time': goal['goal_minute'],
                'half_time': goal['half_time'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_team': player_team,
                **third_teams_info,
                'top2_third_teams': top2_third_teams,
                'excluded_teams': excluded_teams,
                'changes': changes
            })

    return pd.DataFrame(results)

## World Cup

### men

In [22]:
def best_four_third_placed_wc_men(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    """
    Track the best four third-placed teams goal by goal (years >= 2016).

    Every team first receives one provisional point for an assumed 0-0 result
    in its last game. For each year an initial row (goal_time 0) is emitted
    from the pre-match `standing`, followed by one row per goal. After each
    goal the scoring and conceding teams' last-game tallies are updated, each
    group is re-ranked (points, goal difference, goals scored, `tied_won`) and
    the third-placed teams are compared across groups by points, goal
    difference and goals scored.

    Every goal is applied to the teams involved, whichever group it belongs to.
    `changes` increments only when the membership of the best four changes; a
    mere reordering of the same four teams does not count.

    Parameters:
    - goals_last_day_sorted: DataFrame of last-day goals in processing order;
      must provide `year`, `stage`, `home_team`, `away_team`,
      `scorer_nationality`, `goal_minute` and `half_time`.
    - all_games_before_last: DataFrame with one row per team; must provide
      `year`, `stage`, `team`, `standing`, `points`, `goals_scored` and
      `goals_conceded`. It is not modified.
    - agg_goals_before_last_day: Passed through unchanged to `tiebreaker_after`.

    Returns:
    - A DataFrame with one row per year start and per goal, holding one
      `third_team_info_<group>` dict column per group plus `top4_third_teams`,
      `excluded_teams` and `changes`.
    """
    best_n = 4

    def summarise(team_row):
        # Fields reported for a third-placed team, as plain Python values.
        return {
            'team': team_row['team'],
            'points': int(team_row['total_points']),
            'goals_difference': int(team_row['total_goals_difference']),
            'goals_scored': int(team_row['total_goals_scored'])
        }

    def split_best(third_info):
        # Rank third-placed teams across groups; return (qualifying, excluded) names.
        ranked = sorted(
            (info for info in third_info.values() if info),
            key=lambda info: (info['points'], info['goals_difference'], info['goals_scored']),
            reverse=True
        )
        names = [info['team'] for info in ranked]
        return names[:best_n], names[best_n:]

    # Filter data for years >= 2016
    # NOTE(review): cut-off kept exactly as found — confirm it selects the
    # World Cup editions this analysis is meant to cover.
    games = all_games_before_last[all_games_before_last['year'] >= 2016].copy()
    games['year'] = games['year'].astype(int)

    # Add 1 point to each team for the assumed 0-0 starting score
    games['points'] += 1

    results = []

    # Group by year and process each year separately
    for year, year_data in games.groupby('year'):
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Split every statistic into "before last game" and "last game" parts
        year_data['before_last_game_goals_scored'] = year_data['goals_scored']
        year_data['before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data['before_last_game_points'] = year_data['points']
        year_data['last_game_goals_scored'] = 0
        year_data['last_game_goals_conceded'] = 0
        year_data['total_goals_scored'] = year_data['before_last_game_goals_scored']
        year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded']
        year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']
        year_data['last_game_points'] = 0
        year_data['total_points'] = year_data['before_last_game_points']
        year_data['tied_won'] = 0

        goals_last_day_year = goals_last_day_sorted[goals_last_day_sorted['year'] == year]

        # Initial standings: third row of each group when ordered by `standing`
        third_teams_info = {}
        by_standing = year_data.sort_values(by='standing', ascending=True)
        for group_stage, group_data in by_standing.groupby('stage'):
            group_key = group_stage.replace(" ", "_")  # Ensure valid keys
            third_teams_info[f'third_team_info_{group_key}'] = (
                summarise(group_data.iloc[2]) if len(group_data) >= 3 else None
            )

        top4_third_teams, excluded_teams = split_best(third_teams_info)
        previous_top4 = top4_third_teams
        changes = 0

        # Add the initial state row for the year
        results.append({
            'year': year,
            'stage': None,
            'goal_time': 0,
            'home_team': None,
            'away_team': None,
            'scorer_team': None,
            **third_teams_info,
            'top4_third_teams': top4_third_teams,
            'excluded_teams': excluded_teams,
            'changes': changes
        })

        # Process goals
        for _, goal in goals_last_day_year.iterrows():
            stage = goal['stage']
            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Credit the goal. No group is frozen here: a freeze triggered by the
            # first goal of the year would discard the goals of every other group.
            opponent_team = home_team if player_team == away_team else away_team
            year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
            year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

            # Update total goals scored, goals conceded, and goal difference
            year_data['total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
            year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
            year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

            # Relative to the provisional draw point: leading adds 2, trailing
            # removes 1, level leaves the total unchanged.
            leading = year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']
            trailing = year_data['last_game_goals_scored'] < year_data['last_game_goals_conceded']
            year_data['last_game_points'] = leading.astype(int) * 2 - trailing.astype(int)

            # Update total points
            year_data['total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Recalculate third-placed team info for each group
            third_teams_info = {}

            for group_stage, group_data in year_data.groupby('stage'):
                group_key = group_stage.replace(" ", "_")  # Ensure valid keys

                # Provisional order by total points, goal difference, and goals scored
                standings = group_data.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored'],
                    ascending=[False, False, False]
                )

                # Teams sharing their points total with at least one other team
                level_teams = standings[standings.duplicated(subset=['total_points'], keep=False)]

                # NOTE(review): every ordered pair of these teams is compared, including
                # pairs on different point totals — confirm tiebreaker_after expects that.
                for i, row1 in level_teams.iterrows():
                    for j, row2 in level_teams.iterrows():
                        if i == j:  # Ensure no self-comparison
                            continue
                        winner, _, _ = tiebreaker_after(row1, row2, agg_goals_before_last_day)
                        if winner == row1['team']:
                            standings.loc[standings['team'] == row1['team'], 'tied_won'] += 1

                # Final sorting: here `tied_won` is the last criterion, after goals scored
                standings = standings.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored', 'tied_won'],
                    ascending=[False, False, False, False]
                )

                # Extract third-placed team
                third_teams_info[f'third_team_info_{group_key}'] = (
                    summarise(standings.iloc[2]) if len(standings) >= 3 else None
                )

            # Sort third-placed teams across groups and get top 4
            top4_third_teams, excluded_teams = split_best(third_teams_info)

            # Count a change only when the membership differs, not the order
            if set(top4_third_teams) != set(previous_top4):
                changes += 1
            previous_top4 = top4_third_teams

            results.append({
                'year': year,
                'stage': stage,
                'goal_time': goal['goal_minute'],
                'half_time': goal['half_time'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_team': player_team,
                **third_teams_info,
                'top4_third_teams': top4_third_teams,
                'excluded_teams': excluded_teams,
                'changes': changes
            })

    return pd.DataFrame(results)

### women

In [23]:
def best_two_third_placed_wc_women(goals_last_day_sorted, all_games_before_last, agg_goals_before_last_day):
    """
    Track, goal by goal, which third-placed teams rank among the best two across groups.

    Starting from the standings before the last group matchday, every goal in
    `goals_last_day_sorted` is applied in the order given, the group tables are
    recomputed, and the third-placed team of each group is ranked against the
    third-placed teams of the other groups by (points, goal difference, goals scored).

    Inputs (columns listed are the ones this function actually reads):
      - goals_last_day_sorted: DataFrame with 'year', 'stage', 'home_team', 'away_team',
        'scorer_nationality', 'goal_minute' and 'half_time'. It is not re-sorted here, so
        the caller's row order is the processing order.
      - all_games_before_last: DataFrame with 'year', 'stage', 'team', 'standing',
        'points', 'goals_scored' and 'goals_conceded'. Only rows with year == 1991 are
        used; the filtered copy is modified, not the caller's frame.
      - agg_goals_before_last_day: forwarded unchanged to `tiebreaker_after`
        (defined elsewhere in this file); its structure is not visible here.

    Returns a DataFrame with one row for the initial state of each year (goal_time 0,
    no 'half_time' key) followed by one row per processed goal. Each row holds one
    `third_team_info_<stage>` dict per group (or None when the group has fewer than
    three teams), the lists `top2_third_teams` and `excluded_teams`, and `changes`,
    a running count of how often the top-2 list differed from the previous row.
    """
    # Apply filter to process only years == 1991
    all_games_before_last = all_games_before_last[(all_games_before_last['year'] == 1991)].copy()  
    all_games_before_last['year'] = all_games_before_last['year'].astype(int)

    # Add 1 point to each team for the assumed 0-0 starting score
    # (every last-day game is treated as a draw until a goal is scored).
    all_games_before_last['points'] += 1

    results = []

    # Group by year and process each year separately
    # (after the filter above this loop sees at most the single year 1991).
    for year, year_data in all_games_before_last.groupby('year'):
        year = int(year)
        print(f"\n--- Processing Year: {year} ---")
        year_data = year_data.copy()

        # Initialize columns for tracking stats:
        #   before_last_game_* = frozen baseline, last_game_* = running last-day values,
        #   total_*            = baseline + last-day values.
        year_data['before_last_game_goals_scored'] = year_data['goals_scored']
        year_data['before_last_game_goals_conceded'] = year_data['goals_conceded']
        year_data['before_last_game_points'] = year_data['points']
        year_data['last_game_goals_scored'] = 0
        year_data['last_game_goals_conceded'] = 0
        year_data['total_goals_scored'] = year_data['before_last_game_goals_scored']
        year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded']
        year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']
        year_data['last_game_points'] = 0
        year_data['total_points'] = year_data['before_last_game_points']

        # Select this year's goals (order is taken as given; no sorting happens here)
        goals_last_day_year = goals_last_day_sorted[(goals_last_day_sorted['year'] == year)]

        # Initialize a set to track locked stages
        locked_stages = set()

        # Initial standings for goal_time = 0
        third_teams_info = {}
        # tied_won stays 0 on year_data for the whole run; the per-goal increments below
        # are applied to the sorted per-group frames, not written back to year_data.
        year_data['tied_won'] = 0  # Initialize tied_won column

        # Sort teams by the `standing` variable
        # (the initial table relies on the precomputed `standing`, not on points).
        sorted_standings = year_data.sort_values(by='standing', ascending=True)

        # Extract third-placed teams based on sorted order
        for group_stage, group_data in sorted_standings.groupby('stage'):
            group_key = group_stage.replace(" ", "_")  # Ensure valid keys
            if len(group_data) >= 3:
                third_placed = group_data.iloc[2]  # Third-placed team based on `standing`
                third_teams_info[f'third_team_info_{group_key}'] = {
                    'team': third_placed['team'],
                    'points': int(third_placed['total_points']),
                    'goals_difference': int(third_placed['total_goals_difference']),
                    'goals_scored': int(third_placed['total_goals_scored'])
                }
            else:
                third_teams_info[f'third_team_info_{group_key}'] = None

        # Determine the top 2 third-placed teams initially
        # (descending by points, then goal difference, then goals scored; None entries skipped).
        sorted_third_teams = sorted(
            [(info['team'], info['points'], info['goals_difference'], info['goals_scored']) for info in third_teams_info.values() if info],
            key=lambda x: (x[1], x[2], x[3]),
            reverse=True
        )
        top2_third_teams = [team[0] for team in sorted_third_teams[:2]]
        excluded_teams = [team[0] for team in sorted_third_teams[2:]]
        previous_top2 = top2_third_teams
        changes = 0

        # Add the initial state row for the year
        results.append({
            'year': year,
            'stage': None,
            'goal_time': 0,
            'home_team': None,
            'away_team': None,
            'scorer_team': None,
            **third_teams_info,
            'top2_third_teams': top2_third_teams,
            'excluded_teams': excluded_teams,
            'changes': changes
        })

        # Process goals
        for _, goal in goals_last_day_year.iterrows():
            stage = goal['stage']
            home_team = goal['home_team']
            away_team = goal['away_team']
            player_team = goal['scorer_nationality']

            # Lock stats for stages before the current one
            # NOTE(review): on the first goal this adds every *other* stage to
            # locked_stages, so from then on only goals of the first-seen stage update
            # the scores below; goals of any other stage still emit a result row but
            # change nothing. Confirm this is the intended behaviour.
            if stage not in locked_stages:
                for locked_stage in year_data['stage'].unique():
                    if locked_stage != stage and locked_stage not in locked_stages:
                        locked_stages.add(locked_stage)

            # Update scores for the current stage
            if stage not in locked_stages:
                # The conceding side is whichever of home/away is not the scorer's team.
                opponent_team = home_team if player_team == away_team else away_team
                year_data.loc[year_data['team'] == player_team, 'last_game_goals_scored'] += 1
                year_data.loc[year_data['team'] == opponent_team, 'last_game_goals_conceded'] += 1

                # Update total goals scored, goals conceded, and goal difference
                year_data['total_goals_scored'] = year_data['before_last_game_goals_scored'] + year_data['last_game_goals_scored']
                year_data['total_goals_conceded'] = year_data['before_last_game_goals_conceded'] + year_data['last_game_goals_conceded']
                year_data['total_goals_difference'] = year_data['total_goals_scored'] - year_data['total_goals_conceded']

                # Adjust points dynamically for each game outcome.
                # These are offsets relative to the +1 "assumed draw" point added at the top:
                # leading -> +2, level -> 0, trailing -> -1 (i.e. 3 / 1 / 0 for the last game).
                # NOTE(review): calculate_suspense in this file treats year <= 1992 as a
                # 2-point win system, which would suggest +1 instead of +2 here - confirm.
                year_data['last_game_points'] = (
                    (year_data['last_game_goals_scored'] > year_data['last_game_goals_conceded']).astype(int) * 2 +
                    (year_data['last_game_goals_scored'] == year_data['last_game_goals_conceded']).astype(int) * 0 +
                    (year_data['last_game_goals_scored'] < year_data['last_game_goals_conceded']).astype(int) * -1
                )

                # Update total points
                year_data['total_points'] = year_data['before_last_game_points'] + year_data['last_game_points']

            # Recalculate third-placed team info for each group
            # (done for every goal, including goals of locked stages).
            third_teams_info = {}

            for group_stage, group_data in year_data.groupby('stage'):
                group_key = group_stage.replace(" ", "_")  # Ensure valid keys

                # Sort teams by total points, goal difference, and goals scored
                sorted_standings = group_data.sort_values(
                    by=['total_points', 'total_goals_difference', 'total_goals_scored'], 
                    ascending=[False, False, False]
                )

                # Check for ties in total points
                # (keep=False flags every row whose points value occurs more than once).
                tied_teams = sorted_standings[sorted_standings.duplicated(subset=['total_points'], keep=False)]

                if not tied_teams.empty:
                    # Resolve ties using tiebreaker_after: every ordered pair of flagged
                    # rows is compared and the first team of the pair gets +1 when it wins.
                    # NOTE(review): the pairs are not restricted to teams on the *same*
                    # points, so two separate tie clusters in one group are compared with
                    # each other as well - confirm this is acceptable.
                    for i, row1 in tied_teams.iterrows():
                        for j, row2 in tied_teams.iterrows():
                            if i != j:  # Ensure no self-comparison
                                winner, _, _ = tiebreaker_after(row1, row2, agg_goals_before_last_day)
                                if winner == row1['team']:
                                    sorted_standings.loc[sorted_standings['team'] == row1['team'], 'tied_won'] += 1

                # Final sorting with tie-breaking criteria
                # (tied_won ranks ahead of goal difference and goals scored).
                sorted_standings = sorted_standings.sort_values(
                    by=['total_points', 'tied_won', 'total_goals_difference', 'total_goals_scored'], 
                    ascending=[False, False, False, False]
                )

                # Extract third-placed team
                if len(sorted_standings) >= 3:
                    third_placed = sorted_standings.iloc[2]
                    third_teams_info[f'third_team_info_{group_key}'] = {
                        'team': third_placed['team'],
                        'points': int(third_placed['total_points']),
                        'goals_difference': int(third_placed['total_goals_difference']),
                        'goals_scored': int(third_placed['total_goals_scored'])
                    }
                else:
                    third_teams_info[f'third_team_info_{group_key}'] = None

            # Sort third-placed teams across groups and get top 2
            sorted_third_teams = sorted(
                [(info['team'], info['points'], info['goals_difference'], info['goals_scored']) for info in third_teams_info.values() if info],
                key=lambda x: (x[1], x[2], x[3]),
                reverse=True
            )
            top2_third_teams = [team[0] for team in sorted_third_teams[:2]]
            excluded_teams = [team[0] for team in sorted_third_teams[2:]]

            # Check for changes in top2_third_teams
            # (list comparison, so a mere swap of order within the top 2 also counts).
            if top2_third_teams != previous_top2:
                changes += 1
                previous_top2 = top2_third_teams

            results.append({
                'year': year,
                'stage': stage,
                'goal_time': goal['goal_minute'],
                'half_time': goal['half_time'],
                'home_team': home_team,
                'away_team': away_team,
                'scorer_team': player_team,
                **third_teams_info,
                'top2_third_teams': top2_third_teams,
                'excluded_teams': excluded_teams,
                'changes': changes
            })

    return pd.DataFrame(results)

# suspense

#### h2h variable 

In [24]:
def assign_h2h_per_row(group):
    """
    Add an `h2h` column (1/0) telling whether each row's fixture is one of the
    pairings that counts as head-to-head for that row.

    The index is reset first, so the returned frame carries a fresh 0..n-1 index.
    Rows with goal_minute == 0 and a missing home or away team borrow the missing
    side(s) from the following row for this check only; the team columns of the
    returned frame are left as they were.
    """
    group = group.reset_index(drop=True)
    n_rows = len(group)

    def compute_row_h2h(position, row):
        home = row["home_team"]
        away = row["away_team"]

        # Kick-off rows may lack the fixture; look it up on the next row when there is one.
        incomplete_kickoff = row["goal_minute"] == 0 and (pd.isna(home) or pd.isna(away))
        if incomplete_kickoff and position + 1 < n_rows:
            following = group.iloc[position + 1]
            if pd.isna(home):
                home = following["home_team"]
            if pd.isna(away):
                away = following["away_team"]

        first, second, third, fourth = (row[col] for col in ("1st", "2nd", "3rd", "4th"))

        # Which pairings count depends on the third_qualify flag of this row.
        if row["third_qualify"] == 1:
            pairings = [(third, fourth), (first, second)]
        else:
            pairings = [(second, third), (first, fourth)]

        # Accept each pairing in either home/away orientation.
        accepted = set()
        for team_a, team_b in pairings:
            accepted.add((team_a, team_b))
            accepted.add((team_b, team_a))

        return int((home, away) in accepted)

    # After the reset the index label of each row equals its position.
    group["h2h"] = [compute_row_h2h(label, row) for label, row in group.iterrows()]
    return group


In [25]:
def assign_lagging_won(final_composition_changes_df, goals_df):
    """
    Add `lagging_team`, `leading_team` and `lagging_won` columns to a copy of the input.

    For each row the lagging/leading pair is chosen from the `third_qualify` flag:
    third_qualify == 0 -> lagging = '3rd', leading = '2nd'; otherwise lagging = '4th',
    leading = '3rd'. `lagging_won` is then read from the first game in `goals_df`
    of the same 'year' and 'stage' played between those two teams:
      1  lagging team scored more, 0 level, -1 lagging team scored fewer,
      None when the row has h2h != 0, a team is missing, no such game exists, or the
      'score' cannot be parsed as "<home>–<away>" (en dash separator).

    Args:
        final_composition_changes_df: DataFrame with 'year', 'stage', 'third_qualify',
            '2nd', '3rd', '4th' and 'h2h'. Not modified.
        goals_df: DataFrame with 'year', 'stage', 'home_team', 'away_team' and 'score'.

    Returns:
        The copied DataFrame with the three extra columns. An empty input yields an
        empty frame that still has the three columns.
    """
    final_df = final_composition_changes_df.copy()

    # Column-wise selection instead of DataFrame.apply(axis=1): apply returns a
    # DataFrame (not a Series) for an empty frame, which broke the assignment.
    tq_is_zero = final_df['third_qualify'] == 0
    final_df['lagging_team'] = final_df['3rd'].where(tq_is_zero, final_df['4th'])
    final_df['leading_team'] = final_df['2nd'].where(tq_is_zero, final_df['3rd'])

    def determine_result(row):
        """Return 1 / 0 / -1 for the lagging team's result, or None when unavailable."""
        if row['h2h'] != 0 or pd.isna(row['lagging_team']) or pd.isna(row['leading_team']):
            return None

        lagging = row['lagging_team']
        leading = row['leading_team']

        # Find the game between the two teams, in either home/away orientation.
        same_fixture = (
            ((goals_df['home_team'] == lagging) & (goals_df['away_team'] == leading)) |
            ((goals_df['home_team'] == leading) & (goals_df['away_team'] == lagging))
        )
        match = goals_df[
            (goals_df['year'] == row['year']) &
            (goals_df['stage'] == row['stage']) &
            same_fixture
        ]
        if match.empty:
            return None

        game = match.iloc[0]
        try:
            home_goals, away_goals = map(int, game['score'].split('–'))
        except (AttributeError, ValueError):
            # AttributeError: score is not a string (e.g. NaN);
            # ValueError: not exactly two integer parts.
            return None

        if game['home_team'] == lagging:
            lagging_goals, leading_goals = home_goals, away_goals
        else:
            lagging_goals, leading_goals = away_goals, home_goals

        if lagging_goals > leading_goals:
            return 1
        if lagging_goals == leading_goals:
            return 0
        return -1

    # A plain list keeps the assignment valid for empty frames as well.
    final_df['lagging_won'] = [determine_result(row) for _, row in final_df.iterrows()]

    return final_df


In [26]:
# def calculate_suspense(row):
#     required_fields = ['pts_diff', 'gls_diff', 'year', 'third_qualify',
#                        '3rd_last_game_points', '3rd_last_game_goals_diff','4th_last_game_points',
#                        '3rd_goals_scored', '4th_goals_scored', '4th_last_game_goals_diff','2nd_goals_scored', 'goal_minute']
    
#     if any(pd.isna(row[field]) for field in required_fields):
#         return 0

        
#     if row['year'] <= 1992:  # 2-point win system
#         if (

#             # losing and still losing but better goal difference
#             (row['third_qualify']== 1 and
#              row['pts_diff'] == 0 and
#             row['gls_diff'] == -1 and 
#             row['4th_goals_scored'] >= row['3rd_goals_scored']) or

#             (row['third_qualify']== 0 and
#             row['pts_diff'] == 0 and
#             row['gls_diff'] == -1 and 
#             row['3rd_goals_scored'] >= row['2nd_goals_scored']) or

#             # from losing to drawing/ from drawing to winning with hypothetical same goals difference but more goals scored
#             (row['third_qualify'] == 1 and
#             row['4th_last_game_goals_diff'] in [-1,0] and
#             row['pts_diff'] in [-1, 0] and
#             row['gls_diff'] == -1 and
#             row['4th_goals_scored'] >= row['3rd_goals_scored']) or
            
#             (row['third_qualify'] == 0 and
#             row['3rd_last_game_goals_diff'] in [-1,0] and
#             row['pts_diff'] in [-1, 0] and
#             row['gls_diff'] == -1 and
#             row['3rd_goals_scored'] >= row['2nd_goals_scored']) or

#             # from losing to drawing/ from drawing to winning with better goals difference
#             (row['third_qualify'] == 1 and
#             row['4th_last_game_goals_diff'] in [-1,0] and
#             row['pts_diff'] in [-1, 0] and
#             row['gls_diff'] >= 0) or
            
#             (row['third_qualify'] == 0 and
#             row['3rd_last_game_goals_diff'] in [-1,0] and
#             row['pts_diff'] in [-1, 0] and
#             row['gls_diff'] >= 0) or

#             # h2h scenrarios drawing to winning / losing to drawing
#             ## higher number of points
#             (row['third_qualify'] == 1 and
#             row['pts_diff'] > -2 and 
#              row['h2h'] == 1 and
#              row['4th_last_game_goals_diff'] in [-1,0]
#              ) or 

#             (row['third_qualify'] == 0 and
#             row['pts_diff'] > -2 and 
#              row['h2h'] == 1 and
#              row['3rd_last_game_goals_diff'] in [-1,0]
#              ) or 

#             ## same number of points but higher number of goals scored 
#             (row['third_qualify'] == 1 and
#             row['pts_diff'] == -2 and 
#              row['h2h'] == 1 and
#              row['4th_last_game_goals_diff'] in [-1,0] and 
#              row['4th_goals_scored'] >= row['3rd_goals_scored']

#              ) or 

#             (row['third_qualify'] == 0 and
#             row['pts_diff'] == -2 and 
#              row['h2h'] == 1 and
#              row['3rd_last_game_goals_diff'] in [-1,0] and
#              row['3rd_goals_scored'] >= row['2nd_goals_scored']
#              )

#              ## lagging team is winning and leading team is drawing potentially dropping points and goals difference
#              ## Leading team has a lower goal difference than lagging team
#             or (row['third_qualify'] == 1 and 
#                 row['4th_last_game_points'] == 2 and
#                 row['3rd_last_game_points'] == 1 and
#                 row['gls_diff'] >= 0 and
#                 row['pts_diff'] == -1 and
#                 row['h2h']== 0) or

#             (row['third_qualify'] == 0 and 
#                 row['3rd_last_game_points'] == 2 and
#                 row['2nd_last_game_points'] == 1 and
#                 row['gls_diff'] >= 0 and
#                 row['pts_diff'] == -1 and
#                 row['h2h']== 0) or
 
#             ## Leading team in only one goal above lagging team but with an equal or lower number of goals scored
#             (row['third_qualify'] == 1 and 
#                 row['4th_last_game_points'] == 2 and
#                 row['3rd_last_game_points'] == 1 and
#                 row['gls_diff'] == -1 and
#                 row['pts_diff'] == -1 and
#                 row['4th_goals_scored'] >= row['3rd_goals_scored'] and
#                 row['h2h']== 0) or
            
#             (row['third_qualify'] == 0 and
#                 row['3rd_last_game_points'] == 2 and
#                 row['2nd_last_game_points'] == 1 and
#                 row['gls_diff'] == -1 and
#                 row['pts_diff'] == -1 and
#                 row['3rd_goals_scored'] >= row['2nd_goals_scored'] and 
#                 row['h2h']== 0) or

#             ## lagging team is winning and leading team is losing potentially dropping points and goals difference
#             ## Leading team has a lower goal difference than lagging team
#                 (row['third_qualify'] == 1 and 
#                 row['4th_last_game_points'] == 2 and
#                 row['3rd_last_game_points'] == 0 and
#                 row['gls_diff'] >= 0 and
#                 row['pts_diff'] == 0 and 
#                  row['h2h']== 0) or

#             (row['third_qualify'] == 0 and 
#                 row['3rd_last_game_points'] == 2 and
#                 row['2nd_last_game_points'] == 0 and
#                 row['gls_diff'] >= 0 and
#                 row['pts_diff'] == 0 and 
#                 row['h2h']== 0) or

#             ## Leading team in only one goal above lagging team but with an equal or lower number of goals scored
#             (row['third_qualify'] == 1 and 
#                 row['4th_last_game_points'] == 2 and
#                 row['3rd_last_game_points'] == 0 and
#                 row['gls_diff'] == -1 and
#                 row['pts_diff'] == 0 and
#                 row['4th_goals_scored'] >= row['3rd_goals_scored'] and 
#                 row['h2h']== 0) or
            
#             (row['third_qualify'] == 0 and
#                 row['3rd_last_game_points'] == 2 and
#                 row['2nd_last_game_points'] == 0 and
#                 row['gls_diff'] == -1 and
#                 row['pts_diff'] == 0 and
#                 row['3rd_goals_scored'] >= row['2nd_goals_scored'] and 
#                 row['h2h']== 0) or

#                 # lagging team is not winning and has same points as leading team
#                 (row['third_qualify'] == 1 and
#                 row['pts_diff'] == 0 and 
#                 row['4th_last_game_points'] != 2) or

#                 (row['third_qualify'] == 0 and
#                 row['pts_diff'] == 0 and
#                 row['3rd_last_game_points'] != 2) 
            


#         ):
#             return 1


#     else:  # 3-point win system

#         # Immediate opening goal causes shift
#         if row['goal_minute'] == 0 and row['pts_diff'] == -2 and row['gls_diff'] >= -1:
#             return 1
#         if row['goal_minute'] == 0 and row['pts_diff'] in [-1, 0]:
#             return 1
        
#         # Winning scenarios

#         if row['third_qualify'] == 0 and row['pts_diff'] == -1 and row['3rd_last_game_points'] == 3 and row['2nd_last_game_points'] == 1 and row['h2h'] == 0:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']):
#                 return 1
            
#         if row['third_qualify'] == 1 and row['pts_diff'] == -1 and row['4th_last_game_points'] == 3 and row['3rd_last_game_points'] == 1 and row['h2h'] == 0:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['4th_goals_scored'] >= row['3rd_goals_scored']):
#                 return 1
            
#         if row['third_qualify'] == 0 and row['pts_diff'] == 0 and row['3rd_last_game_points'] == 3 and row['2nd_last_game_points'] == 0 and row['h2h'] == 0:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']):
#                 return 1
            
#         if row['third_qualify'] == 1 and row['pts_diff'] == 0 and row['4th_last_game_points'] == 3 and row['3rd_last_game_points'] == 0 and row['h2h'] == 0:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['4th_goals_scored'] >= row['3rd_goals_scored']):
#                 return 1

#         # Drawing scenarios
#         if row['third_qualify'] == 0 and row['pts_diff'] == -2 and row['3rd_last_game_points'] == 1:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']):
#                 return 1

#         if row['third_qualify'] == 1 and row['pts_diff'] == -2 and row['4th_last_game_points'] == 1:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['4th_goals_scored'] >= row['3rd_goals_scored']):
#                 return 1
            
#         if row['third_qualify'] == 0 and row['pts_diff'] == -1 and row['3rd_last_game_points'] == 1:
#           return 1
        
#         if row['third_qualify'] == 1 and row['pts_diff'] == -1 and row['4th_last_game_points'] == 1:
#           return 1

#         # Losing scenarios
#         if row['third_qualify'] == 0 and row['pts_diff'] == -1 and row['3rd_last_game_goals_diff'] == -1:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']):
#                 return 1

#         if row['third_qualify'] == 1 and row['pts_diff'] == -1 and row['4th_last_game_goals_diff'] == -1:
#             if (row['gls_diff'] >= 0) or \
#                (row['gls_diff'] == -1 and row['4th_goals_scored'] >= row['3rd_goals_scored']):
#                 return 1
            
#         # losing and still losing but better goal difference
#         if row['third_qualify']== 1 and row['pts_diff'] == 0 and row['gls_diff'] == -1 and  row['4th_goals_scored'] >= row['3rd_goals_scored']:
#             return 1

#         if row['third_qualify']== 0 and row['pts_diff'] == 0 and row['gls_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']: 
#             return 1
            
#         # h2h scenarios from drawing to winning
#         if row['third_qualify'] == 0 and row['pts_diff'] > -4 and row['h2h'] == 1 and row['3rd_last_game_goals_diff'] == 0:
#             return 1
#         if row['third_qualify'] == 1 and row['pts_diff'] > -4 and row['h2h'] == 1 and row['4th_last_game_goals_diff'] == 0:
#             return 1
#         if row['third_qualify'] == 0 and row['pts_diff'] == -4 and row['h2h'] == 1 and row['3rd_last_game_goals_diff'] == 0 and row['3rd_goals_scored'] >= row['2nd_goals_scored']:
#             return 1
#         if row['third_qualify'] == 1 and row['pts_diff'] == -4 and row['h2h'] == 1 and row['4th_last_game_goals_diff'] == 0 and row['4th_goals_scored'] >= row['3rd_goals_scored']:
#             return 1
        
#         # h2h scenarios from losing to drawing

#         if row['third_qualify'] == 0 and row['pts_diff'] > -3 and row['h2h'] == 1 and row['3rd_last_game_goals_diff'] == -1:
#             return 1
#         if row['third_qualify'] == 1 and row['pts_diff'] > -3 and row['h2h'] == 1 and row['4th_last_game_goals_diff'] == -1:
#             return 1
#         if row['third_qualify'] == 0 and row['pts_diff'] == -3 and row['h2h'] == 1 and row['3rd_last_game_goals_diff'] == -1 and row['3rd_goals_scored'] >= row['2nd_goals_scored']:
#             return 1
#         if row['third_qualify'] == 1 and row['pts_diff'] == -3 and row['h2h'] == 1 and row['4th_last_game_goals_diff'] == -1 and row['4th_goals_scored'] >= row['3rd_goals_scored']:
#             return 1
        
#         # lagging team is not winning and has same points as leading team
#         if row['third_qualify'] == 0 and row['pts_diff'] == 0 and row['3rd_last_game_points'] != 3:
#             return 1
#         if row['third_qualify'] == 1 and row['pts_diff'] == 0 and row['4th_last_game_points'] != 3:
#             return 1


#     return 0


In [27]:
def calculate_suspense(row):
    """
    Return 1 when the row matches one of the hand-coded "suspense" scenarios, else 0.

    The lagging/leading pair depends on `third_qualify`:
      third_qualify == 1 -> lagging = 4th, leading = 3rd
      otherwise          -> lagging = 3rd, leading = 2nd
    Rows with year <= 1992 are checked against the 2-point-win rule set, later years
    against the 3-point-win rule set.

    Args:
        row: mapping-like object supporting `row[key]` and `row.get(key)` (a pandas
            Series or a dict). Must provide every name in `required_fields`; a missing
            or NaN value in any of them yields 0. '2nd_last_game_points' is read when
            third_qualify != 1; if absent, the rules that need it simply do not match.

    Returns:
        int: 1 (suspense) or 0.
    """
    required_fields = ['pts_diff', 'gls_diff', 'year', 'third_qualify',
                       '3rd_last_game_points', '3rd_last_game_goals_diff', '4th_last_game_points',
                       '3rd_goals_scored', '4th_goals_scored', '4th_last_game_goals_diff',
                       '2nd_goals_scored', 'goal_minute', 'h2h']

    if any(pd.isna(row.get(f)) for f in required_fields):
        return 0

    tq = row['third_qualify']

    # Assign lagging and leading values dynamically
    lagging_points = row['4th_last_game_points'] if tq == 1 else row['3rd_last_game_points']
    # The leading side is the 2nd-placed team when third_qualify != 1 (this previously
    # read the 4th-placed team's points by mistake). .get keeps rows without the
    # column from raising; a None value never equals 0 or 1 below.
    leading_points = row['3rd_last_game_points'] if tq == 1 else row.get('2nd_last_game_points')

    lagging_goals_scored = row['4th_goals_scored'] if tq == 1 else row['3rd_goals_scored']
    leading_goals_scored = row['3rd_goals_scored'] if tq == 1 else row['2nd_goals_scored']

    # Goal difference of the lagging team in its last game.
    last_game_goals_diff = row['4th_last_game_goals_diff'] if tq == 1 else row['3rd_last_game_goals_diff']

    pts_diff = row['pts_diff']
    gls_diff = row['gls_diff']
    year = row['year']
    h2h = row['h2h']
    goal_minute = row['goal_minute']

    # 2-point win system
    if year <= 1992:
        if (
            # losing and still losing but better goal difference
            (pts_diff == 0 and gls_diff == -1 and lagging_goals_scored >= leading_goals_scored) or

            # from losing to drawing / from drawing to winning with hypothetical same goal difference but more goals scored
            (last_game_goals_diff in [-1, 0] and pts_diff in [-1, 0] and gls_diff == -1 and lagging_goals_scored >= leading_goals_scored) or

            # from losing to drawing / from drawing to winning with better goal difference
            (last_game_goals_diff in [-1, 0] and pts_diff in [-1, 0] and gls_diff >= 0) or

            # h2h scenarios: drawing to winning / losing to drawing
            (h2h == 1 and (
                # higher number of points
                (pts_diff > -2 and last_game_goals_diff in [-1, 0]) or
                # same number of points but higher number of goals scored
                (pts_diff == -2 and last_game_goals_diff in [-1, 0] and lagging_goals_scored >= leading_goals_scored)
            )) or

            # lagging team is winning and leading team is drawing, potentially dropping points and goal difference
            # leading team has a lower goal difference than lagging team
            (lagging_points == 2 and leading_points == 1 and gls_diff >= 0 and pts_diff == -1 and h2h == 0) or
            # leading team is only one goal above lagging team but with an equal or lower number of goals scored
            (lagging_points == 2 and leading_points == 1 and gls_diff == -1 and pts_diff == -1 and lagging_goals_scored >= leading_goals_scored and h2h == 0) or

            # lagging team is winning and leading team is losing, potentially dropping points and goal difference
            # leading team has a lower goal difference than lagging team
            (lagging_points == 2 and leading_points == 0 and gls_diff >= 0 and pts_diff == 0 and h2h == 0) or
            # leading team is only one goal above lagging team but with an equal or lower number of goals scored
            (lagging_points == 2 and leading_points == 0 and gls_diff == -1 and pts_diff == 0 and lagging_goals_scored >= leading_goals_scored and h2h == 0) or

            # equal points and the lagging team is not winning
            (pts_diff == 0 and lagging_points != 2)
        ):
            return 1

    else:  # 3-point win system
        if (
            # opening goal causes a shift
            (goal_minute == 0 and pts_diff == -2 and gls_diff >= -1) or
            (goal_minute == 0 and pts_diff in [-1, 0]) or

            # winning scenarios
            (pts_diff == -1 and lagging_points == 3 and leading_points == 1 and h2h == 0 and (
                gls_diff >= 0 or (gls_diff == -1 and lagging_goals_scored >= leading_goals_scored)
            )) or
            (pts_diff == 0 and lagging_points == 3 and leading_points == 0 and h2h == 0 and (
                gls_diff >= 0 or (gls_diff == -1 and lagging_goals_scored >= leading_goals_scored)
            )) or

            # drawing scenarios
            (pts_diff == -2 and lagging_points == 1 and (
                gls_diff >= 0 or (gls_diff == -1 and lagging_goals_scored >= leading_goals_scored)
            )) or
            (pts_diff == -1 and lagging_points == 1) or

            # losing scenarios
            (pts_diff == -1 and last_game_goals_diff == -1 and (
                gls_diff >= 0 or (gls_diff == -1 and lagging_goals_scored >= leading_goals_scored)
            )) or
            # losing and still losing but better goal difference
            (pts_diff == 0 and gls_diff == -1 and lagging_goals_scored >= leading_goals_scored) or
            # h2h scenarios from drawing to winning
            (h2h == 1 and (
                # higher number of points
                (pts_diff > -4 and last_game_goals_diff == 0) or
                # same number of points but higher number of goals scored
                (pts_diff == -4 and last_game_goals_diff == 0 and lagging_goals_scored >= leading_goals_scored)
            )) or
            # h2h scenarios from losing to drawing
            (h2h == 1 and (
                # higher number of points
                (pts_diff > -3 and last_game_goals_diff == -1) or
                # same number of points but higher number of goals scored
                (pts_diff == -3 and last_game_goals_diff == -1 and lagging_goals_scored >= leading_goals_scored)
            )) or
            # lagging team is not winning and has same points as leading team
            (pts_diff == 0 and lagging_points != 3)

        ):
            return 1

    return 0


In [28]:
# def update_suspense(final_composition_changes_df):
#     final_df = final_composition_changes_df.copy()

#     # Initialize suspense mask
#     suspense_mask = pd.Series(False, index=final_df.index)

#     # Store conditions per group
#     conditions = []

#     # 1. 3rd losing but could qualify
#     conditions.append((
#         (final_df['third_qualify'] == 0) &
#         (final_df['pts_diff'] == -1) &
#         (final_df['3rd_last_game_goals_diff'] == -1) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['goal_minute'] != 0) &
#         (final_df['3rd_goals_scored'] == final_df['2nd_goals_scored'] - 1),
#         '3rd', '2nd'
#     ))

#     # 2. 4th losing but could avoid elimination
#     conditions.append((
#         (final_df['third_qualify'] == 1) &
#         (final_df['pts_diff'] == -1) &
#         (final_df['4th_last_game_goals_diff'] == -1) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['goal_minute'] != 0) &
#         (final_df['4th_goals_scored'] == final_df['3rd_goals_scored'] - 1),
#         '4th', '3rd'
#     ))

#     # 3. 3rd drawing, could win and qualify
#     conditions.append((
#         (final_df['third_qualify'] == 0) &
#         (final_df['pts_diff'] == -2) &
#         (final_df['3rd_last_game_points'] == 1) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['goal_minute'] != 0) &
#         (final_df['3rd_goals_scored'] == final_df['2nd_goals_scored'] - 1),
#         '3rd', '2nd'
#     ))

#     # 4. 4th drawing, could win and avoid elimination
#     conditions.append((
#         (final_df['third_qualify'] == 1) &
#         (final_df['pts_diff'] == -2) &
#         (final_df['4th_last_game_points'] == 1) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['goal_minute'] != 0) &
#         (final_df['4th_goals_scored'] == final_df['3rd_goals_scored'] - 1),
#         '4th', '3rd'
#     ))

#     # 5. Early goal (minute 0), qualifying case
#     conditions.append((
#         (final_df['third_qualify'] == 0) &
#         (final_df['year'] > 1992) &
#         (final_df['goal_minute'] == 0) &
#         (final_df['pts_diff'] == -2) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['3rd_goals_scored'] == final_df['2nd_goals_scored'] - 1),
#         '3rd', '2nd'
#     ))

#     # 6. Early goal (minute 0), elimination case
#     conditions.append((
#         (final_df['third_qualify'] == 1) &
#         (final_df['year'] > 1992) &
#         (final_df['goal_minute'] == 0) &
#         (final_df['pts_diff'] == -2) &
#         (final_df['gls_diff'] == -1) &
#         (final_df['4th_goals_scored'] == final_df['3rd_goals_scored'] - 1),
#         '4th', '3rd'
#     ))

#     # 7. 3rd losing but same points as 2nd
#     conditions.append((
#         (final_df['third_qualify'] == 0) &
#         (final_df['pts_diff'] == 0) &
#         (final_df['3rd_last_game_goals_diff'] <= -2),
#         '3rd', '2nd'
#     ))

#     # 8. 4th losing but same points as 3rd
#     conditions.append((
#         (final_df['third_qualify'] == 1) &
#         (final_df['pts_diff'] == 0) &
#         (final_df['4th_last_game_goals_diff'] <= -2),
#         '4th', '3rd'
#     ))

#     # Create lagging and leading team columns
#     final_df['lagging_team'] = None
#     final_df['leading_team'] = None

#     # Apply each condition to suspense mask and assign teams
#     for condition, lagging_col, leading_col in conditions:
#         suspense_mask |= condition
#         final_df.loc[condition, 'lagging_team'] = final_df.loc[condition, lagging_col]
#         final_df.loc[condition, 'leading_team'] = final_df.loc[condition, leading_col]

#     # Final assignments
#     final_df.loc[suspense_mask, 'suspense'] = 1
#     final_df.loc[suspense_mask, 'h2h_suspense'] = 1

#     print(f"🧩 Initialized suspense and h2h_suspense = 1 for {suspense_mask.sum()} rows.\n")
#     return final_df


In [29]:
def update_suspense(final_composition_changes_df):
    """
    Flag rows as suspense / head-to-head suspense and record the two teams involved.

    Works on a copy of the input. Two pairs of places are examined, selected by
    ``third_qualify``: 3rd vs 2nd when it equals 0, and 4th vs 3rd when it equals 1.
    For each pair four rules are evaluated; every row matching at least one rule gets
    ``suspense`` and ``h2h_suspense`` set to 1, ``lagging_team`` set from the lower
    place column and ``leading_team`` set from the higher place column.

    Returns:
        The modified copy. Rows matching no rule keep ``None`` in ``lagging_team``
        and ``leading_team``.
    """
    frame = final_composition_changes_df.copy()
    frame['lagging_team'] = None
    frame['leading_team'] = None

    flagged = pd.Series(False, index=frame.index)

    # Building blocks that do not depend on which pair of places is examined.
    pts_diff = frame['pts_diff']
    one_goal_gap = frame['gls_diff'] == -1
    minute_nonzero = frame['goal_minute'] != 0
    minute_zero = frame['goal_minute'] == 0
    after_1992 = frame['year'] > 1992

    # (third_qualify value, lagging place, leading place)
    place_pairs = ((0, '3rd', '2nd'), (1, '4th', '3rd'))

    for tq_value, lagging_place, leading_place in place_pairs:
        in_scope = frame['third_qualify'] == tq_value
        last_game_gd = frame[f'{lagging_place}_last_game_goals_diff']
        last_game_pts = frame[f'{lagging_place}_last_game_points']

        # Lagging side has scored exactly one goal fewer than the leading side.
        one_goal_fewer = (
            frame[f'{lagging_place}_goals_scored']
            == frame[f'{leading_place}_goals_scored'] - 1
        )
        close_race = in_scope & one_goal_gap & one_goal_fewer

        rules = (
            # 1. Lagging team losing but could qualify or avoid elimination
            close_race & minute_nonzero & (pts_diff == -1) & (last_game_gd == -1),
            # 2. Lagging team drawing but could win and qualify/avoid elimination
            close_race & minute_nonzero & (pts_diff == -2) & (last_game_pts == 1),
            # 3. Goal minute 0 (years after 1992 only)
            close_race & minute_zero & (pts_diff == -2) & after_1992,
            # 4. Lagging team losing by two or more but level on points
            in_scope & (pts_diff == 0) & (last_game_gd <= -2),
        )

        for rule in rules:
            flagged |= rule
            frame.loc[rule, 'lagging_team'] = frame.loc[rule, lagging_place]
            frame.loc[rule, 'leading_team'] = frame.loc[rule, leading_place]

    for flag_column in ('suspense', 'h2h_suspense'):
        frame.loc[flagged, flag_column] = 1

    print(f"🧩 Initialized suspense and h2h_suspense = 1 for {flagged.sum()} rows.\n")
    return frame


In [30]:
def evaluate_lagging_losses_and_update_suspense(final_composition_changes_df, goals_df):
    """
    Set ``suspense`` to 0 for head-to-head suspense rows whose lagging team lost.

    For every row with ``h2h_suspense == 1`` the game between ``lagging_team`` and
    ``leading_team`` (same ``year`` and ``stage``, either side at home) is looked up
    in ``goals_df`` and its ``score`` is parsed as ``<home>-<away>``; en dash, em dash
    and the minus sign are accepted as separators. If the lagging team scored fewer
    goals than its opponent, ``suspense`` is reset to 0 on all rows sharing that
    year, stage, goal minute, lagging team and leading team.

    Rows with no matching game or an unparseable score are reported and left
    unchanged. Progress is printed for every row examined.

    Args:
        final_composition_changes_df: frame with ``year``, ``stage``, ``goal_minute``,
            ``lagging_team``, ``leading_team`` and ``h2h_suspense`` columns.
            It is not modified.
        goals_df: frame with ``year``, ``stage``, ``home_team``, ``away_team`` and
            ``score`` columns. It is only read.

    Returns:
        A copy of ``final_composition_changes_df`` with ``suspense`` updated.
    """
    import re

    df = final_composition_changes_df.copy()
    suspense_rows = df[df['h2h_suspense'] == 1]
    print(f"🔍 Checking {len(suspense_rows)} suspense observations...\n")

    # Dash look-alikes that are normalised to '-' before splitting the score.
    dash_variants = re.compile(r'[–—−]')

    updated_count = 0

    for _, row in suspense_rows.iterrows():
        year = row['year']
        stage = row['stage']
        minute = row['goal_minute']
        lagging = row['lagging_team']
        leading = row['leading_team']

        print(f"\n🔎 Year: {year}, Stage: {stage}, Minute: {minute}")
        print(f"   Match: {lagging} (lagging) vs {leading} (leading)")

        match = goals_df[
            (goals_df['year'] == year) &
            (goals_df['stage'] == stage) &
            (
                ((goals_df['home_team'] == lagging) & (goals_df['away_team'] == leading)) |
                ((goals_df['home_team'] == leading) & (goals_df['away_team'] == lagging))
            )
        ]

        if match.empty:
            print("   ⚠️ No matching game found in goals_df.")
            continue

        # Every row of the same game carries the same final score; use the first.
        m = match.iloc[0]
        home_team = m['home_team']
        away_team = m['away_team']
        score_str = str(m['score'])
        print(f"   📋 Found final score: {home_team} {score_str} {away_team}")

        parts = dash_variants.sub('-', score_str).split('-')
        if len(parts) != 2:
            print(f"   ⚠️ Invalid score format: {score_str}")
            continue

        # Only the integer conversion can fail here, so only that is guarded;
        # unrelated errors are no longer mislabelled as score-parsing problems.
        try:
            home_goals, away_goals = map(int, parts)
        except ValueError as e:
            print(f"   ⚠️ Error parsing score '{score_str}': {e}")
            continue

        lagging_lost = (
            (home_team == lagging and home_goals < away_goals) or
            (away_team == lagging and away_goals < home_goals)
        )

        if not lagging_lost:
            print("   ✅ Lagging team did NOT lose — suspense remains 1")
            continue

        same_observation = (
            (df['year'] == year) &
            (df['stage'] == stage) &
            (df['goal_minute'] == minute) &
            (df['lagging_team'] == lagging) &
            (df['leading_team'] == leading)
        )
        df.loc[same_observation, 'suspense'] = 0
        updated_count += 1
        print("   ❌ Lagging team LOST — suspense set to 0")

    print(f"\n✅ Done. Suspense set to 0 for {updated_count} observations where lagging team lost.")
    return df


In [31]:
def update_suspense_draw_lots(final_composition_changes_df):
    """
    Set ``suspense`` to 1 on additional head-to-head rows that are still at 0.

    The frame passed in is modified in place and the same object is returned.
    A missing ``suspense`` column is created with 0; missing values in an existing
    one are replaced by 0. Only rows with ``h2h == 1``, ``gls_diff == -2`` and
    ``suspense == 0`` can be switched to 1, and only when one of these rules holds
    for the place selected by ``third_qualify`` (0 -> 3rd, 1 -> 4th):

    * 2-point system (``year <= 1992``): ``pts_diff == -2``, last-game points
      different from 2 and last-game goal difference of at least -1.
    * 3-point system (``year > 1992``): ``pts_diff == -3`` and either a last-game
      goal difference of exactly -1 (from losing to drawing) or exactly 1
      last-game point (from drawing to winning).
    """
    frame = final_composition_changes_df  # alias only; the caller's frame is edited

    # Make sure 'suspense' exists and has no gaps before comparing it with 0.
    if 'suspense' in frame.columns:
        frame['suspense'] = frame['suspense'].fillna(0)
    else:
        frame['suspense'] = 0

    # Each rule may only turn a 0 into a 1, so evaluating "still 0" once and
    # applying the union of all rules gives the same outcome as applying the
    # rules one after another.
    candidate = (frame['h2h'] == 1) & (frame['suspense'] == 0)
    two_goal_gap = frame['gls_diff'] == -2
    two_point_system = (frame['year'] <= 1992) & (frame['pts_diff'] == -2) & two_goal_gap
    three_point_system = (frame['year'] > 1992) & (frame['pts_diff'] == -3) & two_goal_gap

    def place_rules(tq_value, place):
        """Combine the 2-point and 3-point rules for one lagging place."""
        last_pts = frame[f'{place}_last_game_points']
        last_gd = frame[f'{place}_last_game_goals_diff']
        old_rule = two_point_system & (last_pts != 2) & (last_gd >= -1)
        new_rule = three_point_system & ((last_gd == -1) | (last_pts == 1))
        return (frame['third_qualify'] == tq_value) & (old_rule | new_rule)

    to_flag = candidate & (place_rules(0, '3rd') | place_rules(1, '4th'))
    frame.loc[to_flag, 'suspense'] = 1

    return frame


# probabilities

In [32]:
def integrate_elo_probabilities(goals_df, elo_data):
    """
    Attach the Elo rating of the home and away team to every row of ``goals_df``.

    ``elo_data`` is left-joined twice on ``year`` plus team name (``team_name`` on
    the Elo side, ``home_team`` / ``away_team`` on the goals side). The ratings end
    up in the new columns ``elo_home`` and ``elo_away``; rows without a matching
    rating get a missing value there.

    Note: win/draw/loss probabilities are not computed here; that step is
    currently disabled, so only the two rating columns are added.

    Returns:
        A new DataFrame; the input frames are not modified.
    """
    ratings = elo_data[['year', 'team_name', 'elo']]

    for side in ('home', 'away'):
        merged = goals_df.merge(
            ratings,
            left_on=['year', f'{side}_team'],
            right_on=['year', 'team_name'],
            how='left',
        )
        goals_df = merged.rename(columns={'elo': f'elo_{side}'})

    # The first join leaves a 'team_name' helper column that collides with the
    # key of the second join, so pandas suffixes the pair with _x / _y.
    # Neither is needed once the ratings are attached.
    return goals_df.drop(columns=['team_name_x', 'team_name_y'])