In [1]:
import itertools
import json
import random
from collections import defaultdict
from typing import Dict, Iterator, Optional, Tuple

import numpy as np
import pandas as pd

In [2]:
# Load every match from the spreadsheet; team ids are read as strings and the first column becomes the index
matches = pd.read_excel(
    "data/matches.xlsx",
    index_col=0,
    dtype={"round": int, "home_team_id": str, "away_team_id": str},
)
matches

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2023-01-01,08,2.0,,12,4.0,
2,1,2023-01-01,04,2.0,,03,1.0,
3,1,2023-01-01,02,5.0,,06,0.0,
4,1,2023-01-01,05,5.0,,11,3.0,
5,1,2023-01-01,01,1.0,,10,5.0,
...,...,...,...,...,...,...,...,...
62,11,2023-03-19,11,,,09,,
63,11,2023-03-19,07,,,02,,
64,11,2023-03-19,06,,,01,,
65,11,2023-03-19,08,,,04,,


In [3]:
# Already played games are the rows where both scores are present; each of them also
# gets the absolute margin between the two scores as `goal_difference`
played_games = (
    matches
    .dropna(subset=['home_goals', 'away_goals'])
    .assign(goal_difference=lambda games: (games["home_goals"] - games["away_goals"]).abs())
)
played_games

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals,goal_difference
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,2023-01-01,8,2.0,,12,4.0,,2.0
2,1,2023-01-01,4,2.0,,3,1.0,,1.0
3,1,2023-01-01,2,5.0,,6,0.0,,5.0
4,1,2023-01-01,5,5.0,,11,3.0,,2.0
5,1,2023-01-01,1,1.0,,10,5.0,,4.0
6,1,2023-01-01,9,4.0,,7,2.0,,2.0
7,2,2023-01-08,3,3.0,,8,5.0,,2.0
8,2,2023-01-08,10,3.0,,2,2.0,,1.0
9,2,2023-01-08,12,3.0,,1,2.0,,1.0
10,2,2023-01-08,7,5.0,,5,3.0,,2.0


In [4]:
# Unplayed games are the rows where neither the home nor the away score has been recorded
unplayed_games = matches.loc[matches[['home_goals', 'away_goals']].isna().all(axis=1)]
unplayed_games

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
55,10,2023-03-12,1,,,7,,
56,10,2023-03-12,2,,,11,,
57,10,2023-03-12,4,,,12,,
58,10,2023-03-12,9,,,3,,
59,10,2023-03-12,10,,,6,,
60,10,2023-03-12,5,,,8,,
61,11,2023-03-19,12,,,10,,
62,11,2023-03-19,11,,,9,,
63,11,2023-03-19,7,,,2,,
64,11,2023-03-19,6,,,1,,


In [5]:
# Number of played games that ended with each goal difference, ordered by goal difference
gd_counts = played_games["goal_difference"].value_counts().sort_index()

# Share of played games per goal difference (the counts divided by their total)
gd_weights = gd_counts.div(gd_counts.sum())
gd_weights

0.0    0.148148
1.0    0.351852
2.0    0.259259
3.0    0.074074
4.0    0.129630
5.0    0.037037
Name: goal_difference, dtype: float64

In [6]:
def head_to_head_winner(played_games: pd.DataFrame, team1: str, team2: str) -> Optional[str]:
    """
    Determines the winner of the head-to-head match between two teams in a given played games dataframe.

    Only the first match found between the two teams (in dataframe order, regardless of which one played
    at home) is considered. Values are read by position, so the result does not depend on the index labels
    of `played_games` being unique.

    Parameters:
    -----------
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.
    team1 : str
        The id of the first team.
    team2 : str
        The id of the second team.

    Returns:
    --------
    Optional[str]
        Returns the id of the winning team if the match between the two teams has been played. If the two teams haven't 
        played each other, the function returns None.
    """
    # Keep only the games between the two teams, whichever of them played at home
    home_ids = played_games['home_team_id']
    away_ids = played_games['away_team_id']
    is_pairing = ((home_ids == team1) & (away_ids == team2)) | ((home_ids == team2) & (away_ids == team1))
    head_to_head = played_games[is_pairing]

    # If the match hasn't been played return None
    if head_to_head.empty:
        return None

    def first_value(column: str):
        # Positional access: label-based `.at` returns a Series instead of a scalar
        # when the index label of the first row is duplicated
        return head_to_head[column].iloc[0]

    home_goals = first_value('home_goals')
    away_goals = first_value('away_goals')
    # A missing shootout score (NaN) counts as 0, so it never decides the match
    home_shootout_goals = np.nan_to_num(first_value('home_shootout_goals'))
    away_shootout_goals = np.nan_to_num(first_value('away_shootout_goals'))

    # The home team wins on goals or on the shootout; in any other case the away team is the winner
    if home_goals > away_goals or home_shootout_goals > away_shootout_goals:
        return first_value('home_team_id')
    return first_value('away_team_id')

In [7]:
def swap_team_positions(classification: Dict[str, Dict[str, int]], team1: str, team2: str) -> Dict[str, Dict[str, int]]:
    """
    Builds a copy of a classification dictionary in which two teams have exchanged places.

    Parameters:
    -----------
    classification : Dict[str, Dict[str, int]]
        A dictionary containing the current classification of teams.
    team1 : str
        The id of the first team.
    team2 : str
        The id of the second team.

    Returns:
    --------
    Dict[str, Dict[str, int]]
        A new classification dictionary with the positions of the two specified teams swapped. The values are
        the same objects as in the input; only the key order differs.
    """
    # Current order of the team ids and the place each of the two teams occupies in it
    ordered_ids = list(classification)
    first_pos = ordered_ids.index(team1)
    second_pos = ordered_ids.index(team2)

    # Exchange the two ids through a temporary variable
    held = ordered_ids[first_pos]
    ordered_ids[first_pos] = ordered_ids[second_pos]
    ordered_ids[second_pos] = held

    # Rebuild the dictionary following the new order
    reordered = {}
    for team_id in ordered_ids:
        reordered[team_id] = classification[team_id]
    return reordered

In [8]:
def sort_classification(classification: Dict[str, Dict[str, int]], played_games: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    """
    Sorts a dictionary of team classification results first by the number of wins (`W`) and then by the goal
    difference (`GD`), both descending. Teams whose whole records are equal are then ordered by their
    head-to-head result: the winner of the match between them is placed above the loser.

    The head-to-head lookup is only performed for teams that are actually tied, and every comparison uses the
    team that currently occupies the position being resolved, so ties between three or more teams are ordered
    consistently whenever the head-to-head results are consistent with each other.

    Parameters:
    -----------
    classification : Dict[str, Dict[str, int]]
        A dictionary containing the current classification of teams.
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.

    Returns:
    --------
    Dict[str, Dict[str, int]]
        A new classification dictionary with the teams sorted. The input dictionary is not modified and the
        per-team records are the same objects as in the input.
    """
    # Sort the teams by the most wins and then by goal difference (stable for equal records)
    ranked = sorted(classification.items(), key=lambda item: (-item[1]['W'], -item[1]['GD']))
    order = [team for team, _ in ranked]

    # Resolve ties position by position: the team currently at `upper` is compared with every team below it
    for upper in range(len(order)):
        for lower in range(upper + 1, len(order)):
            # Read both teams from the current order so that a swap made earlier in this pass is honoured
            team1, team2 = order[upper], order[lower]
            # `and` short-circuits, so the dataframe lookup only happens for teams with identical records
            if (classification[team1] == classification[team2]
                    and head_to_head_winner(played_games, team1, team2) == team2):
                order[upper], order[lower] = team2, team1

    return {team: classification[team] for team in order}

In [9]:
def build_classification(played_games: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    """
    Builds the classification table that results from a dataframe of played games.

    Every game adds one win to its winner, one loss to its loser and the score margin (positive or negative)
    to the goal difference of both teams. The home team wins when it scored more goals or more shootout
    goals than the away team; otherwise the win goes to the away team. The finished table is ordered with
    `sort_classification`.

    Parameters:
    -----------
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.

    Returns:
    --------
    Dict[str, Dict[str, int]]
        A dictionary containing the sorted classification of teams, with the following structure:
        {
            team_id_1: {'W': number_of_wins_1, 'L': number_of_losses_1, 'GD': goal_difference_1},
            team_id_2: {'W': number_of_wins_2, 'L': number_of_losses_2, 'GD': goal_difference_2},
            ...
        }
    """
    # Teams get an all-zero record the first time they are seen
    table = defaultdict(lambda: {'W': 0, 'L': 0, 'GD': 0})

    for _, row in played_games.iterrows():
        home, away = row['home_team_id'], row['away_team_id']
        goals_home, goals_away = row['home_goals'], row['away_goals']
        # Missing shootout scores (NaN) are treated as 0
        shootout_home = np.nan_to_num(row['home_shootout_goals'])
        shootout_away = np.nan_to_num(row['away_shootout_goals'])

        # Decide which counter each side increments; the home record is always touched first
        home_won = goals_home > goals_away or shootout_home > shootout_away
        home_key, away_key = ('W', 'L') if home_won else ('L', 'W')
        table[home][home_key] += 1
        table[away][away_key] += 1

        # Each side accumulates the margin seen from its own point of view
        table[home]['GD'] += goals_home - goals_away
        table[away]['GD'] += goals_away - goals_home

    # Order the table by the league's criteria
    return sort_classification(table, played_games)

In [10]:
# Current league table, computed from the games played so far
classification = build_classification(played_games=played_games)
classification

{'09': {'W': 7, 'L': 2, 'GD': 8.0},
 '10': {'W': 7, 'L': 2, 'GD': 7.0},
 '07': {'W': 6, 'L': 3, 'GD': 9.0},
 '04': {'W': 6, 'L': 3, 'GD': -1.0},
 '02': {'W': 5, 'L': 4, 'GD': 11.0},
 '05': {'W': 5, 'L': 4, 'GD': -1.0},
 '11': {'W': 4, 'L': 5, 'GD': -2.0},
 '12': {'W': 4, 'L': 5, 'GD': -2.0},
 '01': {'W': 4, 'L': 5, 'GD': -4.0},
 '06': {'W': 3, 'L': 6, 'GD': 0.0},
 '08': {'W': 3, 'L': 6, 'GD': -7.0},
 '03': {'W': 0, 'L': 9, 'GD': -18.0}}

In [11]:
def update_simulated_match(match_ix: int, simulated_classification: Dict[str, Dict[str, int]], 
                           unplayed_games_tmp: pd.DataFrame, home_team_id: str, away_team_id: str, 
                           match_result: str, gd: int) -> None:
    """
    Records one simulated result, in place, in both the simulated classification and the dataframe of
    simulated matches.

    The winner gets one more win and `gd` added to its goal difference; the loser gets one more loss and `gd`
    subtracted. In the dataframe the winner's goals are increased by `gd`, and when `gd` is 0 the shootout
    columns are set to 3-0 in favour of the winner.

    Parameters:
    -----------
    match_ix : int
        Index of the match in the unplayed_games_tmp dataframe.
    simulated_classification : Dict[str, Dict[str, int]]
        A dictionary containing a copy of the current classification of the teams, which will be updated with the 
        simulated result.
    unplayed_games_tmp : pd.DataFrame
        A DataFrame containing a copy of the unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 
        'away_goals', 'home_shootout_goals', 'away_shootout_goals', which will be updated with the simulated result.
    home_team_id : str
        The id of the home team.
    away_team_id : str
        The id of the away team.
    match_result : str
        The result of the match. 'W' means the home team wins; any other value is handled as an away win ('L').
    gd : int
        The goal difference of the match.

    Returns:
    --------
    None
    """
    home_won = match_result == 'W'

    # Update the home record first and the away record second
    for team_id, is_winner in ((home_team_id, home_won), (away_team_id, not home_won)):
        record = simulated_classification[team_id]
        if is_winner:
            record['W'] += 1
            record['GD'] += gd
        else:
            record['L'] += 1
            record['GD'] -= gd

    # Only the winner's goals grow, by exactly the simulated margin
    scoring_column = 'home_goals' if home_won else 'away_goals'
    unplayed_games_tmp.at[match_ix, scoring_column] += gd

    # A level score is decided by a shootout that the winner takes 3-0
    if gd == 0:
        shootout_score = [3, 0] if home_won else [0, 3]
        unplayed_games_tmp.loc[match_ix, ['home_shootout_goals', 'away_shootout_goals']] = shootout_score

In [12]:
def simulate_combination(classification: Dict[str, Dict[str, int]],
                         unplayed_games: pd.DataFrame,
                         results_combination: Tuple[str, ...],
                         gd_weights: pd.Series) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame]:
    """
    Simulates one of the remaining possible combinations of match results (W/L) and returns the updated simulated_classification
    and unplayed_games_tmp for the given combination.

    The goal difference of every simulated match is drawn at random from the index values of `gd_weights`,
    using the series values as weights, so goal differences that were never observed are never drawn even
    when the observed values have gaps.

    Parameters:
    -----------
    classification : Dict[str, Dict[str, int]]
        A dictionary containing the current classification of teams. It is not modified.

    unplayed_games : pd.DataFrame
        Dataframe of unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'. It is not modified.

    results_combination : Tuple[str, ...]
        A tuple containing the simulated results of each match in the unplayed_games dataframe, in row order. 
        The results can be 'W' (home team wins) or 'L' (away team wins).

    gd_weights : pd.Series
        A pandas series containing the weights for the different possible goal differences.
        The index of the series represents the goal difference, and the values represent the weight for that goal difference.

    Returns:
    --------
    Tuple[Dict[str, Dict[str, int]], pd.DataFrame]
        A tuple containing the updated simulated classification and unplayed_games_tmp for the given combination. 
        unplayed_games_tmp has a fresh 0..n-1 index; the original index is kept as a regular column.
    """
    # Copy every team record so the caller's classification is left untouched
    simulated_classification = {team: {key: record[key] for key in ('W', 'L', 'GD')}
                                for team, record in classification.items()}

    # Working copy of the fixtures with 0-0 scores; the reset index lines up with results_combination
    unplayed_games_tmp = unplayed_games.assign(home_goals=0, away_goals=0).reset_index()

    # Candidate goal differences come from the index of gd_weights (not from its positions), as plain ints
    goal_differences = [int(value) for value in gd_weights.index]
    weights = gd_weights.tolist()

    # Simulate each match of the combination
    for match_ix, match in unplayed_games_tmp.iterrows():
        # Get the simulated result for the current match
        match_result = results_combination[match_ix]

        # Randomly generate a goal difference for the current match using the gd_weights
        gd = random.choices(goal_differences, weights=weights)[0]

        # Update the simulated_classification and unplayed_games_tmp dataframes
        update_simulated_match(match_ix,
                               simulated_classification,
                               unplayed_games_tmp,
                               match['home_team_id'],
                               match['away_team_id'],
                               match_result,
                               gd)

    # Return the updated simulated_classification and unplayed_games_tmp dataframes
    return simulated_classification, unplayed_games_tmp

In [13]:
def simulate_results(classification: Dict[str, Dict[str, int]], 
                     played_games: pd.DataFrame,
                     unplayed_games: pd.DataFrame,
                     gd_weights: pd.Series) -> Iterator[Tuple[str, ...]]:
    """
    Simulates all possible combinations of results for the remaining unplayed games and yields the final classification 
    of teams for each outcome. The amount of possible combinations is 2^len(unplayed_games).

    Parameters:
    -----------
    classification : Dict[str, Dict[str, int]]
        A dictionary containing the current classification of teams, with the following structure:
        {
            team_id_1: {'W': number_of_wins_1, 'L': number_of_losses_1, 'GD': goal_difference_1},
            team_id_2: {'W': number_of_wins_2, 'L': number_of_losses_2, 'GD': goal_difference_2},
            ...
        }

    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.

    unplayed_games : pd.DataFrame
        Dataframe of unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.

    gd_weights : pd.Series
        A pandas series containing the weights for the different possible goal differences.
        The index of the series represents the goal difference, and the values represent the weight for that goal difference.

    Returns:
    --------
    Iterator[Tuple[str, ...]]
        A generator object that yields tuples representing the final classification of teams for each possible outcome.
        Each tuple contains the team id's ordered by their final position, from the first to the last position.
    """
    # Loop through all possible combinations of 'W' and 'L' for the unplayed games
    for results_combination in itertools.product(['W', 'L'], repeat=len(unplayed_games)):
        # Simulate the results of the games for the current combination
        simulated_classification, unplayed_games_tmp = simulate_combination(classification,
                                                                            unplayed_games,
                                                                            results_combination,
                                                                            gd_weights)

        # Real and simulated games together decide the head-to-head tie-breaks. The simulated frame has a
        # fresh 0..n-1 index that can repeat labels of the played games, so the combined frame is renumbered.
        all_games = pd.concat([played_games, unplayed_games_tmp], ignore_index=True)

        # Calculate the positions of the teams based on their updated records
        positions = sort_classification(simulated_classification, all_games)

        # Yield the teams ordered by its final position as a tuple
        yield tuple(positions)

In [14]:
# Seed the `random` module right before the simulation: the goal differences are drawn with
# random.choices, so without a fixed seed the counts below change on every run of the notebook
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Initialize a dictionary to keep track of the counts of each position for each team
position_counts = {team: {position: 0 for position in range(1, len(classification) + 1)} for team in classification}

# Simulate all possible results and count the positions of each team (positions are 1-based)
for positions in simulate_results(classification, played_games, unplayed_games, gd_weights):
    for final_position, team in enumerate(positions, start=1):
        position_counts[team][final_position] += 1

In [15]:
# For every team, list how many simulated outcomes left it in each position, and that
# count as a percentage of the 2**len(unplayed_games) simulated combinations
for team, team_counts in position_counts.items():
    print(f'{team}:')
    for position, count in team_counts.items():
        print(f'  Position {position}: {count} times, {round(count/2**len(unplayed_games)*100, 2)}%')

09:
  Position 1: 1912 times, 46.68%
  Position 2: 1156 times, 28.22%
  Position 3: 716 times, 17.48%
  Position 4: 263 times, 6.42%
  Position 5: 46 times, 1.12%
  Position 6: 3 times, 0.07%
  Position 7: 0 times, 0.0%
  Position 8: 0 times, 0.0%
  Position 9: 0 times, 0.0%
  Position 10: 0 times, 0.0%
  Position 11: 0 times, 0.0%
  Position 12: 0 times, 0.0%
10:
  Position 1: 1462 times, 35.69%
  Position 2: 1409 times, 34.4%
  Position 3: 854 times, 20.85%
  Position 4: 305 times, 7.45%
  Position 5: 61 times, 1.49%
  Position 6: 5 times, 0.12%
  Position 7: 0 times, 0.0%
  Position 8: 0 times, 0.0%
  Position 9: 0 times, 0.0%
  Position 10: 0 times, 0.0%
  Position 11: 0 times, 0.0%
  Position 12: 0 times, 0.0%
07:
  Position 1: 621 times, 15.16%
  Position 2: 866 times, 21.14%
  Position 3: 1031 times, 25.17%
  Position 4: 666 times, 16.26%
  Position 5: 687 times, 16.77%
  Position 6: 216 times, 5.27%
  Position 7: 9 times, 0.22%
  Position 8: 0 times, 0.0%
  Position 9: 0 times,

In [16]:
# Save the position counts as JSON; the `with` block closes the file even if json.dump raises
with open("results/positions_J9.json", "w") as save_file:
    json.dump(position_counts, save_file)