In [1]:
import itertools
import json
import random
from collections import defaultdict
from typing import Dict, Iterator, List, Optional, Tuple

import numpy as np
import pandas as pd

In [2]:
# Load the match list from the Excel sheet. Team ids are read as strings and
# the goal columns as nullable integers ('Int64'); the first sheet column
# becomes the dataframe index.
matches = pd.read_excel(
    "data/matches.xlsx",
    index_col=0,
    dtype={
        "round": int,
        "home_team_id": str,
        "away_team_id": str,
        "home_goals": 'Int64',
        "away_goals": 'Int64',
    },
)
matches

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2023-01-01,08,2,,12,4,
2,1,2023-01-01,04,2,,03,1,
3,1,2023-01-01,02,5,,06,0,
4,1,2023-01-01,05,5,,11,3,
5,1,2023-01-01,01,1,,10,5,
...,...,...,...,...,...,...,...,...
62,11,2023-03-19,11,,,09,,
63,11,2023-03-19,07,,,02,,
64,11,2023-03-19,06,,,01,,
65,11,2023-03-19,08,,,04,,


In [3]:
# Already played games: keep only the rows in which both goal columns are filled in
played_games = matches.loc[matches[['home_goals', 'away_goals']].notna().all(axis=1)]
played_games

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2023-01-01,8,2,,12,4,
2,1,2023-01-01,4,2,,3,1,
3,1,2023-01-01,2,5,,6,0,
4,1,2023-01-01,5,5,,11,3,
5,1,2023-01-01,1,1,,10,5,
6,1,2023-01-01,9,4,,7,2,
7,2,2023-01-08,3,3,,8,5,
8,2,2023-01-08,10,3,,2,2,
9,2,2023-01-08,12,3,,1,2,
10,2,2023-01-08,7,5,,5,3,


In [4]:
# Unplayed games: keep only the rows in which neither goal column has a value yet
unplayed_games = matches.loc[matches[['home_goals', 'away_goals']].isna().all(axis=1)]
unplayed_games

Unnamed: 0_level_0,round,date,home_team_id,home_goals,home_shootout_goals,away_team_id,away_goals,away_shootout_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
55,10,2023-03-12,1,,,7,,
56,10,2023-03-12,2,,,11,,
57,10,2023-03-12,4,,,12,,
58,10,2023-03-12,9,,,3,,
59,10,2023-03-12,10,,,6,,
60,10,2023-03-12,5,,,8,,
61,11,2023-03-19,12,,,10,,
62,11,2023-03-19,11,,,9,,
63,11,2023-03-19,7,,,2,,
64,11,2023-03-19,6,,,1,,


In [5]:
def calculate_mean_goals(played_games: pd.DataFrame, team_goals: str) -> float:
    """
    Calculates the mean goals scored by the winning/losing team per match.

    For every match the "winning" goals are the larger of 'home_goals' and 'away_goals' and the "losing" goals are
    the smaller of the two (both values coincide when the regular score is tied).

    Parameters:
    -----------
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_goals' and 'away_goals'.
        
    team_goals : str
        A string indicating whether to calculate the mean goals of winning or losing teams. Must be 'winning' or 'losing'.


    Returns:
    --------
    float
        The mean of the desired type of goals.


    Raises:
    -------
    ValueError
        If team_goals is neither 'winning' nor 'losing', or if played_games has no rows (the mean is undefined).
    """
    # Determine which element-wise selector to use based on the input string
    if team_goals == 'winning':
        select_goals = np.maximum
    elif team_goals == 'losing':
        select_goals = np.minimum
    else:
        raise ValueError("Invalid goal type, must be 'winning' or 'losing'")

    # Fail with an explicit message instead of a bare division by zero
    if len(played_games) == 0:
        raise ValueError("Cannot calculate mean goals: played_games is empty")

    # Work on plain float arrays so both nullable ('Int64') and regular integer columns are accepted
    home_goals = played_games['home_goals'].astype(float).to_numpy()
    away_goals = played_games['away_goals'].astype(float).to_numpy()

    # Calculate the mean over all matches in a single vectorized pass
    return float(select_goals(home_goals, away_goals).mean())

In [6]:
def head_to_head_winner(played_games: pd.DataFrame, team1: str, team2: str) -> Optional[str]:
    """
    Determines the winner of a head-to-head match between two teams in a given played games dataframe.

    Only the first row (in dataframe order) in which the two teams face each other is evaluated. The home team is
    the winner when it scored more goals or more shootout goals than the away team; in every other case the away
    team is returned. Missing shootout goals count as 0.

    Parameters:
    -----------
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.
        
    team1 : str
        The id of the first team.
        
    team2 : str
        The id of the second team.


    Returns:
    --------
    Optional[str]
        Returns the id of the winning team if the match between the two teams has been played. If the two teams haven't 
        played each other, the function returns None.
    """
    # Filter the dataframe to find the match with the given team ids (in either home/away order)
    home_ids = played_games['home_team_id']
    away_ids = played_games['away_team_id']
    head_to_head = played_games[
        ((home_ids == team1) & (away_ids == team2)) |
        ((home_ids == team2) & (away_ids == team1))]

    # If the match hasn't been played return None
    if head_to_head.empty:
        return None

    def first_value(column: str):
        # Positional access: label-based lookups return a Series (and break the comparisons below)
        # when the index contains duplicated labels, e.g. after concatenating two dataframes.
        return head_to_head[column].iloc[0]

    # Get the relevant values from the head to head match
    home_goals = first_value('home_goals')
    away_goals = first_value('away_goals')
    home_shootout_goals = first_value('home_shootout_goals')
    away_shootout_goals = first_value('away_shootout_goals')

    # A missing shootout value (NaN, None or pd.NA) means no shootout goals were recorded
    if pd.isna(home_shootout_goals):
        home_shootout_goals = 0
    if pd.isna(away_shootout_goals):
        away_shootout_goals = 0

    # Return the winner of the head to head based on the goals scored and shootout goals (if applicable)
    if home_goals > away_goals or home_shootout_goals > away_shootout_goals:
        return first_value('home_team_id')
    return first_value('away_team_id')

In [7]:
def swap_team_positions(classification: List[Tuple[str, Dict[str, int]]], team1: str, team2: str) -> List[Tuple[str, Dict[str, int]]]:
    """
    Exchanges the places of two teams inside a classification list.

    The list is modified in place and the very same list object is returned.

    Parameters:
    -----------
    classification : List[Tuple[str, Dict[str, int]]]
        The current classification as a list of (team_id, stats) tuples, for example:
        [
            (team_id_1, {'W': number_of_wins_1, 'L': number_of_losses_1, 'GF': goals_for_1, 'GA': goals_against_1, 'GD': goal_difference_1}),
            (team_id_2, {'W': number_of_wins_2, 'L': number_of_losses_2, 'GF': goals_for_2, 'GA': goals_against_2, 'GD': goal_difference_2}),
            ...
        ]

    team1 : str
        The id of the first team.

    team2 : str
        The id of the second team.


    Returns:
    --------
    List[Tuple[str, Dict[str, int]]]
        The given classification list, with the entries of the two teams exchanged.
    """
    # Collect the team ids in table order so each team's position can be looked up
    team_ids = [entry[0] for entry in classification]
    first_pos, second_pos = team_ids.index(team1), team_ids.index(team2)

    # Exchange the two entries through a temporary reference
    first_entry = classification[first_pos]
    classification[first_pos] = classification[second_pos]
    classification[second_pos] = first_entry

    return classification

In [8]:
def sort_classification(classification: List[Tuple[str, Dict[str, int]]], played_games: pd.DataFrame) -> List[Tuple[str, Dict[str, int]]]:
    """
    Sorts a list of team classification results first by the number of wins (`W`) and then by the goal difference (`GD`). 
    If two or more teams have the same number of wins and goal difference, the function resolves the tie by using the 
    head-to-head winner.

    The input list is not modified: sorting produces a new list, and the tie-break swaps are applied to that new list.
    
    Parameters:
    -----------
    classification : List[Tuple[str, Dict[str, int]]]
        A list of tuples containing the current classification of teams. Each tuple contains a team name (str) and its 
        corresponding statistics (Dict[str, int]).
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.
    
    Returns:
    --------
    List[Tuple[str, Dict[str, int]]]
        A new classification list of tuples with the teams sorted.
    """
    # Sort the classification list of tuples by the most wins and then by goal difference
    classification = sorted(classification, key=lambda item: (-item[1]['W'], -item[1]['GD']))

    # In case of draw (same W and GD) use head to head winner to break the tie by checking the teams below team1.
    # NOTE(review): ties between three or more teams are only resolved through these pairwise swaps; confirm that
    # this matches the league's tie-break rules.
    for i, (team1, stats1) in enumerate(classification):
        w1 = stats1['W']
        gd1 = stats1['GD']
        
        for team2, stats2 in classification[i+1:]:
            # Compare the cheap statistics first: the head-to-head lookup filters the whole games dataframe, so it
            # is only worth running for teams that are actually tied with team1.
            if stats2['W'] != w1 or stats2['GD'] != gd1:
                continue
            # team2 is tied with team1 and sits below it: swap the positions if team2 won the head to head
            if head_to_head_winner(played_games, team1, team2) == team2:
                classification = swap_team_positions(classification, team1, team2)
    
    return classification

In [9]:
def build_classification(played_games: pd.DataFrame) -> List[Tuple[str, Dict[str, int]]]:
    """
    Creates the classification table out of a dataframe of played games.

    Every team that appears in the games gets an entry made of its id and a dictionary with the keys 'W' (wins),
    'L' (losses), 'GF' (goals for), 'GA' (goals against) and 'GD' (goal difference). The home team wins a match when
    it has more goals or more shootout goals than the away team (missing shootout goals count as 0); otherwise the
    win goes to the away team.

    Parameters:
    -----------
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.


    Returns:
    --------
    List[Tuple[str, Dict[str, int]]]
        The classification as returned by sort_classification, with the following structure:
        [
            (team_id_1, {'W': number_of_wins_1, 'L': number_of_losses_1, 'GF': goals_for_1, 'GA': goals_against_1, 'GD': goal_difference_1}),
            (team_id_2, {'W': number_of_wins_2, 'L': number_of_losses_2, 'GF': goals_for_2, 'GA': goals_against_2, 'GD': goal_difference_2}),
            ...
        ]
    """
    # Per-team statistics, created with all counters at zero the first time a team is seen
    table = defaultdict(lambda: {'W': 0, 'L': 0, 'GF': 0, 'GA': 0, 'GD': 0})

    for _, row in played_games.iterrows():
        home_goals, away_goals = row['home_goals'], row['away_goals']
        home_shootout = np.nan_to_num(row['home_shootout_goals'], nan=0)
        away_shootout = np.nan_to_num(row['away_shootout_goals'], nan=0)

        # Look up the home team before the away team so that teams enter the table in that order
        home_stats = table[row['home_team_id']]
        away_stats = table[row['away_team_id']]

        # Credit the win and the loss
        home_won = home_goals > away_goals or home_shootout > away_shootout
        if home_won:
            winner_stats, loser_stats = home_stats, away_stats
        else:
            winner_stats, loser_stats = away_stats, home_stats
        winner_stats['W'] += 1
        loser_stats['L'] += 1

        # Goals for/against and goal difference of the home team
        home_stats['GF'] += home_goals
        home_stats['GA'] += away_goals
        home_stats['GD'] += home_goals - away_goals

        # Goals for/against and goal difference of the away team
        away_stats['GF'] += away_goals
        away_stats['GA'] += home_goals
        away_stats['GD'] += away_goals - home_goals

    # Hand the entries over as a list of tuples to be ordered by the league's criteria
    return sort_classification(list(table.items()), played_games)

In [10]:
# Build the current standings from the games played so far and display them
classification = build_classification(played_games)
classification

[('09', {'W': 7, 'L': 2, 'GF': 19, 'GA': 11, 'GD': 8}),
 ('10', {'W': 7, 'L': 2, 'GF': 29, 'GA': 22, 'GD': 7}),
 ('07', {'W': 6, 'L': 3, 'GF': 31, 'GA': 22, 'GD': 9}),
 ('04', {'W': 6, 'L': 3, 'GF': 18, 'GA': 19, 'GD': -1}),
 ('02', {'W': 5, 'L': 4, 'GF': 29, 'GA': 18, 'GD': 11}),
 ('05', {'W': 5, 'L': 4, 'GF': 29, 'GA': 30, 'GD': -1}),
 ('11', {'W': 4, 'L': 5, 'GF': 21, 'GA': 23, 'GD': -2}),
 ('12', {'W': 4, 'L': 5, 'GF': 28, 'GA': 30, 'GD': -2}),
 ('01', {'W': 4, 'L': 5, 'GF': 23, 'GA': 27, 'GD': -4}),
 ('06', {'W': 3, 'L': 6, 'GF': 22, 'GA': 22, 'GD': 0}),
 ('08', {'W': 3, 'L': 6, 'GF': 23, 'GA': 30, 'GD': -7}),
 ('03', {'W': 0, 'L': 9, 'GF': 17, 'GA': 35, 'GD': -18})]

In [11]:
def simulate_match(match_ix: int, simulated_classification: Dict[str, Dict[str, int]], 
                   unplayed_games_tmp: pd.DataFrame, home_team_id: str, away_team_id: str, 
                   match_result: str, wg_mean: float, lg_mean: float) -> None:
    """
    Simulates a match and updates the simulated_classification dictionary and unplayed_games_tmp dataframe with the 
    simulated result of a match between two teams.

    The winner's goals are drawn from a Poisson distribution with mean wg_mean. The loser's goals are drawn from a
    Poisson distribution with mean lg_mean and capped at the winner's goals. When both sides end with the same number
    of goals, the shootout columns of the match are set to 3-0 in favour of the winner.
    
    Parameters:
    -----------
    match_ix : int
        Index of the match in the unplayed_games_tmp dataframe.
        
    simulated_classification : Dict[str, Dict[str, int]]
        A dictionary containing a copy of the current classification of the teams, which will be updated with the 
        simulated result.
    
    unplayed_games_tmp : pd.DataFrame
        A DataFrame containing a copy of the unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 
        'away_goals', 'home_shootout_goals', 'away_shootout_goals', which will be updated with the simulated result.
    
    home_team_id : str
        The id of the home team.
    
    away_team_id : str
        The id of the away team.
    
    match_result : str
        The result of the match. Must be one of 'W' (home team wins) or 'L' (away team wins). Any value other than
        'W' is handled as an away win.
    
    wg_mean: float
        The mean of the goals scored by the winning team.
    
    lg_mean: float
        The mean of the goals scored by the losing team.
        
        
    Returns:
    --------
    None
    """
    # Simulate goals for the winning and losing team following a Poisson distribution and calculate goal difference
    wg = np.random.poisson(wg_mean, 1)[0]
    lg = min(np.random.poisson(lg_mean, 1)[0], wg)
    gd = wg - lg

    # Map the abstract winner/loser onto the home/away side once, so both outcomes share a single update path
    if match_result == 'W':
        winner_id, loser_id = home_team_id, away_team_id
        winner_goals_col, loser_goals_col = 'home_goals', 'away_goals'
        shootout_goals = [3, 0]
    else:
        winner_id, loser_id = away_team_id, home_team_id
        winner_goals_col, loser_goals_col = 'away_goals', 'home_goals'
        shootout_goals = [0, 3]

    # Increment the winner's wins, goals and goal difference in the simulated_classification dictionary
    winner_stats = simulated_classification[winner_id]
    winner_stats['W'] += 1
    winner_stats['GF'] += wg
    winner_stats['GA'] += lg
    winner_stats['GD'] += gd

    # Increment the loser's losses and goals and decrease its goal difference
    loser_stats = simulated_classification[loser_id]
    loser_stats['L'] += 1
    loser_stats['GF'] += lg
    loser_stats['GA'] += wg
    loser_stats['GD'] -= gd

    # Record the simulated score on the correct side of the match in the unplayed_games_tmp dataframe
    unplayed_games_tmp.at[match_ix, winner_goals_col] += wg
    unplayed_games_tmp.at[match_ix, loser_goals_col] += lg

    # If the goal difference is 0, set the shootout goals so the winner can still be identified
    if gd == 0:
        unplayed_games_tmp.loc[match_ix, ['home_shootout_goals', 'away_shootout_goals']] = shootout_goals

In [12]:
def simulate_combination(classification: List[Tuple[str, Dict[str, int]]],
                         unplayed_games: pd.DataFrame,
                         results_combination: Tuple[str, ...],
                         wg_mean: float, lg_mean: float) -> Tuple[List[Tuple[str, Dict[str, int]]], pd.DataFrame]:
    """
    Plays out one combination of results (W/L) for the unplayed games and returns the resulting classification
    together with the simulated games.

    Neither the given classification nor the given unplayed_games dataframe is modified; the simulation works on
    copies of both.

    Parameters:
    -----------
    classification : List[Tuple[str, Dict[str, int]]]
        A list containing the current classification of teams.

    unplayed_games : pd.DataFrame
        Dataframe of unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.

    results_combination : Tuple[str, ...]
        One result per row of unplayed_games, in row order. 
        The results can be 'W' (home team wins) or 'L' (away team wins).

    wg_mean: float
        The mean of the goals scored by the winning team.

    lg_mean: float
        The mean of the goals scored by the losing team.


    Returns:
    --------
    Tuple[List[Tuple[str, Dict[str, int]]], pd.DataFrame]
        The updated (unsorted) classification as a list of (team_id, stats) tuples and the dataframe of simulated
        games, re-indexed from 0 with the original index kept as a column.
    """
    # Work on a per-team copy of the stats so the caller's classification stays untouched
    standings = {}
    for team_id, team_stats in classification:
        standings[team_id] = dict(team_stats)

    # Start every unplayed game at 0-0 and number the rows 0..n-1 to line them up with results_combination
    zeroed_games = unplayed_games.assign(home_goals=0, away_goals=0)
    simulated_games = zeroed_games.reset_index().copy()

    # Take the fixtures up front; simulate_match writes into simulated_games while we iterate
    fixtures = list(zip(simulated_games['home_team_id'], simulated_games['away_team_id']))

    for position, (home_team_id, away_team_id) in enumerate(fixtures):
        simulate_match(position,
                       standings,
                       simulated_games,
                       home_team_id,
                       away_team_id,
                       results_combination[position],
                       wg_mean,
                       lg_mean)

    return list(standings.items()), simulated_games

In [13]:
def simulate_results(classification: List[Tuple[str, Dict[str, int]]],
                     played_games: pd.DataFrame,
                     unplayed_games: pd.DataFrame) -> Iterator[Tuple[str, ...]]:
    """
    Simulates all possible combinations of results for the remaining unplayed games and yields the final classification 
    of teams for each outcome. The amount of possible combinations is 2^len(unplayed_games).

    This is a generator: nothing is computed until it is iterated, and each combination is simulated lazily.
    
    Parameters:
    -----------
    classification : List[Tuple[str, Dict[str, int]]]
        A list of tuples containing the current classification of teams, with the following structure:
        [
            (team_id_1, {'W': number_of_wins_1, 'L': number_of_losses_1, 'GF': goals_for_1, 'GA': goals_against_1, 'GD': goal_difference_1}),
            (team_id_2, {'W': number_of_wins_2, 'L': number_of_losses_2, 'GF': goals_for_2, 'GA': goals_against_2, 'GD': goal_difference_2}),
            ...
        ]
        
    played_games : pd.DataFrame
        Dataframe of played games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.
        
    unplayed_games : pd.DataFrame
        Dataframe of unplayed games with columns 'home_team_id', 'away_team_id', 'home_goals', 'away_goals', 
        'home_shootout_goals', 'away_shootout_goals'.
    
    
    Returns:
    --------
    Iterator[Tuple[str, ...]]
        A generator object that yields tuples representing the final classification of teams for each possible outcome.
        Each tuple contains the team id's ordered by their final position, from the first to the last position.
    """
    # Calculate the mean winning and losing goals
    wg_mean = calculate_mean_goals(played_games, 'winning')
    lg_mean = calculate_mean_goals(played_games, 'losing')

    # Loop through all possible combinations of 'W' and 'L' for the unplayed games
    for results_combination in itertools.product(['W', 'L'], repeat=len(unplayed_games)):
        # Simulate the results of the games for the current combination
        simulated_classification, unplayed_games_tmp = simulate_combination(classification, 
                                                                            unplayed_games, 
                                                                            results_combination,
                                                                            wg_mean, 
                                                                            lg_mean)

        # Put real and simulated games in one dataframe for the head-to-head tie-breaks. The simulated games are
        # re-indexed from 0, so a fresh index is built to avoid repeating labels of the played games.
        all_games = pd.concat([played_games, unplayed_games_tmp], ignore_index=True)

        # Calculate the positions of the teams based on their updated records
        final_classification = sort_classification(simulated_classification, all_games)
        
        # Yield the teams ordered by their final position as a tuple
        yield tuple(team[0] for team in final_classification)

In [14]:
# Seed NumPy's global random generator (the one simulate_match draws the goals from) right before simulating, so
# that re-running this cell, or the whole notebook, reproduces the same counts
SIMULATION_SEED = 2023
np.random.seed(SIMULATION_SEED)

# Initialize a dictionary to keep track of the counts of each position for each team
position_counts = {team[0]: {position: 0 for position in range(1, len(classification)+1)} for team in classification}

# Simulate all possible results and count the positions of each team (positions are 1-based)
for simulation in simulate_results(classification, played_games, unplayed_games):
    for position, team in enumerate(simulation, start=1):
        position_counts[team][position] += 1

In [15]:
# Report, for each team, how many times it ended in every position and the share over all simulated combinations
total_simulations = 2**len(unplayed_games)
for team, counts in position_counts.items():
    print(f'{team}:')
    for position, count in counts.items():
        print(f'  Position {position}: {count} times, {round(count/total_simulations*100, 2)}%')

09:
  Position 1: 1874 times, 45.75%
  Position 2: 1182 times, 28.86%
  Position 3: 687 times, 16.77%
  Position 4: 297 times, 7.25%
  Position 5: 52 times, 1.27%
  Position 6: 4 times, 0.1%
  Position 7: 0 times, 0.0%
  Position 8: 0 times, 0.0%
  Position 9: 0 times, 0.0%
  Position 10: 0 times, 0.0%
  Position 11: 0 times, 0.0%
  Position 12: 0 times, 0.0%
10:
  Position 1: 1488 times, 36.33%
  Position 2: 1382 times, 33.74%
  Position 3: 795 times, 19.41%
  Position 4: 344 times, 8.4%
  Position 5: 80 times, 1.95%
  Position 6: 7 times, 0.17%
  Position 7: 0 times, 0.0%
  Position 8: 0 times, 0.0%
  Position 9: 0 times, 0.0%
  Position 10: 0 times, 0.0%
  Position 11: 0 times, 0.0%
  Position 12: 0 times, 0.0%
07:
  Position 1: 572 times, 13.96%
  Position 2: 857 times, 20.92%
  Position 3: 1066 times, 26.03%
  Position 4: 667 times, 16.28%
  Position 5: 643 times, 15.7%
  Position 6: 240 times, 5.86%
  Position 7: 44 times, 1.07%
  Position 8: 6 times, 0.15%
  Position 9: 1 times,

In [16]:
# Save the position counts as JSON; the context manager closes the file even if serialization fails
with open("results/positions_J9.json", "w") as save_file:
    json.dump(position_counts, save_file)