In [1]:
import numpy as np
from random import randint, shuffle
import time
import datetime
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from plotly.validators.scatter.marker import SymbolValidator

from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

import warnings
warnings.simplefilter("ignore", UserWarning)

## 1 - Simulated Data

### Random data functions

In [2]:
#### SIMULATED DATA
# 1 - list of users/players

def players_ids_list(n_players) -> list[list[int]]:

    """
    Draw disjoint groups of random player ids, one group per team.

    **Parameters**
    -------------
    n_players: sequence of int with the number of players of each team, in team order.
        A count of 0 yields an empty list for that team; an empty sequence yields [].

    **Returns**
    -------------
    A list with one list of ids per entry of n_players. Ids are 5-digit integers
    (10000-99999) and no id is repeated, neither inside a team nor across teams.

    **Raises**
    -------------
    ValueError: if a count is negative, or if more ids are requested than the pool holds
        (previously this silently produced teams that were too short).
    """
    # every 5-digit id, in random order
    player_ids = list(range(10000, 100000))
    shuffle(player_ids)

    if any(n < 0 for n in n_players):
        raise ValueError("n_players must not contain negative counts")
    if sum(n_players) > len(player_ids):
        raise ValueError(
            f"requested {sum(n_players)} players but only {len(player_ids)} ids are available")

    # consecutive, non-overlapping slices of the shuffled pool: one slice per team
    all_players_list = []
    n_first = 0
    for n in n_players:
        n_last = n_first + n
        all_players_list.append(player_ids[n_first : n_last])
        n_first = n_last

    return all_players_list

#----- create individual player
def player_score(player_id = None, date = None,
                 events = None) -> pd.DataFrame:
    """
    Creates a DataFrame with a single random player and scores for each event (game)

    **Parameters**
    -------------
    player_id: player unique identifier (a random 5-digit id drawn on each call by default)
    date: date in YYYY-MM-DD (today by default, resolved on each call)
    events: list of str of simultaneous activities (['A', 'B'] by default)

    **Returns**
    -------------
    A DataFrame with one row per event and the columns 'player_id' (as str), 'event_date',
    'event_game' and 'score' (random int between 0 and 3).
    """
    # Defaults are resolved here rather than in the signature: expressions in the signature
    # are evaluated only once, when the function is defined, which froze the id and the date.
    if player_id is None:
        player_id = randint(10000, 99999)
    if date is None:
        date = datetime.date.today().isoformat()
    if events is None:
        events = ['A', 'B']

    n_events = len(events)

    player = pd.DataFrame(
        {
            'player_id'  : [f"{player_id}"] * n_events,
            'event_date' : [date] * n_events,
            'event_game' : list(events),
            'score'      : [randint(0, 3) for _ in range(n_events)]
        }
    )

    return player

#----- create a team
def team_players(player_id = None, date = None,
                 events = None) -> pd.DataFrame:

    """
    Creates a team score data with a given list of players with random scores for each event.

    **Parameters**
    -------------
    player_id: list of player identifiers (a single random 5-digit id, drawn on each call, by default)
    date: date in YYYY-MM-DD (today by default, resolved on each call)
    events: list of str of simultaneous activities (['A', 'B'] by default)

    **Returns**
    -------------
    A DataFrame with one row per player and event ('player_id', 'event_date', 'event_game',
    'score'), indexed 0..n-1. An empty player list returns an empty DataFrame with those columns.
    """
    # Defaults are resolved on every call; signature defaults are evaluated only once,
    # at definition time, which froze the random id and the date.
    if player_id is None:
        player_id = [randint(10000, 99999)]
    if date is None:
        date = datetime.date.today().isoformat()
    if events is None:
        events = ['A', 'B']

    # empty roster: nothing to simulate, keep the schema so callers can still select columns
    if len(player_id) == 0:
        return pd.DataFrame(columns=['player_id', 'event_date', 'event_game', 'score'])

    # build every player's frame first and concatenate once (no repeated re-copying)
    player_frames = [player_score(player, date, events) for player in player_id]
    team = pd.concat(player_frames, ignore_index = True)

    return team

#----- create a team with random scores
def team_scores(player_id = None, date = None,
                events = None) -> pd.DataFrame:
    """
    Creates a team score data with a given list of players with random scores
    for each event, and assigns the score description.

    **Parameters**
    -------------
    player_id: list of player identifiers (a single random 5-digit id, drawn on each call, by default)
    date: date in YYYY-MM-DD (today by default, resolved on each call)
    events: list of str of simultaneous activities (['A', 'B'] by default)

    **Returns**
    -------------
    The DataFrame produced by team_players plus a 'medal' column: 1 -> 'bronze',
    2 -> 'silver', 3 -> 'gold', any other score -> 'not played'.
    """
    # Defaults are resolved on every call; signature defaults are evaluated only once,
    # at definition time, which froze the random id and the date.
    if player_id is None:
        player_id = [randint(10000, 99999)]
    if date is None:
        date = datetime.date.today().isoformat()
    if events is None:
        events = ['A', 'B']

    team = team_players(player_id, date, events)

    medal_by_score = {1: 'bronze', 2: 'silver', 3: 'gold'}

    # a plain list is assigned by position, so the result does not depend on the frame's index
    team['medal'] = [medal_by_score.get(score, 'not played') for score in team['score']]

    return team

### Main data simulation function

In [3]:
# Runs everything needed for section 1 (simulated data).
# Example of the user inputs the Streamlit app would provide.

# Players per team, in team order; alternative scenarios are kept commented out.
#input_n_players = [3, 0, 0, 0]
input_n_players = [3, 6, 4, 0]
#input_n_players = [3, 6, 4, 1]

events_input = ['A', 'B', 'C']
team_names_input = ['summer', 'autumn', 'winter', 'spring']

# This call must run before the pipeline (in the sidebar).
# The first id list becomes first_team; the remaining lists are collected in teams.
first_team, *teams= players_ids_list(input_n_players)

In [4]:
# TODO: add date = date_input
def data_sim() -> dict:
    """
    Simulated data from user defined parameters. Returns a dict with all needed variables for
    the pipeline (individual teams raw data, bool list for iteration, full df with teams raw data
    and aggregated teams data).

    Reads the module-level inputs first_team, teams, events_input and team_names_input.

    **Returns**
    -------------
    dict with the keys:
    - 'teams_raw': [first team DataFrame, list with one DataFrame per entry of teams
      (an empty DataFrame for teams that were not simulated)]
    - 'bool_list': one bool per entry of teams, True when that team was simulated
    - 'all_teams_disagg': every simulated team concatenated, one row per player and event
    - 'all_teams_agg': score summed by event_date, event_game, team and medal
    """

    df_first_team = team_scores(player_id = first_team,
                                #date = date_input,
                                events=events_input)
    df_first_team['team'] = team_names_input[0]

    other_team_names = team_names_input[1:]
    # one slot per remaining team, however many there are (this used to be fixed at 3 slots)
    df_teams = []
    bool_list = []

    for l_idx, roster in enumerate(teams):
        # NOTE(review): rosters with a single player are skipped as well as empty ones - confirm
        # whether "> 0" was intended (the first team is simulated whatever its size).
        is_simulated = len(roster) > 1
        if is_simulated:
            df_team = team_scores(player_id = roster,
                                  #date = date_input,
                                  events=events_input)
            df_team['team'] = other_team_names[l_idx]
        else:
            df_team = pd.DataFrame()
        df_teams.append(df_team)
        bool_list.append(is_simulated)

    # concatenate all simulated teams, first team on top
    simulated_frames = [df_first_team] + [df_team for df_team, flag in zip(df_teams, bool_list) if flag]
    df_teams_disagg = pd.concat(simulated_frames, ignore_index=True)

    # teams aggregated data: only 'score' is summed (player_id is text and must not be added up)
    df_teams_agg = df_teams_disagg.groupby(['event_date', 'event_game', 'team', 'medal'])['score']\
        .sum().reset_index()

    # full output
    data_sim_output = {
        'teams_raw' : [df_first_team, df_teams],
        'bool_list' : bool_list,
        'all_teams_disagg' :df_teams_disagg,
        'all_teams_agg' : df_teams_agg
    }

    return data_sim_output

In [5]:
# Build the simulated dataset and keep the resulting dict.
simulated_data = data_sim()

# Show which entries the dict exposes.
print(simulated_data.keys())

dict_keys(['teams_raw', 'bool_list', 'all_teams_disagg', 'all_teams_agg'])


## 2 - Pipeline

### Streamlit output and global required variables

In [6]:
# Example of the user inputs the Streamlit app would provide.

# Players per team, in team order; alternative scenarios are kept commented out.
#input_n_players = [3, 0, 0, 0]
input_n_players = [3, 6, 4, 0]
#input_n_players = [3, 6, 4, 1]

events_input = ['A', 'B', 'C']
team_names_input = ['summer', 'autumn', 'winter', 'spring']

# This call must run before the pipeline (in the sidebar).
# players_ids_list reshuffles the id pool on every call, so new player ids are drawn here.
first_team, *teams= players_ids_list(input_n_players)

# Streamlit output: simulated dataset built from the inputs above.
simulated_data = data_sim()

### Pipeline functions (directly from modules)

In [7]:
# Unpack the simulation output into the variables passed to the pipeline call.
df_first_team = simulated_data['teams_raw'][0]    # first team raw DataFrame
df_teams = simulated_data['teams_raw'][1]         # list of DataFrames of the other teams
bool_list = simulated_data['bool_list']           # flags of the other teams that were simulated
df_teams_disagg = simulated_data['all_teams_disagg']
df_teams_agg = simulated_data['all_teams_agg']

In [8]:
def pipeline(df_agg_data: pd.DataFrame, df_disagg_data: pd.DataFrame,
             df_first_team_data: pd.DataFrame, df_teams_l_data: list[pd.DataFrame] = None,
             b_l: list[bool] = None
             ) -> pd.DataFrame:

    """
    Pipeline Function
    ----------
    Generates all metric columns in the aggregated DataFrame from simulated data.

    Parameters
    ----------
    - df_agg_data: aggregated scores with exactly five columns, in this order: event date,
      event game, team, medal and summed score. It is not modified: the pipeline works on a
      copy, so the same frame can be passed again.
    - df_disagg_data: one row per player and event, with 'team' and 'event_game' columns. Teams
      must first appear in the same order as [first team] + flagged teams of df_teams_l_data.
    - df_first_team_data: raw rows of the first team ('player_id', 'event_game', 'medal').
    - df_teams_l_data: raw DataFrames of the other teams (None means no other teams).
    - b_l: flags parallel to df_teams_l_data; only teams flagged True are used.

    Output
    ----------
    A transformed pandas DataFrame with relative and absolute metrics and score methods, with
    the added columns medal_abs_frequence, team_relative_size, team_participation_ratio,
    medal_rel_frequence, perform_score, acc_w_score_total and perform_score_total.
    Returns None when df_agg_data or df_disagg_data is empty.
    """

    # nothing to transform
    if len(df_agg_data) == 0 or len(df_disagg_data) == 0:
        return None

    df_teams_l_data = [] if df_teams_l_data is None else df_teams_l_data
    b_l = [] if b_l is None else b_l

    # work on a copy: renaming and sorting the caller's frame made a second run fail
    df_agg_data = df_agg_data.copy()

    medal_weights = {'bronze': 1, 'silver': 2, 'gold': 3}
    medal_order = ['not played', 'bronze', 'silver', 'gold']

    # team labels in order of first appearance, and the raw frames in that same order
    team_names = list(df_disagg_data['team'].unique())
    active_teams = [df_first_team_data] + [team for team, flag in zip(df_teams_l_data, b_l) if flag]
    # active players in each team
    team_sizes = [len(team['player_id'].unique()) for team in active_teams]

    #---------- participation ratio for all purposes function
    def participation_ratio(total_players: int, group_count: int) -> float:
        """
        *Function*
        -
        Simple user ratio, as a percentage rounded to 2 decimals. Used for relative comparison.
        Returns None when group_count is larger than total_players.

        *Params*
        -
        total_players: can be: base number of users, total active users, or team users.
        group_count: can be: team users or active users.
        """
        if total_players >= group_count:
            return round((group_count / total_players) * 100, 2)
        return None

    # P: absolute medal count
    def abs_medal_count(df: pd.DataFrame) -> pd.DataFrame:
        """Renames the score column to acc_w_score and adds the absolute medal count column."""
        df.columns = ['event_date', 'event_game', 'team', 'medal', 'acc_w_score']
        # accumulated score divided by the medal's points gives the number of medals;
        # labels without points ('not played') count as 0
        weights = df['medal'].astype(object).map(medal_weights).astype(float)
        df['medal_abs_frequence'] = (df['acc_w_score'] / weights).fillna(0).astype('int64')
        return df

    # P: set categorical type on columns for category order
    def agg_categories(df: pd.DataFrame) -> pd.DataFrame:
        """Sorts by event, team (in order of appearance) and medal label, then makes team and medal ordered categoricals."""
        df['team'] = pd.Categorical(values = list(df['team']),
                                    categories = team_names,
                                    ordered = True)
        # medal is still plain text here, so it sorts alphabetically
        df = df.sort_values(by=['event_game', 'team', 'medal'], ignore_index=True)
        df['medal'] = pd.Categorical(df['medal'], ordered=True, categories=medal_order)
        return df

    # P: adds teams relative size column from total active players
    def add_team_rel_size(df: pd.DataFrame) -> pd.DataFrame:
        """Adds each team's share of all active players of the day."""
        total_players = sum(team_sizes)
        df_aux = pd.DataFrame({
            'team' : team_names,
            'team_relative_size' : [participation_ratio(total_players, size) for size in team_sizes]})
        return df.merge(right=df_aux, on='team', how='inner')

    #---------- participants from each team in a given event function
    def team_event_participation(event: str, filter: str) -> list:
        """Share of each team's players whose medal in the given event differs from filter, in team order."""
        # a single combined mask replaces df[mask][mask], which made pandas reindex and warn
        participants = [int(((team['event_game'] == event) & (team['medal'] != filter)).sum())
                        for team in active_teams]
        return [participation_ratio(size, played) for size, played in zip(team_sizes, participants)]

    # P: adds participation ratio column
    def add_team_event_participation(df: pd.DataFrame) -> pd.DataFrame:
        """Adds the participation ratio of every team in every event."""
        events = list(df_disagg_data['event_game'].unique())
        ratios = {event: team_event_participation(event, 'not played') for event in events}

        pairs = [(t, team, event) for t, team in enumerate(team_names) for event in events]
        df_aux = pd.DataFrame({
            'team' : [team for _, team, _ in pairs],
            'event_game' : [event for _, _, event in pairs],
            'team_participation_ratio': [ratios[event][t] for t, _, event in pairs]})

        return df.merge(right=df_aux, on=['team', 'event_game'], how='inner')

    # P: add medal relative count from each team player counts
    def add_medal_rel_frequence(df: pd.DataFrame) -> pd.DataFrame:
        """Adds the medal count as a percentage of the team size."""
        size_by_team = dict(zip(team_names, team_sizes))
        # looked up row by row from the row's own team, so it does not matter how the
        # previous merges ordered the rows
        df['medal_rel_frequence'] = [
            round((float(freq) / size_by_team[team]) * 100, 2)
            for team, freq in zip(df['team'], df['medal_abs_frequence'])]
        return df

    # P: adds performance score method column
    def team_performance_score(df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds team medals` performance score. It`s calculated as:

            performance_score = medal_relative_frequence * medal_weight

        where medal_weight = (acc_w_score/medal_abs_frequence), acc_w_score is accumulated score by
        medal and medal_abs_frequence is each medal count.
        """
        # 0/0 (medals that were never won) gives NaN, which counts as weight 0
        medal_w = (df['acc_w_score'] / df['medal_abs_frequence']).fillna(0)
        df['perform_score'] = df['medal_rel_frequence'] * medal_w
        return df

    # P: adds total scores to aggregated data
    def total_scores(df: pd.DataFrame) -> pd.DataFrame:
        """Adds columns with the sum of accumulative and performance scores per event and team."""
        df_aux = df[['event_game', 'team', 'acc_w_score', 'perform_score']]\
                    .groupby(['event_game', 'team'], observed=True).sum()\
                    .reset_index()
        df_aux.columns = ['event_game', 'team', 'acc_w_score_total', 'perform_score_total']

        return pd.merge(left = df, right = df_aux,
                        how = 'inner', on = ['event_game', 'team'])

    # Pipeline execution
    df_agg_data = abs_medal_count(df_agg_data)
    df_agg_data = agg_categories(df_agg_data)
    df_agg_data = add_team_rel_size(df_agg_data)
    df_agg_data = add_team_event_participation(df_agg_data)
    df_agg_data = add_medal_rel_frequence(df_agg_data)
    df_agg_data = team_performance_score(df_agg_data)
    df_agg_data = total_scores(df_agg_data)

    return df_agg_data

### Output

In [9]:
# Run the metrics pipeline on the simulated data and keep its output.
pipe_data =pipeline(df_agg_data = df_teams_agg,
                    df_disagg_data = df_teams_disagg,
                    df_first_team_data = df_first_team,
                    df_teams_l_data = df_teams,
                    b_l = bool_list)

## 3 - Metric selectors

In [10]:
# Preview the first rows of the pipeline output.
pipe_data.head()

Unnamed: 0,event_date,event_game,team,medal,acc_w_score,medal_abs_frequence,team_relative_size,team_participation_ratio,medal_rel_frequence,perform_score,acc_w_score_total,perform_score_total
0,2025-01-12,A,summer,bronze,2,2,23.08,100.0,66.67,66.67,4,133.33
1,2025-01-12,A,summer,silver,2,1,23.08,100.0,33.33,66.66,4,133.33
2,2025-01-12,B,summer,bronze,1,1,23.08,66.67,33.33,33.33,3,99.99
3,2025-01-12,B,summer,not played,0,0,23.08,66.67,0.0,0.0,3,99.99
4,2025-01-12,B,summer,silver,2,1,23.08,66.67,33.33,66.66,3,99.99


In [11]:
# Sum the accumulated and performance scores per (team, event_game).
df_aux = pipe_data[['team', 'event_game','acc_w_score', 'perform_score']]\
         .groupby(['team', 'event_game']).sum().reset_index()

df_aux.columns = ['team', 'event_game', 'acc_w_score_total', 'perform_score_total']

# Merge the totals back onto the per-medal rows.
# NOTE(review): pipe_data already has acc_w_score_total and perform_score_total columns, so the
# merged frame shows both versions with _x / _y suffixes (see the output of this cell).
pd.merge(left = pipe_data,
         right= df_aux,
         on= ['team', 'event_game'],
         how= 'inner')

Unnamed: 0,event_date,event_game,team,medal,acc_w_score,medal_abs_frequence,team_relative_size,team_participation_ratio,medal_rel_frequence,perform_score,acc_w_score_total_x,perform_score_total_x,acc_w_score_total_y,perform_score_total_y
0,2025-01-12,A,summer,bronze,2,2,23.08,100.0,66.67,66.67,4,133.33,4,133.33
1,2025-01-12,A,summer,silver,2,1,23.08,100.0,33.33,66.66,4,133.33,4,133.33
2,2025-01-12,B,summer,bronze,1,1,23.08,66.67,33.33,33.33,3,99.99,3,99.99
3,2025-01-12,B,summer,not played,0,0,23.08,66.67,0.0,0.0,3,99.99,3,99.99
4,2025-01-12,B,summer,silver,2,1,23.08,66.67,33.33,66.66,3,99.99,3,99.99
5,2025-01-12,C,summer,bronze,1,1,23.08,66.67,33.33,33.33,3,99.99,3,99.99
6,2025-01-12,C,summer,not played,0,0,23.08,66.67,0.0,0.0,3,99.99,3,99.99
7,2025-01-12,C,summer,silver,2,1,23.08,66.67,33.33,66.66,3,99.99,3,99.99
8,2025-01-12,A,autumn,bronze,4,4,46.15,83.33,66.67,66.67,7,116.68,7,116.68
9,2025-01-12,A,autumn,gold,3,1,46.15,83.33,16.67,50.01,7,116.68,7,116.68


In [12]:
# High and low scores with the accumulative method.
# acc_w_score_x is the score summed per (team, event_game); acc_w_score_y is the per-medal
# score coming from pipe_data. Rows are shown from the highest totals down.
# sum() takes no column name: the only non-key column selected here is acc_w_score.
pd.merge(left = pipe_data[['team', 'event_game','acc_w_score']]
                .groupby(['team', 'event_game']).sum().reset_index(),
         right= pipe_data,
         on= ['team', 'event_game'],
         how= 'inner')\
    .sort_values(['acc_w_score_x', 'acc_w_score_y','team_participation_ratio'], ascending=False)


Unnamed: 0,team,event_game,acc_w_score_x,event_date,medal,acc_w_score_y,medal_abs_frequence,team_relative_size,team_participation_ratio,medal_rel_frequence,perform_score,acc_w_score_total,perform_score_total
7,autumn,C,13,2025-01-12,gold,6,2,46.15,100.0,33.33,99.99,13,216.66
8,autumn,C,13,2025-01-12,silver,6,3,46.15,100.0,50.0,100.0,13,216.66
6,autumn,C,13,2025-01-12,bronze,1,1,46.15,100.0,16.67,16.67,13,216.66
17,winter,A,10,2025-01-12,gold,6,2,30.77,100.0,50.0,150.0,10,250.0
18,winter,A,10,2025-01-12,silver,4,2,30.77,100.0,50.0,100.0,10,250.0
3,autumn,B,9,2025-01-12,bronze,4,4,46.15,100.0,66.67,66.67,9,150.02
4,autumn,B,9,2025-01-12,gold,3,1,46.15,100.0,16.67,50.01,9,150.02
5,autumn,B,9,2025-01-12,silver,2,1,46.15,100.0,16.67,33.34,9,150.02
0,autumn,A,7,2025-01-12,bronze,4,4,46.15,83.33,66.67,66.67,7,116.68
22,winter,C,7,2025-01-12,gold,3,1,30.77,100.0,25.0,75.0,7,175.0


## Plotly

### Custom template

In [13]:
# Moment the template is built: an aware UTC datetime and the matching POSIX timestamp.
timestamp = datetime.datetime.now(tz = datetime.timezone.utc)
utc_timestamp = timestamp.timestamp()

# Fully transparent background, shared by every plotting surface of the template.
transparent = 'rgba(0,0,0,0)'

# Pink -> yellow -> blue ramp for diverging data.
diverging_scale = [[0, '#F29ABD'],
                   [0.1, '#F2AAB8'],
                   [0.2, '#F1BBB3'],
                   [0.3, '#F1CBAF'],
                   [0.4, '#F0DCAA'],
                   [0.5, '#F0ECA5'],
                   [0.6, '#CAD3A7'],
                   [0.7, '#A4B9AA'],
                   [0.8, '#7DA0AC'],
                   [0.9, '#5786AF'],
                   [1, '#316DB1']]

# Pink -> blue ramp, used for both the sequential and the sequentialminus scales.
sequential_scale = [[0.0, '#EEC5DD'],
                    [0.1111111111111111, '#DBBCD8'],
                    [0.2222222222222222, '#C8B3D4'],
                    [0.3333333333333333, '#B5ABD0'],
                    [0.4444444444444444, '#A2A2CB'],
                    [0.5555555555555556, '#9099C7'],
                    [0.6666666666666666, '#7D90C3'],
                    [0.7777777777777778, '#6A87BE'],
                    [0.8888888888888888, '#577FBA'],
                    [0.9999999999999999, '#4476B5'],
                    [1.0, '#316DB1']]

# Faint annotation with the build time, placed at paper coordinates (0.99, 0.01).
timestamp_note = dict(
    name = 'timestamp',
    text = f'Simulated Data · {timestamp.strftime("%Y-%B-%d, %H:%M:%S %Z %z")} Timestamp:{utc_timestamp}',
    textangle = 0,
    opacity = 0.3,
    font = dict(color = 'gray', size = 12),
    xref = 'paper',
    yref = 'paper',
    x = 0.99,
    y = 0.01,
    showarrow = False)

pio.templates['aer_bg_alpha0'] = go.layout.Template(
    layout = dict(
        annotations = [timestamp_note],
        geo = dict(bgcolor = transparent),
        polar = dict(bgcolor = transparent),
        ternary = dict(bgcolor = transparent),
        paper_bgcolor = transparent,
        plot_bgcolor = transparent,
        colorway = ['#90B7E4', '#F29ABD', '#7B84CA', '#F0ECA5', '#EE7984',
                    '#EEC5DD', '#A6738F', '#EBACA8', '#36ACDD', '#316DB1'],
        #colorway = ['#8BC1FF', '#FFA3C8', '#8B96E8', '#FDF550', ],
        colorscale = dict(
            diverging = diverging_scale,
            sequential = sequential_scale,
            sequentialminus = sequential_scale)))



# Empty figure to preview the template on top of plotly_white.
fig = go.Figure()
fig.update_layout(template = 'plotly_white+aer_bg_alpha0')
fig.show()

In [14]:
# Use the custom template as plotly's default template from here on.
pio.templates.default = 'aer_bg_alpha0'

### Plot tests

In [15]:
# One row per (team, event_game) with the total metrics used by the plots.
aux_data = (
    pipe_data[['team','event_game', 'acc_w_score_total', 'perform_score_total', 'team_participation_ratio']]
    .drop_duplicates(ignore_index=True)
)
aux_data

Unnamed: 0,team,event_game,acc_w_score_total,perform_score_total,team_participation_ratio
0,summer,A,4,133.33,100.0
1,summer,B,3,99.99,66.67
2,summer,C,3,99.99,66.67
3,autumn,A,7,116.68,83.33
4,autumn,B,9,150.02,100.0
5,autumn,C,13,216.66,100.0
6,winter,A,10,250.0,100.0
7,winter,B,3,75.0,75.0
8,winter,C,7,175.0,100.0


In [None]:
#subgroup_event = aux_data['event_game'].unique()
#group_team = aux_data['team'].unique() # no se esta usando en la fig
#template_color = pio.templates[pio.templates.default]['layout']['colorway']
#subplot_titles = ['Accumulated Score', 'Performance Score', 'Teams Participacion']
#
#colors = [template_color[i] for f in range(len(subgroup_event))]

In [17]:
# Rows of the first event. The event label is taken from aux_data itself, so the cell also
# runs on a fresh kernel (subgroup_event is not assigned by any active cell).
aux_data[aux_data['event_game'] == aux_data['event_game'].unique()[0]]

Unnamed: 0,team,event_game,acc_w_score_total,perform_score_total,team_participation_ratio
0,summer,A,6,200.0,100.0
3,autumn,A,11,183.34,66.67
6,winter,A,4,100.0,75.0


In [18]:
# Rows of the first team. The team label is taken from aux_data itself, so the cell also
# runs on a fresh kernel (group_team is not assigned by any active cell).
aux_data[aux_data['team'] == aux_data['team'].unique()[0]]

Unnamed: 0,team,event_game,acc_w_score_total,perform_score_total,team_participation_ratio
0,summer,A,6,200.0,100.0
1,summer,B,5,166.65,66.67
2,summer,C,5,166.65,66.67


In [16]:
#filter_data = aux_data[aux_data['event_game']==subgroup_event[0]]['acc_w_score_total']

def accent_color(data : list, main_color : str, accent : str) -> list[str]:
    """
    Function
    -
    Create a list of a same color with an accent color where the max value is.

    Parameters
    -
    - data: numbers from where the max value will be picked; a plain list/tuple, a pandas Series
      or a numpy array
    - main_color: HEX, CSS, rgb or rgba in str format
    - accent: HEX, CSS, rgb or rgba in str format, must share the same color format as main_color

    Returns
    -
    One color per value of data, in order: accent for every value equal to the max, main_color
    for the rest. Empty data returns an empty list.
    """
    values = list(data)
    if not values:
        return []

    # the max is looked up once; objects with their own max() (Series, arrays) keep using it,
    # plain sequences fall back to the builtin
    peak = data.max() if hasattr(data, 'max') else max(values)

    return [accent if value == peak else main_color for value in values]
    

#accent_color(filter_data, template_color[0], template_color[-1])

In [None]:
def bar_highlights(data : pd.DataFrame, x : str, y: list, subplot_titles : list, col_group : str,legend_group : list,t : int = 50,b : int = 30,l : int = 0,r : int = 0, theme = 'plotly_white'):
    """
    Function
    -
    Builds a one-row, three-column figure of grouped bar charts (one facet per metric) in which
    the bars holding the max value of each group are drawn with an accent color.

    Parameters
    -
    - data: dataframe without duplicated rows, ideally holding totals and/or percentages
    - x: name of the column whose values label the x axis
    - y: list with the three metric columns, plotted in facets 1, 2 and 3 respectively
    - subplot_titles: list with the facet titles
    - col_group: name of the categorical column used to split data into groups
    - legend_group: values of data[col_group] to plot, one group of traces each; also used as
      legend names
    - Margin params: t (top), b (bottom), l (left), r (right)
    - theme: name of the template combined with the current default template

    Returns
    -
    The plotly figure with the three facets.
    """

    palette = pio.templates[pio.templates.default]['layout']['colorway']
    # the last color of the colorway is reserved for the highlighted bars
    highlight = palette[-1]

    # build subplot figure
    figure = make_subplots(rows = 1, cols = 3,
                           subplot_titles = subplot_titles,
                           shared_yaxes=False)

    # bar plots: three traces (one per facet) for every group
    for position, group in enumerate(legend_group):
        subset = data[data[col_group] == group]
        hover_header = "<extra>" + group + " - %{x}</extra>"

        # (metric column, bar labels, hover text) for facets 1 to 3
        facets = (
            (y[0], subset[y[0]], "Absolute Score sum: %{y} points"),
            (y[1], [float(f"{n:.2f}") for n in subset[y[1]]], "Performance Score sum: %{y} points"),
            (y[2], [f"{n}%" for n in subset[y[2]]], "Participation: %{y}%"),
        )

        for facet_col, (metric, labels, hover_body) in enumerate(facets, start = 1):
            bar_options = dict(
                name            = group,
                x               = subset[x],
                y               = subset[metric],
                marker_color    = accent_color(subset[metric], palette[position], highlight),
                text            = labels,
                legendgroup     = group,
                hovertemplate   = hover_header + hover_body)
            # only the first facet contributes a legend entry for the group
            if facet_col > 1:
                bar_options['showlegend'] = False

            figure.add_trace(go.Bar(**bar_options), row = 1, col = facet_col)

    # layout and axes
    figure.update_yaxes(showticklabels = False, showgrid=True)
    figure.update_xaxes(showticklabels = False, showgrid=False)

    figure.update_layout(
        barmode='group',
        hovermode = 'x',
        legend_orientation = 'h',
        template = f'{theme}+{pio.templates.default}',
        margin = dict(t=t, b=b, l=l, r=r),
        height = 300)

    return figure

In [20]:
# One row per (team, event_game) with the three metrics to plot.
metric_columns = ['acc_w_score_total', 'perform_score_total', 'team_participation_ratio']
aux_data = pipe_data[['team','event_game'] + metric_columns].drop_duplicates(ignore_index=True)

# Teams on the x axis, one group of bars (and one legend entry) per event.
bar_highlights(data = aux_data,
               x = 'team',
               y = metric_columns,
               subplot_titles = ['Accumulated Score', 'Performance Score', 'Teams Participacion'],
               col_group = 'event_game',
               legend_group = aux_data['event_game'].unique())

### Working plots (optional to execute)

In [16]:
# OK (to the app)
def polar_data_path(data: pd.DataFrame, lv_base:str, lv_mid:str, lv_out:str, empty_leaf: str=None) -> tuple[pd.DataFrame, list[int]]:

    """
    *Function*
    -
    Data processing for polar charts, icicle and treemaps. Returns formatted data according to the given
    path columns, ideal to create subplot traces with plotly graph_objects and plotly make_subplots.

    *Parameters*
    -
    - data: DataFrame that has no None or null values, must be raw or disaggregated data
    - lv_base: column of strings with base (root) labels
    - lv_mid: column of strings with middle ring (branch) labels
    - lv_out: column of strings with outer ring (leaf) labels
    - empty_leaf: if some values don't reach the last detail level (or outmost ring), set the str value
        that is not meant to be shown, leaving an empty space in the chart. No ids, parents, labels or
        values are emitted for those leaf rows, but their counts still add up in the branch and root
        totals. Only a single value is supported (None by default, meaning every leaf is emitted).

    *Returns*
    -
    A tuple `(frame, lengths)`: `frame` is a pd.DataFrame with ids, parents, labels and values columns
    (leaf rows first, then branch rows, then root rows) and `lengths` is a list of integers with the
    number of rows of each level, in that same leaf -> branch -> root order.
    """

    path = [lv_base, lv_mid, lv_out]

    # Row counts per (base, mid, out) combination: the values are occurrences, so they do not
    # depend on any score column present in the data.
    df_gb = data.value_counts(subset=path).reset_index()\
        .sort_values(path, ascending=True, ignore_index=True)
    # the counts column name differs across pandas versions, so set it explicitly
    df_gb.columns = path + ['count']

    # Leaf / Outer Level----------------------------------------------------
    # Full ring when empty_leaf is None, fragmented ring (placeholder rows dropped) otherwise.
    leaf_df = df_gb if empty_leaf is None else df_gb[df_gb[lv_out] != empty_leaf]

    aux_id, aux_parent, aux_label = [], [], []
    aux_value = leaf_df['count'].to_list()

    for lv0, lv1, lv2 in zip(leaf_df[lv_base], leaf_df[lv_mid], leaf_df[lv_out]):
        aux_id.append(f"{lv0}/{lv1}/{lv2}")
        aux_parent.append(f"{lv0}/{lv1}")
        aux_label.append(lv2)

    len_lv2 = len(aux_id)
    #-----------------------------------------------------------------------

    # Branch / Middle Level-------------------------------------------------
    # Totals are taken from df_gb (not leaf_df) so that hidden leaves still count in their parent.
    branch_df = df_gb.groupby([lv_base, lv_mid])['count'].sum().reset_index()
    # branch rows are ordered by middle label first, then base label
    branch_df = branch_df.sort_values(by=[lv_mid, lv_base], ascending=True, ignore_index=True)

    aux_value.extend(branch_df['count'].to_list())

    for lv0, lv1 in zip(branch_df[lv_base], branch_df[lv_mid]):
        aux_id.append(f"{lv0}/{lv1}")
        aux_parent.append(f"{lv0}")
        aux_label.append(lv1)

    len_lv1 = len(aux_id)
    #-----------------------------------------------------------------------

    # Root / Base Level-----------------------------------------------------
    root_df = branch_df.groupby([lv_base])['count'].sum().reset_index()
    root_df = root_df.sort_values(by=[lv_base], ascending=True, ignore_index=True)

    aux_value.extend(root_df['count'].to_list())

    for lv0 in root_df[lv_base].tolist():
        aux_id.append(lv0)
        aux_parent.append("")
        aux_label.append(lv0)

    len_lv0 = len(aux_id)
    #-----------------------------------------------------------------------

    # len_lv1 and len_lv0 are cumulative, so subtract to get the per-level row counts
    return pd.DataFrame({'ids':aux_id, 'parents':aux_parent, 'labels':aux_label, 'values':aux_value}), [len_lv2, len_lv1-len_lv2, len_lv0-len_lv1 ]

In [17]:
# Build the sunburst hierarchy event_game -> team -> medal from the disaggregated data;
# 'not played' rows get no leaf entry (empty_leaf), leaving a gap in the outer ring.
sb_data, lengths = polar_data_path(df_teams_disagg,
                                   lv_base = 'event_game',
                                   lv_mid = 'team',
                                   lv_out = 'medal', empty_leaf='not played')

# quick inspection: output columns and rows per level (leaf, branch, root)
sb_data.columns, lengths

(Index(['ids', 'parents', 'labels', 'values'], dtype='object'), [19, 9, 3])

In [18]:
# OK
def customdata_levels(data : pd.DataFrame, col_orders : list[str], mode : str = 'simple',
                      categorical_cols : str | list[str] = None):
    """
    *Function*
    -
    Adapts input data for hovertemplate customdata and formating in plotly tree-type figures

    *Parameters*
    -
    - data: dataframe with additional data not present in figure main data
    - col_orders: list of all columns, sorted as needed. If mode is set to 'group', the last
        column will be used to create a sum column
    - mode: can be 'simple' or 'group' (default: 'simple'). In simple mode, a sort_values will
        be applied, in 'group' mode, a groupby will be applied instead, with a resulting sum column
    - categorical_cols: columns to rectify it's dtype from categorical to object. Recommended
        when the category orders affects and creates discrepancies.
    """
    if categorical_cols != None:
        data.reset_index(inplace=True)
        aux_data = data[categorical_cols].to_numpy()
        data[categorical_cols] = pd.DataFrame(aux_data, dtype='object')

    match mode:
        case 'simple':
            c_data = data.sort_values(by= col_orders, ignore_index=True)
        case 'group':
            c_data = data[col_orders].groupby(col_orders[:-1]).sum(col_orders[-1]).reset_index()

    return c_data

In [19]:
# Quick check of customdata_levels in 'simple' mode on the rows whose medal is not 'not played'.
test_order = customdata_levels(data = pipe_data[pipe_data['medal']!='not played'][['team', 'medal','medal_abs_frequence']],
                  col_orders= ['team', 'medal'], mode='simple', categorical_cols='medal')

In [20]:
# OK (to the app)
def polar_customdata(data : list[pd.DataFrame], customdata_l : list[str],
                     customdata_b : list[str], customdata_r : list[str],
                     n_rows : list[int], col_orders : list[list]) -> list[str]:
    """
    *Function*
    -
    Builds the hover text for every node of a 3-level (root, branch, leaf) plotly tree or
    polar figure. One HTML string is produced per node, ordered leaf -> branch -> root.
    The text layout is specific to this notebook's medal/team/event data and can be adapted
    for other tree-like structures.

    *Parameters*
    -
    - data: list with 3 data origins (leaf, branch, root) related to the data path of the figure
    - customdata_l: 6 leaf columns, in display order (label, count, relative count,
        accumulative score, performance score, parent label shown in the extra box)
    - customdata_b: 6 branch columns, in display order (label, active players, participation,
        accumulative score, performance score, parent label shown in the extra box)
    - customdata_r: 2 root columns, in display order (label, player count)
    - n_rows: number of rows of each level in the data path, from leaf to root
    - col_orders: list of columns used to sort/group each data level, must follow the same
        order as the data path levels

    *Returns*
    -
    The list of hover strings. If a prepared level does not have the number of rows announced
    in n_rows, a plain message string describing the first mismatch is returned instead.
    """

    # One prepared frame per level: leaves are only sorted, branch and root are grouped.
    leaf_frame = customdata_levels(data = data[0], col_orders = col_orders[0],
                                   categorical_cols='medal')
    branch_frame = customdata_levels(data = data[1], col_orders = col_orders[1], mode = 'group')
    root_frame = customdata_levels(data = data[2], col_orders = col_orders[2], mode = 'group')

    # Row counts must match the figure data, otherwise hover texts would be misaligned.
    if len(leaf_frame) != n_rows[0]:
        return "Leaf data has not the same length as leaf data info"
    if len(branch_frame) != n_rows[1]:
        return "Branch data has not the same length as branch data info"
    if len(root_frame) != n_rows[2]:
        return "Root data has not the same length as root data info"

    def _cell(frame, column, row):
        # value of `column` at positional label `row` (frames carry a 0..n-1 index)
        return frame[column].at[row]

    # leaf hover text-----------------------------------------------------------------
    hover_leaf = []
    for i in range(n_rows[0]):
        hover_leaf.append(
            f"<b>{_cell(leaf_frame, customdata_l[0], i).capitalize()} medal</b><br>"
            f"<i>Medal count</i>: {_cell(leaf_frame, customdata_l[1], i)}<br>"
            f"<i>Medal relative count</i>: {_cell(leaf_frame, customdata_l[2], i)}%<br>"
            "<br><b>Medal Score Methods</b><br>"
            f"<i>Accumulative</i>: {_cell(leaf_frame, customdata_l[3], i)}<br>"
            f"<i>Performance</i>: {_cell(leaf_frame, customdata_l[4], i):.2f}<br>"
            f"<extra><b>Team<br>{_cell(leaf_frame, customdata_l[5], i)}<br>medals</b></extra>"
        )

    # branch hover text---------------------------------------------------------------
    hover_branch = []
    for i in range(n_rows[1]):
        hover_branch.append(
            f"<b>{_cell(branch_frame, customdata_b[0], i).capitalize()}</b><br>"
            f"<i>Active team players</i>: {_cell(branch_frame, customdata_b[1], i)}<br>"
            f"<i>Team participation</i>: {_cell(branch_frame, customdata_b[2], i)}%<br>"
            "<br><b>Team Score Methods</b><br>"
            f"<i>Accumulative</i>: {_cell(branch_frame, customdata_b[3], i)}<br>"
            f"<i>Performance</i>: {_cell(branch_frame, customdata_b[4], i):.2f}<br>"
            f"<extra><b>Event<br>{_cell(branch_frame, customdata_b[5], i)}</b></extra>"
        )

    # root hover text-----------------------------------------------------------------
    hover_root = []
    for i in range(n_rows[2]):
        hover_root.append(
            f"<b>Event {_cell(root_frame, customdata_r[0], i).capitalize()}</b><br>"
            f"<i>Player count</i>: {_cell(root_frame, customdata_r[1], i)}<br>"
        )

    # same node order as the figure data: leaves, then branches, then roots
    return hover_leaf + hover_branch + hover_root

In [21]:
# Hover texts for the sunburst: leaf and branch levels use only the rows whose medal is not
# 'not played', the root level uses all rows; n_rows comes from polar_data_path (`lengths`).
customdata_l=polar_customdata(
    data = [pipe_data[pipe_data['medal']!='not played'],
            pipe_data[pipe_data['medal']!='not played'],
            pipe_data],
    customdata_l = ['medal', 'medal_abs_frequence', 'medal_rel_frequence',
                    'acc_w_score', 'perform_score', 'team'],
    customdata_b = ['team', 'medal_abs_frequence', 'team_participation_ratio',
                    'acc_w_score_total', 'perform_score_total', 'event_game'],
    customdata_r = ['event_game','medal_abs_frequence'],
    n_rows = lengths,
    col_orders = [['event_game', 'team', 'medal'],
                  ['team', 'event_game', 'team_participation_ratio','acc_w_score_total',
                   'perform_score_total', 'medal_abs_frequence'],
                  ['event_game','medal_abs_frequence']])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# NOTE: there are problems with the outer ring of the sunburst
# Sunburst built from the ids/labels/parents/values produced by polar_data_path, with the
# per-node hover strings passed through customdata.
go.Figure(go.Sunburst(
            ids = sb_data['ids'],
            labels = sb_data['labels'],
            parents = sb_data['parents'],
            values = sb_data['values'],
            branchvalues = 'total',
            customdata = customdata_l,
            hovertemplate = "%{customdata}<extra></extra>"
        )).update_layout(
            width = 500, height = 500,
            margin = dict(t=0, b=0, l=0, r=0))

In [23]:
def players_score_figure(df_data: pd.DataFrame,
                         sorted: bool=True, ascending: bool =True,
                         theme: str = 'plotly_white' ,hole: float=0.50,
                         h: int =400, w: int =900) -> go.Figure:

    """
    *Function*
    -
    Creates a figure with one barpolar subplot per event, showing every player's score
    coloured by team.

    *Parameters*
    -
    - df_data: dataframe with disaggregated rows; the columns 'team', 'event_game', 'player_id',
        'score', 'event_date' and 'medal' are read
    - sorted: (default: True) sort rows by team and score before plotting
    - ascending: (default: True) sort direction, only takes effect if sorted is set to True
    - theme: (default: 'plotly_white') plotly template to mix with the default (custom) template
    - hole: (default: 0.5) size of the empty center, from 0 to 1
    - h: (default: 400) figure height
    - w: (default: 900) figure width

    *Returns*
    -
    A plotly go.Figure with 1 row and as many polar columns as events.
    """

    # work on a copy so the caller's frame keeps its row order
    df = df_data.copy()
    if sorted:
        df = df.sort_values(['team', 'score'], ascending=ascending)

    # other Figure inputs
    team_names = list(df['team'].unique())
    events = list(df['event_game'].unique())
    #----- colors from the default template, cycled if there are more teams than colors
    template_color = pio.templates[pio.templates.default]['layout']['colorway']

    # Create Figure
    fig_polar = make_subplots(
        rows = 1, cols = len(events),
        column_titles = [f"Event: {e}" for e in events],
        specs = [[{'type':'polar'} for _ in events]]
    )

    #----- traces: bar polar by event and team
    for e, event in enumerate(events):
        in_event = df['event_game'] == event
        for t, team in enumerate(team_names):
            # one combined mask instead of chained df[...][...] filters, which force pandas
            # to reindex the second mask and emit a UserWarning
            team_rows = df.loc[in_event & (df['team'] == team)]

            fig_polar.add_trace(go.Barpolar(
                name = "Team "+ team,
                r = list(team_rows['score']),
                theta = list(team_rows['player_id']),
                marker_color = template_color[t % len(template_color)],
                legendgroup = team,
                # unified legend: only the first subplot contributes legend entries
                showlegend = (e == 0),
                customdata = team_rows[['event_date', 'medal']],
                hovertemplate = "<extra><b style='color:black;'>Team "+ team +"</b></extra>"+
                                "<b>Player %{theta}</b><br>"+
                                "<br><i>Date</i>: %{customdata[0]}<br>"+
                                "<i>Medal</i>: %{customdata[1]}<br>"+
                                "<i>Score</i>: %{r} points"
                ),row=1, col=e+1)

    #----- bar polar config
    fig_polar.update_polars(
        patch = dict(hole = hole,
                     radialaxis = dict(showticklabels=False,
                                       visible = False),
                     angularaxis= dict(showticklabels=False,
                                       visible = False,
                                       categoryorder = 'array',
                                       categoryarray = team_names)))

    #----- figure layout
    # "A, B and C" for several events, just "A" when there is a single one
    if len(events) > 1:
        events_text = f"{', '.join(events[:-1])} and {events[-1]}"
    else:
        events_text = f"{events[0]}"

    fig_polar.update_layout(
        legend = dict(font_size = 10,
                      orientation = 'h',
                      yanchor = 'bottom'
                     ),
        hoverlabel = dict(bordercolor = 'white',
                          font_size = 8,
                          font_color = 'black',
                         ),
        template = f'{theme}+{pio.templates.default}',
        height = h, width = w,
        title = f"Players participation during {events_text} events"
    )

    return fig_polar

In [24]:
# Render the per-event barpolar figure for the disaggregated team data.
barpolar_ph = players_score_figure(df_teams_disagg,
                 sorted=True, ascending=True, hole=0.50, h=400, w=800)
barpolar_ph.show()

In [25]:
print(df_teams_disagg['event_game'].unique(), df_teams_disagg['team'].unique())
print(pipe_data['event_game'].unique(), pipe_data['team'].unique())

['A' 'B' 'C'] ['summer' 'autumn' 'winter']
['A' 'B' 'C'] ['summer' 'autumn' 'winter']


In [26]:
test = {team:n for team,n in zip(df_teams_disagg['event_game'].unique(), ['lele','foo','baa'])}

In [27]:
# Scratch parameters for the vertical barpolar prototype in the next cell
# (they mirror the arguments of players_score_figure, plus color_order/title/customdata).
df_data = df_teams_disagg
ascending = False
# NOTE(review): this rebinds the built-in `sorted` at notebook scope for every later cell.
sorted = True
color_order = tuple(df_teams_disagg['team'].unique())
title = None
theme = 'plotly_white'
hole = 0.25
h = 1200
w = 400
customdata = ['event_date', 'medal']

In [28]:
# Prototype: one barpolar subplot per event, stacked vertically, driven by the scratch
# parameters defined in the previous cell.
df = df_data.copy()
# sort values before building the figure (only when `sorted` is truthy)
if sorted:
    df = df.sort_values(['team', 'score'], ascending=ascending)

# other Figure inputs
#----- team list
team_names = list(df['team'].unique())
#----- event list
events = list(df['event_game'].unique())

#----- color theme from default template: one color per entry of color_order
if color_order is None:
    color_order = team_names

template_color = pio.templates[pio.templates.default]['layout']['colorway']
color_theme = tuple(template_color[:len(color_order)])
group_color = dict(zip(color_order, color_theme))

group_order = tuple(group_color.keys())
# ----------------------------------------------------------------------------

# Create Figure
fig_polar = make_subplots(
    rows = len(events), cols = 1,
    vertical_spacing = 0,
    specs = [[{'type':'polar'}] for _ in events]
)
#----- config unified legend: only the first subplot contributes legend entries
sp_legendgroup = [True] + [False] * len(events[1:])

#----- traces: bar polar by event and team iter
for e in range(len(events)):
    for t in range(len(group_order)):
        # one combined mask instead of chained df[...][...] filters, which force pandas
        # to reindex the second mask and emit a UserWarning
        team_rows = df.loc[(df['event_game'] == events[e]) & (df['team'] == group_order[t])]

        # barpolar -------------------------------------------------------------------
        fig_polar.add_trace(go.Barpolar(
            name = "Team "+ group_order[t],
            r = list(team_rows['score']),
            theta = list(team_rows['player_id']),
            marker_color = group_color[group_order[t]],
            marker_line_color = group_color[group_order[t]],
            legendgroup = group_order[t],
            showlegend = sp_legendgroup[e],
            customdata = team_rows[customdata],
            hovertemplate = "<extra></extra>"+
                            "<b>Team "+group_order[t]+"</b>"+
                            "<br>Event "+events[e]+"<br><br>"+
                            "<b>Player %{theta}</b><br>"+
                            "<br><i>Date</i>: %{customdata[0]}<br>"+
                            "<i>Medal</i>: %{customdata[1]}<br>"+
                            "<i>Score</i>: %{r} points"
        # ----------------------------------------------------------------------------
            ),row=e+1, col=1)

#----- bar polar config
fig_polar.update_polars(
    patch = dict(hole = hole,
                 radialaxis = dict(showticklabels=False,
                                   visible = False),
                 angularaxis= dict(showticklabels=False,
                                   visible = False,
                                   categoryorder = 'array',
                                   categoryarray = color_order)),
    bargap = 0)

#----- figure layout
# without a title, reserve top margin for the horizontal legend instead
if title is not None:
    fig_polar.update_layout(title=title)
else:
    fig_polar.update_layout(margin_t = 90)

fig_polar.update_layout(
    legend = dict(font_size = 14,
                  orientation = 'h',
                  y = 1.055,
                  x = 0,
                  ),
    hoverlabel = dict(bordercolor = 'white',
                      font_size = 12,
                      font_color = 'black'),
    template = f'{theme}+{pio.templates.default}',
    margin = dict(b = 0, l=0, r=0),
    height = h, width = w,
    title = title)

fig_polar.show()

In [29]:
pipe_data.columns

Index(['event_date', 'event_game', 'team', 'medal', 'acc_w_score',
       'medal_abs_frequence', 'team_relative_size', 'team_participation_ratio',
       'medal_rel_frequence', 'perform_score', 'acc_w_score_total',
       'perform_score_total'],
      dtype='object')

In [30]:
pipe_data[pipe_data['event_game']=='A'].head(3)

Unnamed: 0,event_date,event_game,team,medal,acc_w_score,medal_abs_frequence,team_relative_size,team_participation_ratio,medal_rel_frequence,perform_score,acc_w_score_total,perform_score_total
0,2024-12-13,A,summer,gold,3,1,23.08,66.67,33.33,99.99,5,166.65
1,2024-12-13,A,summer,not played,0,0,23.08,66.67,0.0,0.0,5,166.65
2,2024-12-13,A,summer,silver,2,1,23.08,66.67,33.33,66.66,5,166.65


In [31]:
pipe_data[pipe_data['event_game']=='A'][pipe_data['team']=='summer']['acc_w_score_total'][0]

5

In [32]:
#params
# Scratch values mirroring the arguments of cust_bar_hline (defined in the next cell).
# Only rows whose medal is not 'not played' are kept.
df_data = pipe_data[pipe_data['medal']!='not played']
x_data = 'medal'
y_data = 'acc_w_score'
facet_data_col = 'team'
selector = 'event_game'
y_title = 'Scores by Accumulative Method'
hline_values = 'acc_w_score_total'
hline_annot_iter = df_data[facet_data_col].unique()
hline_annot = ''
category_order = ['not played', 'bronze', 'silver', 'gold']
barcornerradius = "0%"
title = None
theme = 'plotly_white'
w = 900
h = 400
customdata_cols = ['medal_rel_frequence','medal','team_participation_ratio', 'acc_w_score']
hovertemplate = '''<br><i>Proportion %{customdata[0]} medals</i>: %{customdata[1]}
                <br>Team event participation %{customdata[2]}%
                <br><i>Medal score</i>: %{customdata[3]} points<extra></extra>'''

In [33]:
# function
def cust_bar_hline(df_data: pd.DataFrame,
                   x_data: str, y_data: str, facet_data_col: str, selector: str,
                   hline_values: str, hline_annot_iter: list, hline_annot: str = '',
                   y_title: str = None, customdata_cols: list=None, hovertemplate: str = None,
                   category_order: list = None,
                   title: str=None, barcornerradius: str = '0%', theme: str = 'plotly_white',
                   w: int = 900, h: int = 400) -> go.Figure:
    """
    *Function*
    -
    Creates a figure with one bar subplot per facet value plus horizontal reference lines.
    Recommended for data that has aggregated numerical values.

    *Parameters*
    -
    - df_data: pd.DataFrame, main data to plot
    - x_data: str, column with category values on x axis for barplots
    - y_data: str, column with numerical values on y axis for barplots
    - facet_data_col: str, column with categorical data; one subplot column per unique value
    - selector: str, column used to filter the rows. Only the rows matching the FIRST unique
        value found in this column are plotted
    - hline_values: str, column with values for horizontal line position (the first unique
        value within each facet is used)
    - hline_annot_iter: list|np.array, one annotation text per facet, in facet order
    - hline_annot: str = '', additional text appended to every annotation
    - y_title: str = None, single y axis title for all subplots
    - customdata_cols: list=None, list of columns made available to hovertemplate as customdata;
        when None no customdata is attached
    - hovertemplate: str = None, text displayed on hover info and label, using html text format
    - category_order: list|np.array = None, order of the x axis categories
    - title: str=None, figure title
    - barcornerradius: str = '0%', set bar round corners
    - theme: str = 'plotly_white', set plotly template to mix with default template
    - w: int = 900, set figure width
    - h: int = 400, set figure height

    *Returns*
    -
    A plotly go.Figure.
    """

    # Iterative variables
    subplot_cols = list(df_data[facet_data_col].unique())
    selector_filter = df_data[selector].unique()[0]

    template_color = pio.templates[pio.templates.default]['layout']['colorway']

    # Filter every facet once with a single combined mask; chained df[...][...] filters make
    # pandas reindex the second mask (UserWarning) and were recomputed for every argument.
    selected = df_data[selector] == selector_filter
    facets = [df_data.loc[selected & (df_data[facet_data_col] == name)] for name in subplot_cols]
    # cycle the colorway if there are more facets than colors
    colors = [template_color[i % len(template_color)] for i in range(len(subplot_cols))]

    # Subplot figure
    bar_h_fig = make_subplots(cols = len(subplot_cols),
                            shared_xaxes = True,
                            shared_yaxes=True,
                            y_title = y_title,
                            column_titles= subplot_cols,
                            print_grid=False,
                            specs = [[{"secondary_y" : True} for _ in subplot_cols]])

    # bar
    for i, facet in enumerate(facets):
        bar_h_fig.add_trace(
            go.Bar(
                x = facet[x_data],
                y = facet[y_data].values,
                name = subplot_cols[i],
                marker_color = colors[i],
                legendgroup = subplot_cols[i]+' bar',
                customdata = facet[customdata_cols] if customdata_cols is not None else None,
                hovertemplate = hovertemplate
            ),row = 1, col = i+1, secondary_y = False)

    # hline
    # NOTE(review): no row/col is passed, so plotly's defaults decide which subplots receive
    # each line; confirm that drawing every facet's line on all subplots is intended.
    for i, facet in enumerate(facets):
        bar_h_fig.add_hline(y = facet[hline_values].unique()[0],
                        line_color = colors[i],
                        line_width = 1,
                        annotation_text = f"{hline_annot_iter[i]}{hline_annot}",
                        annotation_position = 'top left',
                        exclude_empty_subplots=False, secondary_y=True)

    #----- axes styling and category order
    bar_h_fig.update_xaxes(
        categoryorder = 'array',
        categoryarray = category_order,
        showticklabels= False,
        showspikes = False,
        showgrid = False)
    bar_h_fig.update_yaxes(showgrid = False, showticklabels = False)
    bar_h_fig.update_yaxes(showticklabels = True, secondary_y = True)

    bar_h_fig.update_layout(
            title = title,
            barmode = 'group',
            legend = dict(font_size = 12,
                        tracegroupgap = 0, y = -.2,
                        yanchor = 'bottom', xanchor = 'left',
                        orientation = 'h'),
            barcornerradius = barcornerradius,
            template = f'{theme}+{pio.templates.default}',
            width = w, height = h)

    return bar_h_fig

In [34]:
# Filter once so the plotted rows and the hline annotation labels come from the same frame,
# instead of reading the scratch globals `df_data` / `facet_data_col` from another cell.
medal_rows = pipe_data[pipe_data['medal']!='not played']

cust_bar_hline(df_data = medal_rows,
x_data = 'medal',
y_data = 'acc_w_score',
facet_data_col = 'team',
selector = 'event_game',
y_title = 'Scores by Accumulative Method',
hline_values = 'acc_w_score_total',
hline_annot_iter = medal_rows['team'].unique(),
hline_annot = '',
category_order = ['not played', 'bronze', 'silver', 'gold'],
barcornerradius = "0%",
title = None,
theme = 'plotly_white',
w = 900,
h = 400,
customdata_cols = ['medal_rel_frequence','medal','team_participation_ratio', 'acc_w_score'],
hovertemplate = '''<br><i>Proportion %{customdata[0]} medals</i>: %{customdata[1]}
                <br>Team event participation %{customdata[2]}%
                <br><i>Medal score</i>: %{customdata[3]} points<extra></extra>''')

In [35]:
# NOTE(review): `subplot_cols` is only assigned inside cust_bar_hline, so at notebook scope
# this line raises NameError unless the name has been defined manually beforehand.
print([[{"secondary_y" : True} for t in range(len(subplot_cols))]])

NameError: name 'subplot_cols' is not defined

## Clustering model

In [16]:
df_teams_disagg.head()

Unnamed: 0,player_id,event_date,event_game,score,medal,team
0,96798,2024-12-15,A,2,silver,summer
1,96798,2024-12-15,B,3,gold,summer
2,96798,2024-12-15,C,3,gold,summer
3,75054,2024-12-15,A,1,bronze,summer
4,75054,2024-12-15,B,1,bronze,summer


In [17]:
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

# preprocessing: one row per (date, player, team) with total score and participation ratio
df_clust_data = df_teams_disagg[['event_date', 'player_id', 'team', 'score', 'medal']].copy()

# 1 when the player took part in the event, 0 otherwise. map() yields a numeric column
# directly, without relying on replace() downcasting an object column to integers.
participation_by_medal = {'gold': 1, 'silver': 1, 'bronze': 1, 'not played': 0}
df_clust_data['player_participation'] = df_clust_data['medal'].astype(object).map(participation_by_medal)

# aggregate only the two numeric features, named explicitly
df_clust_data = (
    df_clust_data
    .groupby(['event_date', 'player_id', 'team'])[['score', 'player_participation']]
    .sum()
    .reset_index()
)

# share of events played: events attended / number of distinct events
n_events = len(df_teams_disagg['event_game'].unique())
df_clust_data['player_participation'] = df_clust_data['player_participation'] / n_events

# get_dummies (no suffix added, it only takes each team name as a feature to add to X);
# dtype=int keeps the indicators numeric on every pandas version
df_clust_data = pd.concat([df_clust_data, pd.get_dummies(df_clust_data['team'], dtype=int)], axis=1)

X = df_clust_data[['score', 'player_participation'] + list(df_clust_data['team'].unique())].to_numpy()

df_clust_data.head()

Unnamed: 0,event_date,player_id,team,score,player_participation,autumn,summer,winter
0,2024-12-15,29477,summer,3,0.666667,0,1,0
1,2024-12-15,30408,winter,4,0.666667,0,0,1
2,2024-12-15,31022,winter,4,1.0,0,0,1
3,2024-12-15,33028,autumn,8,1.0,1,0,0
4,2024-12-15,34224,autumn,6,1.0,1,0,0


In [18]:
def kmeans_silhouette_score_eval(X : np.array, random_state : int | None = None) -> tuple:
    """
    Function
    -
    Fits KMeans for every number of clusters from 2 to len(X) - 1 and keeps the fit with the
    highest average silhouette score (on ties, the smallest number of clusters wins).
    Labels, per-sample silhouette values and the reported score all come from that same fit.

    Parameters
    -
    - X: numpy array with data to cluster with KMeans
    - random_state: (default None) seed forwarded to KMeans; set it to get reproducible results

    Returns
    -
    A tuple `(labels, sil_samples, clusters, avg_sil_score)`:
    - labels: cluster label of every sample
    - sil_samples: silhouette value of every sample
    - clusters: selected number of clusters
    - avg_sil_score: average silhouette score of the selected fit

    Raises
    -
    ValueError if X has fewer than 3 samples (no valid number of clusters can be evaluated).
    """

    n_samples = len(X)
    if n_samples < 3:
        raise ValueError("At least 3 samples are needed to evaluate silhouette scores")

    # evaluate silhouette best score and n_clusters, remembering the winning labels so that
    # no second (possibly different) KMeans fit is needed afterwards
    best_labels, best_clusters, best_score = None, None, -np.inf
    for n_clusters in range(2, n_samples):
        kmeans = KMeans(n_clusters = n_clusters, random_state = random_state)
        labels = kmeans.fit_predict(X)
        sil_score = silhouette_score(X, labels)

        if sil_score > best_score:
            best_labels, best_clusters, best_score = labels, n_clusters, sil_score

    sil_samples = silhouette_samples(X, best_labels)

    return best_labels, sil_samples, best_clusters, best_score

In [19]:
labels, samples, clusters, score = kmeans_silhouette_score_eval(X) 

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7aab87f58ea0>
Traceback (most recent call last):
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribut

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7aab87f58ea0>
Traceback (most recent call last):
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/bosse/anaconda3/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribut

In [20]:
# show
labels,len(labels), samples, len(samples), clusters, score

(array([5, 4, 4, 1, 3, 2, 2, 0, 5, 4, 0, 6, 4], dtype=int32),
 13,
 array([0.39141938, 0.43727832, 0.45228694, 0.        , 0.        ,
        0.50680304, 0.65841337, 1.        , 0.27452375, 0.54053967,
        1.        , 0.        , 0.54053967]),
 13,
 7,
 0.5568212059983091)

In [21]:
# adds samples and labels new values to the processed df, and label description
# (the assigned DataFrame/Series are aligned on the index, so this relies on df_clust_data
# carrying the default 0..n-1 index it got from reset_index)
df_clust_data[['samples', 'labels']] = pd.DataFrame({'samples':samples, 'labels':labels})
df_clust_data['labels_desc'] = pd.Series([f'cluster {l}'for l in df_clust_data['labels']])

# sanity check: distinct cluster descriptions, then a preview of the enriched frame
print(df_clust_data['labels_desc'].sort_values().unique())
df_clust_data.head()

['cluster 0' 'cluster 1' 'cluster 2' 'cluster 3' 'cluster 4' 'cluster 5'
 'cluster 6']


Unnamed: 0,event_date,player_id,team,score,player_participation,autumn,summer,winter,samples,labels,labels_desc
0,2024-12-15,29477,summer,3,0.666667,0,1,0,0.391419,5,cluster 5
1,2024-12-15,30408,winter,4,0.666667,0,0,1,0.437278,4,cluster 4
2,2024-12-15,31022,winter,4,1.0,0,0,1,0.452287,4,cluster 4
3,2024-12-15,33028,autumn,8,1.0,1,0,0,0.0,1,cluster 1
4,2024-12-15,34224,autumn,6,1.0,1,0,0,0.0,3,cluster 3


In [22]:
def silhouette_figure(data : pd.DataFrame, t : int = 50,b : int = 30,l : int = 0,r : int = 0, theme = 'plotly_white'):
    """
    Function
    -
    Builds a silhouette plot: one filled line trace per cluster, stacked one after another
    along the y axis, plus a vertical line with the score annotation.

    Parameters
    -
    - data: frame with a 'labels' column (cluster of each row) and a 'samples' column
        (value drawn on the x axis for each row)
    - t, b, l, r: top, bottom, left and right margins of the figure layout
    - theme: plotly template name, combined with the current default template

    Notes
    -
    Reads the notebook-level variables `score` (position and text of the vertical line) and
    `clusters` (number shown in the title); both must be defined before calling.

    Output
    -
    plotly Figure
    """
    figure = go.Figure()

    # clusters are drawn in order of first appearance in the 'labels' column
    cluster_ids = data['labels'].unique()

    offset = 0
    for cluster_id in cluster_ids:
        members = data[data['labels'] == cluster_id]
        top = offset + len(members)

        # ascending values of this cluster, placed right above the previous cluster
        figure.add_trace(go.Scatter(
            x = members.sort_values('samples')['samples'],
            y = np.arange(offset, top),
            mode = 'lines',
            line_width=.5,
            fill = 'tozerox',
            showlegend=False
        ))

        offset = top

    figure.add_vline(
        x = score,
        annotation_text = f"Avg. silhouette score: {score:.4f}",
        annotation_position = 'top left')

    figure.update_xaxes(showticklabels = True, showgrid=False)
    figure.update_yaxes(showticklabels=False, showgrid=False, showspikes=False)
    figure.update_layout(
        hovermode = 'x',
        template = f'{theme}+{pio.templates.default}',
        margin = dict(t=t, b=b, l=l, r=r),
        title = f"Silhouette analysis for KMeans clustering with n_clusters = {clusters}")

    return figure

In [23]:
def contour_data(X, depth : list, v_margin : float|int = .35, h_margin : float|int = .35, mesh_size: float= .1) -> list[list]:
    """
    Function
    -
    Configures range values to build plotly contour traces from KMeans input X for x and y
    axes and scores or resulting cluster labels for z (depth).

    Parameters
    -
    - X: training features applied in KMeans. Is recommended to have as first and second features
        the considered most important to plot
    - depth: KMeans output, generaly will be the labels output
    - margin: (default .35) reescale the plot for use as add_traces
    - mesh_size: (default .1) detail in contour gradient defined from xrange. yrange and zrange are
        defined from xrange shape.
    
    Output
    -
    Three arrays containing x, y and z values for plotly contour trace
    """

    # score fesature as x, participation feature as y 
    x_min, x_max = X[:,0].min() - h_margin, X[:,0].max() + h_margin
    y_min, y_max = X[:,1].min() - v_margin, X[:,1].max() + v_margin
    z_min, z_max = depth.min(), depth.max()

    xrange = np.arange(x_min, x_max, mesh_size)

    y_mesh_size = (y_max-y_min)/xrange.shape[0]
    yrange = np.arange(y_min, y_max, y_mesh_size)

    z_mesh_size = ((z_max-z_min)/xrange.shape[0])
    zrange = np.arange(z_min, z_max, z_mesh_size)

    return xrange, yrange, zrange


In [24]:
def score_contour_trace(name: str, opacity: float, xrange: list[float], yrange: list[float],
                        zrange: list[float]) -> go.Contour:
    """
    Function
    -
    Generates a plotly contour trace to add with add_traces or to build subplots. Intended for
    visualizing KMeans clustering results.

    Parameters
    -
    - name: trace name
    - opacity: trace opacity, between 0 and 1
    - xrange: array or list of float values, passed as the trace x
    - yrange: array or list of float values, passed as the trace y
    - zrange: array or list of float values (scores or labels range), passed as the trace z

    Output
    -
    go.Contour trace without color bar and without hover, colored with the sequential
    colorscale of the current default plotly template
    """
    sequential_scale = pio.templates[pio.templates.default]['layout']['colorscale']['sequential']

    return go.Contour(
        name=name,
        opacity=opacity,
        x=xrange,
        y=yrange,
        z=zrange,
        colorscale=sequential_scale,
        showscale=False,
        hoverinfo='skip',
    )

#*ranges, = contour_data(X, depth = labels)
#go.Figure(score_contour_trace('clust_scores', .5, xrange=ranges[0], yrange=ranges[1], zrange=ranges[2]))

In [32]:
# params
# `data` is a second name bound to the same object as df_clust_data (not a copy)
data = df_clust_data
# unused drafts of the category / sub_category arguments:
#category = data['labels_desc'].sort_values().unique()
#sub_category = df_teams_disagg['team'].unique()

def kmean_scatter(data : pd.DataFrame, category : list, sub_category : list, x : str, y : str, legendgroup : str, size : str, sizescale : float,
                  customdata : str, legend_title :str):
    """
    Function
    -
    Creates a scatter figure with one trace per (sub category, category) pair, iterating
    through kmeans results and base data.

    Parameters
    -
    - data: main data, must have a 'labels_desc' column related with category entry values
    - category: list of main categorical group unique values, must be values contained in
        the 'labels_desc' column
    - sub_category: list with unique values, must be related with legendgroup entry
    - x: column for x axis values
    - y: column for y axis values
    - legendgroup: column used to group iterations and to filter sub category data
    - size: column with float values to give size to markers
    - sizescale: marker size scalar
    - customdata: column displayed in hover data
    - legend_title: descriptive name of the overall legend

    Notes
    -
    Marker fill color and symbol follow the sub category position, marker outline color
    follows the category position. Colors come from the colorway of the current default
    plotly template; colors and symbols are reused cyclically when there are more groups
    than palette entries.

    Output
    -
    plotly Figure
    """
    color = pio.templates[pio.templates.default]['layout']['colorway']
    symbols = tuple(SymbolValidator().values[2::12])

    # identical for every trace, so it is built once
    hovertemplate = (f"<i>{customdata}: </i>" + "%{customdata}<br>" +
                     f"<b>{x}</b> "+"%{x} points<br>" +
                     f"<b>{y}</b> "+"%{y:2.2%}" +
                     "<extra></extra>")

    scatter_fig = go.Figure()

    for sc_i, sub_cat in enumerate(sub_category):
        in_sub_category = data[legendgroup] == sub_cat
        # wrap around instead of raising IndexError when groups outnumber colors / symbols
        fill_color = color[sc_i % len(color)]
        symbol = symbols[sc_i % len(symbols)]

        for c_i, cat in enumerate(category):
            # one combined mask: chaining data[mask_a][mask_b] makes pandas reindex mask_b
            # against the already filtered frame and emit a UserWarning
            data_filtered = data[(data['labels_desc'] == cat) & in_sub_category]
            scatter_fig.add_trace(go.Scatter(
                x = data_filtered[x],
                y = data_filtered[y],
                name = f"{cat} - {sub_cat}",
                legendgroup = f"{sc_i}{legendgroup}",
                mode = 'markers',
                marker_color = fill_color,
                marker_opacity = 1,
                marker_line_width = 1.5,
                marker_line_color = color[c_i % len(color)],
                marker_size = data_filtered[size]*sizescale,
                marker_sizemode = 'diameter',
                marker_symbol = symbol,
                customdata = data_filtered[customdata],
                hovertemplate = hovertemplate
            ))

    scatter_fig.update_layout(legend_title= legend_title)

    return scatter_fig

In [None]:

# how many marker symbols the slice yields (every 12th validator value, starting at index 2)
len(SymbolValidator().values[2::12])

41

In [44]:
# Experiment: keep doubling the symbol list until it holds at least as many entries as the
# larger of the two group counts, printing the length before, during and after.
symbols = list(SymbolValidator().values[2::12])
category, sub_category = 50, 41

print(len(symbols))
while len(symbols) < max(category, sub_category):
    symbols *= 2
    print(len(symbols))

print(len(symbols))

41
82
82


In [33]:
# values for contour: [xrange, yrange, zrange]
ranges = list(contour_data(X, depth = labels, mesh_size=.1))

# Scatter of the clustered frame. The categories are read from the very frame that is
# plotted (not from a separate notebook-level alias), so they cannot drift apart if that
# alias is rebound by another cell.
clust_ej = kmean_scatter(data = df_clust_data,
                         category = df_clust_data['labels_desc'].sort_values().unique(),
                         sub_category = df_teams_disagg['team'].unique(),
                         x = 'score',
                         y = 'player_participation',
                         legendgroup = 'team',
                         size = 'player_participation',
                         sizescale = 25,
                         customdata = 'player_id',
                         legend_title = "Clusters, teams")

# semi-transparent contour layer added as an extra trace on the same figure
clust_ej.add_trace(score_contour_trace('clust_scores', .25,
                                       xrange=ranges[0],
                                       yrange=ranges[1],
                                       zrange=ranges[2]))

clust_ej.update_xaxes(showgrid=False, showticklabels=False)
clust_ej.update_yaxes(showgrid=False, showticklabels=False)
clust_ej.update_layout(
    showlegend = True,
    legend_orientation = 'h')

clust_ej