In [17]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

pd.set_option('plotting.backend', 'plotly')
from datetime import date, datetime as dt
import os, re
import plotly.colors
import plotly.graph_objects as go, plotly.express as px, plotly.figure_factory as ff
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots

from statsmodels.stats.proportion import proportion_confint

from tennis_utils.player import TennisDataLoader, TennisPlayerDataLoader, TennisPlayer
from tennis_utils.scrapers import SackmanScraper


def get_player_name(name):
    """Abbreviate a full player name to initials plus last name.

    Every whitespace-separated token except the last is reduced to its first
    character; the initials are joined with '.' and followed by '. ' and the
    last token, e.g. 'Roger Federer' -> 'R. Federer' and
    'Juan Martin Del Potro' -> 'J.M.D. Potro'.

    Parameters
    ----------
    name : str
        Full name, tokens separated by whitespace.

    Returns
    -------
    str
        The abbreviated name. A single-token name is returned unchanged and
        an empty/blank name yields ''.
    """
    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so no empty token can reach the `token[0]` lookup below.
    tokens = name.split()
    if not tokens:
        return ''

    *given_names, last_name = tokens
    if not given_names:
        # Nothing to abbreviate: avoid emitting a dangling '. ' prefix.
        return last_name

    initials = '.'.join(token[0] for token in given_names)
    return f'{initials}. {last_name}'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:

# Player under analysis and the data directory (resolved against the current working directory).
player_name = 'Roger Federer'
data_path = os.getcwd()+'/data'

# Load the full matches / players tables from the two parquet files.
tdl = TennisDataLoader(data_path+'/matches.parquet', data_path+'/players.parquet')
matches_df, players_df = tdl.matches, tdl.players

# Player-specific view built from the full tables.
tpdl = TennisPlayerDataLoader(player_name, matches_df, players_df)

# Subset selected player matches data
player_matches = tpdl.player_matches #matches_df[matches_df['player_name'] == player_name]
player_details = tpdl.player_details #players_df[players_df['player_name']==player_name]
player_rank = tpdl.player_rank

# `m` is the per-match table that every cell below aggregates and plots.
# NOTE(review): the trailing `rounds=['F']` looks like an optional filter that was switched off — confirm.
tp = TennisPlayer(player_name, player_matches, player_rank, player_details)#, rounds=['F'])
m = tp.selected_matches     


In [26]:
# Bare expression: it is evaluated, but only the last expression of a cell is displayed.
# NOTE(review): presumably a leftover look at the head-to-head column names — confirm and remove if so.
tp.h2h.columns

# Raw statistic column name -> human-readable label.
col_mapping = {
    'opponent_name': 'Opponent', 
    'matches_played': 'Matches Played', 
    'matches_won': 'Matches Won', 
    'win_rate': 'Winrate',
    'perc_ace': '% Aces',
    'perc_df': '% Double Faults',
    'perc_firstIn': '% First In',
    'perc_firstWon': '% First Won', 
    'perc_secondWon': '% Second Won', 
    'perc_returnWon': '% Return Won', 
    'perc_bpConverted': '% BP Converted',
    'perc_bpSaved': '% BP Saved', 
    'perc_tbWon': '% TB Won',
    'perc_decidingSetWon': '% Deciding Sets Won'
}

# Last expression of the cell, so the mapping's keys are what gets displayed.
col_mapping.keys()

AttributeError: 'dict_keys' object has no attribute 'values'

In [3]:
"""
https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
https://www.statsmodels.org/dev/generated/statsmodels.stats.proportion.proportion_confint.html
https://www.ucl.ac.uk/english-usage/staff/sean/resources/binomialpoisson.pdf
"""

'\nhttps://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval\nhttps://www.statsmodels.org/dev/generated/statsmodels.stats.proportion.proportion_confint.html\nhttps://www.ucl.ac.uk/english-usage/staff/sean/resources/binomialpoisson.pdf\n'

In [6]:
# Yearly totals of the selected matches plus the rates derived from them.
#
# Aggregators are given by name ('size' / 'sum') rather than as np.size / np.sum:
# pandas already dispatches those NumPy callables to the same group-by
# implementations, and newer pandas releases emit a FutureWarning asking for
# the string form, so the result is unchanged while the warning goes away.
stats_by_year = (
    m.groupby('year')
    .agg(
        # Match counts: number of rows per year and number of rows flagged as won.
        matches_played=('winner', 'size'),
        matches_won=('winner', 'sum'),
        # Serve counters.
        ace=('ace', 'sum'),
        df=('df', 'sum'),
        svpt=('svpt', 'sum'),
        firstIn=('firstIn', 'sum'),
        firstWon=('firstWon', 'sum'),
        secondIn=('secondIn', 'sum'),
        secondWon=('secondWon', 'sum'),
        # Return counters.
        returnWon=('returnWon', 'sum'),
        returnPlayed=('returnPlayed', 'sum'),
        # Break points.
        bpConverted=('bpConverted', 'sum'),
        bpTotal=('bpTotal', 'sum'),
        bpSaved=('bpSaved', 'sum'),
        bpFaced=('bpFaced', 'sum'),
        # Tie-breaks and deciding sets.
        tbPlayed=('tbPlayed', 'sum'),
        tbWon=('tbWon', 'sum'),
        decidingSetPlayed=('decidingSetPlayed', 'sum'),
        decidingSetWon=('decidingSetWon', 'sum'),
    )
    # A single assign: keyword order is preserved, so the derived columns are
    # appended in exactly this order. A year whose denominator is 0 yields
    # NaN/inf for that rate rather than raising.
    .assign(
        matches_lost=lambda x: x['matches_played'] - x['matches_won'],
        win_rate=lambda x: x['matches_won'] / x['matches_played'],
        perc_ace=lambda x: x['ace'] / x['svpt'],
        perc_df=lambda x: x['df'] / x['svpt'],
        perc_firstIn=lambda x: x['firstIn'] / x['svpt'],
        perc_firstWon=lambda x: x['firstWon'] / x['firstIn'],
        perc_secondWon=lambda x: x['secondWon'] / x['secondIn'],
        perc_returnWon=lambda x: x['returnWon'] / x['returnPlayed'],
        perc_bpConverted=lambda x: x['bpConverted'] / x['bpTotal'],
        perc_bpSaved=lambda x: x['bpSaved'] / x['bpFaced'],
        tbLost=lambda x: x['tbPlayed'] - x['tbWon'],
        perc_tbWon=lambda x: x['tbWon'] / x['tbPlayed'],
        decidingSetLost=lambda x: x['decidingSetPlayed'] - x['decidingSetWon'],
        perc_decidingSetWon=lambda x: x['decidingSetWon'] / x['decidingSetPlayed'],
    )
    # Turn the 'year' group key back into a regular column.
    .reset_index()
)

KeyError: "Column 'matches_played' does not exist!"

In [15]:
# Numerator / denominator column of every rate, paired BY POSITION:
# success_cols[k] is counted out of total_cols[k] (so 'svpt' appears three times).
success_cols = ['ace', 'df', 'firstIn', 'firstWon', 'secondWon', 'returnWon', 'bpConverted', 'bpSaved', 'tbWon', 'decidingSetWon']
total_cols = ['svpt', 'svpt', 'svpt', 'firstIn', 'secondIn', 'returnPlayed', 'bpTotal', 'bpFaced', 'tbPlayed', 'decidingSetPlayed']

# Wilson score interval per (year, statistic); alpha=0.05 is stated explicitly
# because the plots label the band as a 95% CI.
#
# Plain arrays are passed instead of DataFrames: the two frames carry different
# (and, for the totals, duplicated) column labels, so the pairing must be purely
# positional. The resulting arrays are then wrapped with the intended index and
# column names here, instead of relying on what container the library returns.
lower_df, upper_df = (
    pd.DataFrame(
        bound,
        index=stats_by_year.index,
        columns=[f'{side}_{c}' for c in success_cols],
    )
    for side, bound in zip(
        ('lower', 'upper'),
        proportion_confint(
            stats_by_year[success_cols].to_numpy(),
            stats_by_year[total_cols].to_numpy(),
            alpha=0.05,
            method='wilson',
        ),
    )
)

In [16]:
# Career-wide numerators and denominators, summed column-wise over every selected match.
# Both Series are keyed by the *success* column name (success_<col> / total_<col>) so a
# statistic's numerator and denominator can be looked up with the same suffix.
# Missing success counts are treated as 0 before summing; the totals are summed as-is.
success_overall = pd.Series(m[success_cols].fillna(0).to_numpy().sum(axis=0), index=[f'success_{c}' for c in success_cols]) 
total_overall = pd.Series(m[total_cols].to_numpy().sum(axis=0), index=[f'total_{c}' for c in success_cols])

In [17]:
# Display the overall numerators and denominators together (rendered as a tuple of two Series).
success_overall, total_overall

(success_ace               11162.0
 success_df                 2622.0
 success_firstIn           68475.0
 success_firstWon          53054.0
 success_secondWon         23738.0
 success_returnWon         47099.0
 success_bpConverted        4788.0
 success_bpSaved            4150.0
 success_tbWon               427.0
 success_decidingSetWon      231.0
 dtype: float64,
 total_ace               110107
 total_df                110107
 total_firstIn           110107
 total_firstWon           68475
 total_secondWon          39010
 total_returnWon         118206
 total_bpConverted        12043
 total_bpSaved             6144
 total_tbWon                652
 total_decidingSetWon       347
 dtype: int64)

In [52]:
def plot_ts_and_pie(m,
                    success_overall, 
                    total_overall,
                    stats_by_year, 
                    upper_df, 
                    lower_df, 
                    columns, 
                    colors = ['rgb(33,113,181)', 'rgb(217,71,1)', 'rgb(81, 178, 124)', 'rgb(235, 127, 134)'],
                    ):
    """Plot each statistic's yearly rate with its confidence band next to an overall pie.

    One subplot row is created per entry of ``columns``. The left cell shows
    ``perc_<col>`` by year on top of the area between ``lower_<col>`` and
    ``upper_<col>`` (titled as a 95% CI); the right cell is a pie of
    successes versus the remainder of the total.

    Parameters
    ----------
    m
        Not used by this function; kept so existing callers keep working.
    success_overall
        Indexable by ``'success_<col>'``: overall success count per statistic.
    total_overall
        Indexable by ``'total_<col>'``: overall attempt count per statistic.
    stats_by_year
        Table with a ``'year'`` column and one ``'perc_<col>'`` column per statistic.
    upper_df, lower_df
        Tables with ``'upper_<col>'`` / ``'lower_<col>'`` columns, row-aligned
        with ``stats_by_year``.
    columns
        Statistic names to plot, one subplot row each.
    colors
        Marker colours. They are reused cyclically when there are more
        statistics than colours. The sequence is only read, never mutated.

    Returns
    -------
    The figure created by ``make_subplots`` with all traces and layout applied.

    Raises
    ------
    ValueError
        If ``columns`` or ``colors`` is empty.
    """
    n_cols, n_rows = 2, len(columns)
    if n_rows == 0:
        raise ValueError('columns must contain at least one statistic name')
    if len(colors) == 0:
        raise ValueError('colors must contain at least one colour')

    # A fresh pair of spec dicts per row, so rows never share mutable objects.
    specs = [[{}, {'type': 'pie'}] for _ in range(n_rows)]

    # Flat, row-major list of titles: (time series, pie) for every statistic.
    subplot_titles = [
        title
        for c in columns
        for title in (f'Percentage {c} and 95% CI by year', f'Percentage {c} overall')
    ]

    fig = make_subplots(
            cols=n_cols,
            rows=n_rows,
            specs=specs,
            shared_xaxes=True,
            vertical_spacing=0.05,
            horizontal_spacing=0.05,
            subplot_titles=subplot_titles,
            column_widths=[0.75, 0.25]
    )

    x = stats_by_year['year']
    band_line = dict(color='darksalmon', width=1)

    for i, col in enumerate(columns):
        row = i + 1
        # Wrap around instead of raising IndexError when statistics outnumber colours.
        color = colors[i % len(colors)]

        # Upper bound first; the lower bound is added right after it with
        # fill='tonexty', which shades the area between the two traces.
        fig.add_trace(
            go.Scatter(
                x=x, y=upper_df[f'upper_{col}'],
                name=f'{col} upper bound',
                fill=None, mode='lines',
                line=band_line
            ),
            row=row, col=1,
            secondary_y=False
        )

        fig.add_trace(
            go.Scatter(
                x=x, y=lower_df[f'lower_{col}'],
                name=f'{col} lower bound',
                fill='tonexty', mode='lines',
                line=band_line
            ),
            row=row, col=1,
            secondary_y=False
        )

        # Yearly rate drawn on top of the band.
        fig.add_trace(
            go.Scatter(
                x=x, y=stats_by_year[f'perc_{col}'],
                textposition='top center',
                name=col,
                mode='lines+markers',
                connectgaps=True,
                marker={'color': color},
            ),
            row=row, col=1,
            secondary_y=False
        )

        # Overall split: successes versus everything else out of the total.
        n_success = success_overall[f'success_{col}']
        n_total = total_overall[f'total_{col}']
        fig.add_trace(
            go.Pie(
                values=[n_success, n_total - n_success],
                marker={'colors': ['seagreen', 'indianred'],
                        'line': {'color': 'white', 'width': 1}}
            ),
            row=row, col=2
        )

    # Layout: title every cartesian axis pair. As in the original loop over
    # range(1, n_rows + 1), axes are numbered 1..n_rows (the first one has no
    # numeric suffix), i.e. the pie cells are assumed not to use up axis numbers.
    axis_titles = {}
    for r in range(1, n_rows + 1):
        suffix = '' if r == 1 else str(r)
        axis_titles[f'xaxis{suffix}'] = {'title': 'Year'}
        axis_titles[f'yaxis{suffix}'] = {'title': 'Percentage'}

    fig.update_layout(
        showlegend=False,
        height=350*n_rows,
        width=1000,
        **axis_titles,
    )

    return fig






# Render the yearly rate + CI band and the overall pie for the first-serve statistics.
# The call is the cell's last expression, so the returned figure is displayed.
# The remaining statistics are currently disabled in the `columns` list below.
plot_ts_and_pie(m,
                success_overall, 
                total_overall, 
                stats_by_year, 
                upper_df, 
                lower_df, 
                columns=['firstIn', 'firstWon'],# 'secondWon', 'returnWon', 'ace', 'df'], 
                colors=['rgb(33,113,181)', 'rgb(217,71,1)', 'rgb(81, 178, 124)', 'rgb(235, 127, 134)', 'rgb(33,113,181)', 'rgb(217,71,1)',],
                )

In [47]:
def plot_distribution(m,
                    success_overall, 
                    total_overall,
                    stats_by_year, 
                    upper_df, 
                    lower_df, 
                    columns, 
                    colors = ['rgb(33,113,181)', 'rgb(217,71,1)', 'rgb(81, 178, 124)', 'rgb(235, 127, 134)'],
                    bin_size = 0.015,
                    ):
    """Plot, per statistic, yearly box plots next to a sideways histogram with a KDE curve.

    One subplot row is created per entry of ``columns``. The left cell is a
    box plot of ``m['perc_<col>']`` grouped by ``m['year']``; the right cell
    shows the distribution of the same values rotated by 90 degrees (values
    on the y axis), so that it shares the y axis with the box plot.

    Parameters
    ----------
    m
        Per-match table with a ``'year'`` column and one ``'perc_<col>'``
        column per statistic.
    success_overall, total_overall, stats_by_year, upper_df, lower_df
        Not used by this function; kept so existing callers keep working.
    columns
        Statistic names to plot, one subplot row each.
    colors
        Trace colours. They are reused cyclically when there are more
        statistics than colours. The sequence is only read, never mutated.
    bin_size
        Histogram bin width, in the units of the ``perc_<col>`` values.

    Returns
    -------
    The figure created by ``make_subplots`` with all traces and layout applied.

    Raises
    ------
    ValueError
        If ``columns`` or ``colors`` is empty.
    """
    n_cols, n_rows = 2, len(columns)
    if n_rows == 0:
        raise ValueError('columns must contain at least one statistic name')
    if len(colors) == 0:
        raise ValueError('colors must contain at least one colour')

    # A fresh pair of spec dicts per row, so rows never share mutable objects.
    specs = [[{}, {}] for _ in range(n_rows)]

    # Flat, row-major list of titles: (box plot, distribution) for every statistic.
    subplot_titles = [
        title
        for c in columns
        for title in (f'{c} Boxplot', f'{c} Distplot')
    ]

    fig = make_subplots(
            cols=n_cols,
            rows=n_rows,
            specs=specs,
            shared_xaxes=True,
            shared_yaxes=True,
            vertical_spacing=0.0,
            horizontal_spacing=0.05,
            subplot_titles=subplot_titles,
            column_widths=[0.6, 0.4]
    )

    for i, col in enumerate(columns):
        row = i + 1
        # Wrap around instead of raising IndexError when statistics outnumber colours.
        color = colors[i % len(colors)]
        perc = m[f'perc_{col}']

        fig.add_trace(
            go.Box(
                x=m['year'],
                y=perc,
                marker_color=color
            ),
            row=row, col=1
        )

        # Missing values must not reach the density estimate; with fewer than
        # two distinct values there is no distribution to estimate, so only the
        # box plot is drawn for that statistic.
        values = perc.dropna().to_numpy()
        if np.unique(values).size < 2:
            continue

        # create_distplot is used only to compute the bins and the KDE curve;
        # with show_rug=False its data holds exactly these two traces.
        hist, kde = ff.create_distplot([values], bin_size=bin_size,
                        group_labels=[col], show_rug=False, colors=[color],
                        histnorm='probability',
                    )['data']

        # Rebuild both traces with x and y swapped so the values run along the
        # y axis shared with the box plot.
        hist_ = go.Histogram(
            y=hist['x'],
            histnorm='probability',
            ybins={'start': hist['xbins']['start'], 'end': hist['xbins']['end'], 'size': hist['xbins']['size']},
            opacity=0.7, marker_color=color
        )

        kde_ = go.Scatter(
            x=kde['y'],
            y=kde['x'],
            marker_color=color
        )

        fig.add_trace(hist_, row=row, col=2)
        fig.add_trace(kde_, row=row, col=2)

    # Layout: with two cartesian cells per row the axes are numbered row by row,
    # so the left cell of row r uses axis 2r-1 and the bottom row uses the last two.
    fig.update_layout({
        'showlegend': False,
        'height': 350*n_rows, 
        'width': 1200,
        **{f'yaxis{2*r-1}': {'title': 'Percentage'} for r in range(1, n_rows+1)},
        f'xaxis{n_rows*n_cols-1}': {'title': 'Year'},
        f'xaxis{n_rows*n_cols}': {'title': 'Frequency'},
    })

    return fig


# Render the yearly box plots and the value distributions for the first-serve statistics.
# The call is the cell's last expression, so the returned figure is displayed.
# The remaining statistics are currently disabled in the `columns` list below.
plot_distribution(m,
                success_overall, 
                total_overall, 
                stats_by_year, 
                upper_df, 
                lower_df, 
                columns=['firstIn', 'firstWon'],#, 'secondWon', 'returnWon'],#, 'ace', 'df'], 
                colors=['rgb(33,113,181)', 'rgb(217,71,1)', 'rgb(81, 178, 124)', 'rgb(235, 127, 134)', 'rgb(33,113,181)', 'rgb(217,71,1)',],
                )

In [27]:

    def plot_cols_overtime(self):
        """Plot four serve/return percentages match by match and as yearly means with a CI band.

        One subplot row is created per statistic (first serve in, first serve
        won, second serve won, return won). The left cell plots
        ``perc_<col>`` of ``self.selected_matches`` for every match; the right
        cell plots ``perc_<col>`` of ``self.stats_by_year`` on top of the area
        between ``self.lower_df['lower_<col>']`` and
        ``self.upper_df['upper_<col>']``.

        Returns
        -------
        The figure created by ``make_subplots`` with all traces and layout applied.
        """
        # Statistic column suffix -> wording used in the subplot titles.
        # Titles, axis numbering and figure height below are all derived from
        # this mapping, so adding a statistic only requires a new entry here
        # (and a colour, although colours are reused cyclically).
        stat_labels = {
            'firstIn': '1st In',
            'firstWon': '1st Won',
            'secondWon': '2nd Won',
            'returnWon': 'Return Won',
        }
        cols = list(stat_labels)
        n_rows = len(cols)

        m1, m2 = self.selected_matches, self.stats_by_year

        # Match-level x labels and hover text.
        x1 = m1['tourney_name'] + '(' + m1['year'].astype(str) + '), ' + m1['round']
        txt_suffix = ', ' + m1['opponent_name'].apply(get_player_name) + ': ' + m1['result']
        # The 'winner' column is passed as the marker symbol, so the marker
        # shape varies with its value.
        symbol = m1['winner']

        x2 = m2['year']

        colors = [
            'rgb(33,113,181)',
            'rgb(217,71,1)',
            'rgb(81, 178, 124)',
            'rgb(235, 127, 134)'
        ]

        # Flat, row-major list of titles: (match by match, yearly) per statistic.
        subplot_titles = [
            f'Percentage {label} - {view}'
            for label in stat_labels.values()
            for view in ('Match by Match', 'Yearly mean and 95% CI')
        ]

        # row_heights is not passed: it only sets the rows' relative heights,
        # and equal heights are the default. The absolute size is set through
        # the layout height below.
        fig = make_subplots(
            cols=2, rows=n_rows,
            specs=[[{}, {}] for _ in range(n_rows)],
            shared_xaxes=True,
            subplot_titles=subplot_titles,
            column_widths=[0.65, 0.35],
            vertical_spacing=0.05,
            horizontal_spacing=0.05
        )

        band_line = dict(color='darksalmon', width=1)

        for i, col in enumerate(cols):
            row = i + 1
            color = colors[i % len(colors)]

            y1 = m1[f'perc_{col}']
            txt = y1.astype(float).round(2).astype(str) + txt_suffix

            # Left cell: one point per match.
            fig.add_trace(
                go.Scatter(
                    x=x1, y=y1,
                    name=col,
                    textposition='top center',
                    hovertemplate=txt,
                    texttemplate=txt,
                    mode='lines+markers',
                    connectgaps=True,
                    marker={'color': color, 'symbol': symbol},
                ),
                row=row, col=1
            )

            # Right cell: upper bound first, then the lower bound with
            # fill='tonexty' to shade the area between the two traces.
            fig.add_trace(
                go.Scatter(
                    x=x2, y=self.upper_df[f'upper_{col}'],
                    name='Upper Band',
                    fill=None,
                    mode='lines',
                    line=band_line
                ),
                row=row, col=2
            )

            fig.add_trace(
                go.Scatter(
                    x=x2, y=self.lower_df[f'lower_{col}'],
                    name='Lower Band',
                    fill='tonexty',
                    mode='lines',
                    line=band_line
                ),
                row=row, col=2
            )

            fig.add_trace(
                go.Scatter(
                    x=x2, y=m2[f'perc_{col}'],
                    name=f'Mean {col}',
                    mode='lines+markers',
                    marker={'color': color}
                ),
                row=row, col=2
            )

        def axis_key(kind, number):
            # The first axis of each kind carries no numeric suffix.
            return kind if number == 1 else f'{kind}{number}'

        # Layout: axes are numbered row by row (left cell 2r-1, right cell 2r).
        # Every y axis gets a title, with the right-hand one drawn on the right
        # side; only the bottom row's x axes are titled.
        layout = {
            'showlegend': False,
            # 350 px per row, matching the other multi-row figures in this notebook.
            'height': 350 * n_rows,
        }
        for r in range(1, n_rows + 1):
            layout[axis_key('yaxis', 2 * r - 1)] = {'title': 'Percentage'}
            layout[axis_key('yaxis', 2 * r)] = {'title': 'Percentage', 'side': 'right'}
        layout[axis_key('xaxis', 2 * n_rows - 1)] = {'title': 'Tournament (Year), Round', 'tickangle': 45}
        layout[axis_key('xaxis', 2 * n_rows)] = {'title': 'Year', 'tickangle': 45}

        fig.update_layout(**layout)

        return fig

1331 121


In [None]:




    def plot_surface_boxplot(self, col: str, surface_colors: Dict = surface_colors):
        """Draw notched box plots of one column of the selected matches, one box per surface.

        Parameters
        ----------
        col : str
            Column of ``self.selected_matches`` plotted along the x axis.
        surface_colors : Dict
            Mapping passed as ``color_discrete_map`` to colour the boxes by
            the ``'surface'`` column.

        Returns
        -------
        The plotly express figure with title and axis titles applied.
        """
        # Centred title pinned to the top edge of the figure.
        centred_title = dict(
            text=f'Summary Statistics of {col} by Surface',
            y=1,
            x=0.5,
            xanchor='center',
            yanchor='top',
        )
        layout_options = dict(
            title=centred_title,
            xaxis=dict(title=f'{col} Percentage'),
            yaxis=dict(title='Surface'),
        )

        box_fig = px.box(
            self.selected_matches,
            x=col,
            color='surface',
            color_discrete_map=surface_colors,
            notched=True,
        )
        box_fig.update_layout(**layout_options)

        return box_fig
    


    def plot_col_distplot(self, col, colors):
        """Draw a sideways histogram (with a marginal box plot) of one column of the selected matches.

        Parameters
        ----------
        col
            Column of ``self.selected_matches`` whose values are binned.
        colors
            Colour sequence passed as ``color_discrete_sequence``.

        Returns
        -------
        The plotly express figure with title and axis titles applied.
        """
        # The column is passed as `y`, so its values run along the y axis and
        # the bar lengths (the normalised counts) run along the x axis.
        fig = px.histogram(self.selected_matches, y=col, color_discrete_sequence=colors, nbins=50, 
                          marginal='box', histnorm='probability', opacity=0.8)

        fig.update_layout(
                title={'text': f'{col} Distribution', 'y':1, 'x':0.5,
                       'xanchor': 'center', 'yanchor': 'top'},
                # Axis titles follow the orientation above: frequency on x, values on y.
                # NOTE(review): histnorm='probability' yields fractions of the total
                # count, while the label says '%' — confirm which one is intended.
                xaxis={'title': 'Frequency (%)'},
                yaxis={'title': f'{col}'},
                showlegend=False
        )
        
        return fig
