In [1]:
import os
import numpy as np
import pandas as pd
import sys

from pathlib import Path
from sklearn.metrics import mean_squared_error

# Custom imports
sys.path.append(os.path.join(
    os.path.abspath('.'), 'notebooks'))

from commons import *
from per import *

In [2]:
# CSV lookup tables mapping Ids to player / team names
FILE_ids2teams = 'Ids_to_Teams.csv'
FILE_ids2players = 'Ids_to_Players.csv'

# Game types handled by this notebook, in processing order:
# playoffs ('PO') first, then regular seasons ('RS')
TYPE_game = ['PO', 'RS']

In [3]:
def kl_div(A, B, eps=1e-6, normalize=False):
    """
    Kullback–Leibler divergence D(A || B) = sum(A * log(A / B)).

    Positions where either input is NaN are discarded from BOTH inputs
    before the sum is taken, so the two vectors stay aligned.

    Parameters
    ----------
    A : array-like
        Values of the first (reference) distribution.

    B : array-like
        Values of the second distribution, same shape as A.

    eps : float
        Additive smoothing applied to every retained value of A and B to
        avoid log(0) and division by zero. The default (1e-6) reproduces
        the value that used to be hard-coded.

    normalize : bool
        If True, rescale the retained (and smoothed) values of A and B so
        that each sums to 1 before computing the divergence. Dropping NaN
        pairs and adding eps both change the totals, so without this the
        inputs are generally no longer proper probability distributions.
        The default (False) keeps the historical behaviour.

    Returns
    -------
    float
        The summed divergence; 0.0 when no valid pair remains.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    # Keep only the positions that are valid in both inputs
    valid = ~(np.isnan(A) | np.isnan(B))

    A = A[valid] + eps
    B = B[valid] + eps

    if normalize:
        A = A / A.sum()
        B = B / B.sum()

    return np.sum(A*np.log(A / B))

In [4]:
def rmse(A, B):
    """
    Root-mean-square error between A and B.

    Positions where either input is NaN are discarded from BOTH inputs
    before the error is computed.

    The value is computed directly with NumPy rather than through
    `sklearn.metrics.mean_squared_error(..., squared=False)`: that keyword
    was deprecated in scikit-learn 1.4 and removed in 1.6.

    Parameters
    ----------
    A : array-like
        First collection of values.

    B : array-like
        Second collection of values, same shape as A.

    Returns
    -------
    float
        sqrt(mean((A - B)**2)) over the pairs that are valid in both inputs.

    Raises
    ------
    ValueError
        If no valid (non-NaN) pair remains.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    # Keep only the positions that are valid in both inputs
    valid = ~(np.isnan(A) | np.isnan(B))

    A = A[valid]
    B = B[valid]

    if A.size == 0:
        # Same exception type scikit-learn raised for empty input
        raise ValueError('rmse requires at least one pair of non-NaN values.')

    return float(np.sqrt(np.mean((A - B)**2)))

In [5]:
def linmap(X, o_min, o_max, n_min, n_max):
    """
    Linearly rescale X from the old range (o_min, o_max) onto the new
    range (n_min, n_max): o_min is sent to n_min and o_max to n_max.

    X may be a scalar or anything supporting element-wise arithmetic.
    A zero-width old range (o_min == o_max) is not handled specially.
    """
    span_old = o_max - o_min
    span_new = n_max - n_min

    # Distance of X from the lower bound of the old range
    offset = X - o_min

    return (offset*span_new) / span_old + n_min

In [6]:
def agg_per(row_PER: pd.DataFrame,
            w_rs: float = 0.1,
            w_po: float = 0.9):
    """
    Weighted sum of the average RS and PO PER values.

    A missing (NaN) average counts as 0, so it contributes nothing to
    the sum.

    Parameters
    ----------
    row_PER : pd.DataFrame
        Row from the PER DF; it must expose the keys 'PER_avg_rs' and
        'PER_avg_po'

    w_rs : float
        Weight on PER from RS play

    w_po : float
        Weight on PER from PO play

    Returns
    -------
    w_rs*PER_avg_rs + w_po*PER_avg_po, with NaN averages replaced by 0.
    """
    def _nan_as_zero(values):
        # Element-wise: 0. where the value is NaN, the value itself otherwise
        return np.where(np.isnan(values), 0., values)

    per_po = _nan_as_zero(row_PER['PER_avg_po'])
    per_rs = _nan_as_zero(row_PER['PER_avg_rs'])

    return w_rs*per_rs + w_po*per_po

## Compute PER for all NBA players

### Regular seasons (RS) and playoffs (PO)

In [8]:
# Average PER table per game type, filled in by the loop below
average_PERs = {t: None for t in TYPE_game}

# The Id lookup tables do not depend on the game type or the year:
# read them once instead of once per game type
df_ids2players = pd.read_csv(DIR_pro / FILE_ids2players)
df_ids2teams   = pd.read_csv(DIR_pro / FILE_ids2teams)

for t in TYPE_game:
    # Pick the source / destination directories of this game type
    if t == 'PO':
        src_dir_league  = DIR_pro_po_league
        src_dir_teams   = DIR_pro_po_teams
        src_dir_players = DIR_pro_po_players

        dst_dir_PER = DIR_pro_po / 'PER'
    elif t == 'RS':
        src_dir_league  = DIR_pro_rs_league
        src_dir_teams   = DIR_pro_rs_teams
        src_dir_players = DIR_pro_rs_players

        dst_dir_PER = DIR_pro_rs / 'PER'
    else:
        raise ValueError('Only NBA regular seasons or playoffs can be processed.')

    PER_list = []

    for y in range(YEAR_start, YEAR_end + 1):
        filename = year2filename(y)

        # Load all data for year y
        df_players = pd.read_csv(src_dir_players / filename)
        df_teams   = pd.read_csv(src_dir_teams / filename)
        df_league  = pd.read_csv(src_dir_league / filename)

        # Merge players DF with Ids info
        df_players['Player'] = df_players['Player'].apply(unicd2ascii)

        # NOTE: `drop(label, 1)` (positional axis) was removed in pandas 2.0;
        # the `columns=` keyword works on every pandas version
        df_players = pd.merge(df_players, df_ids2teams,
                              left_on='Team', right_on='Id') \
                       .rename(columns={'Name': 'Team_full'}) \
                       .drop(columns='Id')

        # NOTE(review): players are matched to Ids by name only, so every Id
        # sharing a name receives the same rows -- likely the reason the two
        # 'Larry Johnson' Ids show identical figures in the output; confirm
        df_players = pd.merge(df_players, df_ids2players,
                              left_on='Player', right_on='Name') \
                       .drop(columns=['Age', 'Name'])

        # Join players and teams DFs
        df_PT = pd.merge(df_players, df_teams,
                         left_on='Team_full', right_on='Team',
                         suffixes=('_p', '_t'))

        df_PT = df_PT.drop(columns='Team_t') \
                     .rename(columns={'Team_p': 'Team',
                                      'Pace': 'Pace_t'})

        # Repeat the league DF once per player row so that it can be placed
        # side by side with df_PT (presumably one league row per season --
        # TODO confirm)
        df_nL = pd.concat([df_league]*len(df_PT), ignore_index=True)

        # Join PT and league DFs
        df_PTL = pd.concat([df_PT, df_nL], axis=1) \
                   .set_index('Id')

        # Compute uPER and aPER
        df_PTL['uPER'] = df_PTL.apply(uper, axis=1)
        df_PTL['aPER'] = df_PTL.apply(aper, axis=1)

        aPER_league = df_PTL['aPER'].mean()

        # Compute PER adjusted with league's aPER
        df_PTL['PER'] = df_PTL.apply(lambda r:
                                     per(r, aPER_league), axis=1)

        df_PTL = df_PTL.sort_values(by='PER', ascending=False)

        sub_df_PTL = df_PTL[['Player', 'DOB', 'GP_p', 'PER']] \
                     .rename(columns={'GP_p': 'GP'}) \
                     .assign(Season=year2rangestr(y))

        # Collect all PER-rich sub DFs
        PER_list.append(sub_df_PTL)

    df_PER = pd.concat(PER_list).sort_index()

    # Group by NBA player Id
    groupby_PER = df_PER.groupby(df_PER.index)

    # `transform` broadcasts each player's mean back onto all of that
    # player's season rows
    df_PER['PER_avg'] = groupby_PER['PER'].transform('mean')
    df_PER['GP_avg']  = groupby_PER['GP'].transform('mean')

    # Compute active NBA seasons per player (one row of df_PER per season)
    df_seasons = groupby_PER.size().to_frame(name='Seasons')

    # Compute average PER across NBA seasons
    df_avg_PER = df_PER[['Player', 'DOB', 'PER_avg', 'GP_avg']].drop_duplicates()
    df_avg_PER = df_avg_PER.sort_values(by=['PER_avg'], ascending=False)
    df_avg_PER = df_avg_PER.join(df_seasons)

    df_avg_PER.to_csv(dst_dir_PER /
                      year2filename(YEAR_start,
                                    YEAR_end))

    average_PERs[t] = df_avg_PER

### Merge RS and PO distributions

In [9]:
# Outer-join the RS and PO averages on their player-Id index, so that
# players appearing in only one game type are kept. Every shared column
# gets a '_rs' / '_po' suffix.
df_composite_PERs = average_PERs['RS'].join(average_PERs['PO'],
                                            how='outer',
                                            lsuffix='_rs',
                                            rsuffix='_po') \
                                      .rename_axis('Id')

# Merge RS and PO data: prefer the PO value, fall back to the RS one
df_composite_PERs['Player'] = np.where(
    df_composite_PERs['Player_po'].isnull(),
    df_composite_PERs['Player_rs'],
    df_composite_PERs['Player_po'])

df_composite_PERs['DOB'] = np.where(
    df_composite_PERs['DOB_po'].isnull(),
    df_composite_PERs['DOB_rs'],
    df_composite_PERs['DOB_po'])

# Seasons: prefer the RS count, fall back to the PO one
df_composite_PERs['Seasons'] = np.where(
    df_composite_PERs['Seasons_rs'].isnull(),
    df_composite_PERs['Seasons_po'],
    df_composite_PERs['Seasons_rs'])

# No record for a game type means no games played in it
df_composite_PERs['GP_avg_rs'] = df_composite_PERs['GP_avg_rs'].fillna(0.0)
df_composite_PERs['GP_avg_po'] = df_composite_PERs['GP_avg_po'].fillna(0.0)

# Normalize PER values between 0. and 1., then divide by the column total
# so that each PER_pct_* column sums to 1 (NaN entries are skipped by sum).
# linmap is applied to the whole column at once: the row-wise apply
# recomputed the column min / max for every single row.
sr_PER_avg_rs = df_composite_PERs['PER_avg_rs']
sr_PER_avg_po = df_composite_PERs['PER_avg_po']

sr_PER_scaled_rs = linmap(sr_PER_avg_rs,
                          sr_PER_avg_rs.min(),
                          sr_PER_avg_rs.max(),
                          0., 1.)

df_composite_PERs['PER_pct_rs'] = sr_PER_scaled_rs / sr_PER_scaled_rs.sum()

sr_PER_scaled_po = linmap(sr_PER_avg_po,
                          sr_PER_avg_po.min(),
                          sr_PER_avg_po.max(),
                          0., 1.)

df_composite_PERs['PER_pct_po'] = sr_PER_scaled_po / sr_PER_scaled_po.sum()

# Compute aggregate PER (agg_per works column-wise on the whole DF)
df_composite_PERs['PER_avg_agg'] = agg_per(df_composite_PERs)

# Keep and re-order the final DF columns; the suffixed Player / DOB /
# Seasons helper columns are left out by this selection
df_composite_PERs = df_composite_PERs[['Player', 'DOB', 'Seasons', 'PER_avg_agg',
                                       'GP_avg_rs', 'PER_avg_rs', 'PER_pct_rs',
                                       'GP_avg_po', 'PER_avg_po', 'PER_pct_po']]

df_composite_PERs = df_composite_PERs.sort_values(by='PER_avg_agg',
                                                  ascending=False)

df_composite_PERs.to_csv(DIR_pro / 'PER-{}'
                         .format(year2filename(
                             YEAR_start, YEAR_end)))

df_composite_PERs.head(10)

Unnamed: 0_level_0,Player,DOB,Seasons,PER_avg_agg,GP_avg_rs,PER_avg_rs,PER_pct_rs,GP_avg_po,PER_avg_po,PER_pct_po
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Meeksjo01,Jodie Meeks,8/21/1987,7.0,213.962815,14.0,7.609444,0.000709,3.0,236.890967,0.006811
Brookaa01,Aaron Brooks,1/14/1985,7.0,157.637556,16.571429,23.143422,0.000782,3.0,172.581349,0.005435
Mitchdo01,Donovan Mitchell,9/7/1996,4.0,107.994666,20.5,27.818081,0.000804,3.5,116.903176,0.004243
Rideris01,Isaiah Rider,3/12/1971,4.0,81.426536,16.5,8.123236,0.000712,4.0,89.571347,0.003658
Youngtr01,Trae Young,9/19/1998,3.0,75.331702,21.0,13.580662,0.000737,7.0,82.192929,0.0035
Obertfa01,Fabricio Oberto,3/21/1975,3.0,72.288942,7.333333,-8.710452,0.000633,3.0,81.288875,0.003481
Jordami01,Michael Jordan,2/17/1963,3.0,66.046382,27.0,40.84272,0.000865,11.5,68.846789,0.003214
Gordobe01,Ben Gordon,4/4/1983,11.0,60.830773,21.636364,22.429358,0.000779,5.0,65.097597,0.003134
Bogdabo01,Bogdan Bogdanovic,8/18/1992,4.0,57.713429,21.25,8.582731,0.000714,3.0,63.172396,0.003093
Rosste01,Terrence Ross,2/5/1991,9.0,55.028027,16.444444,12.757246,0.000734,3.0,59.72478,0.003019


### Compare RS and PO distributions

**Kullback-Leibler divergence**

In [10]:
# D(PO || RS): the playoff distribution is passed first, so it is the one
# weighting the log-ratio. kl_div ignores players with a NaN in either column.
kl_div(df_composite_PERs['PER_pct_po'],
       df_composite_PERs['PER_pct_rs'])

1.0183375071139675

**Root-mean-square error**

In [11]:
# RMSE between the average PO and RS PER values; rmse only uses the players
# that have a (non-NaN) value in both columns.
rmse(df_composite_PERs['PER_avg_po'],
     df_composite_PERs['PER_avg_rs'])

23.881628904744193

### NBA players at or above the 75th percentile (Q3) of average games played, in both RS and PO

In [12]:
# Third-quartile thresholds of the average games played, per game type
gp_rs_q3 = df_composite_PERs['GP_avg_rs'].quantile(0.75)
gp_po_q3 = df_composite_PERs['GP_avg_po'].quantile(0.75)

# A player is kept only when reaching the threshold in BOTH RS and PO
reaches_rs_q3 = df_composite_PERs['GP_avg_rs'] >= gp_rs_q3
reaches_po_q3 = df_composite_PERs['GP_avg_po'] >= gp_po_q3

df_agg_PERs_75 = df_composite_PERs.loc[reaches_rs_q3 & reaches_po_q3]

p75_filename = 'PER-{}-p75.csv'.format(
    year2rangestr(YEAR_start, YEAR_end))

df_agg_PERs_75.to_csv(DIR_pro / p75_filename)

df_agg_PERs_75.head(10)

Unnamed: 0_level_0,Player,DOB,Seasons,PER_avg_agg,GP_avg_rs,PER_avg_rs,PER_pct_rs,GP_avg_po,PER_avg_po,PER_pct_po
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Youngtr01,Trae Young,9/19/1998,3.0,75.331702,21.0,13.580662,0.000737,7.0,82.192929,0.0035
Jordami01,Michael Jordan,2/17/1963,3.0,66.046382,27.0,40.84272,0.000865,11.5,68.846789,0.003214
Gordobe01,Ben Gordon,4/4/1983,11.0,60.830773,21.636364,22.429358,0.000779,5.0,65.097597,0.003134
Turneev01,Evan Turner,10/27/1988,9.0,52.885628,20.666667,13.864977,0.000739,4.5,57.221256,0.002966
Murraja01,Jamal Murray,2/23/1997,5.0,52.678292,21.8,30.654517,0.000817,6.5,55.125378,0.002921
Holidjr01,Jrue Holiday,6/12/1990,12.0,49.6428,22.916667,13.904332,0.000739,4.75,53.613741,0.002888
Ellismo01,Monta Ellis,10/26/1985,12.0,49.24238,26.416667,25.394914,0.000793,4.0,51.892099,0.002851
Arenagi01,Gilbert Arenas,1/6/1982,9.0,49.1202,20.888889,19.360268,0.000764,4.0,52.426859,0.002863
Johnsla01,Larry Johnson,11/28/1954,4.0,48.380025,22.75,18.701444,0.000761,7.0,51.677645,0.002847
Johnsla02,Larry Johnson,3/14/1969,4.0,48.380025,22.75,18.701444,0.000761,7.0,51.677645,0.002847
