In [177]:
import os
import numpy as np
import pandas as pd
import sys

from pathlib import Path
from sklearn.metrics import mean_squared_error

# Custom imports
sys.path.append(os.path.join(
    os.path.abspath('.'), 'notebooks'))

from commons import *

In [2]:
# Game types processed by this notebook: 'PO' (playoffs) and 'RS' (regular season)
TYPE_game = ['PO', 'RS']

# Look-up tables (read from DIR_pro) used to merge Ids into the players data
FILE_ids2players = 'Ids_to_Players.csv'
FILE_ids2teams = 'Ids_to_Teams.csv'

In [3]:
def factor(row_ptl: pd.DataFrame) -> float:
    """
    Assist factor used by the unadjusted PER.

    Computed from the un-suffixed 'Ast', 'FGM' and 'FTM' entries of the row
    as 2/3 - (0.5*Ast/FGM) / (2*FGM/FTM).
    """
    half_ast_per_fgm = 0.5*row_ptl['Ast'] / row_ptl['FGM']
    dbl_fgm_per_ftm = 2.0*row_ptl['FGM'] / row_ptl['FTM']

    return 2/3 - half_ast_per_fgm / dbl_fgm_per_ftm

In [4]:
def vop(row_ptl: pd.DataFrame) -> float:
    """
    Value of possession: points divided by FGA - OReb + TO + 0.44*FTA,
    all read from the un-suffixed entries of the row.
    """
    fga, oreb = row_ptl['FGA'], row_ptl['OReb']
    tov, fta = row_ptl['TO'], row_ptl['FTA']

    possessions = fga - oreb + tov + 0.44*fta

    return row_ptl['Pts'] / possessions

In [5]:
def drbp(row_ptl: pd.DataFrame) -> float:
    """
    Defensive rebounds percentage: share of 'DReb' over total 'Reb'.
    """
    defensive, total = row_ptl['DReb'], row_ptl['Reb']

    return defensive / total

In [6]:
def uper(row_ptl: pd.DataFrame) -> float:
    """
    Unadjusted PER (uPER) for one row.

    The row mixes three families of entries: player values ('_p' suffix),
    team values ('_t' suffix) and un-suffixed values. A 1e-1 offset is
    added to the team field goals made and to the player's minutes before
    they are used as divisors.
    """
    team_fgm = row_ptl['FGM_t'] + 1e-1
    missed_ft = row_ptl['FTA_p'] - row_ptl['FTM_p']
    missed_fg = row_ptl['FGA_p'] - row_ptl['FGM_p']

    # Three-pointers made minus the personal-fouls penalty
    three_and_fouls = row_ptl['3PM_p'] \
                    - (row_ptl['PF_p']*row_ptl['FTM']) / row_ptl['PF']

    # Free throws made, discounted by the team assist ratio
    free_throws = (row_ptl['FTM_p']/2) \
                * (2 - row_ptl['Ast_t'] / (3*team_fgm))

    # Field goals made, discounted by factor * team assist ratio
    field_goals = row_ptl['FGM_p'] \
                * (2 - factor(row_ptl)*row_ptl['Ast_t'] / team_fgm)

    assists = 2*row_ptl['Ast_p'] / 3

    # Terms that get scaled by the value of possession
    possession_terms = \
        drbp(row_ptl)*(2*row_ptl['OReb_p']
                       + row_ptl['Blk_p']
                       - 0.2464*missed_ft
                       - missed_fg
                       - row_ptl['Reb_p'])                        \
        + 0.44*row_ptl['FTA']*(row_ptl['PF_p'] / row_ptl['PF'])   \
        - (row_ptl['TO_p'] + row_ptl['OReb_p'])                   \
        + row_ptl['Stl_p'] + row_ptl['Reb_p']                     \
        - 0.1936*missed_ft

    possession_value = vop(row_ptl)*possession_terms

    per_minute = 1/(row_ptl['Min_p'] + 1e-1)

    return per_minute*(three_and_fouls + free_throws + field_goals
                       + assists + possession_value)

In [7]:
def aper(row_ptl: pd.DataFrame) -> float:
    """
    Adjusted PER: the row's 'uPER' scaled by 'Pace' over 'Pace_t'.
    """
    pace_scaled = row_ptl['uPER'] * row_ptl['Pace']

    return pace_scaled / row_ptl['Pace_t']

In [8]:
def per(row_ptl: pd.DataFrame,
        aPER_league: float) -> float:
    """
    Player efficiency rating (PER) by J. Hollinger.
    Ref. https://en.wikipedia.org/wiki/Player_efficiency_rating

    Rescales the row's 'aPER' so that a value equal to `aPER_league`
    maps to 15.
    """
    scaled_aPER = row_ptl['aPER']*15

    return scaled_aPER/aPER_league

In [175]:
def kl_div(A, B, eps=1e-6):
    """
    Kullback–Leibler divergence of A from B: sum(A * log(A / B)).

    Positions where either input is NaN are discarded from both inputs
    before the sum is taken; the remaining values are NOT re-normalized.

    Parameters
    ----------
    A, B : array-like
        Inputs of identical shape, converted to float arrays.
    eps : float, optional
        Additive smoothing applied to both inputs to avoid log(0) and
        division by zero. Defaults to 1e-6.

    Returns
    -------
    float
        The (smoothed) divergence over the positions valid in both inputs.

    Raises
    ------
    ValueError
        If A and B do not have the same shape.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    if A.shape != B.shape:
        raise ValueError(
            'A and B must have the same shape, got {} and {}.'
            .format(A.shape, B.shape))

    # Keep only the positions where both inputs are defined
    valid = ~(np.isnan(A) | np.isnan(B))

    P = A[valid] + eps
    Q = B[valid] + eps

    return np.sum(P*np.log(P / Q))

In [179]:
def rmse(A, B):
    """
    Root-mean-square error between A and B.

    Positions where either input is NaN are discarded from both inputs.
    The value is computed with numpy directly, so it does not depend on
    the `squared` keyword of sklearn's `mean_squared_error`, which recent
    scikit-learn releases no longer accept.

    Parameters
    ----------
    A, B : array-like
        Inputs of identical shape, converted to float arrays.

    Returns
    -------
    float
        sqrt(mean((A - B)**2)) over the positions valid in both inputs.

    Raises
    ------
    ValueError
        If no position is valid in both inputs.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    # Keep only the positions where both inputs are defined
    valid = ~(np.isnan(A) | np.isnan(B))

    residuals = A[valid] - B[valid]

    if residuals.size == 0:
        raise ValueError(
            'rmse requires at least one pair of non-NaN values.')

    return float(np.sqrt(np.mean(residuals**2)))

In [101]:
def linmap(X, o_min, o_max, n_min, n_max):
    """
    Linearly rescale X from the old range (o_min, o_max) to the new
    range (n_min, n_max). Works element-wise on array-like X.
    """
    old_span = o_max - o_min
    new_span = n_max - n_min

    shifted = X - o_min

    return (shifted*new_span) / old_span + n_min

In [212]:
def agg_per(row_PER: pd.DataFrame):
    """
    Aggregate PER with both RS and PO contributions.

    Each average PER is weighted by one minus its own share of the total
    average games played, so the game type with fewer games weighs more.
    A missing (NaN) average PER contributes 0.

    Parameters
    ----------
    row_PER : pd.DataFrame
        Row from the PER DataFrame; must provide 'GP_avg_rs', 'GP_avg_po',
        'PER_avg_rs' and 'PER_avg_po'.
    """
    gp_rs = row_PER['GP_avg_rs']
    gp_po = row_PER['GP_avg_po']
    gp_total = gp_rs + gp_po

    # Numeric stability offset: shifts 0.1 games from one side to the
    # other when a player has no games of one type
    offset = 0.1 if gp_rs == 0.0 else (-0.1 if gp_po == 0.0 else 0.)

    # Emphasize PO play rather than RS play
    weight_rs = 1. - (gp_rs + offset)/gp_total
    weight_po = 1. - (gp_po - offset)/gp_total

    per_rs = np.where(np.isnan(row_PER['PER_avg_rs']),
                      0., row_PER['PER_avg_rs'])

    per_po = np.where(np.isnan(row_PER['PER_avg_po']),
                      0., row_PER['PER_avg_po'])

    return weight_rs*per_rs + weight_po*per_po

## Compute PER for all NBA players

### Regular seasons (RS) and playoffs (PO)

In [10]:
# Average PER per player, keyed by game type ('PO' / 'RS')
average_PERs = {t: None for t in TYPE_game}

# The Id look-up tables depend neither on the game type nor on the
# season: load them once
df_ids2players = pd.read_csv(DIR_pro / FILE_ids2players)
df_ids2teams   = pd.read_csv(DIR_pro / FILE_ids2teams)

for t in TYPE_game:
    if t == 'PO':
        src_dir_league  = DIR_pro_po_league
        src_dir_teams   = DIR_pro_po_teams
        src_dir_players = DIR_pro_po_players

        dst_dir_PER = DIR_pro_po / 'PER'
    elif t == 'RS':
        src_dir_league  = DIR_pro_rs_league
        src_dir_teams   = DIR_pro_rs_teams
        src_dir_players = DIR_pro_rs_players

        dst_dir_PER = DIR_pro_rs / 'PER'
    else:
        raise ValueError('Only NBA regular seasons or playoffs can be processed.')

    PER_list = []

    for y in range(YEAR_start, YEAR_end + 1):
        filename = year2filename(y)

        # Load all data for year y
        df_players = pd.read_csv(src_dir_players / filename)
        df_teams   = pd.read_csv(src_dir_teams / filename)
        df_league  = pd.read_csv(src_dir_league / filename)

        # Merge players DF with Ids info
        df_players['Player'] = df_players['Player'].apply(unicd2ascii)

        # NOTE: `drop` is called with `columns=`; the positional axis
        # form (`drop(label, 1)`) is rejected by pandas >= 2.0
        df_players = pd.merge(df_players, df_ids2teams,
                              left_on='Team', right_on='Id') \
                       .rename(columns={'Name': 'Team_full'}) \
                       .drop(columns='Id')

        df_players = pd.merge(df_players, df_ids2players,
                              left_on='Player', right_on='Name') \
                       .drop(columns=['Age', 'Name'])

        # Join players and teams DFs
        df_PT = pd.merge(df_players, df_teams,
                         left_on='Team_full', right_on='Team',
                         suffixes=('_p', '_t'))

        df_PT = df_PT.drop(columns='Team_t') \
                     .rename(columns={'Team_p': 'Team',
                                      'Pace': 'Pace_t'})

        # Repeat the league row(s) once per player/team row so both DFs
        # can be concatenated side by side on their positional index
        df_nL = pd.concat([df_league]*len(df_PT), ignore_index=True)

        # Join pt and league DFs
        df_PTL = pd.concat([df_PT, df_nL], axis=1) \
                   .set_index('Id')

        # Compute the unadjusted PER
        df_PTL['uPER'] = df_PTL.apply(uper, axis=1)

        # Compute the adjusted PER
        df_PTL['aPER'] = df_PTL.apply(aper, axis=1)

        # League reference: plain (unweighted) mean over all player rows
        aPER_league = df_PTL['aPER'].mean()

        # Compute the PER adjusted with league's aPER
        df_PTL['PER'] = df_PTL.apply(lambda r:
                                     per(r, aPER_league), axis=1)

        df_PTL = df_PTL.sort_values(by='PER', ascending=False)

        sub_df_PTL = df_PTL[['Player', 'DOB', 'GP_p', 'PER']] \
                     .rename(columns={'GP_p': 'GP'})          \
                     .assign(Season=year2rangestr(y))

        # Collect all PER-rich sub DFs
        PER_list.append(sub_df_PTL)

    df_PER = pd.concat(PER_list).sort_index()

    # Group by NBA player Id
    groupby_PER = df_PER.groupby(df_PER.index)

    # The per-Id means are aligned back onto every season row of that Id
    df_PER['PER_avg'] = groupby_PER['PER'].mean()
    df_PER['GP_avg']  = groupby_PER['GP'].mean()

    # Compute active NBA seasons per player
    df_seasons = groupby_PER.size()           \
                 .reset_index(name='Seasons') \
                 .set_index('Id')

    # Compute average PER across NBA seasons
    df_avg_PER = df_PER[['Player', 'DOB', 'PER_avg', 'GP_avg']].drop_duplicates()
    df_avg_PER = df_avg_PER.sort_values(by=['PER_avg'], ascending=False)
    df_avg_PER = df_avg_PER.join(df_seasons)

    df_avg_PER.to_csv(dst_dir_PER /
                      total_year2filename(YEAR_start,
                                          YEAR_end))

    average_PERs[t] = df_avg_PER

### Merge RS and PO distributions

In [214]:
# Outer-join the RS and PO averages directly on their shared player-Id
# index. Every column exists on both sides, so each one receives the
# '_rs' / '_po' suffix. (A plain index join avoids the `key_0` / `index`
# clean-up and the positional `drop(label, 1)` call, which pandas >= 2.0
# rejects.)
df_composite_PERs = average_PERs['RS'].join(average_PERs['PO'],
                                            how='outer',
                                            lsuffix='_rs',
                                            rsuffix='_po') \
                                      .rename_axis('Id')

# Merge RS and PO data: prefer the PO value for Player/DOB and the RS
# value for Seasons, falling back to the other side when missing
df_composite_PERs['Player'] = df_composite_PERs['Player_po'] \
    .fillna(df_composite_PERs['Player_rs'])

df_composite_PERs['DOB'] = df_composite_PERs['DOB_po'] \
    .fillna(df_composite_PERs['DOB_rs'])

df_composite_PERs['Seasons'] = df_composite_PERs['Seasons_rs'] \
    .fillna(df_composite_PERs['Seasons_po'])

# A player absent from one game type has played 0 games of that type
df_composite_PERs['GP_avg_rs'] = df_composite_PERs['GP_avg_rs'].fillna(0.0)
df_composite_PERs['GP_avg_po'] = df_composite_PERs['GP_avg_po'].fillna(0.0)

# Normalize PER values between 0. and 1., then divide by the column sum
# so that each PER_pct column adds up to 1 (NaN entries stay NaN)
sr_PER_avg_rs = df_composite_PERs['PER_avg_rs']
sr_PER_avg_po = df_composite_PERs['PER_avg_po']

# linmap is element-wise, so it is applied to the whole column at once
# instead of recomputing min/max for every row
PER_unit_rs = linmap(sr_PER_avg_rs,
                     sr_PER_avg_rs.min(),
                     sr_PER_avg_rs.max(),
                     0., 1.)

df_composite_PERs['PER_pct_rs'] = PER_unit_rs / PER_unit_rs.sum()

PER_unit_po = linmap(sr_PER_avg_po,
                     sr_PER_avg_po.min(),
                     sr_PER_avg_po.max(),
                     0., 1.)

df_composite_PERs['PER_pct_po'] = PER_unit_po / PER_unit_po.sum()

# Compute aggregate PER
df_composite_PERs['PER_avg_agg'] = df_composite_PERs.apply(agg_per, axis=1)

# Keep and re-order the final DF columns (this also discards the
# redundant suffixed Player/DOB/Seasons columns)
df_composite_PERs = df_composite_PERs[['Player', 'DOB', 'Seasons', 'PER_avg_agg',
                                       'GP_avg_rs', 'PER_avg_rs', 'PER_pct_rs',
                                       'GP_avg_po', 'PER_avg_po', 'PER_pct_po']]

df_composite_PERs = df_composite_PERs.sort_values(by='PER_avg_agg',
                                                  ascending=False)

df_composite_PERs.to_csv(DIR_pro / 'PER-{}'
                         .format(total_year2filename(
                             YEAR_start, YEAR_end)))

df_composite_PERs.head(10)

Unnamed: 0_level_0,Player,DOB,Seasons,PER_avg_agg,GP_avg_rs,PER_avg_rs,PER_pct_rs,GP_avg_po,PER_avg_po,PER_pct_po
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Meeksjo01,Jodie Meeks,8/21/1987,7.0,200.127771,14.0,7.330876,0.000711,3.0,241.441391,0.006585
Brookaa01,Aaron Brooks,1/14/1985,7.0,153.347083,16.571429,23.12272,0.000777,3.0,176.922183,0.005296
Mitchdo01,Donovan Mitchell,9/7/1996,4.0,105.417531,20.5,28.246144,0.000798,3.5,118.593134,0.004131
Rideris01,Isaiah Rider,3/12/1971,4.0,74.614375,16.5,7.742245,0.000713,4.0,90.825801,0.003576
Youngtr01,Trae Young,9/19/1998,3.0,65.724099,21.0,14.178737,0.00074,7.0,82.905886,0.003418
Jordami01,Michael Jordan,2/17/1963,3.0,62.044937,27.0,41.427262,0.000853,11.5,70.826539,0.003176
Gordobe01,Ben Gordon,4/4/1983,11.0,57.553169,21.636364,22.855441,0.000776,5.0,65.571551,0.003071
Bogdabo01,Bogdan Bogdanovic,8/18/1992,4.0,57.041445,21.25,8.844365,0.000717,3.0,63.845739,0.003037
Obertfa01,Fabricio Oberto,3/21/1975,3.0,55.637088,7.333333,-9.213217,0.000642,3.0,82.166759,0.003403
Rosste01,Terrence Ross,2/5/1991,9.0,53.446151,16.444444,12.658198,0.000733,3.0,60.887197,0.002978


### Compare RS and PO distributions

**Kullback-Leibler divergence**

In [197]:
# Divergence of the PO share distribution from the RS one.
# kl_div discards players missing either value, while each PER_pct column
# was normalized over all of its own non-NaN entries, so the compared
# vectors are not re-normalized over the common players.
kl_div(df_composite_PERs['PER_pct_po'],
       df_composite_PERs['PER_pct_rs'])

1.0183064527433943

**Root-mean-square error**

In [198]:
# RMSE between the average PO and RS PER; rmse discards players for whom
# either value is NaN, so only players with both averages are compared.
rmse(df_composite_PERs['PER_avg_po'],
     df_composite_PERs['PER_avg_rs'])

24.51533804259828

### 75th percentile (Q3) NBA players

In [217]:
# Third-quartile thresholds of the average games played, per game type
q3_GP_avg_rs = df_composite_PERs['GP_avg_rs'].quantile(0.75)
q3_GP_avg_po = df_composite_PERs['GP_avg_po'].quantile(0.75)

# Keep the players at or above the threshold for BOTH game types
bitm_q3_rs = df_composite_PERs['GP_avg_rs'] >= q3_GP_avg_rs
bitm_q3_po = df_composite_PERs['GP_avg_po'] >= q3_GP_avg_po

df_agg_PERs_75 = df_composite_PERs.loc[bitm_q3_rs & bitm_q3_po]

path_PER_p75 = DIR_pro / 'PER-{}-p75.csv'.format(
    total_year2rangestr(YEAR_start, YEAR_end))

df_agg_PERs_75.to_csv(path_PER_p75)

df_agg_PERs_75.head(10)

Unnamed: 0_level_0,Player,DOB,Seasons,PER_avg_agg,GP_avg_rs,PER_avg_rs,PER_pct_rs,GP_avg_po,PER_avg_po,PER_pct_po
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Youngtr01,Trae Young,9/19/1998,3.0,65.724099,21.0,14.178737,0.00074,7.0,82.905886,0.003418
Jordami01,Michael Jordan,2/17/1963,3.0,62.044937,27.0,41.427262,0.000853,11.5,70.826539,0.003176
Gordobe01,Ben Gordon,4/4/1983,11.0,57.553169,21.636364,22.855441,0.000776,5.0,65.571551,0.003071
Turneev01,Evan Turner,10/27/1988,9.0,50.491999,20.666667,14.02588,0.000739,4.5,58.432202,0.002929
Murraja01,Jamal Murray,2/23/1997,5.0,50.313051,21.8,30.892011,0.000809,6.5,56.103728,0.002882
Ellismo01,Monta Ellis,10/26/1985,12.0,49.222455,26.416667,25.89873,0.000788,4.0,52.754123,0.002815
Arenagi01,Gilbert Arenas,1/6/1982,9.0,47.58864,20.888889,19.301789,0.000761,4.0,53.005271,0.00282
Holidjr01,Jrue Holiday,6/12/1990,12.0,47.444525,22.916667,13.701412,0.000738,4.75,54.438552,0.002849
Masonde01,Desmond Mason,10/11/1977,9.0,46.05017,25.333333,15.911576,0.000747,4.0,50.808896,0.002776
Jamesle01,LeBron James,12/30/1984,18.0,45.961236,26.388889,46.061535,0.000872,7.0,45.93463,0.002679
