# Calculating Regularized Adjusted Plus-minus (RAPM)

This notebook is combing the work of https://github.com/EvanZ/nba-rapm and https://github.com/ethanluoyc/statsnba-playbyplay 

Since the original play by play repository has some bugs I am using  my fork https://github.com/903124/statsnba-playbyplay 

The wrapper code can only be used in Python 2

A few games are not counted due to bugs in nba.com / wrapper

In [None]:
%load_ext autoreload
%autoreload 2

from statsnba import Game, Api
import requests_cache
import pandas as pd
pd.options.display.max_columns = 999

# requests_cache.install_cache('test_cache')

In [3]:
#load the statsnba api

api = Api()
game_ids = api.GetSeasonGameIDs('2017-18', 'Regular Season')


In [None]:
#Start dowloading play by play data from nba.com
#Depends on network speed, computing power and unforseeable bug it would take a couple of hours or more

import time
start_time = time.time()
def matchup_to_df(matchups):
    lst = []
    for matchup in sample_game.Matchups:
        matchup_dict = {}
        _home_players = sorted([p.PlayerName for p in matchup.HomePlayers])
        _away_players = sorted([p.PlayerName for p in matchup.AwayPlayers])
        home_players = dict(zip(['h{}'.format(i) for i in range(5)], _home_players))
        away_players = dict(zip(['a{}'.format(i) for i in range(5)], _away_players))
        
        matchup_dict.update(home_players)
        matchup_dict.update(away_players)        
        home_boxscore = matchup.Boxscore.HomeTeamStats
        away_boxscore = matchup.Boxscore.AwayTeamStats
        
        matchup_dict.update({'home_{}'.format(k):v for k,v in home_boxscore.items()})
        matchup_dict.update({'away_{}'.format(k):v for k,v in away_boxscore.items()})
        lst.append(matchup_dict)
    return lst
df = pd.DataFrame()
for i,game_id in enumerate(game_ids):
    if(game_id == game_ids[198] or game_id == game_ids[667]): #Very few of matches has  problem and the program will crash
        continue
    sample_game_id = game_id
    sample_game = Game(sample_game_id)
    try:
        df = df.append(pd.DataFrame(matchup_to_df(sample_game.Matchups)))
    except KeyError: #Bypass minor bugs on a few games
        pass
    except AttributeError:
        pass
    except TypeError:
        pass
    print(i)
    print(time.time()-start_time)

In [40]:
#dumping data
df.to_csv('1718lineup.csv')
import csv
with open("1718gameid.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(game_ids)

In [5]:
#reading data
import pandas as pd
df= pd.read_csv('1718lineup.csv')
data = df.to_dict('record')

In [6]:
units = []
points = []
weights = []

for d in data:

    home_poss = int(d['home_Possessions'])
    away_poss = int(d['away_Possessions'])
    
    home_name = [d['h0'],d['h1'],d['h2'],d['h3'],d['h4']]
    away_name = [d['a0'],d['a1'],d['a2'],d['a3'],d['a4']]
    home_offense_unit = {"{},offense".format(name): 1 for name in home_name}
    home_defense_unit = {"{},defense".format(name): 1 for name in home_name}
    away_offense_unit = {"{},offense".format(name): 1 for name in away_name}
    away_defense_unit = {"{},defense".format(name): 1 for name in away_name}

    home_stint = home_offense_unit.copy()
    home_stint.update(away_defense_unit)
    home_stint.update({'HCA': 1}) #home count advantage
    away_stint = away_offense_unit.copy()
    away_stint.update(home_defense_unit)
    away_stint.update({'HCA': -1}) 

    if home_poss >= 1:  # to avoid some ill-conditioning we only use stints that have possessions >= 1
        home_ortg = 100 * int(d['home_PTS']) / home_poss
        units.append(home_stint)
        points.append(home_ortg)
        weights.append(home_poss)

    if away_poss >= 1:
        away_ortg = 100 * int(d['away_PTS']) / away_poss
        units.append(away_stint)
        points.append(away_ortg)
        weights.append(away_poss)
        
print(len(units), len(points), len(weights))        

(53465, 53465, 53465)


In [7]:
# Now we employ DictVectorizer to do its magic
from sklearn.feature_extraction import DictVectorizer
u = DictVectorizer(sparse=False)
u_mat = u.fit_transform(units)
print(u_mat)  # a giant list of lists where each array contains five +1's, five -1's, and a whole mess of 0's
print(points[:25])  # just showing the first 25 stints
print(weights[:100])  # just showing the first 100 stints

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[133, 100, 200, 100, 300, 100, 0, 50, 200, 100, 0, 350, 100, 133, 33, 100, 175, 75, 50, 33, 100, 100, 0, 100, 100]
[9, 9, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 3, 3, 6, 5, 4, 4, 4, 3, 2, 3, 2, 2, 3, 3, 3, 4, 1, 2, 4, 4, 1, 9, 9, 4, 4, 1, 1, 2, 5, 6, 2, 2, 2, 1, 1, 4, 4, 1, 3, 1, 4, 4, 3, 3, 9, 7, 1, 1, 1, 1, 1, 1, 1, 2, 1, 9, 10, 6, 5, 1, 2, 3, 2, 4, 5, 3, 5, 1, 2, 3, 3, 1, 1, 3, 3, 2, 1, 3, 2, 3, 3, 3, 1, 3, 4, 5, 5, 4]


In [8]:
# The first 25 players alphabetically in the data set
from pprint import pprint
players = u.get_feature_names()
pprint(u.get_feature_names()[:25])

['Aaron Brooks,defense',
 'Aaron Brooks,offense',
 'Aaron Gordon,defense',
 'Aaron Gordon,offense',
 'Abdel Nader,defense',
 'Abdel Nader,offense',
 'Adreian Payne,defense',
 'Adreian Payne,offense',
 'Al Horford,defense',
 'Al Horford,offense',
 'Al Jefferson,defense',
 'Al Jefferson,offense',
 'Al-Farouq Aminu,defense',
 'Al-Farouq Aminu,offense',
 'Alec Burks,defense',
 'Alec Burks,offense',
 'Alec Peters,defense',
 'Alec Peters,offense',
 'Alex Abrines,defense',
 'Alex Abrines,offense',
 'Alex Caruso,defense',
 'Alex Caruso,offense',
 'Alex Len,defense',
 'Alex Len,offense',
 'Alex Poythress,defense']


In [9]:
# perform the inverse transform on one stint just to double check it makes sense

pprint(u.inverse_transform(u_mat)[:1])

[{'Allen Crabbe,offense': 1.0,
  'Danny Green,defense': 1.0,
  'DeMarre Carroll,offense': 1.0,
  'HCA': 1.0,
  'Kyle Anderson,defense': 1.0,
  'LaMarcus Aldridge,defense': 1.0,
  'Pau Gasol,defense': 1.0,
  'Rondae Hollis-Jefferson,offense': 1.0,
  'Spencer Dinwiddie,offense': 1.0,
  'Tony Parker,defense': 1.0,
  'Tyler Zeller,offense': 1.0}]


In [10]:
import numpy as np
from sklearn import linear_model

clf = linear_model.RidgeCV(alphas=(np.array([0.01, 0.1, 1.0, 10, 100, 500, 750, 1000, 1500, 2000, 5000])), cv=5)
clf.fit(u_mat, points, sample_weight=weights)

RidgeCV(alphas=array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   5.00000e+02,   7.50000e+02,   1.00000e+03,
         1.50000e+03,   2.00000e+03,   5.00000e+03]),
    cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [11]:
# Here is the value of alpha that RidgeCV selected. We could probably add a few more values to test above too...
# but this is just a tutorial :)

print(clf.alpha_)

2000.0


In [12]:
ratings = []
for player in players:
    ratings.append((player, clf.coef_[players.index(player)]))
ratings.sort(key=lambda tup: tup[1], reverse=True)  # sort by rating in descending order

In [13]:
for idx, rating in enumerate(ratings): #better offense = more positive, better defense = more negative
    print("{},{},{}".format(idx + 1, rating[0], rating[1]))

1,Stephen Curry,offense,7.01329052248
2,Chris Paul,offense,4.05354594049
3,LeBron James,offense,3.924903113
4,Jamal Crawford,defense,3.66812010708
5,Karl-Anthony Towns,offense,3.65350678396
6,James Harden,offense,3.63451659199
7,Bobby Portis,offense,3.43300957336
8,Emmanuel Mudiay,defense,3.14011775179
9,Kevin Durant,offense,3.12330024784
10,Jamal Murray,offense,3.09661396307
11,Shabazz Muhammad,defense,3.07457445583
12,Marco Belinelli,defense,3.07259940996
13,Lou Williams,offense,3.06327251896
14,Wesley Matthews,defense,3.02644832897
15,Kemba Walker,offense,2.9221463593
16,Kelly Olynyk,offense,2.88589529561
17,Lance Stephenson,defense,2.87822719954
18,Eric Gordon,offense,2.86447265509
19,Rajon Rondo,defense,2.78484298474
20,Isaiah Thomas,defense,2.76137681229
21,Enes Kanter,defense,2.70748189497
22,Elfrid Payton,defense,2.66158142694
23,Lou Williams,defense,2.62210431369
24,Otto Porter Jr.,offense,2.62081315086
25,Cristiano Felicio,defense,2.61601172587
26,DeMar DeRozan,offense,2.5944

In [15]:
ratings = []
for player in players:
    ratings.append((player, clf.coef_[players.index(player)]))
    
player_name_list = []
ORAPM_list = []
DRAPM_list = []
for rating in ratings:
    if(rating[0] != 'HCA'):
        string = rating[0].split(',')
        if(string[0] not in player_name_list):
            player_name_list.append(string[0])    
        if(string[1] == 'offense'):
            ORAPM_list.append(rating[1])
        else:
            DRAPM_list.append(-rating[1])    
RAPM_list = []
for i in range(len(ORAPM_list)):
    RAPM_list.append(ORAPM_list[i]+DRAPM_list[i]) 
RAPM_dict = {'Player': player_name_list, 'ORAPM': ORAPM_list,'DRAPM': DRAPM_list,'RAPM': RAPM_list }
RAPM_df = pd.DataFrame(data=RAPM_dict)
RAPM_df.sort_values(by=['RAPM'],ascending=False)

Unnamed: 0,DRAPM,ORAPM,Player,RAPM
441,-0.724390,7.013291,Stephen Curry,6.288900
66,1.265079,4.053546,Chris Paul,5.318625
262,2.769433,2.502859,Jrue Holiday,5.272292
236,3.072941,2.001242,Joel Embiid,5.074183
160,2.955861,1.812602,Giannis Antetokounmpo,4.768462
508,4.141912,0.493761,Yogi Ferrell,4.635673
491,2.792494,1.835247,Victor Oladipo,4.627741
150,2.858199,1.719299,Fred VanVleet,4.577497
385,1.706910,2.620813,Otto Porter Jr.,4.327723
18,5.068869,-0.793235,Andre Roberson,4.275634
