In [19]:
import numpy as np
import pandas as pd
import nba_on_court as noc
import seaborn as sns
from sklearn.linear_model import RidgeCV

This notebook fits the ridge regression model used by a third-party source (see https://github.com/SCOREnetworkorg/sports-data-repository/blob/main/basketball/nba-rapm-data.qmd).

I'm primarily using this to compare my results with another source's results on the same data.

# Fit the Model

In [20]:
# Full data set: bookkeeping columns plus one column per player ID.
data = pd.read_csv('../../design_matrices/nba_2223_season_rapm_data.csv.gz')

# Column labels of my own design matrix; every other label starting at position 1 is read as a player ID.
own_columns = pd.read_csv('../../design_matrices/odrapm_design.csv.gz', index_col=0).columns
players = own_columns[1::2].astype(int).to_list()

# Strip the non-player columns so that only the per-player columns remain.
bookkeeping = ['game_id', 'stint_id', 'n_pos', 'home_points', 'away_points', 'minutes', 'margin']
design = data.drop(columns=bookkeeping)

# Keep only the IDs present in both data sets. np.intersect1d returns them sorted and unique;
# they are cast back to strings because that is how `design` labels its columns.
shared_ids = np.intersect1d(design.columns.astype(int).to_list(), players)
players = shared_ids.astype(str)
design = design[players]

# Regression target.
target = data['margin']

design

Unnamed: 0,2544,2617,101108,200768,200782,201142,201143,201144,201145,201565,...,1631099,1631107,1631108,1631111,1631116,1631128,1631169,1631212,1631223,1631254
0,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Candidate ridge penalties that RidgeCV chooses between.
hypers = [1000, 1500, 2000, 2500, 3000]

# No intercept is fitted here (my model uses a signed intercept).
ridge = RidgeCV(alphas=hypers, fit_intercept=False)

# Rows are weighted by the `minutes` column; `fit` returns the fitted estimator itself.
model = ridge.fit(design, target, sample_weight=data['minutes'])

0.0038414468490202136

In [None]:
# Player table, ordered by player ID.
player_table = pd.read_csv('../../data/nba_2223_player_table.csv').sort_values(by='player_id')

# `players` holds player IDs as strings (it is used to index `design`), so the table has to be
# matched on `player_id` cast to str. Matching on `player_name` compares names against IDs and
# selects nothing, which then makes the coefficient assignment fail on a length mismatch.
player_ids = player_table['player_id'].astype(str)

# Attach each coefficient by player ID rather than by row position, so the pairing does not
# depend on the table and the design columns happening to share the same order.
rapm_by_id = pd.Series(model.coef_, index=players)
results = player_table[player_ids.isin(players)].assign(RAPM=player_ids.map(rapm_by_id))

# Every fitted player must appear exactly once in the table; otherwise coefficients would be
# dropped or duplicated without notice.
assert len(results) == len(players), (
    f'{len(players)} fitted players but {len(results)} matching rows in the player table'
)

# My own estimates, restricted to the same players.
# NOTE(review): this assumes `Player` holds player IDs. The cast to str only makes the comparison
# with the string IDs in `players` dtype-safe; if `Player` holds names instead, this filter
# selects nothing and needs a join through the player table -- confirm against the CSV.
my_table = pd.read_csv('../../results/signed_indicator_results.csv', index_col=0)
my_results = my_table[my_table['Player'].astype(str).isin(players)]

results.to_csv('../../results/ryurko_results.csv')

# NOTE(review): the 100x and 30x factors put the two estimates on a comparable scale -- TODO
# document their origin. Also confirm that `my_results` and `results` list players in the same
# order (and with compatible indexes), since nothing in this cell enforces that pairing.
sns.scatterplot(x=100 * my_results['Average PM'], y=30 * results['RAPM'])

In [30]:
# Correlation matrix of the two estimates, using the same rescaling as the scatter plot.
# np.corrcoef pairs the values by position, not by index label.
np.corrcoef(my_results['Average PM'] * 100, results['RAPM'] * 30)

array([[1.        , 0.14396836],
       [0.14396836, 1.        ]])