In [6]:
import pandas as pd
import numpy as np
from typing import Tuple
import nba_on_court as noc
from sklearn.linear_model import RidgeCV, Ridge

Load the data

In [9]:
# Load the processed 2022 data set; the first CSV column is used as the row index.
# Later cells read the lineup columns (AWAY_PLAYER*/HOME_PLAYER*), 'BALL_ID' and 'PM' from it.
data = pd.read_csv('../../data/processed_data_2022.csv', index_col=0)

Creating design matrix with offensive and defensive coefficients for each player

In [10]:
# Lineup columns: five player-ID slots per side.
_AWAY_LIST = [
    'AWAY_PLAYER1', 'AWAY_PLAYER2', 'AWAY_PLAYER3', 'AWAY_PLAYER4', 'AWAY_PLAYER5',
]
_HOME_LIST = [
    'HOME_PLAYER1', 'HOME_PLAYER2', 'HOME_PLAYER3', 'HOME_PLAYER4', 'HOME_PLAYER5',
]


def _determine_coefficient_odrapm(row: pd.Series, player_id: int, offensive: bool) -> int:
    '''
    Return the design-matrix entry (-1, 0 or 1) of one player for one row
    of the Offensive/Defensive RAPM model.

    Row-wise helper, written to be passed to `DataFrame.apply(axis=1)`.

    Parameters
    ----------
    row : one row holding the ten lineup columns and `BALL_ID`.
    player_id : the player whose entry is requested.
    offensive : True for the player's offensive column, False for the defensive one.

    Returns
    -------
    -1 if the player is on the Away side and Away is in the requested role,
     1 if the player is on the Home side and Home is in the requested role,
     0 otherwise.

    The Away side counts as the offense exactly when `BALL_ID` equals one of
    the Away player IDs; in every other case Home is treated as the offense.
    '''
    away_ids = row[_AWAY_LIST].values
    home_ids = row[_HOME_LIST].values

    # Does the Away side fill the requested role (offense or defense) in this row?
    away_in_role = row['BALL_ID'] in away_ids
    if not offensive:
        # The defensive role belongs to whichever side is NOT on offense.
        away_in_role = not away_in_role

    if away_in_role:
        return -1 if player_id in away_ids else 0
    return 1 if player_id in home_ids else 0

In [11]:
def get_design_matrix_odrapm(game_data: pd.DataFrame, *, return_players: bool = False) -> np.ndarray | Tuple[np.ndarray, np.ndarray]:
    '''
    Given a preprocessed game/season of data, return a design matrix for a regression model.
    See `preprocess_game`

    Each row of the matrix will correspond to a stint.
    Even columns (0, 2, 4, ...) correspond to offensive indicators (i.e. filled iff player is on offense).\n
    Odd columns (1, 3, 5, ...) correspond to defensive indicators (i.e. filled iff player is on defense).\n
    Player `k` corresponds to columns `2k` (offense) and `2k+1` (defense).\n
    
    The matrix will be filled as follows:
      `-1`: Present on the Away team.
      `0`: Not present during the stint.
      `1`: Present on the Home team. 

    If `return_players` is True, then this will also return
    a list of all player IDs encountered in processing.
    The order of this list matches the order of columns in the design matrix.
    '''
    all_players = np.unique(game_data.filter(like='PLAYER').to_numpy())
    design_matrix = np.empty((len(game_data), 2 * len(all_players)))        # stints x 2 * players. to be filled

    # fill the design matrix with coefficients
    # for a future date: call 'apply' once, have _det_coeff return a 2-sized vector, and fill both columns simultaneously
    for i in range(len(all_players)):
        player = all_players[i]
        design_matrix[:, 2*i] = game_data.apply(_determine_coefficient_odrapm, axis=1, player_id=player, offensive=True).to_numpy()
        design_matrix[:, 2*i+1] = game_data.apply(_determine_coefficient_odrapm, axis=1, player_id=player, offensive=False).to_numpy()

    if return_players:
        return design_matrix, all_players
    else:
        return design_matrix

In [13]:
# Build the O/D-RAPM indicators for the loaded data, together with the player IDs
# that label the column pairs.
matrix, players = get_design_matrix_odrapm(data, return_players=True)

# Every player owns two adjacent columns (offense, defense), so each ID is used
# as a column label twice in a row. The frame is then saved for later reuse.
design_matrix = pd.DataFrame(matrix, columns=np.repeat(players, 2))
design_matrix.to_csv('design_matrices/odrapm_design.csv')

# To reuse a previously saved matrix instead of rebuilding it, run this line:
# design_matrix = pd.read_csv('design_matrices/odrapm_design.csv', index_col=0)

KeyboardInterrupt: 

Fit the model and return results

In [23]:
# Ridge regression of the per-stint plus-minus on the offense/defense indicators.
# RidgeCV picks the penalty strength from the candidate grid (see `model.alpha_`).
model = RidgeCV(alphas=[500, 1000, 1500, 2000, 2500, 3000]).fit(X=design_matrix, y=data['PM'])
#model = Ridge(alpha=2000).fit(X=design_matrix, y=data['PM'])

# Columns come in (offense, defense) pairs, so every second label is one player ID.
players = noc.players_name(design_matrix.columns[::2].astype(int))

# Even coefficients belong to the offensive columns, odd ones to the defensive columns.
# NOTE(review): the combined rating is ORAPM minus DRAPM — confirm that this matches
# the sign convention of the fitted defensive coefficients.
results = pd.DataFrame({
    'Player': players,
    'ORAPM': model.coef_[::2],
    'DRAPM': model.coef_[1::2],
    'RAPM': model.coef_[::2] - model.coef_[1::2]
})


# Raw average plus-minus per player. Here's the logic:
# 1. A player's offensive and defensive indicators are never non-zero in the same
#    stint, so their sum is a single presence indicator: -1 (Away), 1 (Home), 0 (absent).
# 2. Multiplying a stint's PM by that indicator gives the stint's contribution to the
#    player's own PM (presumably PM is signed from the Home side's view — TODO confirm).
# 3. Summing those contributions over all stints and dividing by the number of stints
#    the player appeared in gives the player's average PM per stint.
# Everything is done positionally on NumPy arrays: row i of the design matrix pairs with
# row i of `data`, exactly as in the fit above, whatever index labels the frames carry.
indicator_values = design_matrix.to_numpy()
presence = indicator_values[:, ::2] + indicator_values[:, 1::2]   # stints x players
stint_pm = data['PM'].to_numpy()

num = np.count_nonzero(presence, axis=0)    # stints played, per player
pm = presence.T @ stint_pm / num            # signed PM total / stints played

results['Average PM'] = pm
results['Total Stints'] = num

results.head()

Unnamed: 0,Player,ORAPM,DRAPM,Absolute RAPM,Average PM,Total Stints
0,LeBron James,0.167144,-0.183162,0.350306,0.009829,1933.0
1,Udonis Haslem,0.000762,-0.011907,0.012668,-0.857143,7.0
2,Chris Paul,0.131517,-0.128121,0.259638,0.030675,815.0
3,Kyle Lowry,0.147418,-0.150013,0.297432,0.001596,1880.0
4,P.J. Tucker,0.127974,-0.109306,0.23728,-0.024038,832.0


In [24]:
# Show the penalty strength that RidgeCV selected from the candidate grid.
# NOTE(review): if this prints the smallest candidate (500), the best value may lie
# below the grid — consider extending the grid downward.
print(model.alpha_)


500
