The purpose of this script is to train a propensity score model for each player.

More specifically, every player will have an associated model. This model will predict the probability of the player's presence in a stint, given covariates.
This is the "Propensity Score" of the player, and is the foundation of the causal inference portion of this project.

For each player, train a logistic regression model that uses the following features
 - Score margin at the beginning of the stint
 - Minutes remaining at the beginning of the stint
 - Defensive team strength (defined as cumulative sum of a box score statistics)

The target of logistic regression will be an indicator: 1 if player is present in the stint, 0 otherwise.

In [144]:
import numpy as np
import pandas as pd
import seaborn as sns
import nba_on_court as noc
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 500)

# Preparing the Data

We need to infer each of the above metrics before model training can begin.

In [145]:
data: pd.DataFrame             = pd.read_csv('../../data/nba_2223_season_stints.csv', dtype={'game_id': str})
plr_id_to_name: dict[int, str] = pd.read_csv('../../data/nba_2223_player_table.csv', index_col=0)['player_name'].to_dict()
plr_name_to_id: dict[str, int] = dict((v,k) for k,v in plr_id_to_name.items())
box_scores: pd.DataFrame       = pd.read_csv('../../data/nba_2223_box_stats.csv')
design: pd.DataFrame           = pd.read_csv('../../design_matrices/nba_2223_season_rapm_data.csv.gz') \
                                   .drop(['game_id', 'stint_id', 'n_pos', 'home_points', 'away_points', 'minutes', 'margin'], axis=1)

In [146]:
# remove accents from player names,
# and add player ID to the box score data
import unicodedata

def strip_accents(s):
   '''
   Credit to https://stackoverflow.com/a/518232
   '''
   return ''.join(c for c in unicodedata.normalize('NFD', s) 
                  if unicodedata.category(c) != 'Mn')

box_scores['Player']   = box_scores['Player'].apply(strip_accents)
box_scores['PlayerID'] = box_scores['Player'].apply(lambda name: str(plr_name_to_id[name])) # casting to str for lexicographic sorting

box_scores.to_csv('../../data/nba_2223_box_stats.csv', index=False)

In [147]:
# players that swapped teams have multiple entries
# total stats is always first
box_scores.drop_duplicates('Player', keep='first', inplace=True) 

box_scores

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Player-additional,PlayerID
0,1,Precious Achiuwa,C,23,TOR,55,1140,15.2,0.554,0.267,0.307,9.3,24.4,16.3,6.3,1.3,2.6,11.4,19.4,0.8,1.4,2.2,0.093,-1.4,-0.8,-2.3,-0.1,achiupr01,1630173
1,2,Steven Adams,C,29,MEM,42,1133,17.5,0.564,0.004,0.490,20.1,25.3,22.7,11.2,1.5,3.7,19.8,14.6,1.3,2.1,3.4,0.144,-0.3,0.9,0.6,0.7,adamsst01,203500
2,3,Bam Adebayo,C,25,MIA,75,2598,20.1,0.592,0.011,0.361,8.0,23.6,15.5,15.9,1.7,2.4,12.7,25.2,3.6,3.8,7.4,0.137,0.8,0.8,1.5,2.3,adebaba01,1628389
3,4,Ochai Agbaji,SG,22,UTA,59,1209,9.5,0.561,0.591,0.179,3.9,6.9,5.4,7.5,0.6,1.0,9.0,15.8,0.9,0.4,1.3,0.053,-1.7,-1.4,-3.0,-0.3,agbajoc01,1630534
4,5,Santi Aldama,PF,22,MEM,77,1682,13.9,0.591,0.507,0.274,5.4,18.0,11.7,7.6,1.3,2.6,9.3,16.0,2.1,2.4,4.6,0.130,-0.3,0.8,0.5,1.1,aldamsa01,1630583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,535,Thaddeus Young,PF,34,TOR,54,795,14.1,0.573,0.172,0.131,9.4,14.6,11.8,12.9,3.4,0.6,16.7,13.5,0.7,1.1,1.8,0.109,-1.8,1.9,0.1,0.4,youngth01,201152
675,536,Trae Young,PG,24,ATL,73,2541,22.0,0.573,0.331,0.460,2.4,7.0,4.7,42.5,1.5,0.3,15.2,32.6,5.3,1.4,6.7,0.126,5.3,-2.0,3.3,3.4,youngtr01,1629027
676,537,Omer Yurtseven,C,24,MIA,9,83,16.7,0.675,0.259,0.222,10.9,21.9,16.2,3.9,1.2,2.5,11.9,18.0,0.2,0.1,0.3,0.159,-2.5,-1.5,-3.9,0.0,yurtsom01,1630209
677,538,Cody Zeller,C,30,MIA,15,217,16.4,0.659,0.034,0.593,13.0,21.8,17.3,7.2,0.7,1.9,15.8,18.1,0.4,0.3,0.7,0.147,-2.0,-0.7,-2.8,0.0,zelleco01,203469


In [148]:
# infer home strength and away strength
# careful with the axes. the player IDs are sorted lexicographically, not numerically
home_indicators = design.replace(-1, 0).sort_index(axis=1)
away_indicators = design.replace(1, 0).mul(-1).sort_index(axis=1)
bpm             = box_scores.sort_values('PlayerID')['BPM'].astype(float).to_numpy()

data['home_strength'] = home_indicators.mul(bpm, axis=1).sum(axis=1)
data['away_strength'] = away_indicators.mul(bpm, axis=1).sum(axis=1)

In [149]:
data

Unnamed: 0,game_id,stint_id,home_lineup,away_lineup,n_pos,home_points,away_points,minutes,margin,home_strength,away_strength
0,0022200002,1,201939_202691_203110_203952_1626172,2544_201566_201976_203076_1629022,14,5,2,2.70,21.428571,9.6,10.0
1,0022200002,2,201939_202691_203110_203952_1626172,2544_201566_203076_1629022_1630559,9,6,2,1.67,44.444444,9.6,11.9
2,0022200002,3,201939_203110_203952_1626172_1629673,2544_201566_203076_1629022_1630559,5,0,3,0.48,-60.000000,8.0,11.9
3,0022200002,4,201939_203110_203952_1626172_1629673,2544_203076_1629022_1629134_1630559,5,5,1,0.78,80.000000,8.0,8.2
4,0022200002,5,201939_203210_203952_1629673_1630164,2544_203076_1629022_1629134_1630559,9,3,6,1.52,-33.333333,1.1,8.2
...,...,...,...,...,...,...,...,...,...,...,...
32379,0022201229,18,203082_1627814_1629647_1630240_1630688,201566_203486_1626181_1629611_1630538,16,8,13,3.44,-31.250000,-7.8,1.7
32380,0022201229,19,203082_1627814_1629006_1629647_1630240,201566_203486_1626181_1629611_1630538,11,7,10,2.45,-27.272727,-6.8,1.7
32381,0022201229,20,203082_1627814_1629006_1629647_1630240,201566_202695_203486_1626181_1629611,21,13,17,4.30,-19.047619,-6.8,8.6
32382,0022201229,21,203082_1627814_1629006_1629647_1630240,201566_201587_202695_203486_1626181,12,7,8,3.53,-8.333333,-6.8,9.3


In [150]:
# infer start score margin and start time left
games: pd.Series         = pd.unique(data['game_id'])
data['end_score_margin'] = data['home_points'] - data['away_points']


list_of_data = []
for game_id in games:
    game_data: pd.DataFrame = data[data['game_id'] == game_id]

    game_data['start_score_margin'] = game_data['end_score_margin'].shift(fill_value=0)

    time_elapsed = game_data['minutes'].cumsum()
    game_data['start_time_left'] = 48 - time_elapsed.shift(fill_value=0)
    game_data['end_time_left']   = 48 - time_elapsed

    list_of_data.append(game_data)

data = pd.concat(list_of_data)

data
    

Unnamed: 0,game_id,stint_id,home_lineup,away_lineup,n_pos,home_points,away_points,minutes,margin,home_strength,away_strength,end_score_margin,start_score_margin,start_time_left,end_time_left
0,0022200002,1,201939_202691_203110_203952_1626172,2544_201566_201976_203076_1629022,14,5,2,2.70,21.428571,9.6,10.0,3,0,48.00,45.30
1,0022200002,2,201939_202691_203110_203952_1626172,2544_201566_203076_1629022_1630559,9,6,2,1.67,44.444444,9.6,11.9,4,3,45.30,43.63
2,0022200002,3,201939_203110_203952_1626172_1629673,2544_201566_203076_1629022_1630559,5,0,3,0.48,-60.000000,8.0,11.9,-3,4,43.63,43.15
3,0022200002,4,201939_203110_203952_1626172_1629673,2544_203076_1629022_1629134_1630559,5,5,1,0.78,80.000000,8.0,8.2,4,-3,43.15,42.37
4,0022200002,5,201939_203210_203952_1629673_1630164,2544_203076_1629022_1629134_1630559,9,3,6,1.52,-33.333333,1.1,8.2,-3,4,42.37,40.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32379,0022201229,18,203082_1627814_1629647_1630240_1630688,201566_203486_1626181_1629611_1630538,16,8,13,3.44,-31.250000,-7.8,1.7,-5,1,15.09,11.65
32380,0022201229,19,203082_1627814_1629006_1629647_1630240,201566_203486_1626181_1629611_1630538,11,7,10,2.45,-27.272727,-6.8,1.7,-3,-5,11.65,9.20
32381,0022201229,20,203082_1627814_1629006_1629647_1630240,201566_202695_203486_1626181_1629611,21,13,17,4.30,-19.047619,-6.8,8.6,-4,-3,9.20,4.90
32382,0022201229,21,203082_1627814_1629006_1629647_1630240,201566_201587_202695_203486_1626181,12,7,8,3.53,-8.333333,-6.8,9.3,-1,-4,4.90,1.37
