In [2]:
!pip install nba-on-court



In [3]:
from nba_api.stats.endpoints import playbyplayv2
import nba_on_court.nba_on_court as noc
import pandas as pd
import numpy as np

In [40]:
# Read the three CSV inputs in order; the unpacking order matches the path order.
nba_data, preproc_data, design = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/combined_data_2022.csv',
        'data/processed_data_2022.csv',
        'design.csv',
    )
)

In [5]:
# List the columns available in `nba_data`.
nba_data.columns 

Index(['Unnamed: 0', 'GAME_ID', 'EVENTNUM', 'EVENTMSGTYPE',
       'EVENTMSGACTIONTYPE', 'PERIOD', 'WCTIMESTRING', 'PCTIMESTRING', 'SCORE',
       'SCOREMARGIN', 'PERSON1TYPE', 'PLAYER1_ID', 'PLAYER1_NAME',
       'PLAYER1_TEAM_ID', 'PLAYER1_TEAM_CITY', 'PLAYER1_TEAM_NICKNAME',
       'PLAYER1_TEAM_ABBREVIATION', 'PERSON2TYPE', 'PLAYER2_ID',
       'PLAYER2_NAME', 'PLAYER2_TEAM_ID', 'PLAYER2_TEAM_CITY',
       'PLAYER2_TEAM_NICKNAME', 'PLAYER2_TEAM_ABBREVIATION', 'PERSON3TYPE',
       'PLAYER3_ID', 'PLAYER3_NAME', 'PLAYER3_TEAM_ID', 'PLAYER3_TEAM_CITY',
       'PLAYER3_TEAM_NICKNAME', 'PLAYER3_TEAM_ABBREVIATION',
       'VIDEO_AVAILABLE_FLAG', 'DESCRIPTION_STATS', 'ENDTIME', 'EVENTS',
       'FG2A', 'FG2M', 'FG3A', 'FG3M', 'GAMEDATE', 'GAMEID',
       'NONSHOOTINGFOULSTHATRESULTEDINFTS', 'OFFENSIVEREBOUNDS', 'OPPONENT',
       'SHOOTINGFOULSDRAWN', 'STARTSCOREDIFFERENTIAL', 'STARTTIME',
       'STARTTYPE', 'TURNOVERS', 'DESCRIPTION_PBP', 'URL'],
      dtype='object')

Filter the data down to the events where both stints and turnovers have a value for `PLAYER1_TEAM_ABBREVIATION`.

In [31]:
def preprocess_game(game_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Given the full data of a game (including players on court),
    preprocess the game data and return the result.

    Scoring rows (rows that carry a `SCOREMARGIN`) and turnover rows
    (`TURNOVERS == 1`) are processed separately, because the Plus-Minus can
    only be derived from rows that carry a score margin. Rows whose
    `PLAYER1_ID` is missing or `0` are dropped from both groups.

    This will return a DataFrame with the following features:
      `SCOREMARGIN`: The score margin of the stint. Note that positive means in favor for home, negative in favor for away.
                     `'TIE'` is stored as `0`; turnover rows without a margin are filled with `0`.
      `TURNOVERS`: Indicator of whether the stint resulted in a turnover.
      `PLAYER1_ID`: The `PLAYER1_ID` of the event (never missing or `0` in the result).
      `[AWAY|HOME]_PLAYER[NUM]`: Player ID of an Away/Home player.
      `PM`: The Plus-Minus of the stint. See SCOREMARGIN for meaning of the sign.
            Always `0` for turnover rows.

    Scoring rows come first, followed by turnover rows, under a fresh 0..n-1 index.
    A game without any scoring rows (or without any turnover rows) contributes
    no rows of that kind instead of raising.
    '''
    feature_columns = [
        'SCOREMARGIN',
        'TURNOVERS',
        'PLAYER1_ID',
        'AWAY_PLAYER1', 'AWAY_PLAYER2', 'AWAY_PLAYER3', 'AWAY_PLAYER4', 'AWAY_PLAYER5',
        'HOME_PLAYER1', 'HOME_PLAYER2', 'HOME_PLAYER3', 'HOME_PLAYER4', 'HOME_PLAYER5',
    ]
    subset = game_data[feature_columns]

    def has_player1(events: pd.DataFrame) -> pd.Series:
        # Keep only events whose PLAYER1_ID is present and non-zero.
        return events['PLAYER1_ID'].notna() & events['PLAYER1_ID'].ne(0)

    # --- stints that end in a score -------------------------------------------------
    scores = subset[subset['SCOREMARGIN'].notna()].reset_index(drop=True)
    scores['SCOREMARGIN'] = scores['SCOREMARGIN'].replace('TIE', 0)
    margin = scores['SCOREMARGIN'].astype(np.int64)
    # PM is the change in margin since the previous scoring row. The first row has no
    # predecessor (diff() yields NaN there), so its PM is the margin itself. Using
    # fillna(margin) instead of writing to row label 0 also works when there are no rows.
    scores['PM'] = margin.diff().fillna(margin)
    # The PM is computed over *all* scoring rows before filtering, so dropped rows
    # still count as the predecessor of the row that follows them.
    score_stints = scores[scores['TURNOVERS'].eq(0) & has_player1(scores)]

    # --- stints that end in a turnover ----------------------------------------------
    turnovers = subset[subset['TURNOVERS'] == 1].reset_index(drop=True)
    # Assign the filled column back: a chained `fillna(..., inplace=True)` acts on a
    # temporary object and leaves the frame untouched under pandas Copy-on-Write.
    turnovers['SCOREMARGIN'] = turnovers['SCOREMARGIN'].fillna(0)
    turnovers['PM'] = np.zeros(len(turnovers), dtype=np.int64)
    turnover_stints = turnovers[has_player1(turnovers)]

    return pd.concat([score_stints, turnover_stints], ignore_index=True)

In [32]:
# Distinct game IDs in order of first appearance.
games = pd.unique(nba_data['GAME_ID'])
list_of_game_data = []

# groupby(sort=False) yields the games in that same first-appearance order, but splits
# the frame in a single pass instead of re-scanning every row once per game.
for game, game_rows in nba_data.groupby('GAME_ID', sort=False):
    current = game_rows.reset_index(drop=True)
    # players_on_court is expected to supply the AWAY_/HOME_PLAYER columns that
    # preprocess_game selects (they are not among the columns of nba_data).
    current = noc.players_on_court(current)
    list_of_game_data.append(preprocess_game(current))

data = pd.concat(list_of_game_data, ignore_index=True)

In [33]:
# Display the concatenated result of preprocessing every game.
data 

Unnamed: 0,SCOREMARGIN,TURNOVERS,PLAYER1_ID,AWAY_PLAYER1,AWAY_PLAYER2,AWAY_PLAYER3,AWAY_PLAYER4,AWAY_PLAYER5,HOME_PLAYER1,HOME_PLAYER2,HOME_PLAYER3,HOME_PLAYER4,HOME_PLAYER5,PM
0,2,0.0,203507,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,2.0
1,0,0.0,1629639,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-2.0
2,2,0.0,1628960,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,2.0
3,-1,0.0,1629639,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-3.0
4,-3,0.0,202710,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12493,0,1.0,200768,1629130,1628389,202710,1628997,200768,203999,203932,1627750,1628971,1631128,0.0
12494,0,1.0,1627750,1629130,1628389,202710,1628997,200768,203999,203932,1627750,1628971,1631128,0.0
12495,0,1.0,203999,1629622,1628389,202710,1628997,1629216,203999,203932,1627750,203484,1629008,0.0
12496,0,1.0,203484,1629622,1628389,202710,1628997,1629216,203999,203932,1627750,203484,1629008,0.0


In [45]:
# Away lineup of the second row. The columns are picked by name prefix so that this
# cell does not depend on `_AWAY_LIST`, which is only defined in a later cell and is
# therefore undefined here on a fresh Restart & Run All.
data.iloc[1].filter(like='AWAY_PLAYER').to_numpy()

array([1629622, 1629639, 1629216, 202710, 1628389], dtype=object)

Creating design matrix with offensive and defensive coefficients for each player

In [34]:
# Names of the columns that hold the on-court player IDs of the away and home team.
_AWAY_LIST = ['AWAY_PLAYER1', 'AWAY_PLAYER2', 'AWAY_PLAYER3', 'AWAY_PLAYER4', 'AWAY_PLAYER5']
_HOME_LIST = ['HOME_PLAYER1', 'HOME_PLAYER2', 'HOME_PLAYER3', 'HOME_PLAYER4', 'HOME_PLAYER5']

def _determine_coefficient(row: pd.Series, player_id: int) -> int:
    '''
    Map a player to their design-matrix entry for a single stint.

    Returns `-1` if `player_id` is among the row's `_AWAY_LIST` values,
    `1` if it is among the row's `_HOME_LIST` values, and `0` otherwise.
    The away lineup takes precedence over the home lineup.

    Suitable as a row-wise callback when building a design matrix, e.g.
    `frame.apply(_determine_coefficient, axis=1, player_id=some_id)`.
    '''
    away_lineup = row[_AWAY_LIST].values
    home_lineup = row[_HOME_LIST].values

    if player_id in away_lineup:
        return -1
    return 1 if player_id in home_lineup else 0

In [None]:
def get_design_matrix(game_data: pd.DataFrame, *, return_players: bool = False) -> np.ndarray | Tuple[np.ndarray, np.ndarray]:
    '''
    Given a preprocessed game/season of data, return a design matrix for a regression model.
    See `preprocess_game`

    Each row of the matrix will correspond to a stint, and each column corresponds to a specific player.
    The matrix will be filled as follows:
      `-1`: Present on the Away team.
      `0`: Not present during the stint.
      `1`: Present on the Home team. 

    If `return_players` is True, then this will also return
    a list of all player IDs encountered in processing.
    The order of this list matches the order of columns in the design matrix.
    '''
    all_players = np.unique(game_data.filter(like='PLAYER').to_numpy())
    design_matrix = np.empty((len(game_data), len(all_players)))        # stints x players. to be filled

    # fill the design matrix with coefficients
    for i in range(len(all_players)):
        player = all_players[i]
        design_matrix[:, i] = game_data.apply(_determine_coefficient, axis=1, player_id=player).to_numpy()

    if return_players:
        return design_matrix, all_players
    else:
        return design_matrix

12498

In [50]:
# Sorted, distinct values of every column whose name contains 'PLAYER'.
all_players = np.unique(data.filter(like='PLAYER').to_numpy())

# Position of ID 2617 in that array (only displayed when it is the cell's last expression).
list(all_players).index(2617)

# np.empty takes the shape as ONE tuple; passed as two positional arguments, the
# second number is interpreted as the dtype and raises a TypeError.
# Two columns per player, presumably one offensive and one defensive (TODO confirm).
# NOTE(review): the loop below iterates over `data`, not `nba_data` — confirm the row count.
design_od = np.empty((len(nba_data), 2 * len(all_players)))

# fill in design_od
for i in range(len(data)):
    

2