In [1]:
import numpy as np
import pandas as pd
import nba_on_court as noc
from typing import Tuple

# Preprocessing Raw Data

Code for preprocessing the raw combined datasets. See `nba_on_court.left_join_nbastats` for combining NBA (nbastats) and Play-by-Play (pbpstats) datasets

Free throws need to be combined into a single stint, as it should only affect the players causing the foul. Here is the logic for dealing with them.

- Before free throws can be dealt with, preprocess the raw data by calculating PM and related statistics.
- A stint being a free throw is equivalent to having its PM be +-1.
- Goal: Combine consecutive free throws into a single stint, where the players present in the stint were the ones present in the foul.
    - Unfortunately, some free throws don't have a turnover immediately preceeding them. These stints used the players present in the first free throw.

In [2]:
# constants used for readable indexing

AWAY_LIST = ['AWAY_PLAYER1', 'AWAY_PLAYER2', 'AWAY_PLAYER3', 'AWAY_PLAYER4', 'AWAY_PLAYER5']
HOME_LIST = ['HOME_PLAYER1', 'HOME_PLAYER2', 'HOME_PLAYER3', 'HOME_PLAYER4', 'HOME_PLAYER5']
PLAYERS = AWAY_LIST + HOME_LIST

BASE_FEATURES  = [
    'GAME_ID',
    'SCOREMARGIN',
    'TURNOVERS', 
    'PLAYER1_ID'
] + PLAYERS

In [4]:
def preprocess_season(season: pd.DataFrame) -> pd.DataFrame:
    '''
    Given a full season of raw data, preprocess the data
    and return the resulting DataFrame. Note that this
    is essentially `preprocess_game` but for an entire season.

    This will return a DataFrame with the following features:
      `GAME_ID`: The game that the stint belongs to.
      `SCOREMARGIN`: The score margin of the stint. Note that positive means in favor for home, negative in favor for away.
      `TURNOVERS`: Indicator of whether the stint resulted in a turnover.
      `BALL_ID`: (Presumed to be) the player who possessed the ball during the stint.
      `PM`: The Plus-Minus of the stint. See SCOREMARGIN for meaning of the sign.
      `[AWAY|HOME]_PLAYER[NUM]`: Player ID of an Away/Home player.
    '''
    games = pd.unique(season['GAME_ID'])
    list_of_game_data = []

    for game in games:
        current = season[season['GAME_ID'] == game].reset_index(drop=True)
        current = noc.players_on_court(current)
        list_of_game_data.append(preprocess_game(current))

    return pd.concat(list_of_game_data, ignore_index=True)

def preprocess_game(game_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Given the full data of a game (including players on court), 
    preprocess the game data and return the result.

    This will return a DataFrame with the following features:
      `GAME_ID`: The game that the stint belongs to.
      `SCOREMARGIN`: The score margin of the stint. Note that positive means in favor for home, negative in favor for away.
      `TURNOVERS`: Indicator of whether the stint resulted in a turnover.
      `BALL_ID`: (Presumed to be) the player who possessed the ball during the stint.
      `PM`: The Plus-Minus of the stint. See SCOREMARGIN for meaning of the sign.
      `[AWAY|HOME]_PLAYER[NUM]`: Player ID of an Away/Home player.
    '''
    # extract features we care about
    subset = game_data[BASE_FEATURES]
    subset.rename(columns={'PLAYER1_ID': 'BALL_ID'}, inplace=True)

    # have to preprocess stints that end in scores differently from stints that end in turnovers
    # because otherwise the PM can't be calculated
    scores = subset[~subset['SCOREMARGIN'].isna()].replace('TIE', 0)
    scores['PM'] = scores['SCOREMARGIN'].astype(np.int64).diff().replace(np.nan, 0)
    scores.at[scores.index[0], 'PM'] = np.int64(scores.at[scores.index[0], 'SCOREMARGIN']) # manually add in the first PM
    score_stints = scores[(scores['TURNOVERS'] == 0) & ((scores['BALL_ID'].isna() == False) & (scores['BALL_ID'] != 0))]

    turnovers = subset[subset['TURNOVERS'] == 1]
    turnovers['SCOREMARGIN'].fillna(0, inplace=True)
    turnovers['PM'] = np.zeros(len(turnovers), dtype=np.int64)
    turnover_stints = turnovers[((turnovers['BALL_ID'].isna() == False) & (turnovers['BALL_ID'] != 0))]

    stints = pd.concat([score_stints, turnover_stints]).sort_index()

    # deal with free throws
    # unfortunately i can't think of a vectorized solution, so we have to iterate over the rows
    freethrows = stints[(stints['PM'].abs() == 1) | (stints['TURNOVERS'] == 1)] # extract stints relating to freethrows
    rows_to_skip = []

    for index, row in freethrows.iterrows():
        if row.get('TURNOVERS') == 1 or index in rows_to_skip:
            continue
        i = freethrows.index.get_loc(index)

        foul_row = freethrows.iloc[i - 1]
        if foul_row.get('TURNOVERS') == 0: # if the previous stint isn't a turnover, treat the current stint as the foul
            foul_row = row
        stints.loc[index, PLAYERS] = foul_row[PLAYERS]
        
        # check the next two rows if they're part of the same freethrow
        for j in (1, 2):
            if i+j >= len(freethrows.index): # out of bounds
                break
            
            next_row = freethrows.iloc[i+j]
            if next_row.get('PM') != row.get('PM'): # not consecutive freethrows
                break
            
            stints.at[index, 'PM'] += next_row['PM']
            rows_to_skip.append(freethrows.index[i+j])

    # finally, filter out the rows flagged for skipping
    stints.drop(rows_to_skip, inplace=True)
    return stints

In [56]:
raw_data = pd.read_csv('../../data/combined_data_2022.csv', index_col=0)
data = preprocess_season(raw_data)

data.head(50)

Unnamed: 0,GAME_ID,SCOREMARGIN,TURNOVERS,BALL_ID,AWAY_PLAYER1,AWAY_PLAYER2,AWAY_PLAYER3,AWAY_PLAYER4,AWAY_PLAYER5,HOME_PLAYER1,HOME_PLAYER2,HOME_PLAYER3,HOME_PLAYER4,HOME_PLAYER5,PM
0,42200101,2,0.0,203507,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,2.0
1,42200101,0,0.0,1629639,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-2.0
2,42200101,2,0.0,1628960,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,2.0
3,42200101,-1,0.0,1629639,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-3.0
4,42200101,0,1.0,1628960,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,0.0
5,42200101,0,1.0,1629216,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,0.0
6,42200101,-3,0.0,202710,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-2.0
7,42200101,-2,0.0,203507,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,1.0
8,42200101,0,1.0,201572,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,0.0
9,42200101,-4,0.0,1629216,1629622,1629639,1629216,202710,1628389,201572,203114,201950,1628960,203507,-2.0


Remove stints that make no sense ('BALL_ID' isn't in the game).

In [58]:
def check_valid(row: pd.Series):
    return row['BALL_ID'] in row[PLAYERS].values

data = data[data.apply(check_valid, axis=1)].reset_index(drop=True)

In [59]:
data.to_csv('../../data/processed_data_2022.csv')

Combine free throws into a single stint. While it makes most sense to include this in 'preprocess game',

# Design Matrix

Using the processed data, create a design matrix out of it.

In [3]:
data = pd.read_csv('../../data/processed_data_2022.csv', index_col=0)

In [4]:
def _determine_coefficient_odrapm(row: pd.Series, player_id: int, offensive: bool) -> int:
    '''
    Determines whether a player's coefficient is -1, 0, 1 in a given stint for Offensive/Defensive RAPM model.

    This is meant to be used in the `get_design_matrix` function.
    '''
    away = row[AWAY_LIST].values
    home = row[HOME_LIST].values
    away_is_offense = row['BALL_ID'] in away # True if Away is offensive team
    away_is_offense = away_is_offense if offensive else not away_is_offense # Basically, change 'defense' case to 'offense' case

    if player_id in away and away_is_offense:
        return -1
    elif player_id in home and not away_is_offense:
        return 1
    else:
        return 0
    
def get_design_matrix_odrapm(game_data: pd.DataFrame, *, as_frame: bool = True) -> pd.DataFrame | Tuple[np.ndarray, np.ndarray]:
    '''
    Given a preprocessed game/season of data, return a design matrix for a regression model.
    This will not add the signed intercept, that needs to be added separately.

    Each row of the matrix will correspond to a stint.
    Even columns (0, 2, 4, ...) correspond to offensive indicators (i.e. filled iff player is on offense).\n
    Odd columns (1, 3, 5, ...) correspond to defensive indicators (i.e. filled iff player is on defense).\n
    Player `k` corresponds to columns `2k` (offense) and `2k+1` (defense).\n
    
    The matrix will be filled as follows:
      `-1`: Present on the Away team.
      `0`: Not present during the stint.
      `1`: Present on the Home team. 

    By default, the result is returned as a DataFrame. 
    Note that defensive indicators may have a `.1` at the end of the column feature. 
    
    If `as_frame` is False, then this will return a tuple of numpy arrays. 
    The first array is the design matrix itself.
    The second array is the list of players, where position `i` corresponds to columns `2i` and `2i + 1`.
    '''
    all_players = np.unique(game_data.filter(like='PLAYER').to_numpy())
    design_matrix = np.empty((len(game_data), 2 * len(all_players)))        # stints x 2 * players. to be filled

    # fill the design matrix with coefficients
    # for a future date: call 'apply' once, have _det_coeff return a 2-sized vector, and fill both columns simultaneously
    for i in range(len(all_players)):
        player = all_players[i]
        design_matrix[:, 2*i] = game_data.apply(_determine_coefficient_odrapm, axis=1, player_id=player, offensive=True).to_numpy()
        design_matrix[:, 2*i+1] = game_data.apply(_determine_coefficient_odrapm, axis=1, player_id=player, offensive=False).to_numpy()

    if as_frame:
        return pd.DataFrame(data=design_matrix, columns=np.repeat(all_players, 2))
    else:
        return design_matrix, all_players

In [5]:
matrix = get_design_matrix_odrapm(data)

And now add the signed intercept column

In [7]:
def _determine_coefficient(row: pd.Series) -> int:
    '''
    Determines the coefficient for the signed constant term in the ridge regression design matrix.

    1 if Home is offense, -1 otherwise.
    '''
    if row['BALL_ID'] in row[HOME_LIST].values: # if home is offense
        return 1
    else:
        return -1

In [8]:
coefs = data.apply(_determine_coefficient, axis=1)
matrix.insert(loc=0, column='0', value=coefs)
matrix.to_csv('../../design_matrices/odrapm_design.csv')

In [9]:
matrix.shape

(11013, 433)