In [23]:
# Import required packages and load season data

from pbpstats.client import Client
import numpy as np
import pandas as pd
from nba_api.stats.endpoints import leaguelineupviz, commonteamroster, boxscoretraditionalv2
from datetime import datetime as dt
import time

# pbpstats client configuration: each resource below is read from local files
# ("source": "file") located under "dir"; only the data provider differs.
settings = {
    "dir": "NBA/response_data",
    **{
        resource: {"source": "file", "data_provider": provider}
        for resource, provider in [
            ("Games", "data_nba"),
            ("Possessions", "data_nba"),
            ("EnhancedPbp", "data_nba"),
            ("Boxscore", "stats_nba"),
        ]
    },
}

client = Client(settings)

# Season handle used by the later cells to iterate over every game
s = client.Season("nba", "2021-22", "Regular Season")

# Read dictionary keyed by team names and valued by team IDs
# NOTE(review): eval() executes whatever team_names.txt contains. If the file is a
# plain dict literal, ast.literal_eval would be the safer choice -- confirm the file
# format before switching.
with open('team_names.txt', 'r') as f:
    team_names_dict = eval(f.read())

# Reverse lookup: team ID -> team name
team_ids_dict = dict(zip(team_names_dict.values(), team_names_dict.keys()))



In [45]:
# Define utility functions


def timeInSeconds(colon_time):
    """ Turns a colon-separated minutes/seconds string into a number of seconds

    Args:
        colon_time (string): Time formatted as MM:SS

    Returns:
        float: Time in seconds
    """
    # Convert every colon-separated field up front so a malformed field fails early
    fields = [float(field) for field in colon_time.split(':')]
    minutes = fields[0]
    seconds = fields[1]
    return minutes*60 + seconds

def _elapsedGameSeconds(period, clock):
    """ Converts a period number and a game clock into seconds elapsed since tip-off

    Regulation periods (1-4) last 12 minutes; every overtime period (5 and up)
    lasts 5 minutes, so overtime needs its own period length and start offset.

    Args:
        period (int): Period number, starting at 1
        clock (string): Time remaining in the period, formatted as MM:SS

    Returns:
        float: Seconds of game time elapsed at the given period and clock
    """
    regulation_periods = 4
    regulation_length = 12*60
    overtime_length = 5*60

    clock_parts = list(map(float, clock.split(':')))
    remaining = clock_parts[0]*60 + clock_parts[1]

    if period <= regulation_periods:
        period_start = (period - 1)*regulation_length
        period_length = regulation_length
    else:
        period_start = regulation_periods*regulation_length + \
            (period - regulation_periods - 1)*overtime_length
        period_length = overtime_length

    return period_start + (period_length - remaining)


def getFoulTimes(row):
    """ Calculates the amount of time since a player last committed a foul

    Reads the module-level dataframe `df`, which must be sorted so that each
    player's fouls within a game are consecutive and in chronological order,
    and indexed 0..n-1 (the previous foul is looked up at `row.name - 1`).

    Args:
        row (pd.Series): Play-by-play data row for a foul event

    Returns:
        float: Time in seconds since player's last foul in the same game, or
            since the start of the game if this is the player's first foul
    """
    elapsed = _elapsedGameSeconds(row['period'], row['clock'])

    # The very first row has no predecessor to compare against
    if row.name == 0:
        return elapsed

    prev_row = df.loc[row.name - 1]

    # A change of game or player means this is the player's first foul of the game
    if (row['game_id'] != prev_row['game_id']) or \
        (row['player1_id'] != prev_row['player1_id']):
        return elapsed

    return elapsed - _elapsedGameSeconds(prev_row['period'], prev_row['clock'])


def getCensoredFoulTime(row):
    """ Calculate the censored (still running at the final buzzer) foul time for a player

    Reads the module-level dataframe `df` of foul events and sums the player's
    `foul_time` values for the game.

    Args:
        row (pd.Series): Boxscore data row for a player

    Returns:
        float: Time in seconds from a player's last foul to the end of the game;
            for a player with no fouls, the player's `min` value instead.
            None for a player with exactly 6 fouls.
    """
    personal_fouls = row.pf

    if personal_fouls == 0:
        # Substitute 48*60 here to treat the observation time as the entire game
        # instead of just playing time
        return row['min']

    if personal_fouls == 6:
        return None

    same_game = df['game_id'].eq(row.game_id)
    same_player = df['player1_id'].eq(row.player_id)
    time_already_accounted = df.loc[same_game & same_player, 'foul_time'].sum()

    # NOTE(review): the game length is fixed at 48 minutes, so a foul committed in
    # overtime would make this negative -- confirm how overtime games should be handled.
    return 48*60 - time_already_accounted

In [None]:
# Create a dataframe of all the fouls over the course of a season;
# Calculate the time between players' fouls in each game

fouls = []

for season_game in s.games.items:
    game = client.Game(season_game.game_id)
    for event in game.enhanced_pbp.items:
        # Event type 6 marks foul events in the play-by-play feed
        if event.data['event_type'] == 6:
            fouls.append(event.data)

df = pd.DataFrame(fouls)

# Drop rows with lane violation 'fouls' and technical fouls.
# This has to happen BEFORE the foul times are computed: getFoulTimes measures each
# foul from the row directly above it, so leaving these rows in would time a foul
# from the player's last technical, and the remaining foul_time values would no
# longer add up to the time of the player's last foul (which getCensoredFoulTime
# relies on).
# NOTE(review): pbpstats appears to label action type 17 as defensive three seconds
# rather than a lane violation -- confirm.
df = df[(df['event_action_type'] != 17) & (df['event_action_type'] != 11)]

# Group each player's fouls within a game in chronological order (the clock counts
# down, hence descending). ignore_index gives the 0..n-1 index getFoulTimes needs.
# NOTE(review): 'clock' is sorted as text, which assumes zero-padded MM:SS values --
# confirm against the data provider.
df = df.sort_values(['game_id','player1_id','period','clock'],ascending=[True,True,True,False],ignore_index=True)
df['foul_time'] = df.apply(getFoulTimes,axis=1)
df['player_foul_num'] = df.apply(lambda row: row.player_game_fouls[row['player1_id']], axis=1)

# Drop unwanted columns
df = df.drop(['event_num', 'locX', 'locY', 'opt1', 'opt2', 'event_type', 'player3_id',
       'home_score', 'away_score', 'offense_team_id', 'order', 'possession_changing_override',
       'non_possession_changing_override', 'score', 'previous_event',
       'next_event'], axis=1)

# Save foul data (without censored data)
df.to_csv('raw_foul_data21-22.csv')

In [14]:
# Load data of all foul occurrences from the regular season
# (game_id is read back as text rather than a number -- presumably to keep leading zeros)

df = pd.read_csv(
    'raw_foul_data21-22.csv',
    index_col=0,
    converters={'game_id': str},
)

In [None]:
# Add censored data
#
# For every player in every boxscore, add one "censored" observation: the time that
# passed after the player's last foul without another foul being committed.

censored_list = []

# For each game
for season_game in s.games.items:

    game = client.Game(season_game.game_id) # Load the game data
    box = pd.DataFrame(game.boxscore.data['player']) # Create a dataframe for player boxscore data
    box = (
        box[['game_id','team_id','player_id','min','pf']] # Select relevant columns
        .dropna(subset=['pf']) # Drop data for players who did not enter the game
        .copy() # Own the data so the column assignments below do not hit a view
    )
    box['min'] = box['min'].apply(timeInSeconds) # Format playing time as time in seconds
    box['censored_time'] = box.apply(getCensoredFoulTime,axis=1) # Get the foul time for censored data

    # getCensoredFoulTime yields no value for players with 6 fouls; they have no
    # pending foul to censor, so they get no censored row
    box = box.dropna(subset=['censored_time'])

    # For each player in the game, create a record for the censored data point
    for player in box.to_dict('records'):
        censored_list.append({
            'game_id': player['game_id'],
            'period': None,
            'clock': None,
            'description': None,
            'event_action_type': None,
            'team_id': player['team_id'],
            'player1_id': player['player_id'],
            'player_game_fouls': None,
            'fouls_to_give': None,
            'foul_time': player['censored_time'],
            'player_foul_num': player['pf'] + 1, # The foul the player was "waiting" to commit
            'non_censored': False,
        })

# Combine censored data into a dataframe (one row per censored observation)
df_cens = pd.DataFrame(censored_list)

# Combine censored and noncensored data; observed fouls are flagged explicitly
foul_df = pd.concat([df.assign(non_censored=True), df_cens], ignore_index=True)
foul_df = foul_df.sort_values(['game_id','player1_id','period','clock'], ascending=[True,True,True,False], \
    ignore_index=True, na_position='last')

# Save complete foul data
foul_df.to_csv('foul_data.csv')