For later: this is no account for trades made between seasons or during seasons

In [1]:
import datetime
import string
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from basketball_reference_scraper.pbp import get_pbp
from basketball_reference_scraper.constants import TEAM_TO_TEAM_ABBR
from basketball_reference_scraper.seasons import get_schedule

In [2]:
#  Indexing pbp dataframe
HOME = 'PHO'
AWAY = 'BRK'
QTR_IDX = 0
TIME_IDX = 1
HOME_IDX = 2
AWAY_IDX = 3

#  Indexing numpy array "plays"
TEAM_IDX = 0
TIME_IDX = 1
SCOREPLAY_IDX = 2
BENEFIT_IDX = 3
DETRIMENT_IDX = 4
MOMENTUM_IDX = 5

In [18]:
def name_to_abbr(name):
    return TEAM_TO_TEAM_ABBR[name.upper()]

def calculate_elapsed_time(remaining, quarter):
    qtr = '12:00.0' # 12-minute regulation quarters
    ot = '5:00.0'  # 5-minute overtime periods
    format = '%M:%S.%f'

    if 'OT' not in str(quarter): #  play occurs during regulation
        period = qtr
        quarter_progression = datetime.timedelta(minutes=(quarter-1)*12) 
    else: #  play occurs during overtime
        period = ot
        ot_period = int(quarter.strip(string.ascii_letters))
        quarter_progression = datetime.timedelta(minutes=(ot_period-1)*5)  
        
    return (quarter_progression + 
            datetime.datetime.strptime(period, format) - 
            datetime.datetime.strptime(remaining, format))

# def penalty_play(play):
#     results = ['foul']

def scoring_play(play):
    results = ['makes 2-pt', 'makes 3-pt']
    if any(word in play for word in results):
        return 1
    return 0
    
def benifit_play(play):
    results = ['makes 2-pt', 'makes 3-pt', 'rebound', 'foul']
    if any(word in play for word in results):
        return 1
    return 0

def detriment_play(play):
    results = ['turnover', 'miss', 'timeout']
    if any(word in play for word in results):
        return 1
    return 0
    
def process_play(time, play):
    return np.array([time, scoring_play(play), benifit_play(play), detriment_play(play)])

def calculate_momentum(play_arr):
    return play_arr[SCOREPLAY_IDX] + play_arr[BENEFIT_IDX] - 2*play_arr[DETRIMENT_IDX]
    
def momentum_checks(plays):
    playcount = plays.shape[0]
    
    for play in range(playcount):
        idx = play  # start from the current play
        while idx >= 0:
            idx -= 1
            if plays[play][TIME_IDX] - plays[idx][TIME_IDX] > 60:
                break
            plays[play][MOMENTUM_IDX] += calculate_momentum(plays[idx])

    return plays

In [4]:
year = 2021

#  Get the nba schedule (all games)
nba = get_schedule(year, playoffs=False)

#  Remove games that have not yet been played
season = nba[~np.isnan(nba.VISITOR_PTS)]
season = nba[~np.isnan(nba.HOME_PTS)]

#  Select games played by a specific team if desired
team = 'Brooklyn Nets'
team_season = pd.concat((season[season.VISITOR == team], season[season.HOME== team]), axis=0)


In [19]:
games_pbp = []
wins = []

#  Extract parameters used by play-by-play functions 
for idx,row in team_season.iterrows():
    date = row['DATE']
    away = name_to_abbr(row['VISITOR'])
    home = name_to_abbr(row['HOME'])
    home_winner = row['HOME_PTS'] > row['VISITOR_PTS']
    winner = home if home_winner else away
    print(f'{date.date()}: {away} @ {home} | winner: {winner}')
    
    wins.append(1 if winner == name_to_abbr(team) else 0)
    
    #  Begin processing play-by-play data on a per-game basis
    pbp = get_pbp(date, home, away)
    pbp_playcount = len(pbp)

    # Stop collecting pbp data after n plays (comment these 2 lines to use the full pbp data)
    playcount_stopper = 200 
    pbp_playcount = playcount_stopper

    
    plays = np.zeros((pbp_playcount, 6))  # HOME?, time (secs), score, benefit, detriment, momentum_score    
    
    for play in range(pbp_playcount):
        elapsed_gametime = calculate_elapsed_time(pbp.iloc[play][TIME_IDX], pbp.iloc[play][QTR_IDX])
        action_home = pbp.iloc[play][HOME_IDX]
        action_away = pbp.iloc[play][AWAY_IDX]

        if action_home is not np.nan:
            plays[play] = np.hstack((1, process_play(elapsed_gametime.seconds, action_home.lower()), 0))
        else:
            plays[play] = np.hstack((0, process_play(elapsed_gametime.seconds, action_away.lower()), 0))

    plays = momentum_checks(plays)
    
    games_pbp.append(plays)

2020-12-25: BRK @ BOS | winner: BRK
2020-12-27: BRK @ CHO | winner: CHO
2021-01-08: BRK @ MEM | winner: MEM
2021-01-13: BRK @ NYK | winner: BRK
2021-01-20: BRK @ CLE | winner: CLE
2021-01-22: BRK @ CLE | winner: CLE


KeyboardInterrupt: 

In [6]:
np.array(games_pbp).shape
game_count = len(games_pbp)

X_train = np.array(games_pbp)[:round(game_count*0.75)].reshape(round(game_count*0.75),-1)
y_train = np.array(wins)[:round(game_count*0.75)].reshape(-1,)

X_test = np.array(games_pbp)[round(game_count*0.75):].reshape(game_count-round(game_count*0.75),-1)
y_test = np.array(wins)[round(game_count*0.75):].reshape(-1,)

In [16]:
clf = MLPClassifier(hidden_layer_sizes=40, random_state=0).fit(X_train, y_train)

In [17]:
clf.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [9]:
s.shape[0] - s.iloc[:,2].dropna().shape[0]

NameError: name 's' is not defined