This notebook is going to explore neural network models to predict whether a team makes the playoffs given a team's sequence of box scores.

In [52]:
import numpy as np
import pandas as pd

from playoff_model_helpers import *

In [212]:
from sklearn.model_selection import train_test_split

In [223]:
df = import_data()

In [225]:
df = made_playoffs(df)

In [226]:
df.sort_values(['Team','Game_day'],inplace=True)

In [227]:
df = include_home(df)

In [228]:
df = include_conference(df)

In [229]:
df = add_new_features(df)

In [230]:
df = clean(df)

In [231]:
df.head()

Unnamed: 0,Team,Game_ID,Matchup,Game_day,Result,MINS,PTS,FGM,FGA,FGP,...,PF,Playoffs,Home,Conference,Poss,PPP,OREB%,ATR,FTR,eFG
81,ATL,28900002,ATL vs. IND,1989-11-03,L,240,103,42,94,44.7,...,29,0,1,East,113.72,0.905733,0.403846,0.888889,0.404255,45.212766
80,ATL,28900025,ATL vs. WAS,1989-11-07,L,240,114,45,92,48.9,...,24,0,1,East,109.52,1.040906,0.340426,1.230769,0.358696,48.913043
79,ATL,28900054,ATL @ BOS,1989-11-10,L,240,106,40,86,46.5,...,26,0,0,East,101.72,1.042076,0.195652,2.0,0.44186,46.511628
78,ATL,28900064,ATL vs. ORL,1989-11-11,W,240,148,58,95,61.1,...,28,0,1,East,109.04,1.3573,0.324324,2.5,0.431579,62.105263
77,ATL,28900070,ATL @ ORL,1989-11-13,W,240,112,42,92,45.7,...,32,0,0,East,109.84,1.019665,0.38,1.235294,0.391304,45.652174


In [232]:
df['Season_ID'] = df['Game_ID'].str.slice(0,5)

In [233]:
df['Team'].unique()

array(['ATL', 'BKN', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET',
       'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
       'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS',
       'TOR', 'UTA', 'WAS'], dtype=object)

In [235]:
df.drop(inplace=True,columns=['Game_ID','MINS','Matchup'])

In [None]:
class DataIterator(torch.utils.data.IterableDataset):
    def __init__(self,start,end,df,idxs):
        self.start = start
        self.end = end
        self.df = df
        self.idxs = idxs
        
    def __iter__(self):
        workers = torch.utils.data.get_worker_info()

        if workers is None:
            start = self.start
            end = self.end
        else:
            each_worker = int(ceil((sef.end - self.start)/float(workers.num_workers)))
            pid = workers.id

            start = self.start + pid*each_worker
            end = min(start + each_worker,self.end)

        return team_and_season_generator(self.df,idxs[start:end])

    @staticmethod
    def team_and_season_generator(df,idxs):
        for t,s in idxs:
            yield df[(df['Team']==t) & (df['Season_ID']==s)] 

In [236]:
def create_indices(df):
    teams = df['Team'].unique()
    seasons = df['Season_ID'].unique()
    combs = []
    for t in teams:
        for s in seasons:
            combs.append((t,s))
            
    return combs

In [169]:
def team_and_season_generator(df,idxs):
    for t,s in idxs:
        yield df[(df['Team']==t) & (df['Season_ID']==s)]
#     for t in teams:
#         for s in seasons:
#             yield df[(df['Team']==t) & (df['Season_ID']==s)]

In [171]:
def window_data(df,window = 5):
    a = df.iloc[0:window].values
    a = a.reshape((a.shape[0],a.shape[1],1))
    
    for i in range(1,len(df)-window):
        b = df.iloc[i:i+window].values
        b = np.expand_dims(b,2)
        a = np.append(a,b,axis=2)
        
    return a

In [180]:
def create_sequence(df, idxs):
    for d in team_and_season_generator(df,idxs):
        yield d

In [210]:
idxs = create_indices(df)

In [213]:
train, test = split_train_test(idxs)

In [216]:
df

Unnamed: 0,Team,Game_day,Result,PTS,FGM,FGA,FGP,PM3,PA3,P3P,...,Playoffs,Home,Conference,Poss,PPP,OREB%,ATR,FTR,eFG,Season_ID
81,ATL,1989-11-03,L,103,42,94,44.7,1,6,16.7,...,0,1,East,113.72,0.905733,0.403846,0.888889,0.404255,45.212766,00289
80,ATL,1989-11-07,L,114,45,92,48.9,0,0,0.0,...,0,1,East,109.52,1.040906,0.340426,1.230769,0.358696,48.913043,00289
79,ATL,1989-11-10,L,106,40,86,46.5,0,8,0.0,...,0,0,East,101.72,1.042076,0.195652,2.000000,0.441860,46.511628,00289
78,ATL,1989-11-11,W,148,58,95,61.1,2,4,50.0,...,0,1,East,109.04,1.357300,0.324324,2.500000,0.431579,62.105263,00289
77,ATL,1989-11-13,W,112,42,92,45.7,0,3,0.0,...,0,0,East,109.84,1.019665,0.380000,1.235294,0.391304,45.652174,00289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74157,WAS,2021-05-08,W,133,49,110,44.5,9,32,28.1,...,1,0,East,120.20,1.106489,0.245902,1.444444,0.272727,48.636364,00220
74156,WAS,2021-05-10,L,124,51,100,51.0,13,26,50.0,...,1,0,East,103.84,1.194145,0.163265,3.111111,0.110000,57.500000,00220
74155,WAS,2021-05-12,L,116,45,101,44.6,10,29,34.5,...,1,0,East,111.36,1.041667,0.160714,4.142857,0.188119,49.504950,00220
74154,WAS,2021-05-14,W,120,41,89,46.1,7,25,28.0,...,1,1,East,106.28,1.129093,0.250000,2.090909,0.415730,50.000000,00220


In [184]:
d = df[(df['Team']=='DET')]

In [202]:
start = time.ctime()
for g in create_sequence(df):
    gen = g
end = time.ctime()

In [203]:
start

'Thu Apr 14 21:24:37 2022'

In [204]:
end

'Thu Apr 14 21:26:22 2022'

In [207]:
gen[:,:]

array([['WAS', 'WAS', 'WAS', ..., 'WAS', 'WAS', 'WAS'],
       [Timestamp('2020-12-31 00:00:00'),
        Timestamp('2021-01-01 00:00:00'),
        Timestamp('2021-01-03 00:00:00'), ...,
        Timestamp('2021-05-10 00:00:00'),
        Timestamp('2021-05-12 00:00:00'),
        Timestamp('2021-05-14 00:00:00')],
       ['L', 'W', 'W', ..., 'L', 'L', 'W'],
       ...,
       [0.4880952380952381, 0.23595505617977527, 0.3269230769230769, ...,
        0.11, 0.18811881188118812, 0.4157303370786517],
       [58.333333333333336, 64.04494382022472, 47.11538461538461, ...,
        57.5, 49.504950495049506, 50.0],
       ['00220', '00220', '00220', ..., '00220', '00220', '00220']],
      dtype=object)

In [199]:
import time

In [None]:
window = 5

a = det_90.iloc[0:window].values
a = a.reshape((a.shape[0],a.shape[1],1))
for i in range(1,len(det_90)-window):
    b = det_90.iloc[i:i+window].values
    b = np.expand_dims(b,2)
    a = np.append(a,b,axis=2)

In [None]:
a[:,:,1]

In [None]:
##### a[:,:,1]

In [147]:
idxs = np.arange(a.shape[2])

In [198]:
%timeit
np.random.shuffle(idxs)

In [157]:
trains

62

In [165]:
test = idxs[int(len(idxs)*.2)]
trains = idxs[0:round(len(idxs)*.8)]
test = idxs[len(trains):len(idxs)]

In [166]:
train = a[:,:,trains]

In [167]:
test = a[:,:,test]

In [168]:
test.shape

(5, 31, 15)

In [136]:
train, test = train_test_split(a)

In [137]:
train.shape

(3, 31, 77)

In [33]:
eg = df.head()
# .values.reshape((-1,24,5))

In [34]:
eg = eg.values.reshape((5,24,1))

In [42]:
eg = np.append(eg,df.tail().values.reshape((5,24,1)),2)

In [190]:
a

array([[['DET', 'DET', 'DET', ..., 'DET', 'DET', 'DET'],
        [Timestamp('1990-11-02 00:00:00'),
         Timestamp('1990-11-03 00:00:00'),
         Timestamp('1990-11-06 00:00:00'), ...,
         Timestamp('1991-04-06 00:00:00'),
         Timestamp('1991-04-09 00:00:00'),
         Timestamp('1991-04-10 00:00:00')],
        ['W', 'W', 'L', ..., 'L', 'L', 'L'],
        ...,
        [0.4819277108433735, 0.19318181818181818, 0.4246575342465753,
         ..., 0.17391304347826086, 0.27848101265822783,
         0.30864197530864196],
        [50.0, 52.27272727272727, 44.52054794520548, ...,
         41.30434782608695, 50.0, 43.20987654320987],
        ['00290', '00290', '00290', ..., '00290', '00290', '00290']],

       [['DET', 'DET', 'DET', ..., 'DET', 'DET', 'DET'],
        [Timestamp('1990-11-03 00:00:00'),
         Timestamp('1990-11-06 00:00:00'),
         Timestamp('1990-11-07 00:00:00'), ...,
         Timestamp('1991-04-09 00:00:00'),
         Timestamp('1991-04-10 00:00:00'),
    