In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.text import TextPath
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import make_column_transformer
# from sklearn.pipeline import make_pipeline
# import seaborn as sns
# from tqdm import tqdm

In [16]:
cols = {'ANA':'#b85e0b','ARI':'#7d1db3','BOS':'#ffec00','BUF':'#b653fb','CAR':'#963e3e','CBJ':'#475483','CGY':'#b45300','CHI':'#8d6b0a','COL':'#6b051f','DAL':'#007f16','DET':'#ff0000','EDM':'#352247','FLA':'#77d200','LAK':'#380078','MIN':'#003a07','MTL':'#ec0365','NJD':'#ab0027','NSH':'#f3bf00','NYI':'#0078ff','NYR':'#07b182','OTT':'#805700','PHI':'#ff7c00','PIT':'#19bcd1','SEA':'#00c9b5','SJS':'#016072','STL':'#000df0','TBL':'#150078','TOR':'#363caf','VAN':'#5c6c98','VGK':'#bca900','WPG':'#140e6b','WSH':'#990276'}

In [2]:
# toi20 = pd.read_csv('nhl_shifts_20202021.csv')
pbp20 = pd.read_csv('nhl_pbp_20202021.csv')
# toi19 = pd.read_csv('nhl_shifts_20192020.csv')
pbp19 = pd.read_csv('nhl_pbp_20192020.csv')
# toi18 = pd.read_csv('nhl_shifts_20182019.csv')
pbp18 = pd.read_csv('nhl_pbp_20182019.csv')
# toi17 = pd.read_csv('nhl_shifts_20172018.csv')
pbp17 = pd.read_csv('nhl_pbp_20172018.csv')

In [3]:
# toi_table = toi20.append(toi19).append(toi18).append(toi17)
pbp_table = pbp20.append(pbp19).append(pbp18).append(pbp17)

In [4]:
events = [e for e in pbp_table['Event'] if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
descriptions = [d for e, d in zip(pbp_table['Event'], pbp_table['Description']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
strength = [s for e, s in zip(pbp_table['Event'], pbp_table['Strength']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
xcoord = [x for e, x in zip(pbp_table['Event'], pbp_table['xC']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
ycoord = [y for e, y in zip(pbp_table['Event'], pbp_table['yC']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
per_time = [t for e, t in zip(pbp_table['Event'], pbp_table['Time_Elapsed']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
per = [p for e, p in zip(pbp_table['Event'], pbp_table['Period']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
secs = [s for e, s in zip(pbp_table['Event'], pbp_table['Seconds_Elapsed']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
# since_last = [s for e, s in zip(pbp_table['Event'], pbp_table['Seconds_Since']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
zone = [z for e, z in zip(pbp_table['Event'], pbp_table['Ev_Zone']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
shot_type = [t for e, t in zip(pbp_table['Event'], pbp_table['Type']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
indices = range(len(pbp_table['Event']))
prev_event = [pbp_table['Event'].iloc[i-1] for e, i in zip(pbp_table['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
prev_x = [pbp_table['xC'].iloc[i-1] for e, i in zip(pbp_table['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
prev_y = [pbp_table['yC'].iloc[i-1] for e, i in zip(pbp_table['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']

In [5]:
since_last = [s-secs[l-1] if s>=secs[l-1] else -1 for s,l in zip(secs,range(len(secs)))]

In [6]:
# setting up categorical-->numerical variables
strength_unique = list(set(strength))
strength_dict = dict(zip(strength_unique,range(len(strength_unique))))
strength_nums = [strength_dict[x] for x in strength]

zone_unique = list(set(zone))
zone_dict = dict(zip(zone_unique,range(len(zone_unique))))
zone_nums = [zone_dict[x] for x in zone]

shot_type_unique = list(set(shot_type))
shot_type_dict = dict(zip(shot_type_unique,range(len(shot_type_unique))))
shot_type_nums = [shot_type_dict[x] for x in shot_type]

prev_event_unique = list(set(prev_event))
prev_event_dict = dict(zip(prev_event_unique,range(len(prev_event_unique))))
prev_event_nums = [prev_event_dict[x] for x in prev_event]

In [7]:
goal_state = [1 if e == 'GOAL' else 0 for e in events]

In [9]:
forest = RandomForestRegressor(random_state=0)

In [10]:
features = np.array((strength_nums, xcoord, ycoord, per, secs, since_last, 
                     zone_nums, shot_type_nums, prev_event_nums, prev_x, prev_y))
feature_names = ['Strength', 'xC', 'yC', 'Period', 'Seconds into period', 'Seconds since last event', 'Zone',
                 'Shot type', 'Previous event', 'Previous xC', 'Previous yC']

In [11]:
features

array([[ 28.,  28.,  28., ...,  28.,  28.,  28.],
       [-74.,  49., -52., ..., -70.,  74.,  79.],
       [ 29., -25., -31., ...,  -2.,   1., -14.],
       ...,
       [  7.,  12.,   3., ...,  11.,  12.,  11.],
       [  0., -58., -87., ..., -56.,  62.,  74.],
       [  0.,  -7., -33., ...,  -1.,  17.,   1.]])

In [12]:
features[np.isnan(features)] = 0

In [13]:
forest.fit(features.T, goal_state)

RandomForestRegressor(random_state=0)

In [14]:
importances = forest.feature_importances_

In [15]:
for i, val in enumerate(feature_names):
    print(f'{val}: {importances[i]}')

Strength: 0.024580299521348905
xC: 0.12233196563259816
yC: 0.11463746604059097
Period: 0.03979429035300732
Seconds into period: 0.19943240660656097
Seconds since last event: 0.14767967980230312
Zone: 0.005917870386409345
Shot type: 0.040679804617141095
Previous event: 0.040710975849903616
Previous xC: 0.13749078026416242
Previous yC: 0.1267444609259742


In [17]:
def xG(s, xC, yC, p, sinp, ssle, z, st, pe, pxC, pyC):
    return importances[0]*s + importances[1]*xC + importances[2]*yC + importances[3]*p + importances[4]*sinp + importances[5]*ssle + importances[6]*z + importances[7]*st + importances[8]*pe + importances[9]*pxC + importances[10]*pyC

In [18]:
joblib.dump(forest, 'fearfactorxg.joblib')

['fearfactorxg.joblib']

# ok let's test it

In [20]:
p_pbp = pd.read_csv('./gamestats/csvs/nhl_pbp_1651341935.csv')

In [21]:
events = [e for e in p_pbp['Event'] if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
event_team = [t for e, t in zip(p_pbp['Event'], p_pbp['Ev_Team']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
descriptions = [d for e, d in zip(p_pbp['Event'], p_pbp['Description']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
strength = [s for e, s in zip(p_pbp['Event'], p_pbp['Strength']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
xcoord = np.asarray([x for e, x in zip(p_pbp['Event'], p_pbp['xC']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS'])
ycoord = np.asarray([y for e, y in zip(p_pbp['Event'], p_pbp['yC']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS'])
per_time = [t for e, t in zip(p_pbp['Event'], p_pbp['Time_Elapsed']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
per = [p for e, p in zip(p_pbp['Event'], p_pbp['Period']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
secs = [s for e, s in zip(p_pbp['Event'], p_pbp['Seconds_Elapsed']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
zone = [z for e, z in zip(p_pbp['Event'], p_pbp['Ev_Zone']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
shot_type = [t for e, t in zip(p_pbp['Event'], p_pbp['Type']) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
indices = range(len(p_pbp['Event']))
prev_event = [p_pbp['Event'].iloc[i-1] for e, i in zip(p_pbp['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS']
prev_x = np.asarray([p_pbp['xC'].iloc[i-1] for e, i in zip(p_pbp['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS'])
prev_y = np.asarray([p_pbp['yC'].iloc[i-1] for e, i in zip(p_pbp['Event'], indices) if e == 'GOAL' or e == 'SHOT' or e == 'MISS'])

In [22]:
xcoord[np.isnan(xcoord)] = 0
ycoord[np.isnan(ycoord)] = 0
prev_x[np.isnan(prev_x)] = 0
prev_y[np.isnan(prev_y)] = 0

In [23]:
since_last = [s-secs[l-1] if s>=secs[l-1] else -1 for s,l in zip(secs,range(len(secs)))]

In [24]:
# setting up categorical-->numerical variables
strength_unique = list(set(strength))
# strength_dict = dict(zip(strength_unique,range(len(strength_unique))))
strength_nums = [strength_dict[x] for x in strength]

zone_unique = list(set(zone))
# zone_dict = dict(zip(zone_unique,range(len(zone_unique))))
zone_nums = [zone_dict[x] for x in zone]

shot_type_unique = list(set(shot_type))
# shot_type_dict = dict(zip(shot_type_unique,range(len(shot_type_unique))))
shot_type_nums = [shot_type_dict[x] for x in shot_type]

prev_event_unique = list(set(prev_event))
# prev_event_dict = dict(zip(prev_event_unique,range(len(prev_event_unique))))
prev_event_nums = [prev_event_dict[x] for x in prev_event]

In [25]:
all_xg = forest.predict(np.array((strength_nums, xcoord, ycoord, per, secs, since_last, zone_nums,
              shot_type_nums, prev_event_nums, prev_x, prev_y)).T)

In [26]:
all_xg

array([0.  , 0.25, 0.14, 0.26, 0.09, 0.21, 0.14, 0.03, 0.02, 0.02, 0.02,
       0.01, 0.02, 0.08, 0.25, 0.01, 0.17, 0.04, 0.01, 0.06, 0.14, 0.1 ,
       0.  , 0.19, 0.07, 0.  , 0.16, 0.16, 0.  , 0.16, 0.06, 0.01, 0.06,
       0.05, 0.01, 0.08, 0.03, 0.04, 0.18, 0.33, 0.05, 0.51, 0.02, 0.08,
       0.03, 0.  , 0.  , 0.09, 0.  , 0.  , 0.  , 0.01, 0.09, 0.03, 0.04,
       0.26, 0.05, 0.11, 0.03, 0.11, 0.02, 0.06, 0.  , 0.14, 0.01, 0.05,
       0.29, 0.04, 0.13, 0.06, 0.03, 0.3 , 0.02, 0.1 , 0.05, 0.12, 0.03,
       0.09, 0.35, 0.26, 0.02, 0.09, 0.05, 0.1 , 0.16, 0.17, 0.29, 0.09,
       0.59, 0.42, 0.54, 0.21, 0.27, 0.08])

In [27]:
# setting up data for plotting
per = np.asarray(per)

xc_plot = [x if p%2==0 else -x for x, p in zip(xcoord,per)]
xc_plot = np.asarray(xc_plot)

yc_plot = [y if p%2==0 else -y for y, p in zip(ycoord,per)]
yc_plot = np.asarray(yc_plot)

event_team = np.asarray(event_team)
events = np.asarray(events)
all_xg = np.asarray(all_xg)
descriptions = np.asarray(descriptions)
strength = np.asarray(strength)

In [28]:
angles = np.degrees(np.arctan((xc_plot)/yc_plot))
angles[np.isnan(angles)] = 0

  angles = np.degrees(np.arctan((xc_plot)/yc_plot))


In [29]:
# marks = [(3,0,a) if t=='VGK' else (2,0,a) for a, t in zip(angles, event_team)]
marks = [(3,0,a+90) if y>=0 and t=='VGK' else (3,0,a-90) if  y>=0 else (3,0,a) for a, t, y in zip(angles, event_team, yc_plot)]
marks = np.asarray(marks)

In [30]:
%matplotlib notebook
fig, ax = plt.subplots(1, figsize=(8,4), constrained_layout=True)

for i in range(len(xc_plot[events=='SHOT'])):
    ax.scatter(xc_plot[events=='SHOT'][i],
               yc_plot[events=='SHOT'][i], 
               s=(all_xg[events=='SHOT'][i]+0.01)*3000,
               c=cols[event_team[events=='SHOT'][i]], 
               alpha=0.5, marker=marks[events=='SHOT'][i])

for i in range(len(xc_plot[events=='MISS'])):
    ax.scatter(xc_plot[events=='MISS'][i],
               yc_plot[events=='MISS'][i], 
               s=(all_xg[events=='MISS'][i]+0.01)*3000,
               c='#525252', 
               alpha=0.4, marker=marks[events=='MISS'][i])

for g in range(len(xc_plot[(events=='GOAL')])):
    
    so_increment = 1
    
    scorer = descriptions[events=='GOAL'][g].split(' ')[2]
    if '(' in scorer:
        scorer = scorer.split('(')[0].capitalize()
    else:
        scorer = scorer.split(',')[0].capitalize()
    
    if per[events=='GOAL'][g] == 4:
        mark = 'OT'
        msize = 6000
    elif per[events=='GOAL'][g] == 5:
        mark = 'SO' + f'{so_increment}'
        so_increment += 1
        msize = 6000
    else:
        mark = g+1
        msize = 1000
        
    strstr1 = int(strength[events=='GOAL'][g].split('x')[0])
    strstr2 = int(strength[events=='GOAL'][g].split('x')[1])
    if strstr1 > strstr2:
        stren = ' PPG'
    elif strstr1 < strstr2:
        stren = ' SHG'
    else:
        stren = ''
        
    textmark = TextPath((0,0), str(mark))
        
    ax.scatter(xc_plot[events=='GOAL'][g],
           yc_plot[events=='GOAL'][g], 
#            s=(all_xg[events=='GOAL'][g]+0.01)*1750,
           c=cols[event_team[events=='GOAL'][g]],
           edgecolor='k', linewidth=0.6,
           marker=textmark, s=msize)
    
    xgh = all_xg[events=='GOAL'][g]
    print(f'{str(mark)} - {scorer}{stren} ({xgh:.2f} xG)')
    
ax.set(xlim=(-100,100), ylim=(-50,50))

<IPython.core.display.Javascript object>

1 - Howden (0.25 xG)
2 - Zibanejad (0.00 xG)
3 - Kreider PPG (0.51 xG)
4 - Coghlan (0.01 xG)
SO1 - Marchessault (0.27 xG)


[(-100.0, 100.0), (-50.0, 50.0)]