# NHL Playoff Bracket Predictions
### Uses regular season team summary statistics to predict playoff outcomes

In [50]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPRegressor
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
ls

 Volume in drive T is Data
 Volume Serial Number is E01F-4123

 Directory of T:\Dropbox\Projects\NHL

02/13/2018  07:50 PM    <DIR>          .
02/13/2018  07:50 PM    <DIR>          ..
02/11/2018  08:06 PM                 8 .gitignore
02/13/2018  07:50 PM    <DIR>          .ipynb_checkpoints
02/12/2018  08:00 PM            15,579 .Rhistory
02/13/2018  07:17 PM    <DIR>          __pycache__
12/24/2017  09:17 PM                27 _config.yml
02/13/2018  07:50 PM            64,193 Bracket_2018_linear.ipynb
02/13/2018  07:49 PM            64,193 Bracket_2018_LSTM.ipynb
02/13/2018  07:17 PM    <DIR>          Data
02/11/2018  08:02 PM           568,674 Join.ipynb
01/04/2018  07:59 PM             3,018 lstm.py
12/24/2017  09:17 PM            93,123 NHL Stat DB_2008_2017.csv
02/11/2018  07:22 PM             2,501 NHL_Collection.ipynb
02/12/2018  06:57 PM            67,212 NHL_KNN.ipynb
02/12/2018  07:45 PM             8,973 NHL_Predictions_2017.R
12/24/2017  09:17 PM             1,300 README.m

# Read in the raw data from file

In [3]:
# regular season
reg = pd.read_csv('Data/2000_2017_regular_season.csv')
reg.head()

Unnamed: 0,year,Rk,Team,AvAge,GP,W,L,OL,PTS,PTS%,GF,GA,SOW,SOL,SRS,SOS,TG/G,EVGF,EVGA,PP,PPO,PP%,PPA,PPOA,PK%,SH,SHA,PIM/G,oPIM/G,S,S%,SA,SV%,PDO
0,2017,1,Washington Capitals*,28.4,82,55,19,8,118,0.72,263.0,182,2,5.0,0.99,0.0,5.43,199,130,57,248.0,22.98,44,272.0,83.82,5,3.0,9.0,8.5,2495.0,10.5,2282.0,0.922,102.5
1,2017,2,Pittsburgh Penguins*,28.7,82,50,21,11,111,0.677,282.0,234,4,5.0,0.59,0.01,6.29,213,170,60,260.0,23.08,52,257.0,79.77,5,7.0,8.4,8.7,2745.0,10.1,2671.0,0.914,101.0
2,2017,3,Chicago Blackhawks*,29.4,82,50,23,9,109,0.665,244.0,213,4,1.0,0.32,-0.06,5.57,197,158,42,233.0,18.03,47,211.0,77.73,1,7.0,6.9,7.8,2508.0,9.6,2574.0,0.918,101.3
3,2017,4,Columbus Blue Jackets*,26.2,82,50,24,8,108,0.659,249.0,195,2,2.0,0.68,0.02,5.41,195,152,42,211.0,19.91,39,223.0,82.51,10,2.0,8.6,8.5,2540.0,9.7,2489.0,0.922,101.2
4,2017,5,Minnesota Wild*,28.3,82,49,25,8,106,0.646,266.0,208,3,2.0,0.63,-0.08,5.78,211,166,47,224.0,20.98,37,217.0,82.95,5,3.0,8.0,8.3,2527.0,10.4,2465.0,0.916,101.5


In [4]:
# playoff stats
playoff = pd.read_csv('Data/2000_2017_playoffs.csv')
playoff.head()

Unnamed: 0,year,Rk,Team,GP,W,L,T,OW,OL,W-L%,G,GA,DIFF
0,2017,1,Pittsburgh Penguins,25,16,9,0,2,2,0.64,77,57,20
1,2017,2,Nashville Predators,22,14,8,0,2,1,0.636,60,48,12
2,2017,3,Ottawa Senators,19,11,8,0,6,2,0.579,47,50,-3
3,2017,4,Anaheim Ducks,17,10,7,0,4,1,0.588,50,52,-2
4,2017,5,Edmonton Oilers,13,7,6,0,1,3,0.538,36,35,1


# Cleanup
* playoff result is the outcome variable, so append playoff result to regular season dataframe
* will need a foreign key relationship of team/year between the two tables

In [5]:
def drop_pun(str_var):
    '''drop the list of chars from string'''
    ignore='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    str_cln=str_var.translate({ord(x): '' for x in ignore})
    return str_cln

In [6]:
reg.Team = reg.Team.apply(drop_pun)

In [7]:
# rename rank columns to more distinct
playoff.rename(columns={"Rk": "playoff_rank"}, inplace=True)
reg.rename(columns={"Rk": "reg_rank"}, inplace=True)

In [8]:
allData = pd.merge(reg, playoff[['year','Team','playoff_rank']], how = 'left', on = ['year', 'Team'])
allData.head()

Unnamed: 0,year,reg_rank,Team,AvAge,GP,W,L,OL,PTS,PTS%,GF,GA,SOW,SOL,SRS,SOS,TG/G,EVGF,EVGA,PP,PPO,PP%,PPA,PPOA,PK%,SH,SHA,PIM/G,oPIM/G,S,S%,SA,SV%,PDO,playoff_rank
0,2017,1,Washington Capitals,28.4,82,55,19,8,118,0.72,263.0,182,2,5.0,0.99,0.0,5.43,199,130,57,248.0,22.98,44,272.0,83.82,5,3.0,9.0,8.5,2495.0,10.5,2282.0,0.922,102.5,6.0
1,2017,2,Pittsburgh Penguins,28.7,82,50,21,11,111,0.677,282.0,234,4,5.0,0.59,0.01,6.29,213,170,60,260.0,23.08,52,257.0,79.77,5,7.0,8.4,8.7,2745.0,10.1,2671.0,0.914,101.0,1.0
2,2017,3,Chicago Blackhawks,29.4,82,50,23,9,109,0.665,244.0,213,4,1.0,0.32,-0.06,5.57,197,158,42,233.0,18.03,47,211.0,77.73,1,7.0,6.9,7.8,2508.0,9.6,2574.0,0.918,101.3,15.0
3,2017,4,Columbus Blue Jackets,26.2,82,50,24,8,108,0.659,249.0,195,2,2.0,0.68,0.02,5.41,195,152,42,211.0,19.91,39,223.0,82.51,10,2.0,8.6,8.5,2540.0,9.7,2489.0,0.922,101.2,13.0
4,2017,5,Minnesota Wild,28.3,82,49,25,8,106,0.646,266.0,208,3,2.0,0.63,-0.08,5.78,211,166,47,224.0,20.98,37,217.0,82.95,5,3.0,8.0,8.3,2527.0,10.4,2465.0,0.916,101.5,14.0


In [9]:
# drop teams that never made playoffs (playoff_rank will be NaN)
allData.playoff_rank.unique()

array([  6.,   1.,  15.,  13.,  14.,   4.,   9.,   5.,   8.,  nan,  11.,
         3.,  10.,  12.,   2.,  16.,   7.])

In [10]:
# drop rows that have nan values
allData.dropna(axis=0, inplace=True)

In [11]:
allData.playoff_rank.unique()

array([  6.,   1.,  15.,  13.,  14.,   4.,   9.,   5.,   8.,  11.,   3.,
        10.,  12.,   2.,  16.,   7.])

#### Scale data by year


In [36]:
scaled = allData.drop(['Team', 'playoff_rank','GP'],axis=1).groupby('year').apply(lambda x: (x - np.mean(x)) / np.std(x)).drop(['year'],axis=1)
scaled.head()

Unnamed: 0,reg_rank,AvAge,W,L,OL,PTS,PTS%,GF,GA,SOW,SOL,SRS,SOS,TG/G,EVGF,EVGA,PP,PPO,PP%,PPA,PPOA,PK%,SH,SHA,PIM/G,oPIM/G,S,S%,SA,SV%,PDO
0,-1.482175,0.262896,2.22136,-1.964709,-0.392232,2.255336,2.257468,1.087139,-2.016137,-0.825723,0.879883,2.401232,0.605482,-0.358137,0.894188,-2.378947,1.290005,0.397546,1.268785,-0.106183,1.230323,1.083203,-0.677905,-1.340533,-0.04028,-0.576639,-0.307469,1.590443,-1.239816,1.587617,2.042848
1,-1.288848,0.558655,0.866872,-1.413856,0.710921,1.225726,1.224064,2.102989,1.494102,0.825723,0.879883,0.883066,0.918662,2.35949,1.703582,0.925146,1.786161,1.280983,1.315731,1.71409,0.585049,-0.912689,-0.677905,1.0968,-0.493429,-0.384426,2.210015,0.958479,1.750811,-0.098348,0.383495
2,-1.095521,1.248757,0.866872,-0.863003,-0.024515,0.931552,0.935672,0.071288,0.076505,0.825723,-1.319824,-0.141695,-1.2736,0.084268,0.77856,-0.066082,-1.190774,-0.706749,-1.055026,0.57642,-1.393792,-1.918028,-2.130557,1.0968,-1.626301,-1.249384,-0.17656,0.168524,1.005076,0.744635,0.715366
3,-0.902194,-1.905998,0.866872,-0.587576,-0.392232,0.784465,0.791476,0.338617,-1.138577,-0.825723,-0.769897,1.224653,1.231843,-0.421338,0.662932,-0.561696,-1.190774,-2.326382,-0.172447,-1.243853,-0.877573,0.437618,1.137911,-1.949867,-0.342379,-0.576639,0.145678,0.326515,0.351597,1.587617,0.604742
4,-0.708866,0.16431,0.595975,-0.31215,-0.392232,0.49029,0.479051,1.247536,-0.261018,0.0,-0.769897,1.034883,-1.899961,0.747874,1.587955,0.594737,-0.363848,-1.369326,0.329872,-1.698921,-1.135683,0.654455,-0.677905,-1.340533,-0.795528,-0.768852,0.014769,1.432452,0.167086,0.323143,0.936613


In [39]:
# stitch scaled data back onto original dataframe
allData=pd.concat([allData[['Team','year','playoff_rank']],scaled],axis=1)

# Train/Test split
* train on < 2016 seasons
* test on 2017 season

In [40]:
x_train = allData[allData.year<2017].drop('playoff_rank', axis=1)
y_train = allData[allData.year<2017].playoff_rank

x_test = allData[allData.year==2017].drop('playoff_rank', axis=1)
y_test = allData[allData.year==2017].playoff_rank

In [41]:
x_train.shape

(138, 33)

In [42]:
x_train.head()

Unnamed: 0,Team,year,reg_rank,AvAge,W,L,OL,PTS,PTS%,GF,GA,SOW,SOL,SRS,SOS,TG/G,EVGF,EVGA,PP,PPO,PP%,PPA,PPOA,PK%,SH,SHA,PIM/G,oPIM/G,S,S%,SA,SV%,PDO
30,Washington Capitals,2016,-1.690309,-0.814649,2.442906,-2.615911,-0.523424,2.658474,2.655248,1.330821,-1.44944,0.476731,-0.823387,2.093533,-0.407846,0.216425,1.416665,-0.686305,0.742781,-0.360504,1.012996,-1.250114,0.166197,1.259465,-1.501074,-0.199557,-0.106531,-0.675245,0.213569,1.227402,-0.514581,0.948122,0.927127
31,Dallas Stars,2016,-1.47902,0.732153,1.015234,-1.151781,-0.130856,1.13539,1.139902,2.349306,1.944818,-0.953463,-0.823387,0.857195,0.062746,2.907499,1.870401,1.542419,1.265049,0.286052,1.121566,-0.236508,-0.210098,0.13425,1.404231,3.126393,-0.43944,-0.675245,1.300485,1.588403,-0.120046,-2.115042,-0.70161
33,Pittsburgh Penguins,2016,-1.056443,0.886834,0.539343,-0.273304,-0.523424,0.443079,0.438772,0.855528,-0.532073,0.476731,0.411693,1.104462,0.062746,0.344571,0.736061,-0.049527,-0.475844,0.227275,-0.648582,-0.912245,0.213234,0.980101,1.767394,-0.199557,-0.506022,-0.578781,2.234851,-0.577601,0.537511,0.948122,0.175402
34,Anaheim Ducks,2016,-0.845154,-1.588051,0.063452,-0.56613,0.65428,0.304617,0.303069,-0.977746,-1.541176,-0.238366,0.411693,-0.032969,-1.819622,-1.62034,-1.532619,-0.686305,0.916871,-0.713172,1.461434,-1.419048,1.765454,2.066516,0.314741,-0.864747,2.090671,1.446953,-0.043858,-0.938602,-1.263185,0.729325,-1.453334
35,Florida Panthers,2016,-0.633866,1.196194,0.301398,-0.273304,-0.130856,0.304617,0.303069,0.448134,-0.532073,2.622022,-0.205847,0.708834,-0.407846,0.045563,0.584815,-1.535343,-0.649934,1.226499,-1.3472,1.62177,0.730641,-0.944405,-0.411585,-0.199557,0.62587,0.578781,-1.235652,1.046902,0.365535,0.729325,1.678852


In [43]:
x_train.shape, x_test.shape

((138, 33), (15, 33))

# Fit Linear Model

In [44]:
reg = linear_model.LinearRegression()

In [45]:
reg.fit(X=x_train.drop(['year','Team'], axis=1), y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Fit Neural Network

In [58]:
mlp = MLPRegressor(hidden_layer_sizes=(100,100), max_iter=2000)

In [59]:
mlp.fit(X=x_train.drop(['year','Team'], axis=1), y=y_train)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [73]:
predictions_2017 = pd.DataFrame({'Team': x_test.Team,
                                 'Actual': y_test,
                                 'LM' : reg.predict(X=x_test.drop(['year','Team'], axis=1)),
                                 'NN' : mlp.predict(X=x_test.drop(['year','Team'], axis=1))})
predictions_2017

Unnamed: 0,Actual,LM,NN,Team
0,6.0,3.307495,0.515743,Washington Capitals
1,1.0,9.356636,8.448861,Pittsburgh Penguins
2,15.0,9.271696,12.725988,Chicago Blackhawks
3,13.0,4.831136,-0.096329,Columbus Blue Jackets
4,14.0,6.633041,11.704651,Minnesota Wild
5,4.0,7.938779,7.689149,Anaheim Ducks
6,9.0,7.938753,9.015216,Montreal Canadiens
7,5.0,8.136691,6.611793,Edmonton Oilers
8,8.0,8.537563,9.522855,New York Rangers
10,11.0,6.493228,12.314592,San Jose Sharks


# Convert Regression to ordered class

In [86]:
# convert regression to ordered classification
def reg_to_ord(array):
    temp = array.argsort()
    ranks = np.empty_like(temp)
    ranks[temp] = np.arange(len(array))
    return ranks+1 # plus 1 so index starts at 1

In [87]:
reg_to_ord(predictions_2017.NN)

array([ 2,  6, 13,  1, 11,  5,  7,  4,  8, 12, 15, 14,  3, 10,  9], dtype=int64)

# 2017 Predictions
* LM, NN are the predictions from linear model, and neural network, respectively.

In [88]:
predictions_2017[['LM','NN']]=predictions_2017[['LM','NN']].apply(reg_to_ord)
predictions_2017.sort_values('Actual')

Unnamed: 0,Actual,LM,NN,Team
1,1.0,12,6,Pittsburgh Penguins
16,2.0,13,10,Nashville Predators
11,3.0,15,15,Ottawa Senators
5,4.0,7,5,Anaheim Ducks
7,5.0,8,4,Edmonton Oilers
0,6.0,1,2,Washington Capitals
8,8.0,9,8,New York Rangers
6,9.0,6,7,Montreal Canadiens
12,10.0,10,14,Toronto Maple Leafs
10,11.0,4,12,San Jose Sharks


In [89]:
# note, where is 7th place team from 2017?