In [43]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [44]:
# load the raw stats CSV; the file is Latin-1 (ISO-8859-1) encoded
nhlRaw = pd.read_csv(
    'NHL Stat DB_2008_2017.csv',
    encoding="ISO-8859-1",
)

In [45]:
# normalise the column names: swap spaces for underscores so the columns
# can be reached with attribute access (e.g. nhlRaw.CY_Season_Rank)
nhlRaw = nhlRaw.rename(columns=lambda name: name.replace(' ', '_'))

In [46]:
# keep only the rows whose season rank is below 18 (intent: playoff teams)
# TODO: remove tampa bay
# NOTE(review): the cutoff of 18 keeps ranks 1-17; confirm this is the
#               intended way to select the teams that made the playoffs
nhlRaw = nhlRaw.loc[nhlRaw['CY_Season_Rank'] < 18]

In [47]:
# separate the label/admin columns from the feature columns
#   labels  : the first seven columns
#   nhlData : column 0 (kept because the scaling step groups on 'Year')
#             followed by every column from position 7 onwards
labels = nhlRaw.iloc[:, :7]
nhlData = nhlRaw.iloc[:, np.r_[0, 7:nhlRaw.shape[1]]]

In [48]:
# scale every feature within its own season: (value - mean) / (max - min),
# computed per 'Year' group
# TODO: consider using z-score...divide by Std.Dev.
#
# group_keys=False keeps the original row index on the result so that it
# can be concatenated back onto `labels` by index later on.
nhlScale = nhlData.groupby('Year', group_keys=False)
# use the DataFrame's own reductions (column-wise by default) instead of
# np.mean/np.max/np.min, which forward axis=None and can collapse the whole
# group to a single scalar on newer pandas versions.
# a column that is constant within a season gives 0/0 -> NaN here; those
# values are replaced in a later step.
nhlScaled = nhlScale.apply(lambda x: (x - x.mean()) / (x.max() - x.min()))

In [49]:
# now drop year from the scaled frame (it is still available in `labels`)
# `axis` is passed by keyword: newer pandas versions no longer accept it
# positionally. errors='ignore' keeps the cell working when the grouped
# apply did not carry the 'Year' column through.
nhlScaled = nhlScaled.drop('Year', axis=1, errors='ignore')

In [50]:
# put the label columns and the scaled features side by side again
# (rows are matched on the index)
nhlScaled = pd.concat([labels, nhlScaled], axis='columns')

In [51]:
# substitute zero for every NaN and for every +/- inf value
# (the current scaling method can produce non-finite values)
# TODO: find a better imputation method rather than subbing a zero.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [52]:
# split into training (history) and test (current year) sets
# note: this is a split by season (prior years vs. 2017), not the
# traditional random 70/30 split

# --- 'Test' set: the current year -----------------------------------
nhlTest = nhlScaled.loc[nhlScaled['Year'] == 2017]
# targets: playoff rank
nhlTestclass = nhlTest['CY_Playoff_Rank']
# team names, kept for presenting the results
nhlTestTeams = nhlTest['Team']
# strip the seven leading admin/label columns, leaving only features
nhlTest = nhlTest.iloc[:, 7:len(nhlRaw.columns)]

# --- 'Training' set: all prior years --------------------------------
nhlTrain = nhlScaled.loc[nhlScaled['Year'] < 2017]
nhlTrainclass = nhlTrain['CY_Playoff_Rank']
nhlTrainTeams = nhlTrain['Team']
# strip the seven leading admin/label columns, leaving only features
nhlTrain = nhlTrain.iloc[:, 7:len(nhlRaw.columns)]

In [53]:
# create the k-nearest-neighbours classifier (k = 6)
# TODO: optimize num neighbors
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=6)

In [54]:
# train the classifier: X = scaled features of prior years,
# y = their playoff ranks
knn.fit(X=nhlTrain, y=nhlTrainclass)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [55]:
# predict the playoff rank for the 'test' (current year) rows and wrap
# the resulting array in a single-column dataframe
predicts = pd.DataFrame(knn.predict(nhlTest))

In [56]:
# give all three pieces a fresh 0..n-1 index so that the predictions
# line up row-for-row with the actual ranks and team names
nhlTestTeams = nhlTestTeams.reset_index(drop=True)
nhlTestclass = nhlTestclass.reset_index(drop=True)
predicts = predicts.reset_index(drop=True)

In [57]:
# build the presentation table: team, actual rank, predicted rank
results = pd.concat([nhlTestTeams, nhlTestclass, predicts], axis='columns')
# label the three columns
results.columns = ['Team', 'Actual', 'Predicted']

In [58]:
# view results: one row per test-set team with its actual playoff rank
# next to the rank predicted by the knn model
# knn not good in its current form :(
results

Unnamed: 0,Team,Actual,Predicted
0,Anaheim Ducks,4,8
1,Boston Bruins,10,15
2,Calgary Flames,16,14
3,Chicago Blackhawks,15,1
4,Columbus Blue Jackets,14,3
5,Edmonton Oilers,6,16
6,Minnesota Wild,13,7
7,Montréal Canadiens,12,4
8,Nashville Predators,2,9
9,New York Rangers,8,10
