In [182]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [183]:
# load the raw 2008-2017 stat table; the CSV is ISO-8859-1 (Latin-1) encoded
nhlRaw = pd.read_csv(
    filepath_or_buffer='NHL Stat DB_2008_2017.csv',
    encoding="ISO-8859-1",
)

In [184]:
# normalise the column names: swap spaces for underscores so columns can be
# reached with attribute access (e.g. nhlRaw.CY_Season_Rank)
nhlRaw.columns = list(map(lambda name: name.replace(' ', '_'), nhlRaw.columns))

In [185]:
# keep only the rows whose regular-season rank is below 18
# (intended as "teams that made the playoffs")
# TODO: remove tampa bay
nhlRaw = nhlRaw.loc[nhlRaw['CY_Season_Rank'].lt(18)]

In [186]:
# separate the label columns (the first 7) from the stat columns (the rest);
# column 0 is also carried into the stat frame (presumably 'Year', which the
# scaling step groups on)
labels = nhlRaw.iloc[:, :7]
nhlData = nhlRaw.iloc[:, np.r_[0, 7:nhlRaw.shape[1]]]

In [187]:
# z-score every stat within its own season (rows grouped by 'Year').
# group_keys=False keeps the result on the original row index/order: on
# pandas >= 2.0 apply() would otherwise prepend the group key and reorder the
# rows by year, and later cells pair these rows with `labels` by position.
nhlScale = nhlData.groupby('Year', group_keys=False)
# alternative scaling (mean-centred, divided by the range), kept for reference:
# nhlScaled = nhlScale.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
# Use the frame's own column-wise reductions rather than np.mean / np.std:
# numpy calls the reduction with axis=None, which pandas >= 2.0 treats as
# "reduce over both axes" for mean (one scalar for the whole group) and
# deprecates for std, so the per-column z-score would silently break.
# ddof=0 reproduces np.std's population standard deviation.
# NOTE: the original author was not yet sure the z-score was behaving properly;
# non-finite results of the division (NaN / +-inf) are zero-filled in later cells.
nhlScaled = nhlScale.apply(lambda x: (x - x.mean()) / x.std(ddof=0))

In [193]:
# merge the datasets back together if skipping PCA
# labels.reset_index(drop=True, inplace=True)
# nhlScaled.reset_index(drop=True, inplace=True)
# nhlScaled = pd.concat([labels, nhlScaled], axis = 1)

In [194]:
#nhlScaled

Unnamed: 0,Year,Team,YrTm,Season,CY_Season_Rank,CY_Playoff_Rank,GP,W,L,OT,...,NZFO%,GF-GA,GF/W,GF/L,GA/W,GA/L,PY_Playoff_RNK,2PY_Playoff_RNK,3PY_Playoff_RNK,4PYPlayoff_RNK
0,2017,Anaheim Ducks,2017Anaheim Ducks,Regular,6,4,82,-0.128999,-0.985064,1.531283,...,-2.244852,-0.273502,-0.878724,0.172077,-0.467434,0.514530,-0.525918,-1.060270,-0.995883,-0.477690
1,2017,Boston Bruins,2017Boston Bruins,Regular,13,10,82,-0.677243,1.247747,-0.765641,...,-0.185289,-0.273502,0.279414,-0.919851,0.326237,-1.239955,0.305766,0.599890,-0.870475,-1.313648
2,2017,Calgary Flames,2017Calgary Flames,Regular,15,16,82,-0.403121,1.805950,-1.914104,...,-0.427591,-1.203408,-0.522566,-1.316781,0.524347,-1.341544,1.241410,-0.585939,1.763082,1.433071
3,2017,Chicago Blackhawks,2017Chicago Blackhawks,Regular,3,15,82,0.967490,-0.985064,0.000000,...,0.905067,-0.041025,-0.837629,0.628273,-0.539787,1.142096,-0.421957,-1.297436,-1.246698,-1.433071
4,2017,Columbus Blue Jackets,2017Columbus Blue Jackets,Regular,4,14,82,0.967490,-0.705962,-0.382821,...,-1.760249,1.167853,-0.506813,0.553190,-1.185060,0.010734,1.345371,1.192804,-0.118031,0.477690
5,2017,Edmonton Oilers,2017Edmonton Oilers,Regular,8,6,82,0.145124,-0.147760,0.000000,...,0.178163,0.330937,0.037175,0.057150,-0.260867,-0.066371,1.553292,1.904301,1.888489,1.313648
6,2017,Minnesota Wild,2017Minnesota Wild,Regular,5,13,82,0.693368,-0.426861,-0.382821,...,1.026218,1.307339,0.502998,0.672980,-0.600780,0.201584,-0.214036,-0.467356,-0.619660,0.119423
7,2017,Montréal Canadiens,2017Montréal Canadiens,Regular,7,12,82,0.145124,-0.147760,0.000000,...,0.057012,-0.180511,-0.968347,-0.346408,-0.586033,-0.399464,0.721608,-0.704522,-1.121290,0.238845
8,2017,Nashville Predators,2017Nashville Predators,Regular,17,2,82,-1.499610,0.689545,1.148462,...,-0.548742,-0.505978,1.536873,-0.540533,1.372010,-0.427540,-0.837799,-0.111607,0.759822,1.671916
9,2017,New York Rangers,2017New York Rangers,Regular,9,8,82,0.419246,0.410443,-1.148462,...,0.662766,0.377433,0.274939,-0.105714,-0.098285,-0.304294,0.201806,-0.941687,-1.372105,-0.716535


In [189]:
# 'Year' was only needed as the grouping key, so drop it from the dataframe.
# The axis is passed by keyword: the positional form drop('Year', 1) is
# rejected by pandas >= 2.0.
nhlScaled = nhlScaled.drop('Year', axis = 1)
# zero-fill missing values and the +/- inf values the scaling can produce
# *before* clustering: the KMeans cell below runs ahead of the later inf
# clean-up cell and refuses non-finite input.
nhlScaled = nhlScaled.fillna(0)
nhlScaled = nhlScaled.replace([np.inf, -np.inf], 0)


In [9]:
# flip to stats-as-rows so the clustering step groups stats rather than teams
nhlCluster = nhlScaled.T

In [10]:
# cluster the stat attributes into 6 groups (fixed seed for repeatability)
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(nhlCluster)

In [11]:
# lookup table: which cluster each stat was assigned to
pcaGroups = pd.DataFrame(
    {
        'Group' : kmeans.labels_,
        'Stat' : nhlScaled.columns,
    }
)
# relabel the stat columns with their cluster id, so all stats of one group
# share a column name
# NOTE: this overwrites the stat names in place, so re-running this cell
# would record cluster ids in the 'Stat' column
nhlScaled.columns = pcaGroups.Group

In [190]:
# zero-fill anything non-finite in one pass:
#   * NaN      -> 0
#   * +/- inf  -> 0  (the current scaling method produces some of these)
# TODO: find a better imputation method rather than subbing
#       a zero.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [13]:
# collapse every cluster of stats into a single feature: the first principal
# component of the columns belonging to that cluster
pca = PCA(n_components=1)

group_ids = np.unique(pcaGroups['Group'])
components = []
for group_id in group_ids:
    # select the group's columns with a boolean mask: unlike nhlScaled[group_id],
    # this stays a 2-D frame even when the cluster holds a single stat
    # (a 1-D Series would be rejected by fit_transform)
    group_stats = nhlScaled.loc[:, nhlScaled.columns == group_id]
    # PCA on that group, wrapped back into a one-column data frame
    components.append(pd.DataFrame(pca.fit_transform(group_stats)))

# assemble all components side by side in one concat (instead of growing the
# frame inside the loop) and name each column after its group id
pcaDF = pd.concat(components, axis = 1)
pcaDF.columns = group_ids

In [14]:
# stitch the label columns back onto the per-group components; both frames are
# renumbered 0..n-1 first so the concat pairs rows up by position
labels = labels.reset_index(drop=True)
pcaDF = pcaDF.reset_index(drop=True)
nhlScaled = pd.concat([labels, pcaDF], axis='columns')

In [15]:
# same clean-up as before the PCA step, now on the merged frame:
# NaN and +/- inf (a by-product of the current scaling method) become 0
# TODO: find a better imputation method rather than subbing
#       a zero.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [195]:
# Hold-out scheme: history is used for training and the current year is the
# 'test' set (rather than the traditional random 70/30 split).

# 'Test' set: rows of the current year
X_test = nhlScaled.loc[nhlScaled['Year'] == 2017]
# the playoff ranks are the targets
y_test = X_test['CY_Playoff_Rank']
# ... and must not remain among the features
X_test = X_test.drop('CY_Playoff_Rank', axis='columns')

In [196]:
# save team names, then drop the 6 label columns still leading the frame
# (the playoff rank was already removed above) so only features remain
X_test_names = X_test.Team
# slice the column index directly: the former columns[[list(range(0,6,1))]]
# indexed with a nested list, i.e. a 2-D index, which current numpy/pandas reject
X_test = X_test.drop(X_test.columns[:6], axis = 1)

In [197]:
# select all prior years for training and validation
nhl_train = nhlScaled[nhlScaled.Year < 2017]
# playoff rank is the target
nhl_train_y = nhl_train.CY_Playoff_Rank

# drop the 7 leading label columns (target included) so only features remain.
# The column index is sliced directly: the former columns[[list(range(0,7,1))]]
# indexed with a nested list, i.e. a 2-D index, which current numpy/pandas reject
nhl_train = nhl_train.drop(nhl_train.columns[:7], axis = 1)

# hold out a third of the historical rows for validation (fixed seed)
X_train, X_validate, y_train, y_validate = train_test_split(nhl_train,
                                                            nhl_train_y,
                                                            test_size=0.33, 
                                                            random_state=42)




In [198]:
# build the (unfitted) k-nearest-neighbours classifier
# TODO: optimize num neighbors
knn = KNeighborsClassifier(
    algorithm='auto',
    n_neighbors=5,
)

In [199]:
# train the classifier: features first, targets second
# (fit returns the estimator, so the cell still displays its repr)
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [200]:
# predict on the 'validation' rows and wrap the result in a data frame
predicts = pd.DataFrame(knn.predict(X_validate))

In [201]:
# renumber both objects 0..n-1 so that actuals and predictions line up
# row-for-row when they are combined
y_validate = y_validate.reset_index(drop=True)
predicts = predicts.reset_index(drop=True)

In [202]:
# put the true ranks and the predicted ranks side by side for presentation
results = pd.concat(objs=[y_validate, predicts], axis='columns')
# label the two columns
results.columns = ['Actual', 'Predicted']

In [203]:
# view results
# knn not good in its current form :(
#results

In [204]:
# absolute rank error per validation row, then its mean and (population) SD
abs_err = (results['Actual'] - results['Predicted']).abs()
mean_err = np.mean(abs_err)
sd_err = np.std(abs_err)

In [205]:
# report the mean and standard deviation of the absolute error
print('Mean%s\n SD%s' % (mean_err, sd_err))

Mean5.549019607843137
 SD3.8872023585203435
