In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
# import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# load the raw stat table; the file is decoded as ISO-8859-1 (Latin-1)
nhlRaw = pd.read_csv(
    'NHL Stat DB_2008_2017.csv',
    encoding="ISO-8859-1",
)

In [3]:
# normalise the column headers: spaces become underscores so columns can be
# reached with attribute access (e.g. nhlRaw.CY_Season_Rank)
nhlRaw = nhlRaw.rename(columns=lambda header: header.replace(' ', '_'))

In [4]:
# keep only the rows with CY_Season_Rank below 18
# (intended to be the teams that made the playoffs)
# TODO: remove tampa bay
nhlRaw = nhlRaw.loc[nhlRaw['CY_Season_Rank'] < 18]

In [5]:
# the first seven columns are the labels; everything after them is stat data
labels = nhlRaw.iloc[:, :7]
# the stat frame keeps column 0 as well, followed by every column from
# position 7 to the end
nhlData = nhlRaw.iloc[:, [0] + list(range(7, nhlRaw.shape[1]))]

In [6]:
# convert values to z-scores, computed separately within each year.
# group_keys=False keeps the result on the original row index/order so it
# can later be paired row-for-row with `labels`.
nhlScale = nhlData.groupby('Year', group_keys=False)
# (alternative kept for reference: mean-centred range scaling,
#  (x - x.mean()) / (x.max() - x.min()))
# Use the DataFrame methods rather than np.mean / np.std: the numpy wrappers
# pass axis=None, which newer pandas reduces over the whole frame instead of
# per column. ddof=0 matches np.std's population standard deviation.
# A column that is constant within a year (including 'Year' itself) yields
# 0/0 = NaN here.
nhlScaled = nhlScale.apply(lambda x: (x - x.mean()) / x.std(ddof=0))

In [9]:
# merge the datasets back together if skipping PCA
# labels.reset_index(drop=True, inplace=True)
# nhlScaled.reset_index(drop=True, inplace=True)
# nhlScaled = pd.concat([labels, nhlScaled], axis = 1)

In [10]:
#nhlScaled

In [7]:
# now drop year from the dataframe
# (axis is passed by keyword: the positional form drop('Year', 1) is no
#  longer accepted by current pandas)
nhlScaled = nhlScaled.drop('Year', axis=1)
# zero out NaN and +/- inf so the clustering step receives finite values only
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)


In [8]:
# flip the frame so each stat becomes a row (the unit that gets clustered)
nhlCluster = nhlScaled.T

In [9]:
# cluster stat attributes (each row of nhlCluster is one stat)
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; 10 restarts with a fixed seed keeps runs comparable
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0).fit(nhlCluster)

In [10]:
# record which cluster each stat column was assigned to
pcaGroups = pd.DataFrame(dict(Group=kmeans.labels_, Stat=nhlScaled.columns))
# relabel the columns with their cluster id, replacing the stat names
# (several columns will now share the same label)
nhlScaled.columns = pcaGroups['Group']

In [11]:
# TODO: find a better imputation method rather than subbing a zero.
# Replace every non-finite entry with zero: NaN first, then the +/- inf
# values that the current scaling method can produce.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [12]:
# collapse every cluster of stats into its first principal component
pca = PCA(n_components=1)
groupIds = np.unique(pcaGroups['Group'])

pcaParts = []
for x in groupIds:
    # extract a group; the boolean mask always yields a 2-D frame, even when
    # the cluster holds a single stat (plain nhlScaled[x] would return a 1-D
    # Series in that case, which PCA cannot fit)
    pcN = nhlScaled.loc[:, nhlScaled.columns == x]
    # PCA on that group, converted to a one-column data frame
    pcaParts.append(pd.DataFrame(pca.fit_transform(pcN)))

# merge all groups side by side in one concat, one column per cluster id
pcaDF = pd.concat(pcaParts, axis=1)
pcaDF.columns = groupIds

In [13]:
# stitch the label columns and the component columns back into one frame;
# both sides get a fresh 0..n-1 index first so concat pairs rows by position
labels = labels.reset_index(drop=True)
pcaDF = pcaDF.reset_index(drop=True)
nhlScaled = pd.concat([labels, pcaDF], axis=1)

In [14]:
# TODO: find a better imputation method rather than subbing a zero.
# After the merge, blank out anything missing or infinite with zero
# (NaN first, then +/- inf).
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [25]:
# preview the merged label + component frame (first rows only, rather than
# dumping the whole table into the notebook)
nhlScaled.head(10)

Unnamed: 0,Year,Team,YrTm,Season,CY_Season_Rank,CY_Playoff_Rank,GP,0,1,2,3,4,5
0,2017,Anaheim Ducks,2017Anaheim Ducks,Regular,6,4,82,-1.866229,-1.480540,0.272732,2.082136,-2.553175,-1.387045
1,2017,Boston Bruins,2017Boston Bruins,Regular,13,10,82,8.254934,5.136323,-2.472288,-0.564807,-1.071066,-0.515102
2,2017,Calgary Flames,2017Calgary Flames,Regular,15,16,82,-3.000205,1.423986,-3.105769,-1.422211,-1.574673,1.409101
3,2017,Chicago Blackhawks,2017Chicago Blackhawks,Regular,3,15,82,-1.104653,-1.260905,1.941329,0.606175,-0.044339,-1.127692
4,2017,Columbus Blue Jackets,2017Columbus Blue Jackets,Regular,4,14,82,-0.861551,-1.223675,2.649074,2.538892,0.312004,0.980960
5,2017,Edmonton Oilers,2017Edmonton Oilers,Regular,8,6,82,0.317547,0.507953,0.292642,0.402152,0.611724,2.961267
6,2017,Minnesota Wild,2017Minnesota Wild,Regular,5,13,82,-0.345373,-0.672053,2.347463,0.534718,2.830032,-0.492060
7,2017,Montréal Canadiens,2017Montréal Canadiens,Regular,7,12,82,-0.349190,0.716633,-0.118342,1.841594,-1.675008,0.169122
8,2017,Nashville Predators,2017Nashville Predators,Regular,17,2,82,1.612642,-0.554200,-2.580721,-2.365545,-0.538123,1.131288
9,2017,New York Rangers,2017New York Rangers,Regular,9,8,82,-2.271166,-0.570376,0.572063,-0.806124,1.795360,-1.594297


In [15]:
# Hold out the current year as the 'test' set and treat earlier years as
# history. (A traditional approach would split the data 70/30 at random
# instead of prior year vs. current year.)

# rows belonging to the current year
X_test = nhlScaled.loc[nhlScaled['Year'] == 2017]
# the playoff rank is the target ...
y_test = X_test['CY_Playoff_Rank']
# ... so it must not remain among the features
X_test = X_test.drop('CY_Playoff_Rank', axis=1)

In [16]:
# save team names, drop the rest of the label data
X_test_names = X_test.Team
# Remove the six leading label columns by position. A plain slice is used
# because indexing the column Index with a nested list ([[...]]) is a 2-D
# lookup that current pandas rejects.
X_test = X_test.drop(X_test.columns[:6], axis=1)

In [17]:
# select all prior years for training and validation
nhl_train = nhlScaled[nhlScaled.Year < 2017]
nhl_train_y = nhl_train.CY_Playoff_Rank

# Drop the seven leading label columns (target included) by position. A
# plain slice replaces the nested-list index ([[...]]), which is a 2-D
# lookup that current pandas rejects.
nhl_train = nhl_train.drop(nhl_train.columns[:7], axis=1)

# random 67/33 split of the historical rows; the seed is fixed so the
# split is repeatable
X_train, X_validate, y_train, y_validate = train_test_split(nhl_train,
                                                            nhl_train_y,
                                                            test_size=0.33,
                                                            random_state=42)

In [18]:
# build the k-nearest-neighbours classifier
# TODO: optimize num neighbors
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='auto',
)

In [19]:
# train the classifier on the training split
# (X = feature rows, y = the matching target values)
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [20]:
# predict on the 'validation' rows and wrap the returned array in a
# single-column data frame
predicts = pd.DataFrame(knn.predict(X_validate))

In [21]:
# prepare actuals and predictions for a side-by-side merge:
# give both a fresh 0..n-1 index so they line up row by row
y_validate = y_validate.reset_index(drop=True)
predicts = predicts.reset_index(drop=True)

In [22]:
# put actual and predicted values next to each other for presentation,
# naming each column before the two pieces are joined
results = pd.concat(
    [y_validate.rename('Actual'), predicts.rename(columns={0: 'Predicted'})],
    axis=1,
)

In [23]:
# view results: actual vs. predicted value for each validation row
results

Unnamed: 0,Actual,Predicted
0,13,4
1,14,3
2,10,14
3,4,7
4,8,4
5,17,6
6,2,7
7,4,4
8,14,3
9,7,4


In [28]:
# absolute prediction error per validation row, computed once
absErr = (results['Actual'] - results['Predicted']).abs()
# average size of the miss, and its spread (np.std is the population form)
mean_err = np.mean(absErr)
sd_err = np.std(absErr)

In [29]:
# report the validation error summary; each label is separated from its
# value so the output reads "Mean: 4.88" rather than "Mean4.88"
print('Mean: ' + str(mean_err) + '\nSD: ' + str(sd_err))

Mean4.882352941176471
 SD3.8073564439278345
