In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [5]:
# read the raw stat table from disk, decoding it with the ISO-8859-1 codec
nhlRaw = pd.read_csv(
    'NHL Stat DB_2008_2017.csv',
    encoding="ISO-8859-1",
)

In [6]:
# swap the spaces in every column header for underscores so the columns
# can be reached with attribute access (e.g. nhlRaw.CY_Season_Rank)
nhlRaw.columns = list(map(lambda header: header.replace(' ', '_'), nhlRaw.columns))

In [7]:
# keep only the rows whose season rank is below 18; this is the notebook's
# "made the playoffs" filter
# TODO: remove tampa bay
nhlRaw = nhlRaw.loc[nhlRaw['CY_Season_Rank'].lt(18)]

In [8]:
# the first seven columns are the label block
labels = nhlRaw.iloc[:, :7]
# the stat block is column 0 (presumably Year, which the scaling step groups
# on - confirm against the CSV layout) followed by every column from index 7 on
nhlData = nhlRaw.iloc[:, [0] + list(range(7, nhlRaw.shape[1]))]

In [9]:
# convert values to per-season z-scores: each stat is centred and scaled
# using only the rows that share its Year.
# group_keys=False keeps the original row index/order on every pandas version;
# the later positional merge with `labels` depends on that ordering.
nhlScale = nhlData.groupby('Year', group_keys=False)
# alternative min-max style scaling, kept for reference:
# nhlScaled = nhlScale.apply(lambda x: (x - x.mean()) / (x.max() - x.min()))
# Use the DataFrame's own column-wise reductions instead of np.mean/np.std:
# the numpy wrappers forward axis=None, which newer pandas interprets as
# "reduce over the whole frame" and would subtract a single scalar mean from
# every column. ddof=0 reproduces numpy's population standard deviation.
# A constant column (the grouped Year column included, when pandas passes it
# to the function) yields 0/0 = NaN here; those NaNs are handled downstream.
nhlScaled = nhlScale.apply(lambda x: (x - x.mean()) / x.std(ddof=0))

In [10]:
# merge the datasets back together if skipping PCA
# labels.reset_index(drop=True, inplace=True)
# nhlScaled.reset_index(drop=True, inplace=True)
# nhlScaled = pd.concat([labels, nhlScaled], axis = 1)

In [11]:
#nhlScaled

In [12]:
# now drop year from the dataframe
# (axis is passed by keyword: the positional form `drop('Year', 1)` was
#  deprecated and later removed from pandas)
nhlScaled = nhlScaled.drop('Year', axis=1)
# scaling leaves NaN wherever a stat had zero spread within a season;
# substitute zero so the clustering step receives finite input
nhlScaled = nhlScaled.fillna(0)


In [9]:
# flip the frame so each row is one stat (column of nhlScaled); the
# clustering step below then groups stats rather than team-seasons
nhlCluster = nhlScaled.T

In [10]:
# cluster stat attributes into six groups (fixed seed for repeatable labels)
kmeans = KMeans(random_state=0, n_clusters=6)
kmeans = kmeans.fit(nhlCluster)

In [11]:
# pair every stat name with the cluster id k-means assigned to it
pcaGroups = pd.DataFrame(dict(Group=kmeans.labels_, Stat=nhlScaled.columns))
# relabel the stat columns with their cluster ids; columns that share a
# cluster therefore end up sharing a column name
nhlScaled.columns = pcaGroups.Group

In [190]:
# replace NaN and +/- inf (both can come out of the current scaling method)
# with zero in a single pass
# TODO: find a better imputation method rather than subbing
#       a zero.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [13]:
# Collapse every cluster of stats into its first principal component, giving
# one derived feature column per cluster.
pca = PCA(n_components=1)

group_ids = np.unique(pcaGroups['Group'])
group_components = []
for x in group_ids:
    # extract a group with a boolean column mask: unlike nhlScaled[x], this
    # always yields a 2-D frame, even when a cluster holds a single stat
    # (PCA rejects 1-D input)
    pcN = nhlScaled.loc[:, nhlScaled.columns == x]
    # PCA on that group, wrapped back into a data frame
    pcN = pd.DataFrame(pca.fit_transform(pcN))
    group_components.append(pcN)
# merge all groups in one concat instead of growing a frame inside the loop
pcaDF = pd.concat(group_components, axis=1) if group_components else pd.DataFrame()
pcaDF.columns = group_ids

In [14]:
# glue the label columns back onto the PCA features.
# Both frames get a fresh 0..n-1 index first, so the concat pairs rows purely
# by position.
labels = labels.reset_index(drop=True)
pcaDF = pcaDF.reset_index(drop=True)
nhlScaled = pd.concat([labels, pcaDF], axis='columns')

In [15]:
# zero out NaN and +/- inf across the merged frame (label columns included)
# TODO: find a better imputation method rather than subbing
#       a zero.
nhlScaled = nhlScaled.fillna(0).replace([np.inf, -np.inf], 0)

In [195]:
# Hold-out strategy: the current season (2017) is the 'test' set and all
# earlier seasons are history, instead of the usual random 70/30 split.

# rows for the current year
X_test = nhlScaled.loc[nhlScaled['Year'] == 2017]
# the playoff rank is the prediction target ...
y_test = X_test['CY_Playoff_Rank']
# ... so it must not remain among the features
X_test = X_test.drop(labels='CY_Playoff_Rank', axis=1)

In [196]:
# save team names, then drop the six label columns that still lead the frame
# so that only the feature columns remain
X_test_names = X_test.Team
# slice the column index directly: the previous nested-list indexer
# (columns[[list(...)]]) is a 2-D index, which modern numpy/pandas reject
X_test = X_test.drop(X_test.columns[:6], axis=1)

In [197]:
# select all prior years for training and validation
nhl_train = nhlScaled[nhlScaled.Year < 2017]
# target: playoff rank of each historical team-season
nhl_train_y = nhl_train.CY_Playoff_Rank

# drop the seven leading label columns (target included) so only features
# remain; a plain slice replaces the nested-list indexer, which is a 2-D
# index that modern numpy/pandas reject
nhl_train = nhl_train.drop(nhl_train.columns[:7], axis=1)

# random 67/33 split of the history into fit and validation parts,
# seeded so the split is repeatable
X_train, X_validate, y_train, y_validate = train_test_split(nhl_train,
                                                            nhl_train_y,
                                                            test_size=0.33,
                                                            random_state=42)




In [198]:
# build the (not yet fitted) k-nearest-neighbours classifier
# TODO: optimize num neighbors
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=5)

In [199]:
# train the classifier: features first, then the matching targets
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [200]:
# predict on the 'validation' rows and wrap the resulting array in a
# one-column data frame
predicts = pd.DataFrame(knn.predict(X_validate))

In [201]:
# give actuals and predictions the same fresh 0..n-1 index so they can be
# lined up row by row when merged
y_validate = y_validate.reset_index(drop=True)
predicts = predicts.reset_index(drop=True)

In [202]:
# side-by-side table of actual vs. predicted values for presentation
results = pd.concat(objs=[y_validate, predicts], axis='columns')
# label the two columns
results.columns = ['Actual', 'Predicted']

In [203]:
# view results
# knn not good in its current form :(
#results

In [204]:
# mean and standard deviation of the absolute prediction error
mean_err, sd_err = (
    np.mean(np.abs(results['Actual'] - results['Predicted'])),
    np.std(np.abs(results['Actual'] - results['Predicted'])),
)

In [205]:
# report the two error statistics on separate lines
print(''.join(['Mean', str(mean_err), '\n SD', str(sd_err)]))

Mean5.549019607843137
 SD3.8872023585203435
