In [2]:
# Load data

import numpy as np
import pandas as pd
import altair as alt

df_skaters = pd.read_csv('C:/Users/Admin/Desktop/Career/Projects/NHL Project/Project/data/game_skater_stats.csv')
df_skaters.head()

Unnamed: 0,game_id,player_id,team_id,timeOnIce,assists,goals,shots,hits,powerPlayGoals,powerPlayAssists,...,faceoffTaken,takeaways,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,shortHandedTimeOnIce,powerPlayTimeOnIce
0,2016020045,8468513,4,955,1,0,0,2.0,0,0,...,0,1.0,1.0,0,0,1.0,1,858,97,0
1,2016020045,8476906,4,1396,1,0,4,2.0,0,0,...,0,1.0,2.0,0,0,2.0,0,1177,0,219
2,2016020045,8474668,4,915,0,0,1,1.0,0,0,...,0,2.0,0.0,0,0,0.0,-1,805,0,110
3,2016020045,8473512,4,1367,3,0,0,0.0,0,2,...,27,0.0,0.0,0,0,0.0,-1,1083,19,265
4,2016020045,8471762,4,676,0,0,3,2.0,0,0,...,0,0.0,1.0,0,0,0.0,-1,613,63,0


In [3]:
# Drop duplicate rows

df_skaters = df_skaters.drop_duplicates()

In [4]:
# Replace NAs with 0s

df_skaters = df_skaters.fillna(0)

In [5]:
# Create data frame for question

df_skaters_toi = df_skaters[['player_id', 'timeOnIce', 'assists', 'goals']].groupby('player_id').sum()
df_skaters_toi['points'] = df_skaters_toi['assists'] + df_skaters_toi['goals']

In [6]:
###

# Merge with player data to add name and position

df_position = pd.read_csv('C:/Users/Admin/Desktop/Career/Projects/NHL Project/Project/data/player_info.csv')
df_skaters_toi = df_skaters_toi.merge(df_position[['player_id', 'firstName', 'lastName', 'primaryPosition']], left_on='player_id', right_on='player_id', how='inner')
df_skaters_toi.head()

Unnamed: 0,player_id,timeOnIce,assists,goals,points,firstName,lastName,primaryPosition
0,8444894,51806,12,11,23,Greg,Adams,LW
1,8444919,232930,37,4,41,Tommy,Albelin,D
2,8445000,322029,74,88,162,Dave,Andreychuk,LW
3,8445176,214915,82,59,141,Donald,Audette,RW
4,8445266,329428,23,7,30,Murray,Baron,D


In [7]:
# Drop goalies

df_skaters_toi = df_skaters_toi[df_skaters_toi['primaryPosition'] != 'G']

In [8]:
df_skaters_toi.head()

Unnamed: 0,player_id,timeOnIce,assists,goals,points,firstName,lastName,primaryPosition
0,8444894,51806,12,11,23,Greg,Adams,LW
1,8444919,232930,37,4,41,Tommy,Albelin,D
2,8445000,322029,74,88,162,Dave,Andreychuk,LW
3,8445176,214915,82,59,141,Donald,Audette,RW
4,8445266,329428,23,7,30,Murray,Baron,D


In [9]:
# Convert to binary outcome

def map_position(primary_position):
    if primary_position in ['LW', 'RW', 'C']:
        return 0
    elif primary_position == 'D':
        return 1
    else:
        return 'unknown'

df_skaters_toi['position'] = df_skaters_toi['primaryPosition'].apply(map_position)

In [10]:
# Train-test split

from sklearn.model_selection import train_test_split

X = df_skaters_toi[['points', 'timeOnIce']]
y = df_skaters_toi['position']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [11]:
# Visualise data to help determine kernel

df_skaters_toi['position'] = df_skaters_toi['position'].astype('category')

vis = alt.Chart(df_skaters_toi).mark_point().encode(
    x='timeOnIce',
    y='points',
    shape='position'
).properties(
    title='Points vs Time on Ice by Position'
)

vis

In [15]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.svm import SVC

# The model takes too long to run, so let's take a subsample of it
subsample_fraction = 0.5

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Subsample the training data
subsample_size = int(len(X_train) * subsample_fraction)
subsample_indices = np.random.choice(len(X_train), subsample_size, replace=False)

# Use the subsample_indices to select the rows
X_train_subsample = X_train.iloc[subsample_indices]
y_train_subsample = y_train.iloc[subsample_indices]

# Some minor hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
}

# Use StratifiedKFold with fewer folds for cross-validation
cv = StratifiedKFold(n_splits=3)

# Using grid search for hyperparameter tuning
grid_search = GridSearchCV(SVC(probability=False), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the model on the subsampled data
grid_search.fit(X_train_subsample, y_train_subsample)

# Get the best parameters
best_param = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", best_param)

# Make predictions with the best model
y_pred = best_model.predict(X_test)

Best Parameters: {'C': 10, 'kernel': 'linear'}


In [16]:
# Print some classification metrics

from sklearn.metrics import classification_report, confusion_matrix

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[316 131]
 [ 67 157]]

Classification Report
              precision    recall  f1-score   support

           0       0.83      0.71      0.76       447
           1       0.55      0.70      0.61       224

    accuracy                           0.70       671
   macro avg       0.69      0.70      0.69       671
weighted avg       0.73      0.70      0.71       671

