In [8]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from PIL import Image
import matplotlib.pyplot as plt
import math

from sklearn.tree import DecisionTreeClassifier


# 0. Read dataset

In [9]:
# path_csv = r"Dataset/tidyData_fe2.csv"
path_csv = r"Dataset/tidyData_fe3.csv"


In [10]:
df = pd.read_csv(path_csv)
print(f"Shape of df: {df.shape}")
df.sample(5)

Shape of df: (385076, 31)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,is_rebound,Change in shot angle,Speed
352618,352618,2016021154,Calgary Flames,2,REGULAR,04:07,15:53,2017-04-01T02:08:41Z,28,San Jose Sharks,left,SJS,Shot,-87.0,-14.0,Brian Elliott,Joe Pavelski,Backhand,0,False,14.142136,-1.428899,0,Faceoff,0.0,0.0,10.0,88.119237,False,0.0,8.811924
103280,103280,2019020053,Winnipeg Jets,1,REGULAR,16:13,03:47,2019-10-11T00:45:22Z,52,Winnipeg Jets,left,WPG,Shot,-38.0,33.0,Devan Dubnyk,Mason Appleton,Slap Shot,0,False,60.74537,0.574305,0,Faceoff,-20.0,-22.0,19.0,57.870545,True,1.18263,3.045818
367443,367443,2016020236,St. Louis Blues,1,REGULAR,04:19,15:41,2016-11-16T01:14:13Z,19,St. Louis Blues,right,STL,Shot,72.0,-24.0,Robin Lehner,Colton Parayko,Wrist Shot,0,False,29.410882,-0.954499,0,Faceoff,69.0,-22.0,15.0,3.605551,True,-0.76752,0.24037
88472,88472,2019020052,Nashville Predators,3,REGULAR,00:33,19:27,2019-10-11T02:02:06Z,18,Nashville Predators,right,NSH,Shot,59.0,-26.0,Braden Holtby,Mikael Granlund,Wrist Shot,0,False,39.698866,-0.714091,0,Faceoff,69.0,22.0,13.0,49.030603,False,0.0,3.771585
106822,106822,2019020998,Anaheim Ducks,1,REGULAR,11:17,08:43,2020-02-29T03:26:38Z,5,Pittsburgh Penguins,left,PIT,Shot,-42.0,24.0,John Gibson,Teddy Blueger,Backhand,0,False,52.773099,0.472122,0,Faceoff,20.0,22.0,15.0,62.03225,True,1.034044,4.135483


# 1. Add more feature

- In this section, we will add the new feature called `attacking_zone_shot`. This feature indicates whether the shot occurred in the attacking zone.
- We choose to analyze this feature based on the observation that the shot in the attacking zone has high chance to become a goal.
- **After analyzing**, we realize that 94.5% shot happend inside attacking zone, and 5% shot happened outside attacking zone.

In [11]:
def Is_Shot_In_Attacking_Zone(x_coordinate_shot, attacking_side):

    if str(attacking_side).lower() == "left":
        x_coordinate_net = -89
        attacking_zone_range = -25
        return x_coordinate_net <= x_coordinate_shot <= attacking_zone_range
    elif str(attacking_side).lower() == "right":
        x_coordinate_net = 89
        attacking_zone_range = 25
        return  attacking_zone_range <= x_coordinate_shot <= x_coordinate_net
    else:
        return None

In [12]:
df['attacking_zone_shot'] = df.apply(lambda x: Is_Shot_In_Attacking_Zone(x['x-coordinate'], x['attackingSide']), axis=1)

In [13]:
print(f"Shape of df: {df.shape}")
df.sample(5)

Shape of df: (385076, 32)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,is_rebound,Change in shot angle,Speed,attacking_zone_shot
384525,384525,2016030132,Washington Capitals,4,OVERTIME,12:01,07:59,2017-04-16T02:42:28Z,10,Toronto Maple Leafs,left,TOR,Shot,-74.0,1.0,Braden Holtby,Zach Hyman,Wrist Shot,0,False,15.033296,0.066568,0,Blocked Shot,68.0,3.0,6.0,142.014084,True,-0.469867,23.669014,True
23431,23431,2017020299,Tampa Bay Lightning,2,REGULAR,12:23,07:37,2017-11-19T01:21:44Z,2,New York Islanders,right,NYI,Shot,64.0,-12.0,Andrei Vasilevskiy,Jason Chimera,Wrist Shot,0,False,27.730849,-0.44752,0,Shot,-35.0,7.0,8.0,100.806746,True,-0.576431,12.600843,True
73716,73716,2017021120,Arizona Coyotes,2,REGULAR,02:48,17:12,2018-03-20T03:04:59Z,53,Arizona Coyotes,left,ARI,Shot,-69.0,21.0,Mike Smith,Brad Richardson,Wrist Shot,0,False,29.0,0.809784,0,Blocked Shot,-54.0,14.0,2.0,16.552945,True,0.273972,8.276473,True
383904,383904,2016030312,Pittsburgh Penguins,3,REGULAR,19:00,01:00,2017-05-16T02:37:11Z,9,Ottawa Senators,left,OTT,Shot,-48.0,-14.0,Marc-Andre Fleury,Kyle Turris,Slap Shot,0,False,43.324358,-0.32905,0,Faceoff,-69.0,22.0,16.0,41.677332,True,-0.607349,2.604833,True
322511,322511,2016020192,Colorado Avalanche,2,REGULAR,00:50,19:10,2016-11-09T03:07:36Z,53,Arizona Coyotes,left,ARI,Shot,-52.0,-28.0,Calvin Pickard,Tony DeAngelo,Wrist Shot,0,False,46.400431,-0.647811,0,Faceoff,0.0,0.0,12.0,59.059292,False,0.0,4.921608,True


In [14]:
df['attacking_zone_shot'].value_counts(normalize=True)

attacking_zone_shot
True     0.937678
False    0.062322
Name: proportion, dtype: float64

# 2. Feature selection

- In this section, I will use **correlation** and **mutual information** to perform the feature selection.

### a. Correlation

In [16]:
list_features = ['period', 'x-coordinate', 'y-coordinate', 'shot_distance', 'angle', 'last_event_type', 'coor_x_last_event',\
                 'coor_y_last_event', 'time_last_event', 'distance_last_event', 'is_rebound', 'Change in shot angle', 'Speed']

labels = 'isgoal'

In [37]:
X = df[list_features]
y = df[labels]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (385076, 13)
Shape of y: (385076,)


In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385076 entries, 0 to 385075
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   period                385076 non-null  int64  
 1   x-coordinate          385060 non-null  float64
 2   y-coordinate          385061 non-null  float64
 3   shot_distance         371046 non-null  float64
 4   angle                 371046 non-null  float64
 5   last_event_type       385076 non-null  object 
 6   coor_x_last_event     385070 non-null  float64
 7   coor_y_last_event     385070 non-null  float64
 8   time_last_event       385076 non-null  float64
 9   distance_last_event   385059 non-null  float64
 10  is_rebound            385076 non-null  bool   
 11  Change in shot angle  373781 non-null  float64
 12  Speed                 385066 non-null  float64
dtypes: bool(1), float64(10), int64(1), object(1)
memory usage: 35.6+ MB


In [39]:
def Remove_High_Corr_Features(X, threshold=0.95):
    correlation_matrix = X.corr().abs()

    print(f"Correlation matrix: {correlation_matrix}")

    upper_triangle = np.triu(np.ones(correlation_matrix.shape), k=1)

    correlated_pairs = np.where(correlation_matrix > threshold * upper_triangle)

    features_to_remove = set()
    for feature1, feature2 in zip(*correlated_pairs):
        features_to_remove.add(X.columns[feature1] if correlation_matrix.iloc[feature1, feature2] > correlation_matrix.iloc[feature2, feature1] else X.columns[feature2])

    X_filtered = X.drop(columns=features_to_remove, axis=1)

    return X_filtered

In [40]:
# Remove high correlation features
categorical_features = ['last_event_type', 'is_rebound']
numerical_features = [i for i in list(X.columns) if i not in categorical_features]

X_categorical = X[categorical_features]
X_numerical = X[numerical_features]

threshold = 0.95

print(f"Shape of X BEFORE: {X_numerical.shape}")
X_numerical = Remove_High_Corr_Features(X_numerical)
print(f"Shape of X AFTER: {X_numerical.shape}")

X = pd.concat([X_numerical, X_categorical])
print("-"*100)
print(f"Shape of X: {X.shape}")
X.sample(5)

Shape of X BEFORE: (385076, 11)
Correlation matrix:                         period  x-coordinate  y-coordinate  shot_distance  \
period                1.000000      0.000255      0.002402       0.001682   
x-coordinate          0.000255      1.000000      0.005629       0.003771   
y-coordinate          0.002402      0.005629      1.000000       0.008266   
shot_distance         0.001682      0.003771      0.008266       1.000000   
angle                 0.000164      0.030715      0.838728       0.012414   
coor_x_last_event     0.002749      0.403532      0.001728       0.003092   
coor_y_last_event     0.002683      0.004698      0.062554       0.001075   
time_last_event       0.024591      0.002110      0.009291       0.034327   
distance_last_event   0.018375      0.000979      0.004375       0.004898   
Change in shot angle  0.000251      0.014302      0.538909       0.007610   
Speed                 0.050699      0.002966      0.008595       0.028970   

                       

Unnamed: 0,last_event_type,is_rebound
238440,,
143693,,
177572,,
248453,,
119766,,


### b. Mutual information