In [6]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from PIL import Image
import matplotlib.pyplot as plt
import math

from sklearn.tree import DecisionTreeClassifier


# 0. Read dataset

In [7]:
# Relative path of the CSV file that the next cell loads into `df`.
path_csv = r"Dataset/tidyData_fe3.csv"

In [8]:
# Load the CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(path_csv)
print(f"Shape of df: {df.shape}")
# Display five randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
df.sample(5)

Shape of df: (292537, 31)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,is_rebound,Change in shot angle,Speed
20987,20987,2017020562,Pittsburgh Penguins,1,REGULAR,03:37,16:23,2017-12-28 00:13:51+00:00,29,Columbus Blue Jackets,left,CBJ,Shot,-49.0,-14.0,Matt Murray,Seth Jones,Slap Shot,0,False,42.37924,-0.336675,0,Takeaway,-96.0,6.0,5.0,51.078371,True,-0.87163,10.215674
36217,36217,2017020236,Philadelphia Flyers,1,REGULAR,04:42,15:18,2017-11-10 00:18:24+00:00,16,Chicago Blackhawks,right,CHI,Shot,45.0,-17.0,Brian Elliott,Tommy Wingels,Snap Shot,0,False,47.169906,-0.368696,0,Takeaway,2.0,-2.0,4.0,45.54119,True,0.01818,11.385298
114768,114768,2019020221,New York Rangers,2,REGULAR,13:57,06:03,2019-11-05 01:36:55+00:00,3,New York Rangers,right,NYR,Shot,43.0,-6.0,Anders Nilsson,Adam Fox,Wrist Shot,0,False,46.389654,-0.129703,0,Missed Shot,61.0,28.0,5.0,38.470768,True,-0.231391,7.694154
241892,241892,2016020053,Pittsburgh Penguins,2,REGULAR,00:12,19:48,2016-10-21 00:03:06+00:00,28,San Jose Sharks,right,SJS,Shot,69.0,32.0,Marc-Andre Fleury,Joe Thornton,Wrist Shot,0,False,37.735925,1.012197,0,Faceoff,0.0,0.0,12.0,76.059187,False,0.0,6.338266
124890,124890,2019020587,Winnipeg Jets,2,REGULAR,03:01,16:59,2019-12-28 02:05:21+00:00,19,St. Louis Blues,left,STL,Shot,-73.0,0.0,Connor Hellebuyck,Alexander Steen,Wrist Shot,0,False,16.0,0.0,0,Missed Shot,-37.0,-21.0,4.0,41.677332,True,0.13944,10.419333


# 1. Add more feature

- In this section, we add a new feature called `attacking_zone_shot`. This feature indicates whether the shot was taken inside the attacking zone.
- We chose to analyze this feature based on the observation that shots taken in the attacking zone have a higher chance of becoming goals.
- **After analyzing**, we find that 94.5% of shots happened inside the attacking zone and 5.5% happened outside it.

In [9]:
def Is_Shot_In_Attacking_Zone(x_coordinate_shot, attacking_side):
    """Tell whether a shot's x-coordinate lies inside the attacking zone.

    Parameters
    ----------
    x_coordinate_shot : number
        x-coordinate of the shot.
    attacking_side : object
        Side being attacked; compared case-insensitively (after `str()`)
        against "left" and "right".

    Returns
    -------
    bool or None
        True when the coordinate is within the inclusive range for the given
        side ([-89, -25] for "left", [25, 89] for "right"), False when it is
        outside that range, and None when the side is neither "left" nor
        "right".
    """
    # Inclusive (lower, upper) x-range per side: from the net's x-coordinate
    # (+/-89) to the attacking-zone boundary (+/-25).
    zone_bounds_by_side = {
        "left": (-89, -25),
        "right": (25, 89),
    }

    side_key = str(attacking_side).lower()
    if side_key not in zone_bounds_by_side:
        return None

    lower_x, upper_x = zone_bounds_by_side[side_key]
    return lower_x <= x_coordinate_shot <= upper_x

In [10]:
# Flag every shot as inside/outside the attacking zone.
# Only two columns are needed, so iterate over them directly instead of using
# `df.apply(..., axis=1)`, which materialises a full row Series for every row.
df['attacking_zone_shot'] = [
    Is_Shot_In_Attacking_Zone(x_coordinate, attacking_side)
    for x_coordinate, attacking_side in zip(df['x-coordinate'], df['attackingSide'])
]

In [11]:
# Re-check the shape after adding `attacking_zone_shot` and display five
# randomly chosen rows (unseeded, so the rows vary between runs).
print(f"Shape of df: {df.shape}")
df.sample(5)

Shape of df: (292537, 32)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,is_rebound,Change in shot angle,Speed,attacking_zone_shot
87565,87565,2019020044,Buffalo Sabres,3,REGULAR,11:04,08:56,2019-10-10 01:20:05+00:00,7,Buffalo Sabres,right,BUF,Shot,50.0,39.0,Keith Kinkaid,Marco Scandella,Wrist Shot,0,False,55.154329,0.785398,0,Missed Shot,-49.0,0.0,22.0,106.404887,True,1.203622,4.836586,True
1155,1155,2017020211,Los Angeles Kings,3,REGULAR,16:56,03:04,2017-11-05 05:15:26+00:00,18,Nashville Predators,right,NSH,Shot,85.0,-10.0,Darcy Kuemper,Viktor Arvidsson,Backhand,0,False,10.77033,-1.19029,0,Faceoff,69.0,22.0,6.0,35.777088,True,-1.642104,5.962848,True
260937,260937,2016020991,Arizona Coyotes,2,REGULAR,01:58,18:02,2017-03-10 03:08:26+00:00,9,Ottawa Senators,right,OTT,Shot,79.0,-21.0,Mike Smith,Mark Stone,Wrist Shot,0,False,23.259407,-1.126377,0,Shot,-56.0,-29.0,32.0,135.236829,True,-0.405406,4.226151,True
134187,134187,2018020489,Pittsburgh Penguins,3,REGULAR,05:34,14:26,2018-12-15 02:07:34+00:00,5,Pittsburgh Penguins,right,PIT,Shot,48.0,-13.0,Jaroslav Halak,Derick Brassard,Snap Shot,0,False,43.011626,-0.307046,0,Hit,61.0,39.0,54.0,53.600373,True,-1.146935,0.9926,True
252399,252399,2016020532,Buffalo Sabres,1,REGULAR,05:16,14:44,2016-12-30 00:18:07+00:00,6,Boston Bruins,left,BOS,Shot,-69.0,-1.0,Robin Lehner,Dominic Moore,Tip-In,0,False,20.024984,-0.049958,0,Blocked Shot,-59.0,-5.0,12.0,10.77033,True,0.155437,0.897527,True


In [13]:
# Share of rows per `attacking_zone_shot` value; normalize=True returns
# proportions instead of raw counts.
df['attacking_zone_shot'].value_counts(normalize=True)

attacking_zone_shot
True     0.944946
False    0.055054
Name: proportion, dtype: float64

# 2. Add more feature

- This dataset is essentially a time series, which means that events from past matches can influence those in the present. For instance, if a team scored many goals in the preceding match, it suggests a strong attacking strategy, so there is a good chance the team will repeat this success and score heavily in the current match.
- In this section, we add a feature called `recent_goal_scoring`, which is the number of goals scored by the team from the beginning of the current season up to now. Note: the code below currently computes the team's total number of goals in the *previous* season and stores it in a column named `previous_season_goal`.

In [17]:
# Number of rows per event type.
df['eventType'].value_counts()

eventType
Shot    264895
Goal     27642
Name: count, dtype: int64

In [14]:
def Is_In_Season(gamepk, current_season):
    """Check whether a game id starts with the given season.

    Both arguments are converted with `str()`; the result is True when the
    first four characters of `gamepk` equal `current_season`, otherwise False.
    """
    season_prefix = str(gamepk)[0:4]
    return season_prefix == str(current_season)

In [24]:
# Preview the first rows of the frame, now including `attacking_zone_shot`.
df.head()

Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,is_rebound,Change in shot angle,Speed,attacking_zone_shot
0,0,2017020495,Vancouver Canucks,1,REGULAR,00:13,19:47,2017-12-16 03:08:32+00:00,23,Vancouver Canucks,left,VAN,Shot,-78.0,4.0,Martin Jones,Sam Gagner,Wrist Shot,0,False,11.7047,0.348771,0,Faceoff,0.0,0.0,13.0,78.102497,False,0.0,0.0,True
1,1,2017020495,Vancouver Canucks,1,REGULAR,00:14,19:46,2017-12-16 03:08:41+00:00,23,Vancouver Canucks,left,VAN,Shot,-81.0,0.0,Martin Jones,Sam Gagner,Wrist Shot,0,False,8.0,0.0,0,Shot,-78.0,4.0,1.0,5.0,True,-0.348771,5.0,True
2,2,2017020495,Vancouver Canucks,1,REGULAR,00:44,19:16,2017-12-16 03:09:40+00:00,23,Vancouver Canucks,left,VAN,Goal,-79.0,-1.0,Martin Jones,Markus Granlund,Deflected,0,Power Play,10.049876,-0.099669,1,Faceoff,-69.0,22.0,29.0,25.079872,True,-0.099669,0.864823,True
3,3,2017020495,Vancouver Canucks,1,REGULAR,01:20,18:40,2017-12-16 03:10:50+00:00,23,Vancouver Canucks,left,VAN,Shot,-67.0,20.0,Martin Jones,Brock Boeser,Wrist Shot,0,False,29.732137,0.737815,0,Faceoff,0.0,0.0,36.0,69.921384,False,0.0,1.942261,True
4,4,2017020495,Vancouver Canucks,1,REGULAR,02:08,17:52,2017-12-16 03:12:03+00:00,23,Vancouver Canucks,left,VAN,Shot,-38.0,-1.0,Martin Jones,Alex Biega,Slap Shot,0,False,51.009803,-0.019605,0,Giveaway,-64.0,36.0,3.0,45.221676,True,-0.75742,15.073892,True


In [None]:
# Build the `previous_season_goal` feature: for every row, the number of "Goal"
# events in this dataset for the same `teamName` in the season before the one
# the row's game belongs to. The season is read from the first four characters
# of `gamePk`.
#
# Goals are counted once per (team, season) and then looked up per row. This
# replaces a row-by-row loop that re-filtered the whole frame (with a row-wise
# `apply`) whenever the team or season changed and printed a progress line
# every 10 rows.

# Season prefix of every row as a 4-character string, e.g. "2017".
season_prefix = df['gamePk'].astype(str).str[0:4]

# Count "Goal" events per (teamName, season prefix). Plain arrays are used so
# the result does not depend on the index of `df`.
is_goal_event = df['eventType'] == "Goal"
goal_events = pd.DataFrame({
    'teamName': df.loc[is_goal_event, 'teamName'].to_numpy(),
    'season': season_prefix[is_goal_event].to_numpy(),
})
goals_per_team_season = (
    goal_events
    .groupby(['teamName', 'season'])
    .size()
    .to_dict()  # {(teamName, season prefix): number of goals}
)

# Look up each row's (team, previous season). A team with no goals recorded
# for that season -- including seasons absent from the dataset -- gets 0,
# which is what filtering and counting rows would produce.
list_previous_season_goal = [
    goals_per_team_season.get((team_name, str(int(season) - 1)), 0)
    for team_name, season in zip(df['teamName'], season_prefix)
]

assert len(list_previous_season_goal) == df.shape[0]
df['previous_season_goal'] = list_previous_season_goal