In [6]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from PIL import Image
import matplotlib.pyplot as plt
import math

from sklearn.tree import DecisionTreeClassifier


# 0. Read dataset

In [18]:
# Relative path of the CSV file that holds the dataset.
path_csv = r"Dataset/tidyData_fe2.csv"

In [19]:
# Load the CSV into a DataFrame and report its dimensions.
df = pd.read_csv(path_csv)
print(f"Shape of df: {df.shape}")
# Preview five random rows; no random_state is passed, so the rows shown
# differ from run to run.
df.sample(5)

Shape of df: (385076, 28)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event
87058,87058,2017030414,Washington Capitals,2,REGULAR,15:23,04:37,2018-06-05T01:43:47Z,15,Washington Capitals,right,WSH,Goal,67.0,23.0,Marc-Andre Fleury,John Carlson,Slap Shot,0,Power Play,31.827661,0.807617,1,Hit,71.0,-7.0,2.0,30.265492
74584,74584,2017021032,Boston Bruins,1,REGULAR,19:34,00:26,2018-03-09T00:47:30Z,6,Boston Bruins,right,BOS,Goal,76.0,6.0,Alex Lyon,Brian Gionta,Backhand,0,Even,14.317821,0.432408,1,Hit,-96.0,13.0,12.0,172.142383
337829,337829,2016020593,Los Angeles Kings,2,REGULAR,18:34,01:26,2017-01-07T22:43:57Z,26,Los Angeles Kings,right,LAK,Shot,76.0,-28.0,Darcy Kuemper,Drew Doughty,Snap Shot,0,False,30.870698,-1.136126,0,Missed Shot,45.0,23.0,17.0,59.682493
297644,297644,2018020202,Nashville Predators,1,REGULAR,07:34,12:26,2018-11-04T00:23:48Z,6,Boston Bruins,left,BOS,Shot,-86.0,-29.0,Pekka Rinne,Brad Marchand,Wrist Shot,0,False,29.154759,-1.467715,0,Faceoff,-69.0,22.0,25.0,53.75872
367679,367679,2016021098,Chicago Blackhawks,3,REGULAR,13:24,06:36,2017-03-24T02:55:17Z,25,Dallas Stars,right,DAL,Shot,81.0,7.0,Corey Crawford,Tyler Seguin,Backhand,0,False,10.630146,0.71883,0,Faceoff,-69.0,-22.0,43.0,152.777616


# 1. Add more feature

- In this section, we will add the new feature called `attacking_zone_shot`. This feature indicates whether the shot occurred in the attacking zone.
- We chose to analyze this feature based on the observation that a shot taken in the attacking zone has a high chance of becoming a goal.
- **After analyzing**, we find that about 93.8% of shots happened inside the attacking zone and about 6.2% happened outside it.

In [20]:
def Is_Shot_In_Attacking_Zone(x_coordinate_shot, attacking_side):
    """Tell whether a shot's x-coordinate lies inside the attacking zone.

    The zone is the closed x-interval between the attacking-zone boundary
    (x = -25 or 25) and the net (x = -89 or 89) on the side being attacked.
    Coordinates beyond the net fall outside the interval and yield False.

    Args:
        x_coordinate_shot: x-coordinate of the shot.
        attacking_side: side being attacked; compared case-insensitively
            against "left" and "right" after conversion to ``str``.

    Returns:
        True or False for a recognised side, or None when ``attacking_side``
        is neither "left" nor "right" (for example a missing value).
    """
    # (lower bound, upper bound) of the zone along x for each side.
    zone_bounds = {
        "left": (-89, -25),
        "right": (25, 89),
    }

    side = str(attacking_side).lower()
    if side not in zone_bounds:
        return None

    lower_x, upper_x = zone_bounds[side]
    return lower_x <= x_coordinate_shot <= upper_x

In [21]:
# Label every shot with whether it was taken inside the attacking zone.
# Iterating over only the two columns the helper needs avoids the per-row
# Series that df.apply(..., axis=1) constructs, which is slow on a frame of
# this size; the resulting values are the same.
df['attacking_zone_shot'] = [
    Is_Shot_In_Attacking_Zone(x_coordinate, attacking_side)
    for x_coordinate, attacking_side in zip(df['x-coordinate'], df['attackingSide'])
]

In [22]:
# Report the dimensions again, then preview five random rows
# (unseeded, so the rows shown vary between runs).
print(f"Shape of df: {df.shape}")
df.sample(5)

Shape of df: (385076, 29)


Unnamed: 0.1,Unnamed: 0,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength,shot_distance,angle,isgoal,last_event_type,coor_x_last_event,coor_y_last_event,time_last_event,distance_last_event,attacking_zone_shot
277188,277188,2018020822,New York Rangers,1,REGULAR,06:26,13:34,2019-02-07T01:22:35Z,3,New York Rangers,left,NYR,Shot,-35.0,30.0,Jaroslav Halak,Tony DeAngelo,Wrist Shot,0,False,61.773781,0.507099,0,Hit,-93.0,21.0,4.0,58.694122,True
246004,246004,2018020483,Nashville Predators,4,OVERTIME,03:05,01:55,2018-12-14T03:49:15Z,18,Nashville Predators,left,NSH,Goal,-76.0,0.0,Anders Nilsson,Colton Sissons,Wrist Shot,0,Even,13.0,0.0,1,Faceoff,69.0,-22.0,19.0,146.65947,True
324683,324683,2016020926,Montréal Canadiens,4,OVERTIME,01:10,03:50,2017-03-01T03:15:30Z,8,Montréal Canadiens,right,MTL,Shot,83.0,-3.0,Sergei Bobrovsky,Max Pacioretty,Wrist Shot,0,False,6.708204,-0.463648,0,Shot,71.0,0.0,4.0,12.369317,True
76469,76469,2017020245,Columbus Blue Jackets,3,REGULAR,02:21,17:39,2017-11-11T01:55:48Z,12,Carolina Hurricanes,left,CAR,Shot,-75.0,30.0,Sergei Bobrovsky,Justin Faulk,Snap Shot,0,False,33.105891,1.134169,0,Shot,72.0,-35.0,10.0,160.729587,True
50336,50336,2017020307,New York Rangers,1,REGULAR,08:06,11:54,2017-11-20T00:22:30Z,9,Ottawa Senators,right,OTT,Shot,64.0,-28.0,Henrik Lundqvist,Mike Hoffman,Wrist Shot,0,False,37.536649,-0.841942,0,Missed Shot,52.0,25.0,28.0,54.341513,True


In [23]:
# Share of rows for each attacking_zone_shot value; normalize=True returns
# proportions instead of raw counts.
df['attacking_zone_shot'].value_counts(normalize=True)

attacking_zone_shot
True     0.937678
False    0.062322
Name: proportion, dtype: float64

# 2. Add more feature

- This dataset is essentially a time series, implying that events from past matches can influence those in the present. For instance, if a team has scored abundantly in the preceding match, it suggests a strong attacking strategy. Consequently, there's a high likelihood that the team will replicate this success and score significantly in the current match.
- In this section, we will add the feature called `recent_goal_scoring`, which is the number of goals scored by this team from the beginning of this year to now.

In [9]:
# Number of rows for each event type.
df['eventType'].value_counts()

eventType
Shot    348504
Goal     36572
Name: count, dtype: int64

In [None]:
def Is_In_Season(gamepk, current_season):
    """Check whether a game id belongs to the given season.

    Args:
        gamepk: game identifier; its first four characters (after conversion
            to ``str``) are taken as the season.
        current_season: season to compare against, converted to ``str``.

    Returns:
        True when the first four characters of ``gamepk`` equal
        ``current_season``, otherwise False.
    """
    season_prefix = str(gamepk)[:4]
    return season_prefix == str(current_season)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Build `previous_season_goal`: for every row, the number of "Goal" rows the
# same team has in the season before the one this row belongs to. A row's
# season is the first four characters of its gamePk, read as an integer.
#
# Goals are tallied once per (team, season) and then looked up per row, so the
# frame is scanned a constant number of times instead of being re-filtered
# row by row, and no per-row progress output is printed.
season_by_row = df['gamePk'].astype(str).str[0:4].astype(int)

goal_mask = df['eventType'] == "Goal"
goals_by_team_season = (
    df.loc[goal_mask, ['teamName']]
    # Positional assignment: both operands come from the same masked rows.
    .assign(season=season_by_row[goal_mask].to_numpy())
    .groupby(['teamName', 'season'])
    .size()
    .to_dict()  # keys are (teamName, season) tuples
)

# A (team, previous season) pair with no goal rows in the dataset counts as 0.
list_previous_season_goal = [
    goals_by_team_season.get((team_name, season - 1), 0)
    for team_name, season in zip(df['teamName'], season_by_row)
]

assert len(list_previous_season_goal) == df.shape[0]
df['previous_season_goal'] = list_previous_season_goal