In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import math
import json
from datetime import datetime, timedelta


# 0. Read dataset

In this section, I read the tidy dataset from a `csv` file (`Dataset/tidyData_original.csv`).

In [2]:
# Relative path of the tidy CSV that is loaded into `df` with pd.read_csv.
PATH_TIDY_DATA_CSV = r"Dataset/tidyData_original.csv"

In [3]:
# Load the tidy CSV; reset_index() turns the default row labels into an
# explicit `index` column.
df = pd.read_csv(PATH_TIDY_DATA_CSV).reset_index()
print(f"Shape of df: {df.shape}")
df.head(n=10)

Shape of df: (385076, 20)


Unnamed: 0,index,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength
0,0,2018021200,Colorado Avalanche,1,REGULAR,00:35,19:25,2019-03-30T01:09:28Z,21,Colorado Avalanche,left,COL,Shot,-75.0,-5.0,Darcy Kuemper,Gabriel Landeskog,Backhand,,False
1,1,2018021200,Colorado Avalanche,1,REGULAR,01:44,18:16,2019-03-30T01:11:28Z,21,Colorado Avalanche,left,COL,Shot,-37.0,33.0,Darcy Kuemper,Tyson Barrie,Wrist Shot,,False
2,2,2018021200,Colorado Avalanche,1,REGULAR,02:42,17:18,2019-03-30T01:13:23Z,21,Colorado Avalanche,left,COL,Shot,-80.0,-6.0,Darcy Kuemper,Colin Wilson,Wrist Shot,,False
3,3,2018021200,Colorado Avalanche,1,REGULAR,05:19,14:41,2019-03-30T01:16:58Z,53,Arizona Coyotes,right,ARI,Shot,62.0,-23.0,Philipp Grubauer,Richard Panik,Wrist Shot,,False
4,4,2018021200,Colorado Avalanche,1,REGULAR,05:53,14:07,2019-03-30T01:19:11Z,53,Arizona Coyotes,right,ARI,Shot,-7.0,36.0,Philipp Grubauer,Lawson Crouse,Wrist Shot,,False
5,5,2018021200,Colorado Avalanche,1,REGULAR,07:02,12:58,2019-03-30T01:20:48Z,21,Colorado Avalanche,left,COL,Shot,-70.0,-21.0,Darcy Kuemper,Sven Andrighetto,Wrist Shot,,False
6,6,2018021200,Colorado Avalanche,1,REGULAR,07:19,12:41,2019-03-30T01:21:05Z,21,Colorado Avalanche,left,COL,Shot,-67.0,-38.0,Darcy Kuemper,Derick Brassard,Wrist Shot,,False
7,7,2018021200,Colorado Avalanche,1,REGULAR,07:48,12:12,2019-03-30T01:21:34Z,21,Colorado Avalanche,left,COL,Shot,-79.0,-20.0,Darcy Kuemper,Tyson Jost,Wrist Shot,,False
8,8,2018021200,Colorado Avalanche,1,REGULAR,08:11,11:49,2019-03-30T01:21:56Z,53,Arizona Coyotes,right,ARI,Shot,51.0,23.0,Philipp Grubauer,Jordan Oesterle,Wrist Shot,,False
9,9,2018021200,Colorado Avalanche,1,REGULAR,08:23,11:37,2019-03-30T01:24:22Z,53,Arizona Coyotes,right,ARI,Shot,37.0,1.0,Philipp Grubauer,Nick Cousins,Wrist Shot,,False


# 1. Include simple features

In [4]:
def time_string_to_seconds(time_str):
    """
    Convert a "mm:ss" clock string to the total number of seconds.

    For example: "03:45" -> 225

    Parameters
    ----------
    time_str : str
        Clock value with exactly one ':' separating integer minutes and
        integer seconds.

    Returns
    -------
    int
        minutes * 60 + seconds.

    Raises
    ------
    ValueError
        If `time_str` is not a string of the form "mm:ss".
    """
    try:
        minutes, seconds = map(int, time_str.split(':'))
    except (AttributeError, TypeError, ValueError) as error:
        # Fail with an explicit message instead of printing and then crashing
        # on the unassigned `minutes` / `seconds` names.
        raise ValueError(f"Error in time string. Current time string: {time_str}") from error
    return minutes * 60 + seconds

In [5]:
# def Determine_Shooting_Net(home_team, shoter_tem, period):
    
#     is_home_shot = True 
#     if home_team.lower() != shoter_tem.lower():
#         is_home_shot = False    # Away shot

#     net_coordinates = None
#     if is_home_shot:
#         if period % 2 == 1:
#             net_coordinates = (89, 0)
#         else:
#             net_coordinates = (-89, 0)
#     else:    # away
#         if period % 2 == 1:
#             net_coordinates = (-89, 0)
#         else:
#             net_coordinates = (89, 0)
    
#     return net_coordinates

In [6]:
def Determine_Shooting_Net(shot_coor_x, shot_coor_y):
    """
    Pick the net a shot is aimed at from the sign of its x-coordinate.

    Returns (89, 0) when `shot_coor_x` is strictly positive and (-89, 0)
    otherwise (x == 0 included). `shot_coor_y` is accepted but not used.
    """
    net_x = 89 if shot_coor_x > 0 else -89
    return (net_x, 0)

In [7]:
def Calculate_Distance(point1, point2):
    """Return the Euclidean distance between two (x, y) points."""
    (start_x, start_y), (end_x, end_y) = point1, point2
    delta_x = end_x - start_x
    delta_y = end_y - start_y
    return (delta_x ** 2 + delta_y ** 2) ** 0.5

In [8]:
def Calculate_Shot_Angle(shot_location, net_location):
    """
    Angle, in degrees, of the line from the shot location to the net.

    The angle is measured in the attacker's frame of reference: when the net
    lies on the negative-x side (net_x < 0) the shot-to-net vector is rotated
    by 180 degrees before taking atan2. A shot straight at the net therefore
    yields 0 degrees at either end of the rink, and shots that are 180-degree
    rotations of each other yield the same angle. For a net with net_x >= 0
    the result is atan2(net_y - shot_y, net_x - shot_x), unchanged.

    Parameters
    ----------
    shot_location : tuple
        (x, y) of the shot.
    net_location : tuple
        (x, y) of the targeted net.

    Returns
    -------
    float
        Angle in degrees, in the range [-180, 180].
    """
    shot_x, shot_y = shot_location
    net_x, net_y = net_location

    if net_x < 0:
        # Net on the negative-x side: use the reversed differences so the
        # angle is expressed relative to the attacking direction.
        dx = shot_x - net_x
        dy = shot_y - net_y
    else:
        dx = net_x - shot_x
        dy = net_y - shot_y
    angle = math.atan2(dy, dx)
    return math.degrees(angle)

In [9]:
# One list per simple feature; every list is filled in the row order of `df`.
list_game_seconds, list_game_period = [], []
list_coordinates_x, list_coordinates_y = [], []
list_shot_distance, list_shot_angle = [], []
list_shot_type = []

for idx_event, shot_event in df.iterrows():
    game_period = shot_event['period']
    # Seconds since the start of the game: the clock inside the period plus
    # 20 minutes for every period already played.
    game_seconds = time_string_to_seconds(shot_event['periodTime']) + (int(game_period) - 1) * 20 * 60
    coor_x, coor_y = shot_event['x-coordinate'], shot_event['y-coordinate']

    # The targeted net is inferred from the side of the rink the shot was taken on.
    net_coordinates = Determine_Shooting_Net(coor_x, coor_y)
    shot_distance = Calculate_Distance((coor_x, coor_y), net_coordinates)
    shot_angle = Calculate_Shot_Angle((coor_x, coor_y), net_coordinates)
    shot_type = shot_event['shotType']

    list_game_seconds.append(game_seconds)
    list_game_period.append(game_period)
    list_coordinates_x.append(coor_x)
    list_coordinates_y.append(coor_y)
    list_shot_distance.append(shot_distance)
    list_shot_angle.append(shot_angle)
    list_shot_type.append(shot_type)

# 2. Add feature of previous event

- We will find the event that precedes each shot by matching on `dateTime`, for example `"2019-03-30T01:09:28Z"`.

In [10]:
# Look at the raw play-by-play JSON of a single game.
path_json_file = r"Dataset/2018/regular_season/2018021200.json"

# Pass the encoding explicitly instead of relying on the platform's locale
# default (assumes the file is UTF-8, the standard JSON encoding).
with open(path_json_file, 'r', encoding="utf-8") as json_file:
    data = json.load(json_file)

list_event = data['liveData']['plays']['allPlays']
print(f"Number of event: {len(list_event)}")

Number of event: 365


In [11]:
# Show the first five rows of the tidy data.
df.head(n=5)

Unnamed: 0,index,gamePk,homeTeam,period,periodType,periodTime,periodTimeRemaining,dateTime,teamId,teamName,attackingSide,teamTriCode,eventType,x-coordinate,y-coordinate,goalieName,shooterName,shotType,isEmptyNet,strength
0,0,2018021200,Colorado Avalanche,1,REGULAR,00:35,19:25,2019-03-30T01:09:28Z,21,Colorado Avalanche,left,COL,Shot,-75.0,-5.0,Darcy Kuemper,Gabriel Landeskog,Backhand,,False
1,1,2018021200,Colorado Avalanche,1,REGULAR,01:44,18:16,2019-03-30T01:11:28Z,21,Colorado Avalanche,left,COL,Shot,-37.0,33.0,Darcy Kuemper,Tyson Barrie,Wrist Shot,,False
2,2,2018021200,Colorado Avalanche,1,REGULAR,02:42,17:18,2019-03-30T01:13:23Z,21,Colorado Avalanche,left,COL,Shot,-80.0,-6.0,Darcy Kuemper,Colin Wilson,Wrist Shot,,False
3,3,2018021200,Colorado Avalanche,1,REGULAR,05:19,14:41,2019-03-30T01:16:58Z,53,Arizona Coyotes,right,ARI,Shot,62.0,-23.0,Philipp Grubauer,Richard Panik,Wrist Shot,,False
4,4,2018021200,Colorado Avalanche,1,REGULAR,05:53,14:07,2019-03-30T01:19:11Z,53,Arizona Coyotes,right,ARI,Shot,-7.0,36.0,Philipp Grubauer,Lawson Crouse,Wrist Shot,,False


In [56]:
# Compare the number of rows with the number of distinct `dateTime` values
# (a missing value, if any, is counted as one distinct value).
print(df.shape)

print(df['dateTime'].nunique(dropna=False))

(385076, 20)
377446


In [12]:
def Retrieve_File_Name_From_gamePk(gamePk, parent_directory="Dataset"):
    """
    Build the path of the JSON file that holds the game `gamePk`.

    The first four characters of the id are used as the season folder and the
    next two as the game-type code: "02" maps to "regular_season" and "03" to
    "playoffs". Any other code is used verbatim as the folder name.

    Parameters
    ----------
    gamePk : str or int
        Game identifier, e.g. "2018021200". Integers (the dtype pandas gives
        the column when reading the CSV) are converted to their decimal string.
    parent_directory : str
        Root folder of the dataset.

    Returns
    -------
    str
        "<parent_directory>/<season>/<game type folder>/<gamePk>.json",
        joined with os.path.join.
    """
    # Slicing needs a string; an integer id would otherwise raise TypeError.
    gamePk = str(gamePk)
    season = gamePk[0:4]
    type_game = gamePk[4:6]
    folder_by_code = {"02": "regular_season", "03": "playoffs"}
    type_game = folder_by_code.get(type_game, type_game)
    return os.path.join(parent_directory, season, type_game, f"{gamePk}.json")

In [13]:
def Get_List_Event_From_Json(path_json_file):
    """
    Load a game JSON file and return its list of plays.

    Parameters
    ----------
    path_json_file : str
        Path of the JSON file to read (opened as UTF-8 text).

    Returns
    -------
    list or None
        The value stored under data['liveData']['plays']['allPlays'], or None
        when that path does not exist in the parsed document.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # Explicit encoding: do not depend on the platform's locale default.
    with open(path_json_file, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)
    try:
        list_event = data['liveData']['plays']['allPlays']
    except (KeyError, TypeError, IndexError):
        # Only a missing or wrongly-shaped structure means "no events";
        # anything else (e.g. KeyboardInterrupt) must propagate.
        list_event = None

    return list_event

In [14]:
def Extract_Previous_Event(list_event, current_sample_dateTime, current_period=None, current_periodTime=None):
    """
    Return the event that comes right before the event matching a shot.

    Events are matched on event['about']['dateTime'], compared
    case-insensitively. When several events match, the last one in the list
    is used. Because `dateTime` alone may match more than one event,
    `current_period` and/or `current_periodTime` can be passed to also
    require event['about']['period'] / event['about']['periodTime'] to agree.

    Parameters
    ----------
    list_event : list of dict or None
        Events of one game, in list order.
    current_sample_dateTime : str
        dateTime of the shot, e.g. "2019-03-30T01:09:28Z" (any letter case).
    current_period : int or str, optional
        Period of the shot; ignored when None.
    current_periodTime : str, optional
        "mm:ss" period clock of the shot; ignored when None.

    Returns
    -------
    dict or None
        The event preceding the matched one. None when `list_event` is empty
        or None, when nothing matches, or when the matched event is the first
        of the list (it has no predecessor).
    """
    if not list_event:
        return None

    target_dateTime = str(current_sample_dateTime).lower()
    previous_event = None
    for idx_event, event in enumerate(list_event):
        about = event['about']
        if str(about['dateTime']).lower() != target_dateTime:
            continue
        if current_period is not None and int(about['period']) != int(current_period):
            continue
        if current_periodTime is not None and str(about['periodTime']) != str(current_periodTime):
            continue
        # Index 0 has no predecessor: list_event[-1] would silently wrap
        # around to the last event of the list.
        previous_event = list_event[idx_event - 1] if idx_event > 0 else None
    return previous_event

In [36]:
def Get_Info_Previous_Event(previous_event):
    """
    Pull the fields used as "previous event" features out of an event dict.

    Parameters
    ----------
    previous_event : dict
        Event with 'result' -> 'event', 'about' -> 'period' / 'periodTime'
        and, optionally, 'coordinates' -> 'x' / 'y'.

    Returns
    -------
    tuple
        (event_type, coor_x, coor_y, event_period, event_periodTime).
        Both coordinates are 0 when the event has no usable 'coordinates'
        entry (missing key, missing 'x' or 'y', or a non-dict value).

    Raises
    ------
    KeyError, TypeError
        If the mandatory 'result' / 'about' fields are missing, or if
        `previous_event` is not a dict (e.g. None).
    """
    event_type = previous_event['result']['event']
    try:
        coordinates = previous_event['coordinates']
        coor_x, coor_y = coordinates['x'], coordinates['y']
    except (KeyError, TypeError):
        # Only "no location recorded" falls back to (0, 0); unrelated
        # exceptions are no longer swallowed.
        coor_x = coor_y = 0
    event_period = previous_event['about']['period']
    event_periodTime = previous_event['about']['periodTime']

    return (event_type, coor_x, coor_y, event_period, event_periodTime)

In [53]:
# Try the previous-event lookup on one shot before running it on every row.
idx_shot = 379
shot_sample = df.iloc[idx_shot]
shot_sample_dateTime = str(shot_sample['dateTime']).lower()
shot_sample_gamePk = str(shot_sample['gamePk'])
current_coor_x, current_coor_y = shot_sample['x-coordinate'], shot_sample['y-coordinate']
current_period, current_periodTime = shot_sample['period'], shot_sample['periodTime']

# Locate and parse the JSON file of the game this shot belongs to.
path_json_file = Retrieve_File_Name_From_gamePk(shot_sample_gamePk)
list_event = Get_List_Event_From_Json(path_json_file)

previous_event = Extract_Previous_Event(list_event, shot_sample_dateTime)
previous_event_info = Get_Info_Previous_Event(previous_event)
previous_event_type, previous_coor_x, previous_coor_y, previous_event_period, previous_event_periodTime = previous_event_info

print(
    f"Event type: {previous_event_type}",
    f"Event coor_x: {previous_coor_x}",
    f"Event coor_y: {previous_coor_y}",
    f"Event period: {previous_event_period}",
    f"Event period time: {previous_event_periodTime}",
    sep="\n",
)

Event type: Blocked Shot
Event coor_x: -74.0
Event coor_y: -3.0
Event period: 3
Event period time: 07:11


In [52]:
# Period and period clock of the sampled shot, then of its previous event.
print(
    int(current_period),
    current_periodTime,
    int(previous_event_period),
    previous_event_periodTime,
    sep="\n",
)

2
01:50
3
07:11


In [49]:
# Seconds between the sampled shot and its previous event.
# The clocks are converted with time_string_to_seconds, which is defined
# earlier in the notebook, so this cell also runs top-to-bottom on a fresh
# kernel (it no longer needs a helper that is only defined in a later cell).
current_seconds = time_string_to_seconds(current_periodTime)
previous_seconds = time_string_to_seconds(previous_event_periodTime)

if int(current_period) == int(previous_event_period):
    time_distance = float(abs(current_seconds - previous_seconds))
elif int(current_period) > int(previous_event_period):
    # Previous event is in an earlier period: time left on its 20:00 clock
    # plus the time already elapsed in the current period.
    time_distance = float(abs(previous_seconds - 20 * 60) + current_seconds)
else:
    # The "previous" event is in a later period than the shot: no meaningful
    # time distance, so do not keep a stale value around.
    time_distance = None
    print(f"Error")
    print(f"Current time: {current_periodTime}, Previous time: {previous_event_periodTime}")

Error
Current time: 01:50, Previous time: 07:11


In [39]:
def Calculate_Time_Distance(time_str1, time_str2):
    """
    Absolute gap, in seconds, between two "MM:SS" clock strings.

    Both strings are parsed with the "%M:%S" format; the result is a float
    and does not depend on the order of the arguments.
    """
    first_time, second_time = (
        datetime.strptime(clock, "%M:%S") for clock in (time_str1, time_str2)
    )
    gap = first_time - second_time
    return abs(gap.total_seconds())

In [41]:
# Previous-event features, one entry per processed row of `df`.
list_last_event_type = []
list_coor_x_last_event = []
list_coor_y_last_event = []
list_time_last_event = []
list_distance_last_event = []

# Event list of the most recently loaded game. Consecutive rows of the same
# game reuse it instead of re-reading and re-parsing the JSON file for every
# shot (rows of one game appear to be contiguous in `df` -- TODO confirm; if
# they are not, the file is simply read again as before).
cached_gamePk = None
cached_list_event = None

for idx_sample, shot_sample in df.iterrows():
    if idx_sample % 10_000 == 0:
        print(f"[INFO] Idx = {idx_sample}")

    try:
        shot_sample_dateTime = str(shot_sample['dateTime']).lower()
        shot_sample_gamePk = str(shot_sample['gamePk'])
        current_coor_x = shot_sample['x-coordinate']
        current_coor_y = shot_sample['y-coordinate']
        current_period = shot_sample['period']
        current_periodTime = shot_sample['periodTime']

        if shot_sample_gamePk != cached_gamePk:
            path_json_file = Retrieve_File_Name_From_gamePk(shot_sample_gamePk)
            cached_list_event = Get_List_Event_From_Json(path_json_file)
            cached_gamePk = shot_sample_gamePk
        list_event = cached_list_event

        previous_event = Extract_Previous_Event(list_event, shot_sample_dateTime)
        (previous_event_type, previous_coor_x, previous_coor_y, previous_event_period, previous_event_periodTime) = Get_Info_Previous_Event(previous_event)

        if int(current_period) == int(previous_event_period):
            time_distance = Calculate_Time_Distance(current_periodTime, previous_event_periodTime)
        elif int(current_period) > int(previous_event_period):
            # Previous event is in an earlier period: time left on its 20:00
            # clock plus the time elapsed in the current period.
            time_distance = Calculate_Time_Distance(previous_event_periodTime, "20:00") + Calculate_Time_Distance("00:00", current_periodTime)
        else:
            # The matched "previous" event is in a later period than the shot.
            # Record NaN so the time list stays aligned with the other lists.
            print(f"Error at index {idx_sample}")
            print(f"Current time: {current_periodTime}, Previous time: {previous_event_periodTime}")
            time_distance = np.nan

        # Distance from the last event to current event
        distance = Calculate_Distance((current_coor_x, current_coor_y), (previous_coor_x, previous_coor_y))

    except Exception as error:
        print(f"Error at index {idx_sample}")
        print(f"Error: {error}")
        break

    # Append only once every value of the row is available, so that all five
    # lists always have the same length, even when the loop stops on an error.
    list_last_event_type.append(previous_event_type)
    list_coor_x_last_event.append(previous_coor_x)
    list_coor_y_last_event.append(previous_coor_y)
    list_time_last_event.append(time_distance)
    list_distance_last_event.append(distance)

[INFO] Idx = 0
Error at index 379
Current time: 01:50, Previous time: 07:11
Error at index 1472
Current time: 08:45, Previous time: 12:16
[INFO] Idx = 10000
[INFO] Idx = 20000


KeyboardInterrupt: 

In [22]:
# Inspect the row labelled 23.
df.loc[23]

index                                    23
gamePk                           2018021200
homeTeam                 Colorado Avalanche
period                                    2
periodType                          REGULAR
periodTime                            04:19
periodTimeRemaining                   15:41
dateTime               2019-03-30T02:12:08Z
teamId                                   21
teamName                 Colorado Avalanche
attackingSide                         right
teamTriCode                             COL
eventType                              Shot
x-coordinate                           60.0
y-coordinate                           16.0
goalieName                    Darcy Kuemper
shooterName               Alexander Kerfoot
shotType                         Wrist Shot
isEmptyNet                              NaN
strength                              False
Name: 23, dtype: object