In [9]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [20]:
# Root folder of the raw game files.
# NOTE(review): absolute, machine-specific path - adapt it before running elsewhere.
PATH_FOLDER_DATA = Path(r"/home/thaiv7/Desktop/IFT6758_project/dataset_new")
# The processed table is written inside the data folder itself.
PATH_OUTPUT_FILE = PATH_FOLDER_DATA / "processed_data.csv"

# Seasons, game types and event types of interest.
LIST_SEASON = list(range(2016, 2021))
LIST_GAME_TYPE = ['playoffs', 'regular_season']
LIST_CHOSEN_EVENT = ['shot-on-goal', 'goal']

# 1. Read the dataset

In [11]:
def define_net_coordinate(df, net_x_abs=89):
    """
    Infer, for every period and team, the (x, y) coordinate of the net the team shoots at.

    df: dataframe of current game; the columns 'period', 'team', 'zone code'
        and 'x coor' are read.
    net_x_abs: absolute x coordinate of a net; the net y coordinate is 0.

    Returns a nested dict {period: {'home': (net_x, net_y), 'away': (net_x, net_y)}}.

    The side of the net is taken from the sign of the mean x coordinate of the
    team's offensive-zone ("O") events in that period, which makes the result
    deterministic (no random sampling). A mean <= 0 maps to the negative side.
    When a team has no usable offensive-zone event in a period, the net opposite
    to the other team's net is used. When neither team has one, the coordinate
    is (np.nan, np.nan) so downstream distances become NaN instead of crashing.
    """

    dict_net_coordinate = {}

    for period in df['period'].unique():
        dict_net_coordinate[period] = {}
        df_period = df[df['period'] == period]

        # Side of the attacked net per team: +1, -1, or None when it cannot be observed
        dict_side = {}
        for team in ['home', 'away']:
            mask = (df_period['zone code'] == "O") & (df_period['team'] == team)
            list_x = pd.to_numeric(df_period.loc[mask, 'x coor'], errors='coerce').dropna()
            if list_x.empty:
                dict_side[team] = None
            else:
                dict_side[team] = 1 if list_x.mean() > 0 else -1

        for team, other_team in [('home', 'away'), ('away', 'home')]:
            side = dict_side[team]
            if side is None and dict_side[other_team] is not None:
                # The two teams attack opposite nets within a period
                side = -dict_side[other_team]

            if side is None:
                net_coordinate = (np.nan, np.nan)
            else:
                net_coordinate = (side * net_x_abs, 0)

            dict_net_coordinate[period][team] = net_coordinate

    return dict_net_coordinate

In [12]:
path_folder_data = PATH_FOLDER_DATA
list_season = LIST_SEASON
list_game_type = LIST_GAME_TYPE
list_chosen_event = LIST_CHOSEN_EVENT

# NOTE(review): only this season is read; list_season is not iterated in this cell.
season = 2016

# Maximum number of game files to load. This cell used to stop after the first
# game through two bare `break` statements; the limit is now explicit.
# Set to None to load every game of the season.
max_game_count = 1


def _extract_game_shots(game_data, list_chosen_event):
    """
    Build the dataframe of the chosen events of one game.

    game_data: parsed json of one game; 'homeTeam', 'awayTeam' and 'plays' are read.
    list_chosen_event: event types ('typeDescKey') to keep.

    Returns a dataframe with one row per kept event. Keys missing from the
    event 'details' give missing values instead of raising KeyError. An event
    whose owner is neither the home nor the away team is reported and skipped
    as a whole, so every column keeps the same length.
    """
    list_column = ["event type", "period", "x coor", "y coor",
                   "owner team id", "team", "zone code", "situation code"]

    home_team_id = str(game_data['homeTeam']['id'])
    away_team_id = str(game_data['awayTeam']['id'])

    list_record = []
    for event in game_data['plays']:
        event_type = event['typeDescKey']
        if event_type not in list_chosen_event:  # Keep only the chosen event types
            continue

        details = event.get('details', {})
        event_owner_team_id = details.get('eventOwnerTeamId')
        if str(event_owner_team_id) == home_team_id:
            team = "home"
        elif str(event_owner_team_id) == away_team_id:
            team = "away"
        else:
            print(f"Skipping event with unknown owner team id: {event_owner_team_id}")
            continue

        list_record.append({"event type": event_type,
                            "period": event['period'],
                            "x coor": details.get('xCoord'),
                            "y coor": details.get('yCoord'),
                            "owner team id": event_owner_team_id,
                            "team": team,
                            "zone code": details.get('zoneCode'),
                            "situation code": event['situationCode']})

    return pd.DataFrame(list_record, columns=list_column)


# -------------------------
list_game_df = []
game_count = 0

path_season_folder = Path(path_folder_data) / str(season)
for game_type in list_game_type:
    if max_game_count is not None and game_count >= max_game_count:
        break

    # Get list game (json files only, so stray files in the folder are ignored)
    path_season_game_folder = path_season_folder / game_type
    for path_game_file in sorted(path_season_game_folder.glob("*.json")):
        if max_game_count is not None and game_count >= max_game_count:
            break
        game_count += 1

        with open(path_game_file, encoding="utf-8") as json_file:
            game_data = json.load(json_file)

        # Extract all chosen events of the game
        game_df = _extract_game_shots(game_data, list_chosen_event)
        if game_df.empty:
            continue

        # Calculate coordinates (x,y) of net in each shot
        dict_net_coor = define_net_coordinate(game_df)
        list_net_coor = [dict_net_coor[period][team]
                         for period, team in zip(game_df['period'], game_df['team'])]
        game_df['net x coor'] = [net_coor[0] for net_coor in list_net_coor]
        game_df['net y coor'] = [net_coor[1] for net_coor in list_net_coor]

        list_game_df.append(game_df)

# Concatenate once at the end instead of growing df inside the loop
df = pd.concat(list_game_df, ignore_index=True) if list_game_df else pd.DataFrame()

# 2. Extract features

In [13]:
def compute_shot_distance(row):
    """
    Euclidean distance between the shot location and the targeted net.

    row: dataframe row (or mapping) with 'x coor', 'y coor', 'net x coor'
         and 'net y coor'.

    Returns the distance as a float, or np.nan when a coordinate is missing
    or cannot be converted to a number.
    """
    # The conversions are the only statements that can fail, so they are the
    # ones guarded; only conversion errors are caught, not every exception.
    try:
        x = float(row['x coor'])
        y = float(row['y coor'])
        net_x = float(row['net x coor'])
        net_y = float(row['net y coor'])
    except (TypeError, ValueError):
        return np.nan

    return np.sqrt((x - net_x) ** 2 + (y - net_y) ** 2)

In [14]:
# Distance of every shot to its net, computed row by row and stored as float.
df['shot distance'] = df.apply(func=compute_shot_distance, axis='columns').astype(float)

print(f'Shape of df: {df.shape}')
df.head()

Shape of df: (62, 11)


Unnamed: 0,event type,period,x coor,y coor,owner team id,team,zone code,situation code,net x coor,net y coor,shot distance
0,shot-on-goal,1,-8,-36,8,home,N,1551,-89,0,88.63972
1,shot-on-goal,1,85,-6,3,away,O,1551,89,0,7.211103
2,shot-on-goal,1,-69,-35,8,home,O,1551,-89,0,40.311289
3,shot-on-goal,1,-55,-17,8,home,O,1551,-89,0,38.013156
4,shot-on-goal,1,-58,-28,8,home,O,1541,-89,0,41.773197


In [15]:
def compute_shot_angle(row):
    """
    Shot angle in radians, computed as arcsin('y coor' / 'shot distance').

    row: dataframe row (or mapping) with 'y coor' and 'shot distance'.
         NOTE(review): 'y coor' is used directly, which matches a net located
         at y = 0 - confirm if the net y coordinate can ever differ.

    Returns the angle as a float in [-pi/2, pi/2]. Returns np.nan when a value
    is missing or not numeric, and 0.0 when the distance is 0 (a shot taken
    from the net location has no lateral offset).
    """
    try:
        y = float(row['y coor'])
        shot_distance = float(row['shot distance'])
    except (TypeError, ValueError):
        return np.nan

    if np.isnan(y) or np.isnan(shot_distance):
        return np.nan
    if shot_distance == 0:
        # Avoid the division by zero
        return 0.0

    # Clipping guards against a ratio marginally outside [-1, 1] due to rounding
    shot_angle_rad = np.arcsin(np.clip(y / shot_distance, -1.0, 1.0))
    return shot_angle_rad

In [16]:
# Angle (in radians) of every shot, computed row by row.
df['shot angle'] = df.apply(func=compute_shot_angle, axis='columns')

print(f'Shape of df: {df.shape}')
df.head()

Shape of df: (62, 12)


Unnamed: 0,event type,period,x coor,y coor,owner team id,team,zone code,situation code,net x coor,net y coor,shot distance,shot angle
0,shot-on-goal,1,-8,-36,8,home,N,1551,-89,0,88.63972,-0.418224
1,shot-on-goal,1,85,-6,3,away,O,1551,89,0,7.211103,-0.982794
2,shot-on-goal,1,-69,-35,8,home,O,1551,-89,0,40.311289,-1.05165
3,shot-on-goal,1,-55,-17,8,home,O,1551,-89,0,38.013156,-0.463648
4,shot-on-goal,1,-58,-28,8,home,O,1541,-89,0,41.773197,-0.734594


In [17]:
# Binary target: 1 when the event is a goal, 0 for any other event type.
df['isgoal'] = df['event type'].apply(lambda event_type: int(event_type == 'goal'))

print(f'Shape of df: {df.shape}')
df.head()

Shape of df: (62, 13)


Unnamed: 0,event type,period,x coor,y coor,owner team id,team,zone code,situation code,net x coor,net y coor,shot distance,shot angle,isgoal
0,shot-on-goal,1,-8,-36,8,home,N,1551,-89,0,88.63972,-0.418224,0
1,shot-on-goal,1,85,-6,3,away,O,1551,89,0,7.211103,-0.982794,0
2,shot-on-goal,1,-69,-35,8,home,O,1551,-89,0,40.311289,-1.05165,0
3,shot-on-goal,1,-55,-17,8,home,O,1551,-89,0,38.013156,-0.463648,0
4,shot-on-goal,1,-58,-28,8,home,O,1541,-89,0,41.773197,-0.734594,0


In [18]:
def check_empty_net(row):
    """
    Tell whether the net defended against this shot was empty.

    row: dataframe row (or mapping) with 'situation code' and 'team'
         ('home' or 'away' is the shooting team).

    The situation code is read as 4 digits: the first digit is the away
    goalie and the last digit is the home goalie ('1' = goalie in net,
    '0' = no goalie). A home shot is checked against the away goalie and an
    away shot against the home goalie.

    Returns 1 when the defended net is empty, 0 when the goalie is in net,
    and np.nan when the team or the situation code is not usable.
    """
    # Position of the defending goalie digit, keyed by the shooting team
    dict_goalie_index = {"home": 0, "away": 3}

    team = str(row['team'])
    if team not in dict_goalie_index:
        return np.nan

    situation_code = str(row['situation code']).strip()
    if not situation_code.isdigit() or len(situation_code) > 4:
        return np.nan
    # A code stored as a number loses its leading zeros (e.g. '0651' -> 651)
    situation_code = situation_code.zfill(4)

    goalie = situation_code[dict_goalie_index[team]]
    if goalie == '1':
        return 0
    if goalie == '0':
        return 1
    return np.nan

In [19]:
# Flag, row by row, the shots taken at an empty net.
df['is empty net'] = df.apply(func=check_empty_net, axis='columns')

print(f'Shape of df: {df.shape}')
df.head()

Shape of df: (62, 14)


Unnamed: 0,event type,period,x coor,y coor,owner team id,team,zone code,situation code,net x coor,net y coor,shot distance,shot angle,isgoal,is empty net
0,shot-on-goal,1,-8,-36,8,home,N,1551,-89,0,88.63972,-0.418224,0,0
1,shot-on-goal,1,85,-6,3,away,O,1551,89,0,7.211103,-0.982794,0,0
2,shot-on-goal,1,-69,-35,8,home,O,1551,-89,0,40.311289,-1.05165,0,0
3,shot-on-goal,1,-55,-17,8,home,O,1551,-89,0,38.013156,-0.463648,0,0
4,shot-on-goal,1,-58,-28,8,home,O,1541,-89,0,41.773197,-0.734594,0,0


In [21]:

# Persist the processed table; the dataframe index is written as the first column.
df.to_csv(path_or_buf=PATH_OUTPUT_FILE, index=True)