In [1]:
import sys
from pathlib import Path

# Determine the parent directory of the current notebook
notebook_path = Path().resolve()
parent_dir = notebook_path.parent

# Add the parent directory to sys.path to access utils module
sys.path.insert(0, str(parent_dir))

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from utils import unify_coordinates_referential, init_logger, verify_dotenv_file, GOAL_POSITION
from feature_engineering import *
from matplotlib.colors import ListedColormap

In [2]:
df = pd.read_csv(Path(os.environ["DATA_FOLDER"], "2016_final.csv"))

In [3]:
data = NHLFeatureEngineering(
    df = df,
    distanceToGoal=True,
    angleToGoal=True,
    isGoal=True,
    emptyNet=True,
    verbose=True,
    inputRinkSide=True,
    periodTimeSeconds=True,
    lastEvent=True,
    lastCoordinates=True,
    timeElapsed=True,
    distanceFromLastEvent=True,
    rebound=True,
    changeAngle=True,
    speed=True,
    computePowerPlayFeatures=True,
)

all_data = data.df
unify_data = data.dfUnify

[32m2023-11-10 17:06:05.385[0m | [1mINFO    [0m | [36mfeature_engineering[0m:[36m__init__[0m:[36m37[0m - [1mCalculations of distance/angle done w.r.t GOAL_POSITION = [89, 0][0m
[32m2023-11-10 17:06:05.649[0m | [1mINFO    [0m | [36mfeature_engineering[0m:[36m_printNaStatsBeforeUnifying[0m:[36m139[0m - [1mFound 340014 shots with coordinates and rinkSide specified.[0m
[32m2023-11-10 17:06:05.662[0m | [1mINFO    [0m | [36mfeature_engineering[0m:[36m_printNaStatsBeforeUnifying[0m:[36m144[0m - [1m
                Coordinates NA stats:
                    83659 shots without coordinates.
                    83658 shots without both X and Y coordinates.
                    1 shots without X coordinates.
                    0 shots without Y coordinates.
                
                RinkSide NA stats:
                    0 shots without rinkSide specified. Use inputRinkSide == True to handle missing 
                    values based on mean X coordinates i

# Vérification des features

## distanceToGoal (Goal is at [89, 0])

In [4]:
test = unify_data[["coordinateX", "coordinateY", "distanceToGoal"]].iloc[50:60, :]
test["test"] = np.linalg.norm(np.array(test[["coordinateX", "coordinateY"]]) - np.array([89, 0]), axis=1)
test

Unnamed: 0,coordinateX,coordinateY,distanceToGoal,test
50,,,,
51,-20.0,22.0,111.198022,111.198022
52,74.0,-39.0,41.785165,41.785165
53,-51.0,-23.0,141.876707,141.876707
54,,,,
55,-69.0,-22.0,159.524293,159.524293
56,-61.0,3.0,150.029997,150.029997
57,,,,
58,69.0,22.0,29.732137,29.732137
59,34.0,20.0,58.5235,58.5235


## isGoal

In [5]:
print("Number of goals in original df:", unify_data.query("eventType == 'GOAL'").shape[0], "\n")
print("Number of goals in new df:", unify_data["isGoal"].value_counts())

Number of goals in original df: 7377 

Number of goals in new df: isGoal
0    416296
1      7377
Name: count, dtype: int64


## periodTimeSeconds (eyetest)

In [6]:
test = unify_data[["periodTime", "periodTimeSeconds"]].iloc[50:60, :]
test

Unnamed: 0,periodTime,periodTimeSeconds
50,09:25,565
51,09:25,565
52,09:45,585
53,09:52,592
54,09:53,593
55,09:53,593
56,10:05,605
57,10:11,611
58,10:11,611
59,10:16,616


## lastEventType

In [7]:
test = unify_data[["gameId", "period", "periodTime", "eventType", "lastEventType"]].iloc[50:60, :]
test

Unnamed: 0,gameId,period,periodTime,eventType,lastEventType
50,2016020001,1,09:25,STOP,HIT
51,2016020001,1,09:25,FACEOFF,STOP
52,2016020001,1,09:45,MISSED_SHOT,FACEOFF
53,2016020001,1,09:52,BLOCKED_SHOT,MISSED_SHOT
54,2016020001,1,09:53,STOP,BLOCKED_SHOT
55,2016020001,1,09:53,FACEOFF,STOP
56,2016020001,1,10:05,BLOCKED_SHOT,FACEOFF
57,2016020001,1,10:11,STOP,BLOCKED_SHOT
58,2016020001,1,10:11,FACEOFF,STOP
59,2016020001,1,10:16,SHOT,FACEOFF


## lastCoordinateX AND lastCoordinateY

In [8]:
test = unify_data[["gameId", "period", "coordinateX", "coordinateY", "lastCoordinateX", "lastCoordinateY"]].iloc[50:60, :]
test

Unnamed: 0,gameId,period,coordinateX,coordinateY,lastCoordinateX,lastCoordinateY
50,2016020001,1,,,21.0,40.0
51,2016020001,1,-20.0,22.0,,
52,2016020001,1,74.0,-39.0,-20.0,22.0
53,2016020001,1,-51.0,-23.0,74.0,-39.0
54,2016020001,1,,,-51.0,-23.0
55,2016020001,1,-69.0,-22.0,,
56,2016020001,1,-61.0,3.0,-69.0,-22.0
57,2016020001,1,,,-61.0,3.0
58,2016020001,1,69.0,22.0,,
59,2016020001,1,34.0,20.0,69.0,22.0


## timeElapsed

In [9]:
test = unify_data[["gameId", "eventType", "period", "periodTime", "periodTimeSeconds", "timeElapsed"]].iloc[50:60, :]
test

Unnamed: 0,gameId,eventType,period,periodTime,periodTimeSeconds,timeElapsed
50,2016020001,STOP,1,09:25,565,20.0
51,2016020001,FACEOFF,1,09:25,565,0.0
52,2016020001,MISSED_SHOT,1,09:45,585,20.0
53,2016020001,BLOCKED_SHOT,1,09:52,592,7.0
54,2016020001,STOP,1,09:53,593,1.0
55,2016020001,FACEOFF,1,09:53,593,0.0
56,2016020001,BLOCKED_SHOT,1,10:05,605,12.0
57,2016020001,STOP,1,10:11,611,6.0
58,2016020001,FACEOFF,1,10:11,611,0.0
59,2016020001,SHOT,1,10:16,616,5.0


### Analyses des edges cases

In [16]:
test2 = unify_data.query("timeElapsed == 0")
one = set(test2.index - 1)
two = set(test2.index)
three = sorted(list(one.union(two)))
# print max rows
pd.set_option("display.max_rows", None)
oui = unify_data.iloc[three][["gameId", "period", "eventType", "periodTimeSeconds", "timeElapsed"]]

df_one = unify_data.iloc[list(one)]["eventType"].reset_index(drop=True)
df_two = unify_data.iloc[list(two)]["eventType"].reset_index(drop=True)
temp = pd.DataFrame({"t-1 (one)": df_one, "t (two)": df_two})
temp.groupby(["t-1 (one)", "t (two)"]).value_counts()

t-1 (one)          t (two)          
BLOCKED_SHOT       BLOCKED_SHOT           101
                   CHALLENGE                4
                   FACEOFF               1068
                   GAME_END                 4
                   GAME_SCHEDULED           1
                   GIVEAWAY                26
                   GOAL                    15
                   HIT                     36
                   MISSED_SHOT              8
                   PENALTY                 49
                   PERIOD_END              10
                   PERIOD_OFFICIAL         12
                   PERIOD_READY             2
                   PERIOD_START            38
                   SHOT                    46
                   STOP                   629
                   TAKEAWAY                 6
CHALLENGE          BLOCKED_SHOT             1
                   CHALLENGE               38
                   FACEOFF                 70
                   GAME_END                

### distanceFromLastEvent

In [None]:
test = unify_data[["coordinateX", "coordinateY", "lastCoordinateX", "lastCoordinateY", "distanceFromLastEvent"]].iloc[50:60, :]
test["test"] = np.linalg.norm(np.array(test[["coordinateX", "coordinateY"]]) - np.array(test[["lastCoordinateX", "lastCoordinateY"]]), axis=1)
test

Unnamed: 0,coordinateX,coordinateY,lastCoordinateX,lastCoordinateY,distanceFromLastEvent,test
50,,,21.0,40.0,,
51,-20.0,22.0,,,,
52,74.0,-39.0,-20.0,22.0,112.058021,112.058021
53,-51.0,-23.0,74.0,-39.0,126.01984,126.01984
54,,,-51.0,-23.0,,
55,-69.0,-22.0,,,,
56,-61.0,3.0,-69.0,-22.0,26.248809,26.248809
57,,,-61.0,3.0,,
58,69.0,22.0,,,,
59,34.0,20.0,69.0,22.0,35.057096,35.057096


### rebound

In [None]:
test = unify_data[["gameId", "period", "eventType", "lastEventType", "rebound"]].iloc[35:45, :]
test

Unnamed: 0,gameId,period,eventType,lastEventType,rebound
35,2016020001,1,STOP,FACEOFF,0
36,2016020001,1,FACEOFF,STOP,0
37,2016020001,1,HIT,FACEOFF,0
38,2016020001,1,SHOT,HIT,0
39,2016020001,1,GIVEAWAY,SHOT,1
40,2016020001,1,SHOT,GIVEAWAY,0
41,2016020001,1,HIT,SHOT,1
42,2016020001,1,MISSED_SHOT,HIT,0
43,2016020001,1,GOAL,MISSED_SHOT,0
44,2016020001,1,FACEOFF,GOAL,0


### changeAngle

In [None]:
test = unify_data[["gameId", "period", "rebound", "angleToGoal", "changeAngle"]].iloc[35:45, :]
test

Unnamed: 0,gameId,period,rebound,angleToGoal,changeAngle
35,2016020001,1,0,,
36,2016020001,1,0,7.926927,
37,2016020001,1,0,14.036243,0.0
38,2016020001,1,0,26.980231,0.0
39,2016020001,1,1,29.859016,2.878785
40,2016020001,1,0,-16.886791,0.0
41,2016020001,1,1,28.663957,45.550748
42,2016020001,1,0,-63.434949,0.0
43,2016020001,1,0,3.012788,0.0
44,2016020001,1,0,0.0,0.0


### speed

In [None]:
test = unify_data[["gameId", "period", "periodTime", "distanceFromLastEvent", "timeElapsed", "speed"]].iloc[35:45, :]
test["test"] = test["distanceFromLastEvent"] / test["timeElapsed"]
test

Unnamed: 0,gameId,period,periodTime,distanceFromLastEvent,timeElapsed,speed,test
35,2016020001,1,06:04,,17.0,,
36,2016020001,1,06:04,,0.0,,
37,2016020001,1,06:30,25.059928,26.0,0.064256,0.963843
38,2016020001,1,06:46,81.221918,16.0,0.200054,5.07637
39,2016020001,1,07:07,3.162278,21.0,0.007406,0.150585
40,2016020001,1,07:30,48.041649,23.0,0.106759,2.088767
41,2016020001,1,07:58,61.032778,28.0,0.127684,2.179742
42,2016020001,1,08:18,86.267027,20.0,0.173227,4.313351
43,2016020001,1,08:21,17.492856,3.0,0.034916,5.830952
44,2016020001,1,08:21,70.007142,0.0,0.139735,inf


### Power play features (analyse d'une partie)

*Note : Power play est considéré comme une différence d'effectifs entre les deux équipes, et pas comme seulement la présence d'une pénalité*

In [None]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
test = unify_data.query("gameId == 2016020001")[["gameId", "period", "periodTime", "eventType", "penaltySeverity", "penaltyMinutes", "penalizedTeam", "elapsedPowerPlay", "homeSkaters", "awaySkaters"]]
test

Unnamed: 0,gameId,period,periodTime,eventType,penaltySeverity,penaltyMinutes,penalizedTeam,elapsedPowerPlay,homeSkaters,awaySkaters
0,2016020001,1,00:00,GAME_SCHEDULED,,,,0,5.0,5.0
1,2016020001,1,00:00,PERIOD_READY,,,,0,5.0,5.0
2,2016020001,1,00:00,PERIOD_START,,,,0,5.0,5.0
3,2016020001,1,00:00,FACEOFF,,,,0,5.0,5.0
4,2016020001,1,00:05,STOP,,,,0,5.0,5.0
5,2016020001,1,00:05,FACEOFF,,,,0,5.0,5.0
6,2016020001,1,00:43,TAKEAWAY,,,,0,5.0,5.0
7,2016020001,1,01:03,BLOCKED_SHOT,,,,0,5.0,5.0
8,2016020001,1,01:10,BLOCKED_SHOT,,,,0,5.0,5.0
9,2016020001,1,01:11,SHOT,,,,0,5.0,5.0
