In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from tqdm.notebook import tqdm

In [47]:
# Load the cleaned dataset; the cells below read from the module-level `data`.
data = pd.read_csv("clean_data.csv")

# Structural overview: column labels, (rows, columns) and per-column dtypes,
# printed one after the other with a blank line in between.
print("\n\n".join(str(part) for part in (data.columns, data.shape, data.dtypes)))

# First rows, rendered as the cell output.
data.head()

Index(['gameId', 'season', 'gameType', 'gameDate', 'homeTeam', 'awayTeam',
       'rinkSide', 'period', 'periodTime', 'byTeam', 'eventType', 'shotType',
       'coordinateX', 'coordinateY', 'shooterName', 'goalieName', 'strength',
       'emptyNet'],
      dtype='object')

(387829, 18)

gameId           int64
season           int64
gameType        object
gameDate        object
homeTeam        object
awayTeam        object
rinkSide        object
period           int64
periodTime      object
byTeam          object
eventType       object
shotType        object
coordinateX    float64
coordinateY    float64
shooterName     object
goalieName      object
strength          bool
emptyNet          bool
dtype: object


Unnamed: 0,gameId,season,gameType,gameDate,homeTeam,awayTeam,rinkSide,period,periodTime,byTeam,eventType,shotType,coordinateX,coordinateY,shooterName,goalieName,strength,emptyNet
0,2016020001,2016,R,2016-10-12,OTT,TOR,right,1,01:11,TOR,SHOT,Wrist Shot,-77.0,5.0,Mitchell Marner,Craig Anderson,False,False
1,2016020001,2016,R,2016-10-12,OTT,TOR,left,1,02:53,OTT,SHOT,Wrist Shot,86.0,13.0,Chris Kelly,Frederik Andersen,False,False
2,2016020001,2016,R,2016-10-12,OTT,TOR,left,1,04:01,OTT,SHOT,Wrist Shot,23.0,-38.0,Cody Ceci,Frederik Andersen,False,False
3,2016020001,2016,R,2016-10-12,OTT,TOR,left,1,04:46,OTT,SHOT,Slap Shot,33.0,-15.0,Erik Karlsson,Frederik Andersen,False,False
4,2016020001,2016,R,2016-10-12,OTT,TOR,right,1,06:46,TOR,SHOT,Wrist Shot,-34.0,28.0,Martin Marincin,Craig Anderson,False,False


In [48]:
# Print the distinct values found in the two categorical event columns,
# each preceded by a "<column> : " label on its own line.
for column_name in ('shotType', 'eventType'):
    print(f'{column_name} : ')
    print(data[column_name].unique())

shotType : 
['Wrist Shot' 'Slap Shot' 'Backhand' 'Snap Shot' 'Tip-In' 'Deflected'
 'Wrap-around' nan]
eventType : 
['SHOT' 'GOAL']


In [49]:
# NOTE(review): `seed` is not referenced in the cells visible in this section —
# presumably reserved for a later stochastic step; confirm.
seed = 42

# Distinct home-team codes and seasons, in order of first appearance in `data`.
# They feed the widget options and the per-team / per-season loops below.
NHL_TEAMS = pd.unique(data['homeTeam'])
SEASONS = pd.unique(data['season'])

In [50]:
# Events credited to one team (byTeam == 'ANA'), restricted to the columns needed
# to study shot locations, plus a flag telling whether that team was playing at home.
#
# Rows and columns are selected in a single .loc call and the flag is added with
# .assign, so the new column is written to an independent frame. The previous
# pattern (filter, then `df['is_homeTeam'] = ...`) assigns into a slice of `data`,
# which is what pandas reports through SettingWithCopyWarning.
team_code = 'ANA'
df = (
    data
    .loc[
        data['byTeam'] == team_code,
        ['gameId', 'byTeam', 'homeTeam', 'awayTeam', 'period', 'coordinateX', 'coordinateY', 'season'],
    ]
    .assign(is_homeTeam=lambda plays: plays['homeTeam'] == team_code)
)
df

Unnamed: 0,gameId,byTeam,homeTeam,awayTeam,period,coordinateX,coordinateY,season,is_homeTeam
742,2016020013,ANA,DAL,ANA,1,76.0,-16.0,2016,False
743,2016020013,ANA,DAL,ANA,1,2.0,-31.0,2016,False
745,2016020013,ANA,DAL,ANA,1,67.0,28.0,2016,False
746,2016020013,ANA,DAL,ANA,1,57.0,20.0,2016,False
747,2016020013,ANA,DAL,ANA,1,39.0,24.0,2016,False
...,...,...,...,...,...,...,...,...,...
382318,2020020865,ANA,MIN,ANA,3,46.0,24.0,2020,False
382319,2020020865,ANA,MIN,ANA,3,83.0,-3.0,2020,False
382320,2020020865,ANA,MIN,ANA,3,65.0,20.0,2020,False
382322,2020020865,ANA,MIN,ANA,4,-39.0,7.0,2020,False


In [52]:
from utils import unify_coordinates_referential

# NOTE(review): presumably maps every event's coordinates to a common frame of
# reference (inferred from the helper's name only; `utils` is not visible here) — confirm.
# The result is not referenced in the cells visible in this section.
data_coords_unified = unify_coordinates_referential(data)

# Préparation des coordonnées pour une saison et une équipe (dans le même référentiel)

In [51]:
def get_plays_of_team_of_season(NHL_TEAM,SEASON,verbose=True):
    """Return the plays of one season in which a given team was involved.

    Reads the module-level ``data`` frame. A play is kept when ``NHL_TEAM`` is
    either the home team or the away team of the game and the play belongs to
    ``SEASON``.

    Args:
        NHL_TEAM: team code compared against ``homeTeam`` and ``awayTeam``.
        SEASON: value compared against the ``season`` column.
        verbose: when true, print the shape of the team-level selection, the
            shape of the team-and-season selection, and the plays of that
            selection whose ``coordinateX`` is missing.

    Returns:
        pd.DataFrame: the selected plays with the columns ``period``,
        ``shotType``, ``eventType``, ``coordinateX`` and ``coordinateY``.
    """
    kept_columns = ['period', 'shotType', 'eventType', 'coordinateX', 'coordinateY']

    # Plays from every game in which the team took part, home or away.
    involves_team = (data['homeTeam'] == NHL_TEAM) | (data['awayTeam'] == NHL_TEAM)
    team_plays = data[involves_team]

    # Narrow down to the requested season and to the columns of interest.
    season_plays = team_plays.loc[team_plays['season'] == SEASON, kept_columns]

    if verbose:
        missing_coords = season_plays[season_plays["coordinateX"].isna()]
        print(f"Nombre de plays ou l'equipe {NHL_TEAM} est impliquee : {team_plays.shape}")
        print(f"Nombre de plays ou l'equipe {NHL_TEAM} est impliquee pour la saison {SEASON} : {season_plays.shape}")
        # Same text as a multi-line literal, spelled with explicit newlines so the
        # trailing spaces of each line stay visible in the source.
        print(
            "Nombre de plays sans coordonnees : \n"
            f"            {missing_coords.shape} \n"
            f"            {missing_coords}\n"
            "        "
        )

    return season_plays

# Interactive explorer: choosing a team and a season re-runs
# get_plays_of_team_of_season and shows what it prints for that selection.
team_selector = widgets.Dropdown(
    options=NHL_TEAMS,
    value=NHL_TEAMS[0],
    description='NHL_TEAM:',
    disabled=False,
)
season_selector = widgets.RadioButtons(
    options=SEASONS,
    value=SEASONS[0],
    description='SEASON:',
    disabled=False,
)

interact(
    get_plays_of_team_of_season,
    NHL_TEAM=team_selector,
    SEASON=season_selector,
    # `fixed` pins the argument to True without exposing a widget for it.
    verbose=fixed(True),
)

interactive(children=(Dropdown(description='NHL_TEAM:', options=('OTT', 'CHI', 'EDM', 'SJS', 'BUF', 'NYR', 'PI…

<function __main__.get_plays_of_team_of_season(NHL_TEAM, SEASON, verbose=True)>

In [22]:
def count_na_coordinates_per_team_per_season():
    """Count, for every team and season, the plays that have no X coordinate.

    Iterates over the module-level ``NHL_TEAMS`` (with a tqdm progress bar) and
    ``SEASONS``, calling ``get_plays_of_team_of_season`` in non-verbose mode for
    each (team, season) pair.

    Returns:
        pd.DataFrame: indexed by season, one column per team; each cell holds
        the number of plays whose ``coordinateX`` is missing.
    """
    def _missing_count(team, season):
        # Number of plays of this team/season without an X coordinate.
        plays = get_plays_of_team_of_season(team, season, False)
        return int(plays["coordinateX"].isna().sum())

    counts_by_team = {
        team: pd.Series(
            [_missing_count(team, season) for season in SEASONS],
            index=SEASONS,
        )
        for team in tqdm(NHL_TEAMS, desc='Counting NA coordinates for each play per team per season')
    }

    return pd.DataFrame(counts_by_team, index=SEASONS)

In [23]:
import plotly.graph_objs as go

def plot_na_ccordinates_per_team_per_season(
    count_na_coordinates_per_team_per_season: pd.DataFrame
) -> None:
    """Show a plotly heatmap of the number of plays without coordinates.

    Args:
        count_na_coordinates_per_team_per_season: frame of counts. Its index
            is laid out on the y axis, its columns on the x axis, and its
            values drive the cell colours. Inside this function the parameter
            shadows the module-level function of the same name.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    counts = count_na_coordinates_per_team_per_season

    heatmap = go.Heatmap(
        z=counts.values,
        x=counts.columns,
        y=counts.index,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title='Number of plays without coordinates per season and per team',
        xaxis_nticks=36,
    )
    fig.show()

# Compute the per-team / per-season counts first, then render them as a heatmap.
na_counts = count_na_coordinates_per_team_per_season()
plot_na_ccordinates_per_team_per_season(na_counts)

Counting NA coordinates for each play per team per season:   0%|          | 0/31 [00:00<?, ?it/s]

In [8]:
# Plays of the 2017 season involving OTT (the call also prints its summary).
ott_2017_plays = get_plays_of_team_of_season('OTT', 2017, verbose=True)

# A bare `df.shape` in the middle of a cell is evaluated and thrown away, so the
# size before cleaning is printed explicitly instead.
print(f"Shape before dropping incomplete rows: {ott_2017_plays.shape}")

# dropna() returns a new frame rather than mutating in place, so re-running this
# cell always starts from the same input. `df` keeps its previous meaning: the
# plays with no missing value in any of the selected columns.
df = ott_2017_plays.dropna()
df.shape

Nombre de plays ou l'equipe OTT est impliquee : (24898, 17)
Nombre de plays ou l'equipe OTT est impliquee pour la saison 2017 : (5286, 5)
Nombre de plays sans coordonnees : 
            (5, 5) 
                    period    shotType eventType  coordinateX  coordinateY
207001       5  Wrist Shot      GOAL          NaN          NaN
207002       5  Wrist Shot      SHOT          NaN          NaN
207003       5    Backhand      SHOT          NaN          NaN
207004       5  Wrist Shot      SHOT          NaN          NaN
207005       5  Wrist Shot      GOAL          NaN          NaN
        


(5281, 5)

# Taux de tir moyen par heure de la ligue, par emplacement

In [24]:
# Whether an event happened on a power play or while short-handed can be ignored.
# Every game is assumed to last 60 minutes.

# Pair the X/Y coordinates into one (x, y) tuple per event so that a location
# can be used directly as a group key. This adds a column to `data` itself.
data['coordinates'] = list(
    data[['coordinateX', 'coordinateY']].itertuples(index=False, name=None)
)

# Number of events recorded at each location over the whole dataset.
# NOTE(review): despite its name, this is a raw count — nothing here divides it
# by a number of games or hours.
hourly_shot_rate_per_position = data.groupby('coordinates')['gameId'].count()


In [28]:
# Count events per (location, game), spread the games across columns, then add
# the per-game counts back together to get one total per location, largest first.
# The unstack step leaves NaN wherever a game has no event at a location, which
# is why the totals come out as floats.
per_location_and_game = data.groupby(['coordinates', 'gameId'])['season'].count()
per_game_matrix = per_location_and_game.unstack(level=1)
per_game_matrix.sum(axis=1).sort_values(ascending=False)

coordinates
(79.0, 1.0)      405.0
(-79.0, -1.0)    386.0
(-79.0, 1.0)     384.0
(78.0, 1.0)      381.0
(-79.0, 0.0)     381.0
                 ...  
(6.0, 39.0)        1.0
(6.0, 21.0)        1.0
(6.0, 20.0)        1.0
(6.0, 15.0)        1.0
(99.0, 3.0)        1.0
Length: 15101, dtype: float64

# Regroupez les tirs par équipe et utilisez les moyennes de ligue du taux de tir moyen par
heure calculées ci-dessus pour calculer la différence du taux de tir par heure pour
chaque équipe avec la moyenne. Vous pouvez choisir de représenter cela soit comme
une différence brute de buts entre les équipes, soit comme un pourcentage.

In [16]:
# Number of rows (with a non-null season) recorded in games where each team was the away team.
data.groupby('awayTeam')['season'].count()

awayTeam
ANA    11924
ARI    12008
BOS    13713
BUF    11869
CAR    12486
CBJ    12672
CGY    12366
CHI    12408
COL    12878
DAL    12652
DET    11595
EDM    12143
FLA    12250
LAK    11735
MIN    12263
MTL    12921
NJD    12214
NSH    13496
NYI    13059
NYR    12380
OTT    12284
PHI    11944
PIT    13082
SJS    12609
STL    12850
TBL    13711
TOR    13252
VAN    12204
VGK    11012
WPG    12700
WSH    13149
Name: season, dtype: int64