In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory the cleaned CSV files are read from (presumably one file per team).
# The path is relative, so it resolves against the working directory in effect
# when the files are listed and read.
cleaned_data_dir = "./data/cleaned/"

In [3]:
cd ..

/home/jake/Projects/mila/6758/hockey


In [4]:
# os.listdir returns entries in arbitrary order; sorting the listing makes the
# row order of the concatenated frame reproducible across machines and runs.
team_files = sorted(os.listdir(cleaned_data_dir))

# Read every file in the directory and stack them into a single frame.
plays = pd.concat(
    [pd.read_csv(os.path.join(cleaned_data_dir, team_file)) for team_file in team_files]
)
# "Unnamed: 0" is presumably the index column written out when the CSVs were saved.
plays = plays.drop(columns="Unnamed: 0")
plays.shape

(385076, 29)

In [5]:
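# Debug aid (disabled): uncomment to iterate on a 2000-row sample while keeping
# the full data available in `plays_full`.
# plays_full = plays.copy()
# plays = plays.sample(2000)  # TODO: delete this cell before finalising the notebook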

In [6]:
# Both fields are read from fixed positions of the game id's text form:
# characters 0-3 give the season, characters 4-5 a numeric code for the game type.
game_id_text = plays['game_id'].astype(str)
plays['season'] = game_id_text.str[:4].astype('int64')
plays['subseason'] = (
    game_id_text.str[4:6]
    .astype('int64')
    .replace({2: 'regular', 3: 'playoffs'})  # any other code is left as its integer
)

In [7]:
# Use the shorter name `team` for the shooter's team in everything that follows.
plays = plays.rename(columns={'shooter_team_name': 'team'})

In [8]:
def game_length(events):
    """Estimate a game's length in minutes from a table of its events.

    The row with the largest ``event_index`` is treated as the final event.
    If that event is in period 3 or earlier the game counts as 60 minutes;
    otherwise the length is 20 minutes for every completed period plus the
    clock reading of the final event.

    Parameters
    ----------
    events : pandas.DataFrame
        Must have ``event_index``, ``time`` and ``period`` columns. ``time`` is
        read as two minute digits, one separator character, then seconds
        (e.g. ``"03:30"``); presumably it is time elapsed in the period —
        TODO confirm.

    Returns
    -------
    int or float
        ``60`` for games whose last event is in period <= 3, otherwise a float.

    Notes
    -----
    NOTE(review): every period before the last one is counted as 20 minutes.
    The summary statistics in this notebook sit at exactly 80.0 for the 95th
    and 99th percentiles, which suggests period-5 events with a 00:00 clock
    (possibly shootouts) are being counted as four full periods — confirm
    whether that is intended.
    """
    final_row = events.sort_values('event_index', ascending=False).iloc[0]
    clock, period = final_row['time'], final_row['period']

    # "MM:SS" -> fractional minutes into the final period
    minutes_into_period = int(clock[:2]) + int(clock[3:]) / 60

    if period <= 3:
        return 60
    return (period - 1) * 20 + minutes_into_period

In [9]:
# One length per (team, season, subseason, game).
# NOTE(review): each group holds only that team's rows, so the "last event" is the
# team's last recorded event in the game, not necessarily the game's final event —
# confirm this approximation is acceptable for overtime games.
team_game_lengths = (
    plays
    .groupby(['team', 'season', 'subseason', 'game_id'])
    .apply(game_length)
)

In [10]:
# Sanity check on the distribution of game lengths, focusing on the upper tail.
team_game_lengths.describe(percentiles=[0.90, 0.95, 0.99])

count    12300.000000
mean        61.842359
std          5.553643
min         60.000000
50%         60.000000
90%         63.668333
95%         80.000000
99%         80.000000
max        150.450000
dtype: float64

In [11]:
# Total minutes per team, season and subseason (sum of the per-game lengths).
team_minutes = (
    team_game_lengths
    .groupby(['team', 'season', 'subseason'])
    .sum()
    .rename('Minutes')
)

In [12]:
# Number of rows in `plays` at each (x, y) location, per team / season / subseason.
team_shots = (
    plays
    .groupby(['team', 'season', 'subseason', 'coordinate_x', 'coordinate_y'])
    .size()
    .rename('Shots')
)

In [13]:
# Every (team, season) pair present in the data, in a stable order.
teams = plays[['team', 'season']].drop_duplicates().sort_values(['team', 'season'])
subseason = pd.DataFrame({'subseason': ['regular', 'playoffs']})
coordinates_x = pd.DataFrame({'coordinate_x': list(range(25, 101))})  # 25..100: only consider offensive zone
coordinates_y = pd.DataFrame({'coordinate_y': list(range(-42, 43))})  # -42..42

# Full grid: one row per team-season x subseason x integer (x, y) location, so
# that locations with no recorded shot can later be filled in explicitly.
team_shots_spine = (
    teams
    .merge(subseason, how='cross')
    .merge(coordinates_x, how='cross')
    .merge(coordinates_y, how='cross')
)

# The cross joins must yield exactly the product of the four dimension sizes.
assert len(team_shots_spine) == (
    len(teams) * len(subseason) * len(coordinates_x) * len(coordinates_y)
)

In [14]:
# Index the grid by the same keys as `team_shots` so the two can be joined.
# NOTE: this rebinds `team_shots_spine`; running the cell a second time fails
# because the key columns have already been moved into the index.
team_shots_spine = team_shots_spine.set_index(
    ['team', 'season', 'subseason', 'coordinate_x', 'coordinate_y']
)

In [15]:
# Left-join the counts onto the full grid; grid cells with no match get 0, and
# counts at locations outside the grid are dropped.
# NOTE: `team_shots` is rebound from a Series to a DataFrame here.
team_shots = (
    team_shots_spine
    .join(team_shots, how='left')
    .fillna(0)
)

In [16]:
# Attach each team's total minutes and convert the counts to a per-60-minute rate.
team_shots_per_hour = (
    team_shots
    .join(team_minutes)
    .assign(shots_per_hour=lambda frame: frame['Shots'] / (frame['Minutes'] / 60))
)
team_shots_per_hour.shape

(1989680, 3)

In [17]:
# League-wide rate per location: pool shots and minutes over all teams (every
# index level except the first, `team`) and divide. Only the rate column is kept.
league_shots_per_hour = (
    team_shots_per_hour
    .groupby(team_shots_per_hour.index.names[1:])[['Shots', 'Minutes']]
    .sum()
    .assign(league_shots_per_hour=lambda totals: totals['Shots'] / (totals['Minutes'] / 60))
    [['league_shots_per_hour']]
)

In [18]:
# Put the league rate next to each team's rate, matching on the league frame's
# index levels. Row order of `team_shots_per_hour` is preserved.
team_shots_per_hour = team_shots_per_hour.join(
    league_shots_per_hour,
    on=league_shots_per_hour.index.names,
)

In [19]:
# team_shots_per_hour['x_bin'] = team_shots_per_hour.index
def round_to_nearest_x(arr, x, offset):
    """Map values onto bins of width ``x`` and return each bin's midpoint.

    Bins start at ``offset`` and repeat every ``x`` units. A value is floored
    to the start of its bin, then shifted by ``(x - 1) / 2`` so the label sits
    in the middle of the ``x`` consecutive integers the bin covers.

    Parameters
    ----------
    arr : number or array-like supporting arithmetic
        Values to bin.
    x : int
        Bin width.
    offset : int
        Start of one of the bins.

    Returns
    -------
    Same kind of object as ``arr``, holding floats (true division is used).
    """
    bin_start = x * ((arr - offset) // x) + offset
    return bin_start + (x - 1) / 2

# Group the integer coordinates into 3-wide bins labelled by the bin midpoint.
#
# The edge labels are then pulled back onto the original coordinate range: the
# last x bin would be labelled 101 and the first y bin -43, both outside the grid.
#
# The corrected column is assigned back rather than using
# `df[col].replace(..., inplace=True)`: that form is chained assignment through
# an inplace method, which warns in recent pandas and does not modify the frame
# under Copy-on-Write.
team_shots_per_hour['x_bin'] = round_to_nearest_x(
    team_shots_per_hour.index.get_level_values('coordinate_x'), 3, 1
)
team_shots_per_hour['x_bin'] = team_shots_per_hour['x_bin'].replace({101: 100})  # smooth edge to original value

team_shots_per_hour['y_bin'] = round_to_nearest_x(
    team_shots_per_hour.index.get_level_values('coordinate_y'), 3, 1
)
team_shots_per_hour['y_bin'] = team_shots_per_hour['y_bin'].replace({-43: -42})  # smooth edge to original value

In [20]:
# Collapse the per-coordinate rates into the (x_bin, y_bin) cells by summing both
# the team rate and the league rate within each cell.
team_shots_per_hour_binned = (
    team_shots_per_hour
    .reset_index()
    .groupby(['team', 'season', 'subseason', 'x_bin', 'y_bin'])[['shots_per_hour', 'league_shots_per_hour']]
    .sum()
)

In [21]:
# Preview only the cells whose team rate is non-zero.
team_shots_per_hour_binned.loc[team_shots_per_hour_binned['shots_per_hour'] != 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,shots_per_hour,league_shots_per_hour
team,season,subseason,x_bin,y_bin,Unnamed: 5_level_1,Unnamed: 6_level_1
Anaheim Ducks,2016,playoffs,26.0,-19.0,0.056364,0.010920
Anaheim Ducks,2016,playoffs,29.0,-28.0,0.056364,0.010920
Anaheim Ducks,2016,playoffs,29.0,-10.0,0.056364,0.005460
Anaheim Ducks,2016,playoffs,29.0,11.0,0.056364,0.010920
Anaheim Ducks,2016,playoffs,29.0,14.0,0.056364,0.016381
...,...,...,...,...,...,...
Winnipeg Jets,2020,regular,92.0,23.0,0.017591,0.003358
Winnipeg Jets,2020,regular,92.0,29.0,0.017591,0.002798
Winnipeg Jets,2020,regular,92.0,32.0,0.017591,0.002238
Winnipeg Jets,2020,regular,95.0,8.0,0.017591,0.002798


In [3]:
# Team rate minus league rate, per bin. Both operands come from the binned frame
# so they share the same (team, season, subseason, x_bin, y_bin) index; the
# unbinned `team_shots_per_hour` is indexed by raw coordinates and does not align.
team_shots_per_hour_binned['relative_shots_per_hour'] = (
    team_shots_per_hour_binned['shots_per_hour']
    - team_shots_per_hour_binned['league_shots_per_hour']
)

NameError: name 'team_shots_per_hour_binned' is not defined

In [2]:
# Write the binned rates to disk. The path is relative to the current working
# directory; the target folder is created first so the write does not fail on a
# fresh checkout.
output_path = "./data/advanced_viz/shots_per_hour.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
team_shots_per_hour_binned.to_csv(output_path)

NameError: name 'team_shots_per_hour_binned' is not defined