# Milestone 2 Visualizations

## Dependencies

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


## Setup

In [None]:
cd ..

In [None]:
# Apply matplotlib's "ggplot" style sheet to the figures created after this cell.
plt.style.use("ggplot")


In [None]:
# x-coordinate of the blue line: shots at or beyond it are in the offensive zone.
BLUE_LINE_X = 25

shots = pd.read_csv("./data/processed/train_processed.csv")

# round for plotting purposes
shots["distance_from_net"] = shots["distance_from_net"].round()
shots["angle"] = shots["angle"].round()

# Build explicit boolean masks for the "normal" shot situations.
# `.eq(True)` always yields a real boolean Series (NaN == True is False), so a
# missing `is_empty_net` is treated as "not an empty net" without relying on
# `fillna` silently downcasting an object column to bool before `~` is applied.
in_offensive_zone = shots["coordinate_x"] >= BLUE_LINE_X
is_empty_net = shots["is_empty_net"].eq(True)
is_shootout = shots["period_type"] == "SHOOTOUT"

# offensive-zone shots only, excluding empty-net and shootout situations
shots_normal = shots[in_offensive_zone & ~is_empty_net & ~is_shootout]
shots_normal.shape


## Task 2: Feature Engineering I

### Q1: Shot Histograms

#### By Distance

In [None]:
# Number of shots for every (rounded distance, outcome) pair, one row per pair.
goal_rate_by_distance = (
    shots.groupby(["distance_from_net", "is_goal"])
    .size()
    .reset_index(name="Shots")
)

# Human-readable outcome labels for the plot legend.
# The result is assigned back instead of using `.replace(..., inplace=True)` on
# the selected column: that form is chained assignment, which warns in pandas 2.x
# and leaves the DataFrame untouched under Copy-on-Write.
goal_rate_by_distance["is_goal"] = goal_rate_by_distance["is_goal"].replace(
    {False: "No-goal", True: "Goal"}
)


In [None]:
# Shot counts per rounded distance, split into goals and non-goals.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=goal_rate_by_distance,
    x="distance_from_net",
    y="Shots",
    hue="is_goal",
    palette="pastel",
    edgecolor=".6",
    ax=ax,
)
ax.set_title("Goal Distribution by Distance to Net (Training Data)")

# Show only every 10th x tick label so the axis stays legible.
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % 10 == 0)


#### By Angle

In [None]:
# Number of shots for every (rounded angle, outcome) pair, one row per pair.
goal_rate_by_angle = (
    shots.groupby(["angle", "is_goal"])
    .size()
    .reset_index(name="Shots")
)

# Human-readable outcome labels for the plot legend.
# Assigned back rather than replaced "in place" on the selected column, because
# that chained form warns in pandas 2.x and does not modify the DataFrame under
# Copy-on-Write.
goal_rate_by_angle["is_goal"] = goal_rate_by_angle["is_goal"].replace(
    {False: "No-goal", True: "Goal"}
)


In [None]:
# Shot counts per rounded angle, split into goals and non-goals.
fig, ax = plt.subplots(figsize=(12, 6))
barplot_options = dict(palette="pastel", edgecolor=".6")
sns.barplot(x="angle", y="Shots", hue="is_goal", data=goal_rate_by_angle, ax=ax, **barplot_options)
ax.set_title("Goal Distribution by Angle to Net (Training Data)")

# Hide all x tick labels except every 10th one.
for idx, text in enumerate(ax.get_xticklabels()):
    keep = idx % 10 == 0
    text.set_visible(keep)


#### Joint-Plot

In [None]:
# Number of shots at each (rounded distance, rounded angle) combination.
pair_counts = shots.groupby(["distance_from_net", "angle"]).size()
shots_by_angle_and_distance = pair_counts.reset_index(name="Shots")


In [None]:
# Joint distribution of shot distance and angle, with marginal histograms.
#
# The individual shots are plotted, not the aggregated (distance, angle) count
# table: hexbin density counts *rows*, and the aggregated table has exactly one
# row per unique pair, so it would show where combinations exist rather than how
# many shots were taken there.
located_shots = shots.dropna(subset=["distance_from_net", "angle"])

with sns.axes_style("white"):
    # `jointplot` returns a JointGrid, not an Axes. No `palette` is passed
    # because it only takes effect together with `hue`.
    joint_grid = sns.jointplot(
        x="distance_from_net", y="angle", data=located_shots, kind="hex", edgecolor=".6"
    )


### Q2: Goal Rates

In [None]:
# Shot counts per rounded distance, with one column per outcome (False / True).
# `fill_value=0` records an outcome that never occurred at a distance as 0
# instead of NaN, so a distance with no goals gets a goal rate of 0 rather than
# a missing value.
outcome_counts_by_distance = (
    shots.groupby(["distance_from_net", "is_goal"])
    .size()
    .unstack("is_goal", fill_value=0)
)
total_shots_by_distance = outcome_counts_by_distance.sum(axis=1)

# Final table columns: distance_from_net, No-goals, Goals, total_shots, goal_rate.
goal_rate_by_distance = (
    outcome_counts_by_distance
    .assign(
        total_shots=total_shots_by_distance,
        goal_rate=outcome_counts_by_distance[True] / total_shots_by_distance,
    )
    .rename(columns={False: "No-goals", True: "Goals"})
    .rename_axis(columns=None)  # drop the leftover "is_goal" columns name
    .reset_index()
)


In [None]:
# Share of shots that became goals at each rounded distance.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=goal_rate_by_distance,
    x="distance_from_net",
    y="goal_rate",
    color="b",
    alpha=.33,
    edgecolor=".6",
    ax=ax,
)
ax.set_title("Goal Rate by Distance to Net (Training Data only)")

# Thin out the x axis: only every 10th tick label remains visible.
for tick_index, tick in enumerate(ax.get_xticklabels()):
    tick.set_visible(not tick_index % 10)


In [None]:
# Shot counts per rounded angle, with one column per outcome (False / True).
# `fill_value=0` records an outcome that never occurred at an angle as 0 instead
# of NaN, so an angle with no goals gets a goal rate of 0 rather than a missing
# value.
outcome_counts_by_angle = (
    shots.groupby(["angle", "is_goal"])
    .size()
    .unstack("is_goal", fill_value=0)
)
total_shots_by_angle = outcome_counts_by_angle.sum(axis=1)

# Final table columns: angle, No-goals, Goals, total_shots, goal_rate.
goal_rate_by_angle = (
    outcome_counts_by_angle
    .assign(
        total_shots=total_shots_by_angle,
        goal_rate=outcome_counts_by_angle[True] / total_shots_by_angle,
    )
    .rename(columns={False: "No-goals", True: "Goals"})
    .rename_axis(columns=None)  # drop the leftover "is_goal" columns name
    .reset_index()
)

In [None]:
# Share of shots that became goals at each rounded angle.
fig, ax = plt.subplots(figsize=(12, 6))
bar_style = {"color": "b", "alpha": .33, "edgecolor": ".6"}
sns.barplot(x="angle", y="goal_rate", data=goal_rate_by_angle, ax=ax, **bar_style)
ax.set_title("Goal Rate by Angle to Net (Training Data only)")

# Keep the label on every 10th tick and hide the rest.
for n, xtick in enumerate(ax.get_xticklabels()):
    xtick.set_visible(n % 10 == 0)


### Q3: Goal Histograms

In [None]:
# Number of goals for every (rounded distance, empty-net flag) pair.
# `groupby` drops rows whose key is NaN by default, so goals with a missing
# `is_empty_net` would silently vanish from the histogram. The flag is
# normalised first, treating missing as "not an empty net" (the same convention
# as the `fillna(False)` used when building `shots_normal`).
goals = shots[shots["is_goal"]]
goals_by_distance = (
    goals.assign(is_empty_net=goals["is_empty_net"].eq(True))
    .groupby(["distance_from_net", "is_empty_net"])
    .size()
    .reset_index(name="Goals")
)

In [None]:
# Goal counts per rounded distance, split by the empty-net flag.
fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    data=goals_by_distance,
    x="distance_from_net",
    y="Goals",
    hue="is_empty_net",
    palette="pastel",
    edgecolor=".6",
    ax=ax,
)
ax.set_title("Goal Distribution by Distance (Training Data only)")

# Label every 10th tick only.
for i, lbl in enumerate(ax.get_xticklabels()):
    lbl.set_visible(i % 10 == 0)

In [None]:
# Zoom in on long-range goals, split by the empty-net flag.
# A single threshold drives both the filter and the title so they cannot drift
# apart (previously the title said ">= 80ft" while the filter used "> 80").
min_distance_ft = 80
long_range_goals = goals_by_distance[goals_by_distance["distance_from_net"] >= min_distance_ft]

fig, ax = plt.subplots(figsize=(20, 6))
sns.barplot(
    x="distance_from_net", y="Goals", hue="is_empty_net",
    data=long_range_goals,
    palette="pastel", edgecolor=".6", ax=ax
)

# These are goal counts, not rates, so the title says "Goal Distribution".
ax.set_title(f"Goal Distribution by Distance to Net (>= {min_distance_ft}ft) (Training Data only)")

# Show only every 10th x tick label so the axis stays legible.
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % 10 == 0)

### Q4: Mislabeled Events

The majority of the goals returned by the following query are mislabeled.

```python
shots[(shots.distance_from_net >= 150) & (shots.is_goal) & (~shots.is_empty_net)]
```

One such example is Adam Cracknell's first period goal on February 21, 2016, against the Colorado Avalanche.

Its recorded `coordinate_x` places the shot at least 150 ft from the net (it matches the query above), but the goal was actually scored at the opposite end of the rink, within a few feet of the net [[nhl.com](https://www.nhl.com/video/cracknell-opens-the-scoring/t-278025682/c-41679503)].

In [23]:
# Display the full record stored at index label 52321: the Adam Cracknell goal
# cited above (see the `description` field in the output).
shots.loc[52321]

game_id                                                   2015020888.0
event_index                                                       47.0
secondary_type                                                Backhand
description          Adam Cracknell (5) Backhand, assists: Emerson ...
period                                                             1.0
period_type                                                    REGULAR
time                                                             08:31
time_remaining                                                   11:29
date                                              2016-02-22T03:21:59Z
goals_home                                                         1.0
goals_away                                                         0.0
shooter_team_code                                                  VAN
shooter_id                                                   8471490.0
goalie_name                                            Semyon Varlamov
goalie