Data sourced from [moneypuck.com](https://moneypuck.com/data.htm)

# Packages

In [11]:
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Lift the column display limit so wide frames are shown without truncation.
pd.options.display.max_columns = None

# Read Data

In [12]:
# Gather the CSV files in sorted order. glob.glob() yields paths in an
# arbitrary, filesystem-dependent order, so without sorting the concatenated
# row order (and any seeded sample drawn from it) could differ between machines.
csv_files = sorted(glob.glob('./Data/*.csv'))
if not csv_files:
    # Fail with a clear message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files found matching './Data/*.csv'")

# Stack every file into a single frame with a fresh 0..n-1 row index, then
# key the rows by playerId (chained rather than mutated in place).
df: pd.DataFrame = pd.concat(
    [pd.read_csv(csv_file) for csv_file in csv_files],
    ignore_index=True
).set_index('playerId')

In [13]:
# Preview five randomly drawn rows; the fixed random_state makes the draw
# repeatable for the same input frame.
df.sample(random_state=20, n=5)

Unnamed: 0_level_0,season,name,team,position,situation,games_played,icetime,xGoals,goals,unblocked_shot_attempts,xRebounds,rebounds,xFreeze,freeze,xOnGoal,ongoal,xPlayStopped,playStopped,xPlayContinuedInZone,playContinuedInZone,xPlayContinuedOutsideZone,playContinuedOutsideZone,flurryAdjustedxGoals,lowDangerShots,mediumDangerShots,highDangerShots,lowDangerxGoals,mediumDangerxGoals,highDangerxGoals,lowDangerGoals,mediumDangerGoals,highDangerGoals,blocked_shot_attempts,penalityMinutes,penalties
playerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
8478872,2022,Karel Vejmelka,ARI,G,4on5,50,18257.0,48.25,48.0,585.0,29.78,37.0,69.06,59.0,334.57,312.0,0.92,0.0,197.74,177.0,103.92,132.0,43.06,302.0,100.0,58.0,8.81,12.11,27.33,8.0,15.0,25.0,125.0,0.0,0.0
8462044,2012,Jean-Sebastien Giguere,COL,G,5on5,18,45309.0,27.1,30.0,683.0,24.09,24.0,81.98,83.0,355.85,366.0,10.99,8.0,199.0,157.0,152.61,193.0,26.31,385.0,96.0,15.0,11.12,11.65,4.34,12.0,15.0,3.0,187.0,4.0,2.0
8480112,2020,Hunter Miska,COL,G,5on5,5,12722.0,6.32,12.0,158.0,5.79,9.0,21.2,21.0,83.07,86.0,3.72,8.0,46.85,29.0,32.94,44.0,6.15,89.0,25.0,2.0,2.49,3.19,0.64,6.0,6.0,0.0,42.0,0.0,0.0
8473541,2009,Jonathan Bernier,L.A,G,5on4,3,892.0,0.07,0.0,5.0,0.1,0.0,0.43,0.0,3.23,3.0,0.52,0.0,0.93,0.0,3.38,5.0,0.07,5.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8475660,2015,Cam Talbot,EDM,G,all,56,190595.0,142.79,137.0,3128.0,111.61,108.0,375.38,403.0,1626.97,1649.0,47.28,40.0,918.88,768.0,669.89,795.0,137.75,1729.0,417.0,125.0,50.53,49.59,42.68,51.0,43.0,43.0,857.0,0.0,0.0


# Feature Selection

- **TODO:** explore model-based feature selection.
- For now, I've chosen the features I think are important based on my own knowledge of the NHL and hockey.

In [14]:
# Narrow the frame to a hand-picked set of columns (no feature selection model
# has been applied yet).
df = df.loc[:, [
    # --- context ---
    'season',              # season year
    'name',                # goalie name
    'situation',           # type of play (5on5, 4on5, 4on4, etc.)
    'games_played',        # games played in the season
    # --- goals, rebounds and shots faced ---
    'xGoals',              # expected goals allowed
    'goals',               # actual goals allowed
    'xRebounds',           # expected rebounds allowed
    'rebounds',            # rebounds given up on shots on goal
    'ongoal',              # shots on goal faced
    # --- expected goals allowed, by shot danger ---
    'lowDangerxGoals',     # low danger shots
    'mediumDangerxGoals',  # medium danger shots
    'highDangerxGoals',    # high danger shots
    # --- actual goals allowed, by shot danger ---
    'lowDangerGoals',      # low danger shots
    'mediumDangerGoals',   # medium danger shots
    'highDangerGoals'      # high danger shots
]]

# Feature Engineering

In [15]:
# Keep only the rows whose situation is 'all', i.e. the aggregate stats across
# every situation (5on5, 5on4, 4on5, etc.); the per-situation rows are dropped.
df = df.loc[df['situation'].eq('all')]