Data sourced from [moneypuck.com](https://moneypuck.com/data.htm)

# Packages

In [61]:
import glob
import polars as pl
import matplotlib.pyplot as plt

# Read Data

In [62]:
# Load every CSV file found in ./Data and stack them into a single frame.
# glob.glob() returns paths in arbitrary, OS-dependent order, so the paths are
# sorted first: this keeps the row order of the combined frame (and therefore
# the seeded sample preview below) stable from run to run.
csv_files = sorted(glob.glob('./Data/*.csv'))
if not csv_files:
    # Fail early with an actionable message rather than an opaque concat error.
    raise FileNotFoundError("No CSV files found matching './Data/*.csv'")

df: pl.DataFrame = pl.concat(
    [pl.read_csv(csv_file) for csv_file in csv_files],
    how='vertical'
)

# Preview five randomly sampled rows (fixed seed).
df.sample(n=5, seed=20)

playerId,season,name,team,position,situation,games_played,icetime,xGoals,goals,unblocked_shot_attempts,xRebounds,rebounds,xFreeze,freeze,xOnGoal,ongoal,xPlayStopped,playStopped,xPlayContinuedInZone,playContinuedInZone,xPlayContinuedOutsideZone,playContinuedOutsideZone,flurryAdjustedxGoals,lowDangerShots,mediumDangerShots,highDangerShots,lowDangerxGoals,mediumDangerxGoals,highDangerxGoals,lowDangerGoals,mediumDangerGoals,highDangerGoals,blocked_shot_attempts,penalityMinutes,penalties
i64,i64,str,str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
8480373,2020,"""Josef Korenar""","""S.J""","""G""","""all""",10,29510.0,21.33,26.0,476.0,18.21,16.0,58.91,73.0,255.31,258.0,7.46,6.0,142.81,101.0,103.8,131.0,20.61,271.0,66.0,16.0,7.71,8.16,5.45,10.0,9.0,7.0,123.0,0.0,0.0
8467913,2009,"""Alex Auld""","""NYR""","""G""","""other""",24,2764.0,3.71,4.0,66.0,2.4,2.0,7.91,8.0,37.82,36.0,1.02,1.0,20.16,17.0,16.6,18.0,3.52,40.0,8.0,4.0,1.4,0.94,1.37,1.0,2.0,1.0,14.0,0.0,0.0
8481668,2024,"""Arturs Silovs""","""VAN""","""G""","""4on5""",10,2713.0,3.94,4.0,63.0,2.67,3.0,6.35,5.0,30.32,30.0,0.23,0.0,19.0,15.0,9.98,16.0,3.87,31.0,7.0,5.0,1.02,0.79,2.13,0.0,1.0,3.0,20.0,0.0,0.0
8475660,2019,"""Cam Talbot""","""CGY""","""G""","""other""",26,1792.0,2.24,3.0,47.0,1.82,1.0,5.32,7.0,27.11,32.0,0.6,0.0,17.03,17.0,10.8,11.0,2.17,33.0,3.0,3.0,1.07,0.33,0.85,2.0,0.0,1.0,8.0,0.0,0.0
8471239,2014,"""Cory Schneider""","""N.J""","""G""","""all""",69,235436.0,166.23,148.0,3436.0,128.56,111.0,433.73,487.0,1914.2,1982.0,51.2,51.0,1055.58,787.0,788.75,1038.0,160.49,1978.0,516.0,140.0,56.67,62.75,46.8,48.0,57.0,43.0,802.0,0.0,0.0


# Feature Selection

- I could explore feature selection models.
- For now, I've chosen the features I think are important based on my own knowledge of the NHL and hockey.

In [63]:
# Keep only the rows whose situation is 'all' (the aggregate stats across
# 5on5, 5on4, 4on5, etc.), then narrow the frame to the hand-picked columns.
# TODO: explore model-based feature selection instead of choosing by hand.
df = (
    df
    .filter(pl.col('situation') == 'all')
    .select([
        'season',              # year of the season played
        'name',                # goalie's name
        'games_played',        # games played in the season
        'xGoals',              # goals expected to be allowed
        'goals',               # goals actually allowed
        'xRebounds',           # rebounds expected to be allowed
        'rebounds',            # rebounds given up from shots on goal
        'ongoal',              # shots on goal faced
        'lowDangerxGoals',     # expected goals allowed from low danger shots
        'mediumDangerxGoals',  # expected goals allowed from medium danger shots
        'highDangerxGoals',    # expected goals allowed from high danger shots
        'lowDangerGoals',      # actual goals allowed from low danger shots
        'mediumDangerGoals',   # actual goals allowed from medium danger shots
        'highDangerGoals'      # actual goals allowed from high danger shots
    ])
)

# Preview five randomly sampled rows (fixed seed) of the reduced frame.
df.sample(n=5, seed=20)

season,name,games_played,xGoals,goals,xRebounds,rebounds,ongoal,lowDangerxGoals,mediumDangerxGoals,highDangerxGoals,lowDangerGoals,mediumDangerGoals,highDangerGoals
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2020,"""Sergei Bobrovsky""",31,79.88,88.0,60.09,85.0,933.0,25.58,28.56,25.73,25.0,33.0,30.0
2009,"""Steve Mason""",57,127.82,163.0,104.78,88.0,1614.0,46.36,42.75,38.7,59.0,59.0,45.0
2024,"""Connor Hellebuyck""",60,155.87,120.0,119.89,195.0,1576.0,51.63,51.25,52.99,50.0,33.0,37.0
2019,"""Elvis Merzlikins""",32,68.84,71.0,58.71,57.0,919.0,29.16,25.36,14.33,32.0,27.0,12.0
2014,"""Cory Schneider""",69,166.23,148.0,128.56,111.0,1982.0,56.67,62.75,46.8,48.0,57.0,43.0


# Feature Engineering

In [64]:
# Summary statistics for every selected column (count, null count, mean, std,
# min, quartiles, max) to get a feel for value ranges before engineering features.
df.describe()

statistic,season,name,games_played,xGoals,goals,xRebounds,rebounds,ongoal,lowDangerxGoals,mediumDangerxGoals,highDangerxGoals,lowDangerGoals,mediumDangerGoals,highDangerGoals
str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1598.0,"""1598""",1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0,1598.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",2016.295369,,27.143304,68.6452,68.976846,52.602522,58.82791,765.296621,23.096902,24.473191,21.074975,23.997497,24.660826,20.318523
"""std""",4.894797,,20.523826,54.021054,52.297402,41.420043,48.698128,604.477073,18.16468,19.10398,17.369629,18.498625,18.88664,16.596219
"""min""",2008.0,"""Aaron Dell""",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",2012.0,,7.0,17.51,20.0,13.04,14.0,191.0,5.75,6.36,5.1,7.0,7.0,5.0
"""50%""",2016.0,,25.0,59.4,62.0,46.35,50.0,669.0,20.52,21.83,17.85,22.0,22.0,17.0
"""75%""",2021.0,,43.0,108.82,110.0,83.48,92.0,1207.0,36.45,39.3,33.84,38.0,39.0,33.0
"""max""",2024.0,"""Zane McIntyre""",77.0,217.67,209.0,168.82,222.0,2375.0,75.81,83.57,78.93,76.0,91.0,75.0
