# Exploratory Data Analysis

In [67]:
import pandas as pd

from typing import List, Set
from pathlib import Path

DATA_PATH = Path("../data/")

In [55]:
df = pd.read_csv(DATA_PATH.joinpath("raw/skaters.csv"))

In [69]:
MTL_PLAYERS = ('Alex Belzile',
 'Alexander Romanov',
 'Artturi Lehkonen',
 'Ben Chiarot',
 'Brendan Gallagher',
 'Brett Kulak',
 'Cedric Paquette',
 'Chris Wideman',
 'Christian Dvorak',
 'Cole Caufield',
 'David Savard',
 'Jake Evans',
 'Jeff Petry',
 'Jesperi Kotkaniemi',
 'Jesse Ylonen',
 'Joel Armia',
 'Joel Edmundson',
 'Jonathan Drouin',
 'Josh Anderson',
 'Mathieu Perreault',
 'Mike Hoffman',
 'Nick Suzuki',
 'Otto Leskinen',
 'Paul Byron',
 'Tyler Toffoli',
 'Xavier Ouellet')

def make_dataset() -> pd.DataFrame:
    """
    Make the dataset for the last 5 years.
    """
    years = range(2016, 2021)
    dataset = pd.DataFrame()
    for year in years:
        print(f"Data year: {year}")
        df = pd.read_csv(DATA_PATH.joinpath(f"raw/skaters_{year}.csv"))
        dataset = dataset.append(get_new_team(df, players=MTL_PLAYERS))
    return dataset


In [70]:
df_merge = make_dataset()

Data year: 2016
Data year: 2017
Data year: 2018
Data year: 2019
Data year: 2020


In [72]:
df_merge['name'].unique()

array(['Chris Wideman', 'Brendan Gallagher', 'Brett Kulak',
       'Tyler Toffoli', 'Ben Chiarot', 'Josh Anderson', 'Cedric Paquette',
       'Joel Edmundson', 'Jeff Petry', 'Christian Dvorak',
       'Artturi Lehkonen', 'Mike Hoffman', 'David Savard', 'Joel Armia',
       'Mathieu Perreault', 'Jonathan Drouin', 'Xavier Ouellet',
       'Paul Byron', 'Jesperi Kotkaniemi', 'Otto Leskinen', 'Jake Evans',
       'Nick Suzuki', 'Alexander Romanov', 'Cole Caufield',
       'Alex Belzile', 'Jesse Ylonen'], dtype=object)

In [74]:
df_merge.season.unique()

array([2016, 2017, 2018, 2019, 2020])

In [68]:
def get_new_team(skaters: pd.DataFrame, players: Set[str]) -> pd.DataFrame:
    """
    Get the team for 2021-22.
    """
    new_team = skaters['name'].apply(lambda x: x in players)
    mtl = skaters.loc[new_team, :]
    return mtl

In [52]:
mtl2122 = get_new_team(skaters=df, substract=substract, add=add)

In [64]:
mtl2122.name.unique()

array(['Alexander Romanov', 'Joel Armia', 'Jesperi Kotkaniemi',
       'Nick Suzuki', 'Cole Caufield', 'Xavier Ouellet', 'Alex Belzile',
       'Josh Anderson', 'Jake Evans', 'Jonathan Drouin', 'Paul Byron',
       'Brendan Gallagher', 'Otto Leskinen', 'Brett Kulak',
       'Jesse Ylonen', 'Tyler Toffoli', 'Artturi Lehkonen', 'Jeff Petry',
       'Ben Chiarot', 'Joel Edmundson', 'Mike Hoffman',
       'Mathieu Perreault', 'David Savard', 'Cedric Paquette'],
      dtype=object)

In [47]:
new = df['name'].apply(lambda x: x in add).to_list()

In [49]:
pd.concat([mtl, df.loc[new, :]])

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
230,8476479,2020,Phillip Danault,MTL,C,other,53,2318.0,48.0,21.58,...,6.40,9.15,75.0,67.0,0.04,0.40,2.0,1.0,1.0,1.0
231,8476479,2020,Phillip Danault,MTL,C,all,53,53682.0,1261.0,36.26,...,107.45,107.54,2232.0,1914.0,0.00,0.00,0.0,0.0,0.0,0.0
232,8476479,2020,Phillip Danault,MTL,C,5on5,53,43597.0,1004.0,36.26,...,77.02,72.24,1772.0,1573.0,1.52,1.58,43.0,30.0,30.0,26.0
233,8476479,2020,Phillip Danault,MTL,C,4on5,53,6705.0,166.0,36.12,...,3.37,16.83,52.0,202.0,0.18,0.38,1.0,4.0,1.0,4.0
234,8476479,2020,Phillip Danault,MTL,C,5on4,53,1062.0,43.0,19.24,...,9.39,1.82,192.0,31.0,0.00,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3775,8476975,2020,Cedric Paquette,CAR,C,other,47,591.0,12.0,5.71,...,11.33,4.99,45.0,56.0,0.00,0.00,0.0,0.0,0.0,0.0
3776,8476975,2020,Cedric Paquette,CAR,C,all,47,26357.0,651.0,7.38,...,130.82,105.10,2364.0,2057.0,0.00,0.00,0.0,0.0,0.0,0.0
3777,8476975,2020,Cedric Paquette,CAR,C,5on5,47,23943.0,583.0,7.38,...,81.90,72.49,1783.0,1603.0,0.93,0.39,21.0,16.0,17.0,14.0
3778,8476975,2020,Cedric Paquette,CAR,C,4on5,47,1787.0,53.0,-0.66,...,3.36,10.74,40.0,184.0,0.00,0.00,0.0,0.0,0.0,0.0
