# Data Extraction; Montreal Canadiens Games

This notebook extracts the montreal canadiens home games from the NHL API from 2000 to 2020

## Table of Contents

1. Load data
2. Prepare data
3. Quick Analysis
4. Export data & Analysis

In [1]:
import pandas as pd
import numpy as np
import requests
from pandas.io.json import json_normalize
import datetime

pd.set_option('display.max_columns', None)

## 1 Load data 

In [2]:
def extract_gamestats(gameID):
    
    '''
    This function takes the gameID as an input and returns a dataframe that contains
    the teamSkaterStats info for the home/away teams for that specific game.
    '''
    
    r = requests.get(url='https://statsapi.web.nhl.com/api/v1/game/{}/boxscore'.format(gameID))
    data = r.json()
    df = pd.json_normalize(data = data['teams'])
    df = df.filter(regex='teamSkaterStats')
    df['gameID'] = gameID
    
    return df

In [3]:
start_date = datetime.date(2005, 1, 1)
end_date = datetime.date(2019, 12, 31)
delta = datetime.timedelta(days = 1)

data_list = []
# Example of a montreal canadiens game
# https://statsapi.web.nhl.com/api/v1/schedule?date=2001-01-12&teamId=8

while start_date <= end_date:
    r = requests.get(url='https://statsapi.web.nhl.com/api/v1/schedule?date=' + start_date.strftime("%Y-%m-%d") + '&teamId=8')
    data = r.json()
    
    df = pd.json_normalize(data = data['dates'], record_path='games', meta=['date'])
    
    # we want to skip days with no games
    if df.empty==False :
        pd_list = []
        for games in df['gamePk']:
            tmp = extract_gamestats(gameID=games)
            pd_list.append(tmp)
    
        pd_tmp = pd.concat(pd_list, sort='False', ignore_index=True)
        df = df.merge(pd_tmp, left_on='gamePk', right_on='gameID', how='left')
    
        data_list.append(df)
        
        print(start_date)
        
        start_date += delta
    
    else:
        start_date += delta

df = pd.concat(data_list, sort='False', ignore_index=True)

2005-09-18
2005-09-20
2005-09-22
2005-09-24
2005-09-27
2005-09-28
2005-10-01
2005-10-02
2005-10-05
2005-10-06
2005-10-08
2005-10-11
2005-10-12
2005-10-15
2005-10-18
2005-10-22
2005-10-25
2005-10-27
2005-10-29
2005-10-31
2005-11-01
2005-11-04
2005-11-05
2005-11-08
2005-11-10
2005-11-12
2005-11-15
2005-11-18
2005-11-19
2005-11-22
2005-11-25
2005-11-26
2005-11-29
2005-12-01
2005-12-03
2005-12-10
2005-12-13
2005-12-15
2005-12-17
2005-12-20
2005-12-23
2005-12-26
2005-12-28
2005-12-30
2005-12-31
2006-01-03
2006-01-05
2006-01-07
2006-01-11
2006-01-14
2006-01-16
2006-01-19
2006-01-21
2006-01-23
2006-01-25
2006-01-26
2006-01-28
2006-01-31
2006-02-02
2006-02-04
2006-02-05
2006-02-07
2006-02-09
2006-02-11
2006-02-28
2006-03-02
2006-03-04
2006-03-06
2006-03-07
2006-03-09
2006-03-11
2006-03-13
2006-03-16
2006-03-18
2006-03-20
2006-03-21
2006-03-23
2006-03-25
2006-03-26
2006-03-28
2006-03-30
2006-04-01
2006-04-04
2006-04-06
2006-04-08
2006-04-10
2006-04-12
2006-04-13
2006-04-15
2006-04-18
2006-04-22

2013-10-22
2013-10-24
2013-10-26
2013-10-28
2013-10-29
2013-11-01
2013-11-02
2013-11-05
2013-11-07
2013-11-10
2013-11-12
2013-11-15
2013-11-16
2013-11-19
2013-11-22
2013-11-23
2013-11-27
2013-11-29
2013-11-30
2013-12-02
2013-12-04
2013-12-05
2013-12-07
2013-12-10
2013-12-12
2013-12-14
2013-12-15
2013-12-17
2013-12-19
2013-12-21
2013-12-28
2013-12-29
2013-12-31
2014-01-02
2014-01-04
2014-01-06
2014-01-08
2014-01-11
2014-01-14
2014-01-16
2014-01-18
2014-01-22
2014-01-24
2014-01-25
2014-01-28
2014-01-30
2014-02-01
2014-02-02
2014-02-04
2014-02-06
2014-02-08
2014-02-26
2014-02-27
2014-03-01
2014-03-03
2014-03-05
2014-03-06
2014-03-08
2014-03-12
2014-03-15
2014-03-16
2014-03-18
2014-03-20
2014-03-22
2014-03-24
2014-03-25
2014-03-27
2014-03-29
2014-04-01
2014-04-04
2014-04-05
2014-04-09
2014-04-10
2014-04-12
2014-04-16
2014-04-18
2014-04-20
2014-04-22
2014-05-01
2014-05-03
2014-05-06
2014-05-08
2014-05-10
2014-05-12
2014-05-14
2014-05-17
2014-05-19
2014-05-22
2014-05-25
2014-05-27
2014-05-29

## 2 Prep data

In [4]:
#only select columns that we want
df = df.loc[df['teams.home.team.id'] == 8]
df_games = df[['date', 'gameDate', 'gameID', 'gameType', 'teams.home.team.id', 'teams.away.team.id', 'teams.home.score','teams.away.score']]


df_games['Mtl_win'] = (df_games['teams.home.score'] - df_games['teams.away.score'] > 0).astype('int')

df_games= df_games[['date', 'gameDate', 'gameType', 'Mtl_win']]

df_games.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,date,gameDate,gameType,Mtl_win
0,2005-09-18,2005-09-18T23:00:00Z,PR,1
1,2005-09-20,2005-09-20T23:30:00Z,PR,1
3,2005-09-24,2005-09-24T23:00:00Z,PR,1
4,2005-09-27,2005-09-27T23:30:00Z,PR,1
5,2005-09-28,2005-09-28T22:00:00Z,PR,0


In [5]:
df_games.describe(include = 'all')

Unnamed: 0,date,gameDate,gameType,Mtl_win
count,691,691,691,691.0
unique,691,691,3,
top,2014-01-28,2009-01-31T19:00:00Z,R,
freq,1,1,576,
mean,,,,0.539797
std,,,,0.498775
min,,,,0.0
25%,,,,0.0
50%,,,,1.0
75%,,,,1.0


## 4 Export Results

In [6]:
df_games.to_csv('ch_games.csv')