### Extract Games

#### *Use nhl API https://github.com/dword4/nhlapi to extract games & results*
#### *Save results in Google Big Query*

In [43]:
import getpass
import pandas as pd
import requests  
from pandas.io.json import json_normalize
import datetime
import pandas_gbq
from google.oauth2 import service_account

pd.set_option('display.max_columns', None)

In [44]:
def get_keys():
    '''
    This function will return the path to Google Keys for each user.
    '''
    if getpass.getuser()=='antoinetl':
        credential_keys = '/Users/antoinetl/Documents/code/Google Keys/My First Project-4938b2ab0dc6.json'
    elif getpass.getuser()=='philippejacques':
        credential_keys = '/Users/philippejacques/Desktop/Projet/HockeyPrediction/Google Keys/rational-world-288611-dbe647d5aaf9.json'
    return credential_keys

In [45]:
# Connexion GBQ
credential_keys = get_keys()

# Connexion GBQ
credentials = service_account.Credentials.from_service_account_file(
    credential_keys,
    )
pandas_gbq.context.credentials = credentials
# https://pandas-gbq.readthedocs.io/en/latest/intro.html

In [46]:
def extract_gamestats(gameID):
    
    '''
    Takes the gameID as an input and returns a dataframe that contains
    the teamSkaterStats info for the home/away teams for that specific game.
    '''
    
    r = requests.get(url='https://statsapi.web.nhl.com/api/v1/game/{}/boxscore'.format(gameID))
    data = r.json()
    df = pd.json_normalize(data = data['teams'])
    df = df.filter(regex='teamSkaterStats')
    df['gameID'] = gameID
    
    return df

In [47]:
# select date for extraction (july will be a complete season)
start_date = datetime.date(2008, 6, 1)
end_date = datetime.date(2008, 7, 1)

delta = datetime.timedelta(days=1)

data_list = []

while start_date <= end_date:
    
    r = requests.get(url='https://statsapi.web.nhl.com/api/v1/schedule?date=' + start_date.strftime("%Y-%m-%d"))
    data = r.json()
    
    df = pd.json_normalize(data = data['dates'], record_path='games', meta=['date'])
    
    # On ajoute des statistiques au niveau des matchs
    # Le if sert a skipper les journees sans matchs
    if df.empty==False :
        pd_list = []
        for games in df['gamePk']:
            tmp = extract_gamestats(gameID=games)
            pd_list.append(tmp)

        pd_tmp = pd.concat(pd_list, sort='False', ignore_index=True)
        df = df.merge(pd_tmp, left_on='gamePk', right_on='gameID', how='left')

        data_list.append(df)
        
        start_date += delta
    else:
        start_date += delta
    
    if (start_date.day == 1):
        print(start_date)

2008-07-01


In [48]:
# Pour stack la version
df = pd.concat(data_list, sort='False', ignore_index=True)

In [49]:
# replace columns in dataframe because GCP does not support '.' in column indices
df.columns = df.columns.str.replace(r".", "_")

In [50]:
df.head()

Unnamed: 0,away_teamStats_teamSkaterStats_blocked,away_teamStats_teamSkaterStats_giveaways,away_teamStats_teamSkaterStats_goals,away_teamStats_teamSkaterStats_hits,away_teamStats_teamSkaterStats_pim,away_teamStats_teamSkaterStats_powerPlayGoals,away_teamStats_teamSkaterStats_powerPlayOpportunities,away_teamStats_teamSkaterStats_powerPlayPercentage,away_teamStats_teamSkaterStats_shots,away_teamStats_teamSkaterStats_takeaways,content_link,date,gameDate,gameID,gamePk,gameType,home_teamStats_teamSkaterStats_blocked,home_teamStats_teamSkaterStats_giveaways,home_teamStats_teamSkaterStats_goals,home_teamStats_teamSkaterStats_hits,home_teamStats_teamSkaterStats_pim,home_teamStats_teamSkaterStats_powerPlayGoals,home_teamStats_teamSkaterStats_powerPlayOpportunities,home_teamStats_teamSkaterStats_powerPlayPercentage,home_teamStats_teamSkaterStats_shots,home_teamStats_teamSkaterStats_takeaways,link,season,status_abstractGameState,status_codedGameState,status_detailedState,status_startTimeTBD,status_statusCode,teams_away_leagueRecord_losses,teams_away_leagueRecord_type,teams_away_leagueRecord_wins,teams_away_score,teams_away_team_id,teams_away_team_link,teams_away_team_name,teams_home_leagueRecord_losses,teams_home_leagueRecord_type,teams_home_leagueRecord_wins,teams_home_score,teams_home_team_id,teams_home_team_link,teams_home_team_name,venue_id,venue_link,venue_name
0,31,23,4,25,12,1.0,5.0,20.0,32,12,/api/v1/game/2007030415/content,2008-06-02,2008-06-03T00:00:00Z,2007030415,2007030415,P,12,25,3,44,14,1.0,5.0,20.0,58,15,/api/v1/game/2007030415/feed/live,20072008,Final,7,Final,False,7,5,league,14,4,5,/api/v1/teams/5,Pittsburgh Penguins,6,league,15,3,17,/api/v1/teams/17,Detroit Red Wings,5047.0,/api/v1/venues/5047,Joe Louis Arena
1,7,2,3,28,12,1.0,3.0,33.3,30,1,/api/v1/game/2007030416/content,2008-06-04,2008-06-05T00:00:00Z,2007030416,2007030416,P,14,14,2,37,8,2.0,5.0,40.0,22,3,/api/v1/game/2007030416/feed/live,20072008,Final,7,Final,False,7,6,league,16,3,17,/api/v1/teams/17,Detroit Red Wings,6,league,14,2,5,/api/v1/teams/5,Pittsburgh Penguins,,/api/v1/venues/null,Mellon Arena


In [11]:
# TODO: Set project_id to your Google Cloud Platform project ID.
project_id = "rational-world-288611"

# TODO: Set table_id to the full destination table ID (including the dataset ID).
table_id = 'My_dataset.schedule_and_results'

pandas_gbq.to_gbq(df, table_id, project_id=project_id, if_exists='replace')

1it [00:07,  7.92s/it]
