In [47]:
import requests
import pandas as pd
import json
import numpy
from pandas.io.json import json_normalize

In [48]:
#Get single game data point
def get_boxscore(game_id):
    endpoint = 'https://statsapi.web.nhl.com/api/v1/game/'+str(game_id)+'/boxscore'
    #create json object from GET call
    response = requests.get(endpoint).json()
    #extract away & home team game stats to df
    #create additional columns in df's for home/away, game id and win/loss
    away_stats = pd.DataFrame(response['teams']['away']['teamStats']['teamSkaterStats'],index=[0])
    away_stats['Team ID'] = response['teams']['away']['team']['id']
    away_stats['Home'] = 0
    away_stats['Game ID'] = game_id
    home_stats = pd.DataFrame(response['teams']['home']['teamStats']['teamSkaterStats'],index=[0])
    home_stats['Team ID'] = response['teams']['home']['team']['id']
    home_stats['Home'] = 1
    home_stats['Game ID'] = game_id
    if home_stats['goals'][0] > away_stats['goals'][0]:
        home_stats['Win'] = 1
        away_stats['Win'] = 0
    else:
        home_stats['Win'] = 0
        away_stats['Win'] = 1
    #concatenate dataframes
    box_score = pd.concat([home_stats,away_stats],ignore_index=True)
    return box_score

def get_teams():
    #Returns all teams info, will be used to give names to team id's from box scores
    #further info https://gitlab.com/dword4/nhlapi/blob/master/stats-api.md#teams
    endpoint = 'https://statsapi.web.nhl.com/api/v1/teams'
    #create json object from GET call
    response = requests.get(endpoint).json()
    response = pd.DataFrame(response['teams'])
    #cast id column as string
    response = response.astype({'id':'str'})
    #other info is available we only need ID and Name fields
    return response[['id','name']]

def box_score_scrape(year):
    #creates dynamic endpoints to return data on game id's sequentially
    #create df object with relevant column names
    box_score_data = pd.DataFrame(columns=['goals', 'pim', 'shots', 'powerPlayPercentage', 'powerPlayGoals',
       'powerPlayOpportunities', 'faceOffWinPercentage', 'blocked',
       'takeaways', 'giveaways', 'hits', 'Team ID', 'Home', 'Game ID', 'Win'])
    #Game Id information available: https://gitlab.com/dword4/nhlapi/blob/master/stats-api.md#game-ids
    #Number of games in season is 370 prior to 2017
    game_num =371
    if year < 2017:
        game_num = 370
    #scraping regular season only 
    season = '02'
    for x in range(1,game_num + 1):          
        game_id = str(year)+season+str(x).zfill(4)
        new_boxscore = get_boxscore(game_id)
        box_score_data = box_score_data.append([new_boxscore],ignore_index=True)
    #cast all relevant columns as integer, float and string
    box_score_data = box_score_data.astype({'powerPlayPercentage':'float64','faceOffWinPercentage':'float64',
                                         'goals':'int64', 'pim':'int64', 'shots':'int64', 'powerPlayGoals':'int64', 
                                         'powerPlayOpportunities':'int64', 'blocked':'int64', 'takeaways':'int64', 
                                         'giveaways':'int64', 'hits':'int64', 'Team ID':'str', 'Home':'int64', 
                                         'Game ID':'str', 'Win':'int64'})
    return box_score_data

#Get game feed
def get_feed_live(game_id):
    endpoint = 'https://statsapi.web.nhl.com/api/v1/game/'+str(game_id)+'/feed/live'
    #create json object from GET call
    response = requests.get(endpoint).json()
    #flatten json response
    play_result = json_normalize(response['liveData']['plays']['allPlays'])
    if 'result.eventTypeId' not in play_result:
        print('No events in '+str(game_id))
        return
    #retain required fields
    play_result = play_result[['result.eventTypeId','team.id']]
    #filter df to retun only shot related events
    play_result = play_result[play_result['result.eventTypeId'].str.contains('SHOT')]
    #add count column
    play_result['count']=1
    #pivot play_result to events in columns and aggregation of counts
    play_result = play_result.pivot_table(values='count',index = ['team.id'], columns = 'result.eventTypeId',aggfunc='sum')
    #reset index to flatten pivot table
    play_result.reset_index(inplace=True)
    #add game id column
    play_result['Game Id'] = response['gameData']['game']['pk']
    #drop BLOCKED_SHOT and SHOT as info already contained in box_score_data
    play_result.drop(['BLOCKED_SHOT','SHOT'],axis=1,inplace=True)
    return play_result

def feed_live_scrape(year):
    #creates dynamic endpoints to return data on game id's sequentially
    #create df object with relevant column names
    feed_live_data = pd.DataFrame(columns=['team.id','MISSED_SHOT','Game Id'])
    #Game Id information available: https://gitlab.com/dword4/nhlapi/blob/master/stats-api.md#game-ids
    #Number of games in season is 370 prior to 2017
    game_num = 371
    if year < 2017:
        game_num = 370
    #scraping regular season only 
    season = '02'
    #loop thorugh game id's, call get_feed_live function, append result to feed_live_data df
    for x in range(1,game_num + 1):    
        game_id = str(year)+season+str(x).zfill(4)
        new_feed_live = get_feed_live(game_id)
        #check if get_feed_live returned anything
        if not new_feed_live is None:
            feed_live_data = feed_live_data.append([new_feed_live],ignore_index=True)
            #replace team id NaN with 0
            feed_live_data['team.id'].fillna(0, inplace=True)
            #cast team id as int to remove decimal place
            feed_live_data = feed_live_data.astype({'team.id':'int64'})
            #cast Game Id and team.id columns as string
            feed_live_data = feed_live_data.astype({'team.id':'str','Game Id':'str'})
    return feed_live_data

Run API calls to extract required data from API

In [49]:
#create box_scores df with call box_score_scrape function 
box_scores = box_score_scrape(2017)

In [50]:
#create teams df 
teams = get_teams()

In [51]:
#create missed shots df
missed_shots = feed_live_scrape(2017)

No events in 2017020018
No events in 2017020034
No events in 2017020196
No events in 2017020199
No events in 2017020209
No events in 2017020262
No events in 2017020309
No events in 2017020318


In [52]:
#merge team df and box_score df to match team names
box_scores = pd.merge(box_scores, teams, how = 'inner', left_on = 'Team ID', right_on = 'id')
#Drop superfluous 'id' column
#box_scores.drop('id',axis=1,inplace=True)

In [53]:
#Drop superfluous 'id' column
box_scores.drop('id',axis=1,inplace=True)

In [54]:
#create df of blocked shots, invert home/away identifiers and merge with box score data 
shots_blocked = box_scores[['Game ID','Home','blocked']]
shots_blocked['Home']=shots_blocked['Home'].apply(lambda x: (x*-1)+1)
#rename blocked column to shot_blocked
shots_blocked = shots_blocked.rename(columns={'blocked':'shots_blocked'})
#merge shots_blocked df with box_score
box_scores = pd.merge(box_scores, shots_blocked, how = 'inner', left_on = ['Game ID','Home'], right_on = ['Game ID','Home'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [55]:
#merge missed shots df and box_score df 
box_scores = pd.merge(box_scores, missed_shots, how = 'left', left_on = ['Game ID','Team ID'], right_on = ['Game Id','team.id'])
#Drop dupicate columns
box_scores.drop(['Game Id','team.id'],axis=1,inplace=True)

In [69]:
#replace any NAs in MISSED_SHOT with 0
box_scores['MISSED_SHOT'].fillna(0,inplace=True)
#add total shots column
box_scores['Total Shots']=box_scores['MISSED_SHOT']+box_scores['shots_blocked']+box_scores['shots']

In [71]:
#write df to CSV
box_scores.to_csv('box_scores.csv')