In [5]:
import urllib3
import json
import itertools

import pandas as pd
import numpy as np

In [6]:
def get_game_data(gameid=1):
    
    '''Retrieving data and creating dataframe of the home and 
    away teams and their score results.
    :param gameid: The game number to retrieve data from
    '''
    
    game_url=f'https://statsapi.web.nhl.com/api/v1/game/201702{str(gameid).zfill(4)}/feed/live'
    http = urllib3.PoolManager()
    r = http.request('GET', game_url)
    data=json.loads(r.data)
    
    results=data['liveData']['plays']['currentPlay']['about']['goals']
    
    teams={'home_team' : data['gameData']['teams']['home']['name'],'away_team': data['gameData']['teams']['away']['name']}
    
    time={'time':data['metaData']['timeStamp']}
    
    #dictionary unpacking
    row={**results,**teams,**time}
    
    return pd.DataFrame(row,index=[row["time"]])


In [3]:
game1 = get_game_data(1)

game1.head()

Unnamed: 0,away,home,home_team,away_team,time
20171006_173713,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713


In [4]:
game_results=pd.concat([get_game_data(x) for x in range(1,1271)])

game_results.to_csv("game_results.csv")

game_results.head()

MaxRetryError: HTTPSConnectionPool(host='statsapi.web.nhl.com', port=443): Max retries exceeded with url: /api/v1/game/2017020725/feed/live (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB753F7220>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [7]:
game_results=pd.read_csv("game_results.csv")

In [8]:
game_results['outcome_categorical'] = "away"
game_results.loc[(game_results['away']<game_results['home']),"outcome_categorical"]="home"
game_results.head()

Unnamed: 0.1,Unnamed: 0,away,home,home_team,away_team,time,outcome_categorical
0,20171006_173713,7,2,Winnipeg Jets,Toronto Maple Leafs,20171006_173713,away
1,20171008_080732,5,4,Pittsburgh Penguins,St. Louis Blues,20171008_080732,away
2,20171008_080734,0,3,Edmonton Oilers,Calgary Flames,20171008_080734,home
3,20171008_160734,5,3,San Jose Sharks,Philadelphia Flyers,20171008_160734,away
4,20171009_153739,3,4,Boston Bruins,Nashville Predators,20171009_153739,home


In [9]:
salary=pd.read_html("https://www.capfriendly.com/archive/2017")[0]

salary["FINAL CAP HIT"]=salary["FINAL CAP HIT"].str.replace(',','').str.replace('$','').astype(int)

salary.to_csv("salary.csv",index=False)
salary.head()

  salary["FINAL CAP HIT"]=salary["FINAL CAP HIT"].str.replace(',','').str.replace('$','').astype(int)


Unnamed: 0,TEAM,FINAL CAP HIT,LTIR USED,FINAL CAP SPACE
0,Detroit Red Wings,78458260,"$5,458,260",$0
1,Pittsburgh Penguins,77649912,"$4,649,912",$0
2,Anaheim Ducks,76957176,"$3,957,176",$0
3,Arizona Coyotes,75286038,"$2,286,038",$0
4,Calgary Flames,73824956,"$824,956",$0


In [10]:
set(game_results["home_team"].unique()) - set(salary["TEAM"])

{'Montréal Canadiens', 'Vegas Golden Knights'}

In [11]:
salary["TEAM"] = salary["TEAM"].replace("Montreal Canadiens","Montréal Canadiens")


In [12]:
salary[salary["TEAM"] == "Vegas Golden Knights"]

Unnamed: 0,TEAM,FINAL CAP HIT,LTIR USED,FINAL CAP SPACE


In [13]:
salary=salary.set_index("TEAM")
salary=salary["FINAL CAP HIT"]

salary.loc['Vegas Golden Knights'] = np.nan
salary.head()

TEAM
Detroit Red Wings      78458260.0
Pittsburgh Penguins    77649912.0
Anaheim Ducks          76957176.0
Arizona Coyotes        75286038.0
Calgary Flames         73824956.0
Name: FINAL CAP HIT, dtype: float64

In [None]:
def team_standings(season="20162017"):
    
    game_url = f"https://statsapi.web.nhl.com/api/v1/standings?season={season}"
    http = urllib3.PoolManager()
    r = http.request('GET',game_url)
    data=json.loads(r.data)
    
    #JSON data has a record element for divisions and then lists the team
    #records inside of that, so we need to do a nested iteration
    df_standings=pd.DataFrame()
    for record in data['records']:
        for team_record in record['teamRecords']:
            
            df_standings=df_standings.append(pd.json_normalize(team_record))
    
    return df_standings

previous_season_standings=team_standings()

previous_season_standings.to_csv("previous_season_standings.csv",index=False)

previous_season_standings.head()

In [14]:
previous_season_standings=pd.read_csv('previous_season_standings.csv')

In [15]:
df_cum = pd.DataFrame()
df_cum.loc['won', list(game_results["home_team"].unique())]=0
df_cum.loc['lost', list(game_results["away_team"].unique())]=0
df_cum.head()

Unnamed: 0,Winnipeg Jets,Pittsburgh Penguins,Edmonton Oilers,San Jose Sharks,Boston Bruins,Buffalo Sabres,New York Rangers,Ottawa Senators,Detroit Red Wings,Chicago Blackhawks,...,St. Louis Blues,Arizona Coyotes,Calgary Flames,Vancouver Canucks,Montréal Canadiens,Nashville Predators,Vegas Golden Knights,Colorado Avalanche,Philadelphia Flyers,Minnesota Wild
won,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_cum = df_cum.unstack()


In [17]:
print(df_cum)

Winnipeg Jets        won     0.0
                     lost    0.0
Pittsburgh Penguins  won     0.0
                     lost    0.0
Edmonton Oilers      won     0.0
                            ... 
Colorado Avalanche   lost    0.0
Philadelphia Flyers  won     0.0
                     lost    0.0
Minnesota Wild       won     0.0
                     lost    0.0
Length: 62, dtype: float64


In [18]:
df_cum=pd.DataFrame(df_cum,columns=['time']).T
df_cum.head()

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Nashville Predators,Nashville Predators,Vegas Golden Knights,Vegas Golden Knights,Colorado Avalanche,Colorado Avalanche,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
for idx,row in game_results.iterrows():
    if row["away"]>row["home"]:
        winner=row["away_team"]
        loser=row["home_team"]
    elif row["away"]<row["home"]:
        winner=row["home_team"]
        loser=row["away_team"]
    
    df_cum.loc[idx, (winner,"won")]=df_cum[(winner,"won")].max()+1
    df_cum.loc[idx, (loser,"lost")]=df_cum[(loser,"lost")].max()+1

In [20]:
df_cum.head(100)

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Nashville Predators,Nashville Predators,Vegas Golden Knights,Vegas Golden Knights,Colorado Avalanche,Colorado Avalanche,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,,1.0,,,,,,,,,...,,,,,,,,,,
1,,,,1.0,,,,,,,...,,,,,,,,,,
2,,,,,1.0,,,,,,...,,,,,,,,,,
3,,,,,,,,1.0,,,...,,,,,,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,,,,,,,,,4.0,,...,,,,,,,,,,
95,,,,,,,,,5.0,,...,,,,,,,,,,
96,,,,,,,,,,,...,4.0,,,,,,,3.0,,
97,,,,,,,,,,,...,,,,,,,,,,


In [21]:
df_cum = df_cum.fillna(method='ffill').drop(index="time")
df_cum.head()

Unnamed: 0_level_0,Winnipeg Jets,Winnipeg Jets,Pittsburgh Penguins,Pittsburgh Penguins,Edmonton Oilers,Edmonton Oilers,San Jose Sharks,San Jose Sharks,Boston Bruins,Boston Bruins,...,Nashville Predators,Nashville Predators,Vegas Golden Knights,Vegas Golden Knights,Colorado Avalanche,Colorado Avalanche,Philadelphia Flyers,Philadelphia Flyers,Minnesota Wild,Minnesota Wild
Unnamed: 0_level_1,won,lost,won,lost,won,lost,won,lost,won,lost,...,won,lost,won,lost,won,lost,won,lost,won,lost
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [22]:
def create_features(row):
    features={}
    
    features["away_won"]=df_cum.loc[row.name,(row["away_team"],"won")]
    features["away_lost"]=df_cum.loc[row.name,(row["away_team"],"lost")]
    features["home_won"]=df_cum.loc[row.name,(row["home_team"],"won")]
    features["home_lost"]=df_cum.loc[row.name,(row["home_team"],"lost")]
    
    #subtract 1 because the match on that day has not taken place
    
    if row["outcome_categorical"]=="home":
        features["home_won"]=features["home_won"]-1
        features["away_lost"]=features["away_lost"]-1
    else:
        features["home_lost"]=features["home_lost"]-1
        features["away_won"]=features["away_lost"]-1
    
    features["away_cap"]=salary[row["away"]]
    features["home_cap"]=salary[row["home"]]
    
    home_last_season=previous_season_standings.query(f"`team.name`=='{row['home_team']}'").add_prefix("home_last_season_")
    away_last_season=previous_season_standings.query(f"`team.name`=='{row['away_team']}'").add_prefix("away_last_season_")
    
    if len(home_last_season)>0:
        home_last_season=home_last_season.iloc[0].to_dict()
    else:
        home_last_season={}
    if len(away_last_season)>0:
        away_last_season=away_last_season.iloc[0].to_dict()
    else:
        away_last_season={}
        
    return pd.Series({**features,**home_last_season, **away_last_season, **row})

observations=game_results.apply(create_features,axis='columns')
observations.head()

Unnamed: 0.1,Unnamed: 0,away,away_cap,away_last_season_clinchIndicator,away_last_season_conferenceHomeRank,away_last_season_conferenceL10Rank,away_last_season_conferenceRank,away_last_season_conferenceRoadRank,away_last_season_divisionHomeRank,away_last_season_divisionL10Rank,...,home_last_season_streak.streakType,home_last_season_team.id,home_last_season_team.link,home_last_season_team.name,home_last_season_wildCardRank,home_lost,home_team,home_won,outcome_categorical,time
0,20171006_173713,7,72989072.0,x,10.0,7.0,8.0,7.0,4.0,4.0,...,wins,52.0,/api/v1/teams/52,Winnipeg Jets,3.0,0.0,Winnipeg Jets,0.0,away,20171006_173713
1,20171008_080732,5,73546558.0,x,7.0,4.0,5.0,4.0,4.0,2.0,...,losses,5.0,/api/v1/teams/5,Pittsburgh Penguins,0.0,0.0,Pittsburgh Penguins,0.0,away,20171008_080732
2,20171008_080734,0,78458260.0,x,9.0,11.0,7.0,5.0,4.0,5.0,...,wins,22.0,/api/v1/teams/22,Edmonton Oilers,0.0,0.0,Edmonton Oilers,0.0,home,20171008_080734
3,20171008_160734,5,73546558.0,,4.0,4.0,11.0,14.0,4.0,3.0,...,wins,28.0,/api/v1/teams/28,San Jose Sharks,0.0,0.0,San Jose Sharks,0.0,away,20171008_160734
4,20171009_153739,3,75286038.0,x,5.0,6.0,8.0,10.0,3.0,4.0,...,losses,6.0,/api/v1/teams/6,Boston Bruins,0.0,0.0,Boston Bruins,0.0,home,20171009_153739


In [23]:
observations.keys()

Index(['Unnamed: 0', 'away', 'away_cap', 'away_last_season_clinchIndicator',
       'away_last_season_conferenceHomeRank',
       'away_last_season_conferenceL10Rank', 'away_last_season_conferenceRank',
       'away_last_season_conferenceRoadRank',
       'away_last_season_divisionHomeRank', 'away_last_season_divisionL10Rank',
       'away_last_season_divisionRank', 'away_last_season_divisionRoadRank',
       'away_last_season_gamesPlayed', 'away_last_season_goalsAgainst',
       'away_last_season_goalsScored', 'away_last_season_lastUpdated',
       'away_last_season_leagueHomeRank', 'away_last_season_leagueL10Rank',
       'away_last_season_leagueRank', 'away_last_season_leagueRecord.losses',
       'away_last_season_leagueRecord.ot',
       'away_last_season_leagueRecord.type',
       'away_last_season_leagueRecord.wins', 'away_last_season_leagueRoadRank',
       'away_last_season_points', 'away_last_season_pointsPercentage',
       'away_last_season_ppConferenceRank', 'away_last_sea

In [24]:
observations=observations.drop(['away_last_season_clinchIndicator','away_last_season_lastUpdated'],axis='columns')

In [25]:
observations=observations.drop(['away_last_season_leagueRecord.type','away_last_season_streak.streakCode',
                               'away_last_season_streak.streakType','away_last_season_team.link','away_last_season_team.name',
                               'home_last_season_clinchIndicator','home_last_season_lastUpdated',
                               'home_last_season_leagueRecord.type','home_last_season_streak.streakCode',
                               'home_last_season_streak.streakType','home_last_season_team.link','home_last_season_team.name'],axis='columns')

In [26]:
observations=observations.drop(['away','home','away_team','home_team','time'],axis='columns')

In [30]:
observations=observations.drop(observations.keys()[0],axis='columns')

In [31]:
observations.head()

Unnamed: 0,away_cap,away_last_season_conferenceHomeRank,away_last_season_conferenceL10Rank,away_last_season_conferenceRank,away_last_season_conferenceRoadRank,away_last_season_divisionHomeRank,away_last_season_divisionL10Rank,away_last_season_divisionRank,away_last_season_divisionRoadRank,away_last_season_gamesPlayed,...,home_last_season_ppConferenceRank,home_last_season_ppDivisionRank,home_last_season_ppLeagueRank,home_last_season_row,home_last_season_streak.streakNumber,home_last_season_team.id,home_last_season_wildCardRank,home_lost,home_won,outcome_categorical
0,72989072.0,10.0,7.0,8.0,7.0,4.0,4.0,4.0,4.0,82.0,...,9.0,5.0,20.0,37.0,7.0,52.0,3.0,0.0,0.0,away
1,73546558.0,7.0,4.0,5.0,4.0,4.0,2.0,3.0,3.0,82.0,...,2.0,2.0,2.0,46.0,2.0,5.0,0.0,0.0,0.0,away
2,78458260.0,9.0,11.0,7.0,5.0,4.0,5.0,4.0,2.0,82.0,...,4.0,2.0,7.0,43.0,3.0,22.0,0.0,0.0,0.0,home
3,73546558.0,4.0,4.0,11.0,14.0,4.0,3.0,6.0,7.0,82.0,...,5.0,3.0,10.0,44.0,1.0,28.0,0.0,0.0,0.0,away
4,75286038.0,5.0,6.0,8.0,10.0,3.0,4.0,4.0,5.0,82.0,...,7.0,3.0,13.0,42.0,1.0,6.0,0.0,0.0,0.0,home


Machine Learning Model 

In [32]:
#making sure all are numeric
for col in observations.columns:
    if col!='outcome_categorical':
        observations[col] = pd.to_numeric(observations[col])

observations.to_csv("observations.csv")

training_df = observations[0:799]
testing_df = observations[800:]

#fill null values
training_df = training_df.fillna(training_df.mean())
testing_df = testing_df.fillna(testing_df.mean())

  training_df = training_df.fillna(training_df.mean())
  testing_df = testing_df.fillna(testing_df.mean())


In [33]:
testing_df.head(100)

Unnamed: 0,away_cap,away_last_season_conferenceHomeRank,away_last_season_conferenceL10Rank,away_last_season_conferenceRank,away_last_season_conferenceRoadRank,away_last_season_divisionHomeRank,away_last_season_divisionL10Rank,away_last_season_divisionRank,away_last_season_divisionRoadRank,away_last_season_gamesPlayed,...,home_last_season_ppConferenceRank,home_last_season_ppDivisionRank,home_last_season_ppLeagueRank,home_last_season_row,home_last_season_streak.streakNumber,home_last_season_team.id,home_last_season_wildCardRank,home_lost,home_won,outcome_categorical
800,77649912.0,4.000000,5.000000,2.000000,2.000000,2.000000,3.000000,2.000000,2.000000,82.0,...,11.000000,6.000000,24.000000,33.000000,1.000000,25.000000,5.000000,22.0,27.0,home
801,75286038.0,2.000000,9.000000,1.000000,1.000000,1.000000,6.000000,1.000000,1.000000,82.0,...,8.000000,4.000000,17.000000,41.000000,1.000000,20.000000,1.000000,21.0,26.0,home
802,73824956.0,9.000000,1.000000,10.000000,9.000000,3.000000,1.000000,5.000000,5.000000,82.0,...,13.000000,7.000000,29.000000,26.000000,8.000000,23.000000,7.000000,29.0,22.0,away
803,78458260.0,13.000000,10.000000,12.000000,12.000000,7.000000,4.000000,6.000000,6.000000,82.0,...,10.000000,5.000000,22.000000,37.000000,1.000000,26.000000,4.000000,28.0,26.0,home
804,73824956.0,8.052632,8.085526,7.945175,7.890351,4.263158,4.287281,4.212719,4.182018,82.0,...,1.000000,1.000000,1.000000,53.000000,1.000000,15.000000,0.000000,21.0,32.0,away
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,76957176.0,1.000000,1.000000,3.000000,7.000000,1.000000,1.000000,1.000000,4.000000,82.0,...,2.000000,2.000000,5.000000,46.000000,4.000000,30.000000,0.000000,28.0,31.0,away
896,78458260.0,6.000000,2.000000,4.000000,3.000000,3.000000,2.000000,2.000000,1.000000,82.0,...,12.000000,6.000000,27.000000,24.000000,1.000000,53.000000,6.000000,37.0,17.0,home
897,73824956.0,16.000000,16.000000,16.000000,16.000000,8.000000,8.000000,8.000000,8.000000,82.0,...,10.000000,5.000000,18.000000,38.000000,3.000000,14.000000,4.000000,20.0,40.0,away
898,75286038.0,10.000000,7.000000,8.000000,7.000000,4.000000,4.000000,4.000000,4.000000,82.0,...,2.000000,2.000000,2.000000,46.000000,2.000000,5.000000,0.000000,25.0,32.0,home


In [34]:
from sklearn.linear_model import LogisticRegression 

In [35]:
features = training_df.drop('outcome_categorical',axis='columns')
target = training_df['outcome_categorical']

clf=LogisticRegression()
reg=clf.fit(features,target)

reg.score(features,target)

1.0

In [36]:
from sklearn.metrics import accuracy_score

labels=testing_df['outcome_categorical']
predictions=reg.predict(testing_df.drop("outcome_categorical",axis='columns'))

print(f"score {accuracy_score(labels,predictions)}")

score 1.0


In [None]:
predictions