<a href="https://colab.research.google.com/github/11uc/nhl-prediction-notebooks/blob/main/download_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download data
Download data from the NHL API for player performance prediction. Save all the data to disk in CSV format.

In [None]:
!pip install requests_futures

Collecting requests_futures
  Downloading requests_futures-1.0.0-py2.py3-none-any.whl (7.4 kB)
Installing collected packages: requests-futures
Successfully installed requests-futures-1.0.0


In [None]:
import numpy as np
import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
from datetime import datetime

In [None]:
# Base URL of the NHL stats API; endpoint paths such as "teams" or
# "people/<id>" are appended to it by the cells below.
apiurl = 'https://statsapi.web.nhl.com/api/v1/'

## Team roster in seasons
Team roster data: player IDs and names together with team IDs and names for each season.

In [None]:
# Seasons to download, identified by their starting year: 1917 up to
# (but not including) 2021. Passed to range() as (start, stop).
yearRange = (1917, 2021)

# One record per (season, team, player); the DataFrame is built once at the end.
roster_rows = []
# A single Session reuses the HTTP connection across the sequential requests
# and is closed automatically when the block exits.
with requests.Session() as http:
    for year in range(*yearRange):
        # Season ids are the start and end year concatenated, e.g. "19171918".
        season = f'{year}{year+1}'
        resp = http.get(apiurl + "teams",
                        params = {"expand": "team.roster",
                                  "season": season},
                        # Without a timeout a stalled connection hangs the cell forever.
                        timeout = 30)
        # Fail loudly on an HTTP error instead of a confusing KeyError /
        # JSON decode error further down.
        resp.raise_for_status()
        data = resp.json()
        for team in data["teams"]:
            # Teams returned without a "roster" entry are skipped.
            if "roster" not in team:
                continue
            for person in team["roster"]["roster"]:
                roster_rows.append({"season": season,
                                    "year": year,
                                    "teamId": team["id"],
                                    "teamName": team["name"],
                                    "playerId": person["person"]["id"],
                                    "playerName": person["person"]["fullName"]
                                   })
players = pd.DataFrame(roster_rows)
players.to_csv("players.csv", index = False)

In [None]:
# Show the first rows of the roster table as a quick sanity check.
players.head()

Unnamed: 0,season,year,teamId,teamName,playerId,playerName
0,19171918,1917,36,Ottawa Senators (1917),8445135,George Boucher
1,19171918,1917,36,Ottawa Senators (1917),8445191,Morley Bruce
2,19171918,1917,36,Ottawa Senators (1917),8445802,Rusty Crawford
3,19171918,1917,36,Ottawa Senators (1917),8445844,Jack Darragh
4,19171918,1917,36,Ottawa Senators (1917),8445874,Cy Denneny


In [None]:
# Reload the roster table from disk. "season" is forced to str so that ids
# such as "19171918" stay text (as they were written) instead of being
# parsed as integers.
players = pd.read_csv("players.csv", dtype = {"season": str})

## Player basic info with birth date and position.
For all the players, using player ids from the team roster dataframe, download the basic information about that player.
Some format conversion is done here.

In [None]:
def heightToInches(hstr):
    """Convert a feet-and-inches height string into a total number of inches.

    The text is split on a single space into a feet part and an inches part;
    the last character of each part (the unit mark) is dropped before the
    remaining digits are parsed.

    Parameters
    ----------
    hstr : str or None
        Height text, presumably shaped like ``6' 2"`` (verify against the API).

    Returns
    -------
    int or None
        Total height in inches, or None when no height was supplied.
    """
    if hstr is not None:
        feet_part, inch_part = hstr.split(' ')
        return 12 * int(feet_part[:-1]) + int(inch_part[:-1])
    return None

def birthYear(dob):
    """Return the year component of an ISO-format date string.

    Parameters
    ----------
    dob : str or None
        Date of birth in a format accepted by ``datetime.fromisoformat``.

    Returns
    -------
    int or None
        The four-digit year, or None when no date was supplied.
    """
    return None if dob is None else datetime.fromisoformat(dob).year

# Column names of the player-info table, listed in the same order as the
# values produced by get_player_stats.
stats_cols = [
    "id",
    "birthYear",
    "nationality",
    "height",
    "weight",
    "shootsCatches",
    "position",
]

def get_player_stats(resp):
    """Extract one row of basic player attributes from a people-endpoint response.

    Parameters
    ----------
    resp : response object
        Response whose JSON body contains a "people" list; only the first
        entry of that list is read.

    Returns
    -------
    list
        ``[id, birth year, nationality, height in inches, weight,
        shootsCatches, primary position code]``. Any field missing from the
        payload is returned as None.
    """
    person = resp.json()["people"][0]
    # "primaryPosition" is a nested object. Fall back to an empty dict so a
    # player without one yields None for the position, like every other
    # missing field, instead of raising AttributeError on None.
    primary_position = person.get("primaryPosition") or {}
    return [person.get("id"),
            birthYear(person.get("birthDate")),
            person.get("nationality"),
            heightToInches(person.get("height")),
            person.get("weight"),
            person.get("shootsCatches"),
            primary_position.get("code")]

In [None]:
# Every distinct player id that appears on any roster (np.unique also sorts them).
playerIds = np.unique(players["playerId"])
# Issue the requests concurrently. The with-block closes the session once all
# responses have been consumed, so its connections are not left open.
with FuturesSession(max_workers = 10) as session:
    futures = [session.get(url = apiurl + f'people/{pid}')
               for pid in playerIds]
    # result() blocks until that request has completed; iterating the futures
    # in submission order keeps the rows aligned with playerIds.
    player_rows = [get_player_stats(future.result())
                   for future in futures]
player_stats = pd.DataFrame(player_rows,
                            columns = stats_cols)
player_stats.to_csv("player_stats.csv", index = False)

In [None]:
# Reload the player-info table from disk. No dtype override is passed: this
# table has no "season" column (see stats_cols), so the previous
# dtype = {"season": str} argument referred to a column that does not exist.
player_stats = pd.read_csv("player_stats.csv")

## Download team seasonal data
Download every team's seasonal performance stats

In [None]:
def get_team_stats(resp, year):
    """Collect per-team season statistics from a teams-endpoint response.

    Parameters
    ----------
    resp : response object
        Response whose JSON body contains a "teams" list.
    year : int
        Starting year of the season; the season id is built as
        ``f"{year}{year + 1}"``.

    Returns
    -------
    list of dict
        One dict per team that has statistics: the first split's "stat"
        mapping plus "season" and "teamId" keys. Teams with no "teamStats"
        entry, or with an empty "teamStats"/"splits" list, are skipped.
    """
    season = f"{year}{year + 1}"
    rows = []
    for team in resp.json()["teams"]:
        team_stats = team.get("teamStats")
        if not team_stats:
            continue
        splits = team_stats[0].get("splits")
        # Mirror the empty-splits guard used for player stats: an empty list
        # means there is nothing to record, not an error.
        if not splits:
            continue
        # Build a new dict rather than mutating the response payload in place.
        rows.append({**splits[0]["stat"],
                     "season": season,
                     "teamId": team["id"]})
    return rows

In [None]:
# One concurrent request per season. The with-block closes the session once
# every response has been consumed, so its connections are not left open.
with FuturesSession(max_workers = 10) as session:
    futures = [session.get(url = apiurl + "teams",
                           params = {"expand": "team.stats",
                                     "season": f"{year}{year + 1}"})
               for year in range(*yearRange)]
    # Futures were submitted in year order, so zipping them with the same
    # range pairs each response with its season's starting year.
    team_rows = [stat
                 for future, year in zip(futures, range(*yearRange))
                 for stat in get_team_stats(future.result(), year)]
team_seasonal = pd.DataFrame(team_rows)
# Save to drive
team_seasonal.to_csv("team_seasonal.csv", index = False)

In [None]:
# Load from drive
# "season" is read as str so ids such as "19171918" stay text instead of
# being parsed as integers.
team_seasonal = pd.read_csv("team_seasonal.csv",
                            dtype = {"season": str})

## Download player seasonal stats
* Combine data when a player is on more than one team in a season.
* Remove redundant data columns.

### Skaters

In [None]:
# Look-up table indexed by player id with a single "position" column.
position_by_id = player_stats[["id", "position"]].set_index("id")
# Left merge keeps every roster row; a player with no match in player_stats
# ends up with a missing position.
players_p = players.merge(position_by_id,
                          how = "left",
                          left_on = "playerId",
                          right_index = True)

In [None]:
def get_player_season(future, season, pid):
    """Resolve a pending stats request into a single season record for a skater.

    Parameters
    ----------
    future : future-like
        Object whose ``result()`` returns a response with a ``json()`` method.
    season : str
        Season id stored on the record.
    pid : int
        Player id stored on the record.

    Returns
    -------
    dict or None
        The first split's "stat" mapping with "season" and "playerId" added
        and the per-game time-on-ice fields removed, or None when the
        response contains no splits.
    """
    splits = future.result().json()["stats"][0]["splits"]
    if len(splits) == 0:
        return None
    record = splits[0]["stat"]
    record["season"] = season
    record["playerId"] = pid
    # These per-game fields are extra features that are not wanted downstream.
    for per_game_key in ("timeOnIcePerGame",
                         "evenTimeOnIcePerGame",
                         "shortHandedTimeOnIcePerGame",
                         "powerPlayTimeOnIcePerGame"):
        record.pop(per_game_key, 0)
    return record

def get_futures(row, session):
    """Submit the single-season stats request for one roster row.

    Parameters
    ----------
    row : mapping
        Row providing "season" and "playerId".
    session : session object
        Object whose ``get`` submits the request and returns its handle.

    Returns
    -------
    tuple
        ``(future, season, pid)`` so the response can later be matched to
        the player and season it belongs to.
    """
    season, pid = row["season"], row["playerId"]
    query = {"stats": "statsSingleSeason",
             "season": season}
    pending = session.get(url = apiurl + f'people/{pid}/stats',
                          params = query)
    return pending, season, pid

In [None]:
# Skaters
# Keep non-goalies only. "position == position" is False for missing values,
# so players without a known position are dropped as well. A player listed
# on several rosters in one season is requested once.
uni_players = (players_p.query(("position != 'G' "
                                " and position == position"))
                .drop_duplicates(["playerId", "season"]))
# Combine future requests, season and pid in a tuple. The with-block closes
# the session once every response has been consumed.
with FuturesSession(max_workers = 10) as session:
    futures = uni_players.apply(get_futures, axis = 1,
                                args = (session,))
    skater_rows = [get_player_season(*future) for future in futures]
# filter(None, ...) drops the player-seasons that came back with no stats.
skater_uni = pd.DataFrame(filter(None, skater_rows))

In [None]:
# Persist the skater season table; the row index is not written.
skater_uni.to_csv("skater_uni.csv", index = False)

### Goalies

In [None]:
# Same roster/position merge as in the Skaters section: attach each player's
# position to every roster row, leaving it missing when the player has no
# entry in player_stats.
id_to_position = player_stats[["id", "position"]].set_index("id")
players_p = players.merge(
    id_to_position, how = "left", left_on = "playerId", right_index = True)

In [None]:
def get_player_season(future, season, pid):
    """Resolve a pending stats request into a single season record for a goalie.

    This redefines the function of the same name from the Skaters section;
    unlike that version it does not remove any fields from the record.

    Parameters
    ----------
    future : future-like
        Object whose ``result()`` returns a response with a ``json()`` method.
    season : str
        Season id stored on the record.
    pid : int
        Player id stored on the record.

    Returns
    -------
    dict or None
        The first split's "stat" mapping with "season" and "playerId" added,
        or None when the response contains no splits.
    """
    payload = future.result().json()
    season_splits = payload["stats"][0]["splits"]
    if len(season_splits) < 1:
        return None
    record = season_splits[0]["stat"]
    record.update(season = season, playerId = pid)
    return record

def get_futures(row, session):
    """Submit the single-season stats request for one roster row.

    Identical in behaviour to the function of the same name defined in the
    Skaters section.

    Parameters
    ----------
    row : mapping
        Row providing "season" and "playerId".
    session : session object
        Object whose ``get`` submits the request and returns its handle.

    Returns
    -------
    tuple
        ``(future, season, pid)``.
    """
    pid = row["playerId"]
    season = row["season"]
    stats_url = apiurl + f'people/{pid}/stats'
    return (session.get(url = stats_url,
                        params = {"stats": "statsSingleSeason",
                                  "season": season}),
            season,
            pid)

In [None]:
# Goalies
# Keep goalies only; a goalie listed on several rosters in one season is
# requested once.
uni_players = (players_p.query("position == 'G'")
                .drop_duplicates(["playerId", "season"]))
# Combine future requests, season and pid in a tuple. The with-block closes
# the session once every response has been consumed.
with FuturesSession(max_workers = 10) as session:
    futures = uni_players.apply(get_futures, axis = 1,
                                args = (session,))
    goalie_rows = [get_player_season(*future) for future in futures]
# filter(None, ...) drops the player-seasons that came back with no stats.
goalie_uni = pd.DataFrame(filter(None, goalie_rows))

In [None]:
# goalie_uni is already a DataFrame at this point. Re-wrapping it with
# pd.DataFrame(filter(None, goalie_uni)) would iterate over its column labels
# and replace the data with a one-column frame of column names, so the table
# is written out as-is.
goalie_uni.to_csv("goalie_uni.csv", index = False)