# Downloading

In [3]:
import os
import requests
import json

base_url = "https://api-web.nhle.com/v1/gamecenter/"

def download_nhl_data(season, file_path):
    # Vérifier si le fichier de données existe déjà
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = json.load(file)
    else:
        game_type = "02"  # Saison régulière
        data = []  # Stocker les données de tous les matchs

        is_game_number = True
        game = 1
        # Parcourir tous les matchs de la saison demandée
        while is_game_number :
            # Créer le GAME_ID en fonction de l'année cible, du type de jeu et du numéro de jeu
            game_id = f"{season}{game_type}{str(game).zfill(4)}"
            # Composer l'URL complet
            url = f"{base_url}{game_id}/play-by-play/"

            # Effectuer une requête HTTP pour télécharger les données
            response = requests.get(url)

            if response.status_code == 200:
                game_data = response.json()
                data.append(game_data)
                game += 1
            else:
                print(f"Erreur lors du téléchargement des données pour le match {game} de la saison {season}")
                is_game_number = False

        # Sauvegarder toutes les données dans le fichier spécifié
        with open(file_path, 'w') as file:
            json.dump(data, file)

    return data


In [4]:
seasons_data = {}

seasons_data["2016"] = download_nhl_data(2016, "nhl_data_2016.json")
seasons_data["2017"] = download_nhl_data(2017, "nhl_data_2017.json")
seasons_data["2018"] = download_nhl_data(2018, "nhl_data_2018.json")
seasons_data["2019"] = download_nhl_data(2019, "nhl_data_2019.json")
seasons_data["2020"] = download_nhl_data(2020, "nhl_data_2020.json")

Erreur lors du téléchargement des données pour le match 1231 de la saison 2016
Erreur lors du téléchargement des données pour le match 1272 de la saison 2017
Erreur lors du téléchargement des données pour le match 1272 de la saison 2018
Erreur lors du téléchargement des données pour le match 1083 de la saison 2019
Erreur lors du téléchargement des données pour le match 869 de la saison 2020


# Cleaning

In [3]:
import pandas as pd
import numpy as np

def clean_milestone3(json_path):
    """
    json_path: path vers le fichier json

    clean et sauvegarde un df clean
    """
    df = pd.read_json(json_path)

    df['homeTeamId'] = pd.json_normalize(df['homeTeam']).id

    dp = df.iloc[:,[-1, -2]]
    dp = dp.explode('plays')

    d = pd.json_normalize(dp.plays).set_index(dp.index)
    d['homeTeamId'] = dp['homeTeamId']
    d = d[d['details.shotType'].notna()]
    d = d[['typeDescKey', 'details.xCoord', 'details.yCoord', 'situationCode', 'details.eventOwnerTeamId', 'homeTeamId']]

    d['details.eventOwnerTeamId']=d['details.eventOwnerTeamId'].astype(int)

    # Coordonnées des camps gauche et droit
    coord_camp_gauche = (-90, 0)
    coord_camp_droit = (90, 0)

    d['distanceToNet'] = np.sqrt(np.minimum((d['details.xCoord'] - coord_camp_gauche[0])**2 + (d['details.yCoord'] - coord_camp_gauche[1])**2, (d['details.xCoord'] - coord_camp_droit[0])**2 + (d['details.yCoord'] - coord_camp_droit[1])**2))

    # Calculer l'angle relatif du joueur par rapport au filet (filet gauche)
    d['relativeAngleToNet'] = np.degrees(np.arctan2(d['details.yCoord'], d['details.xCoord'] - coord_camp_gauche[0]))

    d['but'] = (d['typeDescKey']=='goal').astype(int)
    d['isHome'] = d['details.eventOwnerTeamId']==d['homeTeamId']
    d['filetVide'] = ((((d['situationCode'].astype(int)*d['isHome'])//1000+(d['situationCode'].astype(int)*(d['isHome']-1)*(-1))%10)-1)*(-1))
    final = d[['distanceToNet', 'relativeAngleToNet', 'but', 'filetVide']]
    final.to_csv(json_path[:-5]+"_mlstn3_clean.csv")
    return final

In [143]:
#iterate over all nhl_data files and make a clean df
p = ["nhl_data_2016.json","nhl_data_2017.json", "nhl_data_2018.json", "nhl_data_2019.json", "nhl_data_2020.json"]
for file_path in p:
    clean_milestone3(file_path)

# Merging multiple years into 1 file

In [56]:
#merge all the clean df into one
p_clean = ["nhl_data_2016_mlstn3_clean.csv","nhl_data_2017_mlstn3_clean.csv", "nhl_data_2018_mlstn3_clean.csv", "nhl_data_2019_mlstn3_clean.csv", "nhl_data_2020_mlstn3_clean.csv"]
df_list = []
count = 0
for path in p_clean:
    df = pd.read_csv(path)
    df['Unnamed: 0'] = df['Unnamed: 0']+count
    count = df['Unnamed: 0'].values[-1]
    df_list.append(df)
df = pd.concat(df_list, axis=0, ignore_index=True)
df.to_csv('nhl_all_year_m3EXTRA_clean.csv')

# Cleaning EXTRA params


In [52]:
import pandas as pd
import numpy as np

def clean_milestone3_EXTRA(json_path):
    """
    json_path: path vers le fichier json

    clean et sauvegarde un df clean
    """
    df = pd.read_json(json_path)

    df['gameId'] = df.id
    df['homeTeamId'] = pd.json_normalize(df['homeTeam']).id
    df['awayTeamId'] = pd.json_normalize(df['awayTeam']).id
    df['homeTeamName'] = pd.json_normalize(df['homeTeam']).abbrev
    df['awayTeamName'] = pd.json_normalize(df['awayTeam']).abbrev

    dp = df.iloc[:,[-1, -2, -3, -4, -5, -6]]
    dp = dp.explode('plays')

    d = pd.json_normalize(dp.plays).set_index(dp.index)
    d['homeTeamId'] = dp['homeTeamId']
    d['awayTeamId'] = dp['awayTeamId']
    d['gameId'] = dp['gameId']
    d['homeTeamName'] = dp['homeTeamName']
    d['awayTeamName'] = dp['awayTeamName']

    d = d[d['details.shotType'].notna()]
    d = d[['gameId', 'period', 'timeInPeriod', 'typeDescKey', 'details.xCoord', 'details.yCoord', 'situationCode', 'details.eventOwnerTeamId', 'homeTeamId', 'awayTeamId', 'homeTeamName', 'awayTeamName']]

    d['details.eventOwnerTeamId']=d['details.eventOwnerTeamId'].astype(int)

    # Coordonnées des camps gauche et droit
    coord_camp_gauche = (-90, 0)
    coord_camp_droit = (90, 0)

    d['distanceToNet'] = np.sqrt(np.minimum((d['details.xCoord'] - coord_camp_gauche[0])**2 + (d['details.yCoord'] - coord_camp_gauche[1])**2, (d['details.xCoord'] - coord_camp_droit[0])**2 + (d['details.yCoord'] - coord_camp_droit[1])**2))

    # Calculer l'angle relatif du joueur par rapport au filet (filet gauche)
    d['relativeAngleToNet'] = np.degrees(np.arctan2(d['details.yCoord'], d['details.xCoord'] - coord_camp_gauche[0]))

    d['isGoal'] = (d['typeDescKey']=='goal').astype(int)
    d['isHome'] = d['details.eventOwnerTeamId']==d['homeTeamId']
    d['filetVide'] = ((((d['situationCode'].astype(int)*d['isHome'])//1000+(d['situationCode'].astype(int)*(d['isHome']-1)*(-1))%10)-1)*(-1))
    final = d[['gameId', 'period', 'timeInPeriod', 'typeDescKey', 'homeTeamId', 'awayTeamId', 'details.eventOwnerTeamId', 'homeTeamName', 'awayTeamName', 'distanceToNet', 'relativeAngleToNet', 'filetVide', 'isGoal']]
    final.to_csv(json_path[:-5]+"_mlstn3_clean.csv")
    return final

In [53]:
#iterate over all nhl_data files and make a clean df
p = ["/content/drive/MyDrive/NHL3/nhl_data_2016.json","/content/drive/MyDrive/NHL3/nhl_data_2017.json", "/content/drive/MyDrive/NHL3/nhl_data_2018.json", "/content/drive/MyDrive/NHL3/nhl_data_2019.json", "/content/drive/MyDrive/NHL3/nhl_data_2020.json"]
for file_path in p:
    clean_milestone3(file_path)

In [54]:
z ='/content/nhl_data_2016_mlstn3_clean.csv'

In [55]:
    df = pd.read_csv(z)
    df

Unnamed: 0.1,Unnamed: 0,gameId,period,timeInPeriod,typeDescKey,homeTeamId,awayTeamId,details.eventOwnerTeamId,homeTeamName,awayTeamName,distanceToNet,relativeAngleToNet,filetVide,isGoal
0,0,2016020001,1,01:11,shot-on-goal,9,10,10,OTT,TOR,13.928388,21.037511,0,0
1,0,2016020001,1,02:14,missed-shot,9,10,9,OTT,TOR,39.051248,3.626572,0,0
2,0,2016020001,1,02:53,shot-on-goal,9,10,9,OTT,TOR,13.601471,4.224403,0,0
3,0,2016020001,1,03:43,missed-shot,9,10,10,OTT,TOR,18.000000,0.000000,0,0
4,0,2016020001,1,04:01,shot-on-goal,9,10,9,OTT,TOR,77.025970,-18.586927,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104049,1229,2016021230,3,15:53,shot-on-goal,22,23,23,EDM,VAN,39.115214,-32.471192,0,0
104050,1229,2016021230,3,15:59,shot-on-goal,22,23,23,EDM,VAN,38.832976,-55.491477,0,0
104051,1229,2016021230,3,18:07,shot-on-goal,22,23,23,EDM,VAN,24.413111,-55.007980,0,0
104052,1229,2016021230,3,19:21,shot-on-goal,22,23,22,EDM,VAN,34.205263,-9.637538,0,0


# Merging multiple years into 1 file

In [None]:
#merge all the clean df into one
p_clean = ["nhl_data_2016_mlstn3_clean.csv","nhl_data_2017_mlstn3_clean.csv", "nhl_data_2018_mlstn3_clean.csv", "nhl_data_2019_mlstn3_clean.csv", "nhl_data_2020_mlstn3_clean.csv"]
df_list = []
count = 0
for path in p_clean:
    df = pd.read_csv(path)
    df['Unnamed: 0'] = df['Unnamed: 0']+count
    count = df['Unnamed: 0'].values[-1]
    df_list.append(df)
df = pd.concat(df_list, axis=0, ignore_index=True)
df.to_csv('nhl_all_year_m3EXTRA_clean.csv')