# Récupération des matchs manquants

Suite au merge des datasets, nous avons remarqué qu'il manquait un grand nombre d'observations.

Certains matchs n'étaient pas dans les datasets play by play. Il est possible de les récupérer via l'API nba_api afin de les réinjecter dans notre pre-processing.

In [None]:
# Games that only appear in Shot Locations: after the merge their
# play-by-play key (GAME_ID) is missing.
only_in_shot_locations = df_merged['GAME_ID'].isna()
game_ids = df_merged.loc[only_in_shot_locations, 'Game ID'].unique()
len(game_ids)

1714

In [None]:
from nba_api.stats.endpoints import playbyplayv2
from tqdm import tqdm

# For every Shot Locations game id that has no play-by-play rows,
# download the play-by-play from the NBA stats site.
from json import JSONDecodeError

liste_df = []
for game_id in tqdm(game_ids):  # `game_id`, not `id`: avoids shadowing the builtin
    # Prefix the integer id with "00" to build the string id passed to the endpoint.
    string_id = "00" + str(int(game_id))
    try:
        liste_df.append(playbyplayv2.PlayByPlayV2(string_id).get_data_frames()[0])
    except JSONDecodeError:
        # An undecodable response for one game must not abort the whole
        # download; report the id so it can be retried, then keep going.
        print(game_id)


100%|██████████| 1714/1714 [31:03<00:00,  1.09s/it]


In [None]:
# Stack the per-game play-by-play frames into a single DataFrame
missing_play_by_play = pd.concat(objs=liste_df)

In [None]:
# Persist the recovered play-by-play rows next to the other raw datasets
missing_play_by_play.to_csv(path_or_buf="../data/raw/missing_pbp.csv")

# Récupération des stats manquantes

In [17]:
import pandas as pd
# The 20 players used to filter the season statistics.
BEST_PLAYERS = [
    "Kobe Bryant",
    "LeBron James",
    "Stephen Curry",
    "Kevin Durant",
    "Dwyane Wade",
    "Dirk Nowitzki",
    "Tim Duncan",
    "Shaquille O'Neal",
    "Steve Nash",
    "Kawhi Leonard",
    "James Harden",
    "Jason Kidd",
    "Allen Iverson",
    "Chris Webber",
    "Kevin Garnett",
    "Paul Pierce",
    "Giannis Antetokounmpo",
    "Jimmy Butler",
    "Russell Westbrook",
    "Dwight Howard",
]


In [20]:
# Columns shared by the basic and advanced season files, used as join keys
season_merge_keys = ['Player', 'Tm', 'Pos', 'Age', 'G', 'MP']

# For each season 2018..2020: read the basic and advanced stats files,
# tag the basic rows with the year, and join the two on the shared keys.
dfs = []
for year in range(2018, 2021):
    season_basic = pd.read_csv(f'../data/raw/season_stats_{year}.csv')
    season_advanced = pd.read_csv(f'../data/raw/season_stats_advanced_{year}.csv')
    season_basic['Year'] = year
    dfs.append(season_basic.merge(season_advanced, on=season_merge_keys))

# One frame holding all three seasons
stats = pd.concat(dfs)

In [22]:
# Keep only the rows of the players listed in BEST_PLAYERS
is_best_player = stats['Player'].isin(BEST_PLAYERS)
stats = stats.loc[is_best_player]

In [23]:
# Inspect the merged column names; in the output below, non-key columns
# present in both source files carry _x / _y suffixes (e.g. Rk_x, Rk_y).
stats.columns

Index(['Rk_x', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Player-additional_x', 'Year', 'Rk_y', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
       'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 24', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'Player-additional_y'],
      dtype='object')

In [24]:
# Reorder the columns to match the layout of the existing season-stats data.
# Columns not listed here are dropped by the reindex.
ordered_columns = [
    # identification / playing time
    'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
    # advanced metrics
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
    'BLK%', 'TOV%', 'USG%', 'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48',
    'Unnamed: 24', 'OBPM', 'DBPM', 'BPM', 'VORP',
    # box-score totals
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
    'PF', 'PTS',
]
stats = stats.reindex(columns=ordered_columns)
    

In [25]:
# Write the filtered 2018-2020 season stats to the raw data folder
stats.to_csv(path_or_buf='../data/raw/seasons_stats_2018-2020.csv')

## Récupérer les matchs de 2019 - 2020 dans l'ancien format

In [5]:
from nba_api.stats.endpoints import playbyplayv2
from tqdm import tqdm
import pandas as pd
# Show up to 100 columns when displaying wide frames
pd.options.display.max_columns = 100

# Shot locations file for the top 20 players, 2000 to 2020
shot_locations = pd.read_csv(
    "../data/processed/Shot_Locations_top_20_players_2000to2020.csv"
)

In [10]:
# Distinct game ids of the shots recorded with Year > 2018
after_2018 = shot_locations["Year"] > 2018
game_ids = shot_locations.loc[after_2018, "Game ID"].drop_duplicates()

In [13]:
# For every Shot Locations game id selected above (Year > 2018),
# download the play-by-play from the NBA stats site.
# JSONDecodeError was referenced without ever being imported, so the first
# undecodable response raised NameError instead of being handled.
from json import JSONDecodeError

liste_df = []
for game_id in tqdm(game_ids):  # `game_id`, not `id`: avoids shadowing the builtin
    # Prefix the integer id with "00" to build the string id passed to the endpoint.
    string_id = "00" + str(int(game_id))
    try:
        liste_df.append(playbyplayv2.PlayByPlayV2(string_id).get_data_frames()[0])
    except JSONDecodeError:
        # Report the game whose response could not be decoded and keep going
        print(game_id)

100%|██████████| 639/639 [08:02<00:00,  1.33it/s]


In [14]:
# Stack the per-game frames into one DataFrame...
missing_play_by_play = pd.concat(objs=liste_df)

# ...and store it with the other raw datasets
missing_play_by_play.to_csv(path_or_buf="../data/raw/missing_pbp_2019-2020.csv")

## Récupérer les métriques par équipe et par saison

In [9]:
from nba_api.stats.endpoints import teamestimatedmetrics
import pandas as pd

In [19]:
# Season labels "2000-01", "2001-02", ..., "2020-21"
seasons = [f"{start}-{(start + 1) % 100:02d}" for start in range(2000, 2021)]

# One estimated-metrics frame per season, tagged with the season's starting year
liste_df = [
    teamestimatedmetrics.TeamEstimatedMetrics(season=season)
    .get_data_frames()[0]
    .assign(Year=int(season[:4]))
    for season in seasons
]

# Stack all seasons, then attach each team's abbreviation via TEAM_ID
teams = pd.read_csv("../data/raw/teams.csv")
team_metrics = pd.concat(liste_df).merge(
    teams[['TEAM_ID', 'ABBREVIATION']], on='TEAM_ID'
)

In [21]:
# Preview the first three rows of the merged team metrics
team_metrics.head(3)

Unnamed: 0,TEAM_NAME,TEAM_ID,GP,W,L,W_PCT,MIN,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,E_PACE,E_AST_RATIO,E_OREB_PCT,E_DREB_PCT,E_REB_PCT,E_TM_TOV_PCT,GP_RANK,W_RANK,L_RANK,W_PCT_RANK,MIN_RANK,E_OFF_RATING_RANK,E_DEF_RATING_RANK,E_NET_RATING_RANK,E_AST_RATIO_RANK,E_OREB_PCT_RANK,E_DREB_PCT_RANK,E_REB_PCT_RANK,E_TM_TOV_PCT_RANK,E_PACE_RANK,Year,ABBREVIATION
0,Detroit Pistons,1610612765,82,32,50,0.39,3971.0,97.3,98.9,-1.6,97.5,14.9,0.293,0.722,0.503,0.162,1,21,21,21,10,24,8,19,28,10,12,9,17,1,2000,DET
1,Sacramento Kings,1610612758,82,55,27,0.671,4016.0,102.8,97.0,5.8,97.0,16.8,0.272,0.707,0.495,0.15,1,4,4,4,1,8,7,2,15,20,22,22,6,2,2000,SAC
2,Golden State Warriors,1610612744,82,17,65,0.207,3956.0,94.5,104.9,-10.4,96.8,15.9,0.333,0.697,0.5,0.162,1,28,28,28,21,28,29,28,22,1,29,16,18,3,2000,GSW


In [22]:
# Save the per-season team metrics with the other raw datasets
team_metrics.to_csv(path_or_buf="../data/raw/team_metrics.csv")