In [1]:
import pandas as pd
import requests
import json

import time

In [2]:
# Notebook-wide settings:
#   loc    - folder prefix that the CSV output paths are built from
#   season - season string spliced into every request URL and output file name
loc, season = r'../stats/', '2020-21'

# Request offensive data from NBA API

Requests data specifically for the season defined above.

In [3]:
# HTTP headers sent with every stats.nba.com request in this notebook.
# NOTE(review): presumably these imitate a browser request originating from
# nba.com so that the API answers at all - confirm before trimming any of them.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.nba.com/'
}

# League-wide player stats for `season` (MeasureType=Base, PerMode=PerGame,
# SeasonType=Regular+Season). The 10 second timeout keeps the cell from hanging
# forever when the server does not answer.
r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail right here on an HTTP error status instead of letting the parsing cell
# trip over a body that is not the expected JSON.
r.raise_for_status()

In [4]:
# Decode the response; the first entry of 'resultSets' carries the table:
# 'rowSet' holds the rows and 'headers' the matching column names.
r_dict = r.json()
result_set = r_dict['resultSets'][0]

df = (
    pd.DataFrame(data=result_set['rowSet'], columns=result_set['headers'])
    # Keep only the first 31 columns; everything after them (the rank columns
    # among them) is discarded.
    .iloc[:, :31]
    # Columns not needed for the offensive data set.
    .drop(columns=['NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'STL', 'BLK', 'PFD', 'W', 'L', 'W_PCT'])
)

# SEASON_ID is '2' followed by the four-digit starting year of the season
# ('2020-21' -> '22020', the value the matchup endpoint reports for this
# season). Deriving it from the full year keeps it correct for seasons that do
# not start in 20xx; the previous hard-coded '220' prefix did not.
df['SEASON_ID'] = '2' + season[:4]

print(df.shape)
df.head()

(540, 23)


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
0,203932,Aaron Gordon,25.0,50,27.7,4.6,10.0,0.463,1.2,3.5,...,0.651,1.5,4.1,5.7,3.2,1.9,0.6,1.8,12.4,22020
1,1628988,Aaron Holiday,24.0,66,17.8,2.6,6.6,0.39,1.0,2.8,...,0.819,0.2,1.1,1.3,1.9,1.0,0.5,1.4,7.2,22020
2,1630174,Aaron Nesmith,21.0,46,14.5,1.7,3.9,0.438,0.9,2.3,...,0.786,0.6,2.2,2.8,0.5,0.5,0.3,1.9,4.7,22020
3,1627846,Abdel Nader,27.0,24,14.8,2.4,4.8,0.491,0.8,1.8,...,0.757,0.3,2.3,2.6,0.8,0.8,0.3,1.4,6.7,22020
4,1629690,Adam Mokoka,22.0,14,4.0,0.5,1.4,0.368,0.1,0.7,...,0.0,0.1,0.3,0.4,0.4,0.4,0.1,0.4,1.1,22020


# Get top percentile of players

Only players whose points per game are above the chosen percentile cutoff are kept; H2H data is requested only for these players.

In [20]:
# Quantile (0-1) of the PTS column used as the cutoff: players at or below it
# are excluded from the H2H requests.
percentile = 0.2

# Compute the cutoff on the PTS column alone. Calling quantile() on the whole
# frame also touches the text columns (PLAYER_NAME, SEASON_ID), which raises a
# TypeError on pandas versions where numeric_only defaults to False.
pts_cutoff = df['PTS'].quantile(q=percentile)

# .copy() keeps off_df independent of df for the later merge/drop steps.
off_df = df[df['PTS'] > pts_cutoff].copy()

n_players = off_df.shape[0]
print('Number of over ' + str(percentile * 100) + 'th percentile players: ' + str(n_players))
# One request per player: 1 s per request is the fixed sleep in the H2H loop.
print('Minimum time to complete: ' + format(n_players * 1 / 60, '.1f') + ' minutes')
# NOTE(review): 1.71 s per request looks like an empirically measured average
# (sleep + request latency) - confirm.
print('Estimated time to complete: ' + format(n_players * 1.71 / 60, '.1f') + ' minutes')

off_df.head()

Number of over 20.0th percentile players: 430
Minimum time to complete: 7.2 minutes
Estimated time to complete: 12.3 minutes


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
0,203932,Aaron Gordon,25.0,50,27.7,4.6,10.0,0.463,1.2,3.5,...,0.651,1.5,4.1,5.7,3.2,1.9,0.6,1.8,12.4,22020
1,1628988,Aaron Holiday,24.0,66,17.8,2.6,6.6,0.39,1.0,2.8,...,0.819,0.2,1.1,1.3,1.9,1.0,0.5,1.4,7.2,22020
2,1630174,Aaron Nesmith,21.0,46,14.5,1.7,3.9,0.438,0.9,2.3,...,0.786,0.6,2.2,2.8,0.5,0.5,0.3,1.9,4.7,22020
3,1627846,Abdel Nader,27.0,24,14.8,2.4,4.8,0.491,0.8,1.8,...,0.757,0.3,2.3,2.6,0.8,0.8,0.3,1.4,6.7,22020
5,201143,Al Horford,35.0,28,27.9,5.8,12.9,0.45,2.0,5.4,...,0.818,1.0,5.7,6.7,3.4,1.0,0.4,1.7,14.2,22020


# Get H2H data for each player

Retrieves data from NBA.com based on each player ID

Fetching all of the data takes a long time because there is a 1-second delay between requests to avoid being IP-banned by NBA.com.

In [6]:
# One DataFrame per offensive player. They are concatenated once after the
# loop: DataFrame.append is gone in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
frames = []
total = off_df.shape[0]

for i, ID in enumerate(off_df['PLAYER_ID'], start=1):
    print(ID, end = '')
    r = requests.get('https://stats.nba.com/stats/leagueseasonmatchups?DateFrom=&DateTo=&LeagueID=00&OffPlayerID=' + str(ID) + '&Outcome=&PORound=0&PerMode=Totals&Season=' + season + '&SeasonType=Regular+Season', headers=headers, timeout=10)
    # Stop on an HTTP error status rather than trying to parse an error body.
    r.raise_for_status()

    # First result set: 'rowSet' holds the rows, 'headers' the column names.
    result_set = r.json()['resultSets'][0]
    frames.append(pd.DataFrame(data=result_set['rowSet'], columns=result_set['headers']))

    # Verbose: report success only after the response was actually parsed.
    print('...done! [' + str(i) + '/' + str(total) + ']')

    # 1 second pause between requests to avoid being IP-banned by NBA.com.
    time.sleep(1)

# With no players to query there is nothing to concatenate; keep the original
# None placeholder in that case.
h2h_df = pd.concat(frames, ignore_index=True) if frames else None
h2h_df

203932...done! [1/430]
1628988...done! [2/430]
1630174...done! [3/430]
1627846...done! [4/430]
201143...done! [5/430]
202329...done! [6/430]
202692...done! [7/430]
1630197...done! [8/430]
1627936...done! [9/430]
203458...done! [10/430]
1628993...done! [11/430]
203083...done! [12/430]
2738...done! [13/430]
203952...done! [14/430]
1629014...done! [15/430]
203076...done! [16/430]
1630162...done! [17/430]
1630237...done! [18/430]
1629717...done! [19/430]
203382...done! [20/430]
203085...done! [21/430]
202340...done! [22/430]
1628389...done! [23/430]
203463...done! [24/430]
1627732...done! [25/430]
202687...done! [26/430]
201933...done! [27/430]
1626246...done! [28/430]
1626171...done! [29/430]
203992...done! [30/430]
202711...done! [31/430]
202954...done! [32/430]
203078...done! [33/430]
1629634...done! [34/430]
1629164...done! [35/430]
1627742...done! [36/430]
1630271...done! [37/430]
201572...done! [38/430]
1628971...done! [39/430]
1627854...done! [40/430]
1627741...done! [41/430]
203468

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22020,203932,Aaron Gordon,202710,Jimmy Butler,3,12:10,61.1,15,69,...,2,0.0,0,0,0,0,1,2,1,729.9
1,22020,203932,Aaron Gordon,1629647,Darius Bazley,2,10:08,56.1,9,58,...,5,0.2,0,0,0,0,0,0,0,608.1
2,22020,203932,Aaron Gordon,202699,Tobias Harris,2,10:45,53.8,4,58,...,2,0.0,0,0,0,0,0,0,0,645.0
3,22020,203932,Aaron Gordon,203937,Kyle Anderson,2,10:41,52.5,5,48,...,5,0.2,0,0,0,0,0,0,0,640.5
4,22020,203932,Aaron Gordon,202695,Kawhi Leonard,3,10:12,51.8,11,44,...,2,0.5,0,0,0,0,0,0,0,612.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119180,22020,1629627,Zion Williamson,1627812,Yogi Ferrell,1,0:04,0.4,0,0,...,0,0.0,0,0,0,0,0,0,0,3.8
119181,22020,1629627,Zion Williamson,1628422,Damyean Dotson,1,0:05,0.4,0,0,...,0,0.0,0,0,0,0,0,0,0,4.8
119182,22020,1629627,Zion Williamson,1629021,Moritz Wagner,1,0:02,0.2,0,0,...,0,0.0,0,0,0,0,0,0,0,2.4
119183,22020,1629627,Zion Williamson,1630222,Mason Jones,1,0:03,0.2,0,0,...,0,0.0,0,0,0,0,0,0,0,3.2


In [7]:
# Write the H2H rows to <loc><season>_<percentile>_h2h_stats.csv without the
# DataFrame index.
h2h_df.to_csv(
    ''.join((loc, season, '_', str(percentile), '_h2h_stats.csv')),
    index=False,
)

# Extract more offensive data

Get additional offensive statistics

In [8]:
# Player tracking stats for `season` with PtMeasureType=PaintTouch
# (PerMode=PerGame, SeasonType=Regular+Season).
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=PaintTouch&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail right here on an HTTP error status instead of letting the parsing cell
# trip over a body that is not the expected JSON.
r.raise_for_status()

In [9]:
# Decode the response; the first entry of 'resultSets' carries the table:
# 'rowSet' holds the rows and 'headers' the matching column names.
r_dict = r.json()
result_set = r_dict['resultSets'][0]
stats = pd.DataFrame(data=result_set['rowSet'], columns=result_set['headers'])

# Columns taken over from this response; PLAYER_ID is only the join key.
paint_cols = ['TOUCHES', 'PAINT_TOUCHES', 'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA',
              'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV']
stats = stats[['PLAYER_ID'] + paint_cols]

# Make the cell safe to re-run: remove copies of these columns left behind by
# an earlier execution, otherwise the merge would emit _x/_y duplicates.
# On a first run nothing matches and this is a no-op.
off_df = off_df.drop(columns=paint_cols, errors='ignore')

# Inner join (the pd.merge default): players without a row in `stats` are
# dropped from off_df.
off_df = pd.merge(off_df, stats, on=['PLAYER_ID'])
off_df.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'AGE', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'BLKA', 'PF', 'PTS', 'SEASON_ID', 'TOUCHES',
       'PAINT_TOUCHES', 'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA',
       'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV'],
      dtype='object')

In [10]:
# Player tracking stats for `season` with PtMeasureType=Efficiency
# (PerMode=PerGame, SeasonType=Regular+Season).
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Efficiency&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail right here on an HTTP error status instead of letting the parsing cell
# trip over a body that is not the expected JSON.
r.raise_for_status()

In [11]:
# Decode the response; the first entry of 'resultSets' carries the table:
# 'rowSet' holds the rows and 'headers' the matching column names.
r_dict = r.json()
result_set = r_dict['resultSets'][0]
stats = pd.DataFrame(data=result_set['rowSet'], columns=result_set['headers'])

# Columns taken over from this response; PLAYER_ID is only the join key.
eff_cols = ['DRIVE_PTS', 'DRIVE_FG_PCT', 'CATCH_SHOOT_PTS',
            'CATCH_SHOOT_FG_PCT', 'PULL_UP_PTS', 'PULL_UP_FG_PCT',
            'PAINT_TOUCH_PTS', 'PAINT_TOUCH_FG_PCT', 'POST_TOUCH_PTS',
            'POST_TOUCH_FG_PCT', 'ELBOW_TOUCH_PTS', 'ELBOW_TOUCH_FG_PCT',
            'EFF_FG_PCT']
stats = stats[['PLAYER_ID'] + eff_cols]

# Make the cell safe to re-run: remove copies of these columns left behind by
# an earlier execution, otherwise the merge would emit _x/_y duplicates.
# On a first run nothing matches and this is a no-op.
off_df = off_df.drop(columns=eff_cols, errors='ignore')

# Inner join (the pd.merge default): players without a row in `stats` are
# dropped from off_df.
off_df = pd.merge(off_df, stats, on=['PLAYER_ID'])

# PLAYER_NAME is not written to the output file; errors='ignore' because the
# column is already gone when this cell is executed a second time.
off_df = off_df.drop(columns=['PLAYER_NAME'], errors='ignore')
off_df

Unnamed: 0,PLAYER_ID,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,CATCH_SHOOT_FG_PCT,PULL_UP_PTS,PULL_UP_FG_PCT,PAINT_TOUCH_PTS,PAINT_TOUCH_FG_PCT,POST_TOUCH_PTS,POST_TOUCH_FG_PCT,ELBOW_TOUCH_PTS,ELBOW_TOUCH_FG_PCT,EFF_FG_PCT
0,203932,25.0,50,27.7,4.6,10.0,0.463,1.2,3.5,0.335,...,0.386,2.4,0.323,3.6,0.686,1.0,0.381,0.8,0.696,0.522
1,1628988,24.0,66,17.8,2.6,6.6,0.390,1.0,2.8,0.368,...,0.384,1.4,0.374,0.2,0.500,0.0,0.000,0.1,0.600,0.467
2,1630174,21.0,46,14.5,1.7,3.9,0.438,0.9,2.3,0.370,...,0.398,0.5,0.355,0.7,0.842,0.0,0.000,0.1,0.500,0.551
3,1627846,27.0,24,14.8,2.4,4.8,0.491,0.8,1.8,0.419,...,0.429,0.2,0.333,0.8,0.636,0.0,0.000,0.0,0.000,0.569
4,201143,35.0,28,27.9,5.8,12.9,0.450,2.0,5.4,0.368,...,0.381,1.0,0.412,2.6,0.596,2.1,0.453,1.1,0.520,0.528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,1630214,22.0,59,18.4,2.8,5.1,0.559,0.4,1.1,0.338,...,0.417,0.4,0.571,3.0,0.625,0.3,0.643,1.5,0.642,0.595
426,1627812,28.0,10,13.7,2.0,5.7,0.351,0.9,2.8,0.321,...,0.500,2.1,0.290,0.0,0.000,0.0,0.000,0.2,0.500,0.430
427,1629139,26.0,50,14.5,1.6,3.6,0.439,0.7,1.8,0.400,...,0.414,0.3,0.250,0.5,0.478,0.0,0.000,0.3,0.800,0.539
428,203897,26.0,58,35.1,9.8,19.4,0.507,3.4,8.2,0.419,...,0.458,9.3,0.421,1.5,0.735,0.1,0.167,0.8,0.760,0.596


In [12]:
# Write the merged offensive stats to <loc><season>_<percentile>_off_stats.csv
# without the DataFrame index.
off_df.to_csv(
    ''.join((loc, season, '_', str(percentile), '_off_stats.csv')),
    index=False,
)