In [16]:
import json
import os
import time

import pandas as pd
import requests

In [28]:
# Output directory for the generated CSV files. Kept as a string with a
# trailing slash because later cells build file names by concatenation.
loc = r'./stats/'
# Season to pull, in the 'YYYY-YY' form passed as the API's `Season` parameter.
season = '2019-20'

# Create the output directory up front so the CSV exports at the end of the
# notebook cannot fail after the slow download loop has already run.
os.makedirs(loc, exist_ok=True)

# Request offensive data from NBA API

Requests data specifically for the season defined above.

In [29]:
# Browser-like headers reused for every stats.nba.com request in this notebook.
# NOTE(review): presumably the API does not answer without them - confirm.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.nba.com/'
}

# League-wide player stats (MeasureType=Base, PerMode=PerGame) for the regular
# season selected in `season`.
r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here with a clear HTTPError on a 4xx/5xx reply instead of a confusing
# JSON/KeyError when the body is parsed in the next cell.
r.raise_for_status()

In [30]:
# Decode the league-wide player stats response held in `r`.
r_dict = json.loads(r.content)

# The first result set holds the table: column names plus one row per player.
data = r_dict['resultSets'][0]['rowSet']
columns = r_dict['resultSets'][0]['headers']

# Columns inside the kept range that are not needed for the offensive data set.
unused_columns = ['NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'STL', 'BLK', 'PFD', 'W', 'L', 'W_PCT']

df = (
    pd.DataFrame(data=data, columns=columns)
    # Keep the first 31 columns only; the intent is to get rid of the trailing
    # rank columns. NOTE(review): relies on the API's column order (PTS being
    # the 31st column, as the output below suggests) - confirm if the API changes.
    .iloc[:, :31]
    .drop(columns=unused_columns)
    # Regular-season ID: '2' followed by the four-digit starting year, e.g.
    # '22019' for '2019-20'. Taking the full year (rather than '220' + the last
    # two digits) also yields the right ID for seasons that start before 2000.
    .assign(SEASON_ID='2' + season[:4])
)

print(df.shape)
df.head()

(529, 23)


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
0,203932,Aaron Gordon,24.0,62,32.5,5.4,12.4,0.437,1.2,3.8,...,0.674,1.7,5.9,7.7,3.7,1.6,0.7,2.0,14.4,22019
1,1628988,Aaron Holiday,23.0,66,24.5,3.5,8.5,0.414,1.3,3.3,...,0.851,0.3,2.0,2.4,3.4,1.3,0.4,1.8,9.5,22019
2,1627846,Abdel Nader,26.0,55,15.8,2.2,4.8,0.468,0.9,2.3,...,0.773,0.3,1.6,1.8,0.7,0.8,0.2,1.4,6.3,22019
3,1629690,Adam Mokoka,21.0,11,10.2,1.1,2.5,0.429,0.5,1.4,...,0.5,0.6,0.3,0.9,0.4,0.2,0.4,1.5,2.9,22019
4,1629678,Admiral Schofield,23.0,33,11.2,1.1,2.8,0.38,0.6,1.8,...,0.667,0.2,1.2,1.4,0.5,0.2,0.1,1.5,3.0,22019


# Get top percentile of players

Only players above the chosen scoring percentile (`percentile`, measured on points per game) are used when fetching H2H data.

In [31]:
# Quantile of points per game used as the cutoff: only players scoring
# strictly more than this quantile are kept.
percentile = 0.9

# Take the quantile of the PTS column alone. DataFrame.quantile over the whole
# frame would also cover the text columns (PLAYER_NAME, SEASON_ID), which
# raises on pandas >= 2.0 where numeric_only defaults to False.
pts_cutoff = df['PTS'].quantile(q=percentile)
off_df = df[df['PTS'] > pts_cutoff].copy()

n_players = off_df.shape[0]
print('Number of over ' + str(percentile * 100) + 'th percentile players: ' + str(n_players))
# One request per player: 1 s is the fixed pause between requests in the
# download loop; 1.5 s presumably allows for the request itself.
print('Minimum time to complete: ' + str(format(n_players * 1 / 60, '.1f')) + ' minutes')
print('Estimated time to complete: ' + str(format(n_players * 1.5 / 60, '.1f')) + ' minutes')

off_df.head()

Number of over 90.0th percentile players: 51
Minimum time to complete: 0.8 minutes
Estimated time to complete: 1.3 minutes


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
20,203952,Andrew Wiggins,25.0,54,34.4,8.1,18.1,0.447,2.1,6.2,...,0.709,1.2,3.9,5.1,3.7,2.4,1.0,2.4,21.8,22019
23,203076,Anthony Davis,27.0,62,34.4,8.9,17.7,0.503,1.2,3.5,...,0.846,2.3,7.0,9.3,3.2,2.5,0.7,2.5,26.1,22019
39,202711,Bojan Bogdanovic,31.0,63,33.1,6.6,14.8,0.447,3.0,7.3,...,0.903,0.6,3.5,4.1,2.1,2.5,1.1,1.7,20.2,22019
42,203078,Bradley Beal,27.0,57,36.0,10.4,22.9,0.455,3.0,8.4,...,0.842,0.9,3.3,4.2,6.1,3.4,0.9,2.2,30.5,22019
45,1627742,Brandon Ingram,22.0,62,33.9,8.2,17.7,0.463,2.4,6.2,...,0.851,0.8,5.3,6.1,4.2,3.0,0.8,2.9,23.8,22019


# Get H2H data for each player

Retrieves data from NBA.com based on each player ID

Grabbing all the data takes a long time because there is a 1-second delay between requests to avoid being IP-banned by NBA.com.

In [32]:
# One matchup table per offensive player; they are combined once after the loop.
frames = []
n_players = off_df.shape[0]

for i, ID in enumerate(off_df['PLAYER_ID'], start=1):
    print(ID, end = '')
    r = requests.get('https://stats.nba.com/stats/leagueseasonmatchups?DateFrom=&DateTo=&LeagueID=00&OffPlayerID=' + str(ID) + '&Outcome=&PORound=0&PerMode=Totals&Season=' + season + '&SeasonType=Regular+Season', headers=headers, timeout=10)
    # Stop with a clear HTTPError on a 4xx/5xx reply rather than failing later
    # while parsing an error body.
    r.raise_for_status()

    # Verbose
    print('...done! [' + str(i) + '/' + str(n_players) + ']')

    r_dict = json.loads(r.content)
    data = r_dict['resultSets'][0]['rowSet']
    columns = r_dict['resultSets'][0]['headers']
    frames.append(pd.DataFrame(data=data, columns=columns))

    # Pause between requests to avoid being IP-banned by NBA.com.
    time.sleep(1)

# Concatenate once: DataFrame.append was removed in pandas 2.0, and growing the
# frame inside the loop re-copied all previous rows on every iteration.
# With no players to query, h2h_df stays None as before.
h2h_df = pd.concat(frames, ignore_index=True) if frames else None

h2h_df

203952...done! [1/51]
203076...done! [2/51]
202711...done! [3/51]
203078...done! [4/51]
1627742...done! [5/51]
1627741...done! [6/51]
203468...done! [7/51]
1627747...done! [8/51]
1629012...done! [9/51]
1626156...done! [10/51]
203081...done! [11/51]
201568...done! [12/51]
1628368...done! [13/51]
201942...done! [14/51]
203471...done! [15/51]
1626164...done! [16/51]
1628378...done! [17/51]
203507...done! [18/51]
201935...done! [19/51]
1627759...done! [20/51]
1628369...done! [21/51]
202710...done! [22/51]
203954...done! [23/51]
1628381...done! [24/51]
201950...done! [25/51]
203944...done! [26/51]
1626157...done! [27/51]
202695...done! [28/51]
1626162...done! [29/51]
202689...done! [30/51]
203114...done! [31/51]
204001...done! [32/51]
200768...done! [33/51]
202681...done! [34/51]
200746...done! [35/51]
2544...done! [36/51]
1629029...done! [37/51]
1626149...done! [38/51]
203999...done! [39/51]
202696...done! [40/51]
1627783...done! [41/51]
202331...done! [42/51]
201566...done! [43/51]
162898

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22019,203952,Andrew Wiggins,203084,Harrison Barnes,4,23:44,110.7,24,110,...,7,0.286,0,0,0,0,6,7,3,1424.2
1,22019,203952,Andrew Wiggins,203145,Kent Bazemore,5,20:30,105.9,20,80,...,7,0.429,0,0,0,0,3,3,2,1229.5
2,22019,203952,Andrew Wiggins,203914,Gary Harris,3,19:20,92.2,15,91,...,5,0.200,0,0,0,0,2,2,1,1159.8
3,22019,203952,Andrew Wiggins,203933,T.J. Warren,2,16:35,86.5,12,87,...,4,0.500,0,0,0,0,2,4,2,994.6
4,22019,203952,Andrew Wiggins,1628415,Dillon Brooks,3,15:08,81.8,24,101,...,9,0.444,0,0,0,0,4,5,2,908.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15922,22019,1629627,Zion Williamson,1629139,Yuta Watanabe,1,0:11,0.2,0,0,...,0,0.000,0,0,0,0,0,0,0,11.2
15923,22019,1629627,Zion Williamson,1629013,Landry Shamet,1,0:03,0.2,0,3,...,0,0.000,0,0,0,0,0,0,0,2.6
15924,22019,1629627,Zion Williamson,1628420,Monte Morris,1,0:01,0.1,0,0,...,0,0.000,0,0,0,0,0,0,0,1.4
15925,22019,1629627,Zion Williamson,1629035,Carsen Edwards,1,0:02,0.1,0,0,...,0,0.000,0,0,0,0,0,0,0,1.6


In [33]:
# Save the combined head-to-head table; the file name encodes the season and
# the percentile cutoff.
h2h_path = f'{loc}{season}_{percentile}_h2h_stats.csv'
h2h_df.to_csv(h2h_path, index=False)

# Extract more offensive data

Get additional offensive statistics

In [34]:
# Per-game player tracking stats (PtMeasureType=PaintTouch) for the regular
# season selected in `season`.
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=PaintTouch&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here with a clear HTTPError on a 4xx/5xx reply instead of a confusing
# JSON/KeyError when the body is parsed in the next cell.
r.raise_for_status()

In [35]:
# Decode the response currently held in `r` and pull out its first result set.
r_dict = json.loads(r.content)
result_set = r_dict['resultSets'][0]
columns = result_set['headers']
data = result_set['rowSet']

# Keep only the player key and the touch metrics; everything else is unnecessary.
keep_columns = [
    'PLAYER_ID', 'TOUCHES', 'PAINT_TOUCHES', 'PAINT_TOUCH_FGM',
    'PAINT_TOUCH_FGA', 'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV',
]
stats = pd.DataFrame(data, columns=columns)[keep_columns]

# Attach the extra columns to the selected players, matching rows on PLAYER_ID.
off_df = off_df.merge(stats, on='PLAYER_ID')
off_df.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'AGE', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'BLKA', 'PF', 'PTS', 'SEASON_ID', 'TOUCHES',
       'PAINT_TOUCHES', 'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA',
       'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV'],
      dtype='object')

In [36]:
# Per-game player tracking stats (PtMeasureType=Efficiency) for the regular
# season selected in `season`.
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Efficiency&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here with a clear HTTPError on a 4xx/5xx reply instead of a confusing
# JSON/KeyError when the body is parsed in the next cell.
r.raise_for_status()

In [37]:
# Decode the response currently held in `r` and pull out its first result set.
r_dict = json.loads(r.content)
result_set = r_dict['resultSets'][0]
columns = result_set['headers']
data = result_set['rowSet']

# Player key plus the scoring and shooting-percentage columns to keep.
keep_columns = [
    'PLAYER_ID',
    'DRIVE_PTS', 'DRIVE_FG_PCT',
    'CATCH_SHOOT_PTS', 'CATCH_SHOOT_FG_PCT',
    'PULL_UP_PTS', 'PULL_UP_FG_PCT',
    'PAINT_TOUCH_PTS', 'PAINT_TOUCH_FG_PCT',
    'POST_TOUCH_PTS', 'POST_TOUCH_FG_PCT',
    'ELBOW_TOUCH_PTS', 'ELBOW_TOUCH_FG_PCT',
    'EFF_FG_PCT',
]
stats = pd.DataFrame(data, columns=columns)[keep_columns]

# Join on PLAYER_ID, then drop the name column so only the ID identifies a player.
off_df = off_df.merge(stats, on='PLAYER_ID').drop(columns=['PLAYER_NAME'])
off_df

Unnamed: 0,PLAYER_ID,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,CATCH_SHOOT_FG_PCT,PULL_UP_PTS,PULL_UP_FG_PCT,PAINT_TOUCH_PTS,PAINT_TOUCH_FG_PCT,POST_TOUCH_PTS,POST_TOUCH_FG_PCT,ELBOW_TOUCH_PTS,ELBOW_TOUCH_FG_PCT,EFF_FG_PCT
0,203952,25.0,54,34.4,8.1,18.1,0.447,2.1,6.2,0.332,...,0.344,4.6,0.322,2.1,0.625,0.6,0.519,0.6,0.6,0.504
1,203076,27.0,62,34.4,8.9,17.7,0.503,1.2,3.5,0.33,...,0.353,2.7,0.36,8.2,0.709,4.5,0.452,2.1,0.593,0.536
2,202711,31.0,63,33.1,6.6,14.8,0.447,3.0,7.3,0.414,...,0.419,3.9,0.364,1.0,0.686,0.8,0.519,0.4,0.765,0.548
3,203078,27.0,57,36.0,10.4,22.9,0.455,3.0,8.4,0.353,...,0.38,8.5,0.383,1.1,0.51,0.2,0.385,0.8,0.594,0.52
4,1627742,22.0,62,33.9,8.2,17.7,0.463,2.4,6.2,0.391,...,0.409,4.9,0.388,1.3,0.652,0.5,0.414,0.5,0.565,0.53
5,1627741,27.0,72,30.8,6.9,16.1,0.429,3.8,9.6,0.394,...,0.401,7.7,0.395,0.9,0.519,0.0,0.5,0.2,0.5,0.546
6,203468,28.0,70,36.5,8.7,19.4,0.451,2.8,7.3,0.379,...,0.459,9.7,0.426,0.6,0.474,0.0,0.0,0.3,0.727,0.523
7,1627747,25.0,45,29.6,6.9,16.3,0.425,1.8,4.9,0.364,...,0.321,6.6,0.399,1.3,0.561,0.3,0.556,0.7,0.45,0.48
8,1629012,21.0,65,33.0,7.9,16.7,0.472,1.5,3.9,0.38,...,0.407,5.0,0.396,1.8,0.63,0.0,0.0,0.9,0.596,0.517
9,1626156,24.0,45,32.3,8.0,18.8,0.426,3.5,9.6,0.367,...,0.397,10.3,0.39,0.5,0.833,0.2,0.5,0.4,0.6,0.519


In [38]:
# Save the enriched offensive stats; the file name encodes the season and the
# percentile cutoff.
off_path = f'{loc}{season}_{percentile}_off_stats.csv'
off_df.to_csv(off_path, index=False)