In [1]:
import pandas as pd
import requests
import json

import time

In [35]:
# Notebook configuration: the season to pull and the folder the CSV files are written to.
season = '2018-19'
loc = '../stats/'

# Request offensive data from NBA API

Requests data specifically for the season defined above

In [36]:
# Browser-style request headers, reused by every stats.nba.com request in this notebook.
# NOTE(review): presumably the API rejects requests without them - confirm.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.nba.com/'
}

# League-wide player stats (MeasureType=Base, PerMode=PerGame) for the configured season.
r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here on an HTTP error status instead of later with a confusing JSON/KeyError while parsing.
r.raise_for_status()

In [37]:
# Decode the JSON payload; the first result set carries the player rows and their column names.
r_dict = json.loads(r.content)

data = r_dict['resultSets'][0]['rowSet']
columns = r_dict['resultSets'][0]['headers']

df = pd.DataFrame(data=data, columns=columns)
df = df.drop(columns=df.columns[31:])  # Get rid of ranks (keeps only the first 31 columns)
df = df.drop(columns=['NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'STL', 'BLK', 'PFD', 'W', 'L', 'W_PCT'])

# SEASON_ID is '2' followed by the season's four-digit starting year ('2018-19' -> '22018').
# Using the full year instead of a hard-coded '20' keeps the same value for 2000-onward
# seasons and no longer mislabels earlier ones ('1999-00' -> '21999', not '22099').
df['SEASON_ID'] = '2' + season[:4]

print(df.shape)
df.head()

(530, 23)


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
0,203932,Aaron Gordon,23.0,78,33.8,6.0,13.4,0.449,1.6,4.4,...,0.731,1.7,5.7,7.4,3.7,2.1,0.7,2.2,16.0,22018
1,1628988,Aaron Holiday,22.0,50,12.9,2.1,5.2,0.401,0.9,2.5,...,0.82,0.1,1.2,1.3,1.7,0.8,0.2,1.4,5.9,22018
2,1627846,Abdel Nader,25.0,61,11.4,1.5,3.5,0.423,0.5,1.6,...,0.75,0.2,1.7,1.9,0.3,0.4,0.1,1.1,4.0,22018
3,201143,Al Horford,33.0,68,29.0,5.7,10.6,0.535,1.1,3.0,...,0.821,1.8,5.0,6.7,4.2,1.5,0.4,1.9,13.6,22018
4,202329,Al-Farouq Aminu,28.0,81,28.3,3.2,7.3,0.433,1.2,3.5,...,0.867,1.4,6.1,7.5,1.3,0.9,0.4,1.8,9.4,22018


# Get top percentile of players

Only players above the chosen points (PTS) percentile are used when fetching H2H data

In [38]:
# Quantile of the PTS distribution used as the cutoff: only players scoring more than it are kept.
percentile = 0.2

# Take the quantile of the PTS column only. DataFrame.quantile over the whole frame also
# visits the string columns (PLAYER_NAME, SEASON_ID) and raises on pandas versions where
# numeric_only defaults to False.
pts_cutoff = df['PTS'].quantile(q=percentile)
off_df = df[df['PTS'] > pts_cutoff].copy()

n_players = off_df.shape[0]
print('Number of over ' + str(percentile * 100) + 'th percentile players: ' + str(n_players))
# One matchup request is made per player: 1 s is the sleep between requests;
# 1.71 s is presumably the observed average time per request - TODO confirm.
print('Minimum time to complete: ' + format(n_players * 1 / 60, '.1f') + ' minutes')
print('Estimated time to complete: ' + format(n_players * 1.71 / 60, '.1f') + ' minutes')

off_df.head()

Number of over 20.0th percentile players: 422
Minimum time to complete: 7.0 minutes
Estimated time to complete: 12.0 minutes


Unnamed: 0,PLAYER_ID,PLAYER_NAME,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,FT_PCT,OREB,DREB,REB,AST,TOV,BLKA,PF,PTS,SEASON_ID
0,203932,Aaron Gordon,23.0,78,33.8,6.0,13.4,0.449,1.6,4.4,...,0.731,1.7,5.7,7.4,3.7,2.1,0.7,2.2,16.0,22018
1,1628988,Aaron Holiday,22.0,50,12.9,2.1,5.2,0.401,0.9,2.5,...,0.82,0.1,1.2,1.3,1.7,0.8,0.2,1.4,5.9,22018
2,1627846,Abdel Nader,25.0,61,11.4,1.5,3.5,0.423,0.5,1.6,...,0.75,0.2,1.7,1.9,0.3,0.4,0.1,1.1,4.0,22018
3,201143,Al Horford,33.0,68,29.0,5.7,10.6,0.535,1.1,3.0,...,0.821,1.8,5.0,6.7,4.2,1.5,0.4,1.9,13.6,22018
4,202329,Al-Farouq Aminu,28.0,81,28.3,3.2,7.3,0.433,1.2,3.5,...,0.867,1.4,6.1,7.5,1.3,0.9,0.4,1.8,9.4,22018


# Get H2H data for each player

Retrieves data from NBA.com based on each player ID

Fetching all of the data takes a long time because there is a 1-second delay between requests to avoid being IP-banned by NBA.com

In [26]:
# Fetch the season matchup table for every player in off_df (one request per PLAYER_ID)
# and stack the per-player tables into a single frame, h2h_df.
frames = []  # one DataFrame per player; stays available if the loop is interrupted
total = off_df.shape[0]

for i, ID in enumerate(off_df['PLAYER_ID'], start=1):
    print(ID, end = '')
    r = requests.get('https://stats.nba.com/stats/leagueseasonmatchups?DateFrom=&DateTo=&LeagueID=00&OffPlayerID=' + str(ID) + '&Outcome=&PORound=0&PerMode=Totals&Season=' + season + '&SeasonType=Regular+Season', headers=headers, timeout=1000)
    # Stop on an HTTP error status rather than failing later while parsing the body.
    r.raise_for_status()

    # Verbose
    print('...done! [' + str(i) + '/' + str(total) + ']')

    r_dict = json.loads(r.content)
    data = r_dict['resultSets'][0]['rowSet']
    columns = r_dict['resultSets'][0]['headers']
    frames.append(pd.DataFrame(data=data, columns=columns))

    # Pause between requests to avoid being IP-banned by NBA.com.
    time.sleep(1)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside the loop copies all accumulated rows on every iteration.
# With no players to fetch, h2h_df stays None as before.
h2h_df = pd.concat(frames, ignore_index=True) if frames else None

h2h_df

203932...done! [1/420]
1628988...done! [2/420]
1627846...done! [3/420]
201143...done! [4/420]
202329...done! [5/420]
202692...done! [6/420]
1629346...done! [7/420]
1627936...done! [8/420]
203458...done! [9/420]
1628035...done! [10/420]
203459...done! [11/420]
1629019...done! [12/420]
203083...done! [13/420]
2738...done! [14/420]
203952...done! [15/420]
1629014...done! [16/420]
1627790...done! [17/420]
203076...done! [18/420]
201229...done! [19/420]
1628394...done! [20/420]
203382...done! [21/420]
203085...done! [22/420]
202340...done! [23/420]
1628389...done! [24/420]
203463...done! [25/420]
1627732...done! [26/420]
202687...done! [27/420]
201933...done! [28/420]
1626246...done! [29/420]
1626171...done! [30/420]
203992...done! [31/420]
202711...done! [32/420]
1629626...done! [33/420]
202954...done! [34/420]
203078...done! [35/420]
1629634...done! [36/420]
1629164...done! [37/420]
1627742...done! [38/420]
202688...done! [39/420]
201572...done! [40/420]
1628971...done! [41/420]
1628981..

Unnamed: 0,SEASON_ID,OFF_PLAYER_ID,OFF_PLAYER_NAME,DEF_PLAYER_ID,DEF_PLAYER_NAME,GP,MATCHUP_MIN,PARTIAL_POSS,PLAYER_PTS,TEAM_PTS,...,MATCHUP_FG3A,MATCHUP_FG3_PCT,HELP_BLK,HELP_FGM,HELP_FGA,HELP_FG_PERC,MATCHUP_FTM,MATCHUP_FTA,SFL,MATCHUP_TIME_SEC
0,22019,203932,Aaron Gordon,202710,Jimmy Butler,4,25:22,117.3,24,110,...,10,0.400,0,0,0,0,2,2,0,1521.9
1,22019,203932,Aaron Gordon,1627823,Juancho Hernangomez,3,15:56,87.2,10,97,...,5,0.200,0,0,0,0,1,1,1,956.2
2,22019,203932,Aaron Gordon,203114,Khris Middleton,4,17:50,87.1,13,82,...,6,0.333,0,0,0,0,3,5,3,1070.2
3,22019,203932,Aaron Gordon,203933,T.J. Warren,2,16:36,81.6,25,82,...,5,0.600,0,0,0,0,2,4,2,995.6
4,22019,203932,Aaron Gordon,203084,Harrison Barnes,2,17:33,79.4,22,102,...,4,0.750,0,0,0,0,1,1,1,1052.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114822,22019,1629627,Zion Williamson,1629139,Yuta Watanabe,1,0:11,0.2,0,0,...,0,0.000,0,0,0,0,0,0,0,11.2
114823,22019,1629627,Zion Williamson,1629013,Landry Shamet,1,0:03,0.2,0,3,...,0,0.000,0,0,0,0,0,0,0,2.6
114824,22019,1629627,Zion Williamson,1628420,Monte Morris,1,0:01,0.1,0,0,...,0,0.000,0,0,0,0,0,0,0,1.4
114825,22019,1629627,Zion Williamson,1629035,Carsen Edwards,1,0:02,0.1,0,0,...,0,0.000,0,0,0,0,0,0,0,1.6


In [27]:
# Write the H2H matchups to disk; the file name records the season and the percentile cutoff.
h2h_path = ''.join([loc, season, '_', str(percentile), '_h2h_stats.csv'])
h2h_df.to_csv(h2h_path, index=False)

# Extract more offensive data

Get additional offensive statistics

In [39]:
# Per-game player tracking stats for the configured season, PtMeasureType=PaintTouch.
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=PaintTouch&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here on an HTTP error status instead of later with a confusing JSON/KeyError while parsing.
r.raise_for_status()

In [40]:
# Turn the paint-touch response into a frame and attach the selected columns to off_df.
r_dict = json.loads(r.content)
result_set = r_dict['resultSets'][0]
data, columns = result_set['rowSet'], result_set['headers']

stats = pd.DataFrame(data=data, columns=columns)

# Keep only the join key and the touch / paint-touch measures
paint_touch_cols = [
    'PLAYER_ID',
    'TOUCHES',
    'PAINT_TOUCHES',
    'PAINT_TOUCH_FGM',
    'PAINT_TOUCH_FGA',
    'PAINT_TOUCH_PASSES',
    'PAINT_TOUCH_TOV',
]
stats = stats[paint_touch_cols]

# Default (inner) join on PLAYER_ID
off_df = off_df.merge(stats, on=['PLAYER_ID'])
off_df.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'AGE', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'BLKA', 'PF', 'PTS', 'SEASON_ID', 'TOUCHES',
       'PAINT_TOUCHES', 'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA',
       'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV'],
      dtype='object')

In [41]:
# Per-game player tracking stats for the configured season, PtMeasureType=Efficiency.
r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Efficiency&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here on an HTTP error status instead of later with a confusing JSON/KeyError while parsing.
r.raise_for_status()

In [42]:
# Turn the efficiency response into a frame and attach the selected columns to off_df.
r_dict = json.loads(r.content)
result_set = r_dict['resultSets'][0]
data, columns = result_set['rowSet'], result_set['headers']

stats = pd.DataFrame(data=data, columns=columns)

# Join key plus points / FG% by play type and the effective FG%
efficiency_cols = [
    'PLAYER_ID',
    'DRIVE_PTS', 'DRIVE_FG_PCT',
    'CATCH_SHOOT_PTS', 'CATCH_SHOOT_FG_PCT',
    'PULL_UP_PTS', 'PULL_UP_FG_PCT',
    'PAINT_TOUCH_PTS', 'PAINT_TOUCH_FG_PCT',
    'POST_TOUCH_PTS', 'POST_TOUCH_FG_PCT',
    'ELBOW_TOUCH_PTS', 'ELBOW_TOUCH_FG_PCT',
    'EFF_FG_PCT',
]
stats = stats[efficiency_cols]

# Default (inner) join on PLAYER_ID, then drop the name column from the result
off_df = off_df.merge(stats, on=['PLAYER_ID'])
off_df = off_df.drop(columns=['PLAYER_NAME'])
off_df

Unnamed: 0,PLAYER_ID,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,CATCH_SHOOT_FG_PCT,PULL_UP_PTS,PULL_UP_FG_PCT,PAINT_TOUCH_PTS,PAINT_TOUCH_FG_PCT,POST_TOUCH_PTS,POST_TOUCH_FG_PCT,ELBOW_TOUCH_PTS,ELBOW_TOUCH_FG_PCT,EFF_FG_PCT
0,203932,23.0,78,33.8,6.0,13.4,0.449,1.6,4.4,0.349,...,0.355,2.9,0.362,3.9,0.645,1.5,0.404,1.1,0.507,0.510
1,1628988,22.0,50,12.9,2.1,5.2,0.401,0.9,2.5,0.339,...,0.379,2.1,0.338,0.0,1.000,0.0,0.000,0.2,1.000,0.483
2,1627846,25.0,61,11.4,1.5,3.5,0.423,0.5,1.6,0.320,...,0.305,0.2,0.308,0.3,0.571,0.0,0.000,0.2,0.429,0.495
3,201143,33.0,68,29.0,5.7,10.6,0.535,1.1,3.0,0.360,...,0.444,0.6,0.458,4.1,0.706,2.0,0.514,1.5,0.603,0.586
4,202329,28.0,81,28.3,3.2,7.3,0.433,1.2,3.5,0.343,...,0.351,0.2,0.171,2.0,0.619,0.1,0.429,0.4,0.450,0.514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,1627812,26.0,71,15.0,2.2,5.0,0.435,0.8,2.1,0.362,...,0.382,2.3,0.422,0.2,0.625,0.0,0.000,0.2,0.600,0.511
418,1628380,21.0,77,17.6,2.5,5.2,0.473,0.5,1.6,0.331,...,0.308,0.2,0.269,2.8,0.657,0.9,0.477,0.8,0.447,0.523
419,203897,24.0,63,34.5,8.4,18.0,0.467,1.9,5.1,0.374,...,0.376,5.8,0.357,1.5,0.741,0.3,0.412,0.6,0.571,0.518
420,2585,35.0,68,12.9,1.3,2.8,0.440,0.0,0.1,0.000,...,0.325,0.1,0.308,2.3,0.479,0.3,0.533,0.5,0.355,0.440


In [43]:
# League-wide player stats for the configured season, this time with PerMode=Per100Possessions.
r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Possessions&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)

# Fail here on an HTTP error status instead of later with a confusing JSON/KeyError while parsing.
r.raise_for_status()

In [44]:
# Turn the per-100-possessions response into a frame and attach its PTS column
# to off_df under the name PTS_PER_100.
r_dict = json.loads(r.content)
result_set = r_dict['resultSets'][0]
data, columns = result_set['rowSet'], result_set['headers']

stats = pd.DataFrame(data=data, columns=columns)
stats = stats[['PLAYER_ID', 'PTS']].rename(columns={'PTS': 'PTS_PER_100'})

# Default (inner) join on PLAYER_ID
off_df = off_df.merge(stats, on=['PLAYER_ID'])
off_df.head()

Unnamed: 0,PLAYER_ID,AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,PULL_UP_PTS,PULL_UP_FG_PCT,PAINT_TOUCH_PTS,PAINT_TOUCH_FG_PCT,POST_TOUCH_PTS,POST_TOUCH_FG_PCT,ELBOW_TOUCH_PTS,ELBOW_TOUCH_FG_PCT,EFF_FG_PCT,PTS_PER_100
0,203932,23.0,78,33.8,6.0,13.4,0.449,1.6,4.4,0.349,...,2.9,0.362,3.9,0.645,1.5,0.404,1.1,0.507,0.51,22.8
1,1628988,22.0,50,12.9,2.1,5.2,0.401,0.9,2.5,0.339,...,2.1,0.338,0.0,1.0,0.0,0.0,0.2,1.0,0.483,21.1
2,1627846,25.0,61,11.4,1.5,3.5,0.423,0.5,1.6,0.32,...,0.2,0.308,0.3,0.571,0.0,0.0,0.2,0.429,0.495,15.9
3,201143,33.0,68,29.0,5.7,10.6,0.535,1.1,3.0,0.36,...,0.6,0.458,4.1,0.706,2.0,0.514,1.5,0.603,0.586,22.3
4,202329,28.0,81,28.3,3.2,7.3,0.433,1.2,3.5,0.343,...,0.2,0.171,2.0,0.619,0.1,0.429,0.4,0.45,0.514,15.7


In [45]:
# Write the combined offensive stats to disk; the file name records the season and the percentile cutoff.
off_path = ''.join([loc, season, '_', str(percentile), '_off_stats.csv'])
off_df.to_csv(off_path, index=False)