In [1]:
import pandas as pd
import requests
import json
import os

import time

In [2]:
# Directory that every CSV produced by this notebook is written to
loc = './stats/'

# First and last season to scrape (both inclusive), written as 'YYYY-YY'
seasons = ['2018-19', '2021-22']

# Headers for nba.com -- sent with every request made to stats.nba.com below
headers = dict([
    ('Accept', 'application/json, text/plain, */*'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'),
    ('x-nba-stats-origin', 'stats'),
    ('Sec-Fetch-Site', 'same-site'),
    ('Sec-Fetch-Mode', 'cors'),
    ('Referer', 'https://www.nba.com/'),
])

# Offensive Data

Scrape per-player offensive statistics and head-to-head matchup data from stats.nba.com.

In [3]:
def formatData(data):
    """Convert a JSON response into a pandas DataFrame.

    `data` must expose the raw JSON payload as `data.content`. The frame is
    built from the first entry of the payload's 'resultSets' list: its
    'rowSet' supplies the rows and its 'headers' supplies the column names.
    """
    first_set = json.loads(data.content)['resultSets'][0]
    return pd.DataFrame(data=first_set['rowSet'], columns=first_set['headers'])

In [4]:
def getOffData(season, percentile, delay=1):
    """Scrape offensive and head-to-head matchup stats for one season and save them as CSVs.

    Players whose per-game PTS is strictly above the `percentile` quantile of
    the league are kept. For each kept player the season matchup table is
    downloaded and all tables are written, stacked, to
    `<loc><season>_<percentile>_h2h_stats.csv`. The kept players' per-game
    stats, extended with paint-touch, efficiency and per-100-possession
    scoring columns, are written to `<loc><season>_<percentile>_off_stats.csv`.

    Relies on the notebook-level `headers`, `loc` and `formatData`.

    Parameters
    ----------
    season : str
        Season label inserted into the request URLs, e.g. '2018-19'.
    percentile : float
        Quantile of PTS (between 0 and 1) used as the cutoff.
    delay : int or float, default 1
        Seconds to sleep after each per-player matchup request.

    Returns
    -------
    bool
        Always True.

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status code.
    """
    r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()  # fail loudly here instead of with a confusing JSON/KeyError later

    # Format data
    df = formatData(r)

    # Remove unnecessary columns
    df = df.drop(columns=df.columns[31:]) # Get rid of ranks
    df = df.drop(columns=['NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'STL', 'BLK', 'PFD', 'W', 'L', 'W_PCT'])

    # Add Season ID column: '2' followed by the starting year ('22018' for '2018-19').
    # The '2' is hard-coded rather than borrowed from the first digit of the year,
    # so the prefix stays the same for any season (presumably the intended format -- TODO confirm).
    df['SEASON_ID'] = '2' + season[0:4]

    # Get top percentile. The quantile is taken on the PTS column only:
    # DataFrame.quantile over the whole frame fails on the text columns in recent pandas.
    pts_cutoff = df['PTS'].quantile(q=percentile)
    off_df = df[df['PTS'] > pts_cutoff].copy()
    total = off_df.shape[0]
    print('Estimated time to complete: ' + str(format(total * (delay + delay*0.71) / 60, '.1f')) + ' minutes')

    # Generate h2h data: one matchup table per selected player, stacked once at the end
    h2h_frames = []
    for i, ID in enumerate(off_df['PLAYER_ID'], start=1):
        print(str(ID) + '...', end = '')
        # NOTE: this call uses a far longer timeout than the other requests in this function
        r = requests.get('https://stats.nba.com/stats/leagueseasonmatchups?DateFrom=&DateTo=&LeagueID=00&OffPlayerID=' + str(ID) + '&Outcome=&PORound=0&PerMode=Totals&Season=' + season + '&SeasonType=Regular+Season', headers=headers, timeout=1000)
        r.raise_for_status()

        # Verbose
        print('done! [' + str(i) + '/' + str(total) + ']')

        h2h_frames.append(formatData(r))
        time.sleep(delay)

    # Save data to csv
    if h2h_frames:
        h2h_df = pd.concat(h2h_frames, ignore_index=True)
        h2h_df.to_csv(loc + season + '_' + str(percentile) + '_h2h_stats.csv', index=False)
    else:
        # No player passed the cutoff, so there is nothing to stack or write
        print('No players above the cutoff; skipping h2h stats file')

    # Paint touches stats
    r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=PaintTouch&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()
    stats = formatData(r)

    # Remove unnecessary columns
    stats = stats[['PLAYER_ID', 'TOUCHES', 'PAINT_TOUCHES', 'PAINT_TOUCH_FGM', 'PAINT_TOUCH_FGA', 'PAINT_TOUCH_PASSES', 'PAINT_TOUCH_TOV']]
    off_df = pd.merge(off_df, stats, on=['PLAYER_ID'])

    # Efficiency stats
    r = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Efficiency&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()
    stats = formatData(r)

    # Extract needed columns and merge
    stats = stats[['PLAYER_ID', 'DRIVE_PTS', 'DRIVE_FG_PCT', 'CATCH_SHOOT_PTS',
                    'CATCH_SHOOT_FG_PCT', 'PULL_UP_PTS', 'PULL_UP_FG_PCT',
                    'PAINT_TOUCH_PTS', 'PAINT_TOUCH_FG_PCT', 'POST_TOUCH_PTS',
                    'POST_TOUCH_FG_PCT', 'ELBOW_TOUCH_PTS', 'ELBOW_TOUCH_FG_PCT',
                    'EFF_FG_PCT']]
    off_df = pd.merge(off_df, stats, on=['PLAYER_ID'])
    off_df = off_df.drop(columns=['PLAYER_NAME'])

    # General stats
    r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Possessions&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()
    stats = formatData(r)

    # Extract needed columns and merge. rename() returns a new frame, which avoids
    # mutating a slice of `stats` in place (SettingWithCopyWarning).
    stats = stats[['PLAYER_ID', 'PTS']].rename(columns={'PTS': 'PTS_PER_100'})
    off_df = pd.merge(off_df, stats, on=['PLAYER_ID'])

    # Output offensive stats to csv
    off_df.to_csv(loc + season + '_' + str(percentile) + '_off_stats.csv', index=False)
    return True


# Defensive Data

Scrape per-player defensive, hustle, and defensive-rating statistics from stats.nba.com.

In [7]:
def _attachPlayerStats(base, stats, stat_columns):
    """Left-merge `stat_columns` of `stats` onto `base`, matching rows by PLAYER_ID."""
    wanted = ['PLAYER_ID'] + [c for c in stat_columns if c != 'PLAYER_ID']
    return base.merge(stats[wanted], how='left', on='PLAYER_ID')


def getDefData(season):
    """Scrape defensive stats for one season and save them to `<loc><season>_def_stats.csv`.

    Four tables are downloaded (defensive dashboard, defensive tracking stats,
    hustle stats and defensive ratings) and combined into one row per player
    of the defensive dashboard, ordered by PLAYER_NAME.

    Rows are matched on the player id. The tables were previously combined
    with DataFrame.join, which aligns on the row index; sort_values() does not
    change that index, so rows were paired by their position in each response
    rather than by player.

    Relies on the notebook-level `headers`, `loc` and `formatData`.

    Parameters
    ----------
    season : str
        Season label inserted into the request URLs, e.g. '2018-19'.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status code.
    """
    # Retrieve both defensive dashboard and defensive impact
    r_dd = requests.get('https://stats.nba.com/stats/leaguedashptdefend?College=&Conference=&Country=&DateFrom=&DateTo=&DefenseCategory=Overall&Division=&DraftPick=&DraftYear=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r_dd.raise_for_status()
    r_di = requests.get('https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Defense&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r_di.raise_for_status()

    # Format data
    dd_df = formatData(r_dd)
    di_df = formatData(r_di)

    # The dashboard is the base table: order it by name and give its id column
    # the same name the other tables use so every merge can key on PLAYER_ID.
    def_data = dd_df.sort_values(by='PLAYER_NAME').rename(columns={'CLOSE_DEF_PERSON_ID' : 'PLAYER_ID'})

    # Combine dataframes. The left merge keeps every dashboard player and
    # ignores tracking rows for players that are not on the dashboard.
    def_data = _attachPlayerStats(def_data, di_df, di_df.columns[5:11])

    # Drop unnecessary columns
    def_data = def_data.drop(columns=['PLAYER_LAST_TEAM_ID', 'PLAYER_LAST_TEAM_ABBREVIATION', 'PLAYER_POSITION', 'FREQ'])

    # Get hustle statistics
    r = requests.get('https://stats.nba.com/stats/leaguehustlestatsplayer?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&TeamID=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()
    df = formatData(r)

    # Drop unnecessary columns
    df = df.drop(columns=['TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'G', 'MIN',
                'SCREEN_AST_PTS', 'SCREEN_ASSISTS', 'OFF_LOOSE_BALLS_RECOVERED', 'DEF_LOOSE_BALLS_RECOVERED',
                'LOOSE_BALLS_RECOVERED', 'PCT_LOOSE_BALLS_RECOVERED_OFF', 'PCT_LOOSE_BALLS_RECOVERED_DEF',
                'OFF_BOXOUTS', 'BOX_OUTS', 'BOX_OUT_PLAYER_TEAM_REBS', 'BOX_OUT_PLAYER_REBS', 'PCT_BOX_OUTS_OFF',
                'PCT_BOX_OUTS_DEF', 'PCT_BOX_OUTS_TEAM_REB'])

    # Insert hustle data (everything after the first two columns,
    # presumably PLAYER_ID and PLAYER_NAME -- TODO confirm against the response)
    def_data = _attachPlayerStats(def_data, df, df.columns[2:])

    # Set season ID: '2' followed by the starting year ('22018' for '2018-19').
    # Same value as the former '220' + yy for 20xx seasons, without assuming the century.
    def_data['SEASON_ID'] = '2' + season[0:4]

    # Get defensive ratings
    r = requests.get('https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Defense&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=' + season + '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=', headers=headers, timeout=10)
    r.raise_for_status()
    df = formatData(r)

    # Drop columns that are not needed
    df = df.drop(columns=['NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'DREB', 'PCT_DREB', 'STL',
        'BLK', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'DEF_RATING_RANK', 'DREB_RANK', 'DREB_PCT_RANK',
        'PCT_DREB_RANK', 'STL_RANK', 'PCT_STL_RANK', 'BLK_RANK', 'PCT_BLK_RANK', 'OPP_PTS_OFF_TOV_RANK',
        'OPP_PTS_2ND_CHANCE_RANK', 'OPP_PTS_FB_RANK', 'OPP_PTS_PAINT_RANK', 'DEF_WS_RANK', 'CFID', 'CFPARAMS'])

    # Insert rating data
    def_data = _attachPlayerStats(def_data, df, df.columns[2:])

    # Save to csv
    def_data.to_csv(loc + season + '_def_stats.csv', index=False)

# Get All Data

Scrape offensive, defensive, and head-to-head data for every season in the configured range.

In [8]:
# Ensure directory exists (no error if it is already there)
os.makedirs(loc, exist_ok=True)

# Get all offensive and defensive stats over seasons[0] .. seasons[1], inclusive.
# Labels are rebuilt from the starting year so the two-digit suffix is always
# zero-padded ('2008-09') and wraps at the century ('1999-00'). The range is
# bounded, so it cannot loop forever the way a string comparison against a
# label that is never produced could.
first_year = int(seasons[0][0:4])
last_year = int(seasons[1][0:4])

for start_year in range(first_year, last_year + 1):
    season = '{}-{:02d}'.format(start_year, (start_year + 1) % 100)
    print(season)
    getOffData(season, 0.9)
    getDefData(season)

2018-19
Estimated time to complete: 1.5 minutes
203083...done! [1/53]
203952...done! [2/53]
203076...done! [3/53]
201933...done! [4/53]
202711...done! [5/53]
203078...done! [6/53]
1627742...done! [7/53]
1627741...done! [8/53]
203468...done! [9/53]
1626156...done! [10/53]
203081...done! [11/53]
201568...done! [12/53]
1628368...done! [13/53]
201942...done! [14/53]
201565...done! [15/53]
1626164...done! [16/53]
1628378...done! [17/53]
203507...done! [18/53]
200755...done! [19/53]
203960...done! [20/53]
1627750...done! [21/53]
201935...done! [22/53]
202710...done! [23/53]
203954...done! [24/53]
1628381...done! [25/53]
202322...done! [26/53]
201950...done! [27/53]
203944...done! [28/53]
1626157...done! [29/53]
202695...done! [30/53]
202689...done! [31/53]
201142...done! [32/53]
203114...done! [33/53]
202691...done! [34/53]
1628398...done! [35/53]
202681...done! [36/53]
200746...done! [37/53]
1628374...done! [38/53]
2544...done! [39/53]
101150...done! [40/53]
1629029...done! [41/53]
201144..