In [1]:
import numpy as np
import pandas as pd

In [2]:
# Found this partway through the analysis: https://sites.northwestern.edu/nusportsanalytics/2019/06/21/the-rise-of-triple-doubles-in-the-nba-examining-the-statistical-causes/
# Basically exactly what I want to do.

In [3]:
# Import high-level information
from nba_api.stats.static import teams
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog

In [4]:
# R to python reference
# https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07

In [5]:
# Collect the full name of every player that nba_api's static player list flags as active.
nbaPlayers = players.get_players()
# Rely on truthiness rather than comparing the flag to True with `==` (PEP 8).
activePlayers = [player['full_name'] for player in nbaPlayers if player['is_active']]

In [6]:
# Pings the nba_api for a player across one or more seasons.
#   - playerName (string): player's full name in the nba_api db.
class PlayerStats():
    """Game-log container for a single NBA player, fetched through nba_api.

    Attributes:
        playerName: full name used to look the player up in the player list
            passed to GetPlayerGameLog().
        playerLogData: DataFrame with the concatenated per-season game logs;
            an empty DataFrame until GetPlayerGameLog() has been called.
        seasons: list of the seasons currently held in playerLogData.
    """

    def __init__( self, playerName ):
        self.playerName = playerName
        self.playerLogData = pd.DataFrame()
        self.seasons = list()

    def GetPlayerGameLog( self, seasons, players ):
        """Fetch the player's game log for each season and store the combined result.

        Args:
            seasons: iterable of season identifiers; each one is passed unchanged
                as the `season` argument of `playergamelog.PlayerGameLog`
                (the notebook uses integer start years such as 2018).
            players: object exposing `get_players()`, returning a list of dicts
                with at least 'id' and 'full_name' keys (the notebook passes the
                nba_api static `players` module).

        Raises:
            ValueError: if no entry returned by `players.get_players()` has a
                'full_name' equal to `self.playerName`.
        """
        # Materialise once: lets callers pass any iterable and keeps self.seasons
        # independent of later changes to the caller's own list.
        seasons = list(seasons)

        matches = [player for player in players.get_players()
                       if player['full_name'] == self.playerName]
        if not matches:
            # Previously this surfaced as a bare IndexError from `curPlayer[0]`.
            raise ValueError( "No player named '{}' found in players.get_players()".format(self.playerName) )
        # If several entries share the name, the first one is used.
        curPlayer = matches[0]

        # Ping the API separately for each season and extract the response dataframe.
        seasonLogs = [
            playergamelog.PlayerGameLog( player_id = curPlayer['id'], season = curSeason ).get_data_frames()[0]
            for curSeason in seasons
        ]

        # Combine all seasons. pd.concat raises on an empty list, so an empty
        # request yields an empty log instead of an error.
        # NOTE: each season keeps its own row index, so index labels repeat across seasons.
        playerLog = pd.concat(seasonLogs) if seasonLogs else pd.DataFrame()
        # Add player name.
        playerLog = playerLog.assign( Player_Name = curPlayer['full_name'] )
        self.playerLogData = playerLog
        self.seasons = seasons


In [7]:
# Create the stats container for Luka Doncic; the constructor only stores the name and empty placeholders.
LukaDoncicStats = PlayerStats(playerName = 'Luka Doncic' )

In [8]:
# Sanity check: the name stored on the instance.
LukaDoncicStats.playerName

'Luka Doncic'

In [9]:
# Fetch the 2018, 2019 and 2020 game logs (one PlayerGameLog request per season) and store them on the instance.
LukaDoncicStats.GetPlayerGameLog(seasons=[2018,2019,2020],players=players)

In [10]:
# Sanity check: the seasons recorded by the fetch above.
LukaDoncicStats.seasons

[2018, 2019, 2020]

In [11]:
# Inspect the combined game log: all requested seasons stacked in one DataFrame, plus the Player_Name column.
LukaDoncicStats.playerLogData

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Player_Name
0,22018,1629029,0021801216,"APR 09, 2019",DAL vs. PHX,W,32,6,14,0.429,...,16,11,3,0,4,1,21,15,1,Luka Doncic
1,22018,1629029,0021801172,"APR 03, 2019",DAL vs. MIN,L,32,11,21,0.524,...,12,6,0,1,4,1,27,-15,1,Luka Doncic
2,22018,1629029,0021801124,"MAR 28, 2019",DAL @ MIA,L,31,6,18,0.333,...,8,7,0,0,6,4,19,-8,1,Luka Doncic
3,22018,1629029,0021801114,"MAR 26, 2019",DAL vs. SAC,L,36,9,20,0.450,...,12,12,0,0,4,2,28,-7,1,Luka Doncic
4,22018,1629029,0021801092,"MAR 23, 2019",DAL @ GSW,W,27,6,14,0.429,...,11,10,3,0,3,3,23,29,1,Luka Doncic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,22020,1629029,0022000071,"JAN 01, 2021",DAL vs. MIA,W,36,9,22,0.409,...,15,7,0,1,5,2,27,12,1,Luka Doncic
62,22020,1629029,0022000059,"DEC 30, 2020",DAL vs. CHA,L,24,4,10,0.400,...,2,5,2,0,3,3,12,-27,1,Luka Doncic
63,22020,1629029,0022000031,"DEC 27, 2020",DAL @ LAC,W,26,8,18,0.444,...,9,8,1,0,2,0,24,29,1,Luka Doncic
64,22020,1629029,0022000008,"DEC 25, 2020",DAL @ LAL,L,34,9,19,0.474,...,4,7,0,1,3,0,27,-14,1,Luka Doncic


In [12]:
# Theory is that players hunt for double doubles and triple doubles. Plot distribution of 
# rebounds when 10 points or more
# assists when 10 points or more
# assists when 10 points or more and 10 rebounds or more.
# rebounds when 10 points or more and 10 assists or more.
# - playerStatsData is the game-log DataFrame (PlayerStats.playerLogData), not the PlayerStats object itself.
class PlotGameLogStats():
    """Histograms of one box-score stat over games where other stats reached double digits.

    Args:
        playerStatsData: game-log DataFrame with at least 'PTS', 'REB' and 'AST'
            columns (the notebook passes PlayerStats.playerLogData).
        playerName: name used as the prefix of every plot title.

    Every public method returns the object produced by `DataFrame.plot.hist`
    after setting its x-label and title.
    """

    # A filter stat must reach this value for the game to be included.
    DOUBLE_DIGIT_THRESHOLD = 10
    # The x-axis always covers at least 0..19 so plots for different players
    # stay comparable; it grows when a game exceeds that.
    MIN_UPPER_BIN = 19

    def __init__( self, playerStatsData, playerName ):
        self.playerStatsData = playerStatsData
        self.playerName      = playerName

    # Plot rebounds when 10 points or more.
    def ReboundsWhen10Points( self ):
        return( self._PlotConditionalHistogram(
            'REB', ['PTS'], 'Rebounds',
            ': Rebounds when 10 points or more' ) )

    # Plot assists when 10 points or more.
    def AssistsWhen10Points( self ):
        return( self._PlotConditionalHistogram(
            'AST', ['PTS'], 'Assists',
            ': Assists when 10 points or more' ) )

    # Plot rebounds when 10 points or more and 10 assists or more.
    def Rebounds10PointsAndAssists( self ):
        return( self._PlotConditionalHistogram(
            'REB', ['PTS', 'AST'], 'Rebounds',
            ': Rebounds when 10 points or more and 10 assists or more' ) )

    # Plot assists when 10 points or more and 10 rebounds or more.
    def Assists10PointsAndRebounds( self ):
        return( self._PlotConditionalHistogram(
            'AST', ['PTS', 'REB'], 'Assists',
            ': Assists when 10 points or more and 10 rebounds or more' ) )

    # Shared implementation: keep games where every filter column is >= the
    # threshold, then histogram `statColumn` for those games.
    def _PlotConditionalHistogram( self, statColumn, filterColumns, xLabel, titleSuffix ):
        curPlayerData = self.playerStatsData

        # Applying the filters one after another is equivalent to AND-ing the masks.
        for filterColumn in filterColumns:
            curPlayerData = curPlayerData[curPlayerData[filterColumn] >= self.DOUBLE_DIGIT_THRESHOLD]
        plotData = curPlayerData[[statColumn]]

        # Plot
        plot = plotData.plot.hist(bins = self._IntegerBins(plotData[statColumn]))
        plot.set_xlabel(xLabel)
        plot.set_title(self.playerName + titleSuffix)
        plot.locator_params(integer=True)
        return(plot)

    # Bin edges giving every integer from 0 up to the largest observed value its own bin.
    def _IntegerBins( self, values ):
        # The former fixed edges 1..19 silently dropped games with 0 or with more
        # than 19 and merged 18 and 19 into the (closed) last bin.
        observed = values.dropna()
        upper = self.MIN_UPPER_BIN
        if not observed.empty:
            upper = max(upper, int(observed.max()))
        # Edges sit on the half-integers so each bar is centred on its integer value.
        return [edge - 0.5 for edge in range(0, upper + 2)]
        

In [13]:
# The plotting helper takes the game-log DataFrame itself (not the PlayerStats object) plus the name used in titles.
PlotLukaStats = PlotGameLogStats( playerStatsData = LukaDoncicStats.playerLogData, playerName = LukaDoncicStats.playerName )

In [14]:
# Sanity check: the plotting helper holds the game-log frame it was given.
PlotLukaStats.playerStatsData

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Player_Name
0,22018,1629029,0021801216,"APR 09, 2019",DAL vs. PHX,W,32,6,14,0.429,...,16,11,3,0,4,1,21,15,1,Luka Doncic
1,22018,1629029,0021801172,"APR 03, 2019",DAL vs. MIN,L,32,11,21,0.524,...,12,6,0,1,4,1,27,-15,1,Luka Doncic
2,22018,1629029,0021801124,"MAR 28, 2019",DAL @ MIA,L,31,6,18,0.333,...,8,7,0,0,6,4,19,-8,1,Luka Doncic
3,22018,1629029,0021801114,"MAR 26, 2019",DAL vs. SAC,L,36,9,20,0.450,...,12,12,0,0,4,2,28,-7,1,Luka Doncic
4,22018,1629029,0021801092,"MAR 23, 2019",DAL @ GSW,W,27,6,14,0.429,...,11,10,3,0,3,3,23,29,1,Luka Doncic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,22020,1629029,0022000071,"JAN 01, 2021",DAL vs. MIA,W,36,9,22,0.409,...,15,7,0,1,5,2,27,12,1,Luka Doncic
62,22020,1629029,0022000059,"DEC 30, 2020",DAL vs. CHA,L,24,4,10,0.400,...,2,5,2,0,3,3,12,-27,1,Luka Doncic
63,22020,1629029,0022000031,"DEC 27, 2020",DAL @ LAC,W,26,8,18,0.444,...,9,8,1,0,2,0,24,29,1,Luka Doncic
64,22020,1629029,0022000008,"DEC 25, 2020",DAL @ LAL,L,34,9,19,0.474,...,4,7,0,1,3,0,27,-14,1,Luka Doncic


In [15]:
# Histogram of assists over games with at least 10 points and at least 10 rebounds.
PlotLukaStats.Assists10PointsAndRebounds()

<matplotlib.axes._subplots.AxesSubplot at 0x27857b70148>

In [16]:
### Get data on all active players and see if there's clustering around certain points totals.


In [17]:
# Build an empirical probability distribution (the relative frequency of each observed value) for each player/stat combination.
# Use conditional probability: e.g., given that a player has 10 or more points, how likely is each assist total, and in particular 10 or more assists?

In [21]:
class StatDistributions():
    """Relative-frequency distributions of one stat, conditional on other stats reaching a threshold.

    Args:
        PlayerStatsClass: object exposing a `playerLogData` DataFrame that
            contains the 'PTS', 'REB' and 'AST' columns (the notebook passes a
            PlayerStats instance).

    Attributes:
        Distributions: dict holding the result of the most recent
            IterOverStats() call; empty until that method has run.
        StatFilters: array of dicts, each naming the stat to describe
            ('selectVar', first entry used) and the stats that must reach the
            threshold ('filterVars').
    """

    def __init__( self, PlayerStatsClass ):
        self.PlayerStatsClass = PlayerStatsClass
        self.Distributions = dict()
        # Kept as a numpy object array so the attribute's type is unchanged for existing users.
        self.StatFilters = np.array([
            {'selectVar': ['AST'], 'filterVars': ['PTS']},
            {'selectVar': ['AST'], 'filterVars': ['PTS','REB']},
            {'selectVar': ['REB'], 'filterVars': ['PTS']},
            {'selectVar': ['REB'], 'filterVars': ['PTS','AST']}
        ])

    def IterOverStats( self, threshold = 10 ):
        """Compute the distribution for every entry in StatFilters.

        Args:
            threshold: minimum value (inclusive) each filter stat must reach;
                expected to be an integer. Defaults to 10.

        Returns:
            dict mapping each distribution name to its relative-frequency
            Series. The same dict is stored in `self.Distributions`.
        """
        AllPlayerDistributions = dict()

        for CurFilter in self.StatFilters:
            AllPlayerDistributions.update(
                self.CalcDistribution( self.PlayerStatsClass, CurFilter, threshold ) )

        # Previously the attribute was initialised but never filled in.
        self.Distributions = AllPlayerDistributions
        return(AllPlayerDistributions)

    def CalcDistribution( self, curPlayerStatClass, CurFilter, threshold = 10 ):
        """Relative frequency of one stat over games where the filter stats reach `threshold`.

        Args:
            curPlayerStatClass: object exposing a `playerLogData` DataFrame.
            CurFilter: dict with 'selectVar' (list; first entry is the stat to
                describe) and 'filterVars' (list of stats that must each be
                >= threshold).
            threshold: minimum value (inclusive) for every filter stat;
                expected to be an integer. Defaults to 10.

        Returns:
            Single-entry dict: name -> Series of relative frequencies indexed by
            stat value in ascending order. The name has the form
            '<stat>_when_<filters joined by _>_gt_<threshold - 1>'.

        Raises:
            KeyError: if the game log lacks any of the required columns.
        """
        selectVar = CurFilter['selectVar'][0]
        filterVars = list(CurFilter['filterVars'])

        # Variables needed for the analysis.
        allStats = [selectVar] + filterVars

        curPlayerData = curPlayerStatClass.playerLogData
        missing = [col for col in allStats if col not in curPlayerData.columns]
        if missing:
            raise KeyError( 'playerLogData is missing required column(s): {}'.format(missing) )
        curPlayerData = curPlayerData[allStats] # Keep only required columns for memory management.

        # Filter down to games where every filter variable reaches the threshold.
        for var in filterVars:
            curPlayerData = curPlayerData[curPlayerData[var] >= threshold]

        statDistribution = curPlayerData[selectVar].value_counts(normalize = True) # Calculate relative frequency.
        statDistribution = statDistribution.sort_index()

        # "gt_<threshold - 1>" is the integer-stat way of saying ">= threshold".
        curStatName = selectVar + '_when_' + '_'.join(filterVars) + '_gt_' + str(threshold - 1)

        # Progress trace: one line per distribution computed.
        print(curStatName)

        # Return results in a dictionary.
        return( { curStatName: statDistribution } )

In [22]:
# StatDistributions takes the PlayerStats object itself and reads its playerLogData when computing.
LukaStatDist = StatDistributions(LukaDoncicStats)

In [23]:
# Compute every configured conditional distribution; returns a dict keyed by distribution name.
LukaStatDist.IterOverStats()

AST_when_PTS_gt_9
AST_when_PTS_REB_gt_9
REB_when_PTS_gt_9
REB_when_PTS_AST_gt_9


{'AST_when_PTS_gt_9': 0     0.005155
 1     0.015464
 2     0.046392
 3     0.046392
 4     0.067010
 5     0.092784
 6     0.097938
 7     0.118557
 8     0.118557
 9     0.103093
 10    0.097938
 11    0.056701
 12    0.051546
 13    0.015464
 14    0.015464
 15    0.020619
 16    0.010309
 17    0.005155
 19    0.010309
 20    0.005155
 Name: AST, dtype: float64, 'AST_when_PTS_REB_gt_9': 3     0.028986
 4     0.014493
 5     0.028986
 6     0.057971
 7     0.101449
 8     0.130435
 9     0.115942
 10    0.188406
 11    0.101449
 12    0.072464
 13    0.014493
 14    0.014493
 15    0.057971
 16    0.028986
 17    0.014493
 19    0.014493
 20    0.014493
 Name: AST, dtype: float64, 'REB_when_PTS_gt_9': 0     0.005155
 2     0.010309
 3     0.041237
 4     0.051546
 5     0.067010
 6     0.139175
 7     0.067010
 8     0.149485
 9     0.113402
 10    0.139175
 11    0.051546
 12    0.067010
 13    0.025773
 14    0.020619
 15    0.020619
 16    0.015464
 17    0.005155
 18    0.005155