In [1]:
import numpy as np
import pandas as pd
import matplotlib

In [2]:
# Found this partway through the analysis: https://sites.northwestern.edu/nusportsanalytics/2019/06/21/the-rise-of-triple-doubles-in-the-nba-examining-the-statistical-causes/
# Basically exactly what I want to do.

In [3]:
# Import high-level information
from nba_api.stats.static import teams
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog

In [4]:
# R to python reference
# https://gist.github.com/conormm/fd8b1980c28dd21cfaf6975c86c74d07

In [5]:
# Get the full names of all active players.
# players.get_players() yields one dict per player; 'is_active' is used as a plain
# truth value instead of being compared against True.
nbaPlayers = players.get_players()
activePlayers = [player['full_name'] for player in nbaPlayers if player['is_active']]

In [6]:
# Pings the nba_api for a player across one or more seasons.
#   - playerName (string): player's full name in the nba_api db.
class PlayerStats():
    """Holds the game log of a single player, looked up by full name.

    Attributes (set in __init__, filled by GetPlayerGameLog):
        playerName:    full name used to find the player.
        playerLogData: DataFrame of game-log rows; empty until GetPlayerGameLog runs.
        seasons:       list of the seasons the current playerLogData was pulled for.
    """
    def __init__( self, playerName ):
        self.playerName = playerName
        self.playerLogData = pd.DataFrame()
        self.seasons = list()

    def GetPlayerGameLog( self, seasons, players ):
        """Fetch the player's game log for each season and store the combined result.

        One PlayerGameLog request is issued per season; the first data frame of
        each response is kept, the frames are concatenated, and a Player_Name
        column is added. Results are written to self.playerLogData and
        self.seasons; nothing is returned.

        Args:
            seasons: iterable of seasons to pull game log data on.
            players: object exposing get_players() that returns a list of dicts
                with at least 'id' and 'full_name' (the nba_api "players" module
                is what the notebook passes in).

        Raises:
            ValueError: if seasons is empty (there would be nothing to combine).
            IndexError: if no entry in players.get_players() has a full_name
                equal to self.playerName.
        """
        # Copy so later changes to the caller's list do not alter self.seasons.
        seasons = list(seasons)
        if not seasons:
            raise ValueError(
                "No seasons given for player '{}'; pass at least one season.".format(self.playerName)
            )

        matches = [player for player in players.get_players()
                   if player['full_name'] == self.playerName]
        if not matches:
            # Same exception type the bare matches[0] lookup would raise, but with context.
            raise IndexError(
                "No player named '{}' found in players.get_players().".format(self.playerName)
            )
        curPlayer = matches[0]

        # Ping API separately for each season and extract response dataframe.
        seasonLogs = [
            playergamelog.PlayerGameLog( player_id = curPlayer['id'], season = curSeason ).get_data_frames()[0]
            for curSeason in seasons
        ]

        # Combine all seasons and tag every row with the player's name.
        playerLog = pd.concat(seasonLogs).assign( Player_Name = curPlayer['full_name'] )
        self.playerLogData = playerLog
        self.seasons = seasons


In [7]:
# Create the stats container for Luka Doncic; its game log stays empty until GetPlayerGameLog is called.
LukaDoncicStats = PlayerStats(playerName = 'Luka Doncic' )

In [8]:
LukaDoncicStats.playerName

'Luka Doncic'

In [9]:
# Pull the game log for the listed season(s); one API request is made per season.
LukaDoncicStats.GetPlayerGameLog(seasons=[2020],players=players)

In [10]:
LukaDoncicStats.seasons

[2020]

In [11]:
LukaDoncicStats.playerLogData

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Player_Name
0,22020,1629029,0022001071,"MAY 16, 2021",DAL @ MIN,L,21,6,13,0.462,...,6,6,2,0,4,1,18,-15,1,Luka Doncic
1,22020,1629029,0022001057,"MAY 14, 2021",DAL vs. TOR,W,34,7,19,0.368,...,10,11,0,1,1,4,20,-5,1,Luka Doncic
2,22020,1629029,0022000154,"MAY 12, 2021",DAL vs. NOP,W,31,12,24,0.500,...,8,8,0,0,4,1,33,24,1,Luka Doncic
3,22020,1629029,0022001037,"MAY 11, 2021",DAL @ MEM,L,26,4,16,0.250,...,5,5,0,0,5,1,12,-23,1,Luka Doncic
4,22020,1629029,0022001021,"MAY 09, 2021",DAL @ CLE,W,22,5,11,0.455,...,5,5,2,0,1,2,15,4,1,Luka Doncic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,22020,1629029,0022000071,"JAN 01, 2021",DAL vs. MIA,W,36,9,22,0.409,...,15,7,0,1,5,2,27,12,1,Luka Doncic
62,22020,1629029,0022000059,"DEC 30, 2020",DAL vs. CHA,L,24,4,10,0.400,...,2,5,2,0,3,3,12,-27,1,Luka Doncic
63,22020,1629029,0022000031,"DEC 27, 2020",DAL @ LAC,W,26,8,18,0.444,...,9,8,1,0,2,0,24,29,1,Luka Doncic
64,22020,1629029,0022000008,"DEC 25, 2020",DAL @ LAL,L,34,9,19,0.474,...,4,7,0,1,3,0,27,-14,1,Luka Doncic


In [12]:
# Theory is that players hunt for double doubles and triple doubles. Plot distribution of 
# rebounds when 10 points or more
# assists when 10 points or more
# assists when 10 points or more and 10 rebounds or more.
# rebounds when 10 points or more and 10 assists or more.
# - playerStatsData is the game-log DataFrame (e.g. PlayerStats.playerLogData), not the PlayerStats object itself.
class PlotGameLogStats():
    """Histograms of one stat, restricted to games with 10 or more in other stats.

    Args:
        playerStatsData: game-log DataFrame containing 'PTS', 'REB' and 'AST' columns.
        playerName: name used as the prefix of every plot title.
    """
    def __init__( self, playerStatsData, playerName ):
        self.playerStatsData = playerStatsData
        self.playerName      = playerName

    @staticmethod
    def _HistogramBins( values ):
        """Return integer bin edges that give every observed value its own bin.

        Edges start at 0 and run to at least 19; they extend further when the
        data contains larger values. The extra trailing edge keeps the largest
        value from sharing the (closed) last bin with its neighbour.
        """
        largest = pd.Series(values, dtype = 'float64').max()
        if pd.isna(largest):
            # Empty input: fall back to the default 0..19 range.
            largest = 0
        upperEdge = max(19, int(largest) + 1)
        return list(range(0, upperEdge + 1))

    def _PlotConditionalHist( self, statCol, filterCols, xlabel, titleSuffix ):
        """Plot a histogram of statCol over games where every filterCols value is >= 10."""
        plotData = self.playerStatsData
        for filterCol in filterCols:
            plotData = plotData[plotData[filterCol] >= 10]
        plotData = plotData[[statCol]]

        # Plot
        plot = plotData.plot.hist(bins = self._HistogramBins(plotData[statCol]))
        plot.set_xlabel(xlabel)
        plot.set_title(self.playerName + titleSuffix)
        plot.locator_params(integer=True)
        return plot

    # Plot rebounds when 10 points or more.
    def ReboundsWhen10Points( self ):
        """Histogram of rebounds over games with 10 or more points; returns the axes."""
        return self._PlotConditionalHist(
            'REB', ['PTS'], 'Rebounds', ': Rebounds when 10 points or more' )

    # Plot assists when 10 points or more.
    def AssistsWhen10Points( self ):
        """Histogram of assists over games with 10 or more points; returns the axes."""
        return self._PlotConditionalHist(
            'AST', ['PTS'], 'Assists', ': Assists when 10 points or more' )

    # Plot rebounds when 10 points or more and 10 assists or more.
    def Rebounds10PointsAndAssists( self ):
        """Histogram of rebounds over games with 10+ points and 10+ assists; returns the axes."""
        return self._PlotConditionalHist(
            'REB', ['PTS', 'AST'], 'Rebounds',
            ': Rebounds when 10 points or more and 10 assists or more' )

    # Plot assists when 10 points or more and 10 rebounds or more.
    def Assists10PointsAndRebounds( self ):
        """Histogram of assists over games with 10+ points and 10+ rebounds; returns the axes."""
        return self._PlotConditionalHist(
            'AST', ['PTS', 'REB'], 'Assists',
            ': Assists when 10 points or more and 10 rebounds or more' )

        

In [13]:
# Hand the plotting helper Luka's game-log DataFrame (not the PlayerStats object) plus his name for titles.
PlotLukaStats = PlotGameLogStats( playerStatsData = LukaDoncicStats.playerLogData, playerName = LukaDoncicStats.playerName )

In [14]:
PlotLukaStats.playerStatsData

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Player_Name
0,22020,1629029,0022001071,"MAY 16, 2021",DAL @ MIN,L,21,6,13,0.462,...,6,6,2,0,4,1,18,-15,1,Luka Doncic
1,22020,1629029,0022001057,"MAY 14, 2021",DAL vs. TOR,W,34,7,19,0.368,...,10,11,0,1,1,4,20,-5,1,Luka Doncic
2,22020,1629029,0022000154,"MAY 12, 2021",DAL vs. NOP,W,31,12,24,0.500,...,8,8,0,0,4,1,33,24,1,Luka Doncic
3,22020,1629029,0022001037,"MAY 11, 2021",DAL @ MEM,L,26,4,16,0.250,...,5,5,0,0,5,1,12,-23,1,Luka Doncic
4,22020,1629029,0022001021,"MAY 09, 2021",DAL @ CLE,W,22,5,11,0.455,...,5,5,2,0,1,2,15,4,1,Luka Doncic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,22020,1629029,0022000071,"JAN 01, 2021",DAL vs. MIA,W,36,9,22,0.409,...,15,7,0,1,5,2,27,12,1,Luka Doncic
62,22020,1629029,0022000059,"DEC 30, 2020",DAL vs. CHA,L,24,4,10,0.400,...,2,5,2,0,3,3,12,-27,1,Luka Doncic
63,22020,1629029,0022000031,"DEC 27, 2020",DAL @ LAC,W,26,8,18,0.444,...,9,8,1,0,2,0,24,29,1,Luka Doncic
64,22020,1629029,0022000008,"DEC 25, 2020",DAL @ LAL,L,34,9,19,0.474,...,4,7,0,1,3,0,27,-14,1,Luka Doncic


In [15]:
# Create probability distribution function for each player/stat combination to get likelihood that each value occurs.
# Use conditional probability. e.g., what are the chances, if you have 10 or more points, that you also have 10 or more assists.

In [16]:
class StatDistributions():
    """Relative-frequency distributions of one stat, conditioned on other stats being 10+.

    Args:
        PlayerStatsClass: object exposing `playerName` and a `playerLogData`
            DataFrame with 'PTS', 'REB' and 'AST' columns.

    After IterOverStats() runs, `Distributions` maps a label such as
    'AST_when_PTS_gt_9' to a Series of relative frequencies indexed by stat value.
    """
    def __init__( self, PlayerStatsClass ):
        self.PlayerStatsClass = PlayerStatsClass
        self.playerName = PlayerStatsClass.playerName
        self.Distributions = dict()
        # Each entry names the stat to summarise and the stats that must be >= 10.
        filterSpecs = [
            {'selectVar': ['AST'], 'filterVars': ['PTS']},
            {'selectVar': ['AST'], 'filterVars': ['PTS','REB']},
            {'selectVar': ['REB'], 'filterVars': ['PTS']},
            {'selectVar': ['REB'], 'filterVars': ['PTS','AST']},
        ]
        self.StatFilters = np.array(filterSpecs)

    def IterOverStats( self ):
        """Compute every configured distribution, store them on self.Distributions, return 0."""
        collected = dict()
        for filterSpec in self.StatFilters:
            collected.update( self.CalcDistribution( self.PlayerStatsClass, filterSpec ) )

        self.Distributions = collected
        return 0

    def CalcDistribution( self, curPlayerStatClass, CurFilter ):
        """Return {label: Series} with the relative frequency of the selected stat.

        Only games where every stat in CurFilter['filterVars'] is at least 10
        are counted. The Series is sorted by stat value, and the label has the
        form '<stat>_when_<filter1>_<filter2>_gt_9'.
        """
        targetStat = CurFilter['selectVar'][0]
        conditionStats = CurFilter['filterVars']

        # Work on just the columns involved in this distribution.
        neededCols = CurFilter['selectVar'] + conditionStats
        gameLog = curPlayerStatClass.playerLogData[neededCols]

        # Keep only games with 10 or more in each of the condition stats.
        for condStat in conditionStats:
            gameLog = gameLog[gameLog[condStat] >= 10]

        # Relative frequency of each value, ordered by the value itself.
        relFreq = gameLog[targetStat].value_counts(normalize = True).sort_index()

        label = f"{targetStat}_when_{'_'.join(conditionStats)}_gt_9"
        return {label: relFreq}

In [None]:
# Build the distributions inside this cell so it no longer depends on later cells
# having been run first (it previously failed with NameError on a fresh kernel).
LukaStatDist = StatDistributions(LukaDoncicStats)
LukaStatDist.IterOverStats()
pd.DataFrame( {'AST_when_PTS_gt_9': LukaStatDist.Distributions['AST_when_PTS_gt_9'], 'PlayerName': LukaStatDist.playerName} )

In [17]:
# Wrap Luka's PlayerStats object; the distributions dict stays empty until IterOverStats() runs.
LukaStatDist = StatDistributions(LukaDoncicStats)

In [25]:
# Populate LukaStatDist.Distributions; the call itself returns 0, which is what the cell displays.
LukaStatDist.IterOverStats()

0

In [42]:
LukaStatDist.Distributions['AST_when_PTS_gt_9']

1     0.015152
4     0.075758
5     0.121212
6     0.075758
7     0.121212
8     0.166667
9     0.136364
10    0.030303
11    0.060606
12    0.060606
13    0.045455
14    0.015152
15    0.015152
16    0.030303
19    0.015152
20    0.015152
Name: AST, dtype: float64

In [44]:
pd.DataFrame( {'AST_when_PTS_gt_9': LukaStatDist.Distributions['AST_when_PTS_gt_9'], 'PlayerName': LukaStatDist.playerName} )

Unnamed: 0,AST_when_PTS_gt_9,PlayerName
1,0.015152,Luka Doncic
4,0.075758,Luka Doncic
5,0.121212,Luka Doncic
6,0.075758,Luka Doncic
7,0.121212,Luka Doncic
8,0.166667,Luka Doncic
9,0.136364,Luka Doncic
10,0.030303,Luka Doncic
11,0.060606,Luka Doncic
12,0.060606,Luka Doncic


In [20]:
### Gather two datasets ###
# Stats for all current active players
# Distributions for all current active players 

In [33]:
# Keep only the first three players so the API pull below stays small.
# NOTE(review): this overwrites the full active-player list; re-run the cell that builds it to get everyone back.
activePlayers = activePlayers[0:3]

In [30]:
# TODO: also pull player stat distributions

def GatherPlayerStats( activePlayers, pullSeasons = None, minuteThreshold = 5 ):
    """Pull and stack trimmed game logs for every player in activePlayers.

    For each name a PlayerStats object is created, its game log is fetched for
    pullSeasons, the log is cut down to the columns of interest, and games with
    fewer than minuteThreshold minutes are dropped. Each player's name is
    printed as it is processed.

    Args:
        activePlayers: iterable of player full names.
        pullSeasons: seasons to pull for every player; defaults to [2020].
        minuteThreshold: per-game minimum of the 'MIN' column for a game to be kept.

    Returns:
        DataFrame with columns SEASON_ID, PTS, REB, AST, Player_Name, MIN holding
        all players' rows. If activePlayers is empty, an empty DataFrame with
        those columns is returned instead of raising.
    """
    # A fresh list per call avoids sharing one default list between calls.
    if pullSeasons is None:
        pullSeasons = [2020]

    # Only select needed columns.
    selectCols = ['SEASON_ID','PTS','REB','AST','Player_Name','MIN']

    allPlayerData = list()

    for curPlayerName in activePlayers:
        print(curPlayerName)

        # Pull player data from API
        CurPlayerStats = PlayerStats( playerName = curPlayerName )
        CurPlayerStats.GetPlayerGameLog(seasons=list(pullSeasons),players=players)

        # Trim data down to specific variables of interest.
        curLog = CurPlayerStats.playerLogData[selectCols]
        # Remove games with a small number of minutes played.
        curLog = curLog[curLog['MIN'] >= minuteThreshold]

        allPlayerData.append(curLog)

    # pd.concat raises on an empty list, so handle "no players" explicitly.
    if not allPlayerData:
        return pd.DataFrame(columns = selectCols)

    return pd.concat(allPlayerData)

In [34]:
# Fetch game logs for the current activePlayers list using the default season; prints each name as it goes.
allPlayerDataDF = GatherPlayerStats( activePlayers )

Steven Adams
Bam Adebayo
LaMarcus Aldridge


In [35]:
allPlayerDataDF

Unnamed: 0,SEASON_ID,PTS,REB,AST,Player_Name,MIN
0,22020,1,8,2,Steven Adams,24
1,22020,2,1,2,Steven Adams,13
2,22020,4,10,1,Steven Adams,32
3,22020,12,9,0,Steven Adams,16
4,22020,7,5,1,Steven Adams,29
...,...,...,...,...,...,...
21,22020,28,5,3,LaMarcus Aldridge,31
22,22020,14,6,2,LaMarcus Aldridge,25
23,22020,4,4,1,LaMarcus Aldridge,20
24,22020,12,6,4,LaMarcus Aldridge,29


In [None]:
### Plot distribution of all games
# For each stat: relative frequency of every observed per-game value, ordered by value.
ptsDistribution = allPlayerDataDF['PTS'].value_counts(normalize = True).sort_index()  # points
rebDistribution = allPlayerDataDF['REB'].value_counts(normalize = True).sort_index()  # rebounds
astDistribution = allPlayerDataDF['AST'].value_counts(normalize = True).sort_index()  # assists

In [None]:
astDistribution

In [None]:
allPlayerDataDF['PTS'].unique()

In [None]:
# ptsDistribution already holds relative frequencies (values between 0 and 1) indexed by
# point total, so draw it directly as bars. A histogram of this Series would bin the
# frequencies themselves rather than the point totals.
plot = ptsDistribution.plot.bar()
plot.set_xlabel('Points')
plot.set_ylabel('Relative frequency')
plot.set_title('Distribution of points per game');

In [None]:
# Show which matplotlib backend is configured (useful when plots do not render).
print(matplotlib.rcParams['backend'])