## NBA Player Stats Analysis


In [7]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Render every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Load the player stats CSV from the notebook's working directory
csv_path = 'nba_player_data.csv'
data = pd.read_csv(csv_path)

In [8]:
# (rows, columns) of the frame as loaded from the CSV
data.shape

(7209, 30)

In [9]:
# Preview the first rows of the frame as loaded from the CSV
data.head()

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,2014-15,Regular%20Season,201935,1,James Harden,1610612745,HOU,81,2981,647,1470,0.44,208,555,0.375,715,824,0.868,75,384,459,565,154,60,321,208,2217,2202,1.76,0.48
1,2014-15,Regular%20Season,201939,2,Stephen Curry,1610612744,GSW,80,2613,653,1341,0.487,286,646,0.443,308,337,0.914,56,285,341,619,163,16,249,158,1900,2073,2.49,0.66
2,2014-15,Regular%20Season,201566,3,Russell Westbrook,1610612760,OKC,67,2302,627,1471,0.426,86,288,0.299,546,654,0.835,124,364,488,574,140,14,293,184,1886,1857,1.96,0.48
3,2014-15,Regular%20Season,2544,4,LeBron James,1610612739,CLE,69,2493,624,1279,0.488,120,339,0.354,375,528,0.71,51,365,416,511,109,49,272,135,1743,1748,1.88,0.4
4,2014-15,Regular%20Season,203081,5,Damian Lillard,1610612757,POR,82,2925,590,1360,0.434,196,572,0.343,344,398,0.864,49,329,378,507,97,21,222,164,1720,1677,2.28,0.44


### Data cleaning & preparation

In [10]:
# Drop columns that are not needed.
# The result is reassigned instead of mutating `data` in place, and columns that
# are already gone are ignored, so this cell can be re-run without a KeyError.
data = data.drop(columns=['RANK', 'EFF'], errors='ignore')

In [11]:
# Derive the season's starting year as an integer from the first four
# characters of 'Year' (e.g. '2014-15' -> 2014)
data['season_start_year'] = data['Year'].str[:4].astype(int)

In [12]:
# Clean Season type data: decode the URL-encoded label 'Regular%20Season'.
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on the `data['Season_type']` selection operates on an intermediate object; it
# warns in recent pandas versions and does not update `data` under Copy-on-Write.
data['Season_type'] = data['Season_type'].replace('Regular%20Season', 'Regular Season')

In [13]:
# Split the cleaned frame into one subset per season type
season_type = data['Season_type']
is_regular_season = season_type == 'Regular Season'
is_playoffs = season_type == 'Playoffs'

rs_df = data[is_regular_season]
playoffs_df = data[is_playoffs]


In [14]:
# Columns of the original data frame after the cleaning steps above
data.columns

Index(['Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')

In [15]:
# Columns that hold counts, i.e. where it makes sense to sum the values
# across rows (percentage and ratio columns are deliberately left out)
total_cols = [
    'GP', 'MIN',
    'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

### Data Analysis: Which player stats are correlated with each other?

In [16]:
# Aggregate to one row per player and season: every counting stat in
# `total_cols` is summed over all rows sharing the same player and 'Year'
player_season_keys = ['PLAYER', 'PLAYER_ID', 'Year']
data_per_min = (
    data
    .groupby(player_season_keys)[total_cols]
    .sum()
    .reset_index()
)

data_per_min.sample(5)


Unnamed: 0,PLAYER,PLAYER_ID,Year,GP,MIN,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
3954,Ochai Agbaji,1630534,2022-23,59,1209,165,386,81,228,56,69,43,78,121,67,16,15,41,99,467
3538,Mason Plumlee,203486,2014-15,88,1792,284,495,0,3,161,328,177,343,520,76,69,65,109,217,729
3250,LeBron James,2544,2020-21,51,1728,476,937,122,333,192,278,36,353,389,398,57,27,193,77,1266
4386,Russell Westbrook,201566,2019-20,65,2311,663,1421,63,246,307,412,111,396,507,438,105,22,285,223,1696
1108,De'Anthony Melton,1629001,2019-20,60,1167,161,401,40,140,93,121,43,177,220,175,77,20,84,109,455


In [17]:
# Transition to per-minute stats.
# NOTE: this cell expects the player-season totals produced by the groupby cell
# above. `data_per_min` is only reassigned in the final statement, so running the
# cell a second time without re-running that groupby raises a KeyError in drop()
# and leaves `data_per_min` untouched, instead of silently dividing by MIN twice.
id_cols = ['PLAYER', 'PLAYER_ID', 'Year']
min_minutes = 50  # player-seasons with this many total minutes or fewer are removed

# Select the counting stats by name rather than by column position, so the
# selection does not depend on the column order of the aggregated frame.
# GP is not turned into a rate; MIN stays a total because it is the denominator.
stat_cols = [col for col in data_per_min.columns if col not in id_cols + ['GP', 'MIN']]

# Work on an explicit copy so the assignments below never write into a slice
per_min = data_per_min.copy()
per_min[stat_cols] = per_min[stat_cols].div(per_min['MIN'], axis=0)

# Add new indicators (ratios of per-minute values; the MIN denominator cancels)
per_min['FG%'] = per_min['FGM'] / per_min['FGA']
per_min['3PT%'] = per_min['FG3M'] / per_min['FG3A']
per_min['FT%'] = per_min['FTM'] / per_min['FTA']
per_min['FG3A%'] = per_min['FG3A'] / per_min['FGA']
per_min['PTS/FGA'] = per_min['PTS'] / per_min['FGA']
per_min['FG3M/FGM'] = per_min['FG3M'] / per_min['FGM']
per_min['FTA/FGA'] = per_min['FTA'] / per_min['FGA']
# NOTE(review): the free-throw weight here is 0.475; true-shooting formulas are
# often quoted with 0.44 - confirm which coefficient is intended.
per_min['TRU%'] = 0.5 * per_min['PTS'] / (per_min['FGA'] + 0.475 * per_min['FTA'])
per_min['AST_TOV'] = per_min['AST'] / per_min['TOV']

# Apply the minutes filter, then drop what the correlation analysis cannot use:
# GP (doesn't make sense as a rate) and the categorical columns (player,
# player_id, year). drop() returns a new frame, so nothing is modified in place
# on a filtered slice.
data_per_min = (
    per_min
    .loc[per_min['MIN'] > min_minutes]
    .drop(columns=['GP'] + id_cols)
)

data_per_min.sample(5)

Unnamed: 0,MIN,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,FG%,3PT%,FT%,FG3A%,PTS/FGA,FG3M/FGM,FTA/FGA,TRU%,AST_TOV
4132,200,0.155,0.32,0.035,0.08,0.055,0.1,0.045,0.16,0.205,0.06,0.01,0.06,0.045,0.08,0.4,0.484375,0.4375,0.55,0.25,1.25,0.225806,0.3125,0.544218,1.333333
3945,2459,0.132574,0.265962,0.043107,0.109394,0.0366,0.052867,0.043107,0.135828,0.178935,0.049207,0.043514,0.023587,0.038634,0.078894,0.344856,0.498471,0.394052,0.692308,0.411315,1.296636,0.325153,0.198777,0.592386,1.273684
3244,3337,0.255019,0.546299,0.043452,0.134552,0.148936,0.208271,0.026371,0.166017,0.192388,0.203776,0.042553,0.020977,0.106083,0.057836,0.702427,0.466813,0.32294,0.715108,0.246297,1.285793,0.170388,0.38124,0.544325,1.920904
3835,1224,0.156863,0.23366,0.000817,0.003268,0.069444,0.094771,0.078431,0.190359,0.268791,0.04902,0.049837,0.078431,0.055556,0.147876,0.383987,0.671329,0.25,0.732759,0.013986,1.643357,0.005208,0.405594,0.688948,0.882353
2931,948,0.157173,0.390295,0.077004,0.209916,0.063291,0.074895,0.014768,0.102321,0.117089,0.138186,0.027426,0.007384,0.050633,0.03903,0.454641,0.402703,0.366834,0.84507,0.537838,1.164865,0.489933,0.191892,0.533779,2.729167


In [18]:
# Pairwise correlation matrix of total minutes, the per-minute stats and the derived ratios
data_per_min.corr()

Unnamed: 0,MIN,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,FG%,3PT%,FT%,FG3A%,PTS/FGA,FG3M/FGM,FTA/FGA,TRU%,AST_TOV
MIN,1.0,0.407087,0.34363,0.178127,0.103257,0.322161,0.26134,-0.119414,0.05345,-0.01535,0.218918,0.032569,-0.055551,0.150451,-0.357046,0.431262,0.174159,0.169111,0.230071,-0.033162,0.232615,-0.007372,0.029527,0.28506,0.020729
FGM,0.407087,1.0,0.870539,0.19176,0.134698,0.631948,0.596654,0.080472,0.236479,0.191296,0.249638,-0.030116,0.080294,0.434114,-0.181884,0.958685,0.404405,0.12487,0.197091,-0.263421,0.362172,-0.252086,0.084657,0.41519,-0.112628
FGA,0.34363,0.870539,1.0,0.425708,0.446272,0.571423,0.492278,-0.218719,-0.008945,-0.099064,0.349353,0.015506,-0.162664,0.448649,-0.316682,0.897297,-0.073205,0.167745,0.299447,0.022769,-0.024864,0.013706,-0.08992,0.023437,-0.005786
FG3M,0.178127,0.19176,0.425708,1.0,0.953799,-0.013062,-0.149965,-0.611848,-0.394101,-0.522208,0.139186,-0.029798,-0.430168,-0.069116,-0.402573,0.354258,-0.377341,0.575656,0.399155,0.823509,-0.092068,0.845536,-0.39392,0.110331,0.201612
FG3A,0.103257,0.134698,0.446272,0.953799,1.0,-0.029426,-0.164909,-0.647021,-0.429012,-0.560366,0.155212,-0.00807,-0.457898,-0.048384,-0.397351,0.297682,-0.520249,0.441773,0.38046,0.86837,-0.250147,0.831423,-0.422801,-0.07858,0.214002
FTM,0.322161,0.631948,0.571423,-0.013062,-0.029426,1.0,0.955754,0.094089,0.213189,0.181628,0.267302,0.023139,0.083714,0.469862,-0.07724,0.753976,0.206827,0.010468,0.271189,-0.303953,0.485675,-0.2974,0.609791,0.279679,-0.109309
FTA,0.26134,0.596654,0.492278,-0.149965,-0.164909,0.955754,1.0,0.235288,0.310255,0.306202,0.19997,0.023219,0.189075,0.473771,0.020817,0.685278,0.284775,-0.080297,0.029786,-0.421937,0.50721,-0.410105,0.715877,0.250066,-0.176569
OREB,-0.119414,0.080472,-0.218719,-0.611848,-0.647021,0.094089,0.235288,1.0,0.668057,0.869391,-0.354227,-0.103263,0.603079,0.006359,0.486929,-0.046888,0.571435,-0.397724,-0.369942,-0.646748,0.363898,-0.61456,0.389839,0.238987,-0.380512
DREB,0.05345,0.236479,-0.008945,-0.394101,-0.429012,0.213189,0.310255,0.668057,1.0,0.948486,-0.203434,-0.122616,0.538503,0.148571,0.326665,0.149697,0.481548,-0.229456,-0.2381,-0.483392,0.356177,-0.461108,0.326398,0.264691,-0.334624
REB,-0.01535,0.191296,-0.099064,-0.522208,-0.560366,0.181628,0.306202,0.869391,0.948486,1.0,-0.285909,-0.125389,0.614364,0.101366,0.42424,0.079443,0.56307,-0.320159,-0.315465,-0.596361,0.391455,-0.567859,0.382725,0.277521,-0.38375


In [21]:
# Heatmap of the pairwise correlations in `data_per_min`
base_size = 500
scale = 1.5

corr_matrix = data_per_min.corr()
fig = px.imshow(corr_matrix)

# Fixed-size square figure with small tick labels; x labels are rotated so the
# column names do not overlap
fig.update_layout(
    autosize=False,
    width=scale * base_size,
    height=scale * base_size,
    xaxis={'tickangle': 90, 'tickfont': {'size': 8}},
    yaxis={'tickfont': {'size': 8}},
)
fig.show()

In [None]:
# Share of player-seasons that pass the minutes filter used for the correlation
# analysis. It is computed from the per-season minute totals of `data`, because
# `data_per_min` has already been filtered to MIN > 50 at this point, so checking
# the condition on it would always give 1.0. The comparison is strict to match
# the filter.
season_minutes = data.groupby(['PLAYER', 'PLAYER_ID', 'Year'])['MIN'].sum()
(season_minutes > 50).mean()

In [None]:
# Random 5-row sample of the cleaned frame (unseeded, so rows differ between runs)
data.sample(5)