In [1]:
# imports
import warnings
# NOTE(review): a blanket 'ignore' filter silences every warning raised by later cells,
# including pandas' chained-assignment warnings — consider narrowing it to a category
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
# NOTE(review): the wildcard import hides which sklearn.metrics names the notebook uses;
# prefer naming them explicitly once the modelling cells are settled
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read the player table from the CSV sitting next to the notebook
df=pd.read_csv(filepath_or_buffer='nba_data.csv')

In [3]:
# peek at the first five rows of the raw table
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,...,trb,ast,stl,blk,tov,fouls,pts,Year,school,selection
0,0,Curly Armstrong,G-F,31,FTW,63,,,2.3,8.2,...,,2.8,,,,3.4,7.3,1950,,
1,1,Cliff Barker,SG,29,INO,49,,,2.1,5.6,...,,2.2,,,,2.0,5.7,1950,,
2,2,Leo Barnhorst,SF,25,CHS,67,,,2.6,7.4,...,,2.1,,,,2.9,6.5,1950,,
3,3,Ed Bartels,F,24,NYK,15,,,1.5,5.7,...,,1.3,,,,1.9,4.2,1950,,
4,4,Ralph Beard,G,22,INO,60,,,5.7,15.6,...,,3.9,,,,2.2,14.9,1950,,


In [4]:
# pick my season: keep only the rows whose Year is 2019.
# .copy() detaches the result from the full table, so the column assignments and
# fills in later cells modify this frame itself rather than a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
df=df[df.Year==2019].copy()

In [5]:
# check that the first rows now come from the 2019 season
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,...,trb,ast,stl,blk,tov,fouls,pts,Year,school,selection
20826,20826,Álex Abrines,SG,25,OKC,31,2.0,19.0,1.8,5.1,...,1.5,0.6,0.5,0.2,0.5,1.7,5.3,2019,,
20827,20827,Quincy Acy,PF,28,PHO,10,0.0,12.3,0.4,1.8,...,2.5,0.8,0.1,0.4,0.4,2.4,1.7,2019,Baylor,74.0
20828,20828,Jaylen Adams,PG,22,ATL,34,1.0,12.6,1.1,3.2,...,1.8,1.9,0.4,0.1,0.8,1.3,3.2,2019,St. Bonaventure,84.0
20829,20829,Steven Adams,C,25,OKC,80,80.0,33.4,6.0,10.1,...,9.5,1.6,1.5,1.0,1.7,2.6,13.9,2019,Pittsburgh,12.0
20830,20830,Bam Adebayo,C,21,MIA,82,28.0,23.3,3.4,5.9,...,7.3,2.2,0.9,0.8,1.5,2.5,8.9,2019,Kentucky,14.0


In [6]:
# ok so now I will need to create a custom dictionary to see whether or not teams made the playoffs

In [7]:
# list the distinct team codes so the playoff lookup can cover every one of them
df['team'].unique()

array(['OKC', 'PHO', 'ATL', 'MIA', 'CLE', 'DEN', 'SAS', 'CHI', 'UTA',
       'BRK', 'NYK', 'POR', 'MEM', 'IND', 'MIL', 'DAL', 'HOU', 'TOR',
       'WAS', 'ORL', 'CHO', 'SAC', 'LAL', 'MIN', 'BOS', 'GSW', 'NOP',
       'LAC', 'PHI', 'DET'], dtype=object)

In [8]:
# team code -> 1 if the team made the playoffs, 0 otherwise
# every team code, in the order the lookup is built
_team_codes=(
    'OKC PHO ATL MIA CLE DEN SAS CHI UTA BRK NYK POR MEM IND '
    'MIL DAL HOU TOR WAS ORL CHO SAC LAL MIN BOS GSW NOP LAC '
    'PHI DET'
).split()
# the teams flagged as playoff qualifiers
_qualified={
    'OKC','DEN','SAS','UTA','BRK','POR','IND','MIL',
    'HOU','TOR','ORL','BOS','GSW','LAC','PHI','DET',
}
playoffs_dict={code:int(code in _qualified) for code in _team_codes}

In [9]:
# does this add to 16?
# Enforce the expected count rather than eyeballing it, so a typo in the lookup
# stops the notebook here. Builtin sum returns a plain int, which displays the same
# way regardless of the installed NumPy version.
n_playoff_teams=sum(playoffs_dict.values())
assert n_playoff_teams==16, f'expected 16 playoff teams, found {n_playoff_teams}'
n_playoff_teams

16

In [10]:
# yup

In [11]:
# add playoff binary: translate each row's team code through playoffs_dict
df['playoffs']=df['team'].map(playoffs_dict)
# Series.map leaves NaN for codes that are not in the dict; raise instead so an
# unknown team cannot slip through silently (a direct dict lookup would also fail)
_unmapped=df.loc[df['playoffs'].isna(),'team'].unique()
if len(_unmapped):
    raise KeyError(f'teams missing from playoffs_dict: {list(_unmapped)}')

In [12]:
# show the new flag column while the rows are still in their original order
df['playoffs']

20826    1
20827    0
20828    0
20829    1
20830    0
        ..
21351    0
21352    0
21353    0
21354    0
21355    1
Name: playoffs, Length: 530, dtype: int64

In [13]:
# order the rows by team code; the existing index labels travel with their rows
df=df.sort_values('team')

In [14]:
# first rows after sorting by team
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,...,ast,stl,blk,tov,fouls,pts,Year,school,selection,playoffs
20933,20933,John Collins,PF,21,ATL,61,59.0,30.0,7.6,13.6,...,2.0,0.4,0.6,2.0,3.3,19.5,2019,Wake Forest,19.0,0
21325,21325,Taurean Prince,SF,24,ATL,55,47.0,28.2,4.8,10.8,...,2.1,1.0,0.3,1.8,2.6,13.5,2019,Baylor,12.0,0
21028,21028,Daniel Hamilton,SG,23,ATL,19,3.0,10.7,1.2,3.2,...,1.2,0.3,0.1,0.8,1.0,3.0,2019,Connecticut,112.0,0
21125,21125,Alex Len,C,25,ATL,77,31.0,20.1,4.2,8.4,...,1.1,0.4,0.9,1.3,2.6,11.1,2019,Maryland,5.0,0
20864,20864,Kent Bazemore,SG,29,ATL,67,35.0,24.5,4.1,10.3,...,2.3,1.3,0.6,1.8,2.5,11.6,2019,,,0


In [15]:
# look at dtypes and look for null values
# (any column whose non-null count is below the number of entries still has gaps)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 530 entries, 20933 to 20956
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  530 non-null    int64  
 1   name        530 non-null    object 
 2   pos         530 non-null    object 
 3   age         530 non-null    int64  
 4   team        530 non-null    object 
 5   gp          530 non-null    int64  
 6   gs          530 non-null    float64
 7   mp          530 non-null    float64
 8   fg          530 non-null    float64
 9   fga         530 non-null    float64
 10  fg%         526 non-null    float64
 11  3p          530 non-null    float64
 12  3pa         530 non-null    float64
 13  3p%         494 non-null    float64
 14  2p          530 non-null    float64
 15  2pa         530 non-null    float64
 16  2p%         520 non-null    float64
 17  efg%        526 non-null    float64
 18  ft          530 non-null    float64
 19  fta         530 non-nul

In [16]:
# fill missing values
# Assign each filled result back to the frame: calling fillna(inplace=True) on a column
# pulled out of df is chained in-place mutation, which pandas does not guarantee will
# reach df (and which does nothing when Copy-on-Write is enabled).
df['school']=df['school'].fillna('UNK')
# shooting percentages: each column's gaps are replaced by that column's own mean
# (fillna with a Series matches its index against the column labels)
pct_cols=['efg%','3p%','2p%','ft%','fg%']
df[pct_cols]=df[pct_cols].fillna(df[pct_cols].mean())

In [17]:
# selection has to be a whole number, so check the mean before choosing a fill value
df['selection'].mean()

33.96820809248555

In [18]:
# fill using 34 (the column mean of ~33.97 rounded to a whole number).
# The result is assigned back because fillna(inplace=True) on a column accessed
# through df is chained in-place mutation and is not guaranteed to update df.
# NOTE(review): a missing selection presumably means the player has no recorded
# draft pick — confirm that the mean is a sensible stand-in for that case.
df['selection']=df['selection'].fillna(34)

In [19]:
# look at dtypes and look for null values
# (re-run after the fills: every non-null count should now equal the number of entries)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 530 entries, 20933 to 20956
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  530 non-null    int64  
 1   name        530 non-null    object 
 2   pos         530 non-null    object 
 3   age         530 non-null    int64  
 4   team        530 non-null    object 
 5   gp          530 non-null    int64  
 6   gs          530 non-null    float64
 7   mp          530 non-null    float64
 8   fg          530 non-null    float64
 9   fga         530 non-null    float64
 10  fg%         530 non-null    float64
 11  3p          530 non-null    float64
 12  3pa         530 non-null    float64
 13  3p%         530 non-null    float64
 14  2p          530 non-null    float64
 15  2pa         530 non-null    float64
 16  2p%         530 non-null    float64
 17  efg%        530 non-null    float64
 18  ft          530 non-null    float64
 19  fta         530 non-nul

In [20]:
# first rows after filling the gaps
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,...,ast,stl,blk,tov,fouls,pts,Year,school,selection,playoffs
20933,20933,John Collins,PF,21,ATL,61,59.0,30.0,7.6,13.6,...,2.0,0.4,0.6,2.0,3.3,19.5,2019,Wake Forest,19.0,0
21325,21325,Taurean Prince,SF,24,ATL,55,47.0,28.2,4.8,10.8,...,2.1,1.0,0.3,1.8,2.6,13.5,2019,Baylor,12.0,0
21028,21028,Daniel Hamilton,SG,23,ATL,19,3.0,10.7,1.2,3.2,...,1.2,0.3,0.1,0.8,1.0,3.0,2019,Connecticut,112.0,0
21125,21125,Alex Len,C,25,ATL,77,31.0,20.1,4.2,8.4,...,1.1,0.4,0.9,1.3,2.6,11.1,2019,Maryland,5.0,0
20864,20864,Kent Bazemore,SG,29,ATL,67,35.0,24.5,4.1,10.3,...,2.3,1.3,0.6,1.8,2.5,11.6,2019,UNK,34.0,0


In [21]:
# 'Unnamed: 0' just repeats the row index, so remove that column from df in place
df.drop(columns='Unnamed: 0',inplace=True)

In [22]:
# first rows now that the duplicate index column is gone
df.head(n=5)

Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,fg%,...,ast,stl,blk,tov,fouls,pts,Year,school,selection,playoffs
20933,John Collins,PF,21,ATL,61,59.0,30.0,7.6,13.6,0.56,...,2.0,0.4,0.6,2.0,3.3,19.5,2019,Wake Forest,19.0,0
21325,Taurean Prince,SF,24,ATL,55,47.0,28.2,4.8,10.8,0.441,...,2.1,1.0,0.3,1.8,2.6,13.5,2019,Baylor,12.0,0
21028,Daniel Hamilton,SG,23,ATL,19,3.0,10.7,1.2,3.2,0.383,...,1.2,0.3,0.1,0.8,1.0,3.0,2019,Connecticut,112.0,0
21125,Alex Len,C,25,ATL,77,31.0,20.1,4.2,8.4,0.494,...,1.1,0.4,0.9,1.3,2.6,11.1,2019,Maryland,5.0,0
20864,Kent Bazemore,SG,29,ATL,67,35.0,24.5,4.1,10.3,0.402,...,2.3,1.3,0.6,1.8,2.5,11.6,2019,UNK,34.0,0


In [23]:
# categorical data: pull out the object-typed columns to see what needs handling
df.select_dtypes(include='object')

Unnamed: 0,name,pos,team,school
20933,John Collins,PF,ATL,Wake Forest
21325,Taurean Prince,SF,ATL,Baylor
21028,Daniel Hamilton,SG,ATL,Connecticut
21125,Alex Len,C,ATL,Maryland
20864,Kent Bazemore,SG,ATL,UNK
...,...,...,...,...
20865,Bradley Beal,SG,WAS,Florida
21148,Ian Mahinmi,C,WAS,Le Havre
21166,Jordan McRae,SG,WAS,Tennessee
21263,Tomáš Satoranský,PG,WAS,UNK


In [24]:
# any rows carrying the hybrid 'PF-C' position label?
df.loc[df['pos'].eq('PF-C')]

Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,fg%,...,ast,stl,blk,tov,fouls,pts,Year,school,selection,playoffs
21281,Jason Smith,PF-C,32,NOP,20,1.0,9.5,1.1,3.0,0.356,...,0.7,0.2,0.4,0.7,1.5,3.3,2019,Colorado State,20.0,0


In [25]:
# get rid of those values: fold the hybrid 'PF-C' label into 'C'.
# A single label-based .loc assignment writes straight into df; setting
# df.pos.iloc[i] row by row would be chained indexing, which pandas does not
# guarantee writes through to the frame, and it also needs a Python-level loop.
df.loc[df['pos']=='PF-C','pos']='C'

In [26]:
# and now? this should come back empty
df.loc[df['pos'].eq('PF-C')]

Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,fg%,...,ast,stl,blk,tov,fouls,pts,Year,school,selection,playoffs


In [27]:
# the object-typed columns once more, after the position clean-up
df.select_dtypes(include='object')

Unnamed: 0,name,pos,team,school
20933,John Collins,PF,ATL,Wake Forest
21325,Taurean Prince,SF,ATL,Baylor
21028,Daniel Hamilton,SG,ATL,Connecticut
21125,Alex Len,C,ATL,Maryland
20864,Kent Bazemore,SG,ATL,UNK
...,...,...,...,...
20865,Bradley Beal,SG,WAS,Florida
21148,Ian Mahinmi,C,WAS,Le Havre
21166,Jordan McRae,SG,WAS,Tennessee
21263,Tomáš Satoranský,PG,WAS,UNK


In [28]:
# name, team and school are not wanted as features (position is the only one that
# might have mattered), so keep just the non-object columns
df=df.select_dtypes(exclude='object')

In [29]:
# Year holds the same value on every remaining row, so it carries no information
df.drop(columns='Year',inplace=True)

In [30]:
# first rows of the numeric-only frame
df.head(n=5)

Unnamed: 0,age,gp,gs,mp,fg,fga,fg%,3p,3pa,3p%,...,drb,trb,ast,stl,blk,tov,fouls,pts,selection,playoffs
20933,21,61,59.0,30.0,7.6,13.6,0.56,0.9,2.6,0.348,...,6.2,9.8,2.0,0.4,0.6,2.0,3.3,19.5,19.0,0
21325,24,55,47.0,28.2,4.8,10.8,0.441,2.2,5.7,0.39,...,3.2,3.6,2.1,1.0,0.3,1.8,2.6,13.5,12.0,0
21028,23,19,3.0,10.7,1.2,3.2,0.383,0.4,1.2,0.348,...,1.9,2.5,1.2,0.3,0.1,0.8,1.0,3.0,112.0,0
21125,25,77,31.0,20.1,4.2,8.4,0.494,1.0,2.6,0.363,...,3.5,5.5,1.1,0.4,0.9,1.3,2.6,11.1,5.0,0
20864,29,67,35.0,24.5,4.1,10.3,0.402,1.4,4.5,0.32,...,3.3,3.9,2.3,1.3,0.6,1.8,2.5,11.6,34.0,0


In [31]:
# OK - I see two ways to do this project:
# 1) model it per individual player
# 2) group the players by team
# I will try both and see what I find.
# However, it now occurs to me that a team-level analysis will need a lot more data, so that one may be put off for a bit.

In [None]:
# code for later
# .agg({'A': 'sum','B': 'sum','C': lambda x: np.unique(x)}).reset_index()