In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100) # The maximum number of columns that can be shown is 50 CHANGED V2

# set seed for reproducibility
np.random.seed(0) 

TODO
☑

- ☐ Remove empty rows
- ☐ Remove irrelevant columns
- ☐ Remove empty columns
- ☐ Fill empty sets of data 

# Data pre-processing

In [171]:
# load dataset and print a sample
nba_data = pd.read_csv("data/Player Per Game.csv")
print(nba_data.sample(10))

       seas_id  season  player_id                player  birth_year pos   age  \
19798    11038    1988       2353        Dave Henderson         NaN  SG  23.0   
16945    14515    1995       2757          Tracy Murray         NaN  SF  23.0   
18758    12659    1991       2209         Tim McCormick         NaN   C  28.0   
10514    20896    2007       3640  Šarūnas Jasikevičius         NaN  PG  30.0   
18913    11914    1990       2501       Dražen Petrović         NaN  SG  25.0   
22114     8849    1982       1808       Jackie Robinson         NaN  SF  26.0   
26378     4392    1971       1254        Claude English         NaN   F  24.0   
8942     21612    2009       3104        Antawn Jamison         NaN  PF  32.0   
16468    15035    1996       2552          Tim Hardaway         NaN  PG  29.0   
26759     3919    1970       1069            Art Harris         NaN  SG  23.0   

       experience   lg   tm   g    gs  mp_per_game  fg_per_game  fga_per_game  \
19798           1  NBA  PHI

In [172]:
nba_data.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'gs', 'mp_per_game', 'fg_per_game',
       'fga_per_game', 'fg_percent', 'x3p_per_game', 'x3pa_per_game',
       'x3p_percent', 'x2p_per_game', 'x2pa_per_game', 'x2p_percent',
       'e_fg_percent', 'ft_per_game', 'fta_per_game', 'ft_percent',
       'orb_per_game', 'drb_per_game', 'trb_per_game', 'ast_per_game',
       'stl_per_game', 'blk_per_game', 'tov_per_game', 'pf_per_game',
       'pts_per_game'],
      dtype='object')

In [173]:
nba_data.dtypes # returns the datatype of each column

seas_id            int64
season             int64
player_id          int64
player            object
birth_year       float64
pos               object
age              float64
experience         int64
lg                object
tm                object
g                  int64
gs               float64
mp_per_game      float64
fg_per_game      float64
fga_per_game     float64
fg_percent       float64
x3p_per_game     float64
x3pa_per_game    float64
x3p_percent      float64
x2p_per_game     float64
x2pa_per_game    float64
x2p_percent      float64
e_fg_percent     float64
ft_per_game      float64
fta_per_game     float64
ft_percent       float64
orb_per_game     float64
drb_per_game     float64
trb_per_game     float64
ast_per_game     float64
stl_per_game     float64
blk_per_game     float64
tov_per_game     float64
pf_per_game      float64
pts_per_game     float64
dtype: object

In [168]:
nba_data.shape # returns the shape of data - 31059 rows, 35 columns

(31059, 35)


In [169]:
nba_data.describe

<bound method NDFrame.describe of        seas_id  season  player_id          player  birth_year  pos   age  \
0        30458    2023       5025      A.J. Green         NaN   SG  23.0   
1        30459    2023       5026     A.J. Lawson         NaN   SG  22.0   
2        30460    2023       5026     A.J. Lawson         NaN   SG  22.0   
3        30461    2023       5026     A.J. Lawson         NaN   SG  22.0   
4        30462    2023       4219    Aaron Gordon         NaN   PF  27.0   
...        ...     ...        ...             ...         ...  ...   ...   
31054      200    1947        157     Walt Miller         NaN    F  31.0   
31055      201    1947        158   Warren Fenley         NaN    F  24.0   
31056      202    1947        159   Wilbert Kautz         NaN  G-F  31.0   
31057      203    1947        160  Woody Grimshaw         NaN    G  27.0   
31058      204    1947        161     Wyndol Gray         NaN  G-F  24.0   

       experience   lg   tm   g    gs  mp_per_game  f

In [162]:
# removing players from seasons prior to 1980 - introduction of 3 point line
nba_data = nba_data.drop(nba_data[nba_data.season < 1980].index)

# removing irrelavant features
nba_data = (nba_data.drop(columns=[
'seas_id','season','gs','player_id','player','birth_year','age',
'experience','lg','tm'
]))

# remove players that played less than 1/4 of all total games
nba_data = nba_data.drop(nba_data[nba_data.g < 30].index)

# remove players that played less than 1/4 of possible game time
nba_data = nba_data.drop(nba_data[nba_data.mp_per_game < 12].index)

print(nba_data.sample(10))

      pos   g  mp_per_game  fg_per_game  fga_per_game  fg_percent  \
11947  PG  55         37.6          6.5          16.6       0.392   
19607   C  80         35.5          7.5          13.1       0.570   
13296   C  79         17.0          1.7           3.8       0.455   
14366  SF  47         22.5          3.8           8.4       0.452   
15279  SF  82         33.0          4.4           9.4       0.471   
6587   SG  64         17.8          2.5           6.1       0.420   
18850  SG  77         33.7          6.1          13.1       0.470   
17775   C  81         37.1          9.6          19.1       0.503   
9306   PG  53         29.7          3.6           9.2       0.387   
11999  SG  78         37.6          7.2          16.5       0.438   

       x3p_per_game  x3pa_per_game  x3p_percent  x2p_per_game  x2pa_per_game  \
11947           2.3            6.1        0.375           4.2           10.6   
19607           0.0            0.0          NaN           7.5           13.1   


In [163]:
# get the number of missing data points per column
missing_values_count = nba_data.isnull().sum()
print(missing_values_count)

pos                 0
g                   0
mp_per_game         0
fg_per_game         0
fga_per_game        0
fg_percent          0
x3p_per_game        0
x3pa_per_game       0
x3p_percent      1102
x2p_per_game        0
x2pa_per_game       0
x2p_percent         0
e_fg_percent        0
ft_per_game         0
fta_per_game        0
ft_percent          1
orb_per_game        0
drb_per_game        0
trb_per_game        0
ast_per_game        0
stl_per_game        0
blk_per_game        0
tov_per_game        0
pf_per_game         0
pts_per_game        0
dtype: int64


In [147]:
# how many total missing values do we have?
total_cells = np.product(nba_data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
print((total_missing/total_cells) * 100)

0.3138426518708209


In [148]:
print(nba_data[nba_data['ft_percent'].isna()])


        pos   g  mp_per_game  fg_per_game  fga_per_game  fg_percent  \
7509  PF-SF  30         12.1          0.8           2.2       0.369   

      x3p_per_game  x3pa_per_game  x3p_percent  x2p_per_game  x2pa_per_game  \
7509           0.2            0.7         0.35           0.6            1.5   

      x2p_percent  e_fg_percent  ft_per_game  fta_per_game  ft_percent  \
7509        0.378         0.423          0.0           0.0         NaN   

      orb_per_game  drb_per_game  trb_per_game  ast_per_game  stl_per_game  \
7509           0.4           1.2           1.6           1.2           0.2   

      blk_per_game  tov_per_game  pf_per_game  pts_per_game  
7509           0.0           0.9          1.4           1.8  
