# Preprocessing
Preparing the modified NBA dataset from EDA for modeling 

In [2]:
#Importing the necessary packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from typing import Tuple

#To supress warnings
import warnings
warnings.filterwarnings('ignore')

#pandas show all columns
pd.set_option('display.max_columns', None)

#Show plots inline
%matplotlib inline

%load_ext autoreload
%autoreload 2


Reading the data from previous phases of the project. Here we're searching each dataframe for key metrics that can be used in the train,test,split function. The goal in this section is select a dataset that has already been averaged to a per 35 minute basis and standardizing the award_share column so it can be used as the target feature

In [3]:
mvp_per_35 = pd.read_csv('../data/mvp_per_35.csv')
mvp_per_35.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,season,player,pos,award_share,age,team_id,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,mov,mov_adj,win_loss_pct,mvp_won
0,0,0,0,0,1982,Moses Malone,C,0.735,26,HOU,81,81,35.0,9.75,18.75,0.4325,0.0,0.083333,0.0,9.75,18.666667,0.433333,0.4325,6.5,8.5,0.635,5.75,6.5,12.25,1.5,0.75,1.25,3.0,2.166667,25.916667,2831.666667,22.333333,0.48,0.0025,0.378333,14.583333,17.666667,16.083333,5.75,0.916667,1.75,9.916667,24.916667,9.75,3.083333,12.833333,0.181667,5.166667,-1.5,3.75,4.583333,-0.033333,-0.325,0.4675,Yes
1,1,1,1,1,1983,Moses Malone,C,0.96,27,PHI,78,78,35.0,7.84,15.586667,0.4676,0.0,0.0,0.0,7.84,15.586667,0.468533,0.4676,7.186667,9.426667,0.710267,5.32,8.96,14.28,1.213333,1.026667,1.866667,3.173333,2.426667,22.866667,2727.2,23.426667,0.539467,0.000933,0.563733,15.68,24.173333,20.16,4.76,1.306667,2.8,12.88,24.266667,8.493333,5.6,14.093333,0.231467,3.733333,-0.28,3.453333,3.92,7.158667,7.028,0.740133,Yes
2,2,2,2,2,1984,Larry Bird,PF,0.858,27,BOS,79,77,35.0,8.772846,17.819843,0.449608,0.182768,0.822454,0.225718,8.590078,16.997389,0.460574,0.454178,4.295039,4.843342,0.811488,2.101828,7.127937,9.229765,6.031332,1.644909,0.822454,2.741514,2.284595,22.114883,2767.101828,22.114883,0.504439,0.04295,0.249478,6.396867,20.37859,13.616188,23.942559,2.101828,1.18799,11.057441,24.399478,7.219321,5.117493,12.428198,0.196475,5.117493,1.827676,6.94517,6.671018,5.994778,5.866841,0.690862,Yes
3,3,3,3,3,1985,Larry Bird,SF,0.978,28,BOS,80,77,35.0,10.189873,19.493671,0.462532,0.620253,1.417722,0.378354,9.56962,18.075949,0.468734,0.476709,4.43038,5.050633,0.781519,1.860759,7.531646,9.303797,5.848101,1.417722,1.063291,2.746835,2.303797,25.43038,2800.886076,23.481013,0.518354,0.06557,0.23038,5.316456,20.025316,13.025316,22.772152,1.683544,1.506329,9.924051,25.253165,9.303797,4.607595,13.911392,0.210886,6.025316,1.772152,7.797468,7.708861,5.892405,5.732911,0.680506,Yes
4,4,4,4,4,1986,Larry Bird,SF,0.981,29,BOS,82,81,35.0,8.934211,18.052632,0.456842,0.921053,2.210526,0.389605,8.013158,15.842105,0.466053,0.479868,4.973684,5.526316,0.825263,2.118421,6.907895,9.026316,6.263158,1.842105,0.552632,2.947368,2.026316,23.763158,2867.236842,23.578947,0.534211,0.111447,0.281842,6.631579,18.789474,13.078947,24.223684,2.302632,0.828947,11.697368,25.421053,8.842105,5.710526,14.552632,0.224737,6.078947,1.934211,8.013158,7.736842,8.667105,8.344737,0.7525,Yes


In [4]:
mvp_90_normalized = pd.read_csv('../data/mvp_90_scale_normalized.csv')
mvp_90_normalized.head()

Unnamed: 0,player,season,award_share,age,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,mov,mov_adj,win_loss_pct
0,Moses Malone,-1.660099,-0.081676,0.069128,0.045944,0.083728,0.0,-1.56913,-1.83347,-0.253902,-0.994504,-1.134932,-2.451597,-0.271131,0.049705,-0.577213,-0.643584,0.905472,0.949349,-0.536075,3.078786,0.996266,1.760897,-2.117912,-1.120232,1.556655,0.248961,0.397306,-1.258045,0.034243,-0.981142,-0.390228,-1.098694,1.664147,2.993562,0.75248,1.494695,-2.256717,-1.221271,0.821814,0.52576,-1.390965,-0.979748,0.698976,-0.697928,-0.719612,-2.064908,-1.95825,-2.33958,-2.579808,-0.309431,-0.231171,0.04372
1,Larry Bird,-1.502336,0.565195,0.449333,0.450256,-0.120941,0.0,0.466054,0.272833,-0.33179,-0.53589,-0.683745,0.646847,0.707649,0.838409,-0.574743,-0.527654,-1.223779,-1.297817,0.146954,0.116232,0.464251,0.367045,-0.08324,-0.318551,0.251568,-0.420482,0.186527,-0.406418,0.460039,-0.967746,-0.671698,-0.704171,-1.378182,-0.018307,0.220693,0.171102,-0.57917,-0.67323,-0.24766,-0.524255,-1.155142,-0.635238,-0.324545,-0.775916,-1.174507,-0.714654,-0.242912,-0.600302,-0.222554,-0.876209,-0.842443,-0.666481
2,Larry Bird,-1.423454,0.673006,0.829538,0.854567,0.697737,0.0,-0.621453,-0.504045,-0.419226,-0.313479,-0.431436,0.738983,-0.173142,0.130638,-0.607723,-0.487442,-0.804072,-1.053546,0.566289,0.336898,0.231925,0.289321,0.09897,0.551457,-0.578001,-0.10572,-0.289485,-0.960244,0.843447,-0.943605,-0.460304,-0.424146,-0.908514,0.363896,0.062255,0.181051,-0.443983,0.227023,-0.80765,0.105664,-1.115009,-0.831493,0.812968,-0.501051,-0.868362,-0.683058,-0.107451,-0.513945,-0.205146,0.365745,0.39032,0.191016
3,Magic Johnson,-1.344573,-0.80042,0.069128,0.450256,0.493067,0.0,-1.261158,-1.711628,0.294814,-0.923212,-0.981507,-0.83292,-0.124467,-0.031434,-0.030957,-0.152339,0.34416,0.020096,0.493125,-0.238735,-0.617222,-0.537526,2.513449,0.135324,-0.69259,1.018987,-0.292097,-1.199112,0.454919,-0.338663,0.156041,-0.927837,0.760675,-0.162315,-0.653129,-0.51024,1.538468,-0.03679,-0.934975,1.396257,-1.130051,0.369165,-1.297817,-0.167587,-0.230811,-0.004122,-0.676486,-0.325092,-0.219669,0.499952,0.238009,0.335143
4,Michael Jordan,-1.029046,-1.231667,0.069128,0.854567,0.902406,0.0,1.553916,1.186903,0.395612,-0.714732,-0.803782,-0.034658,1.555327,1.495122,0.070739,-0.009361,0.468971,0.091823,0.372045,-0.34318,-0.720302,-0.649205,-0.366572,2.010987,0.06094,-1.020034,0.778111,1.044207,0.859414,0.612544,0.047472,-0.809927,-0.329642,-0.298787,-0.612404,-0.578163,-0.479918,1.968191,-0.16352,-1.126114,0.247714,1.401223,0.191661,1.492186,0.875834,0.695467,0.806005,0.822581,1.337406,0.339338,0.277995,-0.389229


In [5]:
df_per_35 = pd.read_csv('../data/df_per_35.csv')
df_per_35.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,season,player,pos,age,g,team_id,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,award_share,mov,mov_adj,win_loss_pct,mvp_won
0,0,0,0,0,1982,Kareem Abdul-Jabbar,C,34,76,LAL,76,35.0,9.84375,17.002841,0.57571,0.0,0.0,0.0,9.84375,17.002841,0.576705,0.57571,4.076705,5.767045,0.701989,2.286932,6.363636,8.650568,2.982955,0.795455,2.684659,2.982955,2.883523,23.764205,2661.789773,23.267045,0.604545,0.001989,0.338068,7.258523,18.991477,13.323864,11.832386,1.09375,4.076705,13.224432,25.454545,6.860795,3.877841,10.639205,0.190909,3.778409,1.193182,4.971591,4.673295,0.045,4.84233,4.34517,0.691051,No
1,1,1,1,1,1982,Alvan Adams,C,27,79,PHO,75,35.0,7.392739,15.016502,0.570627,0.0,0.0,0.0,7.392739,15.016502,0.570627,0.570627,2.656766,3.349835,0.902145,1.963696,6.584158,8.547855,5.19802,1.617162,1.155116,2.887789,3.927393,17.442244,2764.191419,21.485149,0.611056,0.001155,0.262211,7.623762,23.333333,15.709571,25.528053,2.656766,2.194719,17.09571,26.336634,2.887789,5.429043,8.316832,0.166337,1.617162,2.541254,4.158416,3.927393,0.0,3.985149,3.523102,0.64802,No
2,2,2,2,2,1982,Mark Aguirre,SF,22,51,DAL,20,35.0,9.114583,19.565972,0.565104,0.607639,1.701389,0.427778,8.506944,17.864583,0.577257,0.583333,4.010417,5.833333,0.826389,2.065972,3.767361,5.954861,3.888889,0.850694,0.486111,3.159722,3.645833,22.725694,1784.027778,21.024306,0.624653,0.105729,0.365799,8.142361,15.677083,11.788194,22.604167,1.458333,1.09375,15.434028,36.215278,1.215278,0.972222,2.309028,0.074132,2.795139,-1.944444,0.850694,1.215278,0.0,-5.383681,-5.444444,0.41441,No
3,3,3,3,3,1982,Danny Ainge,SG,22,53,BOS,1,35.0,4.95283,13.867925,1.178774,0.330189,0.990566,0.970755,4.622642,12.54717,1.198585,1.218396,3.632075,3.962264,2.846226,1.650943,1.981132,3.632075,5.283019,2.311321,0.330189,3.301887,5.283019,13.537736,1862.264151,33.349057,1.449528,0.254245,0.970755,16.509434,19.481132,18.160377,65.04717,10.235849,0.990566,57.783019,70.990566,-0.990566,2.641509,1.650943,0.138679,-12.216981,3.301887,-8.915094,-0.330189,0.0,21.066038,20.966981,2.535849,No
4,4,4,4,4,1982,Tiny Archibald,PG,33,68,BOS,51,35.0,4.937304,10.532915,0.517868,0.109718,0.219436,0.411442,4.827586,10.31348,0.52116,0.523354,3.840125,5.047022,0.819592,0.438871,1.426332,1.865204,8.777429,0.877743,0.0,2.852665,2.084639,13.824451,2377.586207,15.689655,0.594671,0.027429,0.532132,1.426332,4.937304,3.181818,35.0,1.206897,0.109718,20.188088,19.639498,3.730408,1.974922,5.705329,0.126176,1.53605,-1.426332,0.109718,1.206897,0.0,7.0,6.967085,0.842633,No


In [6]:
mvp_90_contri = pd.read_csv('../data/mvp_90_contri.csv')
mvp_90_contri.head()

Unnamed: 0,player,fg_per_g,pts_per_g,usg_pct,ws,ows,vorp
0,Moses Malone,-1.56913,-1.258045,-1.390965,-0.697928,-0.979748,-2.579808
1,Larry Bird,0.466054,-0.406418,-1.155142,-0.775916,-0.635238,-0.222554
2,Larry Bird,-0.621453,-0.960244,-1.115009,-0.501051,-0.831493,-0.205146
3,Magic Johnson,-1.261158,-1.199112,-1.130051,-0.167587,0.369165,-0.219669
4,Michael Jordan,1.553916,1.044207,0.247714,1.492186,1.401223,1.337406


In [7]:
mvp_contri = pd.read_csv('../data/mvp_contri.csv')
mvp_contri.head()

Unnamed: 0.1,Unnamed: 0,season,player,fg_per_g,pts_per_g,usg_pct,ws,ows,vorp,award_share
0,0,1982,Moses Malone,11.7,31.1,29.9,15.4,11.7,5.5,0.735
1,1,1983,Moses Malone,8.4,24.5,26.0,15.1,9.1,4.2,0.96
2,2,1984,Larry Bird,9.6,24.2,26.7,13.6,7.9,7.3,0.858
3,3,1985,Larry Bird,11.5,28.7,28.5,15.7,10.5,8.7,0.978
4,4,1986,Larry Bird,9.7,25.8,27.6,15.8,9.6,8.4,0.981


Based on the data, the df_per_35 set looks to be the ideal data set going foward for training the model

In [8]:
#Performing another check to ensure there are no missing values
print(f'mvp_per_35 has missing values: {mvp_per_35.isnull().values.any()}')
print(f'df_per_35 has missing values: {df_per_35.isnull().values.any()}')
print(f'mvp_90_contri has missing values: {mvp_90_contri.isnull().values.any()}')

mvp_per_35 has missing values: False
df_per_35 has missing values: False
mvp_90_contri has missing values: False


In [9]:
#reloading the minimumally edited NBA dataset 
# to save the unedited version of award_share
df = pd.read_csv('../data/df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,season,player,pos,age,team_id,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,award_share,mov,mov_adj,win_loss_pct
0,0,1982,Kareem Abdul-Jabbar,C,34,LAL,76,76,35.2,9.9,17.1,0.579,0.0,0.0,0.0,9.9,17.1,0.58,0.579,4.1,5.8,0.706,2.3,6.4,8.7,3.0,0.8,2.7,3.0,2.9,23.9,2677,23.4,0.608,0.002,0.34,7.3,19.1,13.4,11.9,1.1,4.1,13.3,25.6,6.9,3.9,10.7,0.192,3.8,1.2,5.0,4.7,0.045,4.87,4.37,0.695
1,1,1982,Alvan Adams,C,27,PHO,79,75,30.3,6.4,13.0,0.494,0.0,0.0,0.0,6.4,13.0,0.494,0.494,2.3,2.9,0.781,1.7,5.7,7.4,4.5,1.4,1.0,2.5,3.4,15.1,2393,18.6,0.529,0.001,0.227,6.6,20.2,13.6,22.1,2.3,1.9,14.8,22.8,2.5,4.7,7.2,0.144,1.4,2.2,3.6,3.4,0.0,3.45,3.05,0.561
2,2,1982,Mark Aguirre,SF,22,DAL,51,20,28.8,7.5,16.1,0.465,0.5,1.4,0.352,7.0,14.7,0.475,0.48,3.3,4.8,0.68,1.7,3.1,4.9,3.2,0.7,0.4,2.6,3.0,18.7,1468,17.3,0.514,0.087,0.301,6.7,12.9,9.7,18.6,1.2,0.9,12.7,29.8,1.0,0.8,1.9,0.061,2.3,-1.6,0.7,1.0,0.0,-4.43,-4.48,0.341
3,3,1982,Danny Ainge,SG,22,BOS,53,1,10.6,1.5,4.2,0.357,0.1,0.3,0.294,1.4,3.8,0.363,0.369,1.1,1.2,0.862,0.5,0.6,1.1,1.6,0.7,0.1,1.0,1.6,4.1,564,10.1,0.439,0.077,0.294,5.0,5.9,5.5,19.7,3.1,0.3,17.5,21.5,-0.3,0.8,0.5,0.042,-3.7,1.0,-2.7,-0.1,0.0,6.38,6.35,0.768
4,4,1982,Tiny Archibald,PG,33,BOS,68,51,31.9,4.5,9.6,0.472,0.1,0.2,0.375,4.4,9.4,0.475,0.477,3.5,4.6,0.747,0.4,1.3,1.7,8.0,0.8,0.0,2.6,1.9,12.6,2167,14.3,0.542,0.025,0.485,1.3,4.5,2.9,31.9,1.1,0.1,18.4,17.9,3.4,1.8,5.2,0.115,1.4,-1.3,0.1,1.1,0.0,6.38,6.35,0.768


In [10]:
#Replacing the column in df_per_35 with award_share 
# from df

df_per_35['award_share'] = df['award_share'].reindex(df_per_35.index)

df_per_35.head()



Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,season,player,pos,age,g,team_id,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,award_share,mov,mov_adj,win_loss_pct,mvp_won
0,0,0,0,0,1982,Kareem Abdul-Jabbar,C,34,76,LAL,76,35.0,9.84375,17.002841,0.57571,0.0,0.0,0.0,9.84375,17.002841,0.576705,0.57571,4.076705,5.767045,0.701989,2.286932,6.363636,8.650568,2.982955,0.795455,2.684659,2.982955,2.883523,23.764205,2661.789773,23.267045,0.604545,0.001989,0.338068,7.258523,18.991477,13.323864,11.832386,1.09375,4.076705,13.224432,25.454545,6.860795,3.877841,10.639205,0.190909,3.778409,1.193182,4.971591,4.673295,0.045,4.84233,4.34517,0.691051,No
1,1,1,1,1,1982,Alvan Adams,C,27,79,PHO,75,35.0,7.392739,15.016502,0.570627,0.0,0.0,0.0,7.392739,15.016502,0.570627,0.570627,2.656766,3.349835,0.902145,1.963696,6.584158,8.547855,5.19802,1.617162,1.155116,2.887789,3.927393,17.442244,2764.191419,21.485149,0.611056,0.001155,0.262211,7.623762,23.333333,15.709571,25.528053,2.656766,2.194719,17.09571,26.336634,2.887789,5.429043,8.316832,0.166337,1.617162,2.541254,4.158416,3.927393,0.0,3.985149,3.523102,0.64802,No
2,2,2,2,2,1982,Mark Aguirre,SF,22,51,DAL,20,35.0,9.114583,19.565972,0.565104,0.607639,1.701389,0.427778,8.506944,17.864583,0.577257,0.583333,4.010417,5.833333,0.826389,2.065972,3.767361,5.954861,3.888889,0.850694,0.486111,3.159722,3.645833,22.725694,1784.027778,21.024306,0.624653,0.105729,0.365799,8.142361,15.677083,11.788194,22.604167,1.458333,1.09375,15.434028,36.215278,1.215278,0.972222,2.309028,0.074132,2.795139,-1.944444,0.850694,1.215278,0.0,-5.383681,-5.444444,0.41441,No
3,3,3,3,3,1982,Danny Ainge,SG,22,53,BOS,1,35.0,4.95283,13.867925,1.178774,0.330189,0.990566,0.970755,4.622642,12.54717,1.198585,1.218396,3.632075,3.962264,2.846226,1.650943,1.981132,3.632075,5.283019,2.311321,0.330189,3.301887,5.283019,13.537736,1862.264151,33.349057,1.449528,0.254245,0.970755,16.509434,19.481132,18.160377,65.04717,10.235849,0.990566,57.783019,70.990566,-0.990566,2.641509,1.650943,0.138679,-12.216981,3.301887,-8.915094,-0.330189,0.0,21.066038,20.966981,2.535849,No
4,4,4,4,4,1982,Tiny Archibald,PG,33,68,BOS,51,35.0,4.937304,10.532915,0.517868,0.109718,0.219436,0.411442,4.827586,10.31348,0.52116,0.523354,3.840125,5.047022,0.819592,0.438871,1.426332,1.865204,8.777429,0.877743,0.0,2.852665,2.084639,13.824451,2377.586207,15.689655,0.594671,0.027429,0.532132,1.426332,4.937304,3.181818,35.0,1.206897,0.109718,20.188088,19.639498,3.730408,1.974922,5.705329,0.126176,1.53605,-1.426332,0.109718,1.206897,0.0,7.0,6.967085,0.842633,No


In [11]:
df_per_35.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,season,player,pos,age,g,team_id,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,award_share,mov,mov_adj,win_loss_pct,mvp_won
0,0,0,0,0,1982,Kareem Abdul-Jabbar,C,34,76,LAL,76,35.0,9.84375,17.002841,0.57571,0.0,0.0,0.0,9.84375,17.002841,0.576705,0.57571,4.076705,5.767045,0.701989,2.286932,6.363636,8.650568,2.982955,0.795455,2.684659,2.982955,2.883523,23.764205,2661.789773,23.267045,0.604545,0.001989,0.338068,7.258523,18.991477,13.323864,11.832386,1.09375,4.076705,13.224432,25.454545,6.860795,3.877841,10.639205,0.190909,3.778409,1.193182,4.971591,4.673295,0.045,4.84233,4.34517,0.691051,No
1,1,1,1,1,1982,Alvan Adams,C,27,79,PHO,75,35.0,7.392739,15.016502,0.570627,0.0,0.0,0.0,7.392739,15.016502,0.570627,0.570627,2.656766,3.349835,0.902145,1.963696,6.584158,8.547855,5.19802,1.617162,1.155116,2.887789,3.927393,17.442244,2764.191419,21.485149,0.611056,0.001155,0.262211,7.623762,23.333333,15.709571,25.528053,2.656766,2.194719,17.09571,26.336634,2.887789,5.429043,8.316832,0.166337,1.617162,2.541254,4.158416,3.927393,0.0,3.985149,3.523102,0.64802,No
2,2,2,2,2,1982,Mark Aguirre,SF,22,51,DAL,20,35.0,9.114583,19.565972,0.565104,0.607639,1.701389,0.427778,8.506944,17.864583,0.577257,0.583333,4.010417,5.833333,0.826389,2.065972,3.767361,5.954861,3.888889,0.850694,0.486111,3.159722,3.645833,22.725694,1784.027778,21.024306,0.624653,0.105729,0.365799,8.142361,15.677083,11.788194,22.604167,1.458333,1.09375,15.434028,36.215278,1.215278,0.972222,2.309028,0.074132,2.795139,-1.944444,0.850694,1.215278,0.0,-5.383681,-5.444444,0.41441,No
3,3,3,3,3,1982,Danny Ainge,SG,22,53,BOS,1,35.0,4.95283,13.867925,1.178774,0.330189,0.990566,0.970755,4.622642,12.54717,1.198585,1.218396,3.632075,3.962264,2.846226,1.650943,1.981132,3.632075,5.283019,2.311321,0.330189,3.301887,5.283019,13.537736,1862.264151,33.349057,1.449528,0.254245,0.970755,16.509434,19.481132,18.160377,65.04717,10.235849,0.990566,57.783019,70.990566,-0.990566,2.641509,1.650943,0.138679,-12.216981,3.301887,-8.915094,-0.330189,0.0,21.066038,20.966981,2.535849,No
4,4,4,4,4,1982,Tiny Archibald,PG,33,68,BOS,51,35.0,4.937304,10.532915,0.517868,0.109718,0.219436,0.411442,4.827586,10.31348,0.52116,0.523354,3.840125,5.047022,0.819592,0.438871,1.426332,1.865204,8.777429,0.877743,0.0,2.852665,2.084639,13.824451,2377.586207,15.689655,0.594671,0.027429,0.532132,1.426332,4.937304,3.181818,35.0,1.206897,0.109718,20.188088,19.639498,3.730408,1.974922,5.705329,0.126176,1.53605,-1.426332,0.109718,1.206897,0.0,7.0,6.967085,0.842633,No


Selecting columns from the EDA phase with a mutual score of at least .1 

In [12]:
save_columns = ['season','player','fg_pct','blk_pct','usg_pct','vorp',
                'fg3_pct','tov_per_g','drb_per_g',
                'tov_pct','award_share']

testing_df = df_per_35[save_columns]

In [13]:


encodings = ['utf-8', 'iso-8859-1', 'windows-1252', 'cp850']

for encoding in encodings:
    try:
        testing_2023 = pd.read_csv('../data/NBA_2023.csv', encoding=encoding)
        break
    except UnicodeDecodeError:
        print('Encoding {} failed'.format(encoding))

print(testing_2023.head())

Encoding utf-8 failed
   season            player pos   age team_id     g  gs   per  ts_per  \
0    2023  Precious Achiuwa   C  23.0     TOR  54.0  11  15.1   0.552   
1    2023      Steven Adams   C  29.0     MEM  42.0  42  17.5   0.564   
2    2023       Bam Adebayo   C  25.0     MIA  74.0  74  20.0   0.592   
3    2023      Ochai Agbaji  SG  22.0     UTA  58.0  21   9.6   0.569   
4    2023      Santi Aldama  PF  22.0     MEM  77.0  20  13.9   0.591   

   fg3_pct  ft_pct  ast_pct  stl_pct  blk_pct  tov_pct  usg_pct  ows  dws  \
0    0.262   0.316      6.3      1.3      2.6     11.3     19.3  0.8  1.3   
1    0.004   0.490     11.1      1.5      3.7     19.8     14.6  1.3  2.1   
2    0.011   0.362     16.0      1.7      2.4     12.7     25.3  3.5  3.8   
3    0.603   0.185      6.7      0.6      1.1      8.8     15.6  1.0  0.4   
4    0.507   0.274      7.6      1.3      2.6      9.3     16.0  2.1  2.4   

    ws  ws_per_48  obpm  dbpm  bpm  vorp  fg_per_g  fga_per_g  fg_pct  \
0  

In [14]:
testing_2023.head()

Unnamed: 0,season,player,pos,age,team_id,g,gs,per,ts_per,fg3_pct,ft_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct.1,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct.1,mp_per_g,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,2023,Precious Achiuwa,C,23.0,TOR,54.0,11,15.1,0.552,0.262,0.316,6.3,1.3,2.6,11.3,19.3,0.8,1.3,2.1,0.092,-1.5,-0.9,-2.4,-0.1,190.0,393,0.483,27.0,103.0,0.262,163.0,290.0,0.562,0.518,87.0,124.0,0.702,20.6,1.8,4.0,5.8,0.9,0.5,0.6,1.1,1.9,9.1
1,2023,Steven Adams,C,29.0,MEM,42.0,42,17.5,0.564,0.004,0.49,11.1,1.5,3.7,19.8,14.6,1.3,2.1,3.4,0.144,-0.3,0.9,0.6,0.7,157.0,263,0.597,0.0,1.0,0.0,157.0,262.0,0.599,0.597,47.0,129.0,0.364,27.0,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,2023,Bam Adebayo,C,25.0,MIA,74.0,74,20.0,0.592,0.011,0.362,16.0,1.7,2.4,12.7,25.3,3.5,3.8,7.4,0.136,0.7,0.8,1.5,2.3,600.0,1111,0.54,1.0,12.0,0.083,599.0,1099.0,0.545,0.541,324.0,402.0,0.806,35.0,2.5,6.8,9.3,3.2,1.2,0.8,2.5,2.8,20.6
3,2023,Ochai Agbaji,SG,22.0,UTA,58.0,21,9.6,0.569,0.603,0.185,6.7,0.6,1.1,8.8,15.6,1.0,0.4,1.4,0.057,-1.5,-1.4,-2.9,-0.3,161.0,373,0.432,81.0,225.0,0.36,80.0,148.0,0.541,0.54,56.0,69.0,0.812,20.4,0.7,1.3,2.1,1.0,0.3,0.3,0.7,1.7,7.9
4,2023,Santi Aldama,PF,22.0,MEM,77.0,20,13.9,0.591,0.507,0.274,7.6,1.3,2.6,9.3,16.0,2.1,2.4,4.6,0.13,-0.3,0.8,0.5,1.1,247.0,525,0.47,94.0,266.0,0.353,153.0,259.0,0.591,0.56,108.0,144.0,0.75,21.8,1.1,3.7,4.8,1.3,0.6,0.6,0.8,1.9,9.0


In [23]:
testing_2023.dtypes

season         int64
player        object
fg_pct       float64
blk_pct      float64
usg_pct      float64
vorp         float64
fg3_pct      float64
tov_per_g    float64
drb_per_g    float64
tov_pct      float64
dtype: object

In [15]:
columns_2023 = ['season','player','fg_pct','blk_pct','usg_pct','vorp',
                'fg3_pct','tov_per_g','drb_per_g',
                'tov_pct']

In [16]:
testing_2023 = testing_2023[columns_2023]

In [17]:
testing_2023.head()

Unnamed: 0,season,player,fg_pct,blk_pct,usg_pct,vorp,fg3_pct,tov_per_g,drb_per_g,tov_pct
0,2023,Precious Achiuwa,0.483,2.6,19.3,-0.1,0.262,1.1,4.0,11.3
1,2023,Steven Adams,0.597,3.7,14.6,0.7,0.004,1.9,6.5,19.8
2,2023,Bam Adebayo,0.54,2.4,25.3,2.3,0.011,2.5,6.8,12.7
3,2023,Ochai Agbaji,0.432,1.1,15.6,-0.3,0.603,0.7,1.3,8.8
4,2023,Santi Aldama,0.47,2.6,16.0,1.1,0.507,0.8,3.7,9.3


In [18]:
print(testing_2023.isin([np.nan, np.inf, -np.inf]).sum())

season        0
player       25
fg_pct        4
blk_pct      25
usg_pct      25
vorp         25
fg3_pct      29
tov_per_g     0
drb_per_g     0
tov_pct      27
dtype: int64


In [19]:
testing_2023 = testing_2023.replace([np.nan, np.inf, -np.inf],0)

In [20]:
print(testing_2023.isin([np.nan, np.inf, -np.inf]).sum())

season       0
player       0
fg_pct       0
blk_pct      0
usg_pct      0
vorp         0
fg3_pct      0
tov_per_g    0
drb_per_g    0
tov_pct      0
dtype: int64


In [22]:
testing_df.dtypes

season           int64
player          object
fg_pct         float64
blk_pct        float64
usg_pct        float64
vorp           float64
fg3_pct        float64
tov_per_g      float64
drb_per_g      float64
tov_pct        float64
award_share    float64
dtype: object

In [24]:
#Saving data frames for later use
mvp_per_35.to_csv(r'../data/mvp_per_35.csv')
df_per_35.to_csv(r'../data/df_per_35.csv')
testing_df.to_csv(r'../data/testing_df.csv')
testing_2023.to_csv(r'../data/testing_2023.csv')

# Conclusion

This notebook was intended to prepare the data for use across multiple ML models by selecting the appropriate features based on mutual information scores then performing train_test_split on the data to scale it for use. The target feature 'award_share' was added to a new column 'mvp_won' with string values of 'yes' or 'no' to allow for easy modeling in the next step. With all of these changes made, modeling and predicting the NBA MVP will be the next and final step.