In [81]:
import pandas as pd
import numpy as np

# Read the raw player table from disk; `netData` is the working frame that the
# following cells prune and extend.
rawDataPath = 'data/netData.csv'
netData = pd.read_csv(rawDataPath)

# Preview the first rows to check the load.
netData.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,STL,BLK,TOV,PF,PTS,height,weight,birth_date,college,Salary
0,24096,2017.0,Alex Abrines,SG,23.0,OKC,68.0,6.0,1055.0,10.1,...,37.0,8.0,33.0,114.0,406.0,6-6,190.0,"August 1, 1993",,5725000.0
1,24097,2017.0,Quincy Acy,PF,26.0,TOT,38.0,1.0,558.0,11.8,...,14.0,15.0,21.0,67.0,222.0,6-7,240.0,"October 6, 1990",Baylor University,1709538.0
2,24098,2017.0,Quincy Acy,PF,26.0,DAL,6.0,0.0,48.0,-1.4,...,0.0,0.0,2.0,9.0,13.0,6-7,240.0,"October 6, 1990",Baylor University,1709538.0
3,24099,2017.0,Quincy Acy,PF,26.0,BRK,32.0,1.0,510.0,13.1,...,14.0,15.0,19.0,58.0,209.0,6-7,240.0,"October 6, 1990",Baylor University,1709538.0
4,24100,2017.0,Steven Adams,C,23.0,OKC,80.0,80.0,2389.0,16.5,...,88.0,78.0,146.0,195.0,905.0,7-0,255.0,"July 20, 1993",University of Pittsburgh,22471910.0


Now that the data is imported, we can choose the variables that we will include in our model. We will include WS/48, FT%, 3P%, 2P%, TS%, PER, STL%, BLK%, DRB%, TRB%, AST%, ORB%, TOV%, Age, Height, Weight, BMI, Position, and College.

For Age, Height, Weight, and BMI we will also include the square of each variable, to account for the fact that extreme values of these variables (in either direction) may be detrimental to the player.

For Position and College we will use indicator variables: Position will be split into five 0/1 variables (one per position), and College will be collapsed into two categories, encoded as a single 0/1 variable indicating whether or not a player went to college.

Let's continue by removing the variables we are not using from the data set.

In [82]:
# Labels of every column that is not kept as a model input.
# 'blanl' is spelled exactly as the CSV header has it; drop() would raise
# KeyError on a label that does not exist.
unusedColumns = [
    'TOV', 'Unnamed: 0', 'Year', 'Player', 'Tm', 'G', 'GS', 'MP',
    '3PAr', 'FTr', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'blank2',
    'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA',
    '2P', '2PA', 'eFG%', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST',
    'STL', 'BLK', 'PF', 'PTS', 'birth_date',
]
netData = netData.drop(columns=unusedColumns)
netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,WS/48,3P%,2P%,FT%,height,weight,college,Salary
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,8.3,0.095,0.381,0.426,0.898,6-6,190.0,,5725000.0
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,9.7,0.082,0.411,0.413,0.75,6-7,240.0,Baylor University,1709538.0
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,9.8,-0.133,0.143,0.4,0.667,6-7,240.0,Baylor University,1709538.0
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,9.6,0.102,0.434,0.414,0.754,6-7,240.0,Baylor University,1709538.0
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,16.0,0.13,0.0,0.572,0.611,7-0,255.0,University of Pittsburgh,22471910.0


Notice that the height is given as a string in feet-inches form (e.g. `6-6`). Let's convert it into a number representing total inches.

In [83]:
# Convert "feet-inches" strings such as "6-6" into total inches (12 * feet + inches).
#
# Done column-wise instead of with a row-by-row iterrows() loop so that:
#   * the column ends up with a numeric (float) dtype rather than Python ints
#     stored in an object column;
#   * missing or malformed heights become NaN instead of raising
#     (len() on a NaN float raises TypeError);
#   * the dtype guard makes the cell a no-op when it is re-run after the
#     column has already been converted.
if not pd.api.types.is_numeric_dtype(netData['height']):
  # Two capture groups -> two columns (0 = feet, 1 = inches); rows that do not
  # match the pattern yield NaN in both.
  heightParts = netData['height'].str.extract(r'^\s*(\d+)\s*-\s*(\d+)\s*$').astype(float)
  netData['height'] = 12 * heightParts[0] + heightParts[1]

netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,WS/48,3P%,2P%,FT%,height,weight,college,Salary
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,8.3,0.095,0.381,0.426,0.898,78,190.0,,5725000.0
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,9.7,0.082,0.411,0.413,0.75,79,240.0,Baylor University,1709538.0
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,9.8,-0.133,0.143,0.4,0.667,79,240.0,Baylor University,1709538.0
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,9.6,0.102,0.434,0.414,0.754,79,240.0,Baylor University,1709538.0
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,16.0,0.13,0.0,0.572,0.611,84,255.0,University of Pittsburgh,22471910.0


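BMI was not initially in the data set; it is an engineered variable, so let's add it now. It is computed as weight divided by height squared, using the weight column as given and the height in inches from the previous step. No unit-conversion constant is applied, which is harmless here because a constant factor cancels when the column is normalized by its maximum below.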

In [84]:
# Engineered feature: weight divided by the square of height (both columns
# used as they currently stand in the frame; no unit-conversion factor).
heightSquared = netData['height'] ** 2
netData['BMI'] = netData['weight'] / heightSquared
netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,WS/48,3P%,2P%,FT%,height,weight,college,Salary,BMI
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,8.3,0.095,0.381,0.426,0.898,78,190.0,,5725000.0,0.031229
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,9.7,0.082,0.411,0.413,0.75,79,240.0,Baylor University,1709538.0,0.038455
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,9.8,-0.133,0.143,0.4,0.667,79,240.0,Baylor University,1709538.0,0.038455
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,9.6,0.102,0.434,0.414,0.754,79,240.0,Baylor University,1709538.0,0.038455
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,16.0,0.13,0.0,0.572,0.611,84,255.0,University of Pittsburgh,22471910.0,0.036139


Now let's encode our categorical variables. College becomes a single indicator (1 if the player went to college, 0 if not), and Position is split into five columns, in which a one represents that the player played that position and a zero that they did not.

In [85]:
# Collapse the college name into a 0/1 indicator: 1 when a college is recorded,
# 0 when the entry is missing (NaN).
#
# Vectorised replacement for the row-by-row loop; it also gives the column a
# proper integer dtype instead of ints written into an object (string) column.
#
# NOTE: like the loop it replaces, this is not idempotent -- running the cell a
# second time sees only non-missing 0/1 values and would set every row to 1.
netData['college'] = netData['college'].notna().astype(int)

netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,WS/48,3P%,2P%,FT%,height,weight,college,Salary,BMI
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,8.3,0.095,0.381,0.426,0.898,78,190.0,0,5725000.0,0.031229
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,9.7,0.082,0.411,0.413,0.75,79,240.0,1,1709538.0,0.038455
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,9.8,-0.133,0.143,0.4,0.667,79,240.0,1,1709538.0,0.038455
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,9.6,0.102,0.434,0.414,0.754,79,240.0,1,1709538.0,0.038455
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,16.0,0.13,0.0,0.572,0.611,84,255.0,1,22471910.0,0.036139


In [86]:
# Create one indicator column per position, initialised to 0 for every row.
# The tuple order fixes the order in which the columns are appended.
for positionLabel in ('SG', 'PF', 'C', 'SF', 'PG'):
  netData[positionLabel] = 0

netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,...,height,weight,college,Salary,BMI,SG,PF,C,SF,PG
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,...,78,190.0,0,5725000.0,0.031229,0,0,0,0,0
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,...,79,240.0,1,1709538.0,0.038455,0,0,0,0,0
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,...,79,240.0,1,1709538.0,0.038455,0,0,0,0,0
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,...,79,240.0,1,1709538.0,0.038455,0,0,0,0,0
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,...,84,255.0,1,22471910.0,0.036139,0,0,0,0,0


In [87]:
# One-hot encode 'Pos': each position column is 1 where the row's 'Pos' equals
# that label exactly and 0 otherwise.
#
# Vectorised replacement for the iterrows()/elif chain. Each column is computed
# in full from 'Pos', so the cell gives the same result however many times it
# is run and does not rely on the columns having been zero-initialised first.
# As before, a row whose 'Pos' is none of these five labels (or is missing)
# gets 0 in all five columns.
for position in ('SG', 'PF', 'C', 'SF', 'PG'):
  netData[position] = (netData['Pos'] == position).astype(int)

netData.head()

Unnamed: 0,Pos,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,...,height,weight,college,Salary,BMI,SG,PF,C,SF,PG
0,SG,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,...,78,190.0,0,5725000.0,0.031229,1,0,0,0,0
1,PF,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
2,PF,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
3,PF,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
4,C,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,...,84,255.0,1,22471910.0,0.036139,0,0,1,0,0


In [88]:
# The position is now carried by the five indicator columns, so the original
# label column is removed.
netData = netData.drop('Pos', axis=1)
netData.head()

Unnamed: 0,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,...,height,weight,college,Salary,BMI,SG,PF,C,SF,PG
0,23.0,10.1,0.56,1.9,7.1,4.5,5.5,1.7,0.6,8.3,...,78,190.0,0,5725000.0,0.031229,1,0,0,0,0
1,26.0,11.8,0.565,3.9,18.0,11.0,4.9,1.2,2.0,9.7,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
2,26.0,-1.4,0.355,4.6,15.2,9.7,0.0,0.0,0.0,9.8,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
3,26.0,13.1,0.587,3.8,18.2,11.1,5.4,1.3,2.2,9.6,...,79,240.0,1,1709538.0,0.038455,0,1,0,0,0
4,23.0,16.5,0.589,13.0,15.5,14.2,5.4,1.8,2.6,16.0,...,84,255.0,1,22471910.0,0.036139,0,0,1,0,0


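Now let's normalize every column by dividing each element by the largest entry in that column and multiplying by 10, so that each column's maximum becomes 10. This will help us do feature importance later on, because we will more easily be able to compare the coefficients in the regression equation. Salary is handled separately: it is rescaled so that it is expressed in units of \$50,000 rather than relative to its maximum.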

In [89]:
# Largest salary in the data; needed to undo the max-scaling of Salary below.
maxSal = netData['Salary'].max()

# Scale every column to (value / column maximum) * 10.
netDataNorm = netData / netData.max() * 10

# Salary is not left max-scaled: multiplying back by maxSal and dividing by
# 500000 gives (Salary / maxSal * 10) * maxSal / 500000, i.e. Salary / 50000.
# Done as a single column operation (same arithmetic, same order) instead of
# a row-by-row iterrows() loop.
netDataNorm['Salary'] = netDataNorm['Salary'] * maxSal / 500000

# Bare expression instead of print() so the frame renders as a table, like the
# other cells in this notebook.
netDataNorm.head()

    Age       PER       TS%      ORB%      DRB%      TRB%      AST%      STL%  \
0  5.75  3.279221  6.829268  0.722433  1.955923  1.778656   0.95986  1.531532   
1   6.5  3.831169  6.890244   1.48289  4.958678  4.347826  0.855148  1.081081   
2   6.5 -0.454545  4.329268  1.749049  4.187328  3.833992       0.0       0.0   
3   6.5  4.253247  7.158537  1.444867  5.013774  4.387352  0.942408  1.171171   
4  5.75  5.357143  7.182927  4.942966  4.269972  5.612648  0.942408  1.621622   

       BLK%      TOV%  ...    height    weight college    Salary       BMI  \
0  0.659341   1.90367  ...  8.965517  6.551724     0.0     114.5  7.265981   
1  2.197802  2.224771  ...   9.08046  8.275862    10.0  34.19076  8.947195   
2       0.0  2.247706  ...   9.08046  8.275862    10.0  34.19076  8.947195   
3  2.417582  2.201835  ...   9.08046  8.275862    10.0  34.19076  8.947195   
4  2.857143  3.669725  ...  9.655172  8.793103    10.0  449.4382  8.408363   

     SG    PF     C   SF   PG  
0  10.0   0.

Finally, let's add squared versions of the variables for which we wanted to model quadratic terms.

In [90]:
# Append a squared copy of each variable that gets a quadratic term; the new
# column is named "<column>^2" and the tuple order fixes the column order.
for baseColumn in ('Age', 'height', 'weight', 'BMI'):
  netDataNorm[baseColumn + '^2'] = netDataNorm[baseColumn] ** 2

# Replace every remaining missing value (including squares of missing values)
# with 0.
netDataNorm = netDataNorm.fillna(0)

netDataNorm.head()

Unnamed: 0,Age,PER,TS%,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,...,BMI,SG,PF,C,SF,PG,Age^2,height^2,weight^2,BMI^2
0,5.75,3.279221,6.829268,0.722433,1.955923,1.778656,0.95986,1.531532,0.659341,1.90367,...,7.265981,10.0,0.0,0.0,0.0,0.0,33.0625,80.380499,42.925089,52.79448
1,6.5,3.831169,6.890244,1.48289,4.958678,4.347826,0.855148,1.081081,2.197802,2.224771,...,8.947195,0.0,10.0,0.0,0.0,0.0,42.25,82.45475,68.489893,80.052305
2,6.5,-0.454545,4.329268,1.749049,4.187328,3.833992,0.0,0.0,0.0,2.247706,...,8.947195,0.0,10.0,0.0,0.0,0.0,42.25,82.45475,68.489893,80.052305
3,6.5,4.253247,7.158537,1.444867,5.013774,4.387352,0.942408,1.171171,2.417582,2.201835,...,8.947195,0.0,10.0,0.0,0.0,0.0,42.25,82.45475,68.489893,80.052305
4,5.75,5.357143,7.182927,4.942966,4.269972,5.612648,0.942408,1.621622,2.857143,3.669725,...,8.408363,0.0,0.0,10.0,0.0,0.0,33.0625,93.222354,77.318668,70.700574


In [91]:
# Write the normalised feature table to disk.
# to_csv() writes the row index as an unnamed first column by default, which
# pandas labels "Unnamed: 0" when the file is read back with read_csv().
# NOTE(review): a downstream reader may already expect/drop that column --
# confirm before switching to index=False.
netDataNorm.to_csv("data/finalData.csv")