In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

### Load the data

In [2]:
df = pd.read_csv('nba_stats.csv')
df.head()

Unnamed: 0,img,player,position,height,weight,college,draft_yr,pick,drafted_by,all_nba,all_star,yrs,games,games_started,tot_min_played,tot_pts,tot_trb,tot_ast,fg_pct,fg3_pct,ft_pct,min_pg,pts_pg,trb_pg,ast_pg,fg_pg,fga_pg,fg3_pg,fg3a_pg,efg_pct,ft_pg,fta_pg,orb_pg,drb_pg,steals_pg,blocks_pg,tov_pg,pf_pg,win_share,ws_per_game,bpm,vorp,attend_college
0,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Deandre Ayton,C,7-1,250lb,Arizona,2018,1,PHO,0,0,1,71,70,2183,1159,729,125,0.585,0.0,0.746,30.7,16.3,10.3,1.8,7.2,12.3,0.0,0.1,0.585,2.0,2.7,3.1,7.1,0.9,0.9,1.8,2.9,5.8,0.128,0.2,1.2,1
1,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Marvin Bagley,PF,6-11,234lb,Duke,2018,2,SAC,0,0,1,62,4,1567,923,471,62,0.504,0.313,0.691,25.3,14.9,7.6,1.0,5.7,11.4,0.5,1.5,0.525,2.9,4.2,2.6,5.0,0.5,1.0,1.6,1.9,3.6,0.11,-1.8,0.1,1
2,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Jaren Jackson,PF,6-11,242lb,Michigan State,2018,4,MEM,0,0,1,58,56,1515,798,272,64,0.506,0.359,0.766,26.1,13.8,4.7,1.1,5.1,10.2,0.9,2.4,0.549,2.6,3.4,1.3,3.4,0.9,1.4,1.7,3.8,3.3,0.105,0.1,0.8,1
3,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Trae Young,PG,6-2,180lb,Oklahoma,2018,5,DAL,0,0,1,81,81,2503,1549,301,653,0.418,0.324,0.829,30.9,19.1,3.7,8.1,6.5,15.5,1.9,6.0,0.48,4.2,5.1,0.8,2.9,0.9,0.2,3.8,1.7,3.3,0.062,-1.1,0.6,1
4,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Mohamed Bamba,C,7-1,241lb,Texas,2018,6,ORL,0,0,1,47,1,766,292,233,39,0.481,0.3,0.587,16.3,6.2,5.0,0.8,2.5,5.2,0.4,1.5,0.525,0.8,1.3,1.4,3.6,0.3,1.4,0.9,2.2,1.7,0.106,-0.2,0.4,1


### Getting rid of instances of a player playing two positions for encoding purposes.

In [3]:
df['position'] = df['position'].apply(lambda x: x.split('-')[0])
df['position'].value_counts()

SG    470
PF    454
SF    444
PG    407
C     394
Name: position, dtype: int64

### Creating new features so we can use height and weight in the model.

In [4]:
def height_to_inches(height):
    feet, inches = height.split('-')
    feet = int(feet)
    inches = int(inches)
    inches = (feet*12)+inches
    return inches

In [5]:
df['inches'] = df['height'].apply(height_to_inches)

In [6]:
df['pounds'] = df['weight'].apply(lambda x: x.split('lb')[0])
df['pounds'] = df['pounds'].astype('int64')

In [7]:
df['pounds'].dtype

dtype('int64')

### Assigning each position an integer value so they can be used in the model.

In [8]:
def pos_to_num(position):
    if position == 'PG':
        return 1
    if position == 'SG':
        return 2
    if position == 'SF':
        return 3
    if position == 'PF':
        return 4
    if position == 'C':
        return 5
    else:
        raise Exception('not a valid position')

In [9]:
df['pos_num'] = df['position'].apply(pos_to_num)

### Removing some unnecessary columns to clean up the dataset.

In [10]:
df.columns

Index(['img', 'player', 'position', 'height', 'weight', 'college', 'draft_yr',
       'pick', 'drafted_by', 'all_nba', 'all_star', 'yrs', 'games',
       'games_started', 'tot_min_played', 'tot_pts', 'tot_trb', 'tot_ast',
       'fg_pct', 'fg3_pct', 'ft_pct', 'min_pg', 'pts_pg', 'trb_pg', 'ast_pg',
       'fg_pg', 'fga_pg', 'fg3_pg', 'fg3a_pg', 'efg_pct', 'ft_pg', 'fta_pg',
       'orb_pg', 'drb_pg', 'steals_pg', 'blocks_pg', 'tov_pg', 'pf_pg',
       'win_share', 'ws_per_game', 'bpm', 'vorp', 'attend_college', 'inches',
       'pounds', 'pos_num'],
      dtype='object')

In [11]:
cols = ['img', 'player', 'position', 'height', 'weight', 'college', 'draft_yr',
        'pick', 'drafted_by', 'yrs', 'fg_pct', 'fg3_pct', 'ft_pct', 'min_pg', 
        'pts_pg', 'trb_pg', 'ast_pg', 'fg_pg', 'fga_pg', 'fg3_pg', 'fg3a_pg', 
        'efg_pct', 'ft_pg', 'fta_pg', 'orb_pg', 'drb_pg', 'steals_pg', 'blocks_pg', 
        'tov_pg', 'pf_pg', 'ws_per_game', 'bpm', 'vorp', 'attend_college', 'inches',
        'pounds', 'pos_num']

In [12]:
new_df = df[cols]
new_df.head()

Unnamed: 0,img,player,position,height,weight,college,draft_yr,pick,drafted_by,yrs,fg_pct,fg3_pct,ft_pct,min_pg,pts_pg,trb_pg,ast_pg,fg_pg,fga_pg,fg3_pg,fg3a_pg,efg_pct,ft_pg,fta_pg,orb_pg,drb_pg,steals_pg,blocks_pg,tov_pg,pf_pg,ws_per_game,bpm,vorp,attend_college,inches,pounds,pos_num
0,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Deandre Ayton,C,7-1,250lb,Arizona,2018,1,PHO,1,0.585,0.0,0.746,30.7,16.3,10.3,1.8,7.2,12.3,0.0,0.1,0.585,2.0,2.7,3.1,7.1,0.9,0.9,1.8,2.9,0.128,0.2,1.2,1,85,250,5
1,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Marvin Bagley,PF,6-11,234lb,Duke,2018,2,SAC,1,0.504,0.313,0.691,25.3,14.9,7.6,1.0,5.7,11.4,0.5,1.5,0.525,2.9,4.2,2.6,5.0,0.5,1.0,1.6,1.9,0.11,-1.8,0.1,1,83,234,4
2,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Jaren Jackson,PF,6-11,242lb,Michigan State,2018,4,MEM,1,0.506,0.359,0.766,26.1,13.8,4.7,1.1,5.1,10.2,0.9,2.4,0.549,2.6,3.4,1.3,3.4,0.9,1.4,1.7,3.8,0.105,0.1,0.8,1,83,242,4
3,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Trae Young,PG,6-2,180lb,Oklahoma,2018,5,DAL,1,0.418,0.324,0.829,30.9,19.1,3.7,8.1,6.5,15.5,1.9,6.0,0.48,4.2,5.1,0.8,2.9,0.9,0.2,3.8,1.7,0.062,-1.1,0.6,1,74,180,1
4,https://d2cwpp38twqe55.cloudfront.net/req/2019...,Mohamed Bamba,C,7-1,241lb,Texas,2018,6,ORL,1,0.481,0.3,0.587,16.3,6.2,5.0,0.8,2.5,5.2,0.4,1.5,0.525,0.8,1.3,1.4,3.6,0.3,1.4,0.9,2.2,0.106,-0.2,0.4,1,85,241,5


### Exporting the cleaned dataset to a new csv file so the data engineers can build the database.

In [35]:
new_df.to_csv('new_nba_stats.csv', index=False)

### Separate the data into train, validation, and test dataframes

In [13]:
train = new_df[new_df['draft_yr'] < 1990]
validate = new_df[(new_df['draft_yr'] >= 1990) & (new_df['draft_yr'] < 2000)]
test = new_df[new_df['draft_yr'] >= 2000]

In [14]:
target = 'yrs'
features = ['draft_yr', 'fg_pct', 'fg3_pct', 'ft_pct', 'min_pg', 'pts_pg', 
            'ast_pg', 'fg_pg', 'fga_pg', 'pick', 'fg3_pg', 'fg3a_pg', 'efg_pct', 
            'ft_pg', 'fta_pg', 'trb_pg', 'orb_pg', 'drb_pg', 'inches', 'pounds', 
            'steals_pg', 'blocks_pg', 'tov_pg', 'pf_pg', 'ws_per_game', 'bpm', 
            'vorp', 'attend_college', 'pos_num']

X_train = train[features]
y_train = train[target]
X_val = validate[features]
y_val = validate[target]
X_test = test[features]
y_test = test[target]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((813, 29), (813,), (459, 29), (459,), (897, 29), (897,))

### Model building

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [41]:
scaler = StandardScaler()
model = RandomForestRegressor()
pipe = Pipeline([('scaler', scaler), ('model', model)])

In [42]:
parameters = {
    'model__max_depth': [10, 20, None],
    'model__n_estimators': [100, 500, 1000, 2000, 5000],
    'model__min_samples_leaf': [1, 3, 5],
    'model__min_samples_split': [2, 4, 6]
}

search = GridSearchCV(pipe, parameters, cv=3, verbose=10, n_jobs=3)
search.fit(X_train, y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    6.5s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   31.4s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   52.7s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.8min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  4.2min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  5.7min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:  7.5min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  8.3min
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  9.2min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 10.3min
[Parallel(

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_s...='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'model__max_depth': [10, 20, None], 'model__n_estimators': [100, 500, 1000, 2000, 5000], 'model__min_samples_leaf': [1, 3, 5], 'model__min_samples_split': [2, 4, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [45]:
search.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=4,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [46]:
search.best_score_

0.807529823034222

In [43]:
y_pred = search.best_estimator_.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
score = r2_score(y_val, y_pred)
print(f"MAE: {mae}")
print(f"R^2: {score}")

MAE: 1.5742798619960143
R^2: 0.8231591213582186


  Xt = transform.transform(Xt)


In [44]:
y_pred2 = search.best_estimator_.predict(X_test)
mae2 = mean_absolute_error(y_test, y_pred2)
score2 = r2_score(y_test, y_pred2)
print(f"MAE for test set: {mae2}")
print(f"R^2 for test set: {score2}")

  Xt = transform.transform(Xt)


MAE for test set: 1.9119149853547643
R^2 for test set: 0.5975386710223406


### XGBoost Pipeline

In [47]:
scaler = StandardScaler()
model2 = XGBRegressor()
xgb = Pipeline([('scaler', scaler), ('model2', model2)]) 

In [48]:
params = {
    'model2__max_depth': [2, 3, 4, 5],
    'model2__learning_rate': [0.001, 0.01, 0.1, 0.2],
    'model2__n_estimators': [100, 500, 1000],
    'model2__booster': ['gbtree', 'gblinear', 'dart']
}

gridsearch = GridSearchCV(xgb, params, cv=3, verbose=10, n_jobs=3)
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    5.6s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    7.8s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    8.1s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   13.3s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   18.2s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   23.5s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   25.6s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   28.9s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:   33.1s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:   39.9s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:   43.4s
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed:   51.1s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:   54.6s
[Parallel(n_jobs=3)]: Done 139 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  1.1min
[Parallel(



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model2', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, ...lpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))]),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'model2__max_depth': [2, 3, 4, 5], 'model2__learning_rate': [0.001, 0.01, 0.1, 0.2], 'model2__n_estimators': [100, 500, 1000], 'model2__booster': ['gbtree', 'gblinear', 'dart']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [49]:
gridsearch.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model2', XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.01, max_delta_step=0,
       max_depth=4, m...lpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))])

In [50]:
gridsearch.best_params_

{'model2__booster': 'dart',
 'model2__learning_rate': 0.01,
 'model2__max_depth': 4,
 'model2__n_estimators': 1000}

In [51]:
gridsearch.best_score_

0.8407933604487383

In [52]:
xgb_preds = gridsearch.best_estimator_.predict(X_val)
mae3 = mean_absolute_error(y_val, xgb_preds)
score3 = r2_score(y_val, xgb_preds)
print(f"MAE: {mae3}")
print(f"R^2: {score3}")

MAE: 1.4435309061297663
R^2: 0.8554907216596112


  Xt = transform.transform(Xt)


### Pickle the best model

In [53]:
import joblib

joblib.dump(gridsearch.best_estimator_, 'career_model')

['career_model']