# NBA MVP Prediction Parameter Tuning Notebook

In [1]:
#essentials
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings('ignore')

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#tools/metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
import shap

#modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
data_path = os.path.dirname(os.getcwd()) + '/data' + '/master_table.csv'
print(data_path)
master_table = pd.read_csv(data_path)

/Users/dbtjdals/Desktop/nba_mvp_project/data/master_table.csv


In [3]:
to_drop = [
    'Rank',
    'Player',
    'Age',
    'year',
    'Tm',
    'team',
    'First',
    'Pts Won',
    'Pts Max',
    'WS/48',
    'WS',
    'MP',
    'G',
    #'W/L%', 
    'W', 
    'FG%',
    '3P%',
    #'PTS',
    'STL', 
    'BLK',
    'three_point_attempt_rate',
    'total_rebound_percentage',
    'offensive_rebound_percentage',
    'block_percentage',
    'defensive_rebound_percentage',
    'steal_percentage',
    'turnover_percentage',
    'assist_percentage',
    'AST',
    'TRB',
    #'free_throw_attempt_rate', ######### Experiment
    'FT%',
    'win_shares', 
    #'value_over_replacement_player', 
    'box_plus_minus', 
    #'offensive_box_plus_minus', 
    'defensive_box_plus_minus',
    'offensive_win_shares', 
    'defensive_win_shares', 
    'true_shooting_percentage' 
]

### Modeling helper functions

In [4]:
def train_test_split_by_year(year, df, scaling=False):
    #test year = selected year, train year = other years outside of selected year
    train_df = df[df['year'] != year]
    test_df = df[df['year'] == year]
    
    train_df2 = train_df.copy()
    test_df2 = test_df.copy()
    
    train_df2.drop(to_drop, axis=1, inplace=True)
    test_df2.drop(to_drop, axis=1, inplace=True)
    
    if scaling == True:
        sc_X = StandardScaler()
        sc_y = StandardScaler()
        train_df2 = sc_X.fit_transform(train_df2)
        test_df2 = sc_y.fit_transform(test_df2)
    
    X_train = train_df2.copy()
    y_train = X_train["Share"]
    
    X_test = test_df2.copy()
    y_test = X_test["Share"]

    X_train.drop('Share', axis=1, inplace=True)
    cols = X_train.columns
    X_test.drop('Share', axis=1, inplace=True)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    
    return X_train, y_train, X_test, y_test, cols

In [5]:
def run_model(regressor, X_train, y_train, X_test, y_test, df, year):
    model = regressor
    model.fit(X_train, y_train) 
    predictions = model.predict(X_test)
    mae = mean_absolute_error(predictions, y_test)
    r2 = r2_score(y_test, predictions)
    
    mvp_race = df[df['year'] == year]
    mvp_race['predicted_share'] = predictions
    mvp_race = mvp_race.sort_values(["Share", "predicted_share"], ascending = (False, False))
                                    
    actual_winner = mvp_race[mvp_race['Share'] == mvp_race['Share'].max()]['Player']
    predicted_winner = mvp_race[mvp_race['predicted_share'] == mvp_race['predicted_share'].max()]['Player']
    return model, mae, r2, predicted_winner.iloc[0], actual_winner.iloc[0], mvp_race

In [6]:
years = [year for year in range(1980, 2022)]

def run_model_average(df, regressor, scaling=False, print_metrics=False):
    mae_lst = []
    r2_lst = []
    predicted_lst = []
    actual_lst = []
    label_lst =[]
    model_lst = []
    for year in tqdm(years):
        X_train, y_train, X_test, y_test, cols = train_test_split_by_year(year=year, df=df, scaling=False)
        model, mae, r2, predicted_winner, actual_winner, mvp_race = run_model(regressor,
                                                             X_train,
                                                              y_train,
                                                              X_test,
                                                              y_test,
                                                              df=df,
                                                              year=year,
                                                            )
        if predicted_winner == actual_winner:
            label = 'correct'
        else:
            label = 'incorrect'
        mae_lst.append(mae)
        r2_lst.append(r2)
        predicted_lst.append(predicted_winner)
        actual_lst.append(actual_winner)
        label_lst.append(label)
        model_lst.append(model)
    d = {
    'year': years,
    'MAE': mae_lst,
    'R squared': r2_lst,
    'Predicted MVP': predicted_lst,
    'Actual MVP': actual_lst,
    'Label': label_lst
    }

    summary = pd.DataFrame(d)
    correct_count = summary['Label'].value_counts().iloc[0]
    incorrect_count = summary['Label'].value_counts().iloc[1]
    accuracy = correct_count / (correct_count + incorrect_count)
    avg_mae = summary['MAE'].mean()
    avg_r2  = summary['R squared'].mean()
    
    if print_metrics == True:
        print(f"Average MAE: {avg_mae}")
        print(f"Average R squared: {avg_r2}")
        print(f"Prediction accuracy: {accuracy}")
    return avg_mae, avg_r2, accuracy, summary, model_lst, cols

# Parameter Tuning

## SVM Regressor

In [74]:
kernels_lst = ['rbf', 'linear', 'poly', 'sigmoid']
c_lst = [0.4, 0.6, 0.8, 1]
epsilon_lst = [0.01, 0.02, 0.04, 0.06, 0.08, 0.1]

def find_best_parameter_svr(df): 
    avg_mae_lst = []
    avg_r2_lst = []
    avg_accuracy_lst = []
    summary_lst = []

    ex_kernels_lst = []
    ex_c_lst = []
    ex_epsilon_lst = []

    for i in kernels_lst:
        for j in c_lst:
            for k in epsilon_lst:
                mae, r2, accuracy, summary, models, cols = run_model_average(df=df,
                      regressor = SVR(kernel=i, C=j, epsilon=k),
                     scaling=True,
                    print_metrics=False)
                avg_mae_lst.append(mae)
                avg_r2_lst.append(r2)
                avg_accuracy_lst.append(accuracy)
                summary_lst.append(summary)

                ex_kernels_lst.append(i)
                ex_c_lst.append(j)
                ex_epsilon_lst.append(k)

    d = {
        'kernel': ex_kernels_lst,
        'C': ex_c_lst,
        'epsilon': ex_epsilon_lst,
        'MAE': avg_mae_lst,
        'r2': avg_r2_lst,
        'Accuracy': avg_accuracy_lst,
    }
    
    #sort by highest accuracy, then highest r squared
    df = pd.DataFrame(d).sort_values(["Accuracy", "r2"], ascending = (False, False))
    return df

In [75]:
svr_parameter_summary = find_best_parameter_svr(df=master_table)

100%|██████████| 42/42 [00:01<00:00, 37.11it/s]
100%|██████████| 42/42 [00:00<00:00, 44.57it/s]
100%|██████████| 42/42 [00:00<00:00, 52.48it/s]
100%|██████████| 42/42 [00:00<00:00, 59.83it/s]
100%|██████████| 42/42 [00:00<00:00, 54.12it/s]
100%|██████████| 42/42 [00:00<00:00, 61.18it/s]
100%|██████████| 42/42 [00:01<00:00, 34.16it/s]
100%|██████████| 42/42 [00:01<00:00, 39.17it/s]
100%|██████████| 42/42 [00:00<00:00, 52.14it/s]
100%|██████████| 42/42 [00:00<00:00, 58.39it/s]
100%|██████████| 42/42 [00:00<00:00, 62.94it/s]
100%|██████████| 42/42 [00:00<00:00, 68.45it/s]
100%|██████████| 42/42 [00:01<00:00, 37.29it/s]
100%|██████████| 42/42 [00:00<00:00, 42.89it/s]
100%|██████████| 42/42 [00:00<00:00, 51.33it/s]
100%|██████████| 42/42 [00:00<00:00, 58.07it/s]
100%|██████████| 42/42 [00:00<00:00, 61.84it/s]
100%|██████████| 42/42 [00:00<00:00, 65.78it/s]
100%|██████████| 42/42 [00:01<00:00, 36.07it/s]
100%|██████████| 42/42 [00:01<00:00, 40.91it/s]
100%|██████████| 42/42 [00:00<00:00, 48.

IndexError: single positional indexer is out-of-bounds

In [76]:
svr_parameter_summary.head()

Unnamed: 0,kernel,C,epsilon,MAE,r2,Accuracy
21,rbf,1.0,0.06,0.106487,0.595899,0.571429
22,rbf,1.0,0.08,0.108787,0.595465,0.571429
23,rbf,1.0,0.1,0.111387,0.592597,0.571429
16,rbf,0.8,0.08,0.109618,0.590166,0.571429
17,rbf,0.8,0.1,0.111937,0.587533,0.571429


## Random Forest Regressor

In [12]:
n_estimator_lst = [11,12,13,14,15,16]
max_depth_lst = [10,11,12]
min_samples_leaf_lst = [1]
min_samples_split_lst = [2]

def find_best_parameter_rf(df): 
    avg_mae_lst = []
    avg_r2_lst = []
    avg_accuracy_lst = []
    summary_lst = []

    ex_n_estimator_lst = []
    ex_max_depth_lst = []
    ex_min_samples_leaf_lst = []
    ex_min_samples_split_lst = []

    for i in n_estimator_lst:
        for j in max_depth_lst:
            for x in min_samples_leaf_lst:
                for y in min_samples_split_lst:
                    mae, r2, accuracy, summary, models, cols = run_model_average(df=df,
                          regressor=RandomForestRegressor(n_estimators = i,
                                                          random_state = 0,
                                                          max_depth=j,
                                                          min_samples_leaf=x,
                                                          min_samples_split=y
                                                         ),
                print_metrics=False)
                    avg_mae_lst.append(mae)
                    avg_r2_lst.append(r2)
                    avg_accuracy_lst.append(accuracy)
                    summary_lst.append(summary)

                    ex_n_estimator_lst.append(i)
                    ex_max_depth_lst.append(j)
                    ex_min_samples_leaf_lst.append(x)
                    ex_min_samples_split_lst.append(y)

    d = {
        'n_estimator': ex_n_estimator_lst,
        'max_depth': ex_max_depth_lst,
        'min_samples_leaf': ex_min_samples_leaf_lst, 
        'min_samples_split': ex_min_samples_split_lst, 
        'MAE': avg_mae_lst,
        'r2': avg_r2_lst,
        'Accuracy': avg_accuracy_lst,
    }
    
    #sort by highest accuracy, then highest r squared
    df = pd.DataFrame(d).sort_values(["Accuracy", "r2"], ascending = (False, False))
    return df

In [13]:
rf_parameter_options = find_best_parameter_rf(df=master_table)

100%|██████████| 42/42 [00:01<00:00, 27.59it/s]
100%|██████████| 42/42 [00:01<00:00, 28.05it/s]
100%|██████████| 42/42 [00:01<00:00, 27.08it/s]
100%|██████████| 42/42 [00:01<00:00, 26.99it/s]
100%|██████████| 42/42 [00:01<00:00, 25.49it/s]
100%|██████████| 42/42 [00:01<00:00, 25.51it/s]
100%|██████████| 42/42 [00:01<00:00, 24.95it/s]
100%|██████████| 42/42 [00:01<00:00, 23.96it/s]
100%|██████████| 42/42 [00:01<00:00, 23.88it/s]
100%|██████████| 42/42 [00:01<00:00, 23.60it/s]
100%|██████████| 42/42 [00:01<00:00, 22.46it/s]
100%|██████████| 42/42 [00:02<00:00, 20.99it/s]
100%|██████████| 42/42 [00:01<00:00, 22.38it/s]
100%|██████████| 42/42 [00:01<00:00, 21.65it/s]
100%|██████████| 42/42 [00:02<00:00, 20.92it/s]
100%|██████████| 42/42 [00:01<00:00, 21.34it/s]
100%|██████████| 42/42 [00:02<00:00, 20.79it/s]
100%|██████████| 42/42 [00:02<00:00, 20.20it/s]


In [14]:
rf_parameter_options.head()

Unnamed: 0,n_estimator,max_depth,min_samples_leaf,min_samples_split,MAE,r2,Accuracy
15,16,10,1,2,0.105547,0.591895,0.738095
2,11,12,1,2,0.105812,0.584584,0.738095
17,16,12,1,2,0.105157,0.595126,0.714286
11,14,12,1,2,0.105536,0.591663,0.714286
8,13,12,1,2,0.105593,0.589905,0.714286


In [None]:
rf_best_parameters = dict(rf_parameter_options.iloc[0][:5])

## XGBoost Regressor

In [7]:
n_estimator_lst = [16]
max_depth_lst = [5]
learning_rate_lst = [0.1954, 0.2745]
subsample_lst = [1]
colsample_bytree_lst = [1]

def find_best_parameter_xgb(df): 
    avg_mae_lst = []
    avg_r2_lst = []
    avg_accuracy_lst = []
    summary_lst = []

    ex_n_estimator_lst = []
    ex_max_depth_lst = []
    ex_learning_rate_lst = []
    ex_subsample = []
    ex_colsample_bytree = []

    for i in n_estimator_lst:
        for j in max_depth_lst:
            for k in learning_rate_lst:
                for a in subsample_lst:
                    for b in colsample_bytree_lst: 
                        mae, avg_r2, accuracy, summary, models, cols = run_model_average(
                                                                    df=df,
                                                                      regressor = XGBRegressor(n_estimators=i,
                                                                                               max_depth=j,
                                                                                               learning_rate=k,
                                                                                              subsample=a,
                                                                                              colsample_bytree=b),
                                                                    scaling=False)
                        avg_mae_lst.append(mae)
                        avg_r2_lst.append(avg_r2)
                        avg_accuracy_lst.append(accuracy)
                        summary_lst.append(summary)

                        ex_n_estimator_lst.append(i)
                        ex_max_depth_lst.append(j)
                        ex_learning_rate_lst.append(k)
                        ex_subsample.append(a)
                        ex_colsample_bytree.append(b)

    d = {
        'n_estimator': ex_n_estimator_lst,
        'max_depth': ex_max_depth_lst,
        'learning_rate': ex_learning_rate_lst,
        'subsample': ex_subsample, 
        'colsample_bytree': ex_colsample_bytree,
        'MAE': avg_mae_lst,
        'r2': avg_r2_lst,
        'Accuracy': avg_accuracy_lst,
    }
    
    #sort by highest accuracy, then highest r squared
    df = pd.DataFrame(d).sort_values(["Accuracy", "r2"], ascending = (False, False))
    return df

In [8]:
xgb_parameter_options = find_best_parameter_xgb(df=master_table)

100%|███████████████████████████████████████████| 42/42 [00:01<00:00, 34.11it/s]
100%|███████████████████████████████████████████| 42/42 [00:01<00:00, 37.65it/s]


In [9]:
xgb_parameter_options.head()

Unnamed: 0,n_estimator,max_depth,learning_rate,subsample,colsample_bytree,MAE,r2,Accuracy
1,16,5,0.2745,1,1,0.103273,0.606266,0.833333
0,16,5,0.1954,1,1,0.111044,0.595498,0.761905


In [10]:
xgb_best_parameters = dict(xgb_parameter_options.iloc[0][:5])
xgb_best_parameters

{'n_estimator': 16.0,
 'max_depth': 5.0,
 'learning_rate': 0.2745,
 'subsample': 1.0,
 'colsample_bytree': 1.0}

## LightGBM Regressor

In [89]:
n_estimator_lst = [17,23]
max_depth_lst = [4]
num_leaves_lst = [28,29,30,31,32]                          # 31 default
learning_rate_lst = [0.025, 0.05, 0.075, 0.1, 0.15]       # 0.1 default
boosting_type_lst = ['gbdt', 'dart', 'goss']

def find_best_parameter_lgbm(df): 
    avg_mae_lst = []
    avg_r2_lst = []
    avg_accuracy_lst = []
    summary_lst = []

    ex_n_estimator_lst = []
    ex_max_depth_lst = []
    ex_learning_rate_lst = []
    ex_num_leaves_lst = []
    ex_boosting_type_lst = []

    for i in n_estimator_lst:
        for j in max_depth_lst:
            for k in learning_rate_lst:
                for a in num_leaves_lst:
                    for b in boosting_type_lst: 
                        mae, avg_r2, accuracy, summary, models, cols = run_model_average(
                                                                    df=df,
                                                                      regressor = LGBMRegressor(n_estimators=i,
                                                                                               max_depth=j,
                                                                                               learning_rate=k,
                                                                                              num_leaves=a,
                                                                                              boosting_type=b,
                                                                                               random_state = 0,
                                                                                               ),
                                                                    scaling=False)
                        avg_mae_lst.append(mae)
                        avg_r2_lst.append(avg_r2)
                        avg_accuracy_lst.append(accuracy)
                        summary_lst.append(summary)

                        ex_n_estimator_lst.append(i)
                        ex_max_depth_lst.append(j)
                        ex_learning_rate_lst.append(k)
                        ex_num_leaves_lst.append(a)
                        ex_boosting_type_lst.append(b)

    d = {
        'n_estimator': ex_n_estimator_lst,
        'max_depth': ex_max_depth_lst,
        'learning_rate': ex_learning_rate_lst,
        'num_leaves': ex_num_leaves_lst, 
        'boosting_type': ex_boosting_type_lst,
        'MAE': avg_mae_lst,
        'r2': avg_r2_lst,
        'Accuracy': avg_accuracy_lst,
    }
    
    #sort by highest accuracy, then highest r squared
    df = pd.DataFrame(d).sort_values(["Accuracy", "r2"], ascending = (False, False))
    return df

In [90]:
lgbm_parameter_options = find_best_parameter_lgbm(df=master_table)

100%|██████████| 42/42 [00:01<00:00, 30.67it/s]
100%|██████████| 42/42 [00:01<00:00, 39.65it/s]
100%|██████████| 42/42 [00:00<00:00, 49.44it/s]
100%|██████████| 42/42 [00:00<00:00, 49.20it/s]
100%|██████████| 42/42 [00:00<00:00, 68.20it/s]
100%|██████████| 42/42 [00:00<00:00, 48.71it/s]
100%|██████████| 42/42 [00:00<00:00, 52.22it/s]
100%|██████████| 42/42 [00:00<00:00, 69.85it/s]
100%|██████████| 42/42 [00:00<00:00, 72.76it/s]
100%|██████████| 42/42 [00:00<00:00, 73.83it/s]
100%|██████████| 42/42 [00:00<00:00, 61.57it/s]
100%|██████████| 42/42 [00:00<00:00, 69.27it/s]
100%|██████████| 42/42 [00:00<00:00, 75.43it/s]
100%|██████████| 42/42 [00:00<00:00, 62.88it/s]
100%|██████████| 42/42 [00:00<00:00, 68.31it/s]
100%|██████████| 42/42 [00:00<00:00, 66.35it/s]
100%|██████████| 42/42 [00:00<00:00, 67.37it/s]
100%|██████████| 42/42 [00:00<00:00, 67.49it/s]
100%|██████████| 42/42 [00:00<00:00, 62.29it/s]
100%|██████████| 42/42 [00:00<00:00, 69.95it/s]
100%|██████████| 42/42 [00:00<00:00, 68.

In [91]:
lgbm_parameter_options.head()

Unnamed: 0,n_estimator,max_depth,learning_rate,num_leaves,boosting_type,MAE,r2,Accuracy
137,23,4,0.15,28,goss,0.106055,0.611992,0.761905
140,23,4,0.15,29,goss,0.106055,0.611992,0.761905
143,23,4,0.15,30,goss,0.106055,0.611992,0.761905
146,23,4,0.15,31,goss,0.106055,0.611992,0.761905
149,23,4,0.15,32,goss,0.106055,0.611992,0.761905


In [92]:
lgbm_best_parameters = dict(lgbm_parameter_options.iloc[0][:5])
lgbm_best_parameters

{'n_estimator': 23,
 'max_depth': 4,
 'learning_rate': 0.15,
 'num_leaves': 28,
 'boosting_type': 'goss'}