# Modeling 
This final phase of the NBA MVP Prediction Project will be concluded with modeling. In this notebook, we will recall the data from preprocessing that has been standardized and begin testing different parameters and models. Once the models have been selected, we will test each model for accuracy, recall, precision and F1 scores. Overall, the goal of the model is to predict the NBA MVP for any given season.

In [29]:
#Core packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#To surpress warnings
import warnings
warnings.filterwarnings('ignore')

#Adjusting display settings
%matplotlib inline
pd.set_option('display.max_columns',None)

#Packages used for modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor



The first step is to load in the dataframes created from the previous phases, this notebook will be focusing in on the final dataframes created in preprocessing

In [5]:
df_per_35 = pd.read_csv('../data/df_per_35.csv')
testing_df = pd.read_csv('../data/testing_df.csv')

In [6]:
df_per_35.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,season,player,pos,age,g,team_id,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,fg2a_per_g,fg2_pct,efg_pct,ft_per_g,fta_per_g,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws_per_48,obpm,dbpm,bpm,vorp,award_share,mov,mov_adj,win_loss_pct,mvp_won
0,0,0,1982,Kareem Abdul-Jabbar,C,34,76,LAL,76,35.0,9.84375,17.002841,0.57571,0.0,0.0,0.0,9.84375,17.002841,0.576705,0.57571,4.076705,5.767045,0.701989,2.286932,6.363636,8.650568,2.982955,0.795455,2.684659,2.982955,2.883523,23.764205,2661.789773,23.267045,0.604545,0.001989,0.338068,7.258523,18.991477,13.323864,11.832386,1.09375,4.076705,13.224432,25.454545,6.860795,3.877841,10.639205,0.190909,3.778409,1.193182,4.971591,4.673295,0.045,4.84233,4.34517,0.691051,No
1,1,1,1982,Alvan Adams,C,27,79,PHO,75,35.0,7.392739,15.016502,0.570627,0.0,0.0,0.0,7.392739,15.016502,0.570627,0.570627,2.656766,3.349835,0.902145,1.963696,6.584158,8.547855,5.19802,1.617162,1.155116,2.887789,3.927393,17.442244,2764.191419,21.485149,0.611056,0.001155,0.262211,7.623762,23.333333,15.709571,25.528053,2.656766,2.194719,17.09571,26.336634,2.887789,5.429043,8.316832,0.166337,1.617162,2.541254,4.158416,3.927393,0.0,3.985149,3.523102,0.64802,No
2,2,2,1982,Mark Aguirre,SF,22,51,DAL,20,35.0,9.114583,19.565972,0.565104,0.607639,1.701389,0.427778,8.506944,17.864583,0.577257,0.583333,4.010417,5.833333,0.826389,2.065972,3.767361,5.954861,3.888889,0.850694,0.486111,3.159722,3.645833,22.725694,1784.027778,21.024306,0.624653,0.105729,0.365799,8.142361,15.677083,11.788194,22.604167,1.458333,1.09375,15.434028,36.215278,1.215278,0.972222,2.309028,0.074132,2.795139,-1.944444,0.850694,1.215278,0.0,-5.383681,-5.444444,0.41441,No
3,3,3,1982,Danny Ainge,SG,22,53,BOS,1,35.0,4.95283,13.867925,1.178774,0.330189,0.990566,0.970755,4.622642,12.54717,1.198585,1.218396,3.632075,3.962264,2.846226,1.650943,1.981132,3.632075,5.283019,2.311321,0.330189,3.301887,5.283019,13.537736,1862.264151,33.349057,1.449528,0.254245,0.970755,16.509434,19.481132,18.160377,65.04717,10.235849,0.990566,57.783019,70.990566,-0.990566,2.641509,1.650943,0.138679,-12.216981,3.301887,-8.915094,-0.330189,0.0,21.066038,20.966981,2.535849,No
4,4,4,1982,Tiny Archibald,PG,33,68,BOS,51,35.0,4.937304,10.532915,0.517868,0.109718,0.219436,0.411442,4.827586,10.31348,0.52116,0.523354,3.840125,5.047022,0.819592,0.438871,1.426332,1.865204,8.777429,0.877743,0.0,2.852665,2.084639,13.824451,2377.586207,15.689655,0.594671,0.027429,0.532132,1.426332,4.937304,3.181818,35.0,1.206897,0.109718,20.188088,19.639498,3.730408,1.974922,5.705329,0.126176,1.53605,-1.426332,0.109718,1.206897,0.0,7.0,6.967085,0.842633,No


In [7]:
testing_df.head()

Unnamed: 0.1,Unnamed: 0,season,player,fg_pct,blk_pct,usg_pct,fta_per_fga_pct,vorp,mov,mp,fg3_pct,tov_per_g,drb_per_g,mov_adj,drb_pct,tov_pct,mvp_won
0,0,1982,Kareem Abdul-Jabbar,0.57571,4.076705,25.454545,0.338068,4.673295,4.84233,2661.789773,0.0,2.982955,6.363636,4.34517,18.991477,13.224432,No
1,1,1982,Alvan Adams,0.570627,2.194719,26.336634,0.262211,3.927393,3.985149,2764.191419,0.0,2.887789,6.584158,3.523102,23.333333,17.09571,No
2,2,1982,Mark Aguirre,0.565104,1.09375,36.215278,0.365799,1.215278,-5.383681,1784.027778,0.427778,3.159722,3.767361,-5.444444,15.677083,15.434028,No
3,3,1982,Danny Ainge,1.178774,0.990566,70.990566,0.970755,-0.330189,21.066038,1862.264151,0.970755,3.301887,1.981132,20.966981,19.481132,57.783019,No
4,4,1982,Tiny Archibald,0.517868,0.109718,19.639498,0.532132,1.206897,7.0,2377.586207,0.411442,2.852665,1.426332,6.967085,4.937304,20.188088,No


In [14]:
#Defining the models 
models = {'Random Forest Regressor': RandomForestRegressor(),
          'Linear Regression': LinearRegression(),
          'XGBoost': XGBRegressor(),
          #'LightGB': LGBMRegressor()
          }

In [31]:
linear = LinearRegression()
forest = RandomForestRegressor()

In [12]:
testing_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [33]:
print(testing_df.isin([np.nan, np.inf, -np.inf]).sum())

season             0
player             0
fg_pct             0
blk_pct            0
usg_pct            0
fta_per_fga_pct    0
vorp               0
mov                3
mp                 0
fg3_pct            0
tov_per_g          0
drb_per_g          1
mov_adj            3
drb_pct            0
tov_pct            0
mvp_won            0
dtype: int64


In [34]:
testing_df = testing_df.replace([np.inf, -np.inf], 0)

In [35]:
print(testing_df.isin([np.nan, np.inf, -np.inf]).sum())

season             0
player             0
fg_pct             0
blk_pct            0
usg_pct            0
fta_per_fga_pct    0
vorp               0
mov                0
mp                 0
fg3_pct            0
tov_per_g          0
drb_per_g          0
mov_adj            0
drb_pct            0
tov_pct            0
mvp_won            0
dtype: int64


In [38]:
print(testing_df.dtypes)

season               int64
player              object
fg_pct             float64
blk_pct            float64
usg_pct            float64
fta_per_fga_pct    float64
vorp               float64
mov                float64
mp                 float64
fg3_pct            float64
tov_per_g          float64
drb_per_g          float64
mov_adj            float64
drb_pct            float64
tov_pct            float64
mvp_won             object
dtype: object


In [39]:
testing = pd.get_dummies(testing_df, columns=['mvp_won'])

In [40]:
testing.head()

Unnamed: 0,season,player,fg_pct,blk_pct,usg_pct,fta_per_fga_pct,vorp,mov,mp,fg3_pct,tov_per_g,drb_per_g,mov_adj,drb_pct,tov_pct,mvp_won_No,mvp_won_Yes
0,1982,Kareem Abdul-Jabbar,0.57571,4.076705,25.454545,0.338068,4.673295,4.84233,2661.789773,0.0,2.982955,6.363636,4.34517,18.991477,13.224432,1,0
1,1982,Alvan Adams,0.570627,2.194719,26.336634,0.262211,3.927393,3.985149,2764.191419,0.0,2.887789,6.584158,3.523102,23.333333,17.09571,1,0
2,1982,Mark Aguirre,0.565104,1.09375,36.215278,0.365799,1.215278,-5.383681,1784.027778,0.427778,3.159722,3.767361,-5.444444,15.677083,15.434028,1,0
3,1982,Danny Ainge,1.178774,0.990566,70.990566,0.970755,-0.330189,21.066038,1862.264151,0.970755,3.301887,1.981132,20.966981,19.481132,57.783019,1,0
4,1982,Tiny Archibald,0.517868,0.109718,19.639498,0.532132,1.206897,7.0,2377.586207,0.411442,2.852665,1.426332,6.967085,4.937304,20.188088,1,0


In [43]:
def model_evaluation(df, model, scaling):
    # Create X and y variables
    X = df.drop(['player','season','mvp_won_No', 'mvp_won_Yes'], axis=1)
    y = df['mvp_won_No','mvp_won_Yes']
    
    
    # Scale the data if scaling is True
    if scaling:
        scaler = StandardScaler()
        X= scaler.fit_transform(X)
    
    # Fit the model to the training data
    model.fit(X,y)
    
    # Make predictions using the testing data
    y_pred = model.predict(X)
    
    # Calculate evaluation metrics
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    f1 = f1_score(y.argmax(axis=1), y_pred.argmax(axis=1))
    accuracy = accuracy_score(y.argmax(axis=1), y_pred.argmax(axis=1))
    recall = recall_score(y.argmax(axis=1), y_pred.argmax(axis=1))
    precision = precision_score(y.argmax(axis=1), y_pred.argmax(axis=1))
    
    # Create a data frame with evaluation metrics and actual/predicted MVP values
    result = pd.DataFrame({
        'mse': [mse],
        'r2': [r2],
        'f1': [f1],
        'accuracy': [accuracy],
        'recall': [recall],
        'precision': [precision],
        'actual_mvp': y.argmax(axis=1),
        'predicted_mvp': y_pred.argmax(axis=1)
    })
    
    return result


In [44]:
model_evaluation(df=testing, model=forest, scaling=True)

KeyError: ('mvp_won_No', 'mvp_won_Yes')