# Modeling 
Modeling is the final phase of the NBA MVP Prediction Project. In this notebook, we load the standardized data produced in preprocessing and test different models and parameters. Once the models have been selected, we evaluate each one; because the models used here are regressors that predict a player's award share, they are scored with mean squared error and R² rather than classification metrics such as accuracy, recall, precision and F1. Overall, the goal of the model is to predict the NBA MVP for any given season.

In [None]:
! pip install xgboost

In [None]:
#Core packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#To suppress warnings
import warnings
warnings.filterwarnings('ignore')

#Adjusting display settings
%matplotlib inline
pd.set_option('display.max_columns',None)

#Packages used for modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


The first step is to load the dataframes created in the previous phases. This notebook focuses on the final dataframes created in preprocessing.

In [None]:
#Reading both preprocessing outputs in one pass (final_df first, then testing_df)
final_df, testing_df = map(
    pd.read_csv,
    ('../data/final_df.csv', '../data/testing_df.csv'),
)

In [None]:
#Previewing the first five rows of final_df
final_df.head(n=5)

Adding the season and player columns back into testing_df. They were dropped in the preprocessing phase but are needed to continue with modeling.

In [None]:
#Looking up each row's season in final_df by row label and attaching it to testing_df
# NOTE(review): this matches rows purely on the index produced by read_csv -
# confirm that row i of testing_df really corresponds to row i of final_df.
season_by_row = final_df['season'].reindex(index=testing_df.index)
testing_df['season'] = season_by_row

In [None]:
#Looking up each row's player name in final_df by row label and attaching it to testing_df
# NOTE(review): this matches rows purely on the index produced by read_csv -
# confirm that row i of testing_df really corresponds to row i of final_df.
player_by_row = final_df['player'].reindex(index=testing_df.index)
testing_df['player'] = player_by_row

In [None]:
#Previewing the first five rows of testing_df
testing_df.head(n=5)


In [None]:
#Counting the missing values in each column of testing_df
# A per-column count is shown instead of the full True/False frame, which is
# as large as the data itself and cannot be read at a glance.
testing_df.isnull().sum()

In [None]:

#Encoding player names as integer IDs; `le` is kept so IDs can be turned back into names
le = LabelEncoder()
player_ids = le.fit_transform(testing_df['player'])

#player_df is testing_df plus the encoded player_id column
player_df = testing_df.assign(player_id=player_ids)

#Creating X and y variables
# X keeps every column except the target (including 'player' and 'season'),
# because later cells read X['player'], drop identifier columns from X and
# read X_test['season']. Previously X held only player_id, so those cells
# raised KeyError and the models had no real features to learn from.
X = player_df.drop(columns=['max_award_share'])
y = player_df['max_award_share']

In [None]:
#Creating X and y variables
#X = testing_df.drop(['max_award_share'], axis=1)
#y = testing_df['max_award_share']

#le = LabelEncoder()
#le.fit(X['player'])
#X['player_id'] = le.transform(X['player'])

#unique_players = le.inverse_transform(X['player_id'].unique())

In [None]:
#Lookup from encoded player_id back to player name, used to make results readable
# Built from player_df, which holds both the 'player_id' and 'player' columns.
# y is deliberately left untouched: it holds max_award_share values, and
# passing those through an id -> name lookup raises KeyError (and would
# overwrite the regression target if it succeeded).
player_id_map = dict(zip(player_df['player_id'], player_df['player']))

In [None]:
#le = LabelEncoder()
#testing_df['player_id'] = le.fit_transform(testing_df['player'])
#unique_players = testing_df['player'].unique

In [None]:
#Creating X and y variables
#X = testing_df.drop(['max_award_share'], axis=1)
#y = testing_df['max_award_share']


In [None]:
#Removing columns that should not be used as model features
# 'player' is the raw name (its numeric encoding is player_id).
# NOTE(review): 'Unnamed: 0' looks like a saved CSV index and award_share_0/1
# look target-related - confirm against the preprocessing notebook.
# errors='ignore' plus reassignment keeps this cell safe to re-run and safe
# when a listed column is not present in X.
X = X.drop(
    columns=['player', 'Unnamed: 0', 'award_share_0', 'award_share_1'],
    errors='ignore',
)

In order to make the results more readable, we're going to add player names back into the dataframe, but first they must be converted to numeric values so the models can still function.

We will check the X and y variables for any missing values. This dataset is expected to have missing values because some statistical categories were not tracked in earlier seasons. For future testing, we may want to consider filling those columns with their column average instead.

In [None]:
#Reporting whether X or y contain missing or infinite values (before filling)
value_checks = (
    ('missing', lambda data: data.isnull().values.any()),
    ('infinite', lambda data: np.isinf(data).values.any()),
)
for kind, has_any in value_checks:
    for label, data in (('X', X), ('y', y)):
        print(f'{label} has {kind} values: {has_any(data)}')

In [None]:
#Replacing every missing feature value with 0
X = X.fillna(0)

In [None]:
#Re-running the missing / infinite value report now that X has been filled
for kind, detect in (('missing', pd.isnull), ('infinite', np.isinf)):
    for label, data in (('X', X), ('y', y)):
        found = detect(data).values.any()
        print(f'{label} has {kind} values: {found}')

In [None]:
#Splitting data into training and testing sets
#Holding out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Defining the models 
models = {'Random Forest Regressor': RandomForestRegressor(),
          'Linear Regression': LinearRegression(),
          'XGBoost': XGBRegressor(),
          'LightGB': LGBMRegressor()
          }

In [None]:
#Starting with an empty dictionary that will hold the testing results
model_results = dict()

Each model is fit on the training data and used to predict on the testing data. The models are then evaluated and the results are added to a dataframe so they can be visualized.

In [None]:
for name, model in models.items():
    #Fitting on the training split and predicting the target for the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    #Evaluating the performance metrics of each model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    #Row-level view of the test set: who, which season, actual vs predicted share
    # season and player are looked up in testing_df by row label, so this works
    # whether or not those columns are present in X.
    held_out = testing_df.loc[X_test.index]
    predictions = pd.DataFrame({
        'season': held_out['season'].to_numpy(),
        'player': held_out['player'].to_numpy(),
        'actual_share': np.asarray(y_test),
        'predicted_share': np.asarray(y_pred).ravel(),
    })

    #Picking one MVP per season: the player with the highest award share
    # The model predicts a continuous award share, not a player ID, so the
    # predictions cannot be decoded with le.inverse_transform.
    # NOTE(review): assumes the MVP of a season is the row with the largest
    # max_award_share, and only rows in the test split are compared - confirm
    # this matches the intended evaluation.
    by_season = predictions.groupby('season')
    predicted_rows = by_season['predicted_share'].idxmax()
    actual_rows = by_season['actual_share'].idxmax()
    predicted_mvp = predictions.loc[predicted_rows, 'player'].to_numpy()
    actual_mvp = predictions.loc[actual_rows, 'player'].to_numpy()

    #Creating a dataframe with the results of each model (one row per season)
    df = pd.DataFrame({
        'mean_squared_error': mse,
        'r2_score': r2,
        'season': predicted_rows.index.to_numpy(),
        'predicted_mvp': predicted_mvp,
        'actual_mvp': actual_mvp
    })

    #Splitting the seasons into correct and incorrect predictions
    # .assign returns new frames, so no value is set on a filtered slice.
    is_correct = df['predicted_mvp'] == df['actual_mvp']
    df_correct = df[is_correct].assign(model_correct='Yes')
    df_incorrect = df[~is_correct].assign(model_correct='No')

    #Concatenating the correct and incorrect dataframes
    df_final = pd.concat([df_correct, df_incorrect], ignore_index=True)

    #Adding the model to the dataframe
    df_final['model'] = name

    #Storing the results so every model is kept, not only the last one
    model_results[name] = df_final

    #Plotting the results for the model
    fig, ax = plt.subplots()
    ax.scatter(df_final['predicted_mvp'], df_final['actual_mvp'])
    ax.set_xlabel('Predicted MVP')
    ax.set_ylabel('Actual MVP')
    ax.set_title(name)
    plt.show()

#Combining the results of all models into one dataframe
results_df = pd.concat([model_results[name] for name in models], ignore_index=True)
# Boolean companion of model_correct, convenient as a plotting hue.
results_df['correct'] = results_df['model_correct'] == 'Yes'


Exploratory analysis of the model results

In [None]:
#Previewing the first five rows of df_final
df_final.head(n=5)


In [None]:
#Plotting the mean squared error of each model
# df_final only holds the model that was evaluated last. When model_results
# contains per-model result frames they are combined so every model gets a
# bar; otherwise the chart falls back to df_final.
mse_source = (
    pd.concat(model_results.values(), ignore_index=True)
    if model_results else df_final
)
# The error is repeated on every row of a model, so keep one value per model
# instead of drawing one overlapping bar per row.
mse_by_model = mse_source.groupby('model', sort=False)['mean_squared_error'].first()

fig, ax = plt.subplots()
ax.bar(mse_by_model.index, mse_by_model.to_numpy())
ax.set_xlabel('Model')
ax.set_ylabel('Mean Squared Error')
plt.show()

In [None]:
#Building the dataframe that the summary figure reads from
# Uses the per-model result frames in model_results when there are any,
# otherwise falls back to df_final (the model evaluated last).
results_df = (
    pd.concat(model_results.values(), ignore_index=True)
    if model_results else df_final
)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

#Top left: predicted vs actual MVP, coloured by whether the prediction was right
# The hue uses 'model_correct', which is the column the results frames carry.
sns.scatterplot(ax=axes[0,0], data=results_df, x='predicted_mvp', y='actual_mvp', hue='model_correct')
axes[0,0].set_xlabel('Predicted MVP')
axes[0,0].set_ylabel('Actual MVP')
axes[0,0].set_title('Predicted vs Actual MVP')

#Top right: mean squared error of each model
sns.boxplot(ax=axes[0,1], data=results_df, x='model', y='mean_squared_error')
axes[0,1].set_xlabel('Model')
axes[0,1].set_ylabel('Mean Squared Error')
axes[0,1].set_title('Mean Squared Error by Model')

#Bottom left: R2 score of each model
sns.boxplot(ax=axes[1,0], data=results_df, x='model', y='r2_score')
axes[1,0].set_xlabel('Model')
axes[1,0].set_ylabel('R2 Score')
axes[1,0].set_title('R2 Score by Model')

#Bottom right: number of correct and incorrect predictions
sns.countplot(ax=axes[1,1], data=results_df, x='model_correct')
axes[1,1].set_xlabel('Prediction Correct')
axes[1,1].set_ylabel('Count')
axes[1,1].set_title('Correct vs Incorrect Predictions')

plt.tight_layout()
plt.show()