# Model Selection
### Imports

In [553]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


### Import Data
Load the cleaned data (`output.csv`) generated by the `data_clean.ipynb` notebook.

In [554]:
# Load the cleaned dataset (CSV) that the modelling cells below operate on.
CLEAN_DATA_CSV = 'output.csv'
total_data_df = pd.read_csv(CLEAN_DATA_CSV)


## Feature Selection + Standardization
We select the feature columns and the target below. Optionally, the features can be standardized (Z-score normalization), which rescales each feature to have a mean of 0 and a standard deviation of 1 by subtracting the feature's mean and dividing by its standard deviation. Standardization can help algorithms converge faster and perform better, especially algorithms that rely on distance metrics or gradient descent. Note that the scaling step in the code cell below is commented out by default; uncomment it to apply standardization.

In [555]:
from sklearn.preprocessing import StandardScaler

# Prediction target and the predictor columns handed to every model.
target = 'MVP_Shr'
features = [
    'Age', 'G', 'GS', 'MP',
    'FG', 'FG%', '3P', '3P%', 'eFG%',
    'ORB', 'DRB', 'AST', 'PTS',
    'PER', 'FTr', 'USG%',
    'OWS', 'DWS', 'WS',
    'OBPM', 'DBPM', 'VORP',
    'Tm_Rcrd',
]

# Previously tried, now disabled: wider feature sets without 'Age' that also
# included 'Pos_PG', 'Pos_SG', 'Pos_SF', 'Pos_PF', 'Pos_C', '2P', '2P%', 'BLK',
# '3PAr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'WS/48', 'BPM'
# and (in one variant) 'MVP_strk'.

# Optional scaling: uncomment the two lines below to standardize the feature
# columns of total_data_df in place.
# NOTE(review): this fits the scaler on every season at once, so when the data
# is later split by season the held-out season's statistics have already
# influenced the scaled training features.
# scaler = StandardScaler()
# total_data_df[features] = scaler.fit_transform(total_data_df[features])

# total_data_df.head(10)


## Experimenting with different models
We will try a number of different models and hyperparameter settings, evaluating each with a leave-one-season-out form of K-Fold Cross Validation: in every fold, the rows of one particular season are held out for validation and the model is trained on all remaining seasons.

In [556]:
# Season labels in the 'Szn' column format: '2000_01', '2001_02', ..., '2022_23'.
season_list = [f"{year}_{year + 1 - 2000:02}" for year in range(2000, 2023)]


def model_testing(model, data=None, seasons=None, feature_cols=None, target_col=None):
    """Predict each season's MVP using leave-one-season-out validation.

    For every season, ``model`` is refit on the rows of all *other* seasons
    and then used to score the held-out season's rows. The 'Plyr' value of
    the row with the highest prediction is recorded as that season's pick.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is refit in place once per season.
    data : pandas.DataFrame, optional
        Frame containing a 'Szn' column, a 'Plyr' column, the feature columns
        and the target column. Defaults to the notebook-level
        ``total_data_df``.
    seasons : iterable of str, optional
        Season labels to hold out, in order. Defaults to ``season_list``.
    feature_cols : list of str, optional
        Predictor column names. Defaults to the notebook-level ``features``.
    target_col : str, optional
        Target column name. Defaults to the notebook-level ``target``.

    Returns
    -------
    list
        One predicted MVP ('Plyr' value) per season, in the order of
        ``seasons``.

    Raises
    ------
    ValueError
        If a requested season has no rows in ``data``.
    """
    # Fall back to the notebook globals so existing `model_testing(model)`
    # calls keep working unchanged.
    data = total_data_df if data is None else data
    seasons = season_list if seasons is None else seasons
    feature_cols = features if feature_cols is None else feature_cols
    target_col = target if target_col is None else target_col

    mvp_list = []
    for season in seasons:
        # Build the season mask once and derive both splits from it.
        held_out = data['Szn'] == season
        if not held_out.any():
            raise ValueError(f"No rows found for season {season!r}")
        train_rows = data[~held_out]
        test_rows = data[held_out]

        model.fit(train_rows[feature_cols], train_rows[target_col])
        y_pred = model.predict(test_rows[feature_cols])

        # Position of the highest predicted share within the held-out rows;
        # ties resolve to the first such row.
        best_pos = pd.Series(y_pred).argmax()
        mvp_list.append(test_rows['Plyr'].iloc[best_pos])

    return mvp_list

In [557]:
# Candidate regressors keyed by display name.
# The trailing numbers appear to be the fraction of seasons whose MVP each
# model picked correctly in earlier runs; they were recorded before the models
# were seeded and may not match a fresh run.

# Fixed seed so the stochastic models give the same picks on every run.
SEED = 42

models = {
    'Linear Regression': LinearRegression(),                # 0.5652173913043478
    'Lasso': Lasso(alpha=0.01),                              # 0.5652173913043478
    'Ridge': Ridge(alpha=0.1),                              # 0.5652173913043478
    'Bagging Regression': BaggingRegressor(n_estimators=200, random_state=SEED), # 0.5652173913043478
    # max_features was 30, which is more than the number of selected features;
    # cap it at len(features) because some scikit-learn releases reject an
    # integer max_features larger than the number of columns.
    'Decision Tree': DecisionTreeRegressor(max_features=min(30, len(features)), random_state=SEED),  # 0.4782608695652174
    'Random Forest': RandomForestRegressor(n_estimators=50, max_features=5, random_state=SEED), # 0.6521739130434783
    'Gradient Boosting': GradientBoostingRegressor(max_depth=5, learning_rate=0.21, n_estimators=50, random_state=SEED),       # 0.5652173913043478
    'AdaBoost': AdaBoostRegressor(learning_rate=0.5, n_estimators=50, random_state=SEED),    # 0.6086956521739131
    'Support Vector Machine': SVR(),                        # 0.6956521739130435
    # 'Elastic Net': ElasticNet(),                            # don't even try this
    # 'Bayesian Ridge': BayesianRidge(),                      # 0.4782608695652174
    # 'K Nearest Neighbors': KNeighborsRegressor(),           # 0.5652173913043478
    # 'Multi-layer Perceptron': MLPRegressor(),               # sucks
    # 'XGBoost': XGBRegressor()                               # 0.5652173913043478
}

# Per-model list of predicted MVPs (one entry per season), in the same order
# as `models`. Kept as a list of lists because later cells index it by position.
results = []
for model_name, model_object in models.items():
    print(f'Testing {model_name}')
    results.append(model_testing(model_object))

# Actual MVP of each season, aligned with season_list. The names are compared
# verbatim against the 'Plyr' values, so the spelling must match the data.
# NOTE(review): 'Derick Rose' was corrected to 'Derrick Rose' on the assumption
# that the data uses the usual spelling; confirm against output.csv.
mvp_winners = ['Allen Iverson', 'Tim Duncan', 'Tim Duncan', 'Kevin Garnett', 'Steve Nash', 'Steve Nash', 'Dirk Nowitzki', 'Kobe Bryant', 'LeBron James', 'LeBron James', 'Derrick Rose', 'LeBron James', 'LeBron James', 'Kevin Durant', 'Stephen Curry', 'Stephen Curry', 'Russell Westbrook', 'James Harden', 'Giannis Antetokounmpo', 'Giannis Antetokounmpo', 'Nikola Jokić', 'Nikola Jokić', 'Joel Embiid']
assert len(mvp_winners) == len(season_list), "mvp_winners must have one entry per season"

# One row per season; one column per model holding that model's predicted MVP.
results_df = pd.DataFrame({
    'Season': season_list,
    'Truth': mvp_winners,
    **dict(zip(models, results)),
})



Testing Linear Regression
Testing Lasso
Testing Ridge
Testing Support Vector Machine


In [558]:
# Show the per-season table: the actual MVP ('Truth') next to each model's predicted MVP.
results_df

Unnamed: 0,Season,Truth,Linear Regression,Lasso,Ridge,Support Vector Machine
0,2000_01,Allen Iverson,Shaquille O'Neal,Shaquille O'Neal,Shaquille O'Neal,Shaquille O'Neal
1,2001_02,Tim Duncan,Tim Duncan,Tim Duncan,Tim Duncan,Tim Duncan
2,2002_03,Tim Duncan,Tracy McGrady,Tracy McGrady,Tracy McGrady,Tim Duncan
3,2003_04,Kevin Garnett,Kevin Garnett,Kevin Garnett,Kevin Garnett,Kevin Garnett
4,2004_05,Steve Nash,Kevin Garnett,Kevin Garnett,Kevin Garnett,Kevin Garnett
5,2005_06,Steve Nash,LeBron James,LeBron James,LeBron James,Dirk Nowitzki
6,2006_07,Dirk Nowitzki,LeBron James,LeBron James,LeBron James,Dirk Nowitzki
7,2007_08,Kobe Bryant,LeBron James,LeBron James,LeBron James,LeBron James
8,2008_09,LeBron James,LeBron James,LeBron James,LeBron James,LeBron James
9,2009_10,LeBron James,LeBron James,LeBron James,LeBron James,LeBron James


### Accuracy Calculation

In [559]:
# Accuracy per model: the share of seasons whose predicted MVP equals the
# actual winner. The denominator is taken from mvp_winners instead of a
# hard-coded season count, so it stays correct if the season range changes.
n_seasons = len(mvp_winners)

# `results` was filled in the iteration order of `models`, so zipping the two
# pairs every list of picks with the name of the model that produced it.
for model_name, model_picks in zip(models, results):
    correct = sum(pick == truth for pick, truth in zip(model_picks, mvp_winners))
    print(f'{model_name}: {correct / n_seasons}')


0.4782608695652174
0.5217391304347826
0.4782608695652174
0.6956521739130435
