# Model Selection
### Imports

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

### Import Data
Import the data generated by the data_clean.ipynb notebook

In [2]:
#Read data from data cleaning
total_data_df = pd.read_csv('output.csv')

### Simple Linear Regression
We're going to have to try a lot of different models and parameter tuning, but for now we'll just set up a simple Linear Regression model via K-Fold Cross Validation, where the set we leave out (for validation), will be a set corresponding to a particular season

In [4]:
# Print the columns
print(total_data_df.columns)

# Select features and target variable - This is where we get rid of useless columns (by not selecting them)
features = ['Pos_PG', 'Pos_SG', 'Pos_SF', 'Pos_PF', 'Pos_C', 'G', 'GS', 'MP', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'eFG%', 'ORB', 'DRB', 'AST', 'BLK', 'PTS', 'PER', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Tm_Rcrd']
target = 'MVP_Shr'

# Prepare the data
X_train = total_data_df[total_data_df['Szn'] != '2022_23'][features]
X_test = total_data_df[total_data_df['Szn'] == '2022_23'][features]
y_train = total_data_df[total_data_df['Szn'] != '2022_23'][target]
y_test = total_data_df[total_data_df['Szn'] == '2022_23'][target]

# Print X_train shape
print(X_train.shape)
# Print Y_train shape
print(y_train.shape)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Show the NBA MVP predictions for the 2022_23 season
predictions = pd.DataFrame({'Player': total_data_df[total_data_df['Szn'] == '2022_23']['Plyr'], 'Predicted MVP Share': y_pred})
predictions.sort_values(by='Predicted MVP Share', ascending=False)
# make a for loop to predict the MVP share for each season
features = ['Pos_PG', 'Pos_SG', 'Pos_SF', 'Pos_PF', 'Pos_C', 'G', 'GS', 'MP', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'eFG%', 'ORB', 'DRB', 'AST', 'BLK', 'PTS', 'PER', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Tm_Rcrd']
target = 'MVP_Shr'

# ToDo: just keep working on this loop, and make it work for every model in the future
season_list = [f"{year}_{year + 1 - 2000:02}" for year in range(2000, 2023)]

for season in season_list:
    X_train = total_data_df[total_data_df['Szn'] != season][features]
    X_test = total_data_df[total_data_df['Szn'] == season][features]
    y_train = total_data_df[total_data_df['Szn'] != season][target]
    y_test = total_data_df[total_data_df['Szn'] == season][target]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    predictions = pd.DataFrame({'Player': total_data_df[total_data_df['Szn'] == season]['Plyr'], 'Predicted MVP Share': y_pred})
    print(f'MVP Predictions for the {season} season:')
    print(predictions.sort_values(by='Predicted MVP Share', ascending=False).head(1))
    print('\n')

Index(['Plyr', 'G', 'GS', 'MP', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'eFG%',
       'ORB', 'DRB', 'AST', 'BLK', 'PTS', 'id', 'Szn', 'PER', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS',
       'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'MVP_Rnk', 'MVP_Shr',
       'Tm_Rcrd', 'Pos_C', 'Pos_PF', 'Pos_PG', 'Pos_SF', 'Pos_SG'],
      dtype='object')
(2127, 39)
(2127,)
MVP Predictions for the 2000_01 season:
              Player  Predicted MVP Share
58  Shaquille O'Neal             0.284393


MVP Predictions for the 2001_02 season:
         Player  Predicted MVP Share
117  Tim Duncan             0.331543


MVP Predictions for the 2002_03 season:
            Player  Predicted MVP Share
252  Tracy McGrady             0.356659


MVP Predictions for the 2003_04 season:
            Player  Predicted MVP Share
324  Kevin Garnett             0.407349


MVP Predictions for the 2004_05 season:
            Player  Predicted MVP Share
421  Kevin Garnet