# Model Selection
### Imports

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

### Simple Linear Regression
We will eventually need to try many different models and tune their parameters, but for now we just set up a simple linear regression model and evaluate it with K-fold cross-validation in which each fold is one season: the model is trained on all other seasons, and the held-out season is used for validation.

In [2]:
# Read data from data cleaning
total_data_df = pd.read_csv('output.csv')

# Print the columns
print(total_data_df.columns)

# Columns fed to the model, and the column the model is trained to predict
features = ['Pos_PG', 'Pos_SG', 'Pos_SF', 'Pos_PF', 'Pos_C', 'G', 'GS', 'MP', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'eFG%', 'ORB', 'DRB', 'AST', 'BLK', 'PTS', 'PER', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Tm_Rcrd']
target = 'MVP_Shr'

# Season held out for testing; every other season is used for training
holdout_season = '2022_23'

# Build the row filter once and reuse it for all four splits and the player names
is_holdout = total_data_df['Szn'] == holdout_season
if not is_holdout.any():
    # Fail early with a clear message instead of an opaque error from predict() on an empty matrix
    raise ValueError(f"No rows with Szn == {holdout_season!r} in output.csv; cannot build a test set")

# Prepare the data
X_train = total_data_df.loc[~is_holdout, features]
X_test = total_data_df.loc[is_holdout, features]
y_train = total_data_df.loc[~is_holdout, target]
y_test = total_data_df.loc[is_holdout, target]

# Print X_train shape
print(X_train.shape)
# Print y_train shape
print(y_train.shape)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Show the NBA MVP predictions for the held-out season.
# 'Player' keeps the original row index, so each prediction stays aligned with its row in total_data_df.
predictions = pd.DataFrame({'Player': total_data_df.loc[is_holdout, 'Plyr'], 'Predicted MVP Share': y_pred})
# Bare expression: the notebook renders it only when it is the final statement of its cell.
# NOTE(review): if more code follows in the same cell, wrap this in display(...) so the ranking is shown.
predictions.sort_values(by='Predicted MVP Share', ascending=False)
# Leave-one-season-out loop: for each season, fit on every other season and
# print the player with the highest predicted MVP share in the held-out season.
# features/target are repeated here so this block can run without the single-season code above.
features = ['Pos_PG', 'Pos_SG', 'Pos_SF', 'Pos_PF', 'Pos_C', 'G', 'GS', 'MP', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'eFG%', 'ORB', 'DRB', 'AST', 'BLK', 'PTS', 'PER', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Tm_Rcrd']
target = 'MVP_Shr'

# ToDo: just keep working on this loop, and make it work for every model in the future
# Season labels '2000_01' ... '2022_23'; the suffix is the last two digits of the ending year.
season_list = [f"{year}_{(year + 1) % 100:02d}" for year in range(2000, 2023)]

for season in season_list:
    # Build the row filter once per season and reuse it for every split
    is_holdout = total_data_df['Szn'] == season
    if not is_holdout.any():
        # A season missing from the data would give an empty test matrix, which
        # scikit-learn rejects; skip it so the remaining seasons are still processed.
        print(f'No rows for the {season} season; skipping.')
        print('\n')
        continue

    X_train = total_data_df.loc[~is_holdout, features]
    X_test = total_data_df.loc[is_holdout, features]
    y_train = total_data_df.loc[~is_holdout, target]
    y_test = total_data_df.loc[is_holdout, target]

    # A fresh model per season so no fitted state carries over between folds
    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # 'Player' keeps the original row index, so predictions stay aligned with their rows
    predictions = pd.DataFrame({'Player': total_data_df.loc[is_holdout, 'Plyr'], 'Predicted MVP Share': y_pred})
    print(f'MVP Predictions for the {season} season:')
    # Only the top-ranked player is shown for each season
    print(predictions.sort_values(by='Predicted MVP Share', ascending=False).head(1))
    print('\n')

Index(['Plyr', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
       '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'id', 'Szn', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'MVP_Rnk', 'MVP_Shr', 'Tm_Rcrd', 'Pos_C', 'Pos_PF', 'Pos_PG',
       'Pos_SF', 'Pos_SG'],
      dtype='object')
(2127, 39)
(2127,)


NameError: name 'season_list' is not defined