In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
import random

In [2]:
#Load data from cvs
start_year = 1977
end_year = 2024
years = range(start_year, end_year+1)
mvps = {}
roys = {}
stats = {}
rookies = {}

for year in years:
    mvps[year] = pd.read_csv(f'mvp_{year}.csv')
    mvps[year]['Year'] = year
    roys[year] = pd.read_csv(f'roy_{year}.csv')
    roys[year]['Year'] = year
    stats[year] = pd.read_csv(f'stats_{year}.csv')
    stats[year]['Year'] = year
    rookies[year] = pd.read_csv(f'rookies_{year}.csv')
    rookies[year] = pd.merge(rookies[year], stats[year][['Player','Tm', 'Pos', 'Rk', 'W', 'L']], on='Player', how='left')
    rookies[year]['Pos'] = rookies[year]['Pos'].fillna('Unkown')
    rookies[year]['Year'] = year

In [3]:
#Combine data for MVP win share with Total Player stats

def merge_stats_share(stats, award, new_col_name):
    #Remove asterix from names
    stats['Player'] = stats['Player'].str.replace('*', '', regex=False)
    award['Player'] = award['Player'].str.replace('*', '', regex=False)

    #Normalize voting shares so they sum to 100
    total_share = sum(award['Share'])
    award['Share'] = award['Share']*100/total_share
    
    merge = pd.merge(stats, award[['Player', 'Share']], on='Player', how='left')
    merge = merge.fillna(0.000)
    merge = merge.rename(columns={'Share': new_col_name})
    return merge

mvp_stats_merges = {}

for year in years:
    mvp_stats_merges[year] = merge_stats_share(stats[year],mvps[year],'MVP Vote Share')

mvp_stats_merges[2001]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,STL,BLK,TOV,PF,PTS,Rk,W,L,Year,MVP Vote Share
0,Mahmoud Abdul-Rauf,PG,31,VAN,41,0,486,120,246,0.488,...,9,1,26,50,266,26.000000,23.000000,59.000000,2001,0.0
1,Tariq Abdul-Wahad,SG,26,DEN,29,12,420,43,111,0.387,...,14,13,34,54,111,21.000000,40.000000,42.000000,2001,0.0
2,Shareef Abdur-Rahim,SF,24,VAN,81,81,3241,604,1280,0.472,...,90,77,231,238,1663,26.000000,23.000000,59.000000,2001,0.0
3,Cory Alexander,PG,27,ORL,26,0,227,18,56,0.321,...,16,0,25,29,52,16.000000,43.000000,39.000000,2001,0.0
4,Courtney Alexander,PG,23,TOT,65,24,1382,239,573,0.417,...,45,5,75,139,618,11.670732,30.817073,34.182927,2001,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,David Wingate,SG,37,SEA,1,0,9,3,3,1.000,...,0,0,0,1,6,17.000000,44.000000,38.000000,2001,0.0
422,Rubén Wolkowyski,PF,27,SEA,34,1,305,25,79,0.316,...,6,18,12,38,75,17.000000,44.000000,38.000000,2001,0.0
423,Metta World Peace,SF,21,CHI,76,74,2363,327,815,0.401,...,152,45,159,254,907,29.000000,15.000000,67.000000,2001,0.0
424,Lorenzen Wright,C,25,ATL,71,46,1988,363,811,0.448,...,42,63,125,232,881,25.000000,25.000000,57.000000,2001,0.0


In [4]:
#Combine data for ROY with rookie player stats
roy_rookies_merges = {}
for year in years:
    roy_rookies_merges[year] = merge_stats_share(rookies[year],roys[year],'ROY Vote Share')

roy_rookies_merges[2001]

Unnamed: 0,Player,Age,G,MP,FG,FGA,3P,3PA,FT,FTA,...,FG%,3P%,FT%,Tm,Pos,Rk,W,L,Year,ROY Vote Share
0,Courtney Alexander,23,65,1382,239,573,17,46,123,150,...,0.417,0.370,0.820,TOT,PG,11.670732,30.817073,34.182927,2001,0.0
1,Dalibor Bagarić,20,35,259,17,65,0,1,13,28,...,0.262,0.000,0.464,CHI,C,29.000000,15.000000,67.000000,2001,0.0
2,Erick Barkley,22,8,38,8,22,3,8,0,0,...,0.364,0.375,0.000,POR,PG,5.000000,50.000000,32.000000,2001,0.0
3,Raja Bell,24,5,30,2,7,1,3,0,0,...,0.286,0.333,0.000,PHI,SG,4.000000,56.000000,26.000000,2001,0.0
4,Mark Blount,25,64,1098,101,200,0,0,46,66,...,0.505,0.000,0.697,BOS,C,20.000000,36.000000,46.000000,2001,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Hedo Türkoğlu,21,74,1245,138,335,28,86,87,112,...,0.412,0.326,0.777,SAC,SF,2.000000,55.000000,27.000000,2001,0.0
64,David Vanterpool,27,22,411,46,110,0,6,30,50,...,0.418,0.000,0.600,WAS,PG,27.000000,19.000000,63.000000,2001,0.0
65,Jake Voskuhl,23,16,143,11,25,0,0,8,14,...,0.440,0.000,0.571,CHI,C,29.000000,15.000000,67.000000,2001,0.0
66,Rubén Wolkowyski,27,34,305,25,79,0,2,25,34,...,0.316,0.000,0.735,SEA,PF,17.000000,44.000000,38.000000,2001,0.0


In [9]:
all_mvp_data = pd.concat(mvp_stats_merges.values(), ignore_index=True)
all_roy_data = pd.concat(roy_rookies_merges.values(), ignore_index=True)

In [6]:
#Remove Categorical Data
all_mvp_data = all_mvp_data.drop(['Player', 'Tm'], axis=1)
all_roy_data = all_roy_data.drop(['Player', 'Tm'], axis=1)

#One-hot encode the Position
all_mvp_data = pd.get_dummies(all_mvp_data, columns=['Pos'])
all_roy_data = pd.get_dummies(all_roy_data, columns=['Pos'])

In [40]:
#Split training and testing data (keep each year's data together)
test_percentage = 0.2
num_years_test = int(test_percentage * (end_year-start_year))
test_years = random.sample(years, num_years_test)

mvp_train = all_mvp_data[all_mvp_data['Year'].isin(test_years) == False]
mvp_test = all_mvp_data[all_mvp_data['Year'].isin(test_years)]

roy_train = all_roy_data[all_roy_data['Year'].isin(test_years) == False]
roy_test = all_roy_data[all_roy_data['Year'].isin(test_years)]

In [41]:
#Create Features --> X = features used to predict, Y = feature to preict

mvp_x_train = mvp_train.drop(['MVP Vote Share'], axis=1)
mvp_x_test = mvp_test.drop(['MVP Vote Share'], axis=1)

mvp_y_train = mvp_train['MVP Vote Share']
mvp_y_test = mvp_test['MVP Vote Share']

roy_x_train = roy_train.drop(['ROY Vote Share'], axis=1)
roy_x_test = roy_test.drop(['ROY Vote Share'], axis=1)

roy_y_train = roy_train['ROY Vote Share']
roy_y_test = roy_test['ROY Vote Share']


In [7]:
#Function to create prediction model
def create_rfr_preds(x_train, y_train, x_test, y_test, num_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
    # Initialize the model
    model = RandomForestRegressor(n_estimators=num_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=42)

    # Fit the model
    model.fit(x_train, y_train)

    # Predict and evaluate
    preds = model.predict(x_test)
    
    #cut off low values and normalize
    significant_vote_threshold = 0.5
    preds[preds < significant_vote_threshold] = 0
    preds =(preds / sum(preds)) * 100
    
    mse = mean_squared_error(y_test, preds)

    return [model, preds, mse]

In [77]:
#MVP Model

mvp_rfr = create_rfr_preds(mvp_x_train, mvp_y_train, mvp_x_test, mvp_y_test, 100)
mvp_model = mvp_rfr[0]
mvp_preds = mvp_rfr[1]
mvp_mse = mvp_rfr[2]
                         
print(f'Mean Squared Error: {mvp_mse}')

Mean Squared Error: 5.768826277054243


In [52]:
#ROY RFR Model

roy_rfr = create_rfr_preds(roy_x_train, roy_y_train, roy_x_test, roy_y_test,74, max_depth=10, min_samples_split=2, min_samples_leaf=30)
roy_model = roy_rfr[0]
roy_preds = roy_rfr[1]
roy_mse = roy_rfr[2]
                         
print(f'Mean Squared Error: {roy_mse}')

Mean Squared Error: 53.79742896459233


In [55]:
#Predict for just one of the MVP test years

mvp_test_year = random.sample(test_years, 1)[0]
mvp_test_year_data = all_mvp_data[all_mvp_data['Year'] == mvp_test_year]
mvp_test_x =  mvp_test_year_data.drop(['MVP Vote Share'], axis=1)
mvp_test_y = mvp_test_year_data['MVP Vote Share']

mvp_test_year_preds = mvp_model.predict(mvp_test_x)

mvp_test_year_final = mvp_stats_merges[mvp_test_year].copy()
mvp_test_year_final['MVP Vote Share Prediction'] = mvp_test_year_preds
mvp_test_year_final.reset_index(drop=True)
mvp_test_year_final_sorted =  mvp_test_year_final.sort_values(by='MVP Vote Share Prediction', ascending=False)
mvp_test_year_final_sorted

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,BLK,TOV,PF,PTS,Rk,W,L,Year,MVP Vote Share,MVP Vote Share Prediction
220,Karl Malone,PF,33,UTA,82,82,2998,864,1571,0.550,...,48,233,217,2249,2.0,64.0,18.0,1997,32.961538,24.890111
184,Michael Jordan,SG,33,CHI,82,82,3106,920,1892,0.486,...,44,166,156,2431,1.0,69.0,13.0,1997,32.000000,23.466431
365,John Stockton,PG,34,UTA,82,82,2896,416,759,0.548,...,15,248,194,1183,2.0,64.0,18.0,1997,0.115385,9.632603
113,Patrick Ewing,C,34,NYK,78,78,2887,655,1342,0.488,...,189,269,250,1751,10.0,57.0,25.0,1997,1.923077,8.811325
257,Alonzo Mourning,C,26,MIA,66,65,2320,473,885,0.534,...,189,226,272,1310,5.0,61.0,21.0,1997,0.153846,8.078627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,Antonio Harvey,PF,26,SEA,6,0,26,5,11,0.455,...,4,1,8,15,3.0,57.0,25.0,1997,0.000000,0.000000
152,Lucious Harris,SG,26,PHI,54,3,813,112,294,0.381,...,3,34,45,293,25.0,22.0,60.0,1997,0.000000,0.000000
151,Othella Harrington,PF,23,HOU,57,1,860,112,204,0.549,...,22,57,112,273,7.0,57.0,25.0,1997,0.000000,0.000000
150,Ron Harper,PG,33,CHI,76,74,1740,177,406,0.436,...,38,50,138,480,1.0,69.0,13.0,1997,0.000000,0.000000


In [56]:
#Predict for just one of the ROY test years

roy_test_year = random.sample(test_years, 1)[0]
roy_test_year_data = all_roy_data[all_roy_data['Year'] == roy_test_year]
roy_test_x =  roy_test_year_data.drop(['ROY Vote Share'], axis=1)
roy_test_y = roy_test_year_data['ROY Vote Share']

roy_test_year_preds = roy_model.predict(roy_test_x)

roy_test_year_final = roy_rookies_merges[roy_test_year].copy()
roy_test_year_final['ROY Vote Share Prediction'] = roy_test_year_preds
roy_test_year_final.reset_index(drop=True)
roy_test_year_final_sorted =  roy_test_year_final.sort_values(by='ROY Vote Share Prediction', ascending=False)
roy_test_year_final_sorted

Unnamed: 0,Player,Age,G,MP,FG,FGA,3P,3PA,FT,FTA,...,3P%,FT%,Tm,Pos,Rk,W,L,Year,ROY Vote Share,ROY Vote Share Prediction
0,Shareef Abdur-Rahim,20,80,2802,550,1214,7,27,387,519,...,0.259,0.746,VAN,PF,29.000000,14.000000,68.000000,1997,21.721722,34.794850
29,Allen Iverson,21,76,3045,625,1504,155,455,382,544,...,0.341,0.702,PHI,PG,25.000000,22.000000,60.000000,1997,38.338338,34.794850
61,Antoine Walker,20,82,2970,576,1354,52,159,231,366,...,0.327,0.631,BOS,PF,27.000000,15.000000,67.000000,1997,5.205205,34.794850
30,Kerry Kittles,22,82,3012,507,1189,158,419,175,227,...,0.377,0.771,NJN,SG,22.000000,26.000000,56.000000,1997,4.304304,28.640401
36,Stephon Marbury,19,67,2324,355,871,102,288,245,337,...,0.354,0.727,MIN,PG,17.000000,40.000000,42.000000,1997,30.430430,10.717893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,Walter McCarty,22,35,192,26,68,4,14,8,14,...,0.286,0.571,NYK,PF,10.000000,57.000000,25.000000,1997,0.000000,0.000030
38,Amal McCaskill,23,17,109,10,32,0,2,8,12,...,0.000,0.667,ORL,C,16.000000,45.000000,37.000000,1997,0.000000,0.000030
42,Ruben Nembhard,24,10,113,16,37,0,6,8,10,...,0.000,0.800,TOT,SG-PG,0.414634,7.439024,2.560976,1997,0.000000,0.000030
15,Brian Evans,23,14,59,8,22,4,8,0,0,...,0.500,0.000,ORL,SF,16.000000,45.000000,37.000000,1997,0.000000,0.000029


In [None]:
#Trying to improve the ROY model --> Number of estimators
mses = [0] * 100
for j in range(10):
    #Split training and testing data (keep each year's data together)
    test_percentage = 0.2
    num_years_test = int(test_percentage * (end_year-start_year))
    test_years = random.sample(years, num_years_test)
    
    roy_train = all_roy_data[all_roy_data['Year'].isin(test_years) == False]
    roy_test = all_roy_data[all_roy_data['Year'].isin(test_years)]

    roy_x_train = roy_train.drop(['ROY Vote Share'], axis=1)
    roy_x_test = roy_test.drop(['ROY Vote Share'], axis=1)

    roy_y_train = roy_train['ROY Vote Share']
    roy_y_test = roy_test['ROY Vote Share']
    
    for i in range(1,100):
        roy_rfr = create_rfr_preds(i, roy_x_train, roy_y_train, roy_x_test, roy_y_test)
        roy_model = roy_rfr[0]
        roy_preds = roy_rfr[1]
        roy_mse = roy_rfr[2]
                                 
        print(f'Mean Squared Error with {i} estimators: {roy_mse}')
        mses[i-1] += roy_mse

In [21]:
#xgb ROY Model

# Initialize the model
roy_xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.05,
                max_depth = 5, alpha = 10, n_estimators = 100)

# Fit the model
roy_xgb_model.fit(roy_x_train, roy_y_train)

# Predict
roy_xgb_preds = roy_xgb_model.predict(roy_x_test)

# Evaluate
roy_xgb_mse = mean_squared_error(roy_y_test, roy_xgb_preds)

print(f'Mean Squared Error: {roy_xgb_mse}')

Mean Squared Error: 56.904349342462524


In [92]:
#Grid Search Best Params Function
def grid_search(param_grid, cv, x_train, y_train):
    #Create model
    model = RandomForestRegressor(random_state=42)
    
    # Setup the grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
    
    # Fit grid search
    best_model = grid_search.fit(x_train, y_train)
    
    # Get the best parameters and the corresponding score
    return {"parameters": best_model.best_params_, "score": best_model.best_score_}

In [97]:
# Running Grid Search on ROY Model to find optimal values
# Running Grid Search on MVP Model to find optimal values

grid_search_mvp = grid_search(param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 10, 20, 30, 40],
    'min_samples_leaf': [1, 5, 10, 20, 30]
}, cv = 5, x_train=roy_x_train, y_train = roy_y_train)

parameters = grid_search_mvp['parameters']
score = grid_search_mvp['score']

print(f'Optimal parameters: {parameters}')
print(f'Best Score: {score}')

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Optimal parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 30}
Best Score: -42.011657079583415


In [98]:
# Running Grid Search on MVP Model to find optimal values

grid_search_mvp = grid_search(param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 10, 20, 30, 40],
    'min_samples_leaf': [1, 5, 10, 20, 30]
}, cv = 5, x_train=mvp_x_train, y_train = mvp_y_train)

parameters = grid_search_mvp['parameters']
score = grid_search_mvp['score']

print(f'Optimal parameters: {parameters}')
print(f'Best Score: {score}')

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Optimal parameters: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: -2.4448247287608718


In [10]:
#Preparing PER48 Models:
all_mvp_data_per_48 = all_mvp_data.copy()
all_roy_data_per_48 = all_roy_data.copy()

new_cols = ['FGP48', 'FGAP48', '3PP48', '3PAP48', '2PP48', '2PAP48', 'FTP48', 'FTAP48', 'ORBP48', 'DRBP48', 'TRBP48', 'ASTP48',
       'STLP48', 'BLKP48', 'TOVP48', 'PFP48', 'PTSP48']
old_cols = ['FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS']

all_roy_data_per_48['2P'] = all_roy_data_per_48['FG'] - all_roy_data_per_48['3P']
all_roy_data_per_48['2PA'] = all_roy_data_per_48['FGA'] - all_roy_data_per_48['3PA']
all_roy_data_per_48['DRB'] = all_roy_data_per_48['TRB'] - all_roy_data_per_48['ORB']

all_mvp_data_per_48[new_cols] = all_mvp_data_per_48[old_cols].div(all_mvp_data_per_48['MP'], axis=0) * 48 
all_mvp_data_per_48.replace([np.inf, -np.inf], 0, inplace=True)
all_mvp_data_per_48 = all_mvp_data_per_48.fillna(0)
all_roy_data_per_48[new_cols] = all_roy_data_per_48[old_cols].div(all_roy_data_per_48['MP'], axis=0) * 48 
all_roy_data_per_48.replace([np.inf, -np.inf], 0, inplace=True)
all_roy_data_per_48 = all_roy_data_per_48.fillna(0)

#Remove Categorical Data
all_mvp_data_per_48 = all_mvp_data_per_48.drop(['Player', 'Tm'], axis=1)
all_roy_data_per_48 = all_roy_data_per_48.drop(['Player', 'Tm'], axis=1)

#One-hot encode the Position
all_mvp_data_per_48 = pd.get_dummies(all_mvp_data_per_48, columns=['Pos'])
all_roy_data_per_48 = pd.get_dummies(all_roy_data_per_48, columns=['Pos'])

#Split training and testing data (keep each year's data together)
test_percentage = 0.2
num_years_test = int(test_percentage * (end_year-start_year))
test_years = random.sample(years, num_years_test)

mvp_train_per_48 = all_mvp_data_per_48[all_mvp_data_per_48['Year'].isin(test_years) == False]
mvp_test_per_48 = all_mvp_data_per_48[all_mvp_data_per_48['Year'].isin(test_years)]

roy_train_per_48 = all_roy_data_per_48[all_roy_data_per_48['Year'].isin(test_years) == False]
roy_test_per_48 = all_roy_data_per_48[all_roy_data_per_48['Year'].isin(test_years)]

#Create Features --> X = features used to predict, Y = feature to preict
mvp_x_train_per_48 = mvp_train_per_48.drop(['MVP Vote Share'], axis=1)
mvp_x_test_per_48 = mvp_test_per_48.drop(['MVP Vote Share'], axis=1)

mvp_y_train_per_48 = mvp_train_per_48['MVP Vote Share']
mvp_y_test_per_48 = mvp_test_per_48['MVP Vote Share']

roy_x_train_per_48 = roy_train_per_48.drop(['ROY Vote Share'], axis=1)
roy_x_test_per_48 = roy_test_per_48.drop(['ROY Vote Share'], axis=1)

roy_y_train_per_48 = roy_train_per_48['ROY Vote Share']
roy_y_test_per_48 = roy_test_per_48['ROY Vote Share']

In [11]:
#Per 48 MVP Model:

#MVP Model

mvp_per_48_rfr = create_rfr_preds(mvp_x_train_per_48, mvp_y_train_per_48, mvp_x_test_per_48, mvp_y_test_per_48, 100)
mvp_per_48_model = mvp_per_48_rfr[0]
mvp_per_48_preds = mvp_per_48_rfr[1]
mvp_per_48_mse = mvp_per_48_rfr[2]
                         
print(f'Mean Squared Error: {mvp_per_48_mse}')

Mean Squared Error: 4.767449508526435


In [None]:
#Per 48 ROY Model:

#ROY Model

roy_per_48_rfr = create_rfr_preds(roy_x_train_per_48, roy_y_train_per_48, roy_x_test_per_48, roy_y_test_per_48, num_estimators=74, max_depth=10, min_samples_split=2, min_samples_leaf=30)
roy_per_48_model = roy_per_48_rfr[0]
roy_per_48_preds = roy_per_48_rfr[1]
roy_per_48_mse = roy_per_48_rfr[2]
                         
print(f'Mean Squared Error: {roy_per_48_mse}')

In [None]:
#Optimizing MVP per 48 model

# Running Grid Search on MVP Model to find optimal values

grid_search_mvp = grid_search(param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 10, 20, 30, 40],
    'min_samples_leaf': [1, 5, 10, 20, 30]
}, cv = 5, x_train=mvp_x_train_per_48, y_train = mvp_y_train_per_48)

parameters = grid_search_mvp['parameters']
score = grid_search_mvp['score']

print(f'Optimal parameters: {parameters}')
print(f'Best Score: {score}')

In [None]:
#Optimizing ROY per 48 model

# Running Grid Search on MVP Model to find optimal values

grid_search_mvp = grid_search(param_grid = {
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 10, 20, 30, 40],
    'min_samples_leaf': [1, 5, 10, 20, 30]
}, cv = 5, x_train=roy_x_train_per_48, y_train = roy_y_train_per_48)

parameters = grid_search_mvp['parameters']
score = grid_search_mvp['score']

print(f'Optimal parameters: {parameters}')
print(f'Best Score: {score}')

In [104]:
#MVP model is done

mvp_per_48_rfr = create_rfr_preds(mvp_x_train_per_48, mvp_y_train_per_48, mvp_x_test_per_48, mvp_y_test_per_48, num_estimators = 50, max_depth = 10, min_samples_leaf = 1, min_samples_split = 2)
mvp_per_48_model = mvp_per_48_rfr[0]
mvp_per_48_preds = mvp_per_48_rfr[1]
mvp_per_48_mse = mvp_per_48_rfr[2]
                         
print(f'Mean Squared Error: {mvp_per_48_mse}')

Mean Squared Error: 4.7284179672612625


In [107]:
# Fit a linear regressor on your initial predictions vs actual values
calibrator = LinearRegression()
calibrator.fit(roy_per_48_preds.reshape(-1, 1), roy_y_test_per_48)

# Calibrate the predictions
calibrated_roy_preds = calibrator.predict(roy_per_48_preds.reshape(-1, 1))

# Evaluate the calibrated predictions
calibrated_roy_mse = mean_squared_error(roy_y_test_per_48, calibrated_roy_preds)
print(f'Calibrated Mean Squared Error: {calibrated_roy_mse}')

Calibrated Mean Squared Error: 35.0220472035923


In [111]:
roy_iso_regressor = IsotonicRegression(out_of_bounds='clip')
roy_iso_regressor.fit(roy_per_48_preds, roy_y_test_per_48)

# Apply isotonic regression to adjust the predictions
calibrated_roy_preds_iso = roy_iso_regressor.predict(roy_per_48_preds)

# Evaluate the calibrated predictions
calibrated_roy_mse_iso = mean_squared_error(roy_y_test_per_48, calibrated_roy_preds_iso)
print(f'ROY Calibrated Mean Squared Error with Isotonic Regression: {calibrated_roy_mse_iso}')


Calibrated Mean Squared Error with Isotonic Regression: 20.529891079328376


In [130]:
calibrated_mvp_model = IsotonicRegression(out_of_bounds='clip')
calibrated_mvp_model.fit(mvp_per_48_preds, mvp_y_test_per_48)

# Apply isotonic regression to adjust the predictions
calibrated_mvp_preds_iso = calibrated_mvp_model.predict(mvp_per_48_preds)

# Evaluate the calibrated predictions
calibrated_mvp_mse_iso = mean_squared_error(mvp_y_test_per_48, calibrated_mvp_preds_iso)
print(f'MVP Calibrated Mean Squared Error with Isotonic Regression: {calibrated_mvp_mse_iso}')

ValueError: Found input variables with inconsistent numbers of samples: [3839, 3899]

In [127]:
# Define base models
estimators = [
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42, max_depth=15, min_samples_leaf=1, min_samples_split=30)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42))
]

# Initialize Stacking Regressor
roy_stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=100)
)

# Fit the model
roy_stack_model.fit(roy_x_train_per_48, roy_y_train_per_48)
roy_stacked_preds = roy_stack_model.predict(roy_x_test_per_48)

# Evaluate the stacked model
roy_stacked_mse = mean_squared_error(roy_y_test_per_48, roy_stacked_preds)
print(f'ROY Stacked Model MSE: {roy_stacked_mse}')

Stacked Model MSE: 29.83042763908475


In [129]:
calibrated_roy_model = IsotonicRegression(out_of_bounds='clip')
calibrated_roy_model.fit(roy_stacked_preds, roy_y_test_per_48)

# Apply isotonic regression to adjust the predictions
calibrated_roy_preds = calibrated_roy_model.predict(roy_stacked_preds)

# Evaluate the calibrated predictions
calibrated_roy_mse = mean_squared_error(roy_y_test_per_48, calibrated_roy_preds)
print(f'ROY Calibrated Mean Squared Error with Isotonic Regression: {calibrated_roy_mse}')

ROY Calibrated Mean Squared Error with Isotonic Regression: 16.320979957989206
