In [68]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

In [69]:
stats=pd.read_csv(r"datasets/player_mvp_stats_adv.csv")
del stats['Unnamed: 0']
stats22_24=pd.read_csv(r"stats_2024.csv")
del stats22_24['Unnamed: 0']
stats=pd.concat([stats,stats22_24],ignore_index=True)

In [70]:
stat_ratios=stats[["PTS","AST","TRB", "STL","BLK","3P","WS","WS/48", "eFG%", "VORP","BPM",'PER',"Year"]].groupby("Year").apply(lambda x: x/x.mean())

  stat_ratios=stats[["PTS","AST","TRB", "STL","BLK","3P","WS","WS/48", "eFG%", "VORP","BPM",'PER',"Year"]].groupby("Year").apply(lambda x: x/x.mean())


In [71]:
stat_ratios.index=stat_ratios.index.droplevel()

In [72]:
stats[[
    'PTS_R',
    'AST_R',
    'TRB_R',
    'STL_R',
    'BLK_R',
    '3P_R',
    'WS_R',
    'WS/48_R',
    'eFG%_R',
    'VORP_R',
    'BPM_R',
    'PER_R'
    ]]=stat_ratios[
        ["PTS","AST","TRB", "STL","BLK","3P","WS","WS/48", "eFG%", "VORP","BPM",'PER']
    ]

In [73]:
stats["NPos"]=stats["Pos"].astype("category").cat.codes
stats["NTm"]=stats["Tm"].astype("category").cat.codes

In [74]:
f_stats=stats[
    (stats['PTS'] > 0) &  # Players who scored at least some points
    (stats['G'] >= 10) &  # Players who played in at least 10 games
    (stats['MP'] >= 20)  # Players with meaningful playing time
].copy()

In [75]:
predictors=['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
    '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
    'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
    'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'PER', 'TS%', 
    '3PAr', 'FTr', 'ORB%', 'DRB%','TRB%', 'AST%', 'STL%', 'BLK%', 
    'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 
    'BPM', 'VORP','PTS_R', 'AST_R', 'TRB_R', 'STL_R', 'BLK_R', 
    '3P_R', 'WS_R', 'WS/48_R', 'eFG%_R', 'VORP_R', 'BPM_R', 'PER_R','NPos', 'NTm']

In [76]:
train=f_stats[f_stats['Year']<2024]
test=f_stats[f_stats['Year']==2024]

In [None]:
def prep_data(train,test, predictors):
    f_test=test[
        (test['PTS'] > 15) &  # Players who scored at least some points
        (test['G'] >= 65) &  # Players who played in at least 10 games
        (test['MP'] >= 20) # Players with meaningful playing time
    ].copy()
    
    f_train=train.copy()
    
    scaler=StandardScaler()
    
    f_train[predictors]=scaler.fit_transform(f_train[predictors])
    
    f_test[predictors]=scaler.transform(f_test[predictors])
    
    return f_train, f_test

In [None]:
f_train,f_test=prep_data(train,test,predictors)

In [117]:
f_test['Year'].unique()

array([1.71935998])

In [51]:
def optimize_random_forest(X_train, y_train):
    """Perform hyperparameter optimization for Random Forest"""
    param_dist = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    
    rf = RandomForestRegressor(random_state=42)
    
    def custom_scorer(y_true, y_pred):
        df = pd.DataFrame({'true': y_true, 'pred': y_pred})
        df['rank'] = df['pred'].rank(ascending=False)
        return (df['rank'] <= 5).mean()
    
    scorer = make_scorer(custom_scorer)
    
    random_search = RandomizedSearchCV(
        rf, param_distributions=param_dist,
        n_iter=100, cv=5, scoring=scorer,
        random_state=42, n_jobs=-1
    )
    
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

In [52]:
def add_ranks(combo:pd.DataFrame)->pd.DataFrame:
    combo=combo.sort_values('Actual Share',ascending=False)
    combo['Rk']=list(range(1,combo.shape[0]+1))
    combo=combo.sort_values('Predicted Share', ascending=False)
    combo['Predicted_Rk']=list(range(1,combo.shape[0]+1))
    combo['Diff']=combo["Rk"]-combo['Predicted_Rk']
    return combo

In [121]:
def prep_final_predictions(train:pd.Series, test:pd.Series, predictors:list, model:RandomForestRegressor):
    """_summary_

    Args:
        train (pd.Series): _description_
        test (pd.Series): _description_
        predictors (list): _description_
        model (_type_): _description_
    """
    f_train,f_test=prep_data(train, test, predictors)
    
    preds=model.predict(f_test[predictors])
    
    results= pd.DataFrame({
        'Player': f_test['Player'],
        'Team': f_test['Tm'],
        'Predicted Share': preds,
        'Actual Share': f_test['Share']
    })
    # key_stats= ['PTS', 'AST', 'TRB', 'W/L%', 'PER', 'WS', 'WS/48', 'BPM', 'VORP']
    
    # for s in key_stats:
    #     results[s]=test.loc[f_test.index, s]
    
    results=add_ranks(results)
    
    results.sort_values('Diff', ascending=False)
    
    # cols=['Player', 'Team', 'Predicted Share', 'Actual Share']+key_stats
    
    # results=results[cols]
    
    return results
        

In [119]:
X,y=f_train[predictors],f_train['Share']
best_rf,best_params=optimize_random_forest(X,y)

KeyboardInterrupt: 

In [None]:
final_rf=RandomForestRegressor(random_state=42)
final_rf.fit(f_train[predictors],f_train['Share'])


KeyError: "None of [Index([    0,     1,    20,    31,    36,    46,    52,    67,    71,    75,\n       ...\n       15656, 15658, 15661, 15687, 15690, 15696, 15704, 15718, 15762, 15783],\n      dtype='int64', length=1555)] are in the [index]"

In [125]:
preds=final_rf.predict(f_test[predictors])

In [131]:
preds=pd.DataFrame(preds,columns=['predictions'],index=f_test.index)
combo=pd.concat([test[["Player","Share"]],preds],axis=1)

In [132]:
combo.sort_values("Share", ascending=False)

Unnamed: 0,Player,Share,predictions
14541,Nikola Jokić,0.935,
15274,Shai Gilgeous-Alexander,0.646,
14485,Luka Dončić,0.572,
15045,Giannis Antetokounmpo,0.194,
15204,Jalen Brunson,0.143,
...,...,...,...
14710,Tari Eason,0.000,
14749,Aaron Nesmith,0.000,
14750,Andrew Nembhard,0.000,
14752,Bennedict Mathurin,0.000,


In [122]:
results_df=prep_final_predictions(train, test, predictors,best_rf)

In [123]:
results_df.head()

Unnamed: 0,Player,Team,Predicted Share,Actual Share,Rk,Predicted_Rk,Diff
2331,LeBron James,CLE,0.261221,0.969,9,1,8
11924,David Robinson,SAS,0.248338,0.723,41,2,39
7897,Kevin Garnett,MIN,0.233525,0.991,4,3,1
2348,LeBron James,CLE,0.229576,0.98,7,4,3
1583,Michael Jordan,CHI,0.223913,0.928,15,5,10
