In [292]:
import pandas as pd

In [293]:
# Show DataFrames in full: passing None removes pandas' display limit for each option
# (row count, column count, per-cell text width, overall output width).
for display_option in ("max_rows", "max_columns", "max_colwidth", "width"):
    pd.set_option(f"display.{display_option}", None)

In [294]:
# Load the 1991-2022 player data; the path is resolved relative to the notebook's working directory
stats_csv_path = "../data/full_player_data_1991-2022.csv"
stats_df = pd.read_csv(stats_csv_path)

In [295]:
# Sanity-check the load: print (rows, columns), then preview the first five records
frame_shape = stats_df.shape
print(f"Stats df shape: {frame_shape}")
stats_df.head(n=5)

Stats df shape: (14697, 41)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,0.1,0.7,0.2,3.0,5.9,0.507,0.486,2.7,3.7,0.738,2.5,3.8,6.3,0.9,0.7,0.3,1.2,1.4,9.1,1991,0.0,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,0.9,2.7,0.324,5.2,10.1,0.517,0.51,1.4,1.8,0.797,0.7,2.3,3.0,2.2,1.2,0.3,1.0,1.8,14.5,1991,0.0,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,0.0,0.0,0.0,1.1,2.4,0.455,0.455,0.6,0.9,0.653,0.8,1.1,1.8,0.2,0.2,0.7,0.3,1.4,2.8,1991,0.0,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,0.0,0.0,0.0,0.7,1.9,0.34,0.34,0.5,0.8,0.571,0.5,0.7,1.2,0.4,0.2,0.0,0.5,0.9,1.8,1991,0.0,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,0.3,1.2,0.289,8.8,17.5,0.505,0.501,2.7,3.4,0.797,1.4,3.2,4.6,3.5,1.3,0.4,1.6,1.5,21.4,1991,0.0,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73


In [296]:
# Feature engineering: keep each player's per-game averages AND derive season totals.
#
# The raw player columns below (MP, FG, ..., PTS) hold per-game averages. For each one:
#   * "<stat>/G" receives a copy of the original per-game average, and
#   * "<stat>" is overwritten with average * games played (G), i.e. a season total.
# Totals favor players who appear in more games.
# The totals are rebuilt from the rounded averages stored in the file, so they are
# approximate (e.g. a 10.3 FG average over 82 games becomes 844.6 FG).
player_total_features = [
    "MP",
    "FG",
    "FGA",
    "3P",
    "3PA",
    "2P",
    "2PA",
    "FT",
    "FTA",
    "ORB",
    "DRB",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
]

# This cell mutates stats_df in place. The guard makes it safe to re-run: without it a
# second run would copy the totals into the "/G" columns and multiply the totals by G again.
for feature in player_total_features:
    per_game_col = f"{feature}/G"
    if per_game_col in stats_df.columns:
        continue  # already converted by a previous run of this cell
    stats_df[per_game_col] = stats_df[feature]
    stats_df[feature] = stats_df[feature] * stats_df["G"]

# Names of the per-game player columns created above ("MP/G", "FG/G", ..., "PTS/G")
player_per_game_features = [f"{feature}/G" for feature in player_total_features]

# Give the team per-game scoring columns clearer names:
#   "PS/G" -> "Team PTS/G" and "PA/G" -> "Opponent PTS/G".
# The new column is appended and the old one removed. The membership check keeps a
# re-run from raising KeyError on a source column that has already been removed.
for old_name, new_name in (("PS/G", "Team PTS/G"), ("PA/G", "Opponent PTS/G")):
    if old_name in stats_df.columns:
        stats_df[new_name] = stats_df[old_name]
        del stats_df[old_name]

team_per_game_features = ["Opponent PTS/G", "Team PTS/G"]


stats_df.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'SRS',
       'MP/G', 'FG/G', 'FGA/G', '3P/G', '3PA/G', '2P/G', '2PA/G', 'FT/G',
       'FTA/G', 'ORB/G', 'DRB/G', 'TRB/G', 'AST/G', 'STL/G', 'BLK/G', 'TOV/G',
       'PF/G', 'PTS/G', 'Team PTS/G', 'Opponent PTS/G'],
      dtype='object')

In [297]:
# Feature engineering: fantasy-basketball impact score
#
# Estimate a player's overall impact with a fantasy-basketball scoring system that
# rewards positive contributions (points, rebounds, blocks, ...) and penalizes
# negative ones (turnovers, shot attempts, ...).
#
# ESPN-inspired weights, applied to the season-total stat columns:
# Point = 1 3PM = 1  FGA = -1  FGM = 2  FTA = -1  FTM = 1  REB = 1  AST = 2  STL = 4  BLK = 4  TOV = -2
fantasy_weights = {
    "PTS": 1,
    "3P": 1,
    "FGA": -1,
    "FG": 2,
    "FTA": -1,
    "FT": 1,
    "TRB": 1,
    "AST": 2,
    "STL": 4,
    "BLK": 4,
    "TOV": -2,
}

# Accumulate the weighted terms one by one, in the order listed above
weighted_terms = [weight * stats_df[stat] for stat, weight in fantasy_weights.items()]
fantasy_total = weighted_terms[0]
for term in weighted_terms[1:]:
    fantasy_total = fantasy_total + term
stats_df["Total Fantasy PTS"] = fantasy_total

# Per-game version of the same score
stats_df["Fantasy PTS/G"] = stats_df["Total Fantasy PTS"].div(stats_df["G"])

# Show the 30 highest season totals
stats_df.sort_values(by="Total Fantasy PTS", ascending=False).head(30)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Pts Won,Pts Max,Share,Team,W,L,W/L%,GB,SRS,MP/G,FG/G,FGA/G,3P/G,3PA/G,2P/G,2PA/G,FT/G,FTA/G,ORB/G,DRB/G,TRB/G,AST/G,STL/G,BLK/G,TOV/G,PF/G,PTS/G,Team PTS/G,Opponent PTS/G,Total Fantasy PTS,Fantasy PTS/G
13226,Hakeem Olajuwon,C,30,HOU,82,82,3239.0,844.6,1599.0,0.529,0.0,8.2,0.0,844.6,1599.0,0.532,0.529,442.8,574.0,0.779,287.0,787.2,1066.0,287.0,147.6,344.4,262.4,303.4,2140.2,1993,647.0,980.0,0.66,Houston Rockets,55,27,0.671,0.0,3.57,39.5,10.3,19.5,0.0,0.1,10.3,19.5,5.4,7.0,3.5,9.6,13.0,3.5,1.8,4.2,3.2,3.7,26.1,104.0,99.8,5182.4,63.2
9078,David Robinson,C,28,SAS,80,80,3240.0,840.0,1656.0,0.507,8.0,32.0,0.345,832.0,1632.0,0.51,0.51,696.0,928.0,0.749,240.0,616.0,856.0,384.0,136.0,264.0,256.0,232.0,2384.0,1994,730.0,1010.0,0.723,San Antonio Spurs,55,27,0.671,3.0,5.05,40.5,10.5,20.7,0.1,0.4,10.4,20.4,8.7,11.6,3.0,7.7,10.7,4.8,1.7,3.3,3.2,2.9,29.8,100.0,94.8,4896.0,61.2
10358,Michael Jordan,SG,27,CHI,82,82,3034.0,992.2,1836.8,0.539,32.8,90.2,0.312,959.4,1746.6,0.551,0.547,574.0,672.4,0.851,114.8,377.2,492.0,451.0,221.4,82.0,205.0,229.6,2583.0,1991,891.0,960.0,0.928,Chicago Bulls,61,21,0.744,0.0,8.57,37.0,12.1,22.4,0.4,1.1,11.7,21.3,7.0,8.2,1.4,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5,110.0,101.0,4862.6,59.3
13242,Hakeem Olajuwon,C,31,HOU,80,80,3280.0,896.0,1696.0,0.528,8.0,16.0,0.421,888.0,1672.0,0.529,0.53,392.0,544.0,0.716,232.0,728.0,952.0,288.0,128.0,296.0,272.0,288.0,2184.0,1994,889.0,1010.0,0.88,Houston Rockets,58,24,0.707,0.0,4.19,41.0,11.2,21.2,0.1,0.2,11.1,20.9,4.9,6.8,2.9,9.1,11.9,3.6,1.6,3.7,3.4,3.6,27.3,101.1,96.8,4816.0,60.2
10200,David Robinson,C,25,SAS,82,81,3091.4,754.4,1369.4,0.552,0.0,8.2,0.143,754.4,1361.2,0.554,0.552,590.4,779.0,0.762,336.2,729.8,1066.0,205.0,123.0,319.8,270.6,262.4,2099.2,1991,476.0,960.0,0.496,San Antonio Spurs,55,27,0.671,0.0,4.3,37.7,9.2,16.7,0.0,0.1,9.2,16.6,7.2,9.5,4.1,8.9,13.0,2.5,1.5,3.9,3.3,3.2,25.6,107.1,102.6,4756.0,58.0
6647,Russell Westbrook,PG,28,OKC,81,81,2802.6,826.2,1944.0,0.425,202.5,583.2,0.343,623.7,1360.8,0.459,0.476,712.8,842.4,0.845,137.7,729.0,866.7,842.4,129.6,32.4,437.4,186.3,2559.6,2017,888.0,1010.0,0.879,Oklahoma City Thunder,47,35,0.573,4.0,1.14,34.6,10.2,24.0,2.5,7.2,7.7,16.8,8.8,10.4,1.7,9.0,10.7,10.4,1.6,0.4,5.4,2.3,31.6,106.6,105.8,4665.6,57.6
7895,LeBron James,PF,33,CLE,82,82,3025.8,861.0,1582.6,0.542,147.6,410.0,0.367,705.2,1172.6,0.603,0.59,385.4,533.0,0.731,98.4,615.0,705.2,746.2,114.8,73.8,344.4,139.4,2255.0,2018,738.0,1010.0,0.731,Cleveland Cavaliers,50,32,0.61,0.0,0.59,36.9,10.5,19.3,1.8,5.0,8.6,14.3,4.7,6.5,1.2,7.5,8.6,9.1,1.4,0.9,4.2,1.7,27.5,110.9,109.9,4657.6,56.8
14388,Kevin Garnett,PF,27,MIN,82,82,3230.8,803.6,1607.2,0.499,8.2,41.0,0.256,795.4,1566.2,0.506,0.502,369.0,467.4,0.791,246.0,893.8,1139.8,410.0,123.0,180.4,213.2,205.0,1984.4,2004,1219.0,1230.0,0.991,Minnesota Timberwolves,58,24,0.707,0.0,5.86,39.4,9.8,19.6,0.1,0.5,9.7,19.1,4.5,5.7,3.0,10.9,13.9,5.0,1.5,2.2,2.6,2.5,24.2,94.5,89.1,4641.2,56.6
10065,James Harden,PG,29,HOU,78,78,2870.4,842.4,1911.0,0.442,374.4,1029.6,0.368,468.0,881.4,0.528,0.541,756.6,858.0,0.879,62.4,452.4,514.8,585.0,156.0,54.6,390.0,241.8,2815.8,2019,776.0,1010.0,0.768,Houston Rockets,53,29,0.646,0.0,4.96,36.8,10.8,24.5,4.8,13.2,6.0,11.3,9.7,11.0,0.8,5.8,6.6,7.5,2.0,0.7,5.0,3.1,36.1,113.9,109.1,4609.8,59.1
10244,David Robinson,C,29,SAS,81,81,3078.0,785.7,1490.4,0.53,8.1,16.2,0.3,785.7,1466.1,0.533,0.532,656.1,850.5,0.774,234.9,639.9,874.8,234.9,137.7,259.2,234.9,226.8,2235.6,1995,901.0,1050.0,0.858,San Antonio Spurs,62,20,0.756,0.0,5.9,38.0,9.7,18.4,0.1,0.2,9.7,18.1,8.1,10.5,2.9,7.9,10.8,2.9,1.7,3.2,2.9,2.8,27.6,106.6,100.6,4592.7,56.7


# Use the data for machine learning to predict MVP voting

## Extract Features from the data, split the data into training / testing sets

In [313]:
# Numeric columns of stats_df. This list also contains the target ("Share") and the
# columns it is computed from, so it must not be used as a model input list as-is.
numeric_cols = [
    "Age",
    "G",
    "GS",
    "FG%",
    "3P%",
    "2P%",
    "eFG%",
    "FT%",
    "Year",
    "Pts Won",
    "Pts Max",
    "Share",
    "W",
    "L",
    "W/L%",
    "GB",
    "Opponent PTS/G",
    "Team PTS/G",
    "SRS",
    "MP/G",
    "FG/G",
    "FGA/G",
    "3P/G",
    "3PA/G",
    "2P/G",
    "2PA/G",
    "FT/G",
    "FTA/G",
    "ORB/G",
    "DRB/G",
    "TRB/G",
    "AST/G",
    "STL/G",
    "BLK/G",
    "TOV/G",
    "PF/G",
    "PTS/G",
    "MP",
    "FG",
    "FGA",
    "3P",
    "3PA",
    "2P",
    "2PA",
    "FT",
    "FTA",
    "ORB",
    "DRB",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
    "Total Fantasy PTS",
    "Fantasy PTS/G",
]

# Share = MVP Voting Share - the target of our supervised regression
target = "Share"

# Leave out the "Pts Max" and "Pts Won" columns, since these go into computing "Share" (the target feature)
target_related_features = ["Pts Won", "Pts Max"]

# Columns that are highly related to other columns (ex. team losses are the complement of
# team wins, so W and L can be replaced by W/L%; 2PA can be computed from 2P and 2P%, etc.)
# "Team PTS" and "Opponent PTS" were removed from this list: stats_df only has the
# per-game versions ("Team PTS/G", "Opponent PTS/G"), so those names matched no column.
# NOTE(review): "G" is listed here as collinear, yet prediction_features below includes it.
# Confirm which of the two is intended.
collinear_features = [
    "G",
    "FGA",
    "3PA",
    "2PA",
    "FTA",
    "DRB",
    "W",
    "L",
    "FGA/G",
    "3PA/G",
    "2PA/G",
    "FTA/G",
    "DRB/G",
    "Team PTS/G",
    "Opponent PTS/G",
    "GB",
]

all_per_game_features = list(
    set(player_per_game_features).union(set(team_per_game_features))
)

# Select only our desired features
# For "positive" player stats like PTS, REB, AST, etc. use cumulative values
# For "negative" player stats like PF (personal fouls) and TOV (turnovers), use per-game averages.
# This way, players who accumulate more total positive stats (via more game time) are rewarded,
# and players who accumulate more total negative stats (via more game time) are not penalized
# (their averages are used instead)

prediction_features = [
    "G",
    "MP",
    "FG",
    "3P",
    "2P",
    "eFG%",
    "FT",
    "FT%",
    "ORB",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV/G",
    "PF/G",
    "PTS",
    "Year",
    "W/L%",
    "Total Fantasy PTS",
    "Fantasy PTS/G",
]

# Fail fast on a misspelled / not-yet-engineered column instead of a KeyError further down
missing_features = [col for col in prediction_features if col not in stats_df.columns]
assert not missing_features, f"prediction_features not found in stats_df: {missing_features}"

# Guard against target leakage: neither the target nor its ingredients may be model inputs
leaky_features = sorted({target, *target_related_features} & set(prediction_features))
assert not leaky_features, f"prediction_features leak the target: {leaky_features}"

print(f"Total number of features: {len(prediction_features)}")

Total number of features: 20


In [314]:
# Hold out the most recent season: seasons before 2022 are the training data,
# and the 2022 season (the MVP race we want to predict) is the testing data
season_year = stats_df["Year"]
training_df = stats_df.loc[season_year < 2022]
testing_df = stats_df.loc[season_year == 2022]

# Model inputs and target for the training seasons
X_train, y_train = training_df[prediction_features], training_df[target]
testing_df.shape

(605, 61)

## Perform Data Preprocessing & Model Selection - implement feature scaling and dimensionality reduction, then train and fit various supervised regression models, perform hyperparameter tuning using cross-validation within the training set, and consider tradeoffs of different model types

In [315]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR

# Candidate regressors, each paired with the hyperparameter grid to search.
# Grid keys follow the "<pipeline step>__<parameter>" convention; the step name is the
# lowercase class name that make_pipeline assigns to the estimator.
random_state = 1

linear_spec = {"model": LinearRegression(), "params": {}}

ridge_spec = {"model": Ridge(), "params": {"ridge__alpha": [0.1, 1.0, 10.0]}}

random_forest_spec = {
    "model": RandomForestRegressor(random_state=random_state),
    "params": {
        "randomforestregressor__n_estimators": [50, 100, 200],
        "randomforestregressor__max_depth": [3, 5, 7, None],
    },
}

gradient_boosting_spec = {
    "model": GradientBoostingRegressor(random_state=random_state),
    "params": {
        "gradientboostingregressor__n_estimators": [50, 100, 200],
        "gradientboostingregressor__learning_rate": [0.01, 0.1, 0.2],
    },
}

svr_spec = {
    "model": SVR(),
    "params": {"svr__C": [0.1, 1.0, 10.0], "svr__epsilon": [0.01, 0.1, 0.2]},
}

# Model name -> {"model": estimator, "params": grid}
models = {
    "LinearRegression": linear_spec,
    "Ridge": ridge_spec,
    "RandomForestRegressor": random_forest_spec,
    "GradientBoostingRegressor": gradient_boosting_spec,
    "SVR": svr_spec,
}

## Leverage SKLearn pipelining to streamline data pre-processing, model fitting, and hyperparameter tuning

In [316]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

def _tune(estimator, param_grid):
    """Grid-search one estimator behind scaling + PCA.

    The pipeline standardizes the features (zero mean, unit variance), projects them
    onto 15 principal components, then fits `estimator`. Every combination in
    `param_grid` is scored with 5-fold cross-validation on the training data, using
    all available CPUs and negative MSE as the metric.

    Returns a tuple (best pipeline found by the search, its cross-validated MSE).
    """
    search = GridSearchCV(
        estimator=make_pipeline(StandardScaler(), PCA(n_components=15), estimator),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring="neg_mean_squared_error",
    )
    search.fit(X_train, y_train)
    # best_score_ is a negative MSE; flip the sign to report a regular (positive) MSE
    return search.best_estimator_, -search.best_score_


# Model name -> pipeline with its best hyperparameters
best_pipelines = {}
# Model name -> cross-validated MSE achieved by those hyperparameters
best_scores = {}

for model_name, config in models.items():
    best_pipelines[model_name], best_scores[model_name] = _tune(
        config["model"], config["params"]
    )


print(f"Best Pipelines: {best_pipelines}")
print(f"Best Scores: {best_scores}")

Best Pipelines: {'LinearRegression': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=15)),
                ('linearregression', LinearRegression())]), 'Ridge': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=15)), ('ridge', Ridge(alpha=10.0))]), 'RandomForestRegressor': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=15)),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=1))]), 'GradientBoostingRegressor': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=15)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(random_state=1))]), 'SVR': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=15)), ('svr', SVR(epsilon=0.01))])}
Best Scores: {'LinearRegression': 0.0028408120884990022,

In [317]:
from sklearn.pipeline import Pipeline

# Pick the model type with the smallest cross-validated MSE
# (in the recorded run this is an SVR with epsilon = 0.01 and otherwise default params)
lowest_mse_entry = min(best_scores.items(), key=lambda name_and_mse: name_and_mse[1])
best_model_type: str = lowest_mse_entry[0]
best_pipeline: Pipeline = best_pipelines[best_model_type]
print(f"Best model type: {best_model_type}")
best_pipeline

Best model type: SVR


## An SVR with epsilon=0.01 and default scikit-learn parameters is our most promising model. Re-train it on the entire training set and evaluate it in several ways. Also consider whether post-processing the predictions to suit the context of NBA MVP vote shares is appropriate and helpful.

In [318]:
import numpy as np

# Fit the selected pipeline on the full training set
best_pipeline.fit(X_train, y_train)

# Predict the held-out 2022 season
X_test, y_test = testing_df[prediction_features], testing_df[target]
y_pred = best_pipeline.predict(X_test)

# A player cannot receive a negative vote share, so also score a post-processed
# version of the predictions in which negative values are raised to zero
y_pred_clipped = np.clip(y_pred, 0, None)

mse_original = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error (original) on test data: {mse_original}")

mse_clipped = mean_squared_error(y_true=y_test, y_pred=y_pred_clipped)
print(f"Mean Squared Error (clipped) on test data: {mse_clipped}")

Mean Squared Error (original) on test data: 0.0005185985682739388
Mean Squared Error (clipped) on test data: 0.0005033273241985324


In [319]:
# Clipping gave a small MSE improvement, so keep it: the clipped predictions stay
# consistent with real MVP voting, where a player cannot receive negative votes
y_pred = y_pred_clipped

# One-column frame of predictions, aligned with the 2022 rows through their index
predictions_df = pd.Series(
    y_pred, index=testing_df.index, name="predictions"
).to_frame()

## Store our model's predictions in a DataFrame, then compare each player's predicted MVP voting rank to their actual MVP voting rank

In [320]:
# Put each 2022 player's actual vote share next to the model's prediction
actual_and_predicted = pd.concat(
    [testing_df[["Player", "Share"]], predictions_df], axis=1
)

# Rename the columns for more clarity and fix the column order
mvp_votes_with_preds_2022_df = actual_and_predicted.rename(
    columns={"predictions": "Predicted Share", "Share": "Actual Share"}
)[["Player", "Predicted Share", "Actual Share"]]

In [321]:
# Top 10 players by actual 2022 vote share, shown next to the share our model predicted for them
mvp_votes_with_preds_2022_df.sort_values(by="Actual Share", ascending=False).iloc[:10]

Unnamed: 0,Player,Predicted Share,Actual Share
663,Nikola Jokić,0.803679,0.875
837,Joel Embiid,0.303809,0.706
11678,Giannis Antetokounmpo,0.44891,0.595
907,Devin Booker,0.047507,0.216
11469,Luka Dončić,0.251742,0.146
1179,Jayson Tatum,0.07752,0.043
12226,Ja Morant,0.12859,0.01
6398,Stephen Curry,0.04127,0.004
905,Chris Paul,0.095935,0.002
8241,LeBron James,0.013074,0.001


In [324]:
# Order the frame by actual share, then predicted share, both descending
mvp_votes_with_preds_2022_df = mvp_votes_with_preds_2022_df.sort_values(
    by=["Actual Share", "Predicted Share"], ascending=[False, False]
)

# Add an integer rank for each share column (1 = highest share). Tied shares, such as
# the many players with 0% of the vote, all receive the smallest rank of their group.
for share_col, rank_col in (
    ("Actual Share", "Actual Rank"),
    ("Predicted Share", "Predicted Rank"),
):
    descending_rank = mvp_votes_with_preds_2022_df[share_col].rank(
        method="min", ascending=False
    )
    mvp_votes_with_preds_2022_df[rank_col] = descending_rank.astype(int)

top15_preds = mvp_votes_with_preds_2022_df.iloc[:15]
top15_preds

Unnamed: 0,Player,Predicted Share,Actual Share,Actual Rank,Predicted Rank
663,Nikola Jokić,0.803679,0.875,1,1
837,Joel Embiid,0.303809,0.706,2,3
11678,Giannis Antetokounmpo,0.44891,0.595,3,2
907,Devin Booker,0.047507,0.216,4,12
11469,Luka Dončić,0.251742,0.146,5,4
1179,Jayson Tatum,0.07752,0.043,6,9
12226,Ja Morant,0.12859,0.01,7,6
6398,Stephen Curry,0.04127,0.004,8,14
905,Chris Paul,0.095935,0.002,9,7
6185,Kevin Durant,0.050484,0.001,10,11


In [269]:
# There seem to be completely "random" players sneaking into the top 30 in predicted votes
# (ex. Jaden Springer, Craig Sword, Derrick Walton)
# Examining their 2022 stats on basketball-reference.com, they all have one thing in common:
# a category in which they are shooting 100%
stats_df_2022 = stats_df[stats_df["Year"] == 2022]

shooting_cols = ["FG%", "eFG%", "2P%", "3P%", "FT%"]

# For each shooting column, keep the 2022 players whose percentage in that column is >= 0.99
dfs = [stats_df_2022[stats_df_2022[col] >= 0.99] for col in shooting_cols]

# Concatenate all filtered dfs into a single df
# (a player who qualifies in several columns appears once per column)
filtered_df = pd.concat(dfs, ignore_index=True)

outliers = filtered_df["Player"].unique()

# Rank explicitly by predicted share instead of slicing the first 30 rows: the row order
# of mvp_votes_with_preds_2022_df depends on which sorting cell was executed last
top_30_vote_recipients = (
    mvp_votes_with_preds_2022_df.sort_values("Predicted Share", ascending=False)[
        "Player"
    ]
    .head(30)
    .to_list()
)

# Set lookup for the membership test; the result keeps the order of `outliers`
top_30_lookup = set(top_30_vote_recipients)
outlier_vote_recipients = [player for player in outliers if player in top_30_lookup]

outlier_vote_recipients

['Joe Johnson',
 'Juwan Morgan',
 'Jarron Cumberland',
 'DaQuan Jeffries',
 'Jakob Poeltl',
 'Moses Wright',
 'Derrick Walton']

## 

In [280]:
from sklearn.feature_selection import RFE

# Recursive feature elimination (RFE) behind a standardization step, ranking features
# with a Random Forest regressor and keeping 20 of them.
# The forest is seeded with the notebook's random_state so that the selected feature
# set is the same on every run.
# NOTE(review): X_train currently has exactly 20 columns (prediction_features), so
# keeping 20 eliminates nothing. The recorded output lists columns that are not in
# prediction_features (GS, FG%, 2P%, SRS), which suggests it came from a run with a
# wider X_train - confirm the intended candidate set.
rfe = RFE(
    estimator=RandomForestRegressor(random_state=random_state),
    n_features_to_select=20,
)
rfe_pipeline = make_pipeline(StandardScaler(), rfe)
piepline = rfe_pipeline  # original (misspelled) name kept as an alias for any later use

# Fit RFE on the training data
rfe_pipeline.fit(X_train, y_train)

# Get the names of the rfe-selected features (support_ is a boolean mask over the columns)
selected_features_indices = rfe.support_
rfe_features = X_train.columns[selected_features_indices]
rfe_features

Index(['GS', 'MP', 'FG', 'FG%', '2P', '2P%', 'eFG%', 'FT', 'FT%', 'ORB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV/G', 'PF/G', 'PTS', 'Year', 'W/L%', 'SRS'],
      dtype='object')

In [279]:
# Use the rfe-selected features for training and testing.
# The columns are taken from training_df / testing_df (the frames X_train and X_test were
# built from), so the lookup also works when RFE selected columns that are not part of
# prediction_features. The rows are the same as in X_train / X_test.
X_train_final = training_df[rfe_features]
X_test_final = testing_df[rfe_features]

# Create a pipeline consisting of a standardization scaler and our strongest model from
# model selection (SVR with epsilon=0.01); unlike the earlier pipeline there is no PCA step
pipeline = make_pipeline(StandardScaler(), SVR(epsilon=0.01))

# Fit the pipeline on our training dataset
pipeline.fit(X_train_final, y_train)

# Evaluate the model on the testing dataset
y_pred_final = pipeline.predict(X_test_final)

# Vote shares cannot be negative: also score the predictions with negatives raised to zero
y_pred_final_clipped = np.maximum(y_pred_final, 0)

mse_final_original = mean_squared_error(y_pred=y_pred_final, y_true=y_test)
print(f"Mean Squared Error (original) on final test data: {mse_final_original}")

mse_final_clipped = mean_squared_error(y_pred=y_pred_final_clipped, y_true=y_test)
print(f"Mean Squared Error (clipped) on final test data: {mse_final_clipped}")

Mean Squared Error (original) on final test data: 0.000573432357653818
Mean Squared Error (clipped) on final test data: 0.0005547500391852233


## Interpreting the final error metrics after feature selection
- Model performance with the original feature reduction (principal component analysis):
    - Mean Squared Error (original) on test data: 0.0006498223820819034
    - Mean Squared Error (clipped) on test data: 0.0006389756256203042
- Model performance after the final feature selection (recursive feature elimination):
    - Mean Squared Error (original) on final test data: 0.000573432357653818
    - Mean Squared Error (clipped) on final test data: 0.0005547500391852233
- Note: the PCA figures quoted above come from an earlier run. The evaluation cell in this notebook currently reports 0.000519 (original) and 0.000503 (clipped) for the PCA pipeline, so re-run the notebook end-to-end and reconcile this comparison before drawing conclusions.
- High-dimensional datasets such as this can be prone to overfitting.
## Addressing the bias-variance tradeoff