## Data Importing and Pre-processing

In [None]:
# import libraries needed
import pandas as pd

pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew, probplot
from scipy.special import boxcox1p
import warnings
from datetime import datetime

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.*")
%matplotlib inline

In [None]:
# Load the 2022-23 player stats + salary table and show its (rows, columns) shape
nba_df = pd.read_csv("nba_2022-23_all_stats_with_salary.csv")
nba_df.shape

In [None]:
# Preview the first rows to sanity-check the load
nba_df.head()

Here we grab salaries from four earlier seasons (2017-18 through 2020-21) from another dataset and add them as columns in our nba_df. We are not using salary data from the most recent prior season (2021-22) because it would be too highly correlated with the salary data we are trying to predict.

In [None]:
# Historical salaries: one row per player ('Name') and 'Season'.
full_nba_salaries = pd.read_csv('full_nba_salaries.csv')

# Seasons to pull in and the matching `SalaryYY-YY` column names
years = ['2017-18', '2018-19', '2019-20', '2020-21']
salary_columns = [f'Salary{year[2:4]}-{year[5:]}' for year in years]

# Keep only the seasons of interest and parse the salary strings once, vectorized,
# instead of re-parsing and re-masking nba_df for every row of the salary file.
past_salaries = full_nba_salaries.loc[
    full_nba_salaries['Season'].isin(years), ['Name', 'Season', 'Salary']
].copy()
past_salaries['Salary'] = (
    past_salaries['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('(TW)', '', regex=False)
    .str.strip()
    .astype('int64')
)

for year, col in zip(years, salary_columns):
    # If a player appears more than once in a season the last row wins,
    # matching the overwrite order of the original row-by-row loop.
    season_lookup = (
        past_salaries.loc[past_salaries['Season'] == year]
        .drop_duplicates(subset='Name', keep='last')
        .set_index('Name')['Salary']
    )
    # Players with no record for that season get 0
    nba_df[col] = nba_df['Player Name'].map(season_lookup).fillna(0).astype('int64')

# Display a preview of the updated DataFrame (not the whole table)
nba_df.head()

In [None]:
# rename the 'Unnamed: 0' column to 'Id'
nba_df = nba_df.rename(columns={"Unnamed: 0": "Id"})

In [None]:
# Strip every space out of the column labels (e.g. "Player Name" -> "PlayerName")
nba_df.columns = nba_df.columns.str.replace(" ", "", regex=False)

In [None]:
# Tally the columns stored with the object dtype (treated as categorical)
category_count = sum(1 for col_dtype in nba_df.dtypes if col_dtype == "object")

In [None]:
print("Number of categorical variables:", category_count)

# 'Id' is an identifier rather than a feature, so subtract 1 from the
# non-categorical column count
numeric_count = nba_df.shape[1] - category_count - 1

print("Number of contineous variables:", numeric_count)

In [None]:
# list every column label after the rename / space-stripping steps
nba_df.columns

### Handling our missing data

In [None]:
# Per-column count and fraction of missing values, largest first
null_mask = nba_df.isnull()
total_missing = null_mask.sum().sort_values(ascending=False)
percent_missing = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

missing_data_df = pd.concat(
    {"Total Missing": total_missing, "Percent Missing": percent_missing},
    axis=1,
)
missing_data_df.head(8)

In [None]:
# Rows where FG% is missing.
# Players with missing data are those who did not play many games, so they never accumulated that stat during the season
null_fg = nba_df[nba_df['FG%'].isnull()]
null_fg

In [None]:
# Horizontal bar chart of the 8 columns with the most missing data.
# Note: "Percent Missing" holds fractions (missing count / row count), not values on a 0-100 scale.
missing_data_df["Percent Missing"].head(8).plot(
    kind="barh", figsize=(20,10)
).invert_yaxis()
plt.xlabel("Percent Missing")
plt.ylabel("Variable")
plt.title("The 8 Columns and their Percent of Missing Data")
plt.show()

In [None]:
# Impute the missing rate stats with 0: a value is "missing" only because the
# player never recorded that stat during the season, and 0 lets the model
# identify those players.
cols_to_fill_zero = ["FT%", "3P%", "2P%", "TS%", "3PAr", "FTr", "eFG%", "FG%"]
nba_df[cols_to_fill_zero] = nba_df[cols_to_fill_zero].fillna(0)


In [None]:
# Look up one player by name to confirm the previously-null fields now hold zeros
imputed_row = nba_df[nba_df["PlayerName"] == "Alondes Williams"]
imputed_row

### Handling outliers for better training

In [None]:
# Salary against games played, one point per player
fig, ax = plt.subplots()
ax.scatter(x=nba_df["GP"], y=nba_df["Salary"])
ax.set_ylabel("Salary", fontsize=13)
ax.set_xlabel("GP (Games Played)", fontsize=13)
plt.show()

There seem to be some outliers where players did not play the majority of the season, yet were given large salaries. This is likely due to season ending injuries. Additionally, there are players present in the data set that were on 10-day contracts. For this reason, we will remove data from players who played in less than 20 games.

In [None]:
# Keep only players with at least 20 games played.
# `.copy()` makes the result an independent frame, so the column assignments made
# in later cells write to it directly instead of to a slice of the unfiltered
# frame (avoids pandas' SettingWithCopyWarning).
nba_df = nba_df[nba_df['GP'] >= 20].copy()

### Normalize

In [None]:
# Histogram of raw salaries with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here because the `fit=` overlay has no direct replacement.
sns.distplot(nba_df["Salary"], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(nba_df["Salary"])
print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma))

# Label the plot. The legend text is a raw string so the LaTeX "\mu" / "\sigma"
# are not parsed as (invalid) Python escape sequences.
plt.legend(
    [r"Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)], loc="best"
)
plt.ylabel("Frequency")
plt.title("Salary distribution")

# Q-Q plot of salary against a normal distribution, in its own figure
fig = plt.figure()
res = probplot(nba_df["Salary"], plot=plt)
plt.show()

In [None]:
# Use numpy's log1p, which applies log(1 + x) to every element of the column
nba_df["Salary_normalized"] = np.log1p(nba_df["Salary"])

# Check the new distribution.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here because the `fit=` overlay has no direct replacement.
sns.distplot(nba_df["Salary_normalized"], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(nba_df["Salary_normalized"])
print("\n mu = {:.2f} and sigma = {:.2f}\n".format(mu, sigma))

# Label the plot. The legend text is a raw string so the LaTeX "\mu" / "\sigma"
# are not parsed as (invalid) Python escape sequences.
plt.legend(
    [r"Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )".format(mu, sigma)], loc="best"
)
plt.ylabel("Frequency")
plt.title("Salary distribution")

# Q-Q plot of the log-transformed salary, in its own figure
fig = plt.figure()
res = probplot(nba_df["Salary_normalized"], plot=plt)
plt.show()

## Data Analysis and Visualization

In [None]:
# Pairwise scatterplots of the log salary against a hand-picked set of stats
sns.set()
cols = [
    "Salary_normalized",
    "Age",
    "MP",
    "3P",
    "TRB",
    "AST",
    "PTS",
    "PER",
    "TS%",
    "DWS",
    "VORP"
]
# `height` is the current name of the per-facet size argument; the old `size`
# keyword is deprecated in seaborn.
sns.pairplot(nba_df[cols], height=2.5)
plt.show()

In [None]:
# Exclude non-numeric columns, then compute the pairwise correlation matrix
numeric_df = nba_df.select_dtypes(include=[np.number])
corrmat = numeric_df.corr()

# Heatmap of the correlation matrix; the colour scale is capped at 0.8
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat, vmax=0.8, square=True);

In [None]:
# Correlation of every numeric column with the log-transformed salary, strongest positive first
salary_correlations = corrmat['Salary_normalized']
print(salary_correlations.sort_values(ascending=False))

In [None]:
# Visualize the number of players at each age, split by position

plt.figure(figsize=(20,4))
sns.set_style('whitegrid')
sns.countplot(x='Age',hue='Position', data=nba_df, palette='viridis');

#### Target Variable Visualizations

In [None]:
# boxplot to visualize the spread of log-transformed salaries for each position
sns.boxplot(x='Position', y='Salary_normalized', data=nba_df, palette='rainbow');

In [None]:
# plot the relationship between points and (raw) salary, with one regression line per position
# points has the highest positive correlation to salary as seen above
sns.lmplot(y='Salary', x='PTS', data=nba_df, hue='Position', palette='Set1');

Now let's compare Salary to VORP.
VORP is a box score estimate of the points per 100 team possessions that a player contributes above a replacement level player, translated to an average team and proportional to an 82 game season.

In [None]:
# Joint distribution of VORP and the log-transformed salary
sns.jointplot(x='VORP',y='Salary_normalized',data=nba_df,color='purple');

Now let's compare Salary to a defensive advanced statistic like DWS.
DWS stands for Defensive Win Shares, which is a metric in the NBA that compares a player's defensive rating to the league average.

In [None]:
# DWS against the log-transformed salary, coloured by position
plt.figure(figsize=(12, 8))
sns.scatterplot(x='DWS', y='Salary_normalized', data=nba_df, hue='Position', palette='viridis', alpha=0.6);

#### Feature Selection

#### Step 1: Dropping low correlated features
First, we drop features whose absolute correlation with 'Salary' is below 0.2.

In [None]:
# Columns that are never candidates for removal: the row id and both forms of the target
exclude_columns = ['Id', 'Salary', 'Salary_normalized']

# Every other numeric column is a candidate feature
numeric_columns = [col for col in numeric_df.columns if col not in exclude_columns]

In [None]:
# Candidate features whose absolute correlation with the raw 'Salary' is below 0.2
low_corr_columns = [col for col in numeric_columns if abs(corrmat.loc[col, 'Salary']) < 0.2]

print(low_corr_columns)

# Re-bind instead of dropping in place and ignore already-removed labels, so the
# cell can be re-run without raising KeyError.
numeric_df = numeric_df.drop(columns=low_corr_columns, errors="ignore")

nba_df = nba_df.drop(columns=low_corr_columns, errors="ignore")

In [None]:
# Preview the reduced frame rather than rendering every row
nba_df.head()

#### Step 2: Encoding Categorical Features
Next, we will encode categorical features so that our supervised models can use them for predictions. These features are Position and Team. The position and team are likely influential on a player's salary but are represented by strings in our dataset. We will use label encoding because the values of these features fall within a limited set. This will give each team and position a unique numerical marker.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create one label encoder per categorical column so each mapping can be inspected later
position_encoder = LabelEncoder()
team_encoder = LabelEncoder()

# Perform label encoding for 'Position' and 'Team' variables.
# NOTE(review): the integer codes impose an arbitrary ordering on teams/positions,
# which the linear models will treat as numeric magnitude — confirm this is acceptable.
nba_df['Position_encoded'] = position_encoder.fit_transform(nba_df['Position'])
nba_df['Team_encoded'] = team_encoder.fit_transform(nba_df['Team'])

# Drop the original 'Position' and 'Team' columns.
# This is an in-place drop, so re-running the cell fails once the columns are gone.
nba_df.drop(['Position', 'Team'], axis=1, inplace=True)
nba_df.head()


In [None]:
# Compute the sample skewness of every column in numeric_df (NaNs dropped per column),
# sorted from most positively skewed to most negatively skewed
skewed_feats = (
    numeric_df
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({"Skew": skewed_feats})
skewness.head(20)

In [None]:
# Horizontal bar chart of the 10 largest skew values (the table is already sorted descending)
skewness["Skew"].head(10).plot(
    kind="barh", figsize=(20, 10)
).invert_yaxis()  # top 10 skewed columns
plt.xlabel("Skew")
plt.ylabel("Variable Name")
plt.title("Top 10 Skewed Variables")
plt.show()

In [None]:
# Keep only features whose absolute skew exceeds 0.75.
# The mask must be a boolean Series: masking the DataFrame with a DataFrame
# (`skewness[abs(skewness) > 0.75]`) only replaces non-matching cells with NaN and
# keeps every row, so all features would be counted and later transformed.
skewness = skewness[skewness["Skew"].abs() > 0.75]
print(
    "There are {} skewed numerical features to Box Cox transform (normalize)".format(
        skewness.shape[0]
    )
)

In [None]:
# Numeric columns that contain at least one negative value
negative_value_columns = numeric_df.columns[(numeric_df < 0).any()]

# Print the list of column names
print("Columns with negative values:")
print(negative_value_columns.tolist())


In [None]:
skewed_features = skewness.index
lam = 0.15

# Columns that are never Box-Cox transformed: the row id, the target and its log
# form, the prior-season salary columns, and advanced stats that take negative values.
skip_features = {
    "Id",
    "Salary",
    "Salary_normalized",
    "Salary17-18",
    "Salary18-19",
    "Salary19-20",
    "Salary20-21",
    "Salary21-22",
    'OWS',
    'WS',
    'WS/48',
    'OBPM',
    'DBPM',
    'BPM',
    'VORP',
}

for feat in skewed_features:
    # skip over columns that don't need transformation (or are no longer in nba_df)
    if feat in skip_features or feat not in nba_df.columns:
        continue
    # Also skip any column that actually holds negative values, not only the
    # hard-coded ones above: boxcox1p(x, lam) = ((1 + x)**lam - 1) / lam is NaN
    # for x < -1, and the intent is to never introduce NULLs here.
    if (nba_df[feat] < 0).any():
        continue
    nba_df[feat] = boxcox1p(nba_df[feat], lam)

In [None]:
# check that the Box-Cox transform did not add any NULL values
null_columns = nba_df.columns[nba_df.isnull().any()]
null_count = nba_df[null_columns].isnull().sum()

print("Column Name: NULL Count")
# Iterate label/value pairs directly: `null_count[i]` would be integer-position
# access on a label-indexed Series, which pandas has deprecated.
for column_name, n_missing in null_count.items():
    print(f"{column_name}: {n_missing}")

## Data Analytics

All of our data is labeled; therefore, we will be implementing supervised learning methods.

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from skopt import BayesSearchCV

Here we define multiple regression models that we will train and evaluate.

In [None]:
# Fixed seed so the stochastic estimators give the same fit on every run
SEED = 42

# Ordinary least squares, with and without an intercept term
lr_w_int = LinearRegression()
lr_no_int = LinearRegression(fit_intercept=False)
# Linear model with a combined L1/L2 penalty
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.1)
# Tree-based models
rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
dt = DecisionTreeRegressor(max_depth=10, random_state=SEED)
model_xgb = xgb.XGBRegressor(max_depth=5, n_estimators=1000, learning_rate=0.01, random_state=SEED)
# Separate XGBoost instance with the same starting hyperparameters, reserved for the tuning run
model_xgb_hyper = xgb.XGBRegressor(max_depth=5, n_estimators=1000, learning_rate=0.01, random_state=SEED)

This function, `hyperparameter_tune_bayesian`, performs Bayesian hyperparameter tuning for an XGBoost regressor using training features (`X_train`) and target variable (`y_train`). It defines a parameter search space, applies Bayesian optimization with cross-validation, and returns the best hyperparameters found.

In [None]:
def hyperparameter_tune_bayesian(X_train, y_train, regressor):
    """
    Run a Bayesian hyperparameter search for an XGBoost regressor.

    The search is fitted against ``np.log1p(y_train)``, so ``y_train`` must be
    passed on its original (un-logged) scale.

    Parameters:
    - X_train: pandas DataFrame
        Training features.
    - y_train: pandas Series
        Training target variable (original scale).
    - regressor: str
        Name of the regressor to tune, matched case-insensitively.
        Only 'xgboost' is supported.

    Returns:
    - best_params: dict
        Best values found for n_estimators, learning_rate and max_depth.

    Raises:
    - ValueError
        If `regressor` names anything other than 'xgboost'.
    """
    # Reject unsupported names before building anything
    if regressor.lower() != "xgboost":
        raise ValueError("Unsupported regressor type. Choose 'xgboost'.")

    # 5-fold cross-validated search, scored by negative MSE
    search = BayesSearchCV(
        estimator=xgb.XGBRegressor(),
        search_spaces={
            "n_estimators": (100, 1200),
            "learning_rate": (0.01, 0.2, "log-uniform"),
            "max_depth": (3, 10),
        },
        scoring="neg_mean_squared_error",
        cv=5,
        n_jobs=-1,  # use every available core
    )
    search.fit(X_train, np.log1p(y_train))

    return search.best_params_

This function, `k_fold_regression`, performs k-fold cross-validation on a given dataset using a specified regressor to predict a target column (default "Salary"). It splits the data into training and validation sets, fits the model, and evaluates its performance using RMSE for each fold. Optionally, it can tune hyperparameters for an XGBoost regressor. The function returns a DataFrame with prediction results for each fold, along with lists of RMSE scores, training set sizes, and validation set sizes.

In [None]:
def k_fold_regression(
    data,
    regressor,
    target_column="Salary",
    cols_to_ignore=['Salary', 'Id', 'PlayerName', 'Salary_normalized'],
    n_splits=5,
    tune_hyperparameters=False,
    model_name=None,
    random_state=None,
):
    """
    Evaluate `regressor` with shuffled k-fold cross-validation.

    The model is fitted on ``log1p(target)`` and its predictions are mapped back
    with ``expm1``, so every RMSE is reported on the original target scale.

    Parameters:
    - data: pandas DataFrame
        Frame holding the features, the target and the columns to ignore.
    - regressor: estimator with fit / predict / set_params
        Fitted in place on every fold; after the call it holds the last fold's fit.
    - target_column: str
        Name of the column to predict.
    - cols_to_ignore: list of str
        Columns dropped from `data` to build the feature matrix (never modified here).
    - n_splits: int
        Number of folds.
    - tune_hyperparameters: bool
        When True and `regressor` is an XGBRegressor, run
        `hyperparameter_tune_bayesian` on each training fold before fitting.
    - model_name: str or None
        Label used in the per-fold progress line of a tuned run.
    - random_state: int or None
        Seed for the fold shuffle. None (the default) keeps the unseeded shuffle.

    Returns:
    - result_df: pandas DataFrame
        One row per fold with columns Id, Actual, Predicted, rmse.
    - rmse_scores, train_sizes, test_sizes: lists
        Per-fold RMSE, training-set size and validation-set size.
    - fold_models: list
        An independent fitted copy of the model for each fold.
    """
    import copy  # local import: only needed for the per-fold model snapshots

    rmse_scores = []
    train_sizes = []
    test_sizes = []
    fold_results = []
    fold_models = []

    # Prepare the feature matrix X and target vector y
    X = data.drop(columns=cols_to_ignore)
    y = data[target_column]

    # Initialize KFold cross-validator
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Tuning is only implemented for XGBoost
    tune_xgb = tune_hyperparameters and isinstance(regressor, xgb.XGBRegressor)

    for fold_id, (train_index, val_index) in enumerate(kf.split(X)):
        # Split into training and validation folds
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        if tune_xgb:
            # Tune on the training fold only: tuning on the full data would let
            # the search see the validation rows it is later scored on.
            best_params = hyperparameter_tune_bayesian(X_train_fold, y_train_fold, "xgboost")
            print(f"Best hyperparameters for xgboost Fold: {best_params}")
            regressor.set_params(**best_params)

        # Fit on the log-transformed target, then convert predictions back
        regressor.fit(X_train_fold, np.log1p(y_train_fold))
        # Store a snapshot: appending `regressor` itself would leave every list
        # entry pointing at the same object, i.e. the last fold's fit.
        fold_models.append(copy.deepcopy(regressor))
        y_pred_fold = np.expm1(regressor.predict(X_val_fold))

        # Calculate RMSE for the validation fold
        rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred_fold))

        # Per-fold progress lines are printed only for tuned XGBoost and for Random Forest
        if tune_xgb:
            print(model_name, f"RMSE: {rmse:.4f}, Train Size: {len(y_train_fold)}, Test Size: {len(y_val_fold)}")
        elif isinstance(regressor, RandomForestRegressor):
            print(f"Random Forest: {rmse:.4f}, Train Size: {len(y_train_fold)}, Test Size: {len(y_val_fold)}")

        rmse_scores.append(rmse)
        train_sizes.append(len(y_train_fold))
        test_sizes.append(len(y_val_fold))

        # One record per fold: fold number, actual values, predictions and RMSE
        fold_results.append({
            'Id': fold_id,
            'Actual': y_val_fold.tolist(),
            'Predicted': y_pred_fold,
            'rmse': rmse
        })

    # Create a DataFrame from the results
    result_df = pd.DataFrame(
        fold_results, columns=["Id", "Actual", "Predicted", 'rmse']
    )

    return result_df, rmse_scores, train_sizes, test_sizes, fold_models

This function, `compute_rmse_std_k_fold`, calculates the mean and standard deviation of RMSE scores from k-fold cross-validation on a given DataFrame using a specified model. It optionally tunes hyperparameters for the model and appends the RMSE scores from each fold to a list. The function returns the mean and standard deviation of the RMSE scores.

In [None]:
def compute_rmse_std_k_fold(df, model, tune_hyper=False, model_name=None):
    """
    Run one k-fold evaluation of `model` on `df` and summarise the fold RMSEs.

    Parameters:
    - df: pandas DataFrame passed through to `k_fold_regression`.
    - model: estimator to evaluate.
    - tune_hyper: bool, forwarded as `tune_hyperparameters`.
    - model_name: str or None, forwarded as `model_name`.

    Returns:
    - (mean, std): mean and standard deviation of the per-fold RMSE scores.
    """
    # The per-fold RMSE list is the second item of k_fold_regression's return tuple
    _, fold_rmse, *_ = k_fold_regression(
        df, model, tune_hyperparameters=tune_hyper, model_name=model_name
    )
    return np.mean(fold_rmse), np.std(fold_rmse)

Now we are training each of the models and saving their RMSE and STD.

In [None]:
# Mean and standard deviation of the per-fold RMSE (one 5-fold CV run) for each model
lr_no_int_list = compute_rmse_std_k_fold(nba_df, lr_no_int)
lr_w_int_list = compute_rmse_std_k_fold(nba_df, lr_w_int)
elastic_net_list = compute_rmse_std_k_fold(nba_df, elastic_net)
dt_list = compute_rmse_std_k_fold(nba_df, dt)
rf_list = compute_rmse_std_k_fold(nba_df, rf)
model_xgb_list = compute_rmse_std_k_fold(nba_df, model_xgb)

# The next call takes a while (roughly 10 mins). To run quickly, comment out this
# call and the "XGBoost Hyper" entry in `data` below.
# It tunes the dedicated `model_xgb_hyper` instance: tuning calls set_params() on
# the estimator it is given, so passing `model_xgb` would overwrite the baseline
# hyperparameters that later cells reuse.
model_xgb_hyper_list = compute_rmse_std_k_fold(
    nba_df, model_xgb_hyper, tune_hyper=True, model_name="XGBoost Hyper"
)

# Collect (RMSE mean, RMSE std) per algorithm, sorted by mean RMSE
data = {
    "Linear (No Intercept)": lr_no_int_list,
    "Linear (w/ Intercept)": lr_w_int_list,
    "Elastic Net": elastic_net_list,
    "Decision Tree": dt_list,
    "Random Forest": rf_list,
    "XGBoost": model_xgb_list,

    # Comment out the entry below for the quicker run
    "XGBoost Hyper": model_xgb_hyper_list,
}
data_df = pd.DataFrame(data=data).T.reset_index().sort_values(by=[0], ascending=True)
data_df.columns = ["Algorithm", "RMSE", "STD"]

In [None]:
# Mean RMSE and its standard deviation per algorithm, best (lowest RMSE) first
data_df

In [None]:
# creating the bar plot
data_df.plot(kind="bar", x="Algorithm", y=["RMSE", "STD"], figsize=(20, 10), rot=0)
plt.xlabel("Algorithm", fontsize=20)
plt.ylabel("Root Mean Squared Error / Standard Deviation", fontsize=20)
plt.show()

Hyper tuning XGBoost did not outperform our regular XGBoost model. Therefore, we will not be using it in our stacked or voting models going forward.

## Meta Model

### Why Use Stacked Models and Voting Models to Create a Meta Model

Stacked models and voting models are ensemble techniques that combine the predictions of multiple base models to improve overall performance. Here’s why they are beneficial:

1. **Leverage Strengths of Multiple Models:**
   - Different models have different strengths and weaknesses. By combining them, you can take advantage of the strengths of each model while mitigating their weaknesses. For instance, decision trees can handle non-linear relationships well, while linear models can be more stable with fewer parameters.

2. **Reduce Overfitting:**
   - Individual models might overfit the training data, but combining multiple models can reduce this risk. The errors of individual models may cancel each other out, leading to a more generalizable model.

3. **Improve Prediction Accuracy:**
   - Ensemble methods often achieve better performance than individual models. By aggregating the predictions of multiple models, the overall prediction accuracy is typically improved.

4. **Model Robustness:**
   - Ensemble methods can provide more robust predictions. If one model performs poorly on certain data points, the other models can compensate, leading to more stable and reliable predictions.

### Stacking Regressor

A stacking regressor combines the predictions of several base models using another model (the final estimator) to make the final prediction. This method can capture complex patterns in the data by leveraging the diverse learning algorithms of the base models.

### Voting Regressor

A voting regressor combines the predictions of multiple models by averaging their predictions (or using weighted averaging). This method is simple yet effective in combining the strengths of multiple models to achieve better overall performance.

### Combined Stacking and Voting Regressor

Combining stacking and voting regressors further enhances the model’s ability to generalize and improve prediction accuracy. The stacking regressor uses the voting regressor as its final estimator, combining the strengths of both methods to create a powerful meta-model.

By using stacked models and voting models, you aim to create a meta-model that is more accurate, robust, and capable of generalizing better to new data compared to individual models.


In [None]:
from sklearn.ensemble import StackingRegressor

# First stacking model: decision tree and random forest as base learners,
# with XGBoost combining their predictions
  
estimators = [
   ('decision_tree', dt),
   ('rf', rf),
]


sr = StackingRegressor(
   estimators=estimators,
   final_estimator=model_xgb
)

In [None]:
from sklearn.ensemble import VotingRegressor

# Voting model: weighted average of the three tree-based models.
# Weights follow the estimator order: rf=1, model_xgb=1, decision_tree=3.
# NOTE(review): the decision tree gets the largest weight — confirm this is intended.

vr = VotingRegressor([
   ('rf', rf),
   ('model_xgb', model_xgb),
   ('decision_tree', dt),
  
], weights=[1,1,3])


In [None]:
# Base learners for the second stacking model (decision tree left out)
estimators2 = [
   ('rf', rf),
   ('model_xgb', model_xgb),
   #('decision_tree', dt)
]

# Use the voting model as the final estimator that combines the base predictions

sr2 = StackingRegressor(
   estimators=estimators2,
   final_estimator=vr
)

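# More testing with the new models
Removed Nearest Neighbor, Linear (No Intercept), Linear (w/ Intercept), Elastic Net, and Decision Tree because they were the worst-performing models. (The Decision Tree is still re-evaluated below because it is a base learner in the stacking and voting models.)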

In [None]:
# Re-evaluate the three tree-based models ...
rf_list = compute_rmse_std_k_fold(nba_df, rf)
model_xgb_list = compute_rmse_std_k_fold(nba_df, model_xgb)
dt_list = compute_rmse_std_k_fold(nba_df, dt)

# ... and evaluate the three ensembles
sr_list = compute_rmse_std_k_fold(nba_df, sr)
vr_list = compute_rmse_std_k_fold(nba_df, vr)
sr2_list = compute_rmse_std_k_fold(nba_df, sr2)

#model_xgb_hyper_list = compute_rmse_std_k_fold(nba_df, model_xgb, tune_hyper=True)

# Collect (RMSE mean, RMSE std) per algorithm, sorted by mean RMSE
data = {
    "Random Forest": rf_list,
    "XGBoost": model_xgb_list,
    "Decision Tree": dt_list,
    
    "Stacking Regressor": sr_list,
    "Voting Regressor": vr_list,
    "Stacking Regressor 2": sr2_list,
    
    #"XGBoost Hyper": model_xgb_hyper_list,
}
data_df = pd.DataFrame(data=data).T.reset_index().sort_values(by=[0], ascending=True)
data_df.columns = ["Algorithm", "RMSE", "STD"]

In [None]:
# Mean RMSE and its standard deviation for the base models and ensembles, best first
data_df

In [None]:
# creating the bar plot
data_df.plot(kind="bar", x="Algorithm", y=["RMSE", "STD"], figsize=(20, 10), rot=0)
plt.xlabel("Algorithm", fontsize=20)
plt.ylabel("Root Mean Squared Error / Standard Deviation", fontsize=20)
plt.show()

#### Looks like Random Forest is still our best performing model

Let's grab the lowest rmse fold

In [None]:
# Run the cross-validation again for Random Forest and keep the full return tuple:
# (result_df, rmse_scores, train_sizes, test_sizes, fold_models)
rf_list = k_fold_regression(nba_df, rf, model_name="Random Forest")
rf_preds = rf_list[0]  # per-fold actuals, predictions and RMSE
rf_models = rf_list[4]  # list of models returned for the folds

rf_preds.head()

In [None]:
# Sort the DataFrame by the 'rmse' column in ascending order
sorted_df = rf_preds.sort_values(by='rmse', ascending=True)

# Grab the first row of the sorted DataFrame
first_row = sorted_df.iloc[0]

In [None]:
# Show the actual and predicted salaries stored for the best fold
print(first_row[["Actual", "Predicted"]])

In [None]:
# Assuming first_row is the output from the previous steps
actual_values = first_row["Actual"]
predicted_values = first_row["Predicted"]

# Plot the density plots for Actual and Predicted values
sns.kdeplot(
    data=actual_values,
    fill=True,
    common_norm=False,
    alpha=0.4,
    label="Actual"
)
sns.kdeplot(
    data=predicted_values,
    fill=True,
    common_norm=False,
    alpha=0.4,
    label="Predicted"
)

# Add labels and legend
plt.xlabel("Values")
plt.ylabel("Density")
plt.xlim((0, 700000))
plt.legend()
plt.show()

In [None]:
# Calculate the absolute difference and create a new column
first_row['Difference'] = first_row['Actual'] - first_row['Predicted']

In [None]:
# sanity check on predictions
print("min prediction: ", first_row["Predicted"].min())
print("max prediction: ", first_row["Predicted"].max())
print("max error: ", first_row["Difference"].max())
print("mean error: ", abs(first_row["Difference"]).mean())
print("median error: ", np.median(abs(first_row["Difference"])))

In [None]:
# Wrap the best fold's signed errors in a Series
bestdiffs = pd.Series(first_row['Difference'])
# plot a histogram of the differences between our actuals and predictions
bestdiffs.hist(bins=50)

Looks like there is a large outlier we are off by (~15,000,000). Let's examine this record.

In [None]:
# Print the best fold's full record (Id, Actual, Predicted, rmse, Difference)
print(first_row)

In [None]:

# Under-predictions larger than this many dollars are treated as outliers
outlier_threshold = 10000000
filtered_values = bestdiffs[bestdiffs > outlier_threshold]
if filtered_values.empty:
    # The folds are shuffled, so the best fold may contain no miss this large.
    # Fall back to the single largest under-prediction instead of raising IndexError.
    filtered_values = bestdiffs.nlargest(1)

# Take the first qualifying difference as the outlier to inspect
low_outlier_value = filtered_values.iloc[0]
print("Here is the low outlier value: " + str(low_outlier_value))

# Position of that difference within the fold
index = np.where(first_row['Difference'] == low_outlier_value)[0][0]
# Actual salary at the same position
corresponding_actual = first_row['Actual'][index]

# Print the corresponding 'Actual' value
print("The 'Actual' value corresponding to the outlier value is:", corresponding_actual)

corresponding_Predicted = first_row['Predicted'][index]

# Print the corresponding 'Predicted' value
print("The 'Predicted' value corresponding to the outlier value is:", corresponding_Predicted)

# Look up the player record(s) with that exact salary
nba_df[nba_df['Salary'] == corresponding_actual]


We could examine this record further to see whether there is something unusual about it, but we will stop here. For our purposes, we see our model performs well and our predicted and actual distributions are similar.

### Variable Importance Plot

Tree based models (Decision Trees, Random Forest, GBMs) have feature importance plots that allow you to see which features have the most impact on our model. Let's take a look at our Random Forest model that we used in our meta-model to get a sense of which features are the most important.

In [None]:
# Get the feature names
feature_names = nba_df.columns.drop(['Salary', 'Id', 'PlayerName', 'Salary_normalized'])

# Get the first model from the list
model = rf_models[0]

# Get the feature importances of the first model
feature_importances = model.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]

# Create a single plot for the first model
fig, ax = plt.subplots(figsize=(10, 6))

ax.bar(range(len(feature_importances)), feature_importances[sorted_indices])
ax.set_xticks(range(len(feature_importances)))
ax.set_xticklabels(feature_names[sorted_indices], rotation=90)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Variable Importance - Random Forest")

plt.tight_layout()
plt.show()

This is really odd because Random Forest seems to be favoring one feature way more than the others. To compare we will also create a plot for our XGBoost model below.

In [None]:
# model_xgb must already be fitted; here that happens as a side effect of the
# earlier k_fold_regression calls, which fit the estimator they are given.
# The "weight" importance type is requested from the booster.
feature_important = model_xgb.get_booster().get_score(importance_type="weight")

keys = list(feature_important.keys())
values = list(feature_important.values())

# Note: `data` is rebound here from the earlier results dict to a DataFrame of scores
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(
    by="score", ascending=False
)
data[:20].plot(kind="barh", figsize=(20, 10)).invert_yaxis()
# plot top 20 features
plt.xlabel("Feature Importance", fontsize=20)
plt.ylabel("Feature Name", fontsize=20)
plt.title("Variable Importance - XGBoost", fontsize=20)
plt.show()