## Data Importing and Pre-processing

In [None]:
# import libraries needed
import pandas as pd

# None removes the column limit, so wide DataFrames are displayed with every column
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew, probplot
from scipy.special import boxcox1p
import warnings
from datetime import datetime

# suppress all warnings for the rest of the session
# NOTE(review): this blanket filter hides every warning category, so the
# pandas-specific FutureWarning filter on the next line looks redundant — confirm
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.*")
%matplotlib inline

In [None]:
# load the salary/stat table from disk, then display its (rows, columns) shape
salary_csv = "full_nba_salaries.csv"
nba_df = pd.read_csv(salary_csv)
nba_df.shape

In [None]:
# preview the first five rows of the freshly loaded table
nba_df.head(n=5)

In [None]:
# Tidy the column set in one pass:
#   - 'Lg_x' / 'Lg_y': league columns; every player here is in the NBA, so
#     stating that (twice) adds no information
#   - 'Team': dropped because 'Tm' already carries the team
#   - 'Unnamed: 0' is renamed to 'Id'
nba_df = (
    nba_df
    .drop(columns=['Lg_x', 'Lg_y', 'Team'])
    .rename(columns={"Unnamed: 0": "Id"})
)

In [None]:
# Print the dtype of every column. Some columns that should be numeric show up
# as categorical (object) here and are converted in the cells that follow.
column_dtypes = nba_df.dtypes
print(column_dtypes)

In [None]:
# columns to convert: everything from 'MP' through 'PTS' (label slice, both
# ends included), plus 'G' and 'GS'
col_to_be_floats = [*nba_df.loc[:, 'MP':'PTS'].columns, 'G', 'GS']

# parse each column as numeric; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising
for stat_col in col_to_be_floats:
    nba_df[stat_col] = pd.to_numeric(nba_df[stat_col], errors='coerce')

In [None]:
# Turn the 'Salary' text into an integer amount. All replacements are literal
# (regex=False), applied in this order:
#   1. strip the dollar sign
#   2. strip the thousands separators
#   3. strip the '(TW)' marker, which stands for a two-way contract
# and finally cast the remaining digits to int.
nba_df['Salary'] = (
    nba_df['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('(TW)', '', regex=False)
    .astype(int)
)

In [None]:
# count the categorical variables, i.e. the columns whose dtype is "object"
category_count = sum(dtype == "object" for dtype in nba_df.dtypes)

print("Number of categorical variables:", category_count)

# every other column is counted as continuous, except the 'Id' column,
# which is only an identifier — hence the extra "- 1"
numeric_count = nba_df.shape[1] - category_count - 1

print("Number of continuous variables:", numeric_count)

In [None]:
# preview the first five rows after the type conversions
nba_df.head(n=5)

### Handling missing data

In [None]:
# Some rows are placeholders labelled "Did not play (injury/illness)". They
# carry no usable stats, so remove them.
# na=False makes a missing 'Pos' count as "not a placeholder"; without it the
# mask would contain NaN and the `~` negation below would raise.
dnp_mask = nba_df['Pos'].str.startswith("Did ", na=False)
# .copy() gives an independent frame, so the column assignments in later cells
# modify nba_df itself rather than a slice of the unfiltered table
nba_df = nba_df[~dnp_mask].copy()

In [None]:
# Some of the Awards data are null when they should just be marked as 'None'.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on a temporary object and does not
# update nba_df when pandas Copy-on-Write is active.
nba_df['Awards'] = nba_df['Awards'].fillna('None')

In [None]:
# Build a per-column summary of missing data: the number of null cells and the
# fraction of the column they represent (a 0-1 proportion), both ordered from
# most to least missing.
null_flags = nba_df.isnull()
missing_per_column = null_flags.sum()

total_missing = missing_per_column.sort_values(ascending=False)
percent_missing = (missing_per_column / null_flags.count()).sort_values(ascending=False)

missing_data_df = pd.concat(
    [total_missing, percent_missing],
    axis="columns",
    keys=["Total Missing", "Percent Missing"],
)
missing_data_df

In [None]:
# horizontal bar chart of the first ten rows of the missing-data table
top_missing = missing_data_df["Percent Missing"].head(10)
missing_ax = top_missing.plot(kind="barh", figsize=(20, 10))
# barh draws the first row at the bottom; flip the axis so it appears on top
missing_ax.invert_yaxis()
missing_ax.set_xlabel("Missing Proportion")
missing_ax.set_ylabel("Variable Name")
missing_ax.set_title("Top 10 Proportion of Missing Data In Columns")
plt.show()

In [None]:
# inspect the rows whose games-started value ('GS') is missing
null_gs = nba_df.loc[nba_df['GS'].isna()]
null_gs.head(n=5)

In [None]:
# Games started was never recorded for some rows (Magic Johnson's seasons, per
# the inspection above). Wherever 'GS' is missing, use that row's games played
# ('G') instead; this applies to every row with a missing 'GS'.
gs_missing = nba_df['GS'].isna()
nba_df['GS'] = nba_df['GS'].mask(gs_missing, nba_df['G'])

## Data Analysis and Visualization

In [None]:
# Keep the seasons from 2015-16 through 2021-22, both ends included.
# 'Season' is compared against string bounds here (labels such as '2015-16').
season_in_range = nba_df['Season'].between('2015-16', '2021-22')
filtered_df = nba_df[season_in_range]

# All numerical stat values for a player, plus the salary itself
columns_to_analyze = [
    'Salary', 'Age', 'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Pairwise correlation between every pair of the selected columns
correlation_matrix = filtered_df[columns_to_analyze].corr()

# Correlation of each column with salary, strongest positive first
salary_correlations = correlation_matrix['Salary'].sort_values(ascending=False)
print(salary_correlations)

In [44]:
# Replace the season label (e.g. '2015-16') with its start year as an int.
# Going through astype(str) keeps the cell re-runnable: once the column holds
# ints, a second run turns 2015 into '2015' and back into 2015, whereas calling
# .split() directly on an int would raise.
nba_df['Season'] = (
    nba_df['Season']
    .astype(str)
    .str.split('-')
    .str[0]
    .astype(int)
)

### Encode categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

# categorical columns to turn into integer codes
# TODO: "Awards" is left out until there is a better encoding for it
cols = ("Tm", "Pos")

# fit a fresh LabelEncoder on each column and overwrite the column with its codes
for c in cols:
    lbl = LabelEncoder()
    nba_df[c] = lbl.fit_transform(list(nba_df[c].values))

# shape
print(f"Shape nba_df: {nba_df.shape}")

## Data Analytics

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

### Redundant or Correlated Features
It is clear that all percentage statistics are redundant: they have a low correlation value and can already be derived from the raw counts of shots made and shots attempted. We will remove these columns from the data when feeding it into our model.

We will also be removing Offensive Rebounds (ORB) and Defensive Rebounds (DRB) and only keep the Total Rebounds (TRB). The former stats are included in the latter so having both is redundant as the features are correlated.

We can also remove Games (G) and Games Started (GS) because this data can be reduced to Minutes Played (MP) which has a greater total correlation.

In [None]:
# Select relevant features and avoid redundancy
selected_features = ['PTS', 'FG', 'FGA', 'FT', 'FTA', '2P', '2PA', '3P', '3PA', 'MP', 'TRB', 'TOV', 'AST', 'STL', 'PF', 'BLK', 'AGE']

# Use the selected features for modeling
X = filtered_df[selected_features]
y = filtered_df['Salary']
