## Data Importing and Pre-processing

In [None]:
# import libraries needed
import pandas as pd

pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew, probplot
from scipy.special import boxcox1p
import warnings
from datetime import datetime

# Silence every warning for the rest of the notebook so cell output stays clean.
# NOTE(review): this blanket filter also hides deprecation notices from the
# plotting and data libraries used below.
warnings.filterwarnings("ignore")
# Explicit rule for pandas FutureWarnings; the blanket filter above already covers it.
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.*")
%matplotlib inline

In [None]:
# load the salary data set from the CSV file (path is relative to the notebook's
# working directory) into a DataFrame
nba_df = pd.read_csv("full_nba_salaries.csv")
# display (number of rows, number of columns)
nba_df.shape

In [None]:
# preview the first rows of the freshly loaded data
nba_df.head()

In [None]:
# Tidy up the columns in one chained pass:
#   * 'Lg_x' and 'Lg_y' only repeat the league, and every player here is in the NBA
#   * 'Team' is dropped because 'Tm' already carries that information
#   * 'Unnamed: 0' is renamed to 'Id'
nba_df = (
    nba_df
    .drop(columns=['Lg_x', 'Lg_y', 'Team'])
    .rename(columns={"Unnamed: 0": "Id"})
)

In [None]:
# list the dtype pandas assigned to every column
print(nba_df.dtypes)

# some variables show up as categorical here even though they should be numeric

In [None]:
# columns to convert to numeric: every column from 'MP' through 'PTS'
# (label slice, both ends included), plus 'G' and 'GS'
col_to_be_floats = list(nba_df.loc[:, 'MP':'PTS'].columns) + ['G', 'GS']

# errors='coerce' turns any entry that cannot be parsed as a number into NaN
# instead of raising
for column in col_to_be_floats:
    nba_df[column] = pd.to_numeric(nba_df[column], errors='coerce')

In [None]:
# Clean the 'Salary' column and store it as integers.
#
# The column is cast to str first so this cell can be re-run safely: after the
# first run 'Salary' is already an integer column, and the .str accessor would
# raise AttributeError on it.
#
# Removed pieces (regex=False so '$', '(' and ')' are matched literally):
#   '$'    - dollar sign
#   ','    - thousands separator
#   '(TW)' - marker for a two-way contract
# Leftover surrounding whitespace is stripped before the integer conversion.
# A missing salary still raises ValueError at the final astype(int).
nba_df['Salary'] = (
    nba_df['Salary']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('(TW)', '', regex=False)
    .str.strip()
    .astype(int)
)

In [None]:
# categorical variables are the columns stored with the "object" dtype
category_count = sum(1 for dtype in nba_df.dtypes if dtype == "object")
print("Number of categorical variables:", category_count)

# every remaining column is counted as continuous, except the ID column,
# which is why 1 is subtracted
numeric_count = len(nba_df.columns) - category_count - 1
print("Number of contineous variables:", numeric_count)

In [None]:
# preview the first rows again after the column clean-up and type conversions above
nba_df.head()

### Handling missing data

In [None]:
# per column: number of missing values, largest first
total_missing = nba_df.isna().sum().sort_values(ascending=False)

# per column: missing values as a fraction (0-1) of all rows, largest first
percent_missing = (nba_df.isna().sum() / len(nba_df)).sort_values(ascending=False)

# put both measures side by side, one row per column of nba_df
missing_data_df = pd.concat(
    {"Total Missing": total_missing, "Percent Missing": percent_missing},
    axis=1,
)
missing_data_df

In [None]:
# Some rows are labelled "Did not play (injury/illness)" instead of holding
# statistics. They carry no usable data, so they are removed.
#
# na=False treats a missing 'Pos' as "not a did-not-play row". Without it the
# mask can contain NaN for those rows, and such a mask cannot be inverted with
# `~` or used as a boolean indexer.
dnp_mask = nba_df['Pos'].str.startswith("Did ", na=False)
nba_df = nba_df[~dnp_mask]

In [None]:
# horizontal bar chart of the first 10 rows of the missing-data table
# NOTE(review): missing_data_df was built before the "Did not play" rows were
# dropped, so this chart reflects the data prior to that filter - confirm that
# is intended, or recompute the table first.
ax = missing_data_df["Percent Missing"].head(10).plot.barh(figsize=(20, 10))
ax.invert_yaxis()  # draw the first row at the top instead of the bottom
ax.set_xlabel("Missing Proportion")
ax.set_ylabel("Variable Name")
ax.set_title("Top 10 Proportion of Missing Data In Columns")
plt.show()

## Data Analysis and Visualization

In [None]:
# pairwise scatterplot grid of the selected statistics columns
cols = [
    "GS",
    "MP",
    "FG",
    "FG%",
    "3P",
    "3P%",
    "eFG%",
    "FT",
    "FT%",
    "TRB",
    "AST",
    "STL",
    "BLK",
    "TOV",
    "PF",
    "PTS",
]
# `height` is the size of each facet in inches. It is the current name of the
# argument formerly called `size`, which seaborn renamed in v0.9 and deprecated.
sns.pairplot(nba_df[cols], height=2.5)
plt.show()