# NBA Awards Predictor 
Part 2: Data Cleaning

Author: Abhi Vellore
Inspired By: Dataquest Web Scraping NBA Stats With Python: Data Project [Part 2 of 3] and https://github.com/JustinGong03/nba-awards-predictor

Some portions are adapted from the below sources. 
https://www.youtube.com/watch?v=LobWMsz35NM
https://github.com/JustinGong03/nba-awards-predictor
Accessed 2023. 


In part 1 of our project, we created CSV files with a variety of NBA statistics. To eventually build models, hoewver, we need to clean this disconnected data and manipulate it into datasets we can use to create predictions. Hence, we will use Pandas in order to extensively clean the data as well and construct new features for certain awards. Part 3 will then be focused on exploratory data analysis (EDA) to eventually build our machine learning models.


Table of Contents

1. Player Dataset
2. Award Datasets
3. Team Datasets
4. Putting it Together
5. Additional feature Engineering
6. Saving Datasets

In [461]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

1. Player Datasets

In [462]:
#specifying features to avoid overlapping data and filter for relevant data. Will later rename per_36 features to prevent overlap
normal_features = ["Player","Pos", "Age", "Tm", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", 
                  "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", 
                  "PF", "PTS", "Year"]
per_features = ["Player", "Tm", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "FT", "FTA", 
                  "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Year"]
adv_features = ["Player", "Tm", "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%",
                  "TOV%", "USG%",  "OWS", "DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP", "Year"]

In [463]:
# Import all players from CSVs, cleaning extraneous features
players = pd.read_csv("data/players.csv")[normal_features]
players36 = pd.read_csv("data/players36.csv")[per_features]
playersAdv = pd.read_csv("data/playersAdv.csv")[adv_features]

# Remove asteriks from certain players. Regex to not replace regular expressions as well
players["Player"] = players["Player"].str.replace("*", "", regex=False)
players36["Player"] = players36["Player"].str.replace("*", "", regex=False)
playersAdv["Player"] = playersAdv["Player"].str.replace("*", "", regex=False)

players.head(10)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,0.424,...,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4,2000
1,Tariq Abdul-Wahad,SG,25,ORL,46,46,26.2,4.8,11.2,0.433,...,1.7,3.5,5.2,1.6,1.2,0.3,1.9,2.5,12.2,2000
2,Tariq Abdul-Wahad,SG,25,DEN,15,10,24.9,3.4,8.7,0.389,...,1.6,1.9,3.5,1.7,0.4,0.8,1.3,2.1,8.9,2000
3,Shareef Abdur-Rahim,SF,23,VAN,82,82,39.3,7.2,15.6,0.465,...,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3,2000
4,Cory Alexander,PG,26,DEN,29,2,11.3,1.0,3.4,0.286,...,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8,2000
5,Ray Allen,SG,24,MIL,82,82,37.4,7.8,17.2,0.455,...,1.0,3.4,4.4,3.8,1.3,0.2,2.2,2.3,22.1,2000
6,Rafer Alston,PG,23,MIL,27,0,13.4,1.0,3.5,0.284,...,0.2,0.7,0.9,2.6,0.4,0.0,1.1,1.1,2.2,2000
7,John Amaechi,C,29,ORL,80,53,21.1,3.8,8.8,0.437,...,0.8,2.6,3.3,1.2,0.4,0.5,1.7,2.0,10.5,2000
8,Derek Anderson,SG,25,LAC,64,58,34.4,5.9,13.4,0.438,...,1.3,2.8,4.0,3.4,1.4,0.2,2.6,2.3,16.9,2000
9,Kenny Anderson,PG,29,BOS,82,82,31.6,5.3,12.0,0.44,...,0.7,2.1,2.7,5.1,1.7,0.1,1.6,2.8,14.0,2000


In [464]:
# Group traded players to prevent duplicates per season
def single_row(df):
   if df.shape[0] == 1:
      return df
   else:
      row = df[df["Tm"] == "TOT"]
      row["Tm"] = df.iloc[-1,:]["Tm"]
      return row

def mutate_players(df):
   df = df.groupby(["Player", "Year"]).apply(single_row)
   # Rid of the two duplicate groupby levels
   df.index = df.index.droplevel()
   df.index = df.index.droplevel()
   return df

# Apply to each dataframe
players = mutate_players(players)
players36 = mutate_players(players36)
playersAdv = mutate_players(playersAdv)

players


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
178,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,0.447,...,2.0,4.0,5.9,1.0,0.6,0.2,0.6,1.5,5.0,2000
677,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,0.444,...,1.3,2.5,3.8,0.5,0.4,0.1,0.5,1.5,4.5,2001
56,A.J. Bramlett,C,23,CLE,8,0,7.6,0.5,2.6,0.190,...,1.5,1.3,2.8,0.0,0.1,0.0,0.4,1.6,1.0,2000
13967,A.J. Green,SG,23,MIL,35,1,9.9,1.5,3.6,0.424,...,0.2,1.1,1.3,0.6,0.2,0.0,0.3,0.9,4.4,2023
680,A.J. Guyton,PG,22,CHI,33,8,19.1,2.4,5.8,0.406,...,0.3,0.8,1.1,1.9,0.3,0.2,0.7,1.1,6.0,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,Željko Rebrača,C,29,DET,74,4,15.9,2.6,5.1,0.505,...,1.1,2.8,3.9,0.5,0.4,1.0,1.1,2.6,6.9,2002
1893,Željko Rebrača,C,30,DET,30,12,16.3,2.7,4.8,0.552,...,0.9,2.2,3.1,0.3,0.2,0.6,1.0,2.6,6.6,2003
2453,Željko Rebrača,C,31,ATL,24,2,11.4,1.4,3.2,0.442,...,1.0,1.5,2.4,0.3,0.2,0.5,0.7,2.2,3.8,2004
3031,Željko Rebrača,C,32,LAC,58,2,16.0,2.3,4.0,0.568,...,0.8,2.3,3.2,0.4,0.2,0.7,0.8,2.2,5.8,2005


In [465]:
# Prepare certain per36 stats to be combined with normal game stats

per_features = ["FG", "FGA", "FG%", "3P", "3PA", "3P%", 
                  "2P", "2PA", "2P%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", 
                  "PF", "PTS"]

suffix = '_per36'

# Create a dictionary to map old column names to new column names with the suffix
new_column_names = {col: col + suffix for col in per_features}
# Rename the columns in the subset using the rename() function
players36 = players36.rename(columns=new_column_names) # type: ignore

players36



Unnamed: 0,Player,Tm,FG_per36,FGA_per36,FG%_per36,3P_per36,3PA_per36,3P%_per36,2P_per36,2PA_per36,...,ORB_per36,DRB_per36,TRB_per36,AST_per36,STL_per36,BLK_per36,TOV_per36,PF_per36,PTS_per36,Year
178,A.C. Green,LAL,4.6,10.3,0.447,0.0,0.1,0.250,4.6,10.2,...,4.3,8.7,13.0,2.1,1.4,0.5,1.4,3.4,11.0,2000
677,A.C. Green,MIA,5.6,12.7,0.444,0.0,0.2,0.000,5.6,12.4,...,4.2,8.0,12.2,1.5,1.2,0.3,1.8,4.6,14.3,2001
56,A.J. Bramlett,CLE,3.3,17.3,0.190,0.0,0.0,,3.3,17.3,...,9.9,8.2,18.1,0.0,0.8,0.0,2.5,10.7,6.6,2000
13967,A.J. Green,MIL,7.3,17.3,0.424,6.1,14.5,0.419,1.2,2.8,...,0.8,5.4,6.2,3.0,0.8,0.0,1.2,4.3,21.3,2023
680,A.J. Guyton,CHI,6.7,16.4,0.406,2.3,5.9,0.391,4.4,10.5,...,0.9,2.2,3.1,5.5,0.8,0.4,2.0,3.0,16.9,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,Željko Rebrača,DET,8.5,16.9,0.505,0.0,0.0,,8.5,16.9,...,3.8,9.3,13.1,1.7,1.3,3.3,3.8,8.7,23.2,2002
1893,Željko Rebrača,DET,9.1,16.4,0.552,0.0,0.0,,9.1,16.4,...,3.1,7.4,10.4,1.0,0.7,1.9,3.3,9.0,22.4,2003
2453,Željko Rebrača,ATL,6.8,15.3,0.442,0.0,0.0,,6.8,15.3,...,4.6,7.0,11.5,1.2,1.0,2.2,3.4,10.3,18.1,2004
3031,Željko Rebrača,LAC,7.7,13.6,0.568,0.0,0.0,,7.7,13.6,...,2.8,7.8,10.7,1.5,0.8,2.3,2.8,7.5,19.7,2005


In [466]:
#merging datasets

player_stats = players.merge(players36, how="outer", on=["Player", "Year", "Tm"])
player_stats = player_stats.merge(playersAdv, how="outer", on=["Player", "Year", "Tm"])


In [467]:
# Map Team Names to be the same
nicknames = {}
with open("nicknames.csv") as f:
   lines = f.readlines()
   for line in lines[1:]:
      abbrev,name = line.replace("\n","").split(",")
      nicknames[abbrev] = name

player_stats["Team"] = player_stats["Tm"].map(nicknames)
player_stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Team
0,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,0.447,...,11.0,1.7,3.3,5.0,0.124,-1.1,0.8,-0.3,0.8,Los Angeles Lakers
1,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,0.444,...,14.4,1.1,2.1,3.2,0.110,-2.2,0.0,-2.2,-0.1,Miami Heat
2,A.J. Bramlett,C,23,CLE,8,0,7.6,0.5,2.6,0.190,...,17.1,-0.2,0.1,-0.2,-0.129,-9.2,-6.7,-15.9,-0.2,Cleveland Cavaliers
3,A.J. Green,SG,23,MIL,35,1,9.9,1.5,3.6,0.424,...,16.6,0.5,0.3,0.8,0.111,-0.3,-0.6,-0.9,0.1,Milwaukee Bucks
4,A.J. Guyton,PG,22,CHI,33,8,19.1,2.4,5.8,0.406,...,16.5,0.4,-0.1,0.3,0.020,-0.8,-3.2,-4.0,-0.3,Chicago Bulls
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11514,Željko Rebrača,C,29,DET,74,4,15.9,2.6,5.1,0.505,...,21.4,1.4,1.9,3.3,0.134,-2.9,0.6,-2.3,-0.1,Detroit Pistons
11515,Željko Rebrača,C,30,DET,30,12,16.3,2.7,4.8,0.552,...,19.2,0.7,0.7,1.3,0.133,-2.8,0.2,-2.6,-0.1,Detroit Pistons
11516,Željko Rebrača,C,31,ATL,24,2,11.4,1.4,3.2,0.442,...,18.2,0.1,0.4,0.5,0.097,-4.5,0.6,-3.9,-0.1,Atlanta Hawks
11517,Željko Rebrača,C,32,LAC,58,2,16.0,2.3,4.0,0.568,...,16.0,1.4,0.9,2.4,0.122,-1.7,0.4,-1.3,0.2,Los Angeles Clippers


2. Simplify Award Data and Combine Each Award

In [468]:
# Read all Award CSVs and only keep essential information
mvps = pd.read_csv("data/mvps.csv")
dpoys = pd.read_csv("data/dpoys.csv")
mips = pd.read_csv("data/mips.csv")
smoys = pd.read_csv("data/smoys.csv")

features = ["Player", "Year", "Pts Won", "Pts Max", "Share"]

mvps = mvps[features]
dpoys = dpoys[features]
mips = mips[features]
smoys = smoys[features]



In [469]:
# Merge player and award data; replace NAs with 0s
mvp_df = mvps.merge(player_stats, how="outer", on=["Player", "Year"])
dpoy_df = dpoys.merge(player_stats, how="outer", on=["Player", "Year"])
mip_df = mips.merge(player_stats, how="outer", on=["Player", "Year"])



mvp_df[["Pts Won", "Pts Max", "Share"]] = mvp_df[["Pts Won", "Pts Max", "Share"]].fillna(0)
dpoy_df[["Pts Won", "Pts Max", "Share"]] = dpoy_df[["Pts Won", "Pts Max", "Share"]].fillna(0)
mip_df[["Pts Won", "Pts Max", "Share"]] = mip_df[["Pts Won", "Pts Max", "Share"]].fillna(0)

mvp_df = mvp_df[mvp_df["Player"] != "Player"]
dpoy_df = dpoy_df[dpoy_df["Player"] != "Player"]
mip_df = mip_df[mip_df["Player"] != "Player"]




In [470]:
# Factoring in additional rules for the SMOY award in terms of number of games started
possible_smoy = player_stats[player_stats["GS"] < (player_stats["G"] / 2)]


smoy_df = smoys.merge(possible_smoy, how="outer", on=["Player", "Year"])
smoy_df[["Pts Won", "Pts Max", "Share"]] = smoy_df[["Pts Won", "Pts Max", "Share"]].fillna(0)

smoy_df = smoy_df[smoy_df["Player"] != "Player"]


In [471]:
# Major awards require a certain number of games to be played. The new NBA CBA requires 65 games 
# to be played in order to qualify for any award. However, for historical data, we choose a lower threshold
# of 42 games, or half the season. Clean our player data to rid of these extraneous players

# Define the minimum number of games required
minimum_games_played = 42

# Create a boolean mask for rows to keep and filter
mask = mvp_df['G'] >= minimum_games_played
mvp_df = mvp_df[mask]
mask = dpoy_df['G'] >= minimum_games_played
dpoy_df = dpoy_df[mask]
mask = mip_df['G'] >= minimum_games_played
mip_df = mip_df[mask]
mask = smoy_df['G'] >= minimum_games_played
smoy_df = smoy_df[mask]



3. Team Information. Combine 4 Datasets Together

In [472]:
# Specifying Features per Table
normal_features = ["Rk", "Team",	"FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%",	"FT", "FTA", 
                     "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Year"]
defense_features = ["Team", "FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%", "FT", "FTA", 
                     "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Year"]
advanced_features = ["Team", "Age", "W", "L", "PW", "PL","MOV", "SOS", "SRS",	"ORtg", "DRtg", "NRtg",	"Pace",
                     "FTr", "3PAr",	"TS%", "eFG%", "TOV%", "ORB%", "FT/FGA", "eFG%.1", "TOV%.1", "DRB%", 
                     "FT/FGA.1", "Attend./G", "Year"]
shooting_features = ["Team", "Dist.", "0-3",	"3-10", "10-16","16-3P", "0-3.1", "3-10.1", "10-16.1", "16-3P.1", "Year"]

In [473]:
teams = pd.read_csv("data/teamAllStats.csv")[normal_features]
defense = pd.read_csv("data/teamDefense.csv")[defense_features]
teamsAdvanced = pd.read_csv("data/teamAdvanced.csv")[advanced_features]
shooting = pd.read_csv("data/shooting.csv")[shooting_features]

In [474]:
# Clean Team Names
teams["Team"] = teams["Team"].str.replace("*", "", regex=False)
defense["Team"] = defense["Team"].str.replace("*", "", regex=False)
teamsAdvanced["Team"] = teamsAdvanced["Team"].str.replace("*", "", regex=False)
shooting["Team"] = shooting["Team"].str.replace("*", "", regex=False)

In [475]:
# Rename Variables to be combined

defense_features = ["FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%", "FT", "FTA", 
                     "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]

suffix = '_defense'

# Create a dictionary to map old column names to new column names with the suffix
new_column_names = {col: col + suffix for col in defense_features}
# Rename the columns in the subset using the rename() function
defense = defense.rename(columns=new_column_names)

normal_features = ["FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%", "FT", "FTA", 
                     "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]
suffix = '_team'
new_column_names = {col: col + suffix for col in normal_features}
teams = teams.rename(columns=new_column_names)

advanced_features = ["Age", "W", "L", "PW", "PL","MOV", "SOS", "SRS",	"ORtg", "DRtg", "NRtg",	"Pace",
                     "FTr", "3PAr",	"TS%", "eFG%", "TOV%", "ORB%", "FT/FGA", "eFG%.1", "TOV%.1", "DRB%", 
                     "FT/FGA.1", "Attend./G"]
new_column_names = {col: col + suffix for col in advanced_features}
teamsAdvanced = teamsAdvanced.rename(columns=new_column_names)

shooting_features = ["Dist.", "0-3", "3-10", "10-16","16-3P", "0-3.1", "3-10.1", "10-16.1", "16-3P.1"]
new_column_names = {col: col + suffix for col in shooting_features}
teamsAdvanced = teamsAdvanced.rename(columns=new_column_names)



In [476]:
allTeam = teams.merge(defense, how="outer", on = ["Team", "Year"])
allTeam = allTeam.merge(teamsAdvanced, how="outer", on = ["Team", "Year"])
allTeam = allTeam.merge(shooting, how="outer", on = ["Team", "Year"])

allTeam = allTeam[allTeam["Team"] != "Team"]
allTeam = allTeam[allTeam["Team"] != "League Average"]


4. Combining everything together and checking for Accuracy

In [477]:
final_mvp = mvp_df.merge(allTeam, how="outer", on = ["Team", "Year"])
final_dpoy = dpoy_df.merge(allTeam, how="outer", on = ["Team", "Year"])
final_mip = mip_df.merge(allTeam, how="outer", on = ["Team", "Year"])
final_smoy = smoy_df.merge(allTeam, how="outer", on = ["Team", "Year"])

In [478]:
# Convert as many columns as possible to integers for easier use
final_mvp = final_mvp.apply(pd.to_numeric, errors="ignore")
final_dpoy = final_dpoy.apply(pd.to_numeric, errors="ignore")
final_mip = final_mip.apply(pd.to_numeric, errors="ignore")
final_smoy = final_smoy.apply(pd.to_numeric, errors="ignore")

Inspecting Data for Accuracy

In [479]:
print("MVP: ", final_mvp.shape)
print("DPOY: ", final_dpoy.shape)
print("MIP: ", final_mip.shape)
print("SMOY: ", final_smoy.shape)
print("Team Statistics: ", allTeam.shape)

MVP:  (7767, 151)
DPOY:  (7767, 151)
MIP:  (7767, 151)
SMOY:  (4053, 151)
Team Statistics:  (715, 78)


In [480]:
# Checking a few Specific Players over multiple years
# Citation: Code Adapted from 
# https://github.com/JustinGong03/nba-awards-predictor/blob/master/notebooks/nba_awards_predictor_part1.ipynb
# Access 2023

columns = ["Player", "Team", "PTS", "Year", "Share"]
print("MVP:")
print((final_mvp[(final_mvp["Player"] == "Nikola Jokić") & (final_mvp["Year"] > 2019)][columns]), "\n")
print("DPOY:")
print((final_dpoy[(final_dpoy["Player"] == "Marcus Smart") & (final_dpoy["Year"] > 2019)][columns]), "\n")
print("MIP:")
print((final_mip[(final_mip["Player"] == "Shai Gilgeous-Alexander") & (final_mip["Year"] > 2019)][columns]), "\n")
print("SMOY:")
print((final_smoy[(final_smoy["Player"] == "Montrezl Harrell") & (final_smoy["Year"] > 2019)][columns]), "\n")


MVP:
            Player            Team   PTS  Year  Share
2683  Nikola Jokić  Denver Nuggets  19.9  2020  0.018
2728  Nikola Jokić  Denver Nuggets  26.4  2021  0.961
2868  Nikola Jokić  Denver Nuggets  27.1  2022  0.875
3007  Nikola Jokić  Denver Nuggets  24.5  2023  0.674 

DPOY:
            Player            Team   PTS  Year  Share
2834  Marcus Smart  Boston Celtics  12.9  2020  0.014
2957  Marcus Smart  Boston Celtics  12.1  2022  0.514
3432  Marcus Smart  Boston Celtics  13.1  2021  0.000
3790  Marcus Smart  Boston Celtics  11.5  2023  0.000 

MIP:
                       Player                   Team   PTS  Year  Share
4088  Shai Gilgeous-Alexander  Oklahoma City Thunder  19.0  2020  0.042
4461  Shai Gilgeous-Alexander  Oklahoma City Thunder  31.4  2023  0.578
4774  Shai Gilgeous-Alexander  Oklahoma City Thunder  24.5  2022  0.000 

SMOY:
                Player                  Team   PTS  Year  Share
1431  Montrezl Harrell  Los Angeles Clippers  18.6  2020  0.794
1493  Montrezl H

5. Additional Feature Engineering

Normally, exploratory data analysis is done prior to feature engineering. However, through domain knowledge, I know that some additional statistics may be useful, especially in regards to the MIP award, which requires improvement over years. We engineer those features here prior to EDA.

Most features will be branched off of a combination of team and player data. 

1. MVP Data

Our MVP data already accounts for features such as stats_per_36 minute, team performance, and even team defense numbers. 

2. DPOY Data

One of the biggest factors in the DPOY award is team defense, which is accounted for here. By scraping the advanced player statistics, we also have factors such as STL%, BLK%, and Defense +-. 

3. SMOY Data

Similar features required as for MVP, except for players who do not start at least half of their games. We have already factored all of this in.

4. MIP Data

Most Improved Player, however, requires year-over-year improvement by a player, something we do not currently consider.Hence, we will engineer statistics on improvements from previous years.

We'll include both the numerical change and the percent change for a variety of counting statistics. We'll also engineer features on team improvement from year to year.

In [481]:
# Calculate numerical and percent change in counting stats for a player

# Citation: Adapted from
# https://github.com/JustinGong03/nba-awards-predictor/blob/master/notebooks/nba_awards_predictor_part1.ipynb
# Accessed 2023


# Features that should be calculated
change_features = ["GS", "MP", "TS%", "TRB", "AST", "STL", "BLK", "PTS", "eFG%", "FTA"]

def percent_change(df):


   player = df["Player"]
   year = df["Year"]

   prev_year = final_mip[(final_mip["Player"] == player) & (final_mip["Year"] == year - 1)]

   # if prev_year.empty:
   #    return null

   if prev_year.empty:
      return [0] * 11

   def perc(x, y):
      try:
         return (x - y) / y
      except ZeroDivisionError:
         return x - y

   #calculates percent change for each statistic
   GS = perc(df["GS"], prev_year["GS"]).values[0]
   MP = perc(df["MP"], prev_year["MP"]).values[0]
   TS = perc(df["TS%"], prev_year["TS%"]).values[0]
   TRB = perc(df["TRB"], prev_year["TRB"]).values[0]
   AST = perc(df["AST"], prev_year["AST"]).values[0]
   STL = perc(df["STL"], prev_year["STL"]).values[0]
   BLK = perc(df["BLK"], prev_year["BLK"]).values[0]
   PTS = perc(df["PTS"], prev_year["PTS"]).values[0]
   EFG = perc(df["eFG%"], prev_year["eFG%"]).values[0]
   FTA = perc(df["FTA"], prev_year["FTA"]).values[0]
   agg = GS + MP + TS + TRB + AST + STL + BLK + PTS + EFG + FTA

   return [GS, MP, TS, TRB, AST, STL, BLK, PTS, EFG, FTA, agg]



In [483]:
# Percentage Differences across counting stats
pct = final_mip.apply(percent_change, axis = 1, result_type="expand")
pct.columns = ["GS - %", "MP - %", "TS - %", "TRB - %", "AST - %", "STL - %", "BLK - %", "PTS - %", "EFG - %", 
                  "FTA - %", "AGG"]
pct

Unnamed: 0,GS - %,MP - %,TS - %,TRB - %,AST - %,STL - %,BLK - %,PTS - %,EFG - %,FTA - %,AGG
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
7762,0.600000,-0.256318,-0.062500,-0.215686,-0.285714,-0.333333,-0.500000,-0.365217,-0.043750,-0.304348,-1.766867
7763,27.000000,0.661017,0.088235,0.459459,1.000000,0.000000,0.333333,0.953488,0.077079,1.214286,31.786898
7764,0.012346,-0.004975,0.107900,-0.062500,-0.089888,-0.062500,0.000000,0.074257,0.092873,0.240741,0.308253
7765,0.046154,-0.138801,0.003745,-0.312500,-0.210526,-0.400000,-0.333333,-0.183673,0.014257,-0.272727,-1.787406


In [482]:
# Citation: Adapted from
# https://github.com/JustinGong03/nba-awards-predictor/blob/master/notebooks/nba_awards_predictor_part1.ipynb
# Accessed 2023

def quantitative_change(df):
   player = df["Player"]
   year = df["Year"]
   
   prev_year = final_mip[(final_mip["Player"] == player) & (final_mip["Year"] == year - 1)]

   if prev_year.empty:
      return [0] * 10

   def calc(x, y):
      return (x - y).values[0]
   
   #calculates quantitaive change for desired variables
   GS = calc(df["GS"], prev_year["GS"])
   MP = calc(df["MP"], prev_year["MP"])
   TS = calc(df["TS%"], prev_year["TS%"])
   TRB = calc(df["TRB"], prev_year["TRB"])
   AST = calc(df["AST"], prev_year["AST"])
   STL = calc(df["STL"], prev_year["STL"])
   BLK = calc(df["BLK"], prev_year["BLK"])
   PTS = calc(df["PTS"], prev_year["PTS"])
   EFG = calc(df["eFG%"], prev_year["eFG%"])
   FTA = calc(df["FTA"], prev_year["FTA"])

   return [GS, MP, TS, TRB, AST, STL, BLK, PTS, EFG, FTA]   

In [484]:
# Quantitative Statistics (Pure Differences)
qt = final_mip.apply(quantitative_change, axis = 1, result_type="expand")
qt.columns = ["GS - ch", "MP - ch", "TS - ch", "TRB - ch", "AST - ch", "STL - ch", "BLK - ch", "PTS - ch", "EFG - ch",
                  "FTA - ch"]
qt

Unnamed: 0,GS - ch,MP - ch,TS - ch,TRB - ch,AST - ch,STL - ch,BLK - ch,PTS - ch,EFG - ch,FTA - ch
0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
1,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
2,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
3,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
4,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...
7762,6.0,-7.1,-0.032,-1.1,-0.4,-0.2,-0.3,-4.2,-0.021,-0.7
7763,27.0,7.8,0.048,1.7,0.3,0.0,0.1,4.1,0.038,1.7
7764,1.0,-0.2,0.056,-0.2,-0.8,-0.1,0.0,1.5,0.043,1.3
7765,3.0,-4.4,0.002,-1.5,-0.4,-0.4,-0.1,-2.7,0.007,-0.9


In [485]:
final_mip = pd.concat([final_mip, pct, qt], axis = 1)
final_mip

Unnamed: 0,Player,Year,Pts Won,Pts Max,Share,Pos,Age,Tm,G,GS,...,GS - ch,MP - ch,TS - ch,TRB - ch,AST - ch,STL - ch,BLK - ch,PTS - ch,EFG - ch,FTA - ch
0,Jalen Rose,2000,32.0,121,0.264,SF,27.0,IND,80.0,80.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
1,Austin Croshere,2000,17.0,121,0.140,PF,24.0,IND,81.0,14.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
2,Al Harrington,2000,0.0,0,0.000,PF,19.0,IND,50.0,0.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
3,Chris Mullin,2000,0.0,0,0.000,SG,36.0,IND,47.0,2.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
4,Dale Davis,2000,0.0,0,0.000,C,30.0,IND,74.0,72.0,...,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,Maurice Taylor,2005,0.0,0,0.000,PF-C,28.0,NYK,65.0,16.0,...,6.0,-7.1,-0.032,-1.1,-0.4,-0.2,-0.3,-4.2,-0.021,-0.7
7763,Mike Sweetney,2005,0.0,0,0.000,C,22.0,NYK,77.0,28.0,...,27.0,7.8,0.048,1.7,0.3,0.0,0.1,4.1,0.038,1.7
7764,Stephon Marbury,2005,0.0,0,0.000,PG,27.0,NYK,82.0,82.0,...,1.0,-0.2,0.056,-0.2,-0.8,-0.1,0.0,1.5,0.043,1.3
7765,Tim Thomas,2005,0.0,0,0.000,SF,27.0,NYK,71.0,68.0,...,3.0,-4.4,0.002,-1.5,-0.4,-0.4,-0.1,-2.7,0.007,-0.9


6. Saving Datasets

Finally, we end by simply saving our final datasets into CSV files for future use.


In [487]:
# Save as CSVs
final_mvp.to_csv("data/dfs/finalMVP.csv")
final_dpoy.to_csv("data/dfs/finalDPOY.csv")
final_mip.to_csv("data/dfs/finalMIP.csv")
final_smoy.to_csv("data/dfs/finalSMOY.csv")
allTeam.to_csv("data/dfs/allTeam.csv")
