In [1]:
import os as os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import itertools
from openpyxl import load_workbook
pd.set_option('display.max_rows', 500)
pd.options.display.float_format = '{:.4f}'.format
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, plot_roc_curve, plot_confusion_matrix
import random
from sklearn.metrics import r2_score
pd.set_option('display.max_columns', None)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# TABLE OF CONTENTS


# Stage 1.) Create standardized validation and cleaning functions
        
        1.) Validate that no yearly files are the EXACT same
        2.) Validate that all yearly files being combined have the same column structure
        3.) Validate compilation. Ensure the sum of record counts of yearly results = record count of combined dataframe
        4.) Validate data completeness. Make sure we have data for every year in the range 2004-2023
        5.) Clean player/team names of symbols that denote something in the original source data
        

# Stage 2.) Compile dataframes from yearly source files. Validate and do preliminary cleaning

## 2.1) PLAYER STATS:

#### Player Regular Season Stats Dataframes
* [Data Source 1. BASIC PLAYER STATS IN REGULAR SEASON](#Step1)
        
* [Data Source 2. ADVANCED PLAYER STATS IN REGULAR SEASON](#Step2)
        - Validate player list of advanced stats = player list of basic stats for each year

* [Data Source 3. PLAYER SHOOTING STATS IN REGULAR SEASON](#Step3)
        - Validate player list of shooting stats = player list of basic stats for each year
        
* [Data Source 4. PLAYER PLAY-BY-PLAY STATS IN REGULAR SEASON](#Step4)
        - Validate player list of play-by-play stats = player list of basic stats for each year

#### Player Playoff Stats Dataframes

* [Data Source 5. BASIC PLAYER STATS IN PLAYOFFS](#Step5)

* [Data Source 6. ADVANCED PLAYER STATS IN PLAYOFFS](#Step6)
        - Validate player list of basic stats in playoffs = player list of advanced stats in playoffs for each year
        
* [Data Source 7. PLAYER SHOOTING STATS IN PLAYOFFS](#Step7)
        - Validate player list of shooting stats = player list of basic stats for each year
        
* [Data Source 8. PLAYER PLAY-BY-PLAY STATS IN PLAYOFFS](#Step8)
        - Validate player list of play-by-play stats = player list of basic stats for each year        
        
#### Player Awards (e.g. MVP) Dataframes
* [Data Source 9. AWARD VOTING HISTORY](#Step9)
        - Validate manual extraction accuracy by ensuring award winners actually had stats in the year they won the award
* [Data Source 10. FINALS MVP](#Step10)
        - Validate manual extraction accuracy by ensuring award winners actually had stats in the year they won the award
        
## 2.2) TEAM STATS: 

#### Team Regular Season Stats Dataframes
* [Data Source 11. BASIC TEAM STATS](#Step11)
* [Data Source 12. OPPONENT STATS AGAINST TEAM](#Step12)
* [Data Source 13. ADVANCED TEAM STATS](#Step13)
* [Data Source 14. TEAM SHOOTING STATS](#Step14)
* [Data Source 15. OPPONENT SHOOTING AGAINST TEAM STATS](#Step15)

#### Team Regular Season Standings Dataframe
* [Data Source 16. REGULAR SEASON TEAM STANDINGS](#Step22)

#### Team Playoff Stats Dataframes
* [Data Source 17. BASIC TEAM STATS](#Step16)
* [Data Source 18. OPPONENT STATS AGAINST TEAM](#Step17)
* [Data Source 19. ADVANCED TEAM STATS](#Step18)
* [Data Source 20. TEAM SHOOTING STATS](#Step19)
* [Data Source 21. OPPONENT SHOOTING AGAINST TEAM STATS](#Step20)

#### Team Playoff Standings Dataframe
* [Data Source 22. PLAYOFFS TEAM STANDINGS](#Step21)
  
# Stage 3.) Extensively clean and validate all dataframes as prep for joins
* [Step 1. Standardize name of "Player Unique ID" column for all player-related dataframes](#Step23)
        - This is necessary because the column was named one of "-9999", "Player-additional", or "-additional" depending on the source table
* [Step 2. Standardize name of "Team Name" column for all player-related dataframes](#Step24)
        - This is necessary because the name of the columns were one of "Tm" or "Team"
* [Step 3. Address duplicate records for regular season stats where players got traded mid season](#Step25)
        - For players that got traded mid-season, keep team value as the team they were traded to last before drop
        - Drop duplicate records for players that got traded
        - After drop, validate only one player record exists in each regular season player-related dataframe
        - After drop, validate all regular season player stat dataframes are the same length
* [Step 4. Validate we don't have the same issue for playoff stats for players](#Step26) 
        - Validate that there aren't multiple player records in playoffs (players can't be traded, so shouldn't)
        - Validate all playoff player stat dataframes are the same length as expected
* [Step 5. Normalize team names across years where teams change location or name](#Step27) 
        - Examples include Charlotte Bobcats -> Charlotte Hornets. Or Seattle SuperSonics to OKC Thunder
        
# Stage 4.) Create keys for table joins across all dataframes

* [Key creation 1. Create Player-unique-id-year-concat to enable joins between player tables](#Step28)
        - Example of a key for a Giannis record in 2019 would be "antetgi01-2019"
* [Key creation 2. Create Team-name-year-concat to enable joins between all tables](#Step29) 
        - Example of a key for a Toronto Raptors record in 2019 would be "Toronto Raptors-2019"
        
# Stage 5.) Change each dataframe's column names to be both more understandable and unique between tables once merged 
* [Player Related Stats. Change column names](#Step30)
        - Include table name in column name so that when we join tables we can differentiate same named columns 
            - For example, there may be 3P% across multiple tables. Change 3P% to "Regular season player 3 Point %"
* [Team Related Stats. Change column names](#Step31)
        - Include table name in column name so that when we join tables we can differentiate same named columns 
            - For example, there may be team 3P% across multiple tables. Change 3P% to "Team Playoff 3 Point %"
            
# Stage 6.) Do some feature engineering to aid machine learning in Stage 8+ and refine dataframes further before merge
* [Create indicators for each type of qualified* player stat](#Step99)
        - Example: create column to indicate a player has played more than 50 games*
* [Create ranked stats columns for key stats so you have performance relative to peers](#Step99)
        - Example: create column that is the rank of each player's Points Per Game (e.g. top scorer = 1)
* [Encode variables like "Position"](#Step99)
* [Perform final cleaning by excluding redundant columns prior to merging tables](#Step99)
        - Examples of redundant columns: Player Name, Team Name, Year
       
# Stage 7.) Join all REGULAR SEASON player stats, player awards, and team stats tables together into a single, combined dataframe. The machine learning models will train on this combined dataframe. 


## PLAYER STATS MERGED DATAFRAMES

* [Merged Table 1. MERGE ALL REGULAR SEASON PLAYER STATS. Join the following tables together:](#Step32)
        - Basic Player Stats (Regular Season)
        - Advanced Player Stats (Regular Season)
        - Player Shooting Stats (Regular Season)
        - Player Play by Play Stats (Regular Season)
        
* [Merged Table 2. MERGE ALL REGULAR SEASON TEAM STATS. Join the following tables together:](#Step33)
        - Basic Team Stats (Regular Season)
        - Basic Opponent Team Stats (Regular Season)
        - Advanced Team Stats (Regular Season)
        - Team Shooting Stats (Regular Season)
        - Team Opponent Shooting Stats (Regular Season)
        - Team Regular Season Record
        
## Remove 2024 data and store. This data will be used separately for predicting, not training

# Stage 8.) Create machine learning models to predict vote share % for each end of season player award
        
        Standard Procedures:
        - 1.) Merge player stats, team stats, and award stat table to have final data set
        - 2.) Split data into training and testing data sets
        - 3.) Evaluate model predictive power
        - 4.) Identify which features were most important to model
        - 5.) Try to simplify/refine model by filtering down to subset of features 
        - 6.) Use final model to populate and store the model-predicted vote % for every player-season
        - 7.) Use final model to predict 2024 award winner



* [MVP Award Prediction](#Step34)
* [DPOY Award Prediction](#Step35)
* [6MAN Award Prediction](#Step36)
        - Filtered down training/testing data to just 6MAN eligible candidates (>60 games played, <35 games started)
* [ROY Award Prediction](#Step37)
        - Load in rookie data spanning 2003-2024 so that we can flag which player-seasons qualified as rookie seasons
        - Filtered down training/testing data to just rookies        
* [MIP Award Prediction](#Step38)
        - Create delta columns to store changes in stats between seasons to help model understand "improvement"
* [2024 Playoff Prediction](#Step39)
        - The results of this exercise were unfortunately very boring. Complex models just barely edged out purely using regular season team record. Turns out regular season winning is really the only important predictor of postseason winning.
          
# Stage 9.) Compile all dataframes into one
* [Merged Table 3. MERGE ALL PLAYER AWARDS. Join the following tables together:](#Step40)
        - MVP Vote History
        - ROY Vote History
        - All NBA Vote History
        - All Defense History
        - MIP Vote History
        - Six Man Vote History        
        - DPOY Vote History
        - Finals MVP Vote History
* [Merged Table 4. MERGE ALL PLAYOFF PLAYER STATS. Join the following tables together:](#Step41)
        - Basic Player Stats (Playoffs)
        - Advanced Player Stats (Playoffs)
        - Player Shooting Stats (Playoffs)
        - Player Play by Play Stats (Playoffs)
        
* [Merged Table 5. MERGE ALL PLAYOFF TEAM STATS. Join the following tables together:](#Step42)
        - Basic Team Stats (Playoffs)
        - Basic Opponent Team Stats (Playoffs)
        - Advanced Team Stats (Playoffs)
        - Team Shooting Stats (Playoffs)
        - Team Opponent Shooting Stats (Playoffs)    
        
* [Merged Table 6. MERGE ALL TEAM RELATED DATA. Join the following tables together:](#Step43)  
        - ALL REGULAR SEASON TEAM STATS (created above by merging)
        - ALL PLAYOFF TEAM STATS (created above by merging)
        - Team Playoff Record         
        
* [Merged Table 7. MERGE ALL PLAYER RELATED DATA. Join the following tables together:](#Step44)  
        - ALL REGULAR SEASON PLAYER STATS (created above by merging)
        - ALL PLAYOFF PLAYER STATS (created above by merging)
        - All PLAYER AWARDS (created above by merging)
        
* [Merged Table 8. MERGE ALL PLAYER AND TEAM RELATED DATA INTO ONE ULTIMATE DATAFRAME](#Step45)  
        - ALL TEAM RELATED DATA (created above)
        - ALL PLAYER RELATED DATA (created above)
        
# Stage 10.) Export final dataframes of interest

* [Special combined team stats containing league average data](#Step46)  
* [ULTIMATE FINAL COMBINED DATAFRAME](#Step47)  
               

# Create list of validation functions

In [2]:
def create_list_of_dataframes_for_yearly_stats(folder_path,header_row_value):
    """Load every yearly CSV file in ``folder_path`` into its own dataframe.

    Each file name is expected to start with the season year followed by a
    space (e.g. ``"2004 basic player stats.csv"``); that year is stored in a
    new ``Year`` column of the loaded dataframe.

    Side effects (module-level globals that later cells read):
      * one global per file, named ``_<file stem lower-cased, spaces -> _>_df``
      * ``master_df_list``: list of all loaded dataframes, in sorted file-name order
      * ``cumulative_row_counter``: total number of rows across all loaded files

    Parameters
    ----------
    folder_path : str
        Directory containing the yearly CSV files.
    header_row_value : int
        Passed to ``pd.read_csv(header=...)``; the row holding the column names.

    Returns
    -------
    pandas.DataFrame
        The first row of the first loaded dataframe, as a quick visual check.

    Raises
    ------
    Exception
        If the folder contains no data files.
    """
    master_df_list = []
    cumulative_row_counter = 0

    # os.listdir returns entries in arbitrary order; sort so that the list
    # positions (and the preview returned below) are reproducible between runs.
    for file_name in sorted(os.listdir(folder_path)):

        # Skip the Windows folder-settings file that is not a data file
        if file_name == 'desktop.ini':
            continue

        # Season value is the leading token of the file name
        year = file_name.split(' ')[0]

        # Name under which the yearly dataframe is stored globally
        dataframe_name = f'_{file_name.split(".")[0].lower().replace(" ","_")}_df'

        # os.path.join keeps the path valid on every operating system
        yearly_df = pd.read_csv(os.path.join(folder_path, file_name),header=header_row_value)
        yearly_df['Year'] = int(year)

        globals()[dataframe_name] = yearly_df
        master_df_list.append(yearly_df)
        cumulative_row_counter += len(yearly_df)

    # Publish results first so stale values from a previous data source never linger
    globals()['master_df_list'] = master_df_list
    globals()['cumulative_row_counter'] = cumulative_row_counter

    if not master_df_list:
        raise Exception(f'No yearly data files found in {folder_path}')

    return master_df_list[0].head(1)
    

In [3]:
def check_all_unique_dataframes(dfs,column_to_ignore):
    """Raise if any two dataframes in ``dfs`` hold exactly the same data.

    ``column_to_ignore`` (e.g. ``'Year'``) is dropped from every dataframe
    before comparing, so two files with identical stats but different year
    labels are still reported as duplicates.

    Raises
    ------
    Exception
        Listing the ``(i, j)`` index pairs of identical dataframes.
    """
    # Copies of each dataframe without the ignored column
    trimmed_frames = [frame.drop(column_to_ignore,axis=1) for frame in dfs]

    # Every unordered pair (i < j) whose contents are equal
    matching_pairs = [
        (left, right)
        for left, right in itertools.combinations(range(len(trimmed_frames)), 2)
        if trimmed_frames[left].equals(trimmed_frames[right])
    ]

    if matching_pairs:
        raise Exception(f'MATCHING DATAFRAMES FOUND - {matching_pairs}. This is likely the result of accidentally copying the wrong years data while creating yearly data files')

In [4]:
def validate_column_structure_is_same(list_of_dataframes):
    """Raise if any dataframe's columns differ from those of the first dataframe.

    Column names and their order must both match.

    Raises
    ------
    Exception
        Naming the list position of the first mismatching dataframe together
        with the columns that are missing from it or unexpected in it.
    """
    expected_columns = list(list_of_dataframes[0].columns)

    for position, dataframe in enumerate(list_of_dataframes):
        actual_columns = list(dataframe.columns)

        if actual_columns != expected_columns:
            # Report the difference rather than dumping the whole dataframe
            missing_columns = [col for col in expected_columns if col not in actual_columns]
            unexpected_columns = [col for col in actual_columns if col not in expected_columns]
            raise Exception(
                f"Dataframe at position {position} has columns that don't match the first dataframe "
                f"(missing: {missing_columns}, unexpected: {unexpected_columns}; "
                "if both are empty the column order differs). "
                "Review files to ensure they're all the same structure and that a file is not misplaced"
            )

In [5]:
def concat_dataframes_and_validate_and_reset_index_and_sort(desired_name_of_output_dataframe,list_of_dataframes_to_concat,length_value_to_use_to_validate):
    """Combine yearly dataframes and store the result as a module-level global.

    The combined dataframe is stored under ``desired_name_of_output_dataframe``,
    its row count is checked against ``length_value_to_use_to_validate``, and it
    is then sorted by ``Year`` (newest first) with a fresh 0..n-1 index.

    Raises
    ------
    Exception
        If the combined row count differs from the expected value. The
        unsorted combined dataframe is already stored globally at that point.
    """
    namespace = globals()

    stacked = pd.concat(list_of_dataframes_to_concat)
    namespace[desired_name_of_output_dataframe] = stacked

    # Row count of the combined dataframe must equal the sum of the yearly row counts
    if len(stacked) != length_value_to_use_to_validate:
        raise Exception('Record count of master dataframe does not match cumulative count of component dataframes')

    # Newest season first, then renumber rows
    ordered = stacked.sort_values('Year',ascending=False)
    namespace[desired_name_of_output_dataframe] = ordered
    ordered.reset_index(drop=True,inplace=True)

In [6]:
def validate_completeness_of_data(dataframe,beginning_year,ending_year_plus_one):
    """Raise if any year in ``[beginning_year, ending_year_plus_one)`` has no rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``Year`` column.
    beginning_year : int
        First year that must be present (inclusive).
    ending_year_plus_one : int
        One past the last year that must be present (exclusive, like ``range``).

    Raises
    ------
    Exception
        Listing, in ascending order, the years with no data.
    """
    years_present = set(dataframe['Year'])

    # Honour the requested range instead of a hard-coded 2004-2023 window
    years_required = set(range(beginning_year, ending_year_plus_one))

    years_not_in_dataframe = sorted(years_required - years_present)

    if years_not_in_dataframe:
        raise Exception(f'Missing data for the following years: {years_not_in_dataframe}')

In [7]:
def clean_column_values(dataframe,column_to_clean):
    """Strip ``*`` and ``.`` characters from a string column, in place.

    The column ``column_to_clean`` of ``dataframe`` is overwritten with the
    cleaned values; nothing is returned.
    """
    # regex=False makes the literal match explicit: "*" and "." are regex
    # metacharacters, and the default of ``regex`` differs between pandas versions.
    dataframe[column_to_clean] = (
        dataframe[column_to_clean]
        .str.replace("*", '', regex=False)
        .str.replace(".", '', regex=False)
    )

In [8]:
def validate_dataframes_using_column_values(dataframe_to_compare,column_to_compare,dataframe_to_compare_against,column_to_compare_against):
    """Check, year by year, that one dataframe's values all appear in another.

    For every year present in ``dataframe_to_compare``, each value of
    ``column_to_compare`` must also occur in ``column_to_compare_against`` of
    ``dataframe_to_compare_against`` for that same year. Both dataframes must
    contain a ``Year`` column.

    Raises
    ------
    Exception
        For the first year in which at least one value has no match, listing
        the unmatched values.
    """
    for year in dataframe_to_compare['Year'].unique():

        # Filter into fresh local names: overwriting the input dataframes here
        # would leave every later year comparing two empty selections.
        rows_to_check = dataframe_to_compare.loc[dataframe_to_compare['Year']==year]
        values_to_check = set(rows_to_check[column_to_compare])

        reference_rows = dataframe_to_compare_against.loc[dataframe_to_compare_against['Year']==year]
        reference_values = set(reference_rows[column_to_compare_against])

        # Values present in the checked dataframe but absent from the reference
        unique_values_not_found = list(values_to_check - reference_values)

        if len(unique_values_not_found)>0:
            raise Exception(f'For {year}, {unique_values_not_found} not found in dataframe used to compare against. Likely issue of loading the wrong years data.')


# Stage 2.) Compile PLAYER related dataframes from yearly source files

## Regular Season Player Stats

### BASIC PLAYER STATS IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step1"></a>

#### Create dataframes of yearly data

In [9]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Regular season','NBA Basic Player Stats'),0)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional,Year
0,1,Shareef Abdur-Rahim,PF,27,TOT,85,56,31.6,5.9,12.4,0.475,0.1,0.4,0.265,5.8,12.0,0.482,0.48,4.4,5.0,0.869,2.2,5.3,7.5,2.0,0.8,0.4,2.2,2.6,16.3,abdursh01,2004


###### Validate that all of the yearly data files contain different data

In [10]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [11]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [12]:
concat_dataframes_and_validate_and_reset_index_and_sort('basic_player_stats_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [13]:
validate_completeness_of_data(basic_player_stats_regular_season_df,2004,2024)

###### Clean player names

In [14]:
clean_column_values(basic_player_stats_regular_season_df,'Player')

  dataframe[column_to_clean] = dataframe[column_to_clean].str.replace("*",'')
  dataframe[column_to_clean] = dataframe[column_to_clean].str.replace(".",'')


###### Validate against dataframe

In [15]:
# N/A

In [16]:
# Export basic player stats
# basic_player_stats_regular_season_df.to_excel('Basic Player Stats.xlsx')

### ADVANCED PLAYER STATS IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step2"></a>

#### Create dataframes of yearly data

In [17]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Regular season','NBA Advanced Player Stats'),0)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,Player-additional,Year
0,1,Shareef Abdur-Rahim,PF,27,TOT,85,2684,19.9,0.557,0.032,0.407,8.2,19.0,13.6,12.3,1.4,1.0,12.9,24.4,,6.1,2.0,8.1,0.145,,2.3,-0.7,1.6,2.4,abdursh01,2004


###### Validate that all of the yearly data files contain different data

In [18]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [19]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [20]:
concat_dataframes_and_validate_and_reset_index_and_sort('advanced_player_stats_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [21]:
validate_completeness_of_data(advanced_player_stats_in_regular_season_df,2004,2024)

###### Clean player names

In [22]:
clean_column_values(advanced_player_stats_in_regular_season_df,'Player')

  dataframe[column_to_clean] = dataframe[column_to_clean].str.replace("*",'')
  dataframe[column_to_clean] = dataframe[column_to_clean].str.replace(".",'')


###### Validate against dataframe

In [23]:
validate_dataframes_using_column_values(advanced_player_stats_in_regular_season_df,'Player-additional',basic_player_stats_regular_season_df,'Player-additional')

### PLAYER SHOOTING IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step3"></a>

#### Create dataframes of yearly data

In [24]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Regular season','NBA Player Shooting Stats'),1)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,FG%,Dist.,Unnamed: 9,2P,0-3,3-10,10-16,16-3P,3P,Unnamed: 16,2P.1,0-3.1,3-10.1,10-16.1,16-3P.1,3P.1,Unnamed: 23,2P.2,3P.2,Unnamed: 26,%FGA,#,Unnamed: 29,%3PA,3P%,Unnamed: 32,Att.,#.1,-9999,Year
0,1,Shareef Abdur-Rahim,PF,27,TOT,85,2684,0.475,7.8,,0.968,0.357,0.319,0.116,0.176,0.032,,0.482,0.585,0.426,0.426,0.414,0.265,,0.539,0.889,,0.069,71,,0.206,0.429,,0,0,abdursh01,2004


###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_shooting_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_shooting_in_regular_season_df,2004,2024)

###### Clean player names

In [None]:
# Clean the shooting dataframe built in this section (not the advanced stats one)
clean_column_values(player_shooting_in_regular_season_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(player_shooting_in_regular_season_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### PLAYER PLAY-BY-PLAY STATS IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step4"></a>

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Regular season','NBA Player Play by Play Stats'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_play_by_play_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_play_by_play_in_regular_season_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(player_play_by_play_in_regular_season_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(player_play_by_play_in_regular_season_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

## Player Playoff Stats

### BASIC PLAYER STATS IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step5"></a>

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Playoffs','NBA Basic Player Stats in Playoffs'),0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_basic_playoff_stats_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_basic_playoff_stats_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(player_basic_playoff_stats_df,'Player')

###### Validate against dataframe

In [None]:
# N/A

### ADVANCED PLAYER STATS IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step6"></a>

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Playoffs','NBA Advanced Player Stats in Playoffs'),0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_advanced_stats_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_advanced_stats_in_playoffs_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(player_advanced_stats_in_playoffs_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(player_advanced_stats_in_playoffs_df,'Player-additional',player_basic_playoff_stats_df,'Player-additional')

### PLAYER SHOOTING STATS IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step7"></a>

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Playoffs','NBA Player Shooting'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_shooting_stats_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_shooting_stats_in_playoffs_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(player_shooting_stats_in_playoffs_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(player_shooting_stats_in_playoffs_df,'-9999',player_basic_playoff_stats_df,'Player-additional')

### PLAYER PLAY-BY-PLAY STATS IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step8"></a>

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'Player Stats','Playoffs','NBA Player Play by Play'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('player_play_by_play_stats_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(player_play_by_play_stats_in_playoffs_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(player_play_by_play_stats_in_playoffs_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(player_play_by_play_stats_in_playoffs_df,'-9999',player_basic_playoff_stats_df,'Player-additional')

## AWARD VOTING HISTORY. For each award, compile yearly data into one dataframe
 <a class="anchor" id="Step9"></a>

### MVP. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'NBA Player Awards','MVP'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('mvp_vote_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(mvp_vote_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(mvp_vote_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(mvp_vote_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### ROY. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'NBA Player Awards','ROY'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('roy_vote_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(roy_vote_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(roy_vote_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(roy_vote_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### ALL NBA. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
# Build the folder path with os.path.join so it is valid on any operating system
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(),'NBA Player Awards','ALL NBA'),1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('all_nba_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(all_nba_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(all_nba_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(all_nba_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### ALL DEFENSE. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\NBA Player Awards\\ALL Defense',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('all_defense_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(all_defense_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(all_defense_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(all_defense_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### MIP. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\NBA Player Awards\\MIP',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('mip_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(mip_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(mip_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(mip_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### 6th Man. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\NBA Player Awards\\6MAN',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('six_man_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(six_man_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(six_man_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(six_man_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

### DPOY. Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\NBA Player Awards\\DPOY',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('dpoy_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(dpoy_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(dpoy_df,'Player')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(dpoy_df,'-9999',basic_player_stats_regular_season_df,'Player-additional')

## LOAD IN FINALS MVP AND STATS

In [None]:
# Load the Finals MVP results/stats export. header=1 makes the second row of the file
# the column-header row. os.path.join keeps the path valid on any operating system.
finals_mvp = pd.read_csv(os.path.join(os.getcwd(), 'NBA Player Awards', 'Finals MVP Results and Stats.txt'), header=1)

# Convert season to year
finals_mvp.rename({'Season':'Year'},axis=1,inplace=True)


def _season_label_to_end_year(season_label):
    """Return the calendar year in which a season such as '2022-23' ended.

    A two-digit suffix is resolved against the century of the starting year, so
    '1998-99' -> 1999 and '1999-00' -> 2000 (previously every two-digit suffix
    was forced into the 2000s). A suffix of any other length is used as given,
    e.g. '1999-2000' -> 2000.
    """
    season_parts = season_label.split('-')
    start_text, end_text = season_parts[0], season_parts[1]

    if len(end_text) != 2:
        return int(end_text)

    start_year = int(start_text)
    end_year = start_year - (start_year % 100) + int(end_text)

    # The suffix wrapped into the next century (e.g. 1999 -> '00')
    if end_year < start_year:
        end_year = end_year + 100

    return end_year


finals_mvp['Year'] = [_season_label_to_end_year(season_label) for season_label in finals_mvp['Year']]

# Filter on just years of interest
list_of_years = list(range(2003,2024))
finals_mvp['Year'] = finals_mvp['Year'].astype(int)

# .copy() so later column assignments act on an independent frame rather than a slice
finals_mvp = finals_mvp.loc[finals_mvp['Year'].isin(list_of_years)].copy()

finals_mvp.head(30)

# Stage 2.) Compile TEAM related dataframes from yearly source files   

## Regular Season Team Stats

### BASIC TEAM STATS IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step11"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Regular Season\\NBA Basic Team (TEAM) Stats',0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_basic_stat_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_basic_stat_in_regular_season_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_basic_stat_in_regular_season_df,'Team')

### OPPONENT BASIC STATS AGAINST TEAM IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step12"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Regular Season\\NBA Basic Team (OPPONENT) Stats',0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_basic_opponent_stat_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_basic_opponent_stat_in_regular_season_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_basic_opponent_stat_in_regular_season_df,'Team')

### ADVANCED TEAM STATS IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step13"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Regular Season\\NBA Advanced Team Stats',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_advanced_team_stats',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_advanced_team_stats,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_advanced_team_stats,'Team')

### TEAM SHOOTING IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step14"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Regular Season\\NBA Shooting Team (TEAM) Stats',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_shooting_stat_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_shooting_stat_in_regular_season_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_shooting_stat_in_regular_season_df,'Team')

### OPPONENT SHOOTING AGAINST TEAM IN REGULAR SEASON. Compile yearly data into one dataframe
 <a class="anchor" id="Step15"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Regular Season\\NBA Shooting Team (OPPONENT) Stats',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_opponent_shooting_stat_in_regular_season_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_opponent_shooting_stat_in_regular_season_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_opponent_shooting_stat_in_regular_season_df,'Team')

### Regular Season Team Standings
 <a class="anchor" id="Step22"></a>

#### Create dataframes of yearly data. Remove columns causing load issues (Division data and Month Data) that we're not interested in anyway

In [None]:
# Folder holding one regular-season standings file per year.
# os.path.join keeps the path valid on any operating system.
folder_path = os.path.join(os.getcwd(), 'NBA Standings')

# Shared accumulators consumed by the validation / concat cells that follow
master_df_list = []

cumulative_row_counter = 0

## Columns causing load issues that we don't need anyway (division / month data) ##
list_to_remove = ['M','E','W','A','C','SE','NW','P','SW','Oct','Nov','Dec','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep']

# sorted() makes the processing order (and therefore master_df_list[0]) deterministic;
# os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):

    # Skip the Windows folder-settings file
    if file_name == 'desktop.ini':

        continue

    # Assign season value: the file name is expected to start with the year, followed by a space
    year = file_name.split(' ')[0]

    # Create dataframe name
    dataframe_name = f'_{file_name.split(".")[0].lower().replace(" ","_")}_df'

    # header=1: the second row of the file holds the column names
    yearly_df = pd.read_csv(os.path.join(folder_path, file_name), header=1)

    # Add a Year Column and assign to year (int() raises if the file name has no leading year)
    yearly_df['Year'] = int(year)

    # Drop whichever of the unwanted columns this particular file contains
    yearly_df = yearly_df.drop(columns=[column for column in list_to_remove if column in yearly_df.columns])

    # Store dataframe globally with specified name
    globals()[dataframe_name] = yearly_df

    # Append to master_df_list
    master_df_list.append(yearly_df)

    # Cumulative_row_counter
    cumulative_row_counter = cumulative_row_counter + len(yearly_df)

master_df_list[0].head(1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('regular_season_standing_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(regular_season_standing_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(regular_season_standing_df,'Team')

## Team Playoff Stats

### BASIC TEAM STATS IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step16"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Playoffs\\Team Basic Stats',0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_basic_stat_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_basic_stat_in_playoffs_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_basic_stat_in_playoffs_df,'Tm')

### OPPONENT BASIC STATS AGAINST TEAM IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step17"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Playoffs\\Basic Opponent Stats Against Team',0)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_basic_opponent_stat_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_basic_opponent_stat_in_playoffs_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_basic_opponent_stat_in_playoffs_df,'Tm')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(nba_team_basic_opponent_stat_in_playoffs_df,'Tm',nba_team_basic_stat_in_playoffs_df,'Tm')

### ADVANCED TEAM STATS IN PLAYOFFS. Compile yearly data into one dataframe

 <a class="anchor" id="Step18"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Playoffs\\Advanced',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_advanced_stat_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_advanced_stat_in_playoffs_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_advanced_stat_in_playoffs_df,'Tm')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(nba_team_advanced_stat_in_playoffs_df,'Tm',nba_team_basic_stat_in_playoffs_df,'Tm')

### TEAM SHOOTING IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step19"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Playoffs\\Team Shooting',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_shooting_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_shooting_in_playoffs_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_shooting_in_playoffs_df,'Tm')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(nba_team_shooting_in_playoffs_df,'Tm',nba_team_basic_stat_in_playoffs_df,'Tm')

### OPPONENT SHOOTING AGAINST TEAM IN PLAYOFFS. Compile yearly data into one dataframe
 <a class="anchor" id="Step20"></a>

#### Create dataframes of yearly data

In [None]:
create_list_of_dataframes_for_yearly_stats(f'{os.getcwd()}\\Team Stats\\Playoffs\\Opponent Shooting Stats Against Team',1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_opponent_shooting_stat_in_playoffs_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_opponent_shooting_stat_in_playoffs_df,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_opponent_shooting_stat_in_playoffs_df,'Tm')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(nba_team_opponent_shooting_stat_in_playoffs_df,'Tm',nba_team_basic_stat_in_playoffs_df,'Tm')

### TEAM PLAYOFF STANDINGS. Compile yearly data into one dataframe
 <a class="anchor" id="Step21"></a>

#### Create dataframes of yearly data

In [None]:
# Folder holding one playoff standings file per year.
# os.path.join keeps the path valid on any operating system.
folder_path = os.path.join(os.getcwd(), 'NBA Playoff Results', 'Playoff Standings')

# Shared accumulators consumed by the validation / concat cells that follow
master_df_list = []

cumulative_row_counter = 0

# Columns to remove from every yearly dataframe when present
list_of_columns_to_drop = ['A','C','M','P','SW','SE','NW','Apr','May','Jun','Jul','Aug','Sep','Oct','≤3','≥10']

# sorted() makes the processing order (and therefore master_df_list[0]) deterministic;
# os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(folder_path)):

    # Skip the Windows folder-settings file
    if file_name == 'desktop.ini':

        continue

    # Assign season value: the file name is expected to start with the year, followed by a space
    year = file_name.split(' ')[0]

    # Create dataframe name
    dataframe_name = f'_{file_name.split(".")[0].lower().replace(" ","_")}_df'

    # header=1: the second row of the file holds the column names
    yearly_df = pd.read_csv(os.path.join(folder_path, file_name), header=1)

    # Add a Year Column and assign to year (int() raises if the file name has no leading year)
    yearly_df['Year'] = int(year)

    # Drop whichever of the unwanted columns this particular file contains
    yearly_df = yearly_df.drop(columns=[column for column in list_of_columns_to_drop if column in yearly_df.columns])

    # Store dataframe globally with specified name
    globals()[dataframe_name] = yearly_df

    # Append to master_df_list
    master_df_list.append(yearly_df)

    # Cumulative_row_counter
    cumulative_row_counter = cumulative_row_counter + len(yearly_df)


master_df_list[0].head(1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('nba_team_playoff_standing',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(nba_team_playoff_standing,2004,2024)

###### Clean team names

In [None]:
clean_column_values(nba_team_playoff_standing,'Team')

###### Validate against dataframe

In [None]:
validate_dataframes_using_column_values(nba_team_playoff_standing,'Team',nba_team_basic_stat_in_playoffs_df,'Tm')

# Extensively Cleaning and Validating Dataframes as Preparation for Merge

## Create standardized "Player Unique ID" column for regular season stats, playoff season stats, and all awards. There are a few different names for this field
 <a class="anchor" id="Step23"></a>

In [None]:
# Player-level dataframes whose player-ID column gets the standard name 'Player Unique ID'.
# The order here MUST match the unpacking assignment at the bottom of this cell.
list_of_dataframes_to_rename_column = [basic_player_stats_regular_season_df,
                                       advanced_player_stats_in_regular_season_df,
                                       player_shooting_in_regular_season_df,
                                       player_play_by_play_in_regular_season_df,
                                       player_basic_playoff_stats_df,
                                       player_advanced_stats_in_playoffs_df,
                                       player_shooting_stats_in_playoffs_df,
                                       player_play_by_play_stats_in_playoffs_df,
                                       mvp_vote_df,
                                       roy_vote_df,
                                       all_nba_df,
                                       all_defense_df,
                                       mip_df,
                                       six_man_df,
                                       dpoy_df,
                                       finals_mvp]

# Column names under which the player ID has been observed in the source files
list_of_player_name_columns_observed = ['Player-additional',
                                        '-additional',
                                        '-9999']

for i, dataframe_to_rename_column in enumerate(list_of_dataframes_to_rename_column):

    # Which of the observed names does this dataframe carry?
    matching_column_names = [observed_name for observed_name in list_of_player_name_columns_observed
                             if observed_name in dataframe_to_rename_column.columns]

    if len(matching_column_names) == 0:

        # Already standardized (e.g. this cell was re-run): nothing left to do
        if 'Player Unique ID' in dataframe_to_rename_column.columns:
            continue

        # DataFrame.rename silently ignores unknown keys, so fail loudly here instead of
        # leaving a dataframe without the standardized column.
        raise ValueError(f'Dataframe at position {i} in list_of_dataframes_to_rename_column has none of '
                         f'the expected player ID columns {list_of_player_name_columns_observed}')

    # When several observed names are present, the last one in the list is renamed
    # (same precedence as the original lookup loop).
    column_name_to_change = matching_column_names[-1]

    # Rename it and assign the renamed copy back to the list
    list_of_dataframes_to_rename_column[i] = dataframe_to_rename_column.rename({column_name_to_change:'Player Unique ID'},axis=1)

# Assign changes from list to actual dataframes
(basic_player_stats_regular_season_df,
 advanced_player_stats_in_regular_season_df,
 player_shooting_in_regular_season_df,
 player_play_by_play_in_regular_season_df,
 player_basic_playoff_stats_df,
 player_advanced_stats_in_playoffs_df,
 player_shooting_stats_in_playoffs_df,
 player_play_by_play_stats_in_playoffs_df,
 mvp_vote_df,
 roy_vote_df,
 all_nba_df,
 all_defense_df,
 mip_df,
 six_man_df,
 dpoy_df,
 finals_mvp) = list_of_dataframes_to_rename_column

### Create standardized "Team Name" column for ALL dataframes. There are team column names of "Tm" and "Team". Convert "Tm" to "Team" for all dataframes with "Tm"
 <a class="anchor" id="Step24"></a>

In [None]:
# Every dataframe that carries a team column. The order here MUST match the unpacking
# assignment at the bottom of this cell.
list_of_all_dataframes = [basic_player_stats_regular_season_df,
                           advanced_player_stats_in_regular_season_df,
                           player_shooting_in_regular_season_df,
                           player_play_by_play_in_regular_season_df,
                           player_basic_playoff_stats_df,
                           player_advanced_stats_in_playoffs_df,
                           player_shooting_stats_in_playoffs_df,
                           player_play_by_play_stats_in_playoffs_df,
                           mvp_vote_df,
                           roy_vote_df,
                           all_nba_df,
                           all_defense_df,
                           mip_df,
                           six_man_df,
                           dpoy_df,
                           finals_mvp,
                          nba_team_basic_stat_in_regular_season_df,
                          nba_team_basic_opponent_stat_in_regular_season_df,
                          nba_advanced_team_stats,
                          nba_team_shooting_stat_in_regular_season_df,
                          nba_team_opponent_shooting_stat_in_regular_season_df,
                          nba_team_basic_stat_in_playoffs_df,
                          nba_team_basic_opponent_stat_in_playoffs_df,
                          nba_team_advanced_stat_in_playoffs_df,
                          nba_team_shooting_in_playoffs_df,
                          nba_team_opponent_shooting_stat_in_playoffs_df,
                          nba_team_playoff_standing,
                          regular_season_standing_df
                         ]

# Team-name column spellings observed in the source files.
# (The variable name is kept from the player-ID cell above; here it holds TEAM column names.)
list_of_player_name_columns_observed = ['Tm','Team']

for i, dataframe_to_rename_column in enumerate(list_of_all_dataframes):

    # Which of the observed team column names does this dataframe carry?
    team_columns_found = [observed_name for observed_name in list_of_player_name_columns_observed
                          if observed_name in dataframe_to_rename_column.columns]

    # Exactly one of 'Tm' / 'Team' must exist. Stopping quietly here would leave every
    # later dataframe in the list un-renamed, so raise instead of printing and breaking.
    if len(team_columns_found) != 1:
        raise ValueError(f'Dataframe at position {i} in list_of_all_dataframes has '
                         f'{len(team_columns_found)} team columns {team_columns_found}; '
                         f'expected exactly one of {list_of_player_name_columns_observed}')

    column_name_to_change = team_columns_found[0]

    # Rename it ('Team' -> 'Team' is a no-op) and assign the result back to the list
    list_of_all_dataframes[i] = dataframe_to_rename_column.rename({column_name_to_change:'Team'},axis=1)

# Assign changes from list to actual dataframes
(basic_player_stats_regular_season_df,
 advanced_player_stats_in_regular_season_df,
 player_shooting_in_regular_season_df,
 player_play_by_play_in_regular_season_df,
 player_basic_playoff_stats_df,
 player_advanced_stats_in_playoffs_df,
 player_shooting_stats_in_playoffs_df,
 player_play_by_play_stats_in_playoffs_df,
 mvp_vote_df,
 roy_vote_df,
 all_nba_df,
 all_defense_df,
 mip_df,
 six_man_df,
 dpoy_df,
 finals_mvp,
 nba_team_basic_stat_in_regular_season_df,
 nba_team_basic_opponent_stat_in_regular_season_df,
 nba_advanced_team_stats,
 nba_team_shooting_stat_in_regular_season_df,
 nba_team_opponent_shooting_stat_in_regular_season_df,
 nba_team_basic_stat_in_playoffs_df,
 nba_team_basic_opponent_stat_in_playoffs_df,
 nba_team_advanced_stat_in_playoffs_df,
 nba_team_shooting_in_playoffs_df,
 nba_team_opponent_shooting_stat_in_playoffs_df,
 nba_team_playoff_standing,
 regular_season_standing_df) = list_of_all_dataframes

### Before dropping duplicate player names, use multiple records for players that are traded to determine and populate the team name column with the team they got most recently traded to

 <a class="anchor" id="Step25"></a>

In [None]:
# Every dataframe that receives the refined team-name column, in the same order as the
# unpacking assignment at the bottom of this cell.
list_of_all_dataframes = [
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
    nba_team_basic_stat_in_regular_season_df,
    nba_team_basic_opponent_stat_in_regular_season_df,
    nba_advanced_team_stats,
    nba_team_shooting_stat_in_regular_season_df,
    nba_team_opponent_shooting_stat_in_regular_season_df,
    nba_team_basic_stat_in_playoffs_df,
    nba_team_basic_opponent_stat_in_playoffs_df,
    nba_team_advanced_stat_in_playoffs_df,
    nba_team_shooting_in_playoffs_df,
    nba_team_opponent_shooting_stat_in_playoffs_df,
    nba_team_playoff_standing,
    regular_season_standing_df,
]

for position in range(len(list_of_all_dataframes)):

    # Progress output: index of the dataframe being processed
    print(position)

    current_frame = list_of_all_dataframes[position]

    # Seed the refined column with the plain team value; a later cell overwrites it
    # for players that have a 'TOT' row.
    current_frame['Team Name Refined - accounts for players being traded'] = current_frame['Team']

    # Store the frame back in its slot
    list_of_all_dataframes[position] = current_frame

# Push the list entries back onto the individual dataframe variables
(
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
    nba_team_basic_stat_in_regular_season_df,
    nba_team_basic_opponent_stat_in_regular_season_df,
    nba_advanced_team_stats,
    nba_team_shooting_stat_in_regular_season_df,
    nba_team_opponent_shooting_stat_in_regular_season_df,
    nba_team_basic_stat_in_playoffs_df,
    nba_team_basic_opponent_stat_in_playoffs_df,
    nba_team_advanced_stat_in_playoffs_df,
    nba_team_shooting_in_playoffs_df,
    nba_team_opponent_shooting_stat_in_playoffs_df,
    nba_team_playoff_standing,
    regular_season_standing_df,
) = list_of_all_dataframes

In [None]:
# Regular-season player dataframes in which traded players have a combined 'TOT' row plus
# one row per team. The order MUST match the unpacking assignment at the bottom of this cell.
list_of_dataframes_to_address = [basic_player_stats_regular_season_df,
                               advanced_player_stats_in_regular_season_df,
                               player_shooting_in_regular_season_df,
                               player_play_by_play_in_regular_season_df]

for i, dataframe_to_address in enumerate(list_of_dataframes_to_address):

    # For each year, default in dynamic Team value for players that changed teams
    for year in list(dataframe_to_address['Year'].unique()):

        year_mask = dataframe_to_address['Year']==year

        # Grab list of players that changed teams (they have a Team == 'TOT' row that year)
        temporary_df = dataframe_to_address.loc[year_mask&(dataframe_to_address['Team']=='TOT')]
        temporary_list_of_players = list(temporary_df['Player Unique ID'].unique())

        # For each player that switched team, default all records for player with the team of the first record that is not Team = TOT
        for player in temporary_list_of_players:

            player_year_mask = year_mask&(dataframe_to_address['Player Unique ID']==player)

            temp_df = dataframe_to_address.loc[player_year_mask&(dataframe_to_address['Team']!='TOT')]

            # No per-team row to take a team from: report it and move on. The 'TOT' value
            # then stays in the refined column and is caught by the validation below,
            # instead of crashing here on an empty lookup.
            if len(temp_df)==0:
                print(f'{year}-{player}-{i}')
                continue

            # NOTE(review): this is the FIRST non-TOT row in the dataframe's current order;
            # whether that is the team the player was most recently traded to depends on
            # how the rows are sorted upstream - confirm.
            team_value_to_populate = temp_df['Team'].iloc[0]

            dataframe_to_address.loc[player_year_mask,'Team Name Refined - accounts for players being traded'] = team_value_to_populate

    # Validate that all 'TOT' values have been overwritten in the refined column.
    # (The previous check compared 'Team' against the column NAME, so it could never fail.)
    if (dataframe_to_address['Team Name Refined - accounts for players being traded']=='TOT').any():
        raise Exception(f'For dataframe in {i} position in list, TOT value exist')

    # Assign to list
    list_of_dataframes_to_address[i] = dataframe_to_address

# Assign changes from list to actual dataframes
(basic_player_stats_regular_season_df,
 advanced_player_stats_in_regular_season_df,
 player_shooting_in_regular_season_df,
 player_play_by_play_in_regular_season_df) = list_of_dataframes_to_address
            

### Drop "duplicate" records where there are multiple records resulting from players changing team mid season.

### Keep the record with combined stats (most games played). 

### Validate only one player record for each year after changes

In [None]:
list_of_dataframes_to_clean = [basic_player_stats_regular_season_df,
                               advanced_player_stats_in_regular_season_df,
                               player_shooting_in_regular_season_df,
                               player_play_by_play_in_regular_season_df]

player_unique_key = 'Player Unique ID'

# A player-year is identified by these two columns together
player_year_columns = [player_unique_key, 'Year']

for i, dataframe_to_clean in enumerate(list_of_dataframes_to_clean):

    # Sort by year -> player -> G (descending) so the record with the most games
    # played (the combined-stats record) comes first within each player-year.
    # One pass over the whole dataframe is enough; it does not need repeating per year.
    dataframe_to_clean = dataframe_to_clean.sort_values(by=['Year', player_unique_key, 'G'], ascending=[True, True, False])

    # Drop duplicates, so only retaining the record with combined player stats.
    # Deduplicating on the two key columns directly avoids writing a temporary
    # helper column onto the input dataframe.
    dataframe_to_clean = dataframe_to_clean.drop_duplicates(subset=player_year_columns, keep='first')

    # Validate only one player record for each year in dataframe
    still_duplicated = dataframe_to_clean.duplicated(subset=player_year_columns, keep=False)

    if still_duplicated.any():
        affected_years = dataframe_to_clean.loc[still_duplicated, 'Year'].unique().tolist()
        # Identify the dataframe by its list position rather than dumping its contents
        raise Exception(f'Multiple player records detected for {affected_years} in dataframe at position {i} in list.')

    # Assign dataframe to overwrite dataframe in list
    list_of_dataframes_to_clean[i] = dataframe_to_clean

# Assign dataframes to rewrite with changes held in list of dataframes
basic_player_stats_regular_season_df,advanced_player_stats_in_regular_season_df,player_shooting_in_regular_season_df,player_play_by_play_in_regular_season_df = list_of_dataframes_to_clean

### Validate all regular season player stat dataframes are same length after changes

In [None]:
list_of_dataframes_to_clean = [
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
]

# All four dataframes are expected to have the same number of rows. The first
# dataframe in the list provides the reference length for the others.
dataframe_length = 0

for i, dataframe in enumerate(list_of_dataframes_to_clean):

    current_length = len(dataframe)

    if i == 0:
        # Reference length
        dataframe_length = current_length
    elif current_length != dataframe_length:
        raise Exception('Dataframes should have matching length. Review why they do not.')

### Ensure we do not have same issue for playoff (logically we shouldn't as players aren't traded during playoffs)

 <a class="anchor" id="Step26"></a>

In [None]:
list_of_playoff_dataframes_to_check = [player_basic_playoff_stats_df,
                                       player_advanced_stats_in_playoffs_df,
                                       player_shooting_stats_in_playoffs_df,
                                       player_play_by_play_stats_in_playoffs_df]

# Every playoff dataframe uses the same player key column
player_unique_key = 'Player Unique ID'

# For each playoff dataframe and each year, the number of rows must equal the number
# of distinct players; otherwise some player has more than one record for that year.
for i, dataframe in enumerate(list_of_playoff_dataframes_to_check):

    for year in dataframe['Year'].unique():

        # Filter once and reuse for both counts
        records_for_year = dataframe.loc[dataframe['Year']==year]

        length_of_dataframe = len(records_for_year)
        number_of_players = len(records_for_year[player_unique_key].unique())

        if length_of_dataframe != number_of_players:
            # Identify the dataframe by its list position rather than dumping its contents
            raise Exception(f'Multiple player records detected for {year} in playoff dataframe at position {i} in list.')

### Validate all playoff player stat dataframes are same length as expected

In [None]:
list_of_playoff_dataframes_to_check = [
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
]

# All four playoff dataframes are expected to have the same number of rows. The
# first dataframe in the list provides the reference length for the others.
dataframe_length = 0

for i, dataframe in enumerate(list_of_playoff_dataframes_to_check):

    current_length = len(dataframe)

    if i == 0:
        # Reference length
        dataframe_length = current_length
    elif current_length != dataframe_length:
        raise Exception('Dataframes should have matching length. Review why they do not.')

## Normalizing and standardizing team names data for team changes

 <a class="anchor" id="Step27"></a>

### Create dictionary to later convert older team names (e.g. Charlotte Bobcats) to modern team names (e.g. Charlotte Hornets)

### Also add aliases found in validation testing

In [None]:
## NOTES ON TEAM CHANGES ##
#
# - Seattle SuperSonics from 2004 through the 2007-2008 season
#     -> Oklahoma City Thunder from the 2008-2009 season onward
# - Charlotte Bobcats' first season was 2004-2005
#     -> Charlotte Hornets from the 2014-2015 season onward
# - New Orleans Hornets until the 2004-2005 season, then New Orleans/OKC Hornets
#   for two seasons (2005-2006 and 2006-2007), then New Orleans Hornets again
#   through the 2012-2013 season
#     -> New Orleans Pelicans from the 2013-2014 season onward
# - New Jersey Nets until the 2011-2012 season
#     -> Brooklyn Nets from the 2012-2013 season onward


## CREATE TEAM NAME REPLACEMENT DICTIONARY BASED ON CHANGES ##

# Maps a team name as it appears in the source data to the modern team name.
team_name_replacement_dict = {
    # Older franchise names (Seattle appears with two capitalizations)
    'Seattle Supersonics': 'Oklahoma City Thunder',
    'Seattle SuperSonics': 'Oklahoma City Thunder',
    'Charlotte Bobcats': 'Charlotte Hornets',
    'New Orleans Hornets': 'New Orleans Pelicans',
    'New Orleans/Oklahoma City Hornets': 'New Orleans Pelicans',
    'New Jersey Nets': 'Brooklyn Nets',
    # Other spellings of team names (note the double space in the last key)
    'Warriors': 'Golden State Warriors',
    'Portland Trailblazers': 'Portland Trail Blazers',
    'New Jersey  Nets': 'Brooklyn Nets',
}

### Create dictionary to convert city in team name field (e.g. CHA) to modern day Team Name (e.g. Charlotte Hornets)

In [None]:
## CREATE CITY ABBREVIATION TO TEAM NAME DICTIONARY

# Maps the abbreviation found in the team field to the modern team name.
# Several abbreviations map to the same team name.
city_name_to_team_name_dict = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHA': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CHO': 'Charlotte Hornets',       # same team name as 'CHA'
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NJN': 'Brooklyn Nets',           # same team name as 'BRK'
    'NOH': 'New Orleans Pelicans',
    'NOK': 'New Orleans Pelicans',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'SEA': 'Oklahoma City Thunder',   # same team name as 'OKC'
    'TOR': 'Toronto Raptors',
    'TOT': 'Multiple Teams',          # placeholder label, not a team
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards',
}

### Define final list of acceptable team names: the 30 NBA teams plus the 'League Average' and 'Multiple Teams' placeholder labels

In [None]:
# Values allowed in the standardized team name column
acceptable_team_name_list = [
    # Team names
    'Dallas Mavericks',
    'Sacramento Kings',
    'Golden State Warriors',
    'Atlanta Hawks',
    'Boston Celtics',
    'Oklahoma City Thunder',
    'Los Angeles Lakers',
    'Utah Jazz',
    'Memphis Grizzlies',
    'Indiana Pacers',
    'New York Knicks',
    'Denver Nuggets',
    'Minnesota Timberwolves',
    'Philadelphia 76ers',
    'New Orleans Pelicans',
    'Milwaukee Bucks',
    'Phoenix Suns',
    'Toronto Raptors',
    'Los Angeles Clippers',
    'Miami Heat',
    'Detroit Pistons',
    'Houston Rockets',
    'Charlotte Hornets',
    'Cleveland Cavaliers',
    'Orlando Magic',
    'San Antonio Spurs',
    'Chicago Bulls',
    'Washington Wizards',
    'Brooklyn Nets',
    'Portland Trail Blazers',
    # Labels that are not a single team
    'League Average',
    'Multiple Teams',
]

In [None]:
# Display the regular-season basic player stats dataframe for a visual check
basic_player_stats_regular_season_df

### Using replacement dictionaries created above, create standardized team name column for every dataframe

In [None]:
list_of_all_dataframes = [basic_player_stats_regular_season_df,
                           advanced_player_stats_in_regular_season_df,
                           player_shooting_in_regular_season_df,
                           player_play_by_play_in_regular_season_df,
                           player_basic_playoff_stats_df,
                           player_advanced_stats_in_playoffs_df,
                           player_shooting_stats_in_playoffs_df,
                           player_play_by_play_stats_in_playoffs_df,
                           mvp_vote_df,
                           roy_vote_df,
                           all_nba_df,
                           all_defense_df,
                           mip_df,
                           six_man_df,
                           dpoy_df,
                           finals_mvp,
                          nba_team_basic_stat_in_regular_season_df,
                          nba_team_basic_opponent_stat_in_regular_season_df,
                          nba_advanced_team_stats,
                          nba_team_shooting_stat_in_regular_season_df,
                          nba_team_opponent_shooting_stat_in_regular_season_df,
                          nba_team_basic_stat_in_playoffs_df,
                          nba_team_basic_opponent_stat_in_playoffs_df,
                          nba_team_advanced_stat_in_playoffs_df,
                          nba_team_shooting_in_playoffs_df,
                          nba_team_opponent_shooting_stat_in_playoffs_df,
                          nba_team_playoff_standing,
                          regular_season_standing_df
                         ]

standardized_team_column = 'Standardized/Modernized Team Name'

for i, dataframe_to_apply_dicts_to in enumerate(list_of_all_dataframes):

    # Progress indicator: position of the dataframe being processed
    print(i)

    # For each dataframe, create a column that is the refined team name column with
    # each replacement dict applied to it (full team names first, then abbreviations)
    dataframe_to_apply_dicts_to[standardized_team_column] = (
        dataframe_to_apply_dicts_to['Team Name Refined - accounts for players being traded']
        .replace(team_name_replacement_dict)
        .replace(city_name_to_team_name_dict)
    )

    # Missing names are checked BEFORE the acceptable-name check: a missing value is
    # never found in the acceptable list, so checking it afterwards would always be
    # pre-empted by the "non acceptable" error and this message would never be seen.
    if dataframe_to_apply_dicts_to[standardized_team_column].isnull().any():
        raise Exception(f'missing team name in dataframe: {i}')

    # Ensure all records have an acceptable standardized team name
    unacceptable_team_name = [
        team_name
        for team_name in dataframe_to_apply_dicts_to[standardized_team_column].unique()
        if team_name not in acceptable_team_name_list
    ]

    if len(unacceptable_team_name) > 0:
        # Include the list position so the offending dataframe can be identified
        raise Exception(f'non acceptable team names: {unacceptable_team_name} (dataframe: {i})')

    # Assign to list
    list_of_all_dataframes[i] = dataframe_to_apply_dicts_to

    # Printed again once the dataframe has passed validation
    print(i)

# Assign changes from list to actual dataframes
basic_player_stats_regular_season_df,advanced_player_stats_in_regular_season_df,player_shooting_in_regular_season_df,player_play_by_play_in_regular_season_df,player_basic_playoff_stats_df,player_advanced_stats_in_playoffs_df,player_shooting_stats_in_playoffs_df,player_play_by_play_stats_in_playoffs_df,mvp_vote_df,roy_vote_df,all_nba_df,all_defense_df,mip_df,six_man_df,dpoy_df,finals_mvp,nba_team_basic_stat_in_regular_season_df,nba_team_basic_opponent_stat_in_regular_season_df,nba_advanced_team_stats,nba_team_shooting_stat_in_regular_season_df,nba_team_opponent_shooting_stat_in_regular_season_df,nba_team_basic_stat_in_playoffs_df,nba_team_basic_opponent_stat_in_playoffs_df,nba_team_advanced_stat_in_playoffs_df,nba_team_shooting_in_playoffs_df,nba_team_opponent_shooting_stat_in_playoffs_df,nba_team_playoff_standing,regular_season_standing_df = list_of_all_dataframes

### Create Player Unique ID-Year concat for all player dataframes. This will serve as the primary key to connect all player dataframes

 <a class="anchor" id="Step28"></a>

In [None]:
list_of_player_dataframes = [
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
]

# Add a "<Player Unique ID>-<Year>" key column to every player dataframe
for i, player_dataframe in enumerate(list_of_player_dataframes):

    # Progress indicator: position of the dataframe in the list
    print(i)

    year_as_text = player_dataframe['Year'].astype(str)
    player_dataframe['Player_unique_id_and_year_concat'] = player_dataframe['Player Unique ID'] + '-' + year_as_text

    # Store the updated dataframe back at the same position
    list_of_player_dataframes[i] = player_dataframe

# Rebind every named dataframe to the corresponding entry of the list
(
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
) = list_of_player_dataframes

In [None]:
# Display the regular-season basic player stats dataframe for a visual check
basic_player_stats_regular_season_df

### Create Standardized Team Name-Year concat for all dataframes. This will be primary key to connect player data to all team-related data. 

 <a class="anchor" id="Step29"></a>

In [None]:
list_of_all_dataframes = [
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
    nba_team_basic_stat_in_regular_season_df,
    nba_team_basic_opponent_stat_in_regular_season_df,
    nba_advanced_team_stats,
    nba_team_shooting_stat_in_regular_season_df,
    nba_team_opponent_shooting_stat_in_regular_season_df,
    nba_team_basic_stat_in_playoffs_df,
    nba_team_basic_opponent_stat_in_playoffs_df,
    nba_team_advanced_stat_in_playoffs_df,
    nba_team_shooting_in_playoffs_df,
    nba_team_opponent_shooting_stat_in_playoffs_df,
    nba_team_playoff_standing,
    regular_season_standing_df,
]

# Add a "<Standardized/Modernized Team Name>-<Year>" key column to every dataframe
for i, dataframe in enumerate(list_of_all_dataframes):

    # Progress indicator: position of the dataframe in the list
    print(i)

    year_as_text = dataframe['Year'].astype(str)
    dataframe['Team_name_and_year_concat'] = dataframe['Standardized/Modernized Team Name'] + '-' + year_as_text

    # Store the updated dataframe back at the same position
    list_of_all_dataframes[i] = dataframe

# Rebind every named dataframe to the corresponding entry of the list
(
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    player_shooting_in_regular_season_df,
    player_play_by_play_in_regular_season_df,
    player_basic_playoff_stats_df,
    player_advanced_stats_in_playoffs_df,
    player_shooting_stats_in_playoffs_df,
    player_play_by_play_stats_in_playoffs_df,
    mvp_vote_df,
    roy_vote_df,
    all_nba_df,
    all_defense_df,
    mip_df,
    six_man_df,
    dpoy_df,
    finals_mvp,
    nba_team_basic_stat_in_regular_season_df,
    nba_team_basic_opponent_stat_in_regular_season_df,
    nba_advanced_team_stats,
    nba_team_shooting_stat_in_regular_season_df,
    nba_team_opponent_shooting_stat_in_regular_season_df,
    nba_team_basic_stat_in_playoffs_df,
    nba_team_basic_opponent_stat_in_playoffs_df,
    nba_team_advanced_stat_in_playoffs_df,
    nba_team_shooting_in_playoffs_df,
    nba_team_opponent_shooting_stat_in_playoffs_df,
    nba_team_playoff_standing,
    regular_season_standing_df,
) = list_of_all_dataframes

# For every dataframe, replace column names as desired. 

### (Rename columns with unique names so that no columns between dataframes are shared. This will help avoid issues with understanding data after joining tables)

 <a class="anchor" id="Step30"></a>

## REGULAR SEASON PLAYER STATS

### Regular Season Basic Player Stats

In [None]:
# Prefix placed in front of every renamed column of this table
unique_table_name = "Regular Season Basic Player Stats - "

# Source column name -> descriptive name (the table prefix is prepended below)
descriptive_column_names = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Build the final rename mapping by prefixing each descriptive name
name_replacement_dict = {
    original_name: f"{unique_table_name}{descriptive_name}"
    for original_name, descriptive_name in descriptive_column_names.items()
}

# Rename the columns on the existing dataframe object
basic_player_stats_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Print every column name so any unwanted column can be spotted
for column in basic_player_stats_regular_season_df.columns:
    print(f"\n    {column}")

basic_player_stats_regular_season_df.head(1)


### Regular Season Advanced Player Stats

In [None]:
# Prefix placed in front of every renamed column of this table
unique_table_name = "Regular Season Advanced Player Stats - "

# Source column name -> descriptive name (the table prefix is prepended below)
descriptive_column_names = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "PER": "Player Efficiency",
    "DBPM": "Defensive Box Plus/Minus",
    "VORP": "Value Above Replacement Player",
    "TRB%": "Total Rebounding %",
    "ORB%": "Offensive Rebounding %",
    "WS": "Win Shares",
    "BLK%": "Block %",
    "DWS": "Total Defensive Win Shares For Season",
    "TOV%": "Turnover % for Season",
    "STL%": "Steal Percentage for Season",
    "FTr": "Free Throw Rate for Season",
    "DRB%": "Defensive Rebound Percentage",
    "3PAr": "3 Point Attempt Rate",
    "AST%": "Assist % For Season",
    "OBPM": "Offensive Box Plus/Minus For Season",
    "BPM": "Box Plus/Minus For Season",
    "WS/48": "Win Shares Per 48 Minutes Played For the Season",
    "USG%": "Usage % For Season",
    "OWS": "Total Offensive Win Shares For Season",
    "TS%": "True Shooting % For Season",
    "MP": "Total Minutes Played During Season",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Build the final rename mapping by prefixing each descriptive name
name_replacement_dict = {
    original_name: f"{unique_table_name}{descriptive_name}"
    for original_name, descriptive_name in descriptive_column_names.items()
}

# Rename the columns on the existing dataframe object
advanced_player_stats_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to remove from the dataframe
list_of_columns_to_remove = ['Unnamed: 19', 'Unnamed: 24']

for item in list_of_columns_to_remove:
    advanced_player_stats_in_regular_season_df = advanced_player_stats_in_regular_season_df.drop(columns=item)

# Print every column name so any unwanted column can be spotted
for column in advanced_player_stats_in_regular_season_df.columns:
    print(f"\n    {column}")

advanced_player_stats_in_regular_season_df.head(1)

### Regular Season Shooting Player Stats

In [None]:
# Prefix placed in front of every renamed column of this table
unique_table_name = "Regular Season Player Shooting Stats - "

# Source column name -> descriptive name (the table prefix is prepended below)
descriptive_column_names = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "#": "Number of made dunk attempts",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    "#.1": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Build the final rename mapping by prefixing each descriptive name
name_replacement_dict = {
    original_name: f"{unique_table_name}{descriptive_name}"
    for original_name, descriptive_name in descriptive_column_names.items()
}

# Rename the columns on the existing dataframe object
player_shooting_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to remove from the dataframe
list_of_columns_to_remove = [
    'Unnamed: 9',
    'Unnamed: 16',
    'Unnamed: 23',
    'Unnamed: 26',
    'Unnamed: 29',
    'Unnamed: 32',
]

for item in list_of_columns_to_remove:
    player_shooting_in_regular_season_df = player_shooting_in_regular_season_df.drop(columns=item)

# Print every column name so any unwanted column can be spotted
for column in player_shooting_in_regular_season_df.columns:
    print(f"\n    {column}")

player_shooting_in_regular_season_df.head(1)

### Regular Season Play by Play Stats

In [None]:
# Prefix placed in front of every renamed column of this table
unique_table_name = "Regular Season Play by Play Player Stats - "

# Source column name -> descriptive name (the table prefix is prepended below)
descriptive_column_names = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "PG%": "Percentage of Total Minutes Played at Point Guard",
    "SG%": "Percentage of Total Minutes Played at Shooting Guard",
    "SF%": "Percentage of Total Minutes Played at Small Forward",
    "PF%": "Percentage of Total Minutes Played at Power Forward",
    "C%": "Percentage of Total Minutes Played at Center",
    "OnCourt": "Plus/Minus Per 100 Possessions (On Court)",
    "On-Off": "Plus/Minus Net Per 100 Possessions (On/off)",
    "BadPass": "Total Turnovers by Bad Pass",
    "LostBall": "Total Lost Ball Turnovers",
    "Shoot": "Shooting Fouls Committed",
    "Off.": "Offensive Fouls Committed",
    "Shoot.1": "Shooting Fouls Drawn",
    "Off..1": "Offensive Fouls Drawn",
    "PGA": "Points Generated by Assists",
    "And1": "Number of And1 Attempts After Made Field Goal",
    "Blkd": "Field Goal Attempts Blocked",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Build the final rename mapping by prefixing each descriptive name
name_replacement_dict = {
    original_name: f"{unique_table_name}{descriptive_name}"
    for original_name, descriptive_name in descriptive_column_names.items()
}

# Rename the columns on the existing dataframe object
player_play_by_play_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# (No columns are dropped for this table.)

# Print every column name so any unwanted column can be spotted
for column in player_play_by_play_in_regular_season_df.columns:
    print(f"\n    {column}")

player_play_by_play_in_regular_season_df.head(1)

## PLAYOFFS PLAYER STATS

### Playoffs Basic Player Stats

In [None]:
# Prefix prepended to every renamed column of the playoffs basic player table.
unique_table_name = "Playoffs Basic Player Stats - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
player_basic_playoff_stats_df.rename(columns=name_replacement_dict, inplace=True)

# Print every resulting column name for a visual check of what remains.
for column_name in player_basic_playoff_stats_df.columns:
    print(f"\n    {column_name}")

player_basic_playoff_stats_df.head(1)


### Playoffs Advanced Player Stats

In [None]:
# Prefix prepended to every renamed column of the playoffs advanced player table.
unique_table_name = "Playoffs Advanced Player Stats - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "PER": "Player Efficiency",
    "DBPM": "Defensive Box Plus/Minus",
    "VORP": "Value Above Replacement Player",
    "TRB%": "Total Rebounding %",
    "ORB%": "Offensive Rebounding %",
    "WS": "Win Shares",
    "BLK%": "Block %",
    "DWS": "Total Defensive Win Shares For Season",
    "TOV%": "Turnover % for Season",
    "STL%": "Steal Percentage for Season",
    "FTr": "Free Throw Rate for Season",
    "DRB%": "Defensive Rebound Percentage",
    "3PAr": "3 Point Attempt Rate",
    "AST%": "Assist % For Season",
    "OBPM": "Offensive Box Plus/Minus For Season",
    "BPM": "Box Plus/Minus For Season",
    "WS/48": "Win Shares Per 48 Minutes Played For the Season",
    "USG%": "Usage % For Season",
    "OWS": "Total Offensive Win Shares For Season",
    "TS%": "True Shooting % For Season",
    "MP": "Total Minutes Played During Season",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
player_advanced_stats_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table.
list_of_columns_to_remove = ['Unnamed: 19', 'Unnamed: 24']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
player_advanced_stats_in_playoffs_df = player_advanced_stats_in_playoffs_df.drop(
    columns=list_of_columns_to_remove, errors="ignore"
)

# Print every resulting column name for a visual check of what remains.
for column_name in player_advanced_stats_in_playoffs_df.columns:
    print(f"\n    {column_name}")

player_advanced_stats_in_playoffs_df.head(1)

### Playoffs Shooting Player Stats

In [None]:
# Prefix prepended to every renamed column of the playoffs shooting table.
unique_table_name = "Playoffs Player Shooting Stats - "

# Source column header -> human-readable label (without the table prefix).
# Headers ending in ".1" / ".2" are distinct source columns that share a base
# name (e.g. "2P", "2P.1", "2P.2"); each one gets its own label here.
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "#": "Number of made dunk attempts",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    "#.1": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
player_shooting_stats_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table.
list_of_columns_to_remove = ['Unnamed: 9', 'Unnamed: 16', 'Unnamed: 23', 'Unnamed: 26', 'Unnamed: 29', 'Unnamed: 32']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
player_shooting_stats_in_playoffs_df = player_shooting_stats_in_playoffs_df.drop(
    columns=list_of_columns_to_remove, errors="ignore"
)

# Print every resulting column name for a visual check of what remains.
for column_name in player_shooting_stats_in_playoffs_df.columns:
    print(f"\n    {column_name}")

player_shooting_stats_in_playoffs_df.head(1)

### Playoffs Play by Play Player Stats

In [None]:
# Prefix prepended to every renamed column of the playoffs play-by-play table.
unique_table_name = "Playoffs Play by Play Player Stats - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "PG%": "Percentage of Total Minutes Played at Point Guard",
    "SG%": "Percentage of Total Minutes Played at Shooting Guard",
    "SF%": "Percentage of Total Minutes Played at Small Forward",
    "PF%": "Percentage of Total Minutes Played at Power Forward",
    "C%": "Percentage of Total Minutes Played at Center",
    "OnCourt": "Plus/Minus Per 100 Possessions (On Court)",
    "On-Off": "Plus/Minus Net Per 100 Possessions (On/off)",
    "BadPass": "Total Turnovers by Bad Pass",
    "LostBall": "Total Lost Ball Turnovers",
    "Shoot": "Shooting Fouls Committed",
    "Off.": "Offensive Fouls Committed",
    "Shoot.1": "Shooting Fouls Drawn",
    "Off..1": "Offensive Fouls Drawn",
    "PGA": "Points Generated by Assists",
    "And1": "Number of And1 Attempts After Made Field Goal",
    "Blkd": "Field Goal Attempts Blocked",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
player_play_by_play_stats_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# No columns are dropped for this table: the drop step here was disabled
# (commented out), and its leftover code named the regular-season dataframe.

# Print every resulting column name for a visual check of what remains.
for column_name in player_play_by_play_stats_in_playoffs_df.columns:
    print(f"\n    {column_name}")

player_play_by_play_stats_in_playoffs_df.head(1)

## Player Awards

### MVP Award

In [None]:
# Prefix prepended to every renamed column of the MVP voting table.
unique_table_name = "MVP Vote Results - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rank": "Rank",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
mvp_vote_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'MP']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
mvp_vote_df = mvp_vote_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in mvp_vote_df.columns:
    print(f"\n    {column_name}")

mvp_vote_df.head(1)

### ROY Award

In [None]:
# Prefix prepended to every renamed column of the Rookie of the Year voting table.
unique_table_name = "ROY Vote Results - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rank": "Rank",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
roy_vote_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'MP']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
roy_vote_df = roy_vote_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in roy_vote_df.columns:
    print(f"\n    {column_name}")

roy_vote_df.head(1)

### All NBA Award

In [None]:
# Prefix prepended to every renamed column of the All NBA voting table.
unique_table_name = "All NBA Vote Results - "

# Source column header -> human-readable label (without the table prefix).
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
all_nba_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
all_nba_df = all_nba_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in all_nba_df.columns:
    print(f"\n    {column_name}")

all_nba_df.head(1)

### All Defensive Award

In [None]:
# Prefix prepended to every renamed column of the All Defensive voting table.
unique_table_name = "All Defensive Vote Results - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): the "# Tm" / "1st Tm" / "2nd Tm" / "3rd Tm" labels say
# "All NBA" although this is the All Defensive table — looks like a copy-paste.
# They are left unchanged here because later cells may reference the resulting
# column names; confirm before rewording.
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
all_defense_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'DWS', 'DBPM', 'DRtg']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
all_defense_df = all_defense_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in all_defense_df.columns:
    print(f"\n    {column_name}")

all_defense_df.head(1)

### Most Improved Player Award

In [None]:
# Prefix prepended to every renamed column of the Most Improved Player voting table.
unique_table_name = "MIP Vote Results - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): the "# Tm" / "1st Tm" / "2nd Tm" / "3rd Tm" entries carry
# "All NBA" labels and appear to be copied from the All NBA cell. rename()
# skips keys that are not present, so they are harmless if this table has no
# such columns — confirm against the source data.
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
mip_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
mip_df = mip_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in mip_df.columns:
    print(f"\n    {column_name}")

mip_df.head(1)

### 6 Man of the Year Award

In [None]:
# Prefix prepended to every renamed column of the 6 Man of the Year voting table.
unique_table_name = "6 Man Vote Results - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): the "# Tm" / "1st Tm" / "2nd Tm" / "3rd Tm" entries carry
# "All NBA" labels and appear to be copied from the All NBA cell. rename()
# skips keys that are not present, so they are harmless if this table has no
# such columns — confirm against the source data.
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
six_man_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
six_man_df = six_man_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in six_man_df.columns:
    print(f"\n    {column_name}")

six_man_df.head(1)

### DPOY Award

In [None]:
# Prefix prepended to every renamed column of the Defensive Player of the Year table.
unique_table_name = "DPOY Results - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): the "# Tm" / "1st Tm" / "2nd Tm" / "3rd Tm" entries carry
# "All NBA" labels and appear to be copied from the All NBA cell. rename()
# skips keys that are not present, so they are harmless if this table has no
# such columns — confirm against the source data.
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
dpoy_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48', 'DWS', 'DBPM', 'DRtg']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
dpoy_df = dpoy_df.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in dpoy_df.columns:
    print(f"\n    {column_name}")

dpoy_df.head(1)

### Finals MVP Award

In [None]:
# Prefix prepended to every renamed column of the Finals MVP table.
unique_table_name = "Finals MVP Results - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): the "# Tm" / "1st Tm" / "2nd Tm" / "3rd Tm" entries carry
# "All NBA" labels and appear to be copied from the All NBA cell. rename()
# skips keys that are not present, so they are harmless if this table has no
# such columns — confirm against the source data.
column_labels = {
    "Rank": "Rank",
    "# Tm": "All NBA Award (e.g. 1st Team)",
    "1st Tm": "All NBA 1st Team Votes",
    "2nd Tm": "All NBA 2nd Team Votes",
    "3rd Tm": "All NBA 3rd Team Votes",
    "Pos": "Position",
    "Player": "Player",
    "Age": "Age",
    "Team": "Team",
    "First": "First Place Votes",
    "Pts Won": "Total Voting Points Won",
    "Pts Max": "Total Voting Points Possible",
    "Share": "% of Total Voting Points Possible That Were Won",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
finals_mvp.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table (none of them are renamed above).
list_of_columns_to_remove = ['Lg', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
finals_mvp = finals_mvp.drop(columns=list_of_columns_to_remove, errors="ignore")

# Print every resulting column name for a visual check of what remains.
for column_name in finals_mvp.columns:
    print(f"\n    {column_name}")

finals_mvp.head(1)

## TEAM REGULAR SEASON STATS

 <a class="anchor" id="Step31"></a>

### TEAM Regular Season Basic Stats

In [None]:
# Prefix prepended to every renamed column of the regular-season basic team table.
unique_table_name = "Regular Season Basic Team Stats - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): this mapping has the same keys as the player basic-stats cell
# ("Player", "Pos", "GS", "Player Unique ID", ...); rename() skips keys that
# are not present, so player-only entries are presumably no-ops here — confirm.
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
nba_team_basic_stat_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Print every resulting column name for a visual check of what remains.
for column_name in nba_team_basic_stat_in_regular_season_df.columns:
    print(f"\n    {column_name}")

nba_team_basic_stat_in_regular_season_df.head(1)


### TEAM Regular Season Basic Opponent Stats

In [None]:
# Prefix prepended to every renamed column of the regular-season team opponent table.
unique_table_name = "Regular Season Basic Team Opponent Stats - "

# Source column header -> human-readable label (without the table prefix).
# NOTE(review): this mapping has the same keys as the player basic-stats cell
# ("Player", "Pos", "GS", "Player Unique ID", ...); rename() skips keys that
# are not present, so player-only entries are presumably no-ops here — confirm.
column_labels = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
nba_team_basic_opponent_stat_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Print every resulting column name for a visual check of what remains.
for column_name in nba_team_basic_opponent_stat_in_regular_season_df.columns:
    print(f"\n    {column_name}")

nba_team_basic_opponent_stat_in_regular_season_df.head(1)

### TEAM Regular Season Advanced Stats

In [None]:
# Prefix prepended to every renamed column of the regular-season advanced team table.
unique_table_name = "Regular Season Team Advanced Stats - "

# Source column header -> human-readable label (without the table prefix).
# Headers ending in ".1" are second source columns sharing a base name; the
# labels below mark them as the opponent-side figures.
# NOTE(review): two labels look like typos but are kept verbatim because later
# cells may reference the resulting column names — confirm before rewording:
#   "PL"       says "expected wins" where "expected losses" seems intended;
#   "FT/FGA.1" says "Field Goal Throws" where "Free Throws" seems intended.
column_labels = {
    "Rk": "Rank",
    "Age": "Average Player Age",
    "Team": "Team",
    "W": "Wins",
    "L": "Losses",
    "PW": "Pythagorean Wins (expected wins based on points scored and allowed)",
    "PL": "Pythagorean Losses (expected wins based on points scored and allowed)",
    "MOV": "Margin of Victory",
    "SOS": "Strength of schedule rating (positive is above average)",
    "SRS": "Simple rating system: derived using average point differential and strength of scheduled",
    "ORtg": "Offensive Rating",
    "DRtg": "Defensive Rating",
    "NRtg": "Net Rating",
    "Pace": "Pace (Possessions per 48 Minutes)",
    "FTr": "Free Throw Attempts Per FG Attempt",
    "3PAr": "3 Point Attempts Per FG Attempt",
    "TS%": "True Shooting %",
    "eFG%": "Average Effective Field Goal % For Season",
    "TOV%": "Turnover %",
    "ORB%": "Offensive Rebounding %",
    "FT/FGA": "Free Throws Made Per FG Attempt",
    "eFG%.1": "Opponent Effective Field Goal %",
    "TOV%.1": "Opponent Turnover %",
    "DRB%": "Defensive Rebounding %",
    "FT/FGA.1": "Opponent Field Goal Throws Per FG Attempt",
    "Arena": "Arena",
    "Attend.": "Total Arena Attendance",
    "Attend./G": "Attendance Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final mapping: every label gets the table prefix in front of it.
name_replacement_dict = {
    source_name: f"{unique_table_name}{label}"
    for source_name, label in column_labels.items()
}

# Rename in place; headers that are not keys of the mapping are left untouched.
nba_advanced_team_stats.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard from this table.
list_of_columns_to_remove = ['Unnamed: 17', 'Unnamed: 22', 'Unnamed: 27']

# errors="ignore" keeps the cell re-runnable: on a second execution these
# columns are already gone, and a strict drop would raise KeyError.
nba_advanced_team_stats = nba_advanced_team_stats.drop(
    columns=list_of_columns_to_remove, errors="ignore"
)

# Print every resulting column name for a visual check of what remains.
for column_name in nba_advanced_team_stats.columns:
    print(f"\n    {column_name}")

nba_advanced_team_stats.head(1)

### TEAM Regular Season Shooting Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Regular Season Team Shooting Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "Md.": "Number of made dunk attempts",
    "%FGA.1": "% of Field Goal attempts that are layups",
    "Md..1": "# of made layups",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    "Md..2": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Replace names in dataframe
nba_team_shooting_stat_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard.
# NOTE(review): the 'Unnamed: N' columns are presumably blank spacer columns
# from the source export -- confirm before relying on that.
list_of_columns_to_remove = ['Unnamed: 6','Unnamed: 13','Unnamed: 20','Unnamed: 23','Unnamed: 26','Unnamed: 29','Unnamed: 32']

# Single drop call; errors='ignore' keeps the cell re-runnable (a second run
# would otherwise raise KeyError because the columns are already gone).
nba_team_shooting_stat_in_regular_season_df = nba_team_shooting_stat_in_regular_season_df.drop(
    columns=list_of_columns_to_remove, errors='ignore'
)

# Print the remaining column names so they can be eyeballed for leftovers.
for column in nba_team_shooting_stat_in_regular_season_df.columns:
    print(f"""
    {column}""")

nba_team_shooting_stat_in_regular_season_df.head(1)

### TEAM Opponent Regular Season Shooting Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Regular Season Team Opponent Shooting Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "Md.": "Number of made dunk attempts",
    "%FGA.1": "% of Field Goal attempts that are layups",
    "Md..1": "# of made layups",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    "Md..2": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Replace names in dataframe
nba_team_opponent_shooting_stat_in_regular_season_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard.
# NOTE(review): the 'Unnamed: N' columns are presumably blank spacer columns
# from the source export -- confirm before relying on that.
list_of_columns_to_remove = ['Unnamed: 6','Unnamed: 13','Unnamed: 20','Unnamed: 23','Unnamed: 26','Unnamed: 29']

# Single drop call; errors='ignore' keeps the cell re-runnable (a second run
# would otherwise raise KeyError because the columns are already gone).
nba_team_opponent_shooting_stat_in_regular_season_df = nba_team_opponent_shooting_stat_in_regular_season_df.drop(
    columns=list_of_columns_to_remove, errors='ignore'
)

# Print the remaining column names so they can be eyeballed for leftovers.
for column in nba_team_opponent_shooting_stat_in_regular_season_df.columns:
    print(f"""
    {column}""")

nba_team_opponent_shooting_stat_in_regular_season_df.head(1)

### TEAM Regular Season Standings

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Regular Season Team Standings - "

# Source column name -> human-readable description (prefix is added below).
column_descriptions = {
    "Rk": "Rank",
    "Team": "Team",
    "Overall": "Overall Team Record",
    "Home": "Team Record at Home",
    "Road": "Team Record on Road",
    "Pre": "Team Record Pre All Star Break",
    "Post": "Team Record Post All Star Break",
    "≤3": "Team Record in Games Decided by 3 points or under",
    "≥10": "Team Record in Games Decided by 10 points or over",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Rename in place; this table has no columns to drop.
regular_season_standing_df.rename(columns=name_replacement_dict, inplace=True)

# Print the resulting column names so they can be eyeballed for leftovers.
for column in regular_season_standing_df.columns:
    print(f"""
    {column}""")

regular_season_standing_df.head(1)

## TEAM PLAYOFFS STATS

### TEAM Playoffs Basic Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Playoffs Basic Team Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Rename in place; this table has no columns to drop.
nba_team_basic_stat_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Print the resulting column names so they can be eyeballed for leftovers.
for column in nba_team_basic_stat_in_playoffs_df.columns:
    print(f"""
    {column}""")

nba_team_basic_stat_in_playoffs_df.head(1)


### TEAM Playoffs Basic Opponent Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Playoffs Basic Team Opponent Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "GS": "Games Started",
    "MP": "Minutes Played Per Game",
    "FG": "Field Goals Made Per Game",
    "FGA": "Field Goals Attempted Per Game",
    "FG%": "Average Field Goal % For Season",
    "3P": "3 Pointers Made Per Game",
    "3PA": "3 Point Attempts Per Game",
    "3P%": "Average 3 Point % For Season",
    "2P": "2 Pointers Made Per Game",
    "2PA": "2 Point Attempts Per Game",
    "2P%": "Average 2 Point % For Season",
    "eFG%": "Average Effective Field Goal % For Season",
    "FT": "Free Throws Made Per Game",
    "FTA": "Free Throw Attempts Per Game",
    "FT%": "Average Free Throw % For Season",
    "ORB": "Offense Rebounds Per Game",
    "DRB": "Defense Rebounds Per Game",
    "TRB": "Total Rebounds Per Game",
    "AST": "Assists Per Game",
    "STL": "Steals Per Game",
    "BLK": "Blocks Per Game",
    "TOV": "Turnovers Per Game",
    "PF": "Personal Fouls Per Game",
    "PTS": "Points Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Rename in place; this table has no columns to drop.
nba_team_basic_opponent_stat_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Print the resulting column names so they can be eyeballed for leftovers.
for column in nba_team_basic_opponent_stat_in_playoffs_df.columns:
    print(f"""
    {column}""")

nba_team_basic_opponent_stat_in_playoffs_df.head(1)

### TEAM Playoffs Advanced Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Playoffs Team Advanced Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
# The description strings are kept exactly as they were, because later cells
# may reference the resulting column names verbatim.
column_descriptions = {
    "Rk": "Rank",
    "Age": "Average Player Age",
    "Team": "Team",
    "W": "Wins",
    "L": "Losses",
    "W/L%": "Win Loss Percentage",
    "PW": "Pythagorean Wins (expected wins based on points scored and allowed)",
    "PL": "Pythagorean Losses (expected wins based on points scored and allowed)",
    "MOV": "Margin of Victory",
    "SOS": "Strength of schedule rating (positive is above average)",
    "SRS": "Simple rating system: derived using average point differential and strength of scheduled",
    "ORtg": "Offensive Rating",
    "DRtg": "Defensive Rating",
    "NRtg": "Net Rating",
    "Pace": "Pace (Possessions per 48 Minutes)",
    "FTr": "Free Throw Attempts Per FG Attempt",
    "3PAr": "3 Point Attempts Per FG Attempt",
    "TS%": "True Shooting %",
    "eFG%": "Average Effective Field Goal % For Season",
    "TOV%": "Turnover %",
    "ORB%": "Offensive Rebounding %",
    "FT/FGA": "Free Throws Made Per FG Attempt",
    "eFG%.1": "Opponent Effective Field Goal %",
    "TOV%.1": "Opponent Turnover %",
    "DRB%": "Defensive Rebounding %",
    "FT/FGA.1": "Opponent Field Goal Throws Per FG Attempt",
    "Arena": "Arena",
    "Attend.": "Total Arena Attendance",
    "Attend./G": "Attendance Per Game",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Replace names in dataframe
nba_team_advanced_stat_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard.
# NOTE(review): the 'Unnamed: N' columns are presumably blank spacer columns
# from the source export -- confirm before relying on that.
list_of_columns_to_remove = ['Unnamed: 15','Unnamed: 20']

# Single drop call; errors='ignore' keeps the cell re-runnable (a second run
# would otherwise raise KeyError because the columns are already gone).
nba_team_advanced_stat_in_playoffs_df = nba_team_advanced_stat_in_playoffs_df.drop(
    columns=list_of_columns_to_remove, errors='ignore'
)

# Print the remaining column names so they can be eyeballed for leftovers.
for column in nba_team_advanced_stat_in_playoffs_df.columns:
    print(f"""
    {column}""")

nba_team_advanced_stat_in_playoffs_df.head(1)

### TEAM Playoffs Shooting Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Playoffs Team Shooting Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "Md.": "Number of made dunk attempts",
    "%FGA.1": "% of Field Goal attempts that are layups",
    "Md..1": "# of made layups",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    # NOTE(review): the sibling shooting tables key this column as 'Md..2';
    # confirm the playoff source really labels made heaves '#'. If it does
    # not, this entry is silently ignored and the column keeps its raw name.
    "#": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Replace names in dataframe
nba_team_shooting_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard.
# NOTE(review): the 'Unnamed: N' columns are presumably blank spacer columns
# from the source export -- confirm before relying on that.
list_of_columns_to_remove = ['Unnamed: 6','Unnamed: 13','Unnamed: 20','Unnamed: 23','Unnamed: 26','Unnamed: 29']

# Single drop call; errors='ignore' keeps the cell re-runnable (a second run
# would otherwise raise KeyError because the columns are already gone).
nba_team_shooting_in_playoffs_df = nba_team_shooting_in_playoffs_df.drop(
    columns=list_of_columns_to_remove, errors='ignore'
)

# Print the remaining column names so they can be eyeballed for leftovers.
for column in nba_team_shooting_in_playoffs_df.columns:
    print(f"""
    {column}""")

nba_team_shooting_in_playoffs_df.head(1)

### TEAM Playoffs Opponent Shooting Stats

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Playoffs Team Opponent Shooting Stats - "

# Source column name -> human-readable description (prefix is added below).
# Keys that do not exist in the dataframe are simply ignored by rename().
column_descriptions = {
    "Rk": "Rank",
    "Player": "Player",
    "Pos": "Position",
    "Age": "Age",
    "Team": "Team",
    "G": "Games Played",
    "MP": "Total Minutes Played During Season",
    "FG%": "Field Goal %",
    "Dist.": "Average distance of shot",
    "2P": "2 Pointer Attempt Rate For Season as % of total field goals attempted",
    "0-3": "% of FGA between 0-3 ft.",
    "3-10": "% of FGA between 3-10 ft.",
    "10-16": "% of FGA between 10-16 ft.",
    "16-3P": "% of FGA between 16-3P ft.",
    "3P": "3 Pointer Attempt Rate For Season as % of total field goals attempted",
    "2P.1": "FG % of 2 Point Attempts",
    "0-3.1": "FG % of Attempts between 0-3 ft.",
    "3-10.1": "FG % of Attempts between 3-10 ft.",
    "10-16.1": "FG % of Attempts between 10-16 ft.",
    "16-3P.1": "FG % of Attempts between 16-3P ft.",
    "3P.1": "FG % of 3 Point Attempts",
    "2P.2": "% of Made 2 Point Fields Goals That Were Assisted",
    "3P.2": "% of Made 3 Point Fields Goals That Were Assisted",
    "%FGA": "% of Field Goal attempts that are dunk attempts",
    "Md.": "Number of made dunk attempts",
    "%FGA.1": "% of Field Goal attempts that are layups",
    "Md..1": "# of made layups",
    "%3PA": "% of 3 Point Attempts that come from the corner",
    "3P%": "% of 3 Point corner attempts that are made",
    "Att.": "Heave Attempts",
    "Md..2": "Number of made heaves",
    "Player Unique ID": "Player Unique ID",
    "Year": "Year",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Replace names in dataframe
nba_team_opponent_shooting_stat_in_playoffs_df.rename(columns=name_replacement_dict, inplace=True)

# Columns to discard.
# NOTE(review): the 'Unnamed: N' columns are presumably blank spacer columns
# from the source export -- confirm before relying on that.
list_of_columns_to_remove = ['Unnamed: 6','Unnamed: 13','Unnamed: 20','Unnamed: 23','Unnamed: 26']

# Single drop call; errors='ignore' keeps the cell re-runnable (a second run
# would otherwise raise KeyError because the columns are already gone).
nba_team_opponent_shooting_stat_in_playoffs_df = nba_team_opponent_shooting_stat_in_playoffs_df.drop(
    columns=list_of_columns_to_remove, errors='ignore'
)

# Print the remaining column names so they can be eyeballed for leftovers.
for column in nba_team_opponent_shooting_stat_in_playoffs_df.columns:
    print(f"""
    {column}""")

nba_team_opponent_shooting_stat_in_playoffs_df.head(1)

### TEAM Playoffs Standings

In [None]:
# Prefix that makes every column of this table unique once tables are combined.
unique_table_name = "Team Playoff Standings - "

# Source column name -> human-readable description (prefix is added below).
column_descriptions = {
    "Rk": "Rank",
    "Team": "Team",
    "Overall": "Overall Record",
    "Home": "Record at home",
    "Road": "Record on the road",
    "E": "Record against Eastern Conference",
    "Year": "Year",
    "W": "Record against Western Conference",
    "Team Name Refined - accounts for players being traded": "Team Name Refined - accounts for players being traded",
    "Standardized/Modernized Team Name": "Standardized/Modernized Team Name",
}

# Final rename mapping: every description carries the table prefix.
name_replacement_dict = {
    source_name: f"{unique_table_name}{description}"
    for source_name, description in column_descriptions.items()
}

# Rename in place; this table has no columns to drop.
nba_team_playoff_standing.rename(columns=name_replacement_dict, inplace=True)

# Print the resulting column names so they can be eyeballed for leftovers.
for column in nba_team_playoff_standing.columns:
    print(f"""
    {column}""")

nba_team_playoff_standing.head(1)

In [None]:
# Seed 'Playoff Result' with the raw overall-record strings; a later statement
# in this cell converts them to labels. Re-seeding here keeps the cell re-runnable.
overall_record_column = 'Team Playoff Standings - Overall Record'
nba_team_playoff_standing['Playoff Result'] = nba_team_playoff_standing[overall_record_column]

def populate_playoff_result(x):
    """Turn a '<first>-<second>' record string into a playoff-outcome label.

    The two dash-separated numbers are parsed as integers (presumably wins and
    losses, given the 'Overall Record' column this is applied to). The label
    depends on the first number, except that '0-0' is its own case:

        0 and 0   -> 'Lost in play-in'
        below 4   -> 'Lost in 1st round'
        4 to 7    -> 'Lost in 2nd round'
        8 to 11   -> 'Lost in Conference Finals'
        12 to 15  -> 'Lost NBA Finals'
        16        -> 'Won NBA Finals'
        otherwise -> 'FLAG' (unexpected value, for manual review)

    Raises ValueError / IndexError if the string is not two dash-separated
    integers, and AttributeError if `x` is not a string.
    """
    record_parts = x.split('-')
    first_number, second_number = int(record_parts[0]), int(record_parts[1])

    # '0-0' is checked first because it would otherwise fall in the '< 4' bucket.
    if first_number == 0 and second_number == 0:
        return 'Lost in play-in'
    if first_number < 4:
        return 'Lost in 1st round'
    if first_number < 8:
        return 'Lost in 2nd round'
    if first_number < 12:
        return 'Lost in Conference Finals'
    if first_number < 16:
        return 'Lost NBA Finals'
    if first_number == 16:
        return 'Won NBA Finals'

    # Anything above 16 is outside the expected range.
    return 'FLAG'
    
# Convert each seeded record string into its playoff-outcome label.
playoff_result_labels = nba_team_playoff_standing['Playoff Result'].apply(populate_playoff_result)
nba_team_playoff_standing['Playoff Result'] = playoff_result_labels

# Preview the first rows to check the new labels.
nba_team_playoff_standing.head(5)

# Stage 6.) Feature Engineering before training different Machine Learning Models

 <a class="anchor" id="Step99"></a>

In [None]:
def rank_column_values(dataframe, name_of_year_value_column, list_of_columns_to_rank, rank_method='average', ascending=False):
    """Add a within-year rank column for each requested column.

    For every column in ``list_of_columns_to_rank`` a column named ``'<column> Ranked'``
    is written to ``dataframe``. Each row's rank is computed only against the rows that
    share its value in ``name_of_year_value_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to rank. It is modified in place and also returned.
    name_of_year_value_column : str
        Column whose distinct values define the groups to rank within.
    list_of_columns_to_rank : list of str
        Columns to rank; an empty list leaves the frame unchanged.
    rank_method : str, default 'average'
        Forwarded to ``Series.rank(method=...)``.
    ascending : bool, default False
        Forwarded to ``Series.rank(ascending=...)``; with False the largest value gets rank 1.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the ranked columns added.

    Notes
    -----
    Rows whose year value is missing match no year and keep NaN in the ranked columns.
    Ranks are written back by row position, so the result is correct even when index
    labels repeat (for example after concatenating yearly files without resetting the index).
    """
    year_values = dataframe[name_of_year_value_column]

    # Build one boolean row mask per distinct year once and reuse it for every column,
    # instead of copying and filtering the whole frame for each (column, year) pair.
    year_masks = [year_values == year for year in year_values.unique()]

    # A missing year never equals itself, so its mask selects nothing and is dropped
    year_masks = [in_year for in_year in year_masks if in_year.any()]

    for column in list_of_columns_to_rank:

        ranked_column_name = column + ' Ranked'

        dataframe[ranked_column_name] = np.nan

        for in_year in year_masks:

            yearly_ranks = dataframe.loc[in_year, column].rank(method=rank_method, ascending=ascending)

            # Assign the raw values so pandas writes by position rather than aligning on labels
            dataframe.loc[in_year, ranked_column_name] = yearly_ranks.to_numpy()

    return dataframe

## Prep player regular season basic stats for machine learning

###### Define treatment of columns

In [None]:

# Column groups for the basic regular-season player stats frame.
# Every name is the table prefix followed by the stat's own label.

# Categorical columns to one-hot encode
list_of_basic_player_stats_regular_season_df_categorical_to_encode = [
    f'Regular Season Basic Player Stats - {stat_name}'
    for stat_name in ('Position',)
]

# Columns removed before modelling (includes the two temporary 6MAN helper columns)
list_of_basic_player_stats_regular_season_df_columns_to_exclude = [
    f'Regular Season Basic Player Stats - {stat_name}'
    for stat_name in (
        'Rank',
        'Player Unique ID',
        'Team Name Refined - accounts for players being traded',
        '6MAN Game Started Maximum',
        '6MAN Game Minimum',
    )
]

# Columns that receive a within-year rank
list_of_basic_player_stats_regular_season_df_columns_to_rank = [
    f'Regular Season Basic Player Stats - {stat_name}'
    for stat_name in (
        'Total Rebounds Per Game',
        'Assists Per Game',
        'Steals Per Game',
        'Blocks Per Game',
        'Points Per Game',
        'Average Effective Field Goal % For Season',
    )
]

# Columns from which threshold indicator columns are derived
list_of_basic_player_stats_regular_season_df_columns_to_derive_indicator_for = [
    f'Regular Season Basic Player Stats - {stat_name}'
    for stat_name in ('Games Played', 'Minutes Played Per Game')
]

###### Create 6MAN games started and games played indicator ( > 60 games played, < 35 games started)

In [None]:
# Helper flags (1/0): more than 60 games played, and fewer than 35 games started
basic_player_stats_regular_season_df['Regular Season Basic Player Stats - 6MAN Game Minimum'] = (
    basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Games Played'].gt(60).astype(int)
)
basic_player_stats_regular_season_df['Regular Season Basic Player Stats - 6MAN Game Started Maximum'] = (
    basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Games Started'].lt(35).astype(int)
)

# A player qualifies only when both helper flags are set.
# NOTE(review): the qualification column name ends with a trailing space - confirm that is intended.
basic_player_stats_regular_season_df['Regular Season Basic Player Stats - 6MAN Game Played and Started Qualification '] = (
    basic_player_stats_regular_season_df['Regular Season Basic Player Stats - 6MAN Game Minimum'].eq(1)
    & basic_player_stats_regular_season_df['Regular Season Basic Player Stats - 6MAN Game Started Maximum'].eq(1)
).astype(int)

###### Perform transformations of data using lists above

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
basic_player_stats_regular_season_df_before_changes = basic_player_stats_regular_season_df.copy()

# Add a within-year '<column> Ranked' column for every stat in the rank list
basic_player_stats_regular_season_df = rank_column_values(basic_player_stats_regular_season_df,'Regular Season Basic Player Stats - Year',list_of_basic_player_stats_regular_season_df_columns_to_rank)

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified
basic_player_stats_regular_season_df.drop(columns=list_of_basic_player_stats_regular_season_df_columns_to_exclude, inplace=True)

# Encode nominal categorical variables using one-hot encoding
basic_player_stats_regular_season_df = pd.get_dummies(basic_player_stats_regular_season_df, columns=list_of_basic_player_stats_regular_season_df_categorical_to_encode)

###### Create indicator columns for games played and minutes per game

In [None]:
# 1 when the player appeared in more than 50 games, else 0
basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Game Minimum'] = (
    basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Games Played'].gt(50).astype(int)
)

# 1 when the player averaged at least 20 minutes per game, else 0
basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Minutes Per Game Minimum'] = (
    basic_player_stats_regular_season_df['Regular Season Basic Player Stats - Minutes Played Per Game'].ge(20).astype(int)
)


###### Create Delta columns for Most Improved Player Analysis

In [None]:
# columns_to_delta = ['Regular Season Basic Player Stats - Minutes Played Per Game',
#                     'Regular Season Basic Player Stats - Average Effective Field Goal % For Season',
#                    'Regular Season Basic Player Stats - Points Per Game',
#                    'Regular Season Basic Player Stats - Total Rebounds Per Game',
#                    'Regular Season Basic Player Stats - Assists Per Game',
#                    'Regular Season Basic Player Stats - Steals Per Game',
#                    'Regular Season Basic Player Stats - Blocks Per Game']

# for column_to_delta in columns_to_delta:
    
#     delta_column_name = f'{column_to_delta} Change from Previous Year' = np.nan
    
    
# for year in list(basic_player_stats_regular_season_df['Year'].unique()):
    
#     if year == 2003:
        
#         continue
        
#     year_before = year - 1
    
#     for column_to_delta in columns_to_delta:
    
#         delta_column_name = f'{column_to_delta} Change from Previous Year' = np.nan

# # Minutes played per game

In [None]:
basic_player_stats_regular_season_df.head(10)

## Prep player regular season advanced stats for machine learning

In [None]:
# Column groups for the advanced regular-season player stats frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'Regular Season Advanced Player Stats - {field}'
        for field in (
            'Rank',
            'Player',
            'Position',
            'Team',
            'Age',
            'Games Played',
            'Team Name Refined - accounts for players being traded',
            'Year',
            'Player Unique ID',
            'Standardized/Modernized Team Name',
        )
    ],
    'Team_name_and_year_concat',
]

# Stats that receive a within-year rank
list_of_columns_to_rank = [
    f'Regular Season Advanced Player Stats - {field}'
    for field in (
        'Player Efficiency',
        'True Shooting % For Season',
        'Offensive Rebounding %',
        'Defensive Rebound Percentage',
        'Total Rebounding %',
        'Assist % For Season',
        'Steal Percentage for Season',
        'Block %',
        'Turnover % for Season',
        'Usage % For Season',
        'Total Offensive Win Shares For Season',
        'Total Defensive Win Shares For Season',
        'Win Shares',
        'Win Shares Per 48 Minutes Played For the Season',
        'Offensive Box Plus/Minus For Season',
        'Defensive Box Plus/Minus',
        'Box Plus/Minus For Season',
        'Value Above Replacement Player',
    )
]

# No indicator columns are derived for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
advanced_player_stats_in_regular_season_df_before_changes = advanced_player_stats_in_regular_season_df.copy()

# Add a within-year '<column> Ranked' column for every stat in list_of_columns_to_rank.
# This must run before the drop below because the year column is itself in the exclude list.
advanced_player_stats_in_regular_season_df = rank_column_values(advanced_player_stats_in_regular_season_df,'Regular Season Advanced Player Stats - Year',list_of_columns_to_rank)

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list names are reassigned by later sections, so run the advanced-stats list cell first.
advanced_player_stats_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
advanced_player_stats_in_regular_season_df.head(10)

In [None]:
advanced_player_stats_in_regular_season_df.loc[advanced_player_stats_in_regular_season_df['Player_unique_id_and_year_concat']=='jokicni01-2023']

## Prep player regular season shooting stats for machine learning

In [None]:
# Column groups for the regular-season player shooting stats frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'Regular Season Player Shooting Stats - {field}'
        for field in (
            'Rank',
            'Player',
            'Position',
            'Team',
            'Age',
            'Games Played',
            'Total Minutes Played During Season',
            'Year',
            'Player Unique ID',
            'Standardized/Modernized Team Name',
        )
    ],
    'Team_name_and_year_concat',
    *[
        f'Regular Season Player Shooting Stats - {field}'
        for field in (
            'Team Name Refined - accounts for players being traded',
            'Field Goal %',
        )
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
player_shooting_in_regular_season_df.head(10)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
player_shooting_in_regular_season_df_before_changes = player_shooting_in_regular_season_df.copy()

# No ranking or encoding step here: the shooting-stats rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the shooting-stats list cell first.
player_shooting_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
player_shooting_in_regular_season_df.head(10)

In [None]:
player_shooting_in_regular_season_df.loc[player_shooting_in_regular_season_df['Player_unique_id_and_year_concat']=='jokicni01-2023']

## Prep player regular season play by play stats for machine learning

In [None]:
# Column groups for the regular-season play-by-play player stats frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'Regular Season Play by Play Player Stats - {field}'
        for field in (
            'Rank',
            'Player',
            'Position',
            'Team',
            'Age',
            'Games Played',
            'Total Minutes Played During Season',
            'Year',
            'Player Unique ID',
            'Standardized/Modernized Team Name',
        )
    ],
    'Team_name_and_year_concat',
    'Regular Season Play by Play Player Stats - Team Name Refined - accounts for players being traded',
]

# Stats that receive a within-year rank
list_of_columns_to_rank = [
    f'Regular Season Play by Play Player Stats - {field}'
    for field in (
        'Plus/Minus Per 100 Possessions (On Court)',
        'Plus/Minus Net Per 100 Possessions (On/off)',
        'Points Generated by Assists',
        'Shooting Fouls Drawn',
    )
]

# No indicator columns are derived for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
# Positional minutes-share columns that hold percentage text
list_of_columns_to_modify = [
    f'Regular Season Play by Play Player Stats - Percentage of Total Minutes Played at {position}'
    for position in ('Point Guard', 'Shooting Guard', 'Power Forward', 'Small Forward', 'Center')
]

# Strip the trailing '%' and rescale to a fraction; text that is not numeric becomes NaN
for column in list_of_columns_to_modify:

    share_as_text = player_play_by_play_in_regular_season_df[column].str.rstrip('%')
    player_play_by_play_in_regular_season_df[column] = pd.to_numeric(share_as_text, errors='coerce') / 100

In [None]:
player_play_by_play_in_regular_season_df['Regular Season Play by Play Player Stats - Percentage of Total Minutes Played at Point Guard'][12018]

In [None]:
player_play_by_play_in_regular_season_df.head(10)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
player_play_by_play_in_regular_season_df_before_changes = player_play_by_play_in_regular_season_df.copy()

# Add a within-year '<column> Ranked' column for every stat in list_of_columns_to_rank.
# This must run before the drop below because the year column is itself in the exclude list.
player_play_by_play_in_regular_season_df = rank_column_values(player_play_by_play_in_regular_season_df,'Regular Season Play by Play Player Stats - Year',list_of_columns_to_rank)

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list names are reassigned by other sections, so run the play-by-play list cell first.
player_play_by_play_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
player_play_by_play_in_regular_season_df.head(10)

In [None]:
player_play_by_play_in_regular_season_df.loc[player_play_by_play_in_regular_season_df['Player_unique_id_and_year_concat']=='jokicni01-2023']

## Prep MVP voting history for machine learning

In [None]:
# Column groups for the MVP voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'MVP Vote Results - {field}'
        for field in ('Rank', 'Player', 'Player Unique ID', 'Year', 'Standardized/Modernized Team Name')
    ],
    'Team_name_and_year_concat',
    *[
        f'MVP Vote Results - {field}'
        for field in ('Age', 'Team', 'Team Name Refined - accounts for players being traded')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
dataframe_column_prefix = 'MVP Vote Results - '

# Create the indicator column with every row unmarked
mvp_vote_df[f'{dataframe_column_prefix}MVP Result Indicator'] = np.nan

# For each year mark the top two vote-getters by 'Total Voting Points Won': 2 = winner, 3 = runner-up.
# NOTE(review): the runner-up code (3) is larger than the winner code (2), so the indicator is not
# ordered by finish - confirm downstream code expects this coding.
for year in list(mvp_vote_df[f'{dataframe_column_prefix}Year'].unique()):

    # Rows for this year only
    subset_df = mvp_vote_df.loc[mvp_vote_df[f'{dataframe_column_prefix}Year']==year]

    # Candidates still eligible to be marked; rows without a points value cannot place
    points_for_year = subset_df[f'{dataframe_column_prefix}Total Voting Points Won'].dropna()

    for result_code in (2, 3):

        # A missing year value, or a year with fewer than two vote-getters, leaves nothing to
        # mark; stop instead of letting idxmax fail on an empty selection
        if points_for_year.empty:
            break

        # Row label of the highest remaining points total
        index_to_mark = points_for_year.idxmax()

        mvp_vote_df.loc[index_to_mark,f'{dataframe_column_prefix}MVP Result Indicator'] = result_code

        # Remove the marked candidate so the next pass finds the next-highest total
        points_for_year = points_for_year.drop(index_to_mark)

# Every remaining row gets 1: the player received votes but finished outside the top two
mvp_vote_df.loc[~mvp_vote_df[f'{dataframe_column_prefix}MVP Result Indicator'].isin([3,2]),f'{dataframe_column_prefix}MVP Result Indicator'] = 1
            

In [None]:
mvp_vote_df.head(10)

In [None]:
# Keep an untouched snapshot of the MVP voting frame so it can be recovered if needed.
# (This previously copied the play-by-play frame by mistake, so no MVP backup existed.)
mvp_vote_df_before_changes = mvp_vote_df.copy()

# No ranking or encoding step here: the MVP rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the MVP list cell first.
mvp_vote_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
mvp_vote_df.head(10)

In [None]:
mvp_vote_df.loc[mvp_vote_df['Player_unique_id_and_year_concat']=='jokicni01-2023']

## Prep DPOY voting history for machine learning

In [None]:
# Column groups for the DPOY voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'DPOY Results - {field}'
        for field in ('Rank', 'Player', 'Player Unique ID', 'Year', 'Standardized/Modernized Team Name')
    ],
    'Team_name_and_year_concat',
    *[
        f'DPOY Results - {field}'
        for field in ('Age', 'Team', 'Team Name Refined - accounts for players being traded')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
dataframe_column_prefix = 'DPOY Results - '

# Create the indicator column with every row unmarked
dpoy_df[f'{dataframe_column_prefix}DPOY Result Indicator'] = np.nan

# For each year mark the top two vote-getters by 'Total Voting Points Won': 2 = winner, 3 = runner-up.
# NOTE(review): the runner-up code (3) is larger than the winner code (2), so the indicator is not
# ordered by finish - confirm downstream code expects this coding.
for year in list(dpoy_df[f'{dataframe_column_prefix}Year'].unique()):

    # Rows for this year only
    subset_df = dpoy_df.loc[dpoy_df[f'{dataframe_column_prefix}Year']==year]

    # Candidates still eligible to be marked; rows without a points value cannot place
    points_for_year = subset_df[f'{dataframe_column_prefix}Total Voting Points Won'].dropna()

    for result_code in (2, 3):

        # A missing year value, or a year with fewer than two vote-getters, leaves nothing to
        # mark; stop instead of letting idxmax fail on an empty selection
        if points_for_year.empty:
            break

        # Row label of the highest remaining points total
        index_to_mark = points_for_year.idxmax()

        dpoy_df.loc[index_to_mark,f'{dataframe_column_prefix}DPOY Result Indicator'] = result_code

        # Remove the marked candidate so the next pass finds the next-highest total
        points_for_year = points_for_year.drop(index_to_mark)

# Every remaining row gets 1: the player received votes but finished outside the top two
dpoy_df.loc[~dpoy_df[f'{dataframe_column_prefix}DPOY Result Indicator'].isin([3,2]),f'{dataframe_column_prefix}DPOY Result Indicator'] = 1
            

In [None]:
dpoy_df.head(10)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
dpoy_df_before_changes = dpoy_df.copy()

# No ranking or encoding step here: the DPOY rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the DPOY list cell first.
dpoy_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
dpoy_df.head(10)

In [None]:
dpoy_df.loc[dpoy_df['Player_unique_id_and_year_concat']=='antetgi01-2023']

## Prep ROY voting history for machine learning

In [None]:
# Column groups for the ROY voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'ROY Vote Results - {field}'
        for field in ('Rank', 'Player', 'Player Unique ID', 'Year', 'Standardized/Modernized Team Name')
    ],
    'Team_name_and_year_concat',
    *[
        f'ROY Vote Results - {field}'
        for field in ('Age', 'Team', 'Team Name Refined - accounts for players being traded')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
dataframe_column_prefix = 'ROY Vote Results - '

# Create the indicator column with every row unmarked
roy_vote_df[f'{dataframe_column_prefix}ROY Result Indicator'] = np.nan

# For each year mark the top two vote-getters by 'Total Voting Points Won': 2 = winner, 3 = runner-up.
# NOTE(review): the runner-up code (3) is larger than the winner code (2), so the indicator is not
# ordered by finish - confirm downstream code expects this coding.
for year in list(roy_vote_df[f'{dataframe_column_prefix}Year'].unique()):

    # Rows for this year only
    subset_df = roy_vote_df.loc[roy_vote_df[f'{dataframe_column_prefix}Year']==year]

    # Candidates still eligible to be marked; rows without a points value cannot place
    points_for_year = subset_df[f'{dataframe_column_prefix}Total Voting Points Won'].dropna()

    for result_code in (2, 3):

        # A missing year value, or a year with fewer than two vote-getters, leaves nothing to
        # mark; stop instead of letting idxmax fail on an empty selection
        if points_for_year.empty:
            break

        # Row label of the highest remaining points total
        index_to_mark = points_for_year.idxmax()

        roy_vote_df.loc[index_to_mark,f'{dataframe_column_prefix}ROY Result Indicator'] = result_code

        # Remove the marked candidate so the next pass finds the next-highest total
        points_for_year = points_for_year.drop(index_to_mark)

# Every remaining row gets 1: the player received votes but finished outside the top two
roy_vote_df.loc[~roy_vote_df[f'{dataframe_column_prefix}ROY Result Indicator'].isin([3,2]),f'{dataframe_column_prefix}ROY Result Indicator'] = 1
        

In [None]:
roy_vote_df.head(10)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
roy_vote_df_before_changes = roy_vote_df.copy()

# No ranking or encoding step here: the ROY rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the ROY list cell first.
roy_vote_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
roy_vote_df.head(10)

## Prep MIP voting history for machine learning

In [None]:
# Column groups for the MIP voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'MIP Vote Results - {field}'
        for field in ('Rank', 'Player', 'Player Unique ID', 'Year', 'Standardized/Modernized Team Name')
    ],
    'Team_name_and_year_concat',
    *[
        f'MIP Vote Results - {field}'
        for field in ('Age', 'Team', 'Team Name Refined - accounts for players being traded')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
dataframe_column_prefix = 'MIP Vote Results - '

# Create the indicator column with every row unmarked
mip_df[f'{dataframe_column_prefix}MIP Result Indicator'] = np.nan

# For each year mark the top two vote-getters by 'Total Voting Points Won': 2 = winner, 3 = runner-up.
# NOTE(review): the runner-up code (3) is larger than the winner code (2), so the indicator is not
# ordered by finish - confirm downstream code expects this coding.
for year in list(mip_df[f'{dataframe_column_prefix}Year'].unique()):

    # Rows for this year only
    subset_df = mip_df.loc[mip_df[f'{dataframe_column_prefix}Year']==year]

    # Candidates still eligible to be marked; rows without a points value cannot place
    points_for_year = subset_df[f'{dataframe_column_prefix}Total Voting Points Won'].dropna()

    for result_code in (2, 3):

        # A missing year value, or a year with fewer than two vote-getters, leaves nothing to
        # mark; stop instead of letting idxmax fail on an empty selection
        if points_for_year.empty:
            break

        # Row label of the highest remaining points total
        index_to_mark = points_for_year.idxmax()

        mip_df.loc[index_to_mark,f'{dataframe_column_prefix}MIP Result Indicator'] = result_code

        # Remove the marked candidate so the next pass finds the next-highest total
        points_for_year = points_for_year.drop(index_to_mark)

# Every remaining row gets 1: the player received votes but finished outside the top two
mip_df.loc[~mip_df[f'{dataframe_column_prefix}MIP Result Indicator'].isin([3,2]),f'{dataframe_column_prefix}MIP Result Indicator'] = 1
    

In [None]:
mip_df.head(20)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
mip_df_before_changes = mip_df.copy()

# No ranking or encoding step here: the MIP rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the MIP list cell first.
mip_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
mip_df.head(10)

## Prep 6 Man voting history for machine learning

In [None]:
# Column groups for the 6 Man voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'6 Man Vote Results - {field}'
        for field in ('Rank', 'Player', 'Player Unique ID', 'Year', 'Standardized/Modernized Team Name')
    ],
    'Team_name_and_year_concat',
    *[
        f'6 Man Vote Results - {field}'
        for field in ('Age', 'Team', 'Team Name Refined - accounts for players being traded')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
dataframe_column_prefix = '6 Man Vote Results - '

# Create the indicator column with every row unmarked
six_man_df[f'{dataframe_column_prefix}6MAN Result Indicator'] = np.nan

# For each year mark the top two vote-getters by 'Total Voting Points Won': 2 = winner, 3 = runner-up.
# NOTE(review): the runner-up code (3) is larger than the winner code (2), so the indicator is not
# ordered by finish - confirm downstream code expects this coding.
for year in list(six_man_df[f'{dataframe_column_prefix}Year'].unique()):

    # Rows for this year only
    subset_df = six_man_df.loc[six_man_df[f'{dataframe_column_prefix}Year']==year]

    # Candidates still eligible to be marked; rows without a points value cannot place
    points_for_year = subset_df[f'{dataframe_column_prefix}Total Voting Points Won'].dropna()

    for result_code in (2, 3):

        # A missing year value, or a year with fewer than two vote-getters, leaves nothing to
        # mark; stop instead of letting idxmax fail on an empty selection
        if points_for_year.empty:
            break

        # Row label of the highest remaining points total
        index_to_mark = points_for_year.idxmax()

        six_man_df.loc[index_to_mark,f'{dataframe_column_prefix}6MAN Result Indicator'] = result_code

        # Remove the marked candidate so the next pass finds the next-highest total
        points_for_year = points_for_year.drop(index_to_mark)

# Every remaining row gets 1: the player received votes but finished outside the top two
six_man_df.loc[~six_man_df[f'{dataframe_column_prefix}6MAN Result Indicator'].isin([3,2]),f'{dataframe_column_prefix}6MAN Result Indicator'] = 1

In [None]:
six_man_df.head(10)

In [None]:
# Keep an untouched snapshot so the pre-transformation frame can be recovered if needed
six_man_df_before_changes = six_man_df.copy()

# No ranking or encoding step here: the 6 Man rank and categorical lists are empty.

# Remove the excluded columns in a single call: a missing column raises KeyError before
# anything is removed, so the frame is never left half-modified.
# The generic list name is reassigned by other sections, so run the 6 Man list cell first.
six_man_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
six_man_df.head(10)

## Prep All NBA voting history for machine learning

In [None]:
# Column groups for the All NBA voting frame

# Nothing to one-hot encode for this table
list_of_categorical_to_encode = list()

# Identifier / descriptive columns removed before modelling
list_of_columns_to_exclude = [
    *[
        f'All NBA Vote Results - {field}'
        for field in (
            'Player',
            'Player Unique ID',
            'Position',
            'Team',
            'Age',
            'Standardized/Modernized Team Name',
        )
    ],
    'Team_name_and_year_concat',
    *[
        f'All NBA Vote Results - {field}'
        for field in ('Team Name Refined - accounts for players being traded', 'Year')
    ],
]

# No columns are ranked for this table
list_of_columns_to_rank = list()

# No indicator columns are derived from a list for this table
list_of_columns_to_derive_indicator_for = list()

In [None]:
# Step 1: rewrite the raw award codes ('3T', '2T', '1T') as '3rd', '2nd', '1st'.
all_nba_replacement_dict = {'3T': '3rd', '2T': '2nd', '1T': '1st'}
all_nba_df['All NBA Vote Results - All NBA Award (e.g. 1st Team)'] = (
    all_nba_df['All NBA Vote Results - All NBA Award (e.g. 1st Team)']
    .replace(all_nba_replacement_dict)
)

# Step 2: derive the result indicator from the normalised award labels.
# NOTE(review): the indicator values are strings ('1'..'4'), not integers --
# confirm downstream modelling code expects that.
all_nba_replacement_dict = {'3rd': '2', '2nd': '3', '1st': '4', 'ORV': '1'}
all_nba_df['All NBA Vote Results - RESULT INDICATOR'] = (
    all_nba_df['All NBA Vote Results - All NBA Award (e.g. 1st Team)']
    .replace(all_nba_replacement_dict)
)

all_nba_df

In [None]:
# Keep an untouched snapshot so the pre-drop columns stay available.
all_nba_df_before_changes = all_nba_df.copy()

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
all_nba_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
all_nba_df.head(10)

## Prep All Defense voting history for machine learning

In [None]:
# Column configuration for the All Defensive voting frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the All Defensive frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'All Defensive Vote Results - Player',
    'All Defensive Vote Results - Player Unique ID',
    'All Defensive Vote Results - Position',
    'All Defensive Vote Results - Team',
    'All Defensive Vote Results - Age',
    'All Defensive Vote Results - Standardized/Modernized Team Name',
    'Team_name_and_year_concat',
    'All Defensive Vote Results - Team Name Refined - accounts for players being traded',
    'All Defensive Vote Results - Year',
]

# Left empty: nothing is ranked and no indicator columns are derived here.
list_of_columns_to_rank = []
list_of_columns_to_derive_indicator_for = []

In [None]:
all_defense_df['All Defensive Vote Results - All NBA Award (e.g. 1st Team)'].unique()

In [None]:
all_defense_df

In [None]:
# Award label -> result indicator code.
# NOTE(review): the indicator values are strings ('1'..'3'), not integers --
# confirm downstream modelling code expects that.
all_defense_replacement_dict = {'2nd': '2', '1st': '3', 'ORV': '1'}

# Derive the indicator column directly from the award column.
all_defense_df['All Defensive Vote Results - RESULT INDICATOR'] = (
    all_defense_df['All Defensive Vote Results - All NBA Award (e.g. 1st Team)']
    .replace(all_defense_replacement_dict)
)

all_defense_df

In [None]:
# Keep an untouched snapshot so the pre-drop columns stay available.
all_defense_df_before_changes = all_defense_df.copy()

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
all_defense_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
all_defense_df.head(10)

## Prep Finals MVP voting history for machine learning

In [None]:
# Column configuration for the Finals MVP frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the Finals MVP frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Finals MVP Results - Year',
    'Finals MVP Results - Player',
    'Finals MVP Results - Team',
    'Finals MVP Results - Age',
    'Finals MVP Results - Player Unique ID',
    'Finals MVP Results - Standardized/Modernized Team Name',
    'Team_name_and_year_concat',
    'Finals MVP Results - Team Name Refined - accounts for players being traded',
]

# Left empty: nothing is ranked and no indicator columns are derived here.
list_of_columns_to_rank = []
list_of_columns_to_derive_indicator_for = []

In [None]:
# Set the MVP indicator column to 1 for every row of the Finals MVP frame.
finals_mvp['Finals MVP Results - MVP Indicator'] = 1

In [None]:
# Keep an untouched snapshot so the pre-drop columns stay available.
finals_mvp_before_changes = finals_mvp.copy()

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
finals_mvp.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
finals_mvp.head(10)

## Prep team basic stats in regular season for machine learning

In [None]:
# Column configuration for the regular season basic team stats frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Basic Team Stats - Rank',
    'Regular Season Basic Team Stats - Minutes Played Per Game',
    'Regular Season Basic Team Stats - Team Name Refined - accounts for players being traded',
    'Regular Season Basic Team Stats - Standardized/Modernized Team Name',
]

# Columns passed to rank_column_values in a later cell.
# 'Total Rebounds Per Game' used to be listed twice; each column is now
# listed exactly once so it is not ranked redundantly.
list_of_columns_to_rank = [
    'Regular Season Basic Team Stats - Points Per Game',
    'Regular Season Basic Team Stats - Total Rebounds Per Game',
    'Regular Season Basic Team Stats - Offense Rebounds Per Game',
    'Regular Season Basic Team Stats - Defense Rebounds Per Game',
    'Regular Season Basic Team Stats - Assists Per Game',
    'Regular Season Basic Team Stats - Steals Per Game',
    'Regular Season Basic Team Stats - Blocks Per Game',
    'Regular Season Basic Team Stats - Turnovers Per Game',
    'Regular Season Basic Team Stats - Personal Fouls Per Game',
]

# Left empty: no indicator columns are derived for this frame.
list_of_columns_to_derive_indicator_for = []

In [None]:
# Snapshot the frame while it still contains the 'League Average' rows.
nba_team_basic_stat_in_regular_season_df_league_average = nba_team_basic_stat_in_regular_season_df.copy()

# Keep only rows whose team is not 'League Average', so those rows do not
# take part in the rankings.
nba_team_basic_stat_in_regular_season_df = nba_team_basic_stat_in_regular_season_df[
    nba_team_basic_stat_in_regular_season_df['Regular Season Basic Team Stats - Team'] != 'League Average'
]

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
nba_team_basic_stat_in_regular_season_df_before_changes = nba_team_basic_stat_in_regular_season_df.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
nba_team_basic_stat_in_regular_season_df = rank_column_values(
    nba_team_basic_stat_in_regular_season_df,
    'Regular Season Basic Team Stats - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
nba_team_basic_stat_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
nba_team_basic_stat_in_regular_season_df.head(3)

## Prep team basic opponent stats in regular season for machine learning

In [None]:
# Column configuration for the regular season basic team opponent stats frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Basic Team Opponent Stats - Rank',
    'Regular Season Basic Team Opponent Stats - Team',
    'Regular Season Basic Team Opponent Stats - Minutes Played Per Game',
    'Regular Season Basic Team Opponent Stats - Year',
    'Regular Season Basic Team Opponent Stats - Team Name Refined - accounts for players being traded',
    'Regular Season Basic Team Opponent Stats - Standardized/Modernized Team Name',
]

# Columns passed to rank_column_values in a later cell.
# 'Total Rebounds Per Game' used to be listed twice; each column is now
# listed exactly once so it is not ranked redundantly.
list_of_columns_to_rank = [
    'Regular Season Basic Team Opponent Stats - Points Per Game',
    'Regular Season Basic Team Opponent Stats - Total Rebounds Per Game',
    'Regular Season Basic Team Opponent Stats - Offense Rebounds Per Game',
    'Regular Season Basic Team Opponent Stats - Defense Rebounds Per Game',
    'Regular Season Basic Team Opponent Stats - Assists Per Game',
    'Regular Season Basic Team Opponent Stats - Steals Per Game',
    'Regular Season Basic Team Opponent Stats - Blocks Per Game',
    'Regular Season Basic Team Opponent Stats - Turnovers Per Game',
    'Regular Season Basic Team Opponent Stats - Personal Fouls Per Game',
]

# Left empty: no indicator columns are derived for this frame.
list_of_columns_to_derive_indicator_for = []

In [None]:
nba_team_basic_opponent_stat_in_regular_season_df

In [None]:
# Snapshot the frame while it still contains the 'League Average' rows.
nba_team_basic_opponent_stat_in_regular_season_df_league_average = nba_team_basic_opponent_stat_in_regular_season_df.copy()

# Keep only rows whose team is not 'League Average', so those rows do not
# take part in the rankings.
nba_team_basic_opponent_stat_in_regular_season_df = nba_team_basic_opponent_stat_in_regular_season_df[
    nba_team_basic_opponent_stat_in_regular_season_df['Regular Season Basic Team Opponent Stats - Team'] != 'League Average'
]

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
nba_team_basic_opponent_stat_in_regular_season_df_before_changes = nba_team_basic_opponent_stat_in_regular_season_df.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
nba_team_basic_opponent_stat_in_regular_season_df = rank_column_values(
    nba_team_basic_opponent_stat_in_regular_season_df,
    'Regular Season Basic Team Opponent Stats - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
nba_team_basic_opponent_stat_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
nba_team_basic_opponent_stat_in_regular_season_df.head(3)

## Prep team advanced stats in regular season for machine learning

In [None]:
# Column configuration for the regular season team advanced stats frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Team Advanced Stats - Rank',
    'Regular Season Team Advanced Stats - Team',
    'Regular Season Team Advanced Stats - Year',
    'Regular Season Team Advanced Stats - Team Name Refined - accounts for players being traded',
    'Regular Season Team Advanced Stats - Standardized/Modernized Team Name',
    'Regular Season Team Advanced Stats - Wins',
    'Regular Season Team Advanced Stats - Losses',
]

# Columns passed to rank_column_values in a later cell.
list_of_columns_to_rank = [
    'Regular Season Team Advanced Stats - Pythagorean Wins (expected wins based on points scored and allowed)',
    'Regular Season Team Advanced Stats - Pythagorean Losses (expected wins based on points scored and allowed)',
    'Regular Season Team Advanced Stats - Margin of Victory',
    'Regular Season Team Advanced Stats - Simple rating system: derived using average point differential and strength of scheduled',
    'Regular Season Team Advanced Stats - Offensive Rating',
    'Regular Season Team Advanced Stats - Defensive Rating',
    'Regular Season Team Advanced Stats - Net Rating',
    'Regular Season Team Advanced Stats - Pace (Possessions per 48 Minutes)',
    'Regular Season Team Advanced Stats - Average Effective Field Goal % For Season',
    'Regular Season Team Advanced Stats - Defensive Rebounding %',
    'Regular Season Team Advanced Stats - Opponent Effective Field Goal %',
    'Regular Season Team Advanced Stats - Opponent Turnover %',
    'Regular Season Team Advanced Stats - Offensive Rebounding %',
    'Regular Season Team Advanced Stats - Turnover %',
    'Regular Season Team Advanced Stats - Average Player Age',
]

# Left empty: no indicator columns are derived for this frame.
list_of_columns_to_derive_indicator_for = []

In [None]:
nba_advanced_team_stats

In [None]:
# Snapshot the frame while it still contains the 'League Average' rows.
nba_advanced_team_stats_league_average = nba_advanced_team_stats.copy()

# Keep only rows whose team is not 'League Average', so those rows do not
# take part in the rankings.
nba_advanced_team_stats = nba_advanced_team_stats[
    nba_advanced_team_stats['Regular Season Team Advanced Stats - Team'] != 'League Average'
]

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
nba_advanced_team_stats_before_changes = nba_advanced_team_stats.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
nba_advanced_team_stats = rank_column_values(
    nba_advanced_team_stats,
    'Regular Season Team Advanced Stats - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
nba_advanced_team_stats.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
nba_advanced_team_stats.head(3)

## Prep team shooting stats in regular season for machine learning

In [None]:
# Column configuration for the regular season team shooting stats frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Team Shooting Stats - Rank',
    'Regular Season Team Shooting Stats - Team',
    'Regular Season Team Shooting Stats - Games Played',
    'Regular Season Team Shooting Stats - Year',
    'Regular Season Team Shooting Stats - Total Minutes Played During Season',
    'Regular Season Team Shooting Stats - Team Name Refined - accounts for players being traded',
    'Regular Season Team Shooting Stats - Standardized/Modernized Team Name',
]

# Left empty: nothing is ranked and no indicator columns are derived here.
list_of_columns_to_rank = []
list_of_columns_to_derive_indicator_for = []

In [None]:
nba_team_shooting_stat_in_regular_season_df

In [None]:
# Snapshot the frame while it still contains the 'League Average' rows.
nba_team_shooting_stat_in_regular_season_df_league_average = nba_team_shooting_stat_in_regular_season_df.copy()

# Keep only rows whose team is not 'League Average', so those rows do not
# take part in the rankings.
nba_team_shooting_stat_in_regular_season_df = nba_team_shooting_stat_in_regular_season_df[
    nba_team_shooting_stat_in_regular_season_df['Regular Season Team Shooting Stats - Team'] != 'League Average'
]

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
nba_team_shooting_stat_in_regular_season_df_before_changes = nba_team_shooting_stat_in_regular_season_df.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
nba_team_shooting_stat_in_regular_season_df = rank_column_values(
    nba_team_shooting_stat_in_regular_season_df,
    'Regular Season Team Shooting Stats - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
nba_team_shooting_stat_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
nba_team_shooting_stat_in_regular_season_df.head(3)

## Prep team opponent shooting stats in regular season for machine learning

In [None]:
# Column configuration for the regular season team opponent shooting stats frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Team Opponent Shooting Stats - Rank',
    'Regular Season Team Opponent Shooting Stats - Team',
    'Regular Season Team Opponent Shooting Stats - Games Played',
    'Regular Season Team Opponent Shooting Stats - Year',
    'Regular Season Team Opponent Shooting Stats - Total Minutes Played During Season',
    'Regular Season Team Opponent Shooting Stats - Team Name Refined - accounts for players being traded',
    'Regular Season Team Opponent Shooting Stats - Standardized/Modernized Team Name',
]

# Left empty: nothing is ranked and no indicator columns are derived here.
list_of_columns_to_rank = []
list_of_columns_to_derive_indicator_for = []

In [None]:
nba_team_opponent_shooting_stat_in_regular_season_df

In [None]:
# Snapshot the frame while it still contains the 'League Average' rows.
nba_team_opponent_shooting_stat_in_regular_season_df_league_average = nba_team_opponent_shooting_stat_in_regular_season_df.copy()

# Keep only rows whose team is not 'League Average', so those rows do not
# take part in the rankings.
nba_team_opponent_shooting_stat_in_regular_season_df = nba_team_opponent_shooting_stat_in_regular_season_df[
    nba_team_opponent_shooting_stat_in_regular_season_df['Regular Season Team Opponent Shooting Stats - Team'] != 'League Average'
]

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
nba_team_opponent_shooting_stat_in_regular_season_df_before_changes = nba_team_opponent_shooting_stat_in_regular_season_df.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
nba_team_opponent_shooting_stat_in_regular_season_df = rank_column_values(
    nba_team_opponent_shooting_stat_in_regular_season_df,
    'Regular Season Team Opponent Shooting Stats - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
nba_team_opponent_shooting_stat_in_regular_season_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
nba_team_opponent_shooting_stat_in_regular_season_df.head(3)

## Prep team regular season standings for machine learning

In [None]:
def win_percent_function(column_value):
    """Convert a 'wins-losses' record string into a winning fraction.

    Parameters
    ----------
    column_value : str
        Record formatted as '<wins>-<losses>', e.g. '41-41'.

    Returns
    -------
    float
        wins / (wins + losses). NaN is returned for a '0-0' record (no games
        in that split), where a plain division would raise ZeroDivisionError.

    Raises
    ------
    ValueError
        If either side of the dash cannot be parsed as an integer.
    IndexError
        If the value contains no dash.
    """
    # Split once and reuse both halves.
    record_parts = column_value.split('-')

    wins = int(record_parts[0])
    losses = int(record_parts[1])

    games_played = wins + losses

    # A split with no games has no defined win percentage.
    if games_played == 0:
        return float('nan')

    return wins / games_played

In [None]:
# Record columns holding 'wins-losses' strings that are converted to a win fraction.
list_of_columns_to_apply_function_to = [
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Team Standings - Team Record at Home',
    'Regular Season Team Standings - Team Record on Road',
    'Regular Season Team Standings - Team Record Pre All Star Break',
    'Regular Season Team Standings - Team Record Post All Star Break',
    'Regular Season Team Standings - Team Record in Games Decided by 3 points or under',
    'Regular Season Team Standings - Team Record in Games Decided by 10 points or over',
]

# Overwrite each record column with win_percent_function applied value by value.
for column in list_of_columns_to_apply_function_to:
    regular_season_standing_df[column] = regular_season_standing_df[column].map(win_percent_function)
    

In [None]:
regular_season_standing_df


In [None]:
# Column configuration for the regular season team standings frame.

# Left empty: no categorical columns are listed for encoding.
list_of_categorical_to_encode = []

# Columns removed from the frame by the drop step in a later cell.
list_of_columns_to_exclude = [
    'Regular Season Team Standings - Rank',
    'Regular Season Team Standings - Team',
    'Regular Season Team Standings - Year',
    'Regular Season Team Standings - Team Name Refined - accounts for players being traded',
    'Regular Season Team Standings - Standardized/Modernized Team Name',
]

# Columns passed to rank_column_values in a later cell.
list_of_columns_to_rank = [
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Team Standings - Team Record at Home',
    'Regular Season Team Standings - Team Record on Road',
    'Regular Season Team Standings - Team Record Pre All Star Break',
    'Regular Season Team Standings - Team Record Post All Star Break',
    'Regular Season Team Standings - Team Record in Games Decided by 3 points or under',
    'Regular Season Team Standings - Team Record in Games Decided by 10 points or over',
]

# Left empty: no indicator columns are derived for this frame.
list_of_columns_to_derive_indicator_for = []

In [None]:
# Keep an untouched snapshot so the pre-change columns stay available.
regular_season_standing_df_before_changes = regular_season_standing_df.copy()

# Create ranked columns for the columns listed in list_of_columns_to_rank
# (rank_column_values is defined elsewhere in this notebook).
regular_season_standing_df = rank_column_values(
    regular_season_standing_df,
    'Regular Season Team Standings - Year',
    list_of_columns_to_rank,
)

# Drop every excluded column in one call: a missing label raises before
# anything is removed, instead of leaving the frame half-trimmed.
regular_season_standing_df.drop(columns=list_of_columns_to_exclude, inplace=True)

In [None]:
regular_season_standing_df.head(3)

# Stage 7.) Merge Dataframes

#  REGULAR SEASON PLAYER STATS MERGE. Create merged dataframe

 <a class="anchor" id="Step32"></a>

### Merge basic with advanced stats

In [None]:
# Column count of the left-hand frame, used for the before/after report below.
number_of_columns_before = basic_player_stats_regular_season_df.shape[1]

# Left join: keep every basic-stats row and attach advanced stats on the player/year key.
all_regular_season_player_stats_merged_df = pd.merge(
    basic_player_stats_regular_season_df,
    advanced_player_stats_in_regular_season_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

number_of_columns_after = all_regular_season_player_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add player shooting stats

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_player_stats_merged_df.shape[1]

# Left join: keep every merged player row and attach shooting stats on the player/year key.
all_regular_season_player_stats_merged_df = pd.merge(
    all_regular_season_player_stats_merged_df,
    player_shooting_in_regular_season_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

number_of_columns_after = all_regular_season_player_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add player play-by-play stats

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_player_stats_merged_df.shape[1]

# Left join: keep every merged player row and attach play-by-play stats on the player/year key.
all_regular_season_player_stats_merged_df = pd.merge(
    all_regular_season_player_stats_merged_df,
    player_play_by_play_in_regular_season_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

number_of_columns_after = all_regular_season_player_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# REGULAR SEASON TEAM STATS MERGE

 <a class="anchor" id="Step33"></a>

### Merge regular season basic stats for team with regular season basic stats for opponents

In [None]:
# Column count of the left-hand frame, used for the before/after report below.
number_of_columns_before = nba_team_basic_stat_in_regular_season_df.shape[1]

# Left join: keep every team basic-stats row and attach opponent stats on the team/year key.
all_regular_season_team_stats_merged_df = pd.merge(
    nba_team_basic_stat_in_regular_season_df,
    nba_team_basic_opponent_stat_in_regular_season_df,
    how='left',
    on='Team_name_and_year_concat',
)

number_of_columns_after = all_regular_season_team_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add regular season advanced team stats

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_team_stats_merged_df.shape[1]

# Left join: keep every merged team row and attach advanced team stats on the team/year key.
all_regular_season_team_stats_merged_df = pd.merge(
    all_regular_season_team_stats_merged_df,
    nba_advanced_team_stats,
    how='left',
    on='Team_name_and_year_concat',
)

number_of_columns_after = all_regular_season_team_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add regular season team shooting stats

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_team_stats_merged_df.shape[1]

# Left join: keep every merged team row and attach team shooting stats on the team/year key.
all_regular_season_team_stats_merged_df = pd.merge(
    all_regular_season_team_stats_merged_df,
    nba_team_shooting_stat_in_regular_season_df,
    how='left',
    on='Team_name_and_year_concat',
)

number_of_columns_after = all_regular_season_team_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add regular season team opponent shooting stats

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_team_stats_merged_df.shape[1]

# Left join: keep every merged team row and attach opponent shooting stats on the team/year key.
all_regular_season_team_stats_merged_df = pd.merge(
    all_regular_season_team_stats_merged_df,
    nba_team_opponent_shooting_stat_in_regular_season_df,
    how='left',
    on='Team_name_and_year_concat',
)

number_of_columns_after = all_regular_season_team_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add regular season team standings

In [None]:
# Column count before the merge, used for the before/after report below.
number_of_columns_before = all_regular_season_team_stats_merged_df.shape[1]

# Left join: keep every merged team row and attach the standings columns on the team/year key.
all_regular_season_team_stats_merged_df = pd.merge(
    all_regular_season_team_stats_merged_df,
    regular_season_standing_df,
    how='left',
    on='Team_name_and_year_concat',
)

number_of_columns_after = all_regular_season_team_stats_merged_df.shape[1]

# Report how many columns the merge added.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# Hold out 2024 data (used for predicting, not training)

In [None]:
# Rows for the 2024 season, copied out into their own frame.
all_regular_season_player_stats_merged_df_2024 = all_regular_season_player_stats_merged_df[
    all_regular_season_player_stats_merged_df['Regular Season Basic Player Stats - Year'] == 2024
].copy()

# Everything except 2024 stays in the main player frame.
all_regular_season_player_stats_merged_df = all_regular_season_player_stats_merged_df[
    all_regular_season_player_stats_merged_df['Regular Season Basic Player Stats - Year'] != 2024
]

# Second independent copy of the 2024 rows.
all_regular_season_player_stats_merged_df_2024_final = all_regular_season_player_stats_merged_df_2024.copy()

In [None]:
# Rows for the 2024 season, copied out into their own frame.
all_regular_season_team_stats_merged_df_2024 = all_regular_season_team_stats_merged_df[
    all_regular_season_team_stats_merged_df['Regular Season Basic Team Stats - Year'] == 2024
].copy()

# Everything except 2024 stays in the main team frame.
all_regular_season_team_stats_merged_df = all_regular_season_team_stats_merged_df[
    all_regular_season_team_stats_merged_df['Regular Season Basic Team Stats - Year'] != 2024
]


In [None]:
all_regular_season_team_stats_merged_df_2024

# Predicting MVP - Machine Learning Model Using GBM
 <a class="anchor" id="Step34"></a>

## Create dataset to analyze
    
###    1.) Merge regular season player stats with player mvp vote data 

###    2.) Merge the above data set with team data

In [None]:
# Left join: keep every player-season row and attach MVP vote data on the player/year key.
mvp_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df,
    mvp_vote_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

In [None]:
# Left join: attach the merged team stats to each player row on the team/year key.
mvp_prediction_df = pd.merge(
    mvp_prediction_df,
    all_regular_season_team_stats_merged_df,
    how='left',
    on='Team_name_and_year_concat',
)

In [None]:
mvp_prediction_df.head(10)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of mvp_prediction_df that are left out of the feature list.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    'Team_name_and_year_concat',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'MVP Vote Results - Total Voting Points Possible',
    'MVP Vote Results - Total Voting Points Won',
    'MVP Vote Results - First Place Votes',
    'MVP Vote Results - MVP Result Indicator',
    'Regular Season Basic Team Stats - Year',
]

# Start from every column, then strike the excluded ones.
# list.remove raises ValueError if an excluded name is not present.
list_of_features = list(mvp_prediction_df.columns)

for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the remaining feature names for review.
for feature in list_of_features:
    print(f"""
    {feature}""")

## Check for NaN values, report them, and fill them with 0

In [None]:
# For every feature: count missing values, report the count, then fill them with 0.
# The count is taken BEFORE filling; counting afterwards always reports 0.
for feature in list_of_features:

    nan_mask = mvp_prediction_df[feature].isnull()
    nan_count = int(nan_mask.sum())

    if nan_count > 0:

        print(f"""
        {feature} contains {nan_count} NaN values """)

        mvp_prediction_df.loc[nan_mask, feature] = 0

## Split into test and training data

In [None]:
# Columns handed to the MVP model: the advanced player stats, the "Ranked" variant of
# most of them, the team record (ranked and raw), and finally the target column.
_advanced_prefix = 'Regular Season Advanced Player Stats - '

_advanced_stats = ['Total Minutes Played During Season',
                   'Player Efficiency',
                   'True Shooting % For Season',
                   '3 Point Attempt Rate',
                   'Free Throw Rate for Season',
                   'Offensive Rebounding %',
                   'Defensive Rebound Percentage',
                   'Total Rebounding %',
                   'Assist % For Season',
                   'Steal Percentage for Season',
                   'Block %',
                   'Turnover % for Season',
                   'Usage % For Season',
                   'Total Offensive Win Shares For Season',
                   'Total Defensive Win Shares For Season',
                   'Win Shares',
                   'Win Shares Per 48 Minutes Played For the Season',
                   'Offensive Box Plus/Minus For Season',
                   'Defensive Box Plus/Minus',
                   'Box Plus/Minus For Season',
                   'Value Above Replacement Player']

# These three stats are used raw only; no "Ranked" variant is included for them.
_stats_without_rank = {'Total Minutes Played During Season',
                       '3 Point Attempt Rate',
                       'Free Throw Rate for Season'}

list_of_features_filtered = (
    [_advanced_prefix + stat for stat in _advanced_stats]
    + [_advanced_prefix + stat + ' Ranked'
       for stat in _advanced_stats
       if stat not in _stats_without_rank]
    + ['Regular Season Team Standings - Overall Team Record Ranked',
       'Regular Season Team Standings - Overall Team Record',
       # Target variable; it is separated from the features before training.
       'MVP Vote Results - % of Total Voting Points Possible That Were Won']
)

In [None]:
# Restrict the frame to the model columns (features plus the target).
target_column = 'MVP Vote Results - % of Total Voting Points Possible That Were Won'
mvp_prediction_df_features = mvp_prediction_df[list_of_features_filtered]

# X holds every selected column except the target; y is the target itself.
X = mvp_prediction_df_features.drop(columns=[target_column])
y = mvp_prediction_df_features[target_column]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Initialize and Train the GBM Model

In [None]:
# Hyperparameters for the gradient boosting regressor; random_state is pinned to 42.
gbm_hyperparameters = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_regressor = GradientBoostingRegressor(**gbm_hyperparameters)

# Fit on the training split; as the last expression, the fitted estimator is displayed.
gbm_regressor.fit(X_train, y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Score the held-out rows.
y_pred = gbm_regressor.predict(X_test)

# Compare predictions with the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print('Mean Squared Error: {}'.format(mse))
print('R^2 Score: {}'.format(r2))

## Evaluate feature importance

In [None]:
# One row per training column, ordered from most to least important.
feature_importance = (
    pd.Series(gbm_regressor.feature_importances_, index=X_train.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)

feature_importance.head(10)

### Populate prediction dataframe with predictions

In [None]:
# Predict the share of MVP voting points for every row of the model frame
# (this covers the training rows as well as the held-out rows).
mvp_model_inputs = mvp_prediction_df_features.drop(
    columns=['MVP Vote Results - % of Total Voting Points Possible That Were Won']
)
predicted_votes = gbm_regressor.predict(mvp_model_inputs)

# Store the predictions next to the source rows.
mvp_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
# Top 200 rows by predicted vote share, shown beside the actual share.
(
    mvp_prediction_df[['Regular Season Team Standings - Overall Team Record',
                       'Regular Season Basic Player Stats - Player',
                       'MVP Vote Results - % of Total Voting Points Possible That Were Won',
                       'Predicted Votes']]
    .sort_values(by='Predicted Votes', ascending=False)
    .head(200)
)

In [None]:
# Residual = actual vote share minus predicted vote share.
mvp_prediction_df['Actual Vote - Predicted'] = (
    mvp_prediction_df['MVP Vote Results - % of Total Voting Points Possible That Were Won']
    .sub(mvp_prediction_df['Predicted Votes'])
)

# Ascending order puts the most over-predicted rows first.
(
    mvp_prediction_df[['Regular Season Team Standings - Overall Team Record',
                       'Regular Season Basic Player Stats - Player',
                       'MVP Vote Results - % of Total Voting Points Possible That Were Won',
                       'Predicted Votes',
                       'Actual Vote - Predicted',
                       'Team_name_and_year_concat']]
    .sort_values(by='Actual Vote - Predicted', ascending=True)
    .head(200)
)

In [None]:
# Start with both flags empty; only the selected rows are set to 1 below.
mvp_prediction_df['Predicted MVP Candidate'] = np.nan
mvp_prediction_df['Predicted MVP Winner'] = np.nan

for year in mvp_prediction_df['Regular Season Basic Player Stats - Year'].unique().tolist():

    in_year = mvp_prediction_df['Regular Season Basic Player Stats - Year'] == year

    # Index labels of this year's rows, highest predicted vote share first.
    ranked_index = mvp_prediction_df.loc[in_year].sort_values('Predicted Votes', ascending=False).index

    # The top 15 predicted rows of the year are flagged as candidates.
    mvp_prediction_df.loc[ranked_index[:15], 'Predicted MVP Candidate'] = 1

    # The single highest predicted row of the year is flagged as the winner.
    mvp_prediction_df.loc[ranked_index[:1], 'Predicted MVP Winner'] = 1

In [None]:
# Copy the MVP prediction columns onto the master player frame.
# NOTE(review): rows are matched purely by index label, so this assumes the labels of
# mvp_prediction_df refer to the same rows in the master frame - TODO confirm.
mvp_writeback_columns = [('Predicted MVP Candidate', 'Predicted MVP Candidate'),
                         ('Predicted MVP Winner', 'Predicted MVP Winner'),
                         ('Predicted Votes', 'Predicted % of MVP Votes')]

for source_column, destination_column in mvp_writeback_columns:

    # Destination starts empty; only rows with a non-null source value are filled.
    all_regular_season_player_stats_merged_df[destination_column] = np.nan

    populated_rows = mvp_prediction_df.index[mvp_prediction_df[source_column].notnull()]

    all_regular_season_player_stats_merged_df.loc[populated_rows, destination_column] = (
        mvp_prediction_df.loc[populated_rows, source_column]
    )

In [None]:
# Spot-check: first ten master-frame rows flagged as a predicted MVP winner.
all_regular_season_player_stats_merged_df[
    all_regular_season_player_stats_merged_df['Predicted MVP Winner'].notna()
].head(10)

# Predict 2024 MVP

## Prep the 2024 Data

In [None]:
# Attach the 2024 team stats to every 2024 player row.
mvp_2024_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df_2024,
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# Re-apply the player frame's index labels so results can be written back by label later.
mvp_2024_prediction_df.index = all_regular_season_player_stats_merged_df_2024.index

In [None]:
# Columns that get scaled up by the 82/68 factor below.
# NOTE(review): 'Total Minutes Played During Season' is a model feature but is not
# scaled here - confirm whether that is intentional.
columns_to_multiply = [
    'Regular Season Basic Player Stats - Games Played',
    'Regular Season Basic Player Stats - Games Started',
    'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Win Shares',
    'Regular Season Advanced Player Stats - Value Above Replacement Player',
]

# presumably 82 = full-season game count and 68 = games played so far - TODO confirm
multiplier = 82/68

# NOTE(review): re-running this cell scales the columns again; rebuild
# mvp_2024_prediction_df first if the cell must be re-executed.
for column in columns_to_multiply:
    mvp_2024_prediction_df[column] = mvp_2024_prediction_df[column].mul(multiplier)

### Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the 2024 frame that are not treated as candidate features.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    'Team_name_and_year_concat',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'Regular Season Basic Team Stats - Year',
]

# Start from every column, then strike the excluded ones
# (list.remove raises ValueError if a name is absent).
list_of_features = mvp_2024_prediction_df.columns.tolist()
for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the remaining column names for manual review.
for feature in list_of_features:
    print(f"\n    {feature}")

### Check for NaN values and review

In [None]:
# Report how many NaN values each candidate feature holds, then replace them with 0.
# The count is taken BEFORE the fill; counting after the fill always reports 0.
for feature in list_of_features:

    nan_mask = mvp_2024_prediction_df[feature].isnull()
    nan_count = int(nan_mask.sum())

    if nan_count > 0:

        print(f"\n        {feature} contains {nan_count} NaN values ")

        # Missing values are treated as zero for every column in list_of_features.
        mvp_2024_prediction_df.loc[nan_mask, feature] = 0

### Filter down to the features the trained MVP model expects

In [None]:
# The 2024 frame must supply exactly the columns the MVP model was fitted on, in the
# same order. Take them from the training matrix rather than keeping a second
# hand-written copy that can drift out of sync with the training feature list.
# This yields the same names, in the same order, as the hard-coded list it replaces.
# NOTE: X_train must still be the MVP training matrix when this cell runs (i.e. run
# the notebook top to bottom; the DPOY section later rebinds X_train).
list_of_features_filtered = list(X_train.columns)

In [None]:
# Keep only the model's input columns, in the order given by list_of_features_filtered.
mvp_2024_prediction_df_features = mvp_2024_prediction_df[list_of_features_filtered]

In [None]:
# Score every 2024 row with the fitted regressor
# (the predicted quantity is the share of MVP voting points won).
predicted_votes = gbm_regressor.predict(mvp_2024_prediction_df_features)

# Store the predictions next to the source rows.
mvp_2024_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
# The 15 rows with the highest predicted MVP vote share for 2024.
(
    mvp_2024_prediction_df
    .sort_values(by='Predicted Votes', ascending=False)
    .head(15)
)

In [None]:
# Copy the 2024 MVP predictions onto the final 2024 frame, matching rows by index label.
mvp_2024_rows = mvp_2024_prediction_df.index
all_regular_season_player_stats_merged_df_2024_final.loc[mvp_2024_rows, 'MVP Predicted Vote %'] = (
    mvp_2024_prediction_df.loc[mvp_2024_rows, 'Predicted Votes']
)

In [None]:
# Final 2024 frame ordered by predicted MVP vote share, highest first.
(
    all_regular_season_player_stats_merged_df_2024_final
    .sort_values(by='MVP Predicted Vote %', ascending=False)
)

# Predicting DPOY - Machine Learning Model Using GBM
 <a class="anchor" id="Step35"></a>

## Create dataset to analyze
    
###    1.) Merge regular season player stats with player DPOY vote data 

###    2.) Merge the above data set with team data

In [None]:
# Left-join DPOY vote results onto every player-season row.
# NOTE(review): when the notebook is run top to bottom, the master frame already carries
# the MVP prediction columns added earlier, so they are carried into this frame as well.
dpoy_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df,
    dpoy_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

In [None]:
# Left-join team stats onto the player/DPOY rows.
dpoy_prediction_df = pd.merge(
    dpoy_prediction_df,
    all_regular_season_team_stats_merged_df,
    how='left',
    on='Team_name_and_year_concat',
)

In [None]:
# Preview the first ten rows of the merged player / DPOY vote / team frame.
dpoy_prediction_df.head(10)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the DPOY frame that are not treated as candidate features.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'DPOY Results - Total Voting Points Possible',
    'DPOY Results - Total Voting Points Won',
    'DPOY Results - First Place Votes',
    'DPOY Results - DPOY Result Indicator',
    'Regular Season Basic Team Stats - Year',
]

# Start from every column, then strike the excluded ones
# (list.remove raises ValueError if a name is absent).
list_of_features = dpoy_prediction_df.columns.tolist()
for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the remaining column names for manual review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Report how many NaN values each candidate feature holds, then replace them with 0.
# The count is taken BEFORE the fill; counting after the fill always reports 0.
for feature in list_of_features:

    nan_mask = dpoy_prediction_df[feature].isnull()
    nan_count = int(nan_mask.sum())

    if nan_count > 0:

        print(f"\n        {feature} contains {nan_count} NaN values ")

        # Missing values are treated as zero for every column in list_of_features.
        dpoy_prediction_df.loc[nan_mask, feature] = 0

## Split into test and training data

In [None]:
# Columns handed to the DPOY model: the advanced player stats, the "Ranked" variant of
# most of them, the target column, and the team's overall record.
_advanced_prefix = 'Regular Season Advanced Player Stats - '

_advanced_stats = ['Total Minutes Played During Season',
                   'Player Efficiency',
                   'True Shooting % For Season',
                   '3 Point Attempt Rate',
                   'Free Throw Rate for Season',
                   'Offensive Rebounding %',
                   'Defensive Rebound Percentage',
                   'Total Rebounding %',
                   'Assist % For Season',
                   'Steal Percentage for Season',
                   'Block %',
                   'Turnover % for Season',
                   'Usage % For Season',
                   'Total Offensive Win Shares For Season',
                   'Total Defensive Win Shares For Season',
                   'Win Shares',
                   'Win Shares Per 48 Minutes Played For the Season',
                   'Offensive Box Plus/Minus For Season',
                   'Defensive Box Plus/Minus',
                   'Box Plus/Minus For Season',
                   'Value Above Replacement Player']

# These three stats are used raw only; no "Ranked" variant is included for them.
_stats_without_rank = {'Total Minutes Played During Season',
                       '3 Point Attempt Rate',
                       'Free Throw Rate for Season'}

list_of_features_filtered = (
    [_advanced_prefix + stat for stat in _advanced_stats]
    + [_advanced_prefix + stat + ' Ranked'
       for stat in _advanced_stats
       if stat not in _stats_without_rank]
    # Target variable; it is separated from the features before training.
    + ['DPOY Results - % of Total Voting Points Possible That Were Won',
       'Regular Season Team Standings - Overall Team Record']
)

In [None]:
# Display the full candidate-feature list (list_of_features, not the filtered list defined above).
list_of_features

In [None]:
# Restrict the frame to the model columns (features plus the target).
target_column = 'DPOY Results - % of Total Voting Points Possible That Were Won'
dpoy_prediction_df_features = dpoy_prediction_df[list_of_features_filtered]

# X holds every selected column except the target; y is the target itself.
X = dpoy_prediction_df_features.drop(columns=[target_column])
y = dpoy_prediction_df_features[target_column]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Initialize and Train the GBM Model

In [None]:
# Hyperparameters for the gradient boosting regressor; random_state is pinned to 42.
# NOTE: this rebinds gbm_regressor, replacing the MVP model fitted earlier in the notebook.
gbm_hyperparameters = dict(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_regressor = GradientBoostingRegressor(**gbm_hyperparameters)

# Fit on the training split; as the last expression, the fitted estimator is displayed.
gbm_regressor.fit(X_train, y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Score the held-out rows.
y_pred = gbm_regressor.predict(X_test)

# Compare predictions with the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print('Mean Squared Error: {}'.format(mse))
print('R^2 Score: {}'.format(r2))

In [None]:
# Baseline recorded when training with all features:
# Mean Squared Error: 0.001925368630181291
# R^2 Score: 0.35803670206569393

## Evaluate feature importance

In [None]:
# One row per training column, ordered from most to least important.
feature_importance = (
    pd.Series(gbm_regressor.feature_importances_, index=X_train.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)

feature_importance.head(10)

In [None]:
# Preview the first ten rows of the DPOY frame before predictions are attached.
dpoy_prediction_df.head(10)

### Populate prediction dataframe with predictions

In [None]:
# Predict the share of DPOY voting points for every row of the model frame
# (this covers the training rows as well as the held-out rows).
dpoy_model_inputs = dpoy_prediction_df_features.drop(
    columns=['DPOY Results - % of Total Voting Points Possible That Were Won']
)
predicted_votes = gbm_regressor.predict(dpoy_model_inputs)

# Store the predictions next to the source rows.
dpoy_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
# Display the DPOY frame, which now carries the 'Predicted Votes' column
# (long output is truncated according to pandas' display.max_rows setting).
dpoy_prediction_df

In [None]:
# Top 200 rows by predicted vote share, shown beside the actual share.
(
    dpoy_prediction_df[['Regular Season Team Standings - Overall Team Record',
                        'Regular Season Basic Player Stats - Player',
                        'DPOY Results - % of Total Voting Points Possible That Were Won',
                        'Predicted Votes']]
    .sort_values(by='Predicted Votes', ascending=False)
    .head(200)
)

In [None]:
# Residual = actual vote share minus predicted vote share.
dpoy_prediction_df['Actual Vote - Predicted'] = (
    dpoy_prediction_df['DPOY Results - % of Total Voting Points Possible That Were Won']
    .sub(dpoy_prediction_df['Predicted Votes'])
)

# Descending order puts the most under-predicted rows first.
(
    dpoy_prediction_df[['Regular Season Team Standings - Overall Team Record',
                        'Regular Season Basic Player Stats - Player',
                        'DPOY Results - % of Total Voting Points Possible That Were Won',
                        'Predicted Votes',
                        'Actual Vote - Predicted',
                        'Team_name_and_year_concat']]
    .sort_values(by='Actual Vote - Predicted', ascending=False)
    .head(200)
)

In [None]:
# Start with both flags empty; only the selected rows are set to 1 below.
dpoy_prediction_df['Predicted DPOY Candidate'] = np.nan
dpoy_prediction_df['Predicted DPOY Winner'] = np.nan

for year in dpoy_prediction_df['Regular Season Basic Player Stats - Year'].unique().tolist():

    in_year = dpoy_prediction_df['Regular Season Basic Player Stats - Year'] == year

    # Index labels of this year's rows, highest predicted vote share first.
    ranked_index = dpoy_prediction_df.loc[in_year].sort_values('Predicted Votes', ascending=False).index

    # The top 15 predicted rows of the year are flagged as candidates.
    dpoy_prediction_df.loc[ranked_index[:15], 'Predicted DPOY Candidate'] = 1

    # The single highest predicted row of the year is flagged as the DPOY winner.
    dpoy_prediction_df.loc[ranked_index[:1], 'Predicted DPOY Winner'] = 1

In [None]:
# Copy the DPOY prediction columns onto the master player frame.
# NOTE(review): rows are matched purely by index label. dpoy_prediction_df comes out of
# merge() without an explicit set_index, so this assumes its labels still line up with
# the master frame's rows - TODO confirm.
dpoy_writeback_columns = [('Predicted DPOY Candidate', 'Predicted DPOY Candidate'),
                          ('Predicted DPOY Winner', 'Predicted DPOY Winner'),
                          ('Predicted Votes', 'Predicted % of DPOY Votes')]

for source_column, destination_column in dpoy_writeback_columns:

    # Destination starts empty; only rows with a non-null source value are filled.
    all_regular_season_player_stats_merged_df[destination_column] = np.nan

    populated_rows = dpoy_prediction_df.index[dpoy_prediction_df[source_column].notnull()]

    all_regular_season_player_stats_merged_df.loc[populated_rows, destination_column] = (
        dpoy_prediction_df.loc[populated_rows, source_column]
    )

In [None]:
# Spot-check: first ten master-frame rows flagged as a predicted DPOY winner.
all_regular_season_player_stats_merged_df[
    all_regular_season_player_stats_merged_df['Predicted DPOY Winner'].notna()
].head(10)

# Predict 2024 DPOY

In [None]:
# Attach the 2024 team stats to every 2024 player row.
dpoy_2024_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df_2024,
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# Re-apply the player frame's index labels so results can be written back by label later.
dpoy_2024_prediction_df.index = all_regular_season_player_stats_merged_df_2024.index

## Prep the 2024 DPOY data

In [None]:
# Columns that get scaled up by the 82/68 factor below.
# NOTE(review): 'Total Minutes Played During Season' is a model feature but is not
# scaled here - confirm whether that is intentional.
columns_to_multiply = [
    'Regular Season Basic Player Stats - Games Played',
    'Regular Season Basic Player Stats - Games Started',
    'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Win Shares',
    'Regular Season Advanced Player Stats - Value Above Replacement Player',
]

# presumably 82 = full-season game count and 68 = games played so far - TODO confirm
multiplier = 82/68

# NOTE(review): re-running this cell scales the columns again; rebuild
# dpoy_2024_prediction_df first if the cell must be re-executed.
for column in columns_to_multiply:
    dpoy_2024_prediction_df[column] = dpoy_2024_prediction_df[column].mul(multiplier)

### Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the 2024 frame that are not treated as candidate features.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'Regular Season Basic Team Stats - Year',
]

# Start from every column, then strike the excluded ones
# (list.remove raises ValueError if a name is absent).
list_of_features = dpoy_2024_prediction_df.columns.tolist()
for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the remaining column names for manual review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Report how many NaN values each candidate feature holds, then replace them with 0.
# The count is taken BEFORE the fill; counting after the fill always reports 0.
for feature in list_of_features:

    nan_mask = dpoy_2024_prediction_df[feature].isnull()
    nan_count = int(nan_mask.sum())

    if nan_count > 0:

        print(f"\n        {feature} contains {nan_count} NaN values ")

        # Missing values are treated as zero for every column in list_of_features.
        dpoy_2024_prediction_df.loc[nan_mask, feature] = 0

### Filter down

In [None]:
# The 2024 frame must supply exactly the columns the DPOY model was fitted on, in the
# same order. Take them from the training matrix rather than keeping a second
# hand-written copy that can drift out of sync with the training feature list.
# This yields the same names, in the same order, as the hard-coded list it replaces.
# NOTE: X_train must be the DPOY training matrix when this cell runs (i.e. run the
# notebook top to bottom).
list_of_features_filtered = list(X_train.columns)

In [None]:
# Keep only the model's input columns, in the order given by list_of_features_filtered.
dpoy_2024_prediction_df_features = dpoy_2024_prediction_df[list_of_features_filtered]

In [None]:
# Score every 2024 row with the fitted regressor
# (the predicted quantity is the share of DPOY voting points won).
predicted_votes = gbm_regressor.predict(dpoy_2024_prediction_df_features)

# Store the predictions next to the source rows.
dpoy_2024_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
# The 15 rows with the highest predicted DPOY vote share for 2024.
(
    dpoy_2024_prediction_df
    .sort_values(by='Predicted Votes', ascending=False)
    .head(15)
)

In [None]:
# Copy the 2024 DPOY predictions onto the final 2024 frame, matching rows by index label.
dpoy_2024_rows = dpoy_2024_prediction_df.index
all_regular_season_player_stats_merged_df_2024_final.loc[dpoy_2024_rows, 'DPOY Predicted Vote %'] = (
    dpoy_2024_prediction_df.loc[dpoy_2024_rows, 'Predicted Votes']
)

In [None]:
# Final 2024 frame ordered by predicted DPOY vote share, highest first.
(
    all_regular_season_player_stats_merged_df_2024_final
    .sort_values(by='DPOY Predicted Vote %', ascending=False)
)

# Predicting six man - Machine Learning Model Using GBM
 <a class="anchor" id="Step36"></a>

## Create dataset to analyze
    
###    1.) Merge regular season player stats with player six man vote data 

###    2.) Merge the above data set with team data

In [None]:
six_man_prediction_df = all_regular_season_player_stats_merged_df.merge(six_man_df,how='left',on='Player_unique_id_and_year_concat')

In [None]:
six_man_prediction_df = six_man_prediction_df.merge(all_regular_season_team_stats_merged_df,how='left',on='Team_name_and_year_concat')

In [None]:
six_man_prediction_df.head(10)

### Filter to just sixth-man-qualified candidates (minimum of 60 games played, fewer than 35 games started)

In [None]:
# Keep only the rows whose sixth-man qualification flag equals 1.
# The explicit .copy() makes the filtered frame independent of the merged frame
# it was sliced from, so the in-place writes performed on it in later cells
# (NaN fill, prediction columns) do not trigger SettingWithCopyWarning.
six_man_prediction_df = six_man_prediction_df.loc[
    six_man_prediction_df['Regular Season Basic Player Stats - 6MAN Game Played and Started Qualification '] == 1
].copy()

six_man_prediction_df.head(5)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns that must not be fed to the 6MAN model.
list_of_columns_to_exclude = [
    # identifiers / labels
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    # team bookkeeping columns
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Basic Player Stats - Year',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    # other 6MAN vote-result columns (the "% of Total Voting Points" column is
    # deliberately NOT listed here: it stays in the list and is split off as
    # the target further down)
    '6 Man Vote Results - Total Voting Points Possible',
    '6 Man Vote Results - Total Voting Points Won',
    '6 Man Vote Results - First Place Votes',
    '6 Man Vote Results - 6MAN Result Indicator',
    'Regular Season Basic Team Stats - Year',
    # prediction columns written onto the master table by earlier sections
    'Predicted MVP Candidate',
    'Predicted MVP Winner',
    'Predicted % of MVP Votes',
    'Predicted DPOY Candidate',
    'Predicted DPOY Winner',
    'Predicted % of DPOY Votes',
]

# Start from every column of the filtered frame ...
list_of_features = six_man_prediction_df.columns.tolist()

# ... and drop the excluded ones. A missing column raises ValueError, which
# doubles as a check that every excluded column is actually present.
for column in list_of_columns_to_exclude:
    del list_of_features[list_of_features.index(column)]

# Print the surviving feature names for review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Replace missing values with 0 in every column of `list_of_features`
# (this includes the vote-share target, which is still part of that list),
# and report how many values were missing in each affected column.
# The count is taken BEFORE the fill; counting afterwards always reports 0.
for feature in list_of_features:

    missing_mask = six_man_prediction_df[feature].isnull()
    missing_count = int(missing_mask.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        six_man_prediction_df.loc[missing_mask, feature] = 0

## Split into test and training data

In [None]:
# Hand-picked column subset: four predictors plus the 6MAN vote-share column.
# NOTE(review): the cells that follow select columns with `list_of_features`,
# not with this list - confirm whether this subset is still needed.
list_of_features_filtered = [
    'Regular Season Basic Player Stats - Points Per Game',
    'Regular Season Basic Player Stats - Points Per Game Ranked',
    'Regular Season Play by Play Player Stats - Plus/Minus Per 100 Possessions (On Court)',
    'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked',
    '6 Man Vote Results - % of Total Voting Points Possible That Were Won',
]

In [None]:
list_of_features

In [None]:
# Modelling frame: every retained feature column plus the target column.
six_man_prediction_df_features = six_man_prediction_df[list_of_features]

# Target = share of possible 6MAN voting points won; predictors = everything else.
y = six_man_prediction_df_features['6 Man Vote Results - % of Total Voting Points Possible That Were Won']
X = six_man_prediction_df_features.drop(
    columns='6 Man Vote Results - % of Total Voting Points Possible That Were Won'
)

# 80/20 random split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Initialize and Train the GBM Model

In [None]:
# Gradient-boosted regressor: 100 trees of depth 3, learning rate 0.1, fixed seed.
# This rebinds `gbm_regressor`, replacing whatever model the name held before.
gbm_regressor = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)

# Fit on the training split only; the test split is kept for evaluation.
gbm_regressor.fit(X=X_train, y=y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Predict on the held-out split.
y_pred = gbm_regressor.predict(X=X_test)

# Hold-out error metrics: mean squared error and coefficient of determination.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

In [None]:
# Results recorded when the model was trained on all features:
# Mean Squared Error: 0.001925368630181291
# R^2 Score: 0.35803670206569393

## Evaluate feature importance

In [None]:
# One row per training column, ordered from most to least important
# according to the fitted model's `feature_importances_`.
feature_importance = pd.DataFrame(
    {'importance': gbm_regressor.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

feature_importance.head(10)

In [None]:
six_man_prediction_df.head(10)

### Populate prediction dataframe with predictions

In [None]:
# Score every row of the modelling frame (training AND test rows alike), with
# the target column removed so only predictors are passed to the model.
predicted_votes = gbm_regressor.predict(
    six_man_prediction_df_features.drop(
        columns='6 Man Vote Results - % of Total Voting Points Possible That Were Won'
    )
)

# Attach the predicted vote share to the full candidate frame.
six_man_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
six_man_prediction_df.head(5)

In [None]:
six_man_prediction_df.loc[:,['Regular Season Team Standings - Overall Team Record','Regular Season Basic Player Stats - Player','6 Man Vote Results - % of Total Voting Points Possible That Were Won','Predicted Votes']].sort_values('Predicted Votes',ascending=False).head(200)

In [None]:
# Residual = actual vote share minus predicted vote share.
six_man_prediction_df['Actual Vote - Predicted'] = (
    six_man_prediction_df['6 Man Vote Results - % of Total Voting Points Possible That Were Won']
    .sub(six_man_prediction_df['Predicted Votes'])
)

# Show the 200 rows with the smallest (most negative) residuals, i.e. the rows
# where the prediction exceeds the actual share by the largest margin.
six_man_prediction_df[[
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Basic Player Stats - Player',
    '6 Man Vote Results - % of Total Voting Points Possible That Were Won',
    'Predicted Votes',
    'Actual Vote - Predicted',
    'Team_name_and_year_concat',
]].sort_values(by='Actual Vote - Predicted').head(200)

In [None]:
# Flag columns start out empty; only flagged rows are set to 1 below.
six_man_prediction_df['Predicted 6MAN Candidate'] = np.nan
six_man_prediction_df['Predicted 6MAN Winner'] = np.nan

for year in list(six_man_prediction_df['Regular Season Basic Player Stats - Year'].unique()):

    # The 15 rows of this season with the highest predicted vote share
    year_subset = (
        six_man_prediction_df
        .loc[six_man_prediction_df['Regular Season Basic Player Stats - Year'] == year]
        .sort_values(by='Predicted Votes', ascending=False)
        .iloc[:15]
    )

    # Mark those rows as predicted 6MAN candidates
    six_man_prediction_df.loc[year_subset.index, 'Predicted 6MAN Candidate'] = 1

    # The single top-ranked row is the predicted 6MAN winner for the season
    year_subset = year_subset.iloc[:1]
    six_man_prediction_df.loc[year_subset.index, 'Predicted 6MAN Winner'] = 1

In [None]:
# Create (or reset) the three 6MAN prediction columns on the master player table.
all_regular_season_player_stats_merged_df['Predicted 6MAN Candidate'] = np.nan
all_regular_season_player_stats_merged_df['Predicted 6MAN Winner'] = np.nan
all_regular_season_player_stats_merged_df['Predicted % of 6MAN Votes'] = np.nan

# Copy each non-null prediction across by index label.
# NOTE(review): this assumes the index labels of `six_man_prediction_df` still
# identify the same rows in the master table after the merges - confirm.
all_regular_season_player_stats_merged_df.loc[
    six_man_prediction_df[six_man_prediction_df['Predicted 6MAN Candidate'].notnull()].index,
    'Predicted 6MAN Candidate',
] = six_man_prediction_df['Predicted 6MAN Candidate']

all_regular_season_player_stats_merged_df.loc[
    six_man_prediction_df[six_man_prediction_df['Predicted 6MAN Winner'].notnull()].index,
    'Predicted 6MAN Winner',
] = six_man_prediction_df['Predicted 6MAN Winner']

all_regular_season_player_stats_merged_df.loc[
    six_man_prediction_df[six_man_prediction_df['Predicted Votes'].notnull()].index,
    'Predicted % of 6MAN Votes',
] = six_man_prediction_df['Predicted Votes']

In [None]:
all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df['Predicted 6MAN Winner'].notnull()].head(10)

# Predict 2024 6MAN

In [None]:
# Attach 2024 team stats to each 2024 player row (left join on team/year key).
_six_man_2024_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df_2024,
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# merge() returns a fresh index, so put the player table's index back; this
# requires the merge to have produced exactly one row per player row.
_six_man_2024_prediction_df = _six_man_2024_prediction_df.set_index(
    all_regular_season_player_stats_merged_df_2024.index
)

## Prep the 2024 data

In [None]:
# Season-total columns that get scaled up by the constant factor below.
columns_to_multiply = [
    'Regular Season Basic Player Stats - Games Played',
    'Regular Season Basic Player Stats - Games Started',
    'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Win Shares',
    'Regular Season Advanced Player Stats - Value Above Replacement Player',
]

# NOTE(review): presumably prorates a 68-game partial season to a full
# 82-game season - confirm the 68.
multiplier = 82 / 68

# Caution: re-running this cell without rebuilding the frame scales the
# columns a second time.
for column in columns_to_multiply:
    _six_man_2024_prediction_df[column] = _six_man_2024_prediction_df[column].mul(multiplier)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the 2024 frame that must not be fed to the model.
list_of_columns_to_exclude = [
    # identifiers / labels
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    # team bookkeeping columns
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Basic Player Stats - Year',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'Regular Season Basic Team Stats - Year',
]

# NOTE(review): this rebinds `list_of_features` from the 2024 frame's columns;
# the resulting columns need to match the ones the model was fitted on - confirm.
list_of_features = _six_man_2024_prediction_df.columns.tolist()

# A missing excluded column raises ValueError here.
for column in list_of_columns_to_exclude:
    del list_of_features[list_of_features.index(column)]

# Print the surviving feature names for review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Replace missing values with 0 in every feature column of the 2024 frame and
# report how many values were missing in each affected column.
# The count is taken BEFORE the fill; counting afterwards always reports 0.
for feature in list_of_features:

    missing_mask = _six_man_2024_prediction_df[feature].isnull()
    missing_count = int(missing_mask.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        _six_man_2024_prediction_df.loc[missing_mask, feature] = 0

In [None]:
_six_man_2024_prediction_df_features = _six_man_2024_prediction_df.loc[:,list_of_features]

In [None]:
# Score every 2024 row with the currently fitted GBM regressor.
predicted_votes = gbm_regressor.predict(X=_six_man_2024_prediction_df_features)

# Store the model output next to the player rows it was computed from.
_six_man_2024_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
_six_man_2024_prediction_df.sort_values('Predicted Votes',ascending=False).head(15)

In [None]:
# Copy the 2024 6MAN predictions onto the final 2024 player table. The Series
# on the right-hand side is aligned to the target rows by index label.
all_regular_season_player_stats_merged_df_2024_final.loc[
    _six_man_2024_prediction_df.index, '6MAN Predicted Vote %'
] = _six_man_2024_prediction_df['Predicted Votes']

In [None]:
# Restrict the 2024 table to players with more than 50 games played and fewer
# than 20 games started.
# NOTE(review): these thresholds differ from the 60-played / 35-started rule
# described for the training data - confirm this is intentional.
all_regular_season_player_stats_merged_df_2024_final_filtered = all_regular_season_player_stats_merged_df_2024_final.loc[
    all_regular_season_player_stats_merged_df_2024_final['Regular Season Basic Player Stats - Games Played'].gt(50)
    & all_regular_season_player_stats_merged_df_2024_final['Regular Season Basic Player Stats - Games Started'].lt(20)
]

# Rank the remaining players by predicted 6MAN vote share, highest first.
all_regular_season_player_stats_merged_df_2024_final_filtered.sort_values(
    by='6MAN Predicted Vote %', ascending=False
)

# Predicting ROY - Machine Learning Model Using GBM
 <a class="anchor" id="Step37"></a>

## Create dataset to analyze
    
###    1.) Merge regular season player stats with Rookie of the Year (ROY) vote data

###    2.) Merge the above data set with team data

In [None]:
roy_prediction_df = all_regular_season_player_stats_merged_df.merge(roy_vote_df,how='left',on='Player_unique_id_and_year_concat')

In [None]:
roy_prediction_df = roy_prediction_df.merge(all_regular_season_team_stats_merged_df,how='left',on='Team_name_and_year_concat')

In [None]:
roy_prediction_df.head(2)

### Filter to just rookies. To do that, we first have to load the rookie tables

### Compile yearly data into one dataframe

#### Create dataframes of yearly data

In [None]:
# Load the yearly rookie files from the "NBA Rookies" folder under the current
# working directory. The path is built with os.path.join rather than a
# hard-coded "\\" so the folder is also found on non-Windows systems.
# NOTE(review): the meaning of the second argument (1) is defined by the helper,
# which is not visible here.
create_list_of_dataframes_for_yearly_stats(os.path.join(os.getcwd(), 'NBA Rookies'), 1)

###### Validate that all of the yearly data files contain different data

In [None]:
check_all_unique_dataframes(master_df_list,'Year')

###### Validate that the column structure for every data frame is the same.

In [None]:
validate_column_structure_is_same(master_df_list)

###### Concat/combine yearly dataframes into one AND validate that the length of the combined dataframe is the same as the sum of the yearly dataframes

In [None]:
concat_dataframes_and_validate_and_reset_index_and_sort('rookie_df',master_df_list,cumulative_row_counter)

###### Validate we have all of the data we need from 2004-2023

In [None]:
validate_completeness_of_data(rookie_df,2004,2024)

###### Clean player names

In [None]:
clean_column_values(rookie_df,'Player')

In [None]:
# Build the "<id>-<year>" join key used by the player tables.
# NOTE(review): the '-9999' column presumably holds the player's unique id - confirm.
rookie_df['Player_unique_id_and_year_concat'] = rookie_df['-9999'] + ('-' + rookie_df['Year'].astype(str))

# Every row in the rookie tables marks a rookie season.
rookie_df['Rookie year indicator'] = 1

# Keep only the join key and the flag. After this the '-9999' and 'Year'
# columns are gone, so this cell cannot be re-run without rebuilding rookie_df.
rookie_df = rookie_df.loc[:, ['Player_unique_id_and_year_concat', 'Rookie year indicator']]

In [None]:
roy_prediction_df = roy_prediction_df.merge(rookie_df,how='left',on='Player_unique_id_and_year_concat')

In [None]:
# Keep only rookie seasons (rows whose rookie flag equals 1).
# The explicit .copy() makes the filtered frame independent of the merged frame
# it was sliced from, so the in-place writes performed on it in later cells
# (NaN fill, prediction columns) do not trigger SettingWithCopyWarning.
roy_prediction_df = roy_prediction_df.loc[roy_prediction_df['Rookie year indicator'] == 1].copy()

# Number of rookie seasons available for modelling.
len(roy_prediction_df)

In [None]:
roy_prediction_df.head(3)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns that must not be fed to the ROY model.
list_of_columns_to_exclude = [
    # identifiers / labels
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    # team bookkeeping columns
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    # other ROY vote-result columns (the "% of Total Voting Points" column is
    # deliberately NOT listed here: it stays in the list and is split off as
    # the target further down)
    'ROY Vote Results - Total Voting Points Possible',
    'ROY Vote Results - Total Voting Points Won',
    'ROY Vote Results - First Place Votes',
    'ROY Vote Results - ROY Result Indicator',
    'Regular Season Basic Team Stats - Year',
    # prediction columns written onto the master table by earlier sections
    'Predicted MVP Candidate',
    'Predicted MVP Winner',
    'Predicted % of MVP Votes',
    'Predicted DPOY Candidate',
    'Predicted DPOY Winner',
    'Predicted % of DPOY Votes',
    'Predicted 6MAN Candidate',
    'Predicted 6MAN Winner',
    'Predicted % of 6MAN Votes',
]

# Start from every column of the rookie-only frame ...
list_of_features = roy_prediction_df.columns.tolist()

# ... and drop the excluded ones. A missing column raises ValueError, which
# doubles as a check that every excluded column is actually present.
for column in list_of_columns_to_exclude:
    del list_of_features[list_of_features.index(column)]

# Print the surviving feature names for review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Replace missing values with 0 in every column of `list_of_features`
# (this includes the vote-share target, which is still part of that list),
# and report how many values were missing in each affected column.
# The count is taken BEFORE the fill; counting afterwards always reports 0.
for feature in list_of_features:

    missing_mask = roy_prediction_df[feature].isnull()
    missing_count = int(missing_mask.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        roy_prediction_df.loc[missing_mask, feature] = 0

## Split into test and training data

In [None]:
# Hand-picked column subset: four predictors plus the ROY vote-share column.
# The last entry previously named the 6MAN vote-share column, which was never
# merged into the ROY frame; it now names the ROY target used in this section.
# NOTE(review): the cells that follow select columns with `list_of_features`,
# not with this list - confirm whether this subset is still needed.
list_of_features_filtered = [
    "Regular Season Basic Player Stats - Points Per Game",
    "Regular Season Basic Player Stats - Points Per Game Ranked",
    "Regular Season Play by Play Player Stats - Plus/Minus Per 100 Possessions (On Court)",
    "Regular Season Advanced Player Stats - Value Above Replacement Player Ranked",
    'ROY Vote Results - % of Total Voting Points Possible That Were Won',
]

In [None]:
list_of_features

In [None]:
# Modelling frame: every retained feature column plus the target column.
roy_prediction_df_features = roy_prediction_df[list_of_features]

# Target = share of possible ROY voting points won; predictors = everything else.
y = roy_prediction_df_features['ROY Vote Results - % of Total Voting Points Possible That Were Won']
X = roy_prediction_df_features.drop(
    columns='ROY Vote Results - % of Total Voting Points Possible That Were Won'
)

# 80/20 random split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Initialize and Train the GBM Model

In [None]:
# Gradient-boosted regressor: 100 trees of depth 3, learning rate 0.1, fixed seed.
# This rebinds `gbm_regressor`, replacing whatever model the name held before.
gbm_regressor = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)

# Fit on the training split only; the test split is kept for evaluation.
gbm_regressor.fit(X=X_train, y=y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Predict on the held-out split.
y_pred = gbm_regressor.predict(X=X_test)

# Hold-out error metrics: mean squared error and coefficient of determination.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

In [None]:
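# Results recorded when the model was trained on all features:
# NOTE(review): these figures are identical to the ones recorded for the 6MAN
# model earlier in the notebook, so they look like a stale copy - re-run the
# evaluation cell above and update them.
# Mean Squared Error: 0.001925368630181291
# R^2 Score: 0.35803670206569393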

## Evaluate feature importance

In [None]:
# One row per training column, ordered from most to least important
# according to the fitted model's `feature_importances_`.
feature_importance = pd.DataFrame(
    {'importance': gbm_regressor.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

feature_importance.head(10)

In [None]:
roy_prediction_df.head(10)

### Populate prediction dataframe with predictions

In [None]:
# Score every row of the modelling frame (training AND test rows alike), with
# the target column removed so only predictors are passed to the model.
predicted_votes = gbm_regressor.predict(
    roy_prediction_df_features.drop(
        columns='ROY Vote Results - % of Total Voting Points Possible That Were Won'
    )
)

# Attach the predicted vote share to the rookie frame.
roy_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
roy_prediction_df.head(5)

In [None]:
roy_prediction_df.loc[:,['Regular Season Team Standings - Overall Team Record','Regular Season Basic Player Stats - Player','ROY Vote Results - % of Total Voting Points Possible That Were Won','Predicted Votes']].sort_values('Predicted Votes',ascending=False).head(200)

In [None]:
# Residual = actual vote share minus predicted vote share.
roy_prediction_df['Actual Vote - Predicted'] = (
    roy_prediction_df['ROY Vote Results - % of Total Voting Points Possible That Were Won']
    .sub(roy_prediction_df['Predicted Votes'])
)

# Show the 200 rows with the smallest (most negative) residuals, i.e. the rows
# where the prediction exceeds the actual share by the largest margin.
roy_prediction_df[[
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Basic Player Stats - Player',
    'ROY Vote Results - % of Total Voting Points Possible That Were Won',
    'Predicted Votes',
    'Actual Vote - Predicted',
    'Team_name_and_year_concat',
]].sort_values(by='Actual Vote - Predicted').head(200)

In [None]:
# Flag columns start out empty; only flagged rows are set to 1 below.
roy_prediction_df['Predicted ROY Candidate'] = np.nan
roy_prediction_df['Predicted ROY Winner'] = np.nan

for year in list(roy_prediction_df['Regular Season Basic Player Stats - Year'].unique()):

    # The 15 rows of this season with the highest predicted vote share
    year_subset = (
        roy_prediction_df
        .loc[roy_prediction_df['Regular Season Basic Player Stats - Year'] == year]
        .sort_values(by='Predicted Votes', ascending=False)
        .iloc[:15]
    )

    # Mark those rows as predicted ROY candidates
    roy_prediction_df.loc[year_subset.index, 'Predicted ROY Candidate'] = 1

    # The single top-ranked row is the predicted ROY winner for the season
    year_subset = year_subset.iloc[:1]
    roy_prediction_df.loc[year_subset.index, 'Predicted ROY Winner'] = 1

In [None]:
# Create (or reset) the three ROY prediction columns on the master player table.
all_regular_season_player_stats_merged_df['Predicted ROY Candidate'] = np.nan
all_regular_season_player_stats_merged_df['Predicted ROY Winner'] = np.nan
all_regular_season_player_stats_merged_df['Predicted % of ROY Votes'] = np.nan

# Copy each non-null prediction across by index label.
# NOTE(review): this assumes the index labels of `roy_prediction_df` still
# identify the same rows in the master table after the merges - confirm.
all_regular_season_player_stats_merged_df.loc[
    roy_prediction_df[roy_prediction_df['Predicted ROY Candidate'].notnull()].index,
    'Predicted ROY Candidate',
] = roy_prediction_df['Predicted ROY Candidate']

all_regular_season_player_stats_merged_df.loc[
    roy_prediction_df[roy_prediction_df['Predicted ROY Winner'].notnull()].index,
    'Predicted ROY Winner',
] = roy_prediction_df['Predicted ROY Winner']

all_regular_season_player_stats_merged_df.loc[
    roy_prediction_df[roy_prediction_df['Predicted Votes'].notnull()].index,
    'Predicted % of ROY Votes',
] = roy_prediction_df['Predicted Votes']

In [None]:
all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df['Predicted ROY Winner'].notnull()].head(10)

# Predict 2024 ROY

In [None]:
# Attach 2024 team stats to each 2024 player row (left join on team/year key).
roy_2024_prediction_df = pd.merge(
    all_regular_season_player_stats_merged_df_2024,
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# merge() returns a fresh index, so put the player table's index back; this
# requires the merge to have produced exactly one row per player row.
roy_2024_prediction_df = roy_2024_prediction_df.set_index(
    all_regular_season_player_stats_merged_df_2024.index
)

# Prep the data

In [None]:
# Season-total columns that get scaled up by the constant factor below.
columns_to_multiply = [
    'Regular Season Basic Player Stats - Games Played',
    'Regular Season Basic Player Stats - Games Started',
    'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
    'Regular Season Advanced Player Stats - Win Shares',
    'Regular Season Advanced Player Stats - Value Above Replacement Player',
]

# NOTE(review): presumably prorates a 68-game partial season to a full
# 82-game season - confirm the 68.
multiplier = 82 / 68

# Caution: re-running this cell without rebuilding the frame scales the
# columns a second time.
for column in columns_to_multiply:
    roy_2024_prediction_df[column] = roy_2024_prediction_df[column].mul(multiplier)

### Populate 2024 with rookie markers

In [None]:
# Read the 2024 rookie list (column names come from the file's second line)
# and stamp every row with the 2024 season.
rookie_2024_df = pd.read_csv('2024 Special Rookie File.txt', header=1).assign(Year=2024)

# Build the same "<id>-<year>" join key used by the player tables.
# NOTE(review): the '-9999' column presumably holds the player's unique id - confirm.
rookie_2024_df['Player_unique_id_and_year_concat'] = (
    rookie_2024_df['-9999'] + ('-' + rookie_2024_df['Year'].astype(str))
)
rookie_2024_df['Rookie year indicator'] = 1
rookie_2024_df = rookie_2024_df.loc[:, ['Player_unique_id_and_year_concat', 'Rookie year indicator']]

# Left-join the rookie flag onto the 2024 prediction frame; non-rookies get NaN.
roy_2024_prediction_df = pd.merge(
    roy_2024_prediction_df,
    rookie_2024_df,
    how='left',
    on='Player_unique_id_and_year_concat',
)

# Restore the player table's index (requires one output row per player row).
roy_2024_prediction_df = roy_2024_prediction_df.set_index(
    all_regular_season_player_stats_merged_df_2024.index
)
roy_2024_prediction_df

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the 2024 frame that must not be fed to the model.
list_of_columns_to_exclude = [
    # identifiers / labels
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Player Stats - Year',
    # team bookkeeping columns
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'Regular Season Basic Team Stats - Year',
]

# NOTE(review): this rebinds `list_of_features` from the 2024 frame's columns;
# the resulting columns need to match the ones the model was fitted on - confirm.
list_of_features = roy_2024_prediction_df.columns.tolist()

# A missing excluded column raises ValueError here.
for column in list_of_columns_to_exclude:
    del list_of_features[list_of_features.index(column)]

# Print the surviving feature names for review.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values and review

In [None]:
# Replace missing values with 0 in every feature column of the 2024 frame
# (for 'Rookie year indicator' this turns the NaN of non-rookies into 0) and
# report how many values were missing in each affected column.
# The count is taken BEFORE the fill; counting afterwards always reports 0.
for feature in list_of_features:

    missing_mask = roy_2024_prediction_df[feature].isnull()
    missing_count = int(missing_mask.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        roy_2024_prediction_df.loc[missing_mask, feature] = 0

In [None]:
roy_2024_prediction_df_features = roy_2024_prediction_df.loc[:,list_of_features]

In [None]:
# Score every 2024 row with the currently fitted GBM regressor.
predicted_votes = gbm_regressor.predict(X=roy_2024_prediction_df_features)

# Store the model output next to the player rows it was computed from.
roy_2024_prediction_df['Predicted Votes'] = predicted_votes

In [None]:
roy_2024_prediction_df.sort_values('Predicted Votes',ascending=False).head(15)

In [None]:
# Copy the 2024 ROY prediction and the rookie flag onto the final 2024 player
# table. Each right-hand Series is aligned to the target rows by index label.
all_regular_season_player_stats_merged_df_2024_final.loc[
    roy_2024_prediction_df.index, 'ROY Predicted Vote %'
] = roy_2024_prediction_df['Predicted Votes']

all_regular_season_player_stats_merged_df_2024_final.loc[
    roy_2024_prediction_df.index, 'Rookie year indicator'
] = roy_2024_prediction_df['Rookie year indicator']

In [None]:
# Restrict the final 2024 table to rookies. This rebinds the `_filtered` name,
# replacing whatever filtered frame it held before.
all_regular_season_player_stats_merged_df_2024_final_filtered = all_regular_season_player_stats_merged_df_2024_final.loc[
    all_regular_season_player_stats_merged_df_2024_final['Rookie year indicator'].eq(1)
]

# Top five rookies by predicted ROY vote share.
all_regular_season_player_stats_merged_df_2024_final_filtered.sort_values(
    by='ROY Predicted Vote %', ascending=False
).head(5)

# Predicting MIP - Machine Learning Model Using GBM

 <a class="anchor" id="Step38"></a>

## Create dataset to analyze
    
###    1.) Merge regular season player stats with Most Improved Player (MIP) vote data

###    2.) Merge the above data set with team data

In [None]:
mip_prediction_df = all_regular_season_player_stats_merged_df.merge(mip_df,how='left',on='Player_unique_id_and_year_concat')

In [None]:
mip_prediction_df = mip_prediction_df.merge(all_regular_season_team_stats_merged_df,how='left',on='Team_name_and_year_concat')

In [None]:
mip_prediction_df.head(2)

### Filter out rookie years (because rookies have no prior-year data to compare against)

In [None]:
# Attach the rookie flag (1 for rookie seasons, NaN otherwise) ...
mip_prediction_df = mip_prediction_df.merge(rookie_df, how='left', on='Player_unique_id_and_year_concat')

# ... and keep only the non-rookie seasons. The explicit .copy() makes the
# filtered frame independent of the merged frame it was sliced from, so the
# delta columns added to it in the next cell do not trigger
# SettingWithCopyWarning.
mip_prediction_df = mip_prediction_df.loc[mip_prediction_df['Rookie year indicator'].isnull()].copy()

### Create delta columns for basic and advanced player stats

In [None]:
list_of_indices_where_no_previous_year_data = []

list_of_columns_to_delta = [
 'Regular Season Basic Player Stats - Games Played',
 'Regular Season Basic Player Stats - Games Started',
 'Regular Season Basic Player Stats - Minutes Played Per Game',
 'Regular Season Basic Player Stats - Field Goals Made Per Game',
 'Regular Season Basic Player Stats - Field Goals Attempted Per Game',
 'Regular Season Basic Player Stats - Average Field Goal % For Season',
 'Regular Season Basic Player Stats - 3 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 3 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 3 Point % For Season',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 2 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 2 Point % For Season',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season',
 'Regular Season Basic Player Stats - Free Throws Made Per Game',
 'Regular Season Basic Player Stats - Free Throw Attempts Per Game',
 'Regular Season Basic Player Stats - Average Free Throw % For Season',
 'Regular Season Basic Player Stats - Offense Rebounds Per Game',
 'Regular Season Basic Player Stats - Defense Rebounds Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game',
 'Regular Season Basic Player Stats - Assists Per Game',
 'Regular Season Basic Player Stats - Steals Per Game',
 'Regular Season Basic Player Stats - Blocks Per Game',
 'Regular Season Basic Player Stats - Turnovers Per Game',
 'Regular Season Basic Player Stats - Personal Fouls Per Game',
 'Regular Season Basic Player Stats - Points Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game Ranked',
 'Regular Season Basic Player Stats - Assists Per Game Ranked',
 'Regular Season Basic Player Stats - Steals Per Game Ranked',
 'Regular Season Basic Player Stats - Blocks Per Game Ranked',
 'Regular Season Basic Player Stats - Points Per Game Ranked',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Minutes Played During Season',
 'Regular Season Advanced Player Stats - Player Efficiency',
 'Regular Season Advanced Player Stats - True Shooting % For Season',
 'Regular Season Advanced Player Stats - 3 Point Attempt Rate',
 'Regular Season Advanced Player Stats - Free Throw Rate for Season',
 'Regular Season Advanced Player Stats - Offensive Rebounding %',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage',
 'Regular Season Advanced Player Stats - Total Rebounding %',
 'Regular Season Advanced Player Stats - Assist % For Season',
 'Regular Season Advanced Player Stats - Steal Percentage for Season',
 'Regular Season Advanced Player Stats - Block %',
 'Regular Season Advanced Player Stats - Turnover % for Season',
 'Regular Season Advanced Player Stats - Usage % For Season',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Win Shares',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Value Above Replacement Player',
 'Regular Season Advanced Player Stats - Player Efficiency Ranked',
 'Regular Season Advanced Player Stats - True Shooting % For Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage Ranked',
 'Regular Season Advanced Player Stats - Total Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Assist % For Season Ranked',
 'Regular Season Advanced Player Stats - Steal Percentage for Season Ranked',
 'Regular Season Advanced Player Stats - Block % Ranked',
 'Regular Season Advanced Player Stats - Turnover % for Season Ranked',
 'Regular Season Advanced Player Stats - Usage % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Win Shares Ranked',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus Ranked',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked']

player_id_to_delta_value_dict = []

# Every '<column> Delta' column starts out as NaN; only rows for which a
# previous-season row is found get a real value in the loop below.
for column in list_of_columns_to_delta:
    mip_prediction_df[f'{column} Delta'] = np.nan

year_column = 'Regular Season Basic Player Stats - Year'

# Walk season by season, comparing each player with the season before.
for year in list(mip_prediction_df[year_column].unique()):

    # 2004 is skipped outright, so its delta columns stay NaN.
    if year == 2004:
        continue

    previous_year = year - 1

    # Work on copies so the helper id column never lands on mip_prediction_df.
    current_subset_df = mip_prediction_df.loc[mip_prediction_df[year_column] == year].copy()
    previous_subset_df = mip_prediction_df.loc[mip_prediction_df[year_column] == previous_year].copy()

    # The player id is the text before the first '-' of the id/year key.
    current_subset_df['Temp_player_id'] = current_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]
    previous_subset_df['Temp_player_id'] = previous_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]

    for unique_player in list(current_subset_df['Temp_player_id'].unique()):

        current_rows_for_player = current_subset_df.loc[current_subset_df['Temp_player_id'] == unique_player]
        previous_rows_for_player = previous_subset_df.loc[previous_subset_df['Temp_player_id'] == unique_player]

        # Only the first matching row of the current season receives deltas.
        index_for_current = current_rows_for_player.index[0]

        # No previous-season row: report it, remember the row, move on.
        if len(previous_rows_for_player) == 0:
            print(f'{unique_player} in {year} does not seem to have previous year data.')
            list_of_indices_where_no_previous_year_data.append(index_for_current)
            continue

        # Only the first matching row of the previous season is used as baseline.
        index_for_previous = previous_rows_for_player.index[0]

        for column in list_of_columns_to_delta:
            # delta = current season value - previous season value
            mip_prediction_df.loc[index_for_current, f'{column} Delta'] = (
                current_subset_df.loc[index_for_current, column]
                - previous_subset_df.loc[index_for_previous, column]
            )

### Drop rows that have null deltas (because no prior-year data was found)

In [None]:
# Keep only the rows whose Points Per Game delta was populated, i.e. rows for
# which the delta loop found a previous-season row to compare against.
has_previous_season_delta = mip_prediction_df['Regular Season Basic Player Stats - Points Per Game Delta'].notnull()
mip_prediction_df = mip_prediction_df.loc[has_previous_season_delta]

In [None]:
mip_prediction_df.head(1)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns that must not be offered to the model: names/ids, team bookkeeping,
# arena/attendance figures and the raw MIP voting columns.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'MIP Vote Results - Total Voting Points Possible',
    'MIP Vote Results - Total Voting Points Won',
    'MIP Vote Results - First Place Votes',
    'MIP Vote Results - MIP Result Indicator',
    'Regular Season Basic Team Stats - Year',
]

# Start from every column and strip the excluded ones. list.remove raises
# ValueError when a listed column is absent, which surfaces naming drift early.
list_of_features = list(mip_prediction_df.columns)

for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the surviving feature names, one per block, for a visual check.
for feature in list_of_features:
    print(f"\n    {feature}")

## Check for NaN values, report them, and fill them with 0

In [None]:
# Report how many NaN values each feature holds, then replace them with 0.
# The mask and count are taken BEFORE the fill: the previous version filled
# first and counted afterwards, so the message always reported 0 NaN values.
for feature in list_of_features:

    is_missing = mip_prediction_df[feature].isnull()
    missing_count = int(is_missing.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        # Fill only after reporting so the printed count reflects the raw data.
        mip_prediction_df.loc[is_missing, feature] = 0

## Split into test and training data

In [None]:
# Hand-picked column subset used to train the MIP model.
# The final entry is not a predictor: it is the regression target, which the
# train/test cell separates out of this selection as `y`.
list_of_features_filtered = ['Regular Season Advanced Player Stats - Value Above Replacement Player Delta',
 'Regular Season Basic Player Stats - Points Per Game Delta',
 'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked',
 'Regular Season Advanced Player Stats - Usage % For Season Ranked Delta',
 'Regular Season Advanced Player Stats - Usage % For Season Delta',
 'Regular Season Advanced Player Stats - Player Efficiency Delta',
 'Regular Season Basic Player Stats - Field Goals Attempted Per Game Delta',
 'Regular Season Basic Player Stats - Points Per Game Ranked',
 'Regular Season Advanced Player Stats - Steal Percentage for Season Ranked',
 'Regular Season Basic Player Stats - Games Started',
 'Regular Season Advanced Player Stats - Win Shares',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game',
 'Regular Season Basic Player Stats - Average 2 Point % For Season Delta',
 'Regular Season Team Advanced Stats - Net Rating Ranked',
 'Regular Season Advanced Player Stats - Player Efficiency Ranked Delta',
 'Regular Season Advanced Player Stats - Offensive Rebounding % Ranked Delta',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game Delta',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season Delta',
  'MIP Vote Results - % of Total Voting Points Possible That Were Won']

In [None]:
list_of_features

In [None]:
# Regression target: share of the possible MIP voting points a player won.
target_column = 'MIP Vote Results - % of Total Voting Points Possible That Were Won'

# Narrow the frame to the selected predictors plus the target.
mip_prediction_df_features = mip_prediction_df.loc[:, list_of_features_filtered]

# Predictors (everything except the target) and the target itself.
X = mip_prediction_df_features.drop(columns=[target_column])
y = mip_prediction_df_features[target_column]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Initialize and Train the GBM Model

In [None]:
# Gradient-boosted regressor with a fixed seed so repeated runs match.
gbm_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

# Fit on the training split only; the held-out rows are scored in the next cell.
gbm_regressor.fit(X_train, y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Score the held-out rows.
y_pred = gbm_regressor.predict(X_test)

# Compare the held-out predictions with the actual vote shares.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

In [None]:
# # Using all features, the model had:
# Mean Squared Error: 0.001925368630181291
# R^2 Score: 0.35803670206569393

## Evaluate feature importance

In [None]:
# One row per training column, holding the fitted model's importance score.
feature_importance = pd.DataFrame(
    gbm_regressor.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
)

# Most influential features first.
feature_importance = feature_importance.sort_values('importance', ascending=False)

feature_importance.head(20)

In [None]:
mip_prediction_df.head(3)

### Populate prediction dataframe with predictions

In [None]:
# Predict, for every row (training and held-out alike), the share of the
# possible MIP voting points won. The target column is dropped first so the
# model receives exactly the predictor columns.
all_rows_model_inputs = mip_prediction_df_features.drop(
    columns=['MIP Vote Results - % of Total Voting Points Possible That Were Won']
)
predicted_votes = gbm_regressor.predict(all_rows_model_inputs)

# Store the predictions next to the actual results.
mip_prediction_df.loc[:,'Predicted Votes'] = predicted_votes

In [None]:
mip_prediction_df.head(5)

In [None]:
# Actual versus predicted vote share, highest predictions first.
predicted_votes_review_columns = [
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Basic Player Stats - Player',
    'MIP Vote Results - % of Total Voting Points Possible That Were Won',
    'Predicted Votes',
]
mip_prediction_df.loc[:, predicted_votes_review_columns].sort_values('Predicted Votes', ascending=False).head(200)

In [None]:
# Residual: positive when a player won a larger vote share than predicted.
mip_prediction_df['Actual Vote - Predicted'] = (
    mip_prediction_df['MIP Vote Results - % of Total Voting Points Possible That Were Won']
    - mip_prediction_df['Predicted Votes']
)

# Largest under-predictions first.
residual_review_columns = [
    'Regular Season Team Standings - Overall Team Record',
    'Regular Season Basic Player Stats - Player',
    'MIP Vote Results - % of Total Voting Points Possible That Were Won',
    'Predicted Votes',
    'Actual Vote - Predicted',
    'Team_name_and_year_concat',
]
mip_prediction_df.loc[:, residual_review_columns].sort_values('Actual Vote - Predicted', ascending=False).head(200)

In [None]:
# Flag columns: 1 where the flag applies, NaN everywhere else.
mip_prediction_df['Predicted MIP Candidate'] = np.nan
mip_prediction_df['Predicted MIP Winner'] = np.nan

for year in list(mip_prediction_df['Regular Season Basic Player Stats - Year'].unique()):

    # The 15 rows with the highest predicted vote share in this season.
    year_subset = (
        mip_prediction_df
        .loc[mip_prediction_df['Regular Season Basic Player Stats - Year'] == year]
        .sort_values('Predicted Votes', ascending=False)
        .head(15)
    )

    # Mark those rows as predicted MIP candidates in mip_prediction_df.
    mip_prediction_df.loc[year_subset.index.tolist(), 'Predicted MIP Candidate'] = 1

    # The top row of the ranking is the predicted MIP winner for the season.
    year_subset = year_subset.head(1)

    mip_prediction_df.loc[year_subset.index.tolist(), 'Predicted MIP Winner'] = 1

In [None]:
# Column in mip_prediction_df -> column it feeds in the merged player stats.
mip_prediction_column_map = {
    'Predicted MIP Candidate': 'Predicted MIP Candidate',
    'Predicted MIP Winner': 'Predicted MIP Winner',
    'Predicted Votes': 'Predicted % of MIP Votes',
}

# Create the destination columns empty first.
for destination_column in mip_prediction_column_map.values():
    all_regular_season_player_stats_merged_df[destination_column] = np.nan

# Copy values across by index label, only for rows where the source is populated.
for source_column, destination_column in mip_prediction_column_map.items():
    populated_row_labels = mip_prediction_df.loc[mip_prediction_df[source_column].notnull()].index.tolist()
    all_regular_season_player_stats_merged_df.loc[populated_row_labels, destination_column] = mip_prediction_df[source_column]

In [None]:
all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df['Predicted MIP Winner'].notnull()].head(10)

In [None]:
# # Create delta column in merged_df
# for column in list_of_columns_to_delta:
    
#     all_regular_season_player_stats_merged_df[f'{column} Delta'] = np.nan
    
# # Passthrough delta columns values
# for column in list_of_columns_to_delta:
    
#     indices = mip_prediction_df.index.tolist()
#     all_regular_season_player_stats_merged_df.loc[indices,f'{column} Delta'] = mip_prediction_df[f'{column} Delta']

In [None]:
# Collects the index label of each current-season row for which the loop later
# in this cell finds no previous-season row for the same player.
list_of_indices_where_no_previous_year_data = []

# Stat columns that get a companion '<column> Delta' column holding
# (current season value - previous season value).
list_of_columns_to_delta = [
 'Regular Season Basic Player Stats - Games Played',
 'Regular Season Basic Player Stats - Games Started',
 'Regular Season Basic Player Stats - Minutes Played Per Game',
 'Regular Season Basic Player Stats - Field Goals Made Per Game',
 'Regular Season Basic Player Stats - Field Goals Attempted Per Game',
 'Regular Season Basic Player Stats - Average Field Goal % For Season',
 'Regular Season Basic Player Stats - 3 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 3 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 3 Point % For Season',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 2 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 2 Point % For Season',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season',
 'Regular Season Basic Player Stats - Free Throws Made Per Game',
 'Regular Season Basic Player Stats - Free Throw Attempts Per Game',
 'Regular Season Basic Player Stats - Average Free Throw % For Season',
 'Regular Season Basic Player Stats - Offense Rebounds Per Game',
 'Regular Season Basic Player Stats - Defense Rebounds Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game',
 'Regular Season Basic Player Stats - Assists Per Game',
 'Regular Season Basic Player Stats - Steals Per Game',
 'Regular Season Basic Player Stats - Blocks Per Game',
 'Regular Season Basic Player Stats - Turnovers Per Game',
 'Regular Season Basic Player Stats - Personal Fouls Per Game',
 'Regular Season Basic Player Stats - Points Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game Ranked',
 'Regular Season Basic Player Stats - Assists Per Game Ranked',
 'Regular Season Basic Player Stats - Steals Per Game Ranked',
 'Regular Season Basic Player Stats - Blocks Per Game Ranked',
 'Regular Season Basic Player Stats - Points Per Game Ranked',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Minutes Played During Season',
 'Regular Season Advanced Player Stats - Player Efficiency',
 'Regular Season Advanced Player Stats - True Shooting % For Season',
 'Regular Season Advanced Player Stats - 3 Point Attempt Rate',
 'Regular Season Advanced Player Stats - Free Throw Rate for Season',
 'Regular Season Advanced Player Stats - Offensive Rebounding %',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage',
 'Regular Season Advanced Player Stats - Total Rebounding %',
 'Regular Season Advanced Player Stats - Assist % For Season',
 'Regular Season Advanced Player Stats - Steal Percentage for Season',
 'Regular Season Advanced Player Stats - Block %',
 'Regular Season Advanced Player Stats - Turnover % for Season',
 'Regular Season Advanced Player Stats - Usage % For Season',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Win Shares',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Value Above Replacement Player',
 'Regular Season Advanced Player Stats - Player Efficiency Ranked',
 'Regular Season Advanced Player Stats - True Shooting % For Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage Ranked',
 'Regular Season Advanced Player Stats - Total Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Assist % For Season Ranked',
 'Regular Season Advanced Player Stats - Steal Percentage for Season Ranked',
 'Regular Season Advanced Player Stats - Block % Ranked',
 'Regular Season Advanced Player Stats - Turnover % for Season Ranked',
 'Regular Season Advanced Player Stats - Usage % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Win Shares Ranked',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus Ranked',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked']

player_id_to_delta_value_dict = []

# Every '<column> Delta' column starts out as NaN; only rows for which a
# previous-season row is found get a real value in the loop below.
for column in list_of_columns_to_delta:
    all_regular_season_player_stats_merged_df[f'{column} Delta'] = np.nan

year_column = 'Regular Season Basic Player Stats - Year'

# Walk season by season, comparing each player with the season before.
for year in list(all_regular_season_player_stats_merged_df[year_column].unique()):

    # 2004 is skipped outright, so its delta columns stay NaN.
    if year == 2004:
        continue

    previous_year = year - 1

    # Work on copies so the helper id column never lands on the merged frame.
    current_subset_df = all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df[year_column] == year].copy()
    previous_subset_df = all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df[year_column] == previous_year].copy()

    # The player id is the text before the first '-' of the id/year key.
    current_subset_df['Temp_player_id'] = current_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]
    previous_subset_df['Temp_player_id'] = previous_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]

    for unique_player in list(current_subset_df['Temp_player_id'].unique()):

        current_rows_for_player = current_subset_df.loc[current_subset_df['Temp_player_id'] == unique_player]
        previous_rows_for_player = previous_subset_df.loc[previous_subset_df['Temp_player_id'] == unique_player]

        # Only the first matching row of the current season receives deltas.
        index_for_current = current_rows_for_player.index[0]

        # No previous-season row: report it, remember the row, move on.
        if len(previous_rows_for_player) == 0:
            print(f'{unique_player} in {year} does not seem to have previous year data.')
            list_of_indices_where_no_previous_year_data.append(index_for_current)
            continue

        # Only the first matching row of the previous season is used as baseline.
        index_for_previous = previous_rows_for_player.index[0]

        for column in list_of_columns_to_delta:
            # delta = current season value - previous season value
            all_regular_season_player_stats_merged_df.loc[index_for_current, f'{column} Delta'] = (
                current_subset_df.loc[index_for_current, column]
                - previous_subset_df.loc[index_for_previous, column]
            )

# Predict 2024 MIP

### Add in 2023 data to calculate deltas. First, create dataframe with just 2023 data

In [None]:
# Independent copy of the 2023 season rows; they act as the "previous year"
# when the 2024 deltas are calculated.
all_regular_season_player_stats_merged_df_2023 = all_regular_season_player_stats_merged_df.loc[all_regular_season_player_stats_merged_df['Regular Season Basic Player Stats - Year']==2023].copy()

### Concat 2023 and 2024 Data 

In [None]:
# Put the 2023 season in front of the 2024 rows so year-over-year deltas can be
# computed from a single frame.
#
# Any 2023 rows already present in the 2024 frame are removed first. The
# previous version concatenated onto the very variable it reassigns, so running
# this cell a second time appended the 2023 rows again and duplicated them.
#
# NOTE(review): concat keeps each input's original index labels, and later cells
# write deltas with .loc by label -- confirm that the 2023 and 2024 frames never
# share an index label.
rows_2024_only = all_regular_season_player_stats_merged_df_2024.loc[
    all_regular_season_player_stats_merged_df_2024['Regular Season Basic Player Stats - Year'] != 2023
]
all_regular_season_player_stats_merged_df_2024 = pd.concat([all_regular_season_player_stats_merged_df_2023, rows_2024_only])

In [None]:
# Attach team-level stats to every player row (left join keeps all players).
mip_2024_prediction_df = all_regular_season_player_stats_merged_df_2024.merge(
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# merge() hands back a fresh positional index; restore the player frame's labels
# so later label-based writes line up with the source rows.
mip_2024_prediction_df = mip_2024_prediction_df.set_index(all_regular_season_player_stats_merged_df_2024.index)

In [None]:
mip_2024_prediction_df

## Calculate delta columns for 2024

In [None]:
# Collects the index label of each current-season row for which the loop later
# in this cell finds no previous-season row for the same player.
list_of_indices_where_no_previous_year_data = []

# Stat columns that get a companion '<column> Delta' column holding
# (current season value - previous season value).
list_of_columns_to_delta = [
 'Regular Season Basic Player Stats - Games Played',
 'Regular Season Basic Player Stats - Games Started',
 'Regular Season Basic Player Stats - Minutes Played Per Game',
 'Regular Season Basic Player Stats - Field Goals Made Per Game',
 'Regular Season Basic Player Stats - Field Goals Attempted Per Game',
 'Regular Season Basic Player Stats - Average Field Goal % For Season',
 'Regular Season Basic Player Stats - 3 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 3 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 3 Point % For Season',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game',
 'Regular Season Basic Player Stats - 2 Point Attempts Per Game',
 'Regular Season Basic Player Stats - Average 2 Point % For Season',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season',
 'Regular Season Basic Player Stats - Free Throws Made Per Game',
 'Regular Season Basic Player Stats - Free Throw Attempts Per Game',
 'Regular Season Basic Player Stats - Average Free Throw % For Season',
 'Regular Season Basic Player Stats - Offense Rebounds Per Game',
 'Regular Season Basic Player Stats - Defense Rebounds Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game',
 'Regular Season Basic Player Stats - Assists Per Game',
 'Regular Season Basic Player Stats - Steals Per Game',
 'Regular Season Basic Player Stats - Blocks Per Game',
 'Regular Season Basic Player Stats - Turnovers Per Game',
 'Regular Season Basic Player Stats - Personal Fouls Per Game',
 'Regular Season Basic Player Stats - Points Per Game',
 'Regular Season Basic Player Stats - Total Rebounds Per Game Ranked',
 'Regular Season Basic Player Stats - Assists Per Game Ranked',
 'Regular Season Basic Player Stats - Steals Per Game Ranked',
 'Regular Season Basic Player Stats - Blocks Per Game Ranked',
 'Regular Season Basic Player Stats - Points Per Game Ranked',
 'Regular Season Basic Player Stats - Average Effective Field Goal % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Minutes Played During Season',
 'Regular Season Advanced Player Stats - Player Efficiency',
 'Regular Season Advanced Player Stats - True Shooting % For Season',
 'Regular Season Advanced Player Stats - 3 Point Attempt Rate',
 'Regular Season Advanced Player Stats - Free Throw Rate for Season',
 'Regular Season Advanced Player Stats - Offensive Rebounding %',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage',
 'Regular Season Advanced Player Stats - Total Rebounding %',
 'Regular Season Advanced Player Stats - Assist % For Season',
 'Regular Season Advanced Player Stats - Steal Percentage for Season',
 'Regular Season Advanced Player Stats - Block %',
 'Regular Season Advanced Player Stats - Turnover % for Season',
 'Regular Season Advanced Player Stats - Usage % For Season',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season',
 'Regular Season Advanced Player Stats - Win Shares',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season',
 'Regular Season Advanced Player Stats - Value Above Replacement Player',
 'Regular Season Advanced Player Stats - Player Efficiency Ranked',
 'Regular Season Advanced Player Stats - True Shooting % For Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Defensive Rebound Percentage Ranked',
 'Regular Season Advanced Player Stats - Total Rebounding % Ranked',
 'Regular Season Advanced Player Stats - Assist % For Season Ranked',
 'Regular Season Advanced Player Stats - Steal Percentage for Season Ranked',
 'Regular Season Advanced Player Stats - Block % Ranked',
 'Regular Season Advanced Player Stats - Turnover % for Season Ranked',
 'Regular Season Advanced Player Stats - Usage % For Season Ranked',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Total Defensive Win Shares For Season Ranked',
 'Regular Season Advanced Player Stats - Win Shares Ranked',
 'Regular Season Advanced Player Stats - Win Shares Per 48 Minutes Played For the Season Ranked',
 'Regular Season Advanced Player Stats - Offensive Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Defensive Box Plus/Minus Ranked',
 'Regular Season Advanced Player Stats - Box Plus/Minus For Season Ranked',
 'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked']

player_id_to_delta_value_dict = []

# Every '<column> Delta' column is (re)set to NaN; only rows for which a
# previous-season row is found get a real value in the loop below.
for column in list_of_columns_to_delta:
    mip_2024_prediction_df[f'{column} Delta'] = np.nan

year_column = 'Regular Season Basic Player Stats - Year'

# Walk season by season, comparing each player with the season before.
for year in list(mip_2024_prediction_df[year_column].unique()):

    # 2023 rows are present only as the baseline for 2024, so 2023 is never
    # treated as a "current" season and its delta columns stay NaN.
    if year == 2023:
        continue

    previous_year = year - 1

    # Work on copies so the helper id column never lands on mip_2024_prediction_df.
    current_subset_df = mip_2024_prediction_df.loc[mip_2024_prediction_df[year_column] == year].copy()
    previous_subset_df = mip_2024_prediction_df.loc[mip_2024_prediction_df[year_column] == previous_year].copy()

    # The player id is the text before the first '-' of the id/year key.
    current_subset_df['Temp_player_id'] = current_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]
    previous_subset_df['Temp_player_id'] = previous_subset_df['Player_unique_id_and_year_concat'].str.split('-').str[0]

    for unique_player in list(current_subset_df['Temp_player_id'].unique()):

        current_rows_for_player = current_subset_df.loc[current_subset_df['Temp_player_id'] == unique_player]
        previous_rows_for_player = previous_subset_df.loc[previous_subset_df['Temp_player_id'] == unique_player]

        # Only the first matching row of the current season receives deltas.
        index_for_current = current_rows_for_player.index[0]

        # No previous-season row: report it, remember the row, move on.
        if len(previous_rows_for_player) == 0:
            print(f'{unique_player} in {year} does not seem to have previous year data.')
            list_of_indices_where_no_previous_year_data.append(index_for_current)
            continue

        # Only the first matching row of the previous season is used as baseline.
        index_for_previous = previous_rows_for_player.index[0]

        for column in list_of_columns_to_delta:
            # delta = current season value - previous season value
            mip_2024_prediction_df.loc[index_for_current, f'{column} Delta'] = (
                current_subset_df.loc[index_for_current, column]
                - previous_subset_df.loc[index_for_previous, column]
            )

### Drop rows that have null deltas (because no prior-year data was found)

In [None]:
# Keep only the rows whose Points Per Game delta was populated, i.e. rows for
# which the delta loop found a previous-season row to compare against.
has_previous_season_delta = mip_2024_prediction_df['Regular Season Basic Player Stats - Points Per Game Delta'].notnull()
mip_2024_prediction_df = mip_2024_prediction_df.loc[has_previous_season_delta]

In [None]:
mip_2024_prediction_df.head(10)

In [None]:
# Columns that must not be offered to the model: names/ids, team bookkeeping,
# arena/attendance figures and the prediction flag columns.
list_of_columns_to_exclude = [
    'Regular Season Basic Player Stats - Player',
    'Regular Season Basic Player Stats - Team',
    'Regular Season Basic Team Stats - Team',
    'Regular Season Basic Player Stats - Standardized/Modernized Team Name',
    'Player_unique_id_and_year_concat',
    'Team_name_and_year_concat',
    'Regular Season Basic Team Stats - Games Played',
    'Regular Season Basic Team Opponent Stats - Games Played',
    'Regular Season Team Advanced Stats - Arena',
    'Regular Season Team Advanced Stats - Total Arena Attendance',
    'Regular Season Team Advanced Stats - Attendance Per Game',
    'Regular Season Basic Team Stats - Year',
    'Predicted MIP Candidate',
    'Predicted MIP Winner',
]

# Start from every column and strip the excluded ones. list.remove raises
# ValueError when a listed column is absent, which surfaces naming drift early.
list_of_features = list(mip_2024_prediction_df.columns)

for column in list_of_columns_to_exclude:
    list_of_features.remove(column)

# Print the surviving feature names, one per block, for a visual check.
for feature in list_of_features:
    print(f"\n    {feature}")

In [None]:
list_of_features

## Check for NaN values, report them, and fill them with 0

In [None]:
# Report how many NaN values each feature holds, then replace them with 0.
# The mask and count are taken BEFORE the fill: the previous version filled
# first and counted afterwards, so the message always reported 0 NaN values.
for feature in list_of_features:

    is_missing = mip_2024_prediction_df[feature].isnull()
    missing_count = int(is_missing.sum())

    if missing_count > 0:

        print(f"""
        {feature} contains {missing_count} NaN values """)

        # Fill only after reporting so the printed count reflects the raw data.
        mip_2024_prediction_df.loc[is_missing, feature] = 0

In [None]:
# Predictor columns fed to the already-fitted model for the 2024 season.
# No target column is listed here: this frame is only used for prediction.
list_of_features_filtered = ['Regular Season Advanced Player Stats - Value Above Replacement Player Delta',
 'Regular Season Basic Player Stats - Points Per Game Delta',
 'Regular Season Advanced Player Stats - Value Above Replacement Player Ranked',
 'Regular Season Advanced Player Stats - Usage % For Season Ranked Delta',
 'Regular Season Advanced Player Stats - Usage % For Season Delta',
 'Regular Season Advanced Player Stats - Player Efficiency Delta',
 'Regular Season Basic Player Stats - Field Goals Attempted Per Game Delta',
 'Regular Season Basic Player Stats - Points Per Game Ranked',
 'Regular Season Advanced Player Stats - Steal Percentage for Season Ranked',
 'Regular Season Basic Player Stats - Games Started',
 'Regular Season Advanced Player Stats - Win Shares',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game',
 'Regular Season Basic Player Stats - Average 2 Point % For Season Delta',
 'Regular Season Team Advanced Stats - Net Rating Ranked',
 'Regular Season Advanced Player Stats - Player Efficiency Ranked Delta',
 'Regular Season Advanced Player Stats - Offensive Rebounding % Ranked Delta',
 'Regular Season Basic Player Stats - 2 Pointers Made Per Game Delta',
 'Regular Season Advanced Player Stats - Total Offensive Win Shares For Season Delta']

In [None]:
# Model inputs for 2024: every row, restricted to the columns (and column order)
# given by list_of_features_filtered.
mip_2024_prediction_df_features = mip_2024_prediction_df.loc[:,list_of_features_filtered]

In [None]:
# Run the fitted regressor on the 2024 rows. Its output is the predicted share
# of the possible MIP voting points, not a count of first-place votes.
predicted_votes = gbm_regressor.predict(X=mip_2024_prediction_df_features)

# Store the predictions alongside the 2024 player rows.
mip_2024_prediction_df.loc[:,'Predicted Votes'] = predicted_votes

In [None]:
mip_2024_prediction_df.sort_values('Predicted Votes',ascending=False).head(15)

In [None]:
# Copy the predicted vote share onto the final 2024 frame, matching rows by
# index label.
rows_with_prediction = mip_2024_prediction_df.index.tolist()
all_regular_season_player_stats_merged_df_2024_final.loc[rows_with_prediction, 'MIP Predicted Vote %'] = mip_2024_prediction_df.loc[rows_with_prediction, 'Predicted Votes']

In [None]:
# Names of the delta columns, in the same order as the source stat columns.
delta_column_names = [f'{column} Delta' for column in list_of_columns_to_delta]

# Rows of the final frame that have a counterpart in mip_2024_prediction_df.
rows_with_deltas = mip_2024_prediction_df.index.tolist()

# First pass: create every delta column empty on the final 2024 frame.
for name_of_column in delta_column_names:
    all_regular_season_player_stats_merged_df_2024_final[name_of_column] = np.nan

# Second pass: copy the computed deltas across by index label.
for name_of_column in delta_column_names:
    all_regular_season_player_stats_merged_df_2024_final.loc[rows_with_deltas, name_of_column] = mip_2024_prediction_df.loc[rows_with_deltas, name_of_column]

In [None]:
# Restrict the ranking to players with more than 55 games played.
# NOTE(review): the 55-game cutoff is hard-coded and its rationale is not stated
# here -- confirm the intended threshold.
games_played = all_regular_season_player_stats_merged_df_2024_final['Regular Season Basic Player Stats - Games Played']
all_regular_season_player_stats_merged_df_2024_final_filtered = all_regular_season_player_stats_merged_df_2024_final.loc[games_played > 55]

# Ten highest predicted MIP vote shares among the remaining players.
all_regular_season_player_stats_merged_df_2024_final_filtered.sort_values('MIP Predicted Vote %', ascending=False).head(10)

In [None]:
# Attach team-level stats to the final 2024 player predictions (left join keeps
# every player row).
nba_award_predictions_df = all_regular_season_player_stats_merged_df_2024_final.merge(
    all_regular_season_team_stats_merged_df_2024,
    how='left',
    on='Team_name_and_year_concat',
)

# Write the combined table to an Excel workbook in the working directory.
nba_award_predictions_df.to_excel('2024 NBA Award Predictions.xlsx')

# Predicting 2024 NBA Playoffs 
 <a class="anchor" id="Step39"></a>

In [None]:
# Left-join playoff standings onto the regular-season team stats; team-seasons
# without a matching standings row keep NaN in the playoff columns.
team_playoff_prediction_df = all_regular_season_team_stats_merged_df.merge(nba_team_playoff_standing,how='left',on='Team_name_and_year_concat')

def win_function(column_value):
    """Return the win count encoded at the start of a ``'W-L'`` record string.

    Values that ``pd.isna`` reports as missing count as 0 wins. Otherwise the
    text before the first ``'-'`` is parsed as an integer and returned.
    """
    if not pd.isna(column_value):

        wins_text = column_value.split('-')[0]

        return int(wins_text)

    # No record available for this row
    return 0

team_playoff_prediction_df['Team Playoff Wins'] = team_playoff_prediction_df['Team Playoff Standings - Overall Record'].apply(win_function)

In [None]:
team_playoff_prediction_df.head(1)

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns that must not be used as model features: identifiers, arena and
# attendance columns, and the 'Team Playoff Standings' / 'Playoff Result'
# columns that come from the playoff standings merge.
list_of_columns_to_exclude = ['Regular Season Basic Team Stats - Team',
                              'Team_name_and_year_concat',
                              'Regular Season Basic Team Stats - Year',
                              'Regular Season Team Advanced Stats - Arena',
                              'Regular Season Team Advanced Stats - Total Arena Attendance',
                              'Regular Season Team Advanced Stats - Attendance Per Game',
                              'Team Playoff Standings - Rank',
                              'Team Playoff Standings - Team',
                              'Team Playoff Standings - Overall Record',
                              'Team Playoff Standings - Record at home',
                              'Team Playoff Standings - Record on the road',
                              'Team Playoff Standings - Record against Eastern Conference',
                              'Team Playoff Standings - Record against Western Conference',
                              'Team Playoff Standings - Year',
                              'Team Playoff Standings - Team Name Refined - accounts for players being traded',
                              'Team Playoff Standings - Standardized/Modernized Team Name',
                              'Playoff Result'
                             ]

# Start from every column of the frame, then strip the excluded ones
list_of_features = list(team_playoff_prediction_df.columns)

# list.remove raises ValueError if a listed column is not present in the frame,
# so a renamed or missing column surfaces here rather than silently passing.
for column in list_of_columns_to_exclude:
    
    list_of_features.remove(column)
    
# Print the remaining candidate features, one per line, for visual review
for feature in list_of_features:
    
    print(f"""
    {feature}""")

## Check for NaN values and review

In [None]:
# Replace missing values in each feature column with 0 and report how many
# values were replaced. The count is taken BEFORE the fill; counting afterwards
# (as this cell previously did) always reports 0.
for feature in list_of_features:

    missing_mask = team_playoff_prediction_df[feature].isnull()

    number_of_missing_values = int(missing_mask.sum())

    if number_of_missing_values > 0:

        team_playoff_prediction_df.loc[missing_mask,feature] = 0

        print(f"""
        {feature} contains {number_of_missing_values} NaN values """)

## Split into test and training data

In [None]:
list_of_features_filtered = [
"Regular Season Team Standings - Overall Team Record Ranked",
                             'Team Playoff Wins']

In [None]:
# Restrict the frame to the columns named in list_of_features_filtered
# (the predictor column(s) plus the 'Team Playoff Wins' target).
team_playoff_prediction_df_features = team_playoff_prediction_df[list_of_features_filtered]

# Target: playoff wins. Predictors: every remaining column.
y = team_playoff_prediction_df_features['Team Playoff Wins']
X = team_playoff_prediction_df_features.drop(columns=['Team Playoff Wins'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Initialize and Train the GBM Model

In [None]:
# Initialize the GBM regressor (100 trees, depth 3, fixed seed for repeatability).
# NOTE(review): this rebinds `gbm_regressor`, the same name the MIP prediction
# cell earlier in the notebook calls .predict on. Re-running that cell after this
# one would use the playoff-wins model instead - consider a distinct name.
gbm_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model on the training split; the target is 'Team Playoff Wins'
gbm_regressor.fit(X_train, y_train)

## Make Predictions and Evaluate the Model

In [None]:
# Make predictions
y_pred = gbm_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

In [None]:
# # All features had 
# Mean Squared Error: 0.001925368630181291
# R^2 Score: 0.35803670206569393

## Evaluate feature importance

In [None]:
feature_importance = pd.DataFrame(gbm_regressor.feature_importances_, index=X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

list(feature_importance.head(10).index)

### Populate prediction dataframe with predictions

In [None]:
# Predict playoff wins for every row of the feature frame. This includes the
# rows the model was trained on, so these values are not an out-of-sample check.
predicted_votes = gbm_regressor.predict(team_playoff_prediction_df_features.drop('Team Playoff Wins', axis=1))

# Store the predictions on the team frame. The variable and column keep the
# "votes" name from the award-prediction cells, but the values are predicted
# playoff wins (the model above was fit with 'Team Playoff Wins' as target).
team_playoff_prediction_df.loc[:,'Predicted Votes'] = predicted_votes

In [None]:
team_playoff_prediction_df.sort_values('Predicted Votes',ascending=False)

# Predicting 2024 Playoffs

In [None]:
# Work on an independent copy. Plain assignment only creates a second name for
# the same DataFrame, so the NaN fill and the 'Predicted Votes' column written
# in the cells below would otherwise modify
# all_regular_season_team_stats_merged_df_2024 itself.
playoff_prediction_2024 = all_regular_season_team_stats_merged_df_2024.copy()

# Prep the data

## Define list of features (exclude data not to be used in prediction) 

In [None]:
# Columns of the 2024 frame that are not model features (identifiers, arena
# and attendance columns). This overwrites the exclusion list defined for the
# training frame earlier in the notebook.
list_of_columns_to_exclude = ['Regular Season Basic Team Stats - Team',
                              'Team_name_and_year_concat',
                              'Regular Season Basic Team Stats - Year',
                              'Regular Season Team Advanced Stats - Arena',
                              'Regular Season Team Advanced Stats - Total Arena Attendance',
                              'Regular Season Team Advanced Stats - Attendance Per Game'
                             ]

# Start from every column of the 2024 frame, then strip the excluded ones
list_of_features = list(playoff_prediction_2024.columns)

# list.remove raises ValueError if a listed column is absent from the frame
for column in list_of_columns_to_exclude:
    
    list_of_features.remove(column)
    
# Print the remaining candidate features, one per line, for visual review
for feature in list_of_features:
    
    print(f"""
    {feature}""")

## Check for NaN values and review

In [None]:
# Replace missing values in each 2024 feature column with 0 and report how many
# values were replaced. The count is taken BEFORE the fill; counting afterwards
# (as this cell previously did) always reports 0.
for feature in list_of_features:

    missing_mask = playoff_prediction_2024[feature].isnull()

    number_of_missing_values = int(missing_mask.sum())

    if number_of_missing_values > 0:

        playoff_prediction_2024.loc[missing_mask,feature] = 0

        print(f"""
        {feature} contains {number_of_missing_values} NaN values """)

In [None]:
list_of_features_filtered = ["Regular Season Team Standings - Overall Team Record Ranked"]

In [None]:
playoff_prediction_2024_features = playoff_prediction_2024.loc[:,list_of_features_filtered]

In [None]:
# Run the current `gbm_regressor` on the 2024 feature frame.
# NOTE(review): this assumes `gbm_regressor` is still the model fit on
# 'Team Playoff Wins' in the playoff section above - confirm cell run order.
predicted_votes = gbm_regressor.predict(playoff_prediction_2024_features)

# Store the predictions on the 2024 frame. The variable and column keep the
# "votes" name from the award-prediction cells.
playoff_prediction_2024.loc[:,'Predicted Votes'] = predicted_votes

In [None]:
playoff_prediction_2024.sort_values('Predicted Votes',ascending=False).head(30)

#  PLAYER AWARDS MERGE. Merge regular season player stats with player awards

 <a class="anchor" id="Step40"></a>

### Add MVP

In [None]:
# Left-join the MVP vote frame onto the merged regular-season player stats,
# keyed on the player-id/year string, and report how many columns were added.
# NOTE(review): only the column count is checked. A left merge also adds rows
# when the right frame repeats a key - confirm mvp_vote_df keys are unique.
number_of_columns_before = len(all_regular_season_player_stats_merged_df.columns)

all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df.merge(mvp_vote_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add ROY

In [None]:
# Left-join the ROY vote frame onto the player+awards frame (same
# player-id/year key) and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# The result is assigned back to the same name, so re-running this cell alone
# merges roy_vote_df a second time.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(roy_vote_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add MIP

In [None]:
# Left-join the MIP frame onto the player+awards frame (player-id/year key)
# and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges mip_df again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(mip_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add 6MAN

In [None]:
# Left-join the Sixth Man frame onto the player+awards frame (player-id/year
# key) and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges six_man_df again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(six_man_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add DPOY

In [None]:
# Left-join the DPOY frame onto the player+awards frame (player-id/year key)
# and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges dpoy_df again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(dpoy_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add Finals MVP

In [None]:
# Left-join the Finals MVP frame onto the player+awards frame (player-id/year
# key) and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges finals_mvp again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(finals_mvp,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add All NBA

In [None]:
# Left-join the All-NBA frame onto the player+awards frame (player-id/year key)
# and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges all_nba_df again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(all_nba_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# A stray "### Add All NBA" markdown heading had been pasted into this message;
# it is removed so the printed report matches the other merge cells.
print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add All Defense

In [None]:
# Left-join the All-Defense frame onto the player+awards frame (player-id/year
# key) and report how many columns the join added.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

# Assigned back to the same name: re-running this cell alone merges all_defense_df again.
all_regular_season_player_stats_merged_df_plus_awards_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(all_defense_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

#  PLAYOFFS PLAYER STATS MERGE. Create merged dataframe
 <a class="anchor" id="Step41"></a>

### Merge basic with advanced stats

In [None]:
# Start the merged playoff player frame: basic playoff stats left-joined with
# advanced playoff stats on the player-id/year key, then report column growth.
# NOTE(review): unlike the shooting and play-by-play merges below, the right
# frame's 'Team_name_and_year_concat' is not dropped here. If both frames carry
# that column, pandas will suffix it with _x/_y - confirm.
number_of_columns_before = len(player_basic_playoff_stats_df.columns)

all_playoffs_player_stats_merged_df = player_basic_playoff_stats_df.merge(player_advanced_stats_in_playoffs_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_playoffs_player_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add player shooting stats

In [None]:
# Left-join playoff shooting stats onto the merged playoff player frame and
# report how many columns the join added.
number_of_columns_before = len(all_playoffs_player_stats_merged_df.columns)

# The team/year key is dropped from the right frame before merging so it is
# not brought in a second time.
all_playoffs_player_stats_merged_df = all_playoffs_player_stats_merged_df.merge(player_shooting_stats_in_playoffs_df.drop('Team_name_and_year_concat',axis=1),how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_playoffs_player_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add player play-by-play stats

In [None]:
# Left-join playoff play-by-play stats onto the merged playoff player frame
# and report how many columns the join added.
number_of_columns_before = len(all_playoffs_player_stats_merged_df.columns)

# The team/year key is dropped from the right frame before merging so it is
# not brought in a second time.
all_playoffs_player_stats_merged_df = all_playoffs_player_stats_merged_df.merge(player_play_by_play_stats_in_playoffs_df.drop('Team_name_and_year_concat',axis=1),how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_playoffs_player_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# PLAYOFFS TEAM STATS MERGE

 <a class="anchor" id="Step42"></a>

### Merge playoffs basic stats for team with playoffs basic stats for opponents

In [None]:
# Start the merged playoff team frame: team basic playoff stats left-joined
# with the opponent basic playoff stats on the team/year key, then report
# how many columns the join added.
number_of_columns_before = len(nba_team_basic_stat_in_playoffs_df.columns)

all_playoff_team_stats_merged_df = nba_team_basic_stat_in_playoffs_df.merge(nba_team_basic_opponent_stat_in_playoffs_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_playoff_team_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add playoffs advanced team stats

In [None]:
# Left-join playoff advanced team stats onto the merged playoff team frame
# (team/year key) and report how many columns the join added.
number_of_columns_before = len(all_playoff_team_stats_merged_df.columns)

# Assigned back to the same name: re-running this cell alone merges the frame again.
all_playoff_team_stats_merged_df = all_playoff_team_stats_merged_df.merge(nba_team_advanced_stat_in_playoffs_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_playoff_team_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add playoffs team shooting stats

In [None]:
# Left-join playoff team shooting stats onto the merged playoff team frame
# (team/year key) and report how many columns the join added.
number_of_columns_before = len(all_playoff_team_stats_merged_df.columns)

# Assigned back to the same name: re-running this cell alone merges the frame again.
all_playoff_team_stats_merged_df = all_playoff_team_stats_merged_df.merge(nba_team_shooting_in_playoffs_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_playoff_team_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add playoffs team opponent shooting stats

In [None]:
# Left-join playoff opponent shooting stats onto the merged playoff team frame
# (team/year key) and report how many columns the join added.
number_of_columns_before = len(all_playoff_team_stats_merged_df.columns)

# Assigned back to the same name: re-running this cell alone merges the frame again.
all_playoff_team_stats_merged_df = all_playoff_team_stats_merged_df.merge(nba_team_opponent_shooting_stat_in_playoffs_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_playoff_team_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

### Add team playoff standing and results

In [None]:
# Left-join the playoff standings frame onto the merged playoff team frame
# (team/year key) and report how many columns the join added.
number_of_columns_before = len(all_playoff_team_stats_merged_df.columns)

# Assigned back to the same name: re-running this cell alone merges the frame again.
all_playoff_team_stats_merged_df = all_playoff_team_stats_merged_df.merge(nba_team_playoff_standing,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_playoff_team_stats_merged_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# ALL TEAM STATS
 <a class="anchor" id="Step43"></a>

### Merge all regular season team stats with all playoff team stats

In [None]:
# Combine regular-season and playoff team stats: left join on the team/year
# key, so every regular-season row is kept and rows without a matching playoff
# row get NaN in the playoff columns. Then report the column growth.
number_of_columns_before = len(all_regular_season_team_stats_merged_df.columns)

all_team_stats_and_playoff_standings_info_df = all_regular_season_team_stats_merged_df.merge(all_playoff_team_stats_merged_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(all_team_stats_and_playoff_standings_info_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# ALL PLAYER STATS
 <a class="anchor" id="Step44"></a>

### Merge regular season player + awards with player playoff stats. 

In [None]:
# Combine regular-season player stats (with awards) and playoff player stats:
# left join on the player-id/year key, so every regular-season row is kept.
# NOTE(review): any non-key column name present in both frames is suffixed
# _x/_y by pandas - confirm the team/year key survives under its plain name,
# since the next merge joins on 'Team_name_and_year_concat'.
number_of_columns_before = len(all_regular_season_player_stats_merged_df_plus_awards_df.columns)

all_player_related_stats_df = all_regular_season_player_stats_merged_df_plus_awards_df.merge(all_playoffs_player_stats_merged_df,how='left',on='Player_unique_id_and_year_concat')

number_of_columns_after = len(all_player_related_stats_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

# COMBINE ALL PLAYER AND TEAM STATS
 <a class="anchor" id="Step45"></a>

### Merge all team-related stats with all player related stats

In [None]:
# Build the final combined frame: every player row left-joined with its team's
# regular-season + playoff stats via the team/year key. Then report how many
# columns the join added.
number_of_columns_before = len(all_player_related_stats_df.columns)

ultimate_combined_df = all_player_related_stats_df.merge(all_team_stats_and_playoff_standings_info_df,how='left',on='Team_name_and_year_concat')

number_of_columns_after = len(ultimate_combined_df.columns)

print(f"""Went from {number_of_columns_before} columns to {number_of_columns_after} columns.
      
Added {number_of_columns_after-number_of_columns_before}""")

In [None]:
all_team_stats_and_playoff_standings_info_df.loc[all_team_stats_and_playoff_standings_info_df['Regular Season Basic Team Stats - Team']=='Atlanta Hawks']

In [None]:
nba_team_basic_stat_in_regular_season_df_league_average

# Create and export special merged regular season team stats containing league averages
 <a class="anchor" id="Step46"></a>

In [None]:
# Build the league-average team table: start from the basic team stats frame
# and left-join each further frame in turn on the team/year key.
team_regular_season_df_with_league_averages = nba_team_basic_stat_in_regular_season_df_league_average

for frame_to_join in (
    nba_team_basic_opponent_stat_in_regular_season_df,
    nba_advanced_team_stats_league_average,
    nba_team_shooting_stat_in_regular_season_df_league_average,
    nba_team_opponent_shooting_stat_in_regular_season_df_league_average,
    all_playoff_team_stats_merged_df,
):
    # merge returns a new frame each time, so the starting frame is never modified
    team_regular_season_df_with_league_averages = team_regular_season_df_with_league_averages.merge(
        frame_to_join,
        how='left',
        on='Team_name_and_year_concat',
    )

In [None]:
# Columns for which a per-year "Best Flag" column is added. For each one, the
# row holding the yearly maximum is marked 'TRUE' and all others 'FALSE'.
# NOTE(review): "best" is always the MAX value. For 'Turnovers Per Game' and
# 'Defensive Rating' a lower value is usually considered better - confirm that
# flagging the maximum is intended for those columns.
list_of_columns_to_mark_best = ['Regular Season Basic Team Stats - Assists Per Game',
                               'Regular Season Basic Team Stats - Blocks Per Game',
                               'Regular Season Basic Team Stats - Steals Per Game',
                               'Regular Season Basic Team Stats - Turnovers Per Game',
                               'Regular Season Basic Team Stats - Points Per Game',
                               'Regular Season Basic Team Stats - Total Rebounds Per Game',
                               'Regular Season Team Advanced Stats - Offensive Rating',
                               'Regular Season Team Advanced Stats - Defensive Rating',
                               'Regular Season Team Advanced Stats - Net Rating',
                               'Regular Season Team Advanced Stats - Pace (Possessions per 48 Minutes)',
                               'Regular Season Team Advanced Stats - True Shooting %',
                               'Regular Season Team Advanced Stats - Average Effective Field Goal % For Season',
                               'Regular Season Team Shooting Stats - Average distance of shot',
                               'Regular Season Basic Team Stats - 3 Point Attempts Per Game',
                               'Regular Season Team Shooting Stats - 3 Pointer Attempt Rate For Season as % of total field goals attempted']

for column in list_of_columns_to_mark_best:
    
    # New column name
    new_column_name = f'{column} Best Flag'
    
    # Create new column with default value of FALSE (stored as the string 'FALSE', not a bool)
    team_regular_season_df_with_league_averages[new_column_name] = 'FALSE'
    
    # For each year, find the index for the max value for the column of interest
    for year in list(team_regular_season_df_with_league_averages['Regular Season Basic Team Stats - Year'].unique()):
        
        subset_df = team_regular_season_df_with_league_averages.loc[team_regular_season_df_with_league_averages['Regular Season Basic Team Stats - Year']==year]
    
        # idxmax returns a single index label: on ties only the first
        # occurrence of the maximum is flagged
        index_of_max = subset_df[column].idxmax()
        
        # Mark index of max
        team_regular_season_df_with_league_averages.loc[index_of_max,new_column_name] = 'TRUE'

In [None]:
team_regular_season_df_with_league_averages.to_excel('All Team Stats from 2003-2023 - March 15.xlsx')

# Export Ultimate Combined dataframe
 <a class="anchor" id="Step47"></a>

In [None]:
# list_of_columns_to_mark_best = ['Regular Season Basic Player Stats - 3 Point Attempts Per Game',
#                                'Regular Season Basic Player Stats - Average 3 Point % For Season',
#                                'Regular Season Basic Player Stats - Total Rebounds Per Game',
#                                'Regular Season Basic Player Stats - Steals Per Game',
#                                'Regular Season Basic Player Stats - Blocks Per Game',
#                                'Regular Season Basic Player Stats - Points Per Game',
#                                'Regular Season Basic Player Stats - Assists Per Game',
#                                'Regular Season Basic Player Stats - Free Throw Attempts Per Game']

# for column in list_of_columns_to_mark_best:
    
#     # New column name
#     new_column_name = f'{column} Best Flag'
    
#     # Create new column with default value of FALSE
#     ultimate_combined_df[new_column_name] = 'FALSE'
    
#     # For each year, find the index for the max value for the column of interest
#     for year in list(ultimate_combined_df['Regular Season Basic Player Stats - Year'].unique()):
        
#         subset_df = ultimate_combined_df.loc[ultimate_combined_df['Regular Season Basic Player Stats - Year']==year]
    
#         index_of_max = subset_df[column].idxmax()
        
#         # Mark index of max
#         ultimate_combined_df.loc[index_of_max,new_column_name] = 'TRUE'

In [None]:
ultimate_combined_df.to_excel('All Player and Team Stats from 2003-2023.xlsx')

In [None]:
ultimate_combined_df.loc[ultimate_combined_df['MIP Vote Results - MIP Result Indicator']==3]