In [93]:
import pandas as pd
import pickle
import os

from sklearn.preprocessing import MinMaxScaler

import time
import random

from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import playercareerstats, playerawards

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

KeyboardInterrupt: 

In [2]:
def save_dataframes(dataframes_dict, filepath):
    """Pickle ``dataframes_dict`` to ``filepath`` without ever leaving a partial file.

    The data is first written to a sibling ``<filepath>.tmp`` file and then moved
    over ``filepath`` with ``os.replace``. If pickling or writing is interrupted
    (exception, Ctrl-C), the previous contents of ``filepath`` are left untouched
    and the temporary file is removed.

    Parameters
    ----------
    dataframes_dict : object
        The object to persist (in this notebook, a dict of season -> DataFrame).
    filepath : str or os.PathLike
        Destination file. Any existing file at this path is replaced.

    Raises
    ------
    Whatever ``open``, ``pickle.dump`` or ``os.replace`` raise; the error is
    re-raised after the temporary file has been cleaned up.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(dataframes_dict, f)
        # Swap the fully written file into place in one step.
        os.replace(tmp_path, filepath)
    except BaseException:
        # BaseException so a KeyboardInterrupt during a long scrape also cleans up.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def load_dataframes(filepath):
    """Return the object pickled at ``filepath``, or ``{}`` if no such file exists.

    A short status message is printed in both cases.

    NOTE: ``pickle.load`` executes code embedded in the file; only point this
    at files this notebook wrote itself.
    """
    if not os.path.exists(filepath):
        print(f"No file found at {filepath}, starting with empty dictionary.")
        return {}

    with open(filepath, 'rb') as source:
        restored = pickle.load(source)
    print(f"DataFrames loaded from {filepath}")
    return restored

## Fetch a list of all NBA players so their names/IDs can be used to fetch their stats

In [3]:
# Static player roster from nba_api; each entry is indexed below by 'id' and 'full_name'.
nba_players = players.get_players()
player_count = len(nba_players)
print("Number of players fetched: {}".format(player_count))

Number of players fetched: 5034


## Using the list of all players, reduce the list to those who played in the modern era (i.e. since 1980)

In [4]:
# Fetch one player's career stats purely to learn the column layout of the
# first result frame; the rows themselves are discarded.
schema_probe = playercareerstats.PlayerCareerStats(
    per_mode36="Totals",
    player_id=nba_players[2]['id'],
)
cols = schema_probe.get_data_frames()[0].columns

# One empty frame per season start year, 1980 through 2023 inclusive.
modern_seasons = range(1980, 2024)
season_to_dataframe = {season: pd.DataFrame(columns=cols) for season in modern_seasons}

# Checkpoint location for the per-game scrape.
stats_type = 'PerGame'
df_filepath = f'/Users/cb/src/nba_mvp_ml/data/raw/seasons_{stats_type}'
# save_dataframes(season_to_dataframe, df_filepath)

In [5]:
# Bookkeeping for the scrape loops: players whose fetch raised vs. completed.
failed_players, succeeded_players = [], []

## Iterate over the list of players, adding seasons after 1979 to a dict of DataFrames
### Save dataframes to file in case of error

In [67]:
stats_type = 'PerGame'
df_filepath = f'/Users/cb/src/nba_mvp_ml/data/raw/seasons_{stats_type}'

# Resume from whatever has been checkpointed so far (empty dict if no file yet).
season_to_dataframe = load_dataframes(df_filepath)


# Targeted re-run for a single player; swap the loop header back to iterate
# over the full roster.
# for player in nba_players:
for player in [{'id': 76054,
  'full_name': 'Nate Archibald',
  'first_name': 'Nate',
  'last_name': 'Archibald',
  'is_active': False}]:


    try:
        # Fetch player stats
        if player['id']:
            stats = playercareerstats.PlayerCareerStats(per_mode36=stats_type, player_id=player['id'])
            df = stats.get_data_frames()[0] # 0 indexes main career stats

            # Add the player's name to the df for better human understanding,
            # placed right after the first column.
            df['PLAYER_FULLNAME'] = player['full_name'].upper()
            columns = df.columns.to_list()
            new_order = columns[:1] + ['PLAYER_FULLNAME'] + columns[1:-1]
            # Explicit copy so the SEASON_ID assignment below writes to an
            # independent frame rather than to a column-slice of the API result.
            df = df[new_order].copy()

            # Convert season column to integers for arithmetic: keep only the
            # first four characters of each SEASON_ID value.
            df['SEASON_ID'] = [int(year[:4]) for year in df['SEASON_ID']]

            # Seasons that are not keys of season_to_dataframe are dropped
            # implicitly: only rows matching a key are appended below.
            for season in season_to_dataframe.keys():
                season_to_dataframe[season] = pd.concat([season_to_dataframe[season], df[df['SEASON_ID'] == season]], ignore_index=True)
                season_to_dataframe[season] = season_to_dataframe[season][new_order]

            # Checkpoint once per player, after every season frame is updated.
            # (Previously this ran inside the season loop, re-pickling the
            # whole dict once per season key for the same end result.)
            save_dataframes(season_to_dataframe, df_filepath)

            succeeded_players.append(player)
            # Random 0-3 second pause between requests.
            time.sleep(random.randint(0, 3))


    except Exception as e:
        # Outer double quotes keep this f-string valid on Python < 3.12 as well.
        print(f"Failed on player {player['id']}: {player['full_name']} with exception {e}.")
        failed_players.append(player)
        


DataFrames loaded from /Users/cb/src/nba_mvp_ml/data/raw/seasons_PerGame


In [7]:
# Fetch one player's career stats only to learn the column layout.
cols = playercareerstats.PlayerCareerStats(per_mode36="Totals", player_id=nba_players[2]['id']).get_data_frames()[0].columns

# One empty frame per season start year, 1980 through 2023 inclusive.
season_to_dataframe = {season: pd.DataFrame(columns=cols) for season in range(1980,2024)}

stats_type = 'Totals'
df_filepath = f'/Users/cb/src/nba_mvp_ml/data/raw/seasons_{stats_type}'
# NOTE: this overwrites any existing checkpoint at df_filepath with empty frames.
save_dataframes(season_to_dataframe, df_filepath)

for player in nba_players:

    try:
        # Fetch player stats
        if player['id']:
            stats = playercareerstats.PlayerCareerStats(per_mode36=stats_type, player_id=player['id'])
            df = stats.get_data_frames()[0] # 0 indexes main career stats

            # Add the player's name to the df for better human understanding,
            # placed right after the first column.
            df['PLAYER_FULLNAME'] = player['full_name'].upper()
            columns = df.columns.to_list()
            new_order = columns[:1] + ['PLAYER_FULLNAME'] + columns[1:-1]
            # Explicit copy so the SEASON_ID assignment below writes to an
            # independent frame rather than to a column-slice of the API result.
            df = df[new_order].copy()

            # Convert season column to integers for arithmetic: keep only the
            # first four characters of each SEASON_ID value.
            df['SEASON_ID'] = [int(year[:4]) for year in df['SEASON_ID']]

            # Seasons that are not keys of season_to_dataframe are dropped
            # implicitly: only rows matching a key are appended below.
            for season in season_to_dataframe.keys():
                season_to_dataframe[season] = pd.concat([season_to_dataframe[season], df[df['SEASON_ID'] == season]], ignore_index=True)
                season_to_dataframe[season] = season_to_dataframe[season][new_order]

            # Checkpoint once per player, after every season frame is updated.
            # (Previously this ran inside the season loop, re-pickling the
            # whole dict once per season key for the same end result.)
            save_dataframes(season_to_dataframe, df_filepath)

            # Random 0-3 second pause between requests.
            time.sleep(random.randint(0, 3))

        else:
            # Outer double quotes keep this f-string valid on Python < 3.12 as well.
            print(f"There was no player found for id {player['id']}: ({player['full_name']})")

    except Exception as e:
        print(f"Failed on player {player['id']}: {player['full_name']} with exception {e}.")
        failed_players.append(player)


Failed on player 78073: Ben Scharnus with exception HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30).
Failed on player 78074: Marv Schatzman with exception HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30).


## Test loading the dataframes from file

In [72]:
# Locations of the two pickled checkpoints (season totals and per-game averages).
totals_df_filepath = '/Users/cb/src/nba_mvp_ml/data/raw/seasons_Totals'
pergame_df_filepath = '/Users/cb/src/nba_mvp_ml/data/raw/seasons_PerGame'

In [75]:
# Load the season -> DataFrame dict of totals and preview the first 15 rows of 1980.
totals_df = load_dataframes(totals_df_filepath)
totals_df[1980].head(15)

DataFrames loaded from /Users/cb/src/nba_mvp_ml/data/raw/seasons_Totals


Unnamed: 0,PLAYER_ID,PLAYER_FULLNAME,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76003,KAREEM ABDUL-JABBAR,1980,0,1610612747,LAL,34.0,80,80.0,2976.0,...,0.766,197.0,624.0,821,272,59.0,228.0,249.0,244,2095
1,76005,TOM ABERNETHY,1980,0,1610612744,GOS,27.0,10,0.0,39.0,...,0.667,1.0,7.0,8,1,1.0,0.0,2.0,5,4
2,76005,TOM ABERNETHY,1980,0,1610612754,IND,27.0,29,0.0,259.0,...,0.579,19.0,21.0,40,18,6.0,3.0,6.0,29,59
3,76005,TOM ABERNETHY,1980,0,0,TOT,27.0,39,0.0,298.0,...,0.591,20.0,28.0,48,19,7.0,3.0,8.0,34,63
4,76011,ALVAN ADAMS,1980,0,1610612756,PHX,26.0,75,69.0,2054.0,...,0.768,157.0,389.0,546,344,106.0,69.0,226.0,226,1115
5,76030,DARRELL ALLUMS,1980,0,1610612742,DAL,22.0,22,3.0,276.0,...,0.591,19.0,46.0,65,25,5.0,8.0,23.0,51,59
6,76054,NATE ARCHIBALD,1980,0,1610612738,BOS,32.0,80,72.0,2820.0,...,0.816,36.0,140.0,176,618,75.0,18.0,265.0,201,1106
7,76078,DENNIS AWTREY,1980,0,1610612760,SEA,33.0,47,0.0,607.0,...,0.7,33.0,75.0,108,54,12.0,8.0,33.0,85,102
8,76085,JAMES BAILEY,1980,0,1610612760,SEA,24.0,82,69.0,2539.0,...,0.709,192.0,415.0,607,98,74.0,143.0,219.0,332,1145
9,76091,GREG BALLARD,1980,0,1610612764,WAS,26.0,82,71.0,2610.0,...,0.847,167.0,413.0,580,195,118.0,39.0,117.0,194,1271


In [78]:
pergame_df = load_dataframes(pergame_df_filepath)

# Stat columns that exist in both the totals and per-game frames under the same name.
per_game_columns = [
    'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 
    'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 
    'STL', 'BLK', 'TOV', 'PF', 'PTS'
]

# Old name -> name with '_PG' suffix. rename() ignores labels a frame lacks,
# so one shared map serves every season.
per_game_suffix_map = {col: col + '_PG' for col in per_game_columns}

# Rename columns in place for each DataFrame in the dictionary
for season_frame in pergame_df.values():
    season_frame.rename(columns=per_game_suffix_map, inplace=True)

# Preview the last 15 rows of the 1980 season.
pergame_df[1980].tail(15)

DataFrames loaded from /Users/cb/src/nba_mvp_ml/data/raw/seasons_PerGame


Unnamed: 0,PLAYER_ID,PLAYER_FULLNAME,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN_PG,...,FT_PCT_PG,OREB_PG,DREB_PG,REB_PG,AST_PG,STL_PG,BLK_PG,TOV_PG,PF_PG,PTS_PG
348,78527,MICHAEL WILEY,1980,0,1610612759,SAN,23.0,33,,8.2,...,0.75,0.7,1.3,1.9,0.3,0.2,0.2,0.8,1.2,5.7
349,78531,BOB WILKERSON,1980,0,1610612741,CHI,26.0,80,,28.0,...,0.84,1.1,2.5,3.5,3.4,1.3,0.3,2.2,2.1,10.0
350,78532,JAMAAL WILKES,1980,0,1610612747,LAL,28.0,81,,37.4,...,0.758,1.8,3.6,5.4,2.9,1.5,0.4,2.6,2.8,22.6
351,78533,JAMES WILKES,1980,0,1610612741,CHI,23.0,48,,11.3,...,0.69,0.8,1.3,2.0,0.6,0.5,0.3,0.7,1.8,4.1
352,78537,JEFF WILKINS,1980,0,1610612762,UTH,26.0,56,,18.9,...,0.675,1.1,3.8,4.9,0.7,0.6,0.8,1.1,3.0,4.7
353,78548,FREEMAN WILLIAMS,1980,0,1610612746,SDC,25.0,82,,24.1,...,0.852,0.9,0.7,1.6,2.0,1.1,0.1,2.0,1.9,19.3
354,78571,RAY WILLIAMS,1980,0,1610612752,NYK,26.0,79,,34.7,...,0.817,1.5,2.5,4.1,5.5,2.3,0.5,3.0,3.4,19.7
355,78570,SLY WILLIAMS,1980,0,1610612752,NYK,23.0,67,,29.5,...,0.69,2.4,3.8,6.2,2.7,1.7,0.3,2.1,3.0,13.2
356,78575,JOHN WILLIAMSON,1980,0,1610612764,WAS,29.0,9,,12.4,...,0.833,0.0,0.8,0.8,1.9,0.4,0.1,1.3,1.4,4.7
357,78577,BILL WILLOUGHBY,1980,0,1610612745,HOU,24.0,55,,20.8,...,0.766,1.3,2.8,4.1,1.2,0.3,0.6,1.3,1.9,6.3


In [81]:
# Columns present under the same name in both frames. Joining on all of them
# makes each appear once in the result instead of as *_x / *_y pairs.
# NOTE(review): PLAYER_AGE, GP and GS are values rather than identifiers; a row
# whose values differ between the two sources is dropped by the inner join —
# confirm that is intended.
merge_keys = ['PLAYER_ID', 'PLAYER_FULLNAME', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID',
              'TEAM_ABBREVIATION', 'PLAYER_AGE', 'GP', 'GS']

merged_dict = {}

for key in totals_df.keys():
    if key not in pergame_df:
        continue

    # Inner join: keep only rows that exist in both the totals and per-game frame.
    merged_df = pd.merge(totals_df[key], pergame_df[key], on=merge_keys, how='inner')

    # Reduce to one row per player. A player with several team rows in a season
    # also has a combined row with TEAM_ABBREVIATION == 'TOT'; keep that row
    # rather than whichever single-team stint happens to come first.
    is_total = merged_df['TEAM_ABBREVIATION'].eq('TOT')
    has_total = merged_df['PLAYER_ID'].isin(merged_df.loc[is_total, 'PLAYER_ID'])
    merged_df = merged_df[is_total | ~has_total].drop_duplicates(subset='PLAYER_ID', keep='first')

    # Add the merged DataFrame to the result dictionary
    merged_dict[key] = merged_df

In [91]:
# Stack every season's merged frame into a single table with a fresh 0..n-1 index.
# The frames are concatenated directly: the previous version added a temporary
# 'Season' column to each frame and dropped it again right after the concat.
combined_df = pd.concat(list(merged_dict.values()), ignore_index=True)

In [83]:
# Persist the per-season merged frames as a single pickle.
df_final_filepath = '/Users/cb/src/nba_mvp_ml/data/raw/seasons_all'
save_dataframes(merged_dict, filepath=df_final_filepath)

In [89]:
# Write combined_df to CSV, omitting the index column.
combined_csv_path = "/Users/cb/src/nba_mvp_ml/data/processed/combined.csv"
combined_df.to_csv(combined_csv_path, index=False)

In [95]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features taken from the season-totals frames.
total_feature_columns = [
    'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA',
    'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
    'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
# The '_PG'-suffixed per-game counterparts.
per_game_feature_columns = [
    'MIN_PG',
    'FGM_PG', 'FGA_PG', 'FG_PCT_PG', 'FG3M_PG', 'FG3A_PG', 'FG3_PCT_PG',
    'FTM_PG', 'FTA_PG', 'FT_PCT_PG', 'OREB_PG', 'DREB_PG', 'REB_PG',
    'AST_PG', 'STL_PG', 'BLK_PG', 'TOV_PG', 'PF_PG', 'PTS_PG',
]
columns_to_normalize = total_feature_columns + per_game_feature_columns

# Fit a min-max scaler over all rows (every season together) and overwrite the
# selected columns of combined_df with the scaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(combined_df[columns_to_normalize])
combined_df[columns_to_normalize] = scaled_values

In [96]:
# Write the current contents of combined_df to a separate CSV, omitting the index.
output_dir = "/Users/cb/src/nba_mvp_ml/data/processed/combined_normalized.csv"
combined_df.to_csv(path_or_buf=output_dir, index=False)