In [None]:
from nba_stats.read_write.basic_stats import ReadDatabase

import pandas as pd
import numpy as np
import datetime as dt
import pickle

# Show up to 50 columns when a DataFrame is displayed, so wide stat tables are not truncated.
pd.set_option('display.max_columns', 50)

In [None]:
# Single database reader instance; all summaries and table reads in this notebook go through it.
reader = ReadDatabase()

In [None]:
# Season window passed to every reader call below (same value twice = a single season).
years = (2021, 2021)

# Regular-season average boxscore + advanced boxscore stats, stored by the reader
# under the summary name 'regular' and copied out so later edits do not touch the reader's frame.
reader.basic_summary(
    summary_name='regular',
    aggregator='AVG',
    return_teams=True,
    suppress_query=False,
    adv_stats=True,
    playoffs='regular',
    convert_ids=False,
    years=years,
)
boxscores = reader.get_summary('regular').copy()

# All-NBA selections (allnbatype_id=1, NBA league only); every returned row is a positive label.
allnba_query = "SELECT player_id, season FROM allnbateams WHERE allnbatype_id=1 AND league='NBA'"
allnbateams = reader.read_table(get_str=allnba_query)
allnbateams['allnba'] = 1

# Team standings (win pct) for the same season window.
# NOTE(review): wpct_all() presumably relies on the games loaded by season_games() — confirm.
reader.season_games(years, convert_ids=False)
standings = reader.wpct_all()

In [None]:
# Attach team standings and the All-NBA label to every boxscore row.
# Both joins are left joins, so every boxscore row is kept even without a match.
input_data = (
    boxscores
    .merge(standings, how='left', left_on=['season', 'team_id'], right_on=['season', 'team'])
    # 'team' is the standings-side join key and duplicates 'team_id'.
    .drop(columns=['team'])
    .merge(allnbateams, how='left', on=['season', 'player_id'])
)
# Rows without an All-NBA match come back as NaN; treat them as the negative class.
input_data['allnba'] = input_data['allnba'].fillna(0)

In [None]:
# basic cleansing
cleansed_data = input_data[input_data['game_count'] != 0]
cleansed_data = cleansed_data[~cleansed_data['player_id'].isna()]

In [None]:
# Ratio stats may be NaN when their denominator is zero; those gaps are allowed and are
# filled with the average of the same stat for the given season.
reader.basic_summary(
    summary_name='season',
    aggregator='AVG',
    adv_stats=True,
    playoffs='regular',
    convert_ids=False,
    years=years,
    player_fields=False,
)
na_cols = ['fg_pct', 'fg2_pct', 'fg3_pct', 'ft_pct', 'assist_tov', 'ts_pct', 'tov_pct']
season_table = reader.get_summary('season')[['season'] + na_cols]
# Nested mapping {season: {stat: average}} so each season's fill values are one lookup away.
season_avgs = season_table.set_index('season').T.to_dict()

na_filled = []
for season_key, season_rows in cleansed_data.groupby('season', sort=False):
    filled = season_rows.copy()
    # Games played as a fraction of the largest game count seen in that season.
    most_games = filled['game_count'].max()
    filled.loc[:, 'game_pct'] = filled['game_count'] / most_games
    filled[na_cols] = filled[na_cols].fillna(season_avgs[season_key])
    na_filled.append(filled)

avgfilled_data = pd.concat(na_filled)
# replace None with nan
avgfilled_data = avgfilled_data.fillna(np.nan).reset_index(drop=True)

# Sanity check: count rows where a ratio stat is missing although its denominator is not zero.
pct_to_attempt = {
    'fg_pct': 'fga',
    'fg2_pct': 'fg2a',
    'fg3_pct': 'fg3a',
    'ft_pct': 'fta',
    'assist_tov': 'tov',
}
for pct, attempt in pct_to_attempt.items():
    unexplained = cleansed_data[pct].isna() & (cleansed_data[attempt] != 0)
    print('{}: {} where {} is not 0'.format(pct, int(unexplained.sum()), attempt))

MISSING
<br>Plus-minus has only been recorded since 2001, so it is missing for earlier seasons.


In [None]:
# Number of missing values per column after basic cleansing (before the season-average fill).
cleansed_data.isna().sum()
# Alternative: the same count on the frame after the fill step.
# avgfilled_data.isna().sum()

In [None]:
# Columns that identify a row for humans; used for the id table, not for the model matrix.
id_cols = ['player_id', 'last_name', 'first_name', 'pos1', 'G', 'F', 'C', 'team', 'season']
# Attempts and total rebounds are redundant with other stats; game_count is redundant
# because the fraction of games played (game_pct) is kept instead.
drop_cols = ['game_count', 'player_id', 'team_id', 'season', 'fga', 'fta', 'fg2a', 'fg3a', 'trb']
y_col = ['allnba']

# Feature columns in their original order, with the label appended as the last column.
excluded = set(drop_cols) | set(y_col)
data_cols = [column for column in avgfilled_data.columns if column not in excluded]
data_cols.extend(y_col)

In [None]:
# Build two row-aligned outputs from the filled data:
#   training_data - float matrix of the feature columns with the label as the last column
#   id_data       - readable identification columns (names, team, positions) plus the features
avgfilled_data[y_col] = avgfilled_data[y_col].astype(int)
training_data = np.array(avgfilled_data[data_cols]).astype(float)
features = data_cols[:-1]  # data_cols ends with the label column, which is not a feature

id_data = avgfilled_data.copy()
teams = reader.read_table(get_str='SELECT team_id, abbreviation FROM teams')
players = reader.read_table(get_str='SELECT player_id, last_name, first_name, pos1, pos2 FROM players')
id_data = id_data.merge(teams, how='left', on='team_id').rename(columns={'abbreviation':'team'})
id_data = id_data.merge(players, how='left', on='player_id')

# id_data is stored next to training_data and matched to it by row position, so a lookup
# table with duplicate keys (which would multiply rows in the merges above) must not go unnoticed.
assert len(id_data) == len(training_data), (
    'id_data has {} rows but training_data has {}; check teams/players for duplicate ids'
    .format(len(id_data), len(training_data))
)

# Fill both position columns before concatenating: a missing pos1 would otherwise turn the
# whole string into NaN and leave NaN (not False) in the G/F/C flags.
id_data.loc[:,'positions'] = id_data['pos1'].fillna('') + ',' + id_data['pos2'].fillna('')
for pos in ['G', 'F', 'C']:
    # Literal substring match; na=False keeps the flag strictly boolean.
    id_data.loc[:,pos] = id_data['positions'].str.contains(pos, regex=False, na=False)
id_data = id_data[id_cols+features]

In [None]:
# Print both shapes, one per line, to compare the row counts of the model matrix and the id table.
for built in (training_data, id_data):
    print(built.shape)

In [None]:
# save data used to run model for current year
with open('data/allnba_2021.pickle', 'wb') as f:
    pickle.dump([training_data, features, id_data], f)

In [None]:
# Save the training data.
# Only written when `years` spans several seasons: with a single-season window this cell
# would otherwise replace the training set with one season of data on Restart & Run All.
if years[0] != years[1]:
    with open('data/allnbatraining.pickle', 'wb') as f:
        pickle.dump([training_data, features, id_data], f)
else:
    print('Skipped training save: years={} covers a single season'.format(years))