In [1]:
import pandas as pd
import numpy as np
import nba_api.stats.endpoints as ep
import datetime
import re
from os import path
import time
import pickle

In [2]:
def lshelper(game_arr, gd):
    for x in range(len(game_arr)):
        gid = game_arr[x, 4]
        match = game_arr[x, 6]
        pm = game_arr[x, 27]
        if game_arr[x, 7] == 'W':
            winner = game_arr[x, 2]
        else:
            winner = game_arr[x, 6][-3:]
        gd[gid] = (match, pm, winner)
    return gd


class StatBucket():

    def __init__(self):
        self.clog = None
        self.log = None
        self.oslog = None
        self.outcomes = None
        self.data = None
        self.season = None
        self.gidset = None
        self.FD = {'advanced':ep.boxscoreadvancedv2.BoxScoreAdvancedV2,
     'fourfactors':ep.boxscorefourfactorsv2.BoxScoreFourFactorsV2,
     'misc':ep.boxscoremiscv2.BoxScoreMiscV2,
     'scoring':ep.boxscorescoringv2.BoxScoreScoringV2,
     'summary':ep.boxscoresummaryv2.BoxScoreSummaryV2,
     'traditional':ep.boxscoretraditionalv2.BoxScoreTraditionalV2
    }

    def update_log(self, season):
        self.season=season
        result = ep.leaguegamefinder.LeagueGameFinder(season_nullable=season)
        all_games = result.get_data_frames()[0]
        rs = all_games[all_games.SEASON_ID == '2' + season[:4]]
        rs = rs[rs.GAME_ID.str[:3] == '002'] #may need to update
        os = all_games[all_games.SEASON_ID == '4' + season[:4]]
        os = os[os.GAME_ID.str[:3] == '004']
        self.log = rs
        self.oslog = os
#         self.clog = rs.append(os)
        self.clog = pd.concat([rs,os])
    
    
    def get_log_stats(self):
        game_arr = self.log.to_numpy()
        os_arr = self.oslog.to_numpy()
        game_dat = dict()
        game_dat = lshelper(game_arr, game_dat)
        res = lshelper(os_arr, game_dat)        
        self.outcomes = res
        self.gidset = set(res.keys())

    def write_out(self, f, tstats, pstats):
        # IF V3
        try:
            # if f == 'traditional':
            #     tstats.sort_values('teamId', inplace=True, kind='mergesort')
            #     tstats.sort_values('gameId', inplace=True, kind='mergesort')
            #     tstats.to_csv(f'DATA/raw/teams/{f}/{f}{self.season}.csv', index=False)
            #     pstats.sort_values('teamId', inplace=True, kind='mergesort')
            #     pstats.sort_values('gameId', inplace=True, kind='mergesort')
            #     pstats.to_csv(f'DATA/raw/players/{f}/{f}{self.season}.csv', index=False)
            # else:
            tstats.sort_values('TEAM_ID', inplace=True, kind='mergesort')
            tstats.sort_values('GAME_ID', inplace=True, kind='mergesort')
            tstats.to_csv(f'DATA/raw/teams/{f}/{f}{self.season}.csv', index=False)
            pstats.sort_values('TEAM_ID', inplace=True, kind='mergesort')
            pstats.sort_values('GAME_ID', inplace=True, kind='mergesort')
            pstats.to_csv(f'DATA/raw/players/{f}/{f}{SEASONS[i]}.csv', index=False)
            
            return
        except Exception as e:
            print(f'error with write out for {f}{self.season}\n{e}\n')
            
            return


    def Season_csv_update(self,endpoint_name):
        print(f"collecting {endpoint_name} data for season: {self.season}")
        teams = []
        players= []
        ex_set = set()
        # if endpoint_name == 'traditional':
        #     col_gid = 'gameId'
        # else:
        #     col_gid = 'GAME_ID'
        col_gid = 'GAME_ID'
        ## remove already gotten games
        if path.exists(f'DATA/raw/players/{endpoint_name}/{endpoint_name}{self.season}.csv'):
            try:
                with open(f'DATA/raw/players/{endpoint_name}/{endpoint_name}{self.season}.csv', 'r') as f:
                    pstats = pd.read_csv(f, dtype={col_gid: str})

                #have to loop through to add leading 0's if not already present
                ex_set = set()
                for i in pstats[col_gid]:
                    try:
                        if i[:2] != '00':
                            newi = '00'+i
                            ex_set.add(newi)
                        else:
                            ex_set.add(i)
                    except:
                        print('gid_problem')
            except Exception as e:
                print(f"Unable to exclude partial player data\n\t{e}\n")
        else:
            pstats = pd.DataFrame()

        if path.exists(f'DATA/raw/teams/{endpoint_name}/{endpoint_name}{self.season}.csv'):
            try:
                with open(f'DATA/raw/teams/{endpoint_name}/{endpoint_name}{self.season}.csv', 'r') as f2:
                    tstats = pd.read_csv(f2, dtype={col_gid: str})
                #have to loop through to add leading 0's if not already present
                ex_set2 = set()
                for i in tstats[col_gid]:
                    try:
                        if i[:2] != '00':
                            newi = '00'+i
                            ex_set2.add(newi)
                        else:
                            ex_set2.add(i)
                    except:
                        print('gid_problem')
                ex_set = ex_set.intersection(ex_set2)
            except Exception as e:
                print(f"Unable to exclude partial player data\n\t{e}\n")
        else:
            tstats = pd.DataFrame()

        print(f"total number of games = {len(self.gidset)}\nremoving {len(ex_set)} games from list")
        print(f"still need {len(self.gidset - ex_set)}")

        gids = self.gidset-ex_set
        if not gids:
            print(f'all data present for {endpoint_name}_{self.season}\n')
            return 0
        
        count=0
        statfunc = self.FD[endpoint_name]
        for gid in gids:
            try:
                count+=1
                game = statfunc(game_id=gid).get_data_frames()
                players.append(game[0])
                teams.append(game[1])
            except Exception as e:
                if not players:
                    print(f"ERROR, NO NEW DATA, FILES UNCHANGED,STILL NEED {len(ex_set)+count - len(self.gidset)} GAMES\n\t{e}")
                    return len(self.gidset) - len(ex_set)+count
                else:
                    print(f"MADE IT THROUGH {count} GAMES OUT OF {len(gids)} GAMES BECAUSE OF \n\t{e}\nRESULTING IN {len(ex_set)+count} GAMES OUT OF {len(self.gidset)} TOTAL")
                    new_pstats = pd.concat(players)
                    new_tstats = pd.concat(teams)
                    nsp = pd.concat([new_pstats, pstats], ignore_index=True)
                    nst = pd.concat([new_tstats, tstats], ignore_index=True)
                    
                    self.write_out(endpoint_name, nst, nsp)
                    break

        if not players or not teams:
            print('NO CHANGES?')
            return 0
            
        new_pstats = pd.concat(players)
        new_tstats = pd.concat(teams)
        nsp = pd.concat([new_pstats, pstats], ignore_index=True)
        nst = pd.concat([new_tstats, tstats], ignore_index=True)

        print(f"writing {len(new_tstats) / 2} games\nDONE\n")

        self.write_out(endpoint_name, nst, nsp)
        return len(ex_set)+count - len(self.gidset)
        

    def Season_csv(self, endpoint_name):
        teams = []
        players= []
        count=0
        statfunc = self.FD[endpoint_name]
        for gid in self.gidset:
            try:
                count+=1
                game = statfunc(game_id=gid).get_data_frames()
                players.append(game[0])
                teams.append(game[1])
            except Exception as e:
                print(f"ONLY MADE IT THROUGH {count} GAMES BECAUSE OF \n\t{e}")
                with open(f'DATA/raw/players/{endpoint_name}/PARTIAL_{endpoint_name}{self.season}.bin', 'wb') as f:
                    pickle.dump(players, f)
                with open(f'DATA/raw/teams/{endpoint_name}/PARTIAL_{endpoint_name}{self.season}.bin', 'wb') as f2:
                    pickle.dump(teams, f2)
                break
        pstats = pd.concat(players)
        tstats = pd.concat(teams)
        return tstats, pstats




def append_game(statfunc, current_stats, gid, playerdata=False):
    if playerdata:
        game = statfunc(game_id=gid).get_data_frames()[0]
        current_stats = pd.concat([current_stats,game])
        return current_stats
    else:
        game = statfunc(game_id=gid).get_data_frames()[1]
        current_stats = pd.concat([current_stats,game])
        return current_stats
def split_gid(gidset, nos):
    rem = len(gidset)%nos
    relist = [int(len(gidset)/nos)] * nos
    relist[-1]+=rem
    return relist

In [3]:
# ## TEMP FIX GID MATCHING
# sb = StatBucket()
# # THIS MAY NEED TO BE SEASONS[i][:4] for some earlier seasons??
# sb.update_log('2023-24')
# sb.get_log_stats()
# print(sb.gidset)


# with open(f'DATA/raw/players/advanced/advanced2023-24.csv', 'r') as f:
#     pstats = pd.read_csv(f, dtype={'GAME_ID': str})
# print(pstats)
# ex_set = set()
# for i in pstats['GAME_ID']:
#     if i[:2] != '00':
#         newi = '00'+i
#         ex_set.add(newi)
#     else:
#         ex_set.add(i)
# print(ex_set - sb.gidset)
    


In [4]:
# PLAYER DATA OR TEAM DATA
PLAYER_DATA = False


SEASONS = ['2016-17',
           '2022-23',
           '2023-24',
           '2021-22',
           '2017-18',
           '2018-19',
           '2019-20',
           '2020-21',]

# ['2010-11',
#            '2011-12',
#            '2012-13',
#            '2013-14',
#            '2014-15',
#            '2015-16',
#            '2016-17',
#            '2017-18',
#            '2018-19',
#            '2019-20',
#            '2020-21',
#           '2021-22',
#           '2022-23']

FN = ['traditional']

# FN = ['scoring',
#     'advanced',
#     'fourfactors',
#      'misc']


#      'scoring'
#      'summary',
#      'traditional'



In [5]:
### FOR DIFFERENT SEASON_CSV_FUNCTION:

done_check = dict()
for ses in SEASONS:
    for f in FN:
        done_check[ses+f] = 1

def check_done(res_d):
    for k in res_d:
        if res_d[k] != 0:
            return False
    return True


DONE = False
loop_count=0
while not DONE:
    print(f"loop: {loop_count}")
    for i in range(len(SEASONS)):
        try:
            sb = StatBucket()
            # THIS MAY NEED TO BE SEASONS[i][:4] for some earlier seasons??
            sb.update_log(SEASONS[i])
            sb.get_log_stats()
            for j in FN:
                if not done_check[SEASONS[i]+j]:
                    continue
                # if path.exists(f'DATA/raw/teams/{j}/{j}{SEASONS[i]}.csv'):
                #     print(f"SKIPPING teams/{j}/{j}{SEASONS[i]}, already exists")
                #     continue

                if not len(sb.gidset):
                    print(f'error with {SEASONS[i]}, no game ids')
                    print(sb.outcomes)
                    break
                
                if not path.exists(f'DATA/raw/log/log_{SEASONS[i]}.csv'):
                    sb.clog.sort_values('TEAM_ID', inplace=True, kind='mergesort')
                    sb.clog.sort_values('GAME_ID', inplace=True, kind='mergesort')
                    sb.clog.to_csv(f'DATA/raw/log/log_{SEASONS[i]}.csv', index=False)
                    
                done_check[SEASONS[i]+j] = sb.Season_csv_update(j)
        except Exception as e:
            print(f'ERROR with {SEASONS[i]} \n\t{e}')
    loop_count+=1
    DONE = check_done(done_check)



print(f"FINISHED WITH {SEASONS} and {FN}")



loop: 0
collecting traditional data for season: 2016-17
total number of games = 1309
removing 926 games from list
still need 383
MADE IT THROUGH 19 GAMES OUT OF 383 GAMES BECAUSE OF 
	Expecting value: line 1 column 1 (char 0)
RESULTING IN 945 GAMES OUT OF 1309 TOTAL
writing 18.0 games
DONE

collecting traditional data for season: 2022-23
total number of games = 1314
removing 1314 games from list
still need 0
all data present for traditional_2022-23

collecting traditional data for season: 2023-24
total number of games = 1312
removing 1312 games from list
still need 0
all data present for traditional_2023-24

collecting traditional data for season: 2021-22
total number of games = 1317
removing 1317 games from list
still need 0
all data present for traditional_2021-22

collecting traditional data for season: 2017-18
total number of games = 1312
removing 1312 games from list
still need 0
all data present for traditional_2017-18

collecting traditional data for season: 2018-19
total number

KeyboardInterrupt: 

In [None]:
ep.leaguegamefinder.LeagueGameFinder(season_nullable='2020-21').get_data_frames()[0]

In [None]:
### FOR UPDATING DATA:

for i in range(len(SEASONS)):
    print(SEASONS[i])
    sb = StatBucket()
    sb.update_log(SEASONS[i])
    sb.get_log_stats()
    gidset = set(sb.outcomes.keys())
    if not path.exists(f'DATA/raw/log/log_{SEASONS[i]}.csv'):
        sb.clog.sort_values('TEAM_ID', inplace=True, kind='mergesort')
        sb.clog.sort_values('GAME_ID', inplace=True, kind='mergesort')
        sb.clog.to_csv(f'DATA/raw/log/log_{SEASONS[i]}.csv', index=False)

    for j in FN:
        print(j)

        if not len(gidset):
            print(f'error with {SEASONS[i]}, no game ids')
            print(sb.outcomes)
            break
        nst, nsp = sb.Season_csv_update(j)
        # nst.sort_values('TEAM_ID', inplace=True, kind='mergesort')
        # nst.sort_values('GAME_ID', inplace=True, kind='mergesort')
        # nst.to_csv(f'DATA/raw/teams/{j}/{j}{SEASONS[i]}.csv')
        # nsp.sort_values('TEAM_ID', inplace=True, kind='mergesort')
        # nsp.sort_values('GAME_ID', inplace=True, kind='mergesort')
        # nsp.to_csv(f'DATA/raw/players/{j}/{j}{SEASONS[i]}.csv')

        # IF V3
        # nst.sort_values('teamId', inplace=True, kind='mergesort')
        # nst.sort_values('gameId', inplace=True, kind='mergesort')
        # nst.to_csv(f'DATA/raw/teams/{j}/{j}{SEASONS[i]}.csv')
        # nsp.sort_values('teamId', inplace=True, kind='mergesort')
        # nsp.sort_values('gameId', inplace=True, kind='mergesort')
        # nsp.to_csv(f'DATA/raw/players/{j}/{j}{SEASONS[i]}.csv')


In [None]:
nst.sort_values('teamId', inplace=True, kind='mergesort')
nst.sort_values('gameId', inplace=True, kind='mergesort')
nst.to_csv(f'DATA/raw/teams/{j}/{j}{SEASONS[i]}.csv')
nsp.sort_values('teamId', inplace=True, kind='mergesort')
nsp.sort_values('gameId', inplace=True, kind='mergesort')
nsp.to_csv(f'DATA/raw/players/{j}/{j}{SEASONS[i]}.csv')

In [None]:
nsp['gameId']

In [None]:
nst['gameId']

In [None]:
game = ep.boxscoretraditionalv3.BoxScoreTraditionalV3(game_id='0022300709').get_data_frames()
players = game[0]
teams = game[1]

In [None]:
nst

In [None]:
nsp

In [None]:
nst.sort_values('teamId', inplace=True, kind='mergesort')
nst.sort_values('gameId', inplace=True, kind='mergesort')
nst.to_csv(f'DATA/raw/teams/{j}/{j}{SEASONS[i]}.csv')
nsp.sort_values('teamId', inplace=True, kind='mergesort')
nsp.sort_values('gameId', inplace=True, kind='mergesort')
nsp.to_csv(f'DATA/raw/players/{j}/{j}{SEASONS[i]}.csv')

In [None]:
with open(f'DATA/raw/players/traditional/PARTIAL_traditional2023-24.bin', 'rb') as f:
    player_df = pickle.load(f)


In [None]:
with open(f'teams100.bin', 'rb') as f2:
    team_df = pickle.load(f2)


# with open(f'DATA/raw/teams/traditional/PARTIAL_traditional2023-24.bin', 'rb') as f2:
#     team_df = pickle.load(f2)




In [None]:
new_player = pd.concat(player_df)
new_team = pd.concat(team_df)

In [None]:
new_team

In [None]:
new['gameId']

In [None]:
for i in player_df:
    print(len(i))

In [None]:


SEASON='2023-24'

sb = StatBucket()
# THIS MAY NEED TO BE SEASONS[i][:4] for some earlier seasons??
sb.update_log(SEASON)
sb.get_log_stats()

# if not path.exists(f'DATA/raw/log/log_{SEASON}.csv'):
sb.clog.sort_values('TEAM_ID', inplace=True, kind='mergesort')
sb.clog.sort_values('GAME_ID', inplace=True, kind='mergesort')
sb.clog.to_csv(f'DATA/raw/log/log_{SEASON}.csv', index=False)

In [None]:
result = ep.leaguegamefinder.LeagueGameFinder(season_nullable=SEASON).get_data_frames()

In [None]:
result[0]

In [None]:
try:
    new = x + 10
except:
    print('ag')