In [None]:
# Created 07/02/2021
import pandas as pd
import numpy as np
import nba_api.stats.endpoints as ep
from datetime import datetime
import re
from os import path

In [None]:
# Team abbreviations that the averaging functions in this notebook loop over.
TEAMS = [
    'ATL', 'BKN', 'BOS', 'CHA', 'CHI', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

In [None]:
def lshelper(game_arr, gd):
    """
    Record one (matchup, plus/minus, winner) tuple per game-log row.

    game_arr - 2-D array of game-log rows, read by column position:
               2, 4, 6, 7 and 27 (presumably team abbreviation, game id,
               matchup, W/L flag and plus/minus - verify against the
               column order of the frame that produced the array)
    gd - dict keyed by game id; updated in place, a later row with the same
         game id overwrites the earlier entry

    returns gd (the same dict object)
    """
    for row in game_arr:
        matchup = row[6]
        # A 'W' row names its own team (column 2) as the winner; any other
        # row takes the last three characters of the matchup string.
        winner = row[2] if row[7] == 'W' else matchup[-3:]
        gd[row[4]] = (matchup, row[27], winner)
    return gd


class StatBucket():
    """
    Holds one season's game logs pulled from the LeagueGameFinder endpoint.

    Attributes (all None until a method fills them):
    log      - rows whose SEASON_ID starts with '2' and GAME_ID with '002'
               (presumably the regular season - verify)
    oslog    - rows whose SEASON_ID starts with '4' and GAME_ID with '004'
               (presumably the postseason - verify)
    clog     - log followed by oslog in one frame
    outcomes - dict game id -> (matchup, plus/minus, winner)
    data     - never written by the methods of this class
    """

    def __init__(self):
        self.clog = None
        self.log = None
        self.oslog = None
        self.outcomes = None
        self.data = None

    def update_log(self, season):
        """
        Download the game log for `season` and split it into log / oslog / clog.

        season - season string such as '2012-13'; only its first four
                 characters are used to build the SEASON_ID to match
        """
        result = ep.leaguegamefinder.LeagueGameFinder(season_nullable=season)
        all_games = result.get_data_frames()[0]
        year = season[:4]

        regular = all_games[all_games.SEASON_ID == '2' + year]
        regular = regular[regular.GAME_ID.str[:3] == '002'] #may need to update
        post = all_games[all_games.SEASON_ID == '4' + year]
        post = post[post.GAME_ID.str[:3] == '004']

        self.log = regular
        self.oslog = post
        # DataFrame.append was removed in pandas 2.0; concat keeps the same
        # row order (log first, then oslog) and the original index labels.
        self.clog = pd.concat([regular, post])

    def get_log_stats(self):
        """
        Fill self.outcomes from self.log and self.oslog.

        update_log must have been called first; both frames are converted to
        arrays and passed through lshelper into a single dict.
        """
        game_arr = self.log.to_numpy()
        os_arr = self.oslog.to_numpy()
        game_dat = dict()
        game_dat = lshelper(game_arr, game_dat)
        res = lshelper(os_arr, game_dat)
        self.outcomes = res
        
        
def Season_csv(statfunc, gidset):
    """
    Fetch the second result frame for every game id and stack them.

    statfunc - callable invoked as statfunc(game_id=...); its result must
               offer get_data_frames(), of which element [1] is kept
    gidset - iterable of game ids; it is only read, never modified

    returns one DataFrame holding every fetched row (each game's own index
    labels are kept), or an empty DataFrame when gidset is empty
    """
    frames = [statfunc(game_id=gid).get_data_frames()[1] for gid in gidset]
    if not frames:
        return pd.DataFrame()
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0,
    # and quadratic because each call copied the accumulated frame).
    return pd.concat(frames)

In [None]:
def date_filter(df, sd=datetime(1969, 1, 1), ed=datetime(2050, 1, 1)):
    """
    returns dataframe of games in between two dates

    A row is kept when its GAME_DATE is strictly after sd and on or before ed.
    Rows are selected by their own date, so the result does not depend on the
    frame being sorted or on the values of its index.

    will not work with the default bounds if using games before 1969 or
    after 2050

    df - combined data with a 'GAME_DATE' column in YYYY-MM-DD form
    sd - start date (exclusive), datetime obj
    ed - end date (inclusive), datetime obj
    """
    dates = pd.to_datetime(df['GAME_DATE'], format="%Y-%m-%d")
    in_range = (dates > sd) & (dates <= ed)
    return df[in_range]
    
def team_filter(df, team):
    """
    returns the rows of df whose TEAM_ABBREVIATION equals team
    """
    is_team = df['TEAM_ABBREVIATION'] == team
    return df[is_team]
    

def firstDigit(n) :
    """
    returns the leading decimal digit of a non-negative number

    Values below 10 (including negatives) are returned as int(n) unchanged.
    """
    # Remove last digit from number
    # till only one digit is left.
    # Floor division keeps integers exact; true division converts to float
    # and can round a large value up across a power of ten.
    while n >= 10:
        n = n // 10
    return int(n)

def os_filter(df, os, year):
    """
    returns the rows of df belonging to one SEASON_ID

    df - frame with a SEASON_ID column (numbers or numeric strings)
    os - 'os' or any other truthy value selects the id '4' + year;
         'rs' or any falsy value selects the id '2' + year
    year - first four characters of the season, e.g. '2012'
    """
    # 'rs' is a non-empty string, so a plain truthiness test would send it
    # down the '4' branch; check for it explicitly.
    use_four = bool(os) and os != 'rs'
    target = int(('4' if use_four else '2') + str(year))
    # Compare numerically so the filter works whether SEASON_ID was read as
    # integers or kept as strings.
    season_ids = pd.to_numeric(df.SEASON_ID, errors='coerce')
    return df[season_ids == target]
            
def ha_filter(df, ha):
    """
    returns the rows of df split on whether MATCHUP contains '@'

    ha - 'h' keeps the rows without '@' in MATCHUP;
         any other value keeps the rows with '@'
    """
    want_no_at = ha == 'h'
    # Keep a row when "has '@'" disagrees with "want rows without '@'".
    keep = [('@' in matchup) != want_no_at for matchup in df['MATCHUP'].values]
    return df[keep]
    


In [None]:
# Stat columns that get averaged per team.
dl = [
    'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
    'BLK', 'TOV', 'PF',
    'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
    'E_NET_RATING', 'NET_RATING',
    'AST_PCT', 'AST_TOV', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT',
    'E_TM_TOV_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT',
    'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE', 'FTA_RATE',
    'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT',
    'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT',
    'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT',
    'BLKA', 'PFD',
    'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT', 'PCT_PTS_2PT_MR',
    'PCT_PTS_3PT', 'PCT_PTS_FB', 'PCT_PTS_FT', 'PCT_PTS_OFF_TOV',
    'PCT_PTS_PAINT',
    'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM', 'PCT_UAST_3PM',
    'PCT_AST_FGM', 'PCT_UAST_FGM',
]

# Identifying columns copied through to the output unchanged.
normdata = [
    'SEASON_ID',
    'TEAM_ID',
    'TEAM_ABBREVIATION',
    'TEAM_NAME',
    'GAME_DATE',
    'MATCHUP',
    'WL',
]

In [None]:
def lastx(df, tdf, cols, x):
    """
    writes an x-game moving average of each column into tdf

    df - dataframe the values are read from
    tdf - team dataframe that receives the averaged columns (changed in place)
    cols - list of column names to average
    x - int window size; the first x - 1 rows of each column come out as NaN

    returns None
    """
    for name in cols:
        window = df[name].rolling(window=x)
        tdf[name] = window.mean()
    return


def create_avgsV2(season, cols, ha = '', lx = 'avgs', os = ''):
    """
    Build per-team running averages plus next-game targets and lines, write
    them to DATA/avgsV2/{os}{ha}{lx}{season}.csv and return the frame.

    Season: 2012-13 or 2020-21
    cols: data wanted avgs of
    ha (home / away): 'home' or 'away', default '' keeps both
    lx: 'lastx' where x is an int gives an x-game moving average;
        anything else gives the mean of all of the team's games so far
    os (offseason): 'os', 'rs', or '' default is '' (includes both)

    Each row holds the averages up to and including that game, then the id,
    matchup, plus/minus, spread and over/under of the same team's next game.
    A team's last row gets the placeholders 0, 'last_game', 100, 0 and 0.

    Reads DATA/combinedv1/combined{season}.csv and
    DATA/linesv1/LinesV1{season}.csv. Processing stops at the first team
    whose two files disagree on row count; teams with no rows are skipped.

    Raises ValueError if no team produced any rows.
    """
    df1 = pd.read_csv(f'DATA/combinedv1/combined{season}.csv', index_col='GAME_ID')
    linedf = pd.read_csv(f'DATA/linesv1/LinesV1{season}.csv', index_col='GAME_ID')

    if ha == 'home':
        df1 = ha_filter(df1, 'h')
    elif ha == 'away':
        df1 = ha_filter(df1, 'a')

    # Pass an explicit boolean (True only for 'os') rather than the raw
    # string: every non-empty string, 'rs' included, is truthy.
    if os != '':
        df1 = os_filter(df1, os == 'os', season[:4])

    last = int(lx[4:]) if lx[:4] == 'last' else 0

    team_frames = []
    for team in TEAMS:
        t1 = team_filter(df1, team)
        t1_lines = team_filter(linedf, team)
        if len(t1_lines) != len(t1):
            print(f'LENGTHS OF DATA DO NOT MATCH FOR {team}\n')
            break
        if t1.empty:
            # No games for this abbreviation in this file; the one-element
            # placeholder lists below could not be assigned to zero rows.
            continue

        dfx = t1[normdata].copy()

        if last:
            # Average the requested columns, not the module-level list.
            lastx(t1, dfx, cols, last)
        else:
            for j in cols:
                # Mean of rows 0..i for every i, computed in one pass;
                # to_numpy() keeps the assignment positional.
                dfx[j] = t1[j].expanding().mean().to_numpy()

        dfx['PLUS_MINUS'] = t1['PLUS_MINUS']

        # Shift each series up by one game and pad the team's last row.
        dfx['NEXT_GAME_ID'] = dfx.index.to_list()[1:] + [0]
        dfx['NEXT_MATCHUP'] = t1['MATCHUP'].to_list()[1:] + ['last_game']
        dfx['NEXT_PLUS_MINUS'] = t1['PLUS_MINUS'].to_list()[1:] + [100]
        # Lines are paired with games by position within the team.
        dfx['NEXT_SPREAD'] = t1_lines['LINE'].to_list()[1:] + [0]
        dfx['NEXT_O/U'] = t1_lines['O/U'].to_list()[1:] + [0]

        team_frames.append(dfx)

    if not team_frames:
        raise ValueError(f'no team data produced for season {season}')

    # Single concat; DataFrame.append was removed in pandas 2.0.
    main_df = pd.concat(team_frames)
    main_df.index.name='GAME_ID'
    main_df.to_csv(f'DATA/avgsV2/{os}{ha}{lx}{season}.csv')
    return main_df

def create_avgsV1(season, cols, ha = '', lx = 'avgs', os = ''):
    """
    Build per-team running averages plus next-game targets, write them to
    DATA/avgsV1/{os}{ha}{lx}{season}.csv and return the frame.

    Season: 2012-13 or 2020-21
    cols: data wanted avgs of
    ha (home / away): 'home' or 'away', default '' keeps both
    lx: 'lastx' where x is an int gives an x-game moving average;
        anything else gives the mean of all of the team's games so far
    os (offseason): 'os', 'rs', or '' default is '' (includes both)

    Each row holds the averages up to and including that game, then the id,
    matchup and plus/minus of the same team's next game. A team's last row
    gets the placeholders 0, 'last_game' and 100.

    Reads DATA/combinedv1/combined{season}.csv. Teams with no rows are skipped.

    Raises ValueError if no team produced any rows.
    """
    df1 = pd.read_csv(f'DATA/combinedv1/combined{season}.csv', index_col='GAME_ID')

    if ha == 'home':
        df1 = ha_filter(df1, 'h')
    elif ha == 'away':
        df1 = ha_filter(df1, 'a')

    # Pass an explicit boolean (True only for 'os') rather than the raw
    # string: every non-empty string, 'rs' included, is truthy.
    if os != '':
        df1 = os_filter(df1, os == 'os', season[:4])

    last = int(lx[4:]) if lx[:4] == 'last' else 0

    team_frames = []
    for team in TEAMS:
        t1 = team_filter(df1, team)
        if t1.empty:
            # No games for this abbreviation in this file; the one-element
            # placeholder lists below could not be assigned to zero rows.
            continue

        dfx = t1[normdata].copy()

        if last:
            # Average the requested columns, not the module-level list.
            lastx(t1, dfx, cols, last)
        else:
            for j in cols:
                # Mean of rows 0..i for every i, computed in one pass;
                # to_numpy() keeps the assignment positional.
                dfx[j] = t1[j].expanding().mean().to_numpy()

        dfx['PLUS_MINUS'] = t1['PLUS_MINUS']

        # Shift each series up by one game and pad the team's last row.
        dfx['NEXT_GAME_ID'] = dfx.index.to_list()[1:] + [0]
        dfx['NEXT_MATCHUP'] = t1['MATCHUP'].to_list()[1:] + ['last_game']
        dfx['NEXT_PLUS_MINUS'] = t1['PLUS_MINUS'].to_list()[1:] + [100]

        team_frames.append(dfx)

    if not team_frames:
        raise ValueError(f'no team data produced for season {season}')

    # Single concat; DataFrame.append was removed in pandas 2.0.
    main_df = pd.concat(team_frames)
    main_df.index.name='GAME_ID'
    main_df.to_csv(f'DATA/avgsV1/{os}{ha}{lx}{season}.csv')
    return main_df
        

In [None]:
# Run configuration for the averaging functions.
# SEASON = '2013-14'
### ALL COMBINATIONS: HA + HOME OR AWAY + AVGS
### LASTX + AVGS
### LASTX + LAST10
### V2 DOES NOT WORK WITH HOME / AWAY YET

# Seasons to process; the remaining seasons are commented out below.
SEASONS = ['2012-13']
#           '2013-14',
#           '2014-15',
#           '2015-16',
#           '2016-17',
#           '2017-18',
#           '2018-19',
#           '2019-20']
# #           '2020-21']


# Passed as os=: 'os', 'rs', or '' to include both.
OS = ''
# Passed as ha=: 'home', 'away', or '' for no home/away split.
HA = 'away'
# Passed as lx=: 'last' followed by an int (e.g. 'last10') for a moving
# average; any other value gives averages over all games so far.
LASTX = 'avgs'

In [None]:
# Build and save the averages file for every configured season.
# create_avgsV2 is used only when no home/away split is requested;
# otherwise create_avgsV1 is used.
for season in SEASONS:
    build = create_avgsV2 if HA == '' else create_avgsV1
    build(season, dl, ha=HA, lx=LASTX, os=OS)

In [None]:
# No function named create_avgs is defined in this notebook, so the bare call
# raised NameError on a fresh kernel. Dispatch to the versioned builders using
# the same HA rule as the season loop.
(create_avgsV2 if HA == '' else create_avgsV1)('2012-13', dl, ha=HA, lx=LASTX, os=OS)

In [None]:
# To GET next game data from the log instead of the files

def get_ng_data(season, teams):
    """
    Collect each team's game ids and next-game values from a fresh game log.

    season - season string handed to StatBucket.update_log
    teams - iterable of team abbreviations

    returns dict team -> [game ids, next game ids, next plus/minus,
    next matchups]; the lists are ordered by GAME_ID and the three
    "next" lists end with the placeholders 0, 100 and 'None'
    """
    bucket = StatBucket()
    bucket.update_log(season)
    games = bucket.clog.sort_values('GAME_ID')

    per_team = {}
    for team in teams:
        rows = team_filter(games, team)

        game_ids = rows['GAME_ID'].astype('int64').to_list()
        plus_minus = rows['PLUS_MINUS'].astype('int64').to_list()
        matchups = rows['MATCHUP'].to_list()

        # Drop the first game and pad the end so entry i describes game i + 1.
        next_ids = game_ids[1:] + [0]
        next_pm = plus_minus[1:] + [100]
        next_matchups = matchups[1:] + ['None']

        per_team[team] = [game_ids, next_ids, next_pm, next_matchups]
    return per_team