In [None]:
# Created 07/17/2021
import pandas as pd
import numpy as np
import nba_api.stats.endpoints as ep
from datetime import datetime
import re
from os import path
import matplotlib.pyplot as plt

In [None]:
# Three-letter team abbreviations, in alphabetical order (30 entries).
TEAMS = [
    'ATL', 'BKN', 'BOS', 'CHA', 'CHI', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

In [None]:
def team_filter(df, team):
    """Return the rows of ``df`` whose TEAM_ABBREVIATION equals ``team``.

    The original index labels are kept; an unknown team yields an empty frame.
    """
    is_team = df['TEAM_ABBREVIATION'] == team
    return df.loc[is_team]

In [None]:
class ModelStats():
    """Holds per-season stat tables and the filters that trim them to usable games.

    Tables live in ``data_d`` under the key ``file + season`` (e.g. ``'avgs2012-13'``).
    Filters either drop rows from a table directly (``double_filter``, ``ha_filter``,
    ``apply_sgid``) or collect GAME_IDs in ``neglected[season]`` which ``neg_filter``
    then removes from a table.
    """

    def __init__(self):
        self.data_d = {}      # file+season -> DataFrame
        self.seasons = set()  # seasons passed to load_season
        self.files = set()    # file prefixes passed to load_season
        self.neglected = {}   # season -> set of GAME_IDs to drop
        self.sgids = {}       # season -> NEXT_GAME_IDs present in every loaded file

        # Not read or written by any method of this class.
        self.lines = {}
        self.results = {}

    def load_season(self, season, file):
        """Read ``DATA/<folder>/<file><season>.csv`` and register it.

        Files whose name starts with 'home' or 'away' are read from ``avgsV1``,
        everything else from ``avgsV2``. The table is stored sorted by GAME_ID
        and the season's ``neglected`` set is (re)initialised to empty.
        """
        folder = 'avgsV1' if file[:4] in ('home', 'away') else 'avgsV2'
        df = pd.read_csv(f'DATA/{folder}/{file}{season}.csv')
        self.data_d[file+season] = df.sort_values('GAME_ID', kind='mergesort')
        self.neglected[season] = set()
        self.seasons.add(season)
        self.files.add(file)

    def apply_rs_filters(self, x=10):
        """Run the standard filter pipeline on every loaded season.

        The GAME_IDs to drop are derived from the 'avgs' table only (so 'avgs'
        must be loaded) and then removed from every file of the season by game
        id -- not by row index -- so all tables are cut on the same games.
        Non home/away files are additionally reduced to NEXT_GAME_IDs that
        have exactly two rows, and finally every table is restricted to the
        NEXT_GAME_IDs shared by all files.

        x: number of earliest games per team to drop (see ``skip_x``).
        """
        for s in self.seasons:
            self.skip_x(s, x)
            self.os_filter(s, 'avgs', False, s[:4])
            self.lg_filter(s, 'avgs')
            for f in self.files:
                self.neg_filter(s, f)
                if f[:4] not in ('home', 'away'):
                    self.double_filter(s, f)
        self.apply_sgid()

    def apply_sgid(self):
        """Keep, in every table, only rows whose NEXT_GAME_ID is in ``sgids[season]``."""
        for s in self.seasons:
            self.ms_sgid(s, self.files)
            for f in self.files:
                df = self.data_d[f+s]
                self.data_d[f+s] = df[df['NEXT_GAME_ID'].isin(self.sgids[s])]

    def ms_sgid(self, season, files):
        """
        model stats season gid filter
        Store in ``sgids[season]`` the NEXT_GAME_IDs that appear in every one of
        ``files`` for that season, excluding the id 0. An empty ``files``
        yields an empty set.
        """
        shared = None
        for f in files:
            ids = set(self.data_d[f+season]['NEXT_GAME_ID'].tolist())
            shared = ids if shared is None else shared & ids
        if shared is None:
            shared = set()
        shared.discard(0)
        self.sgids[season] = shared

    def double_filter(self, s, f):
        """
        Keep only rows whose NEXT_GAME_ID occurs exactly twice in the table.

        This drops rows from the table directly; it does not touch ``neglected``.
        """
        df = self.data_d[f+s]
        rows_per_game = df['NEXT_GAME_ID'].map(df['NEXT_GAME_ID'].value_counts())
        self.data_d[f+s] = df[rows_per_game == 2]

    # NEGLECTED STUFF, USES NEGLECTED TO REMOVE UNWANTED GID's

    def skip_x(self, s, x):
        """
        Add to ``neglected[s]`` the GAME_IDs of each team's first ``x`` rows
        (ordered by GAME_ID) in the 'avgs' table.

        Every team is treated the same and exactly ``x`` rows per team are
        skipped; the earlier loop skipped x+1 for the first team and x+2 for
        every other team.
        """
        df = self.data_d['avgs'+s].sort_values('GAME_ID', kind='mergesort')
        # 0-based position of each row within its team's GAME_ID-ordered rows.
        nth_game = df.groupby('TEAM_ABBREVIATION').cumcount()
        early = df.loc[nth_game < x, 'GAME_ID']
        self.neglected[s].update(int(gid) for gid in early)

    def date_filter(self, s, f, sd=datetime(1969, 1, 1), ed=datetime(2050, 1, 1)):
        """
        Add to ``neglected[s]`` every GAME_ID whose GAME_DATE is before ``sd``
        or after ``ed`` (both bounds inclusive, i.e. games on sd/ed are kept).

        GAME_DATE is parsed as '%Y-%m-%d'. Each row is judged by its own date,
        so the result no longer depends on GAME_IDs being ordered by date.
        """
        df = self.data_d[f+s]
        dates = pd.to_datetime(df['GAME_DATE'], format="%Y-%m-%d")
        outside = (dates < sd) | (dates > ed)
        self.neglected[s].update(int(gid) for gid in df.loc[outside, 'GAME_ID'])

    def os_filter(self, s, f, os, year):
        """
        Add to ``neglected[s]`` the GAME_IDs whose SEASON_ID equals
        ``int(prefix + year)``: prefix '2' when ``os`` is truthy, '4' otherwise.

        NOTE(review): '2'/'4' presumably mark regular-season / playoff
        SEASON_IDs -- confirm against the data source.
        ``year`` was previously ignored in favour of a hard-coded '2015'.
        """
        df = self.data_d[f+s]
        unwanted_sid = int(('2' if os else '4') + str(year))
        hits = df.loc[df['SEASON_ID'] == unwanted_sid, 'GAME_ID']
        self.neglected[s].update(hits.tolist())

    def ha_filter(self, s, f, ha):
        """
        Reduce the table to rows by MATCHUP text: ``ha == 'h'`` keeps rows
        without an '@', any other value keeps rows containing '@'.
        """
        df = self.data_d[f+s]
        has_at = df['MATCHUP'].astype(str).str.contains('@', regex=False)
        self.data_d[f+s] = df[~has_at] if ha == 'h' else df[has_at]

    def lg_filter(self, s, f):
        """
        Add to ``neglected[s]`` the GAME_IDs whose NEXT_GAME_ID is 0 or whose
        NEXT_GAME_ID, written as text, starts with '4'.

        NOTE(review): 0 presumably means "no next game" and a leading '4' a
        playoff game id -- confirm.
        """
        df = self.data_d[f+s]
        ngid = df['NEXT_GAME_ID']
        unwanted = (ngid == 0) | ngid.astype(str).str.startswith('4')
        self.neglected[s].update(df.loc[unwanted, 'GAME_ID'].tolist())

    # APPLICATIONS OF FILTERS ON NEGLECTED OR A CHOSEN GIDLIST

    def neg_filter(self, s, f):
        """Drop from the table every row whose GAME_ID is in ``neglected[s]``."""
        df = self.data_d[f+s]
        self.data_d[f+s] = df[~df['GAME_ID'].isin(list(self.neglected[s]))]

In [None]:
# Module-level column-name lists. Neither is referenced by the code visible in
# this notebook (Model keeps its own ``self.normcols``).
normcols = [
    'GAME_ID',
    'NEXT_GAME_ID',
    'TEAM_ABBREVIATION',
    'MATCHUP',
    'PTS',
]
rescols = [
    'NEXT_MATCHUP',
    'NEXT_PLUS_MINUS',
    'NEXT_SPREAD',
    'NEXT_O/U',
]
class Model():
    """Turns the filtered tables of a ModelStats-like object into per-game predictions.

    For every season and file a weighted ``calcfunc(home_row, away_row, colweights)``
    value is stored per NEXT_GAME_ID in ``cf_numbers``; ``ecalcfunc`` then combines
    the per-file values into one number per game (``end_d``), and ``results`` holds
    a DataFrame per season joining those numbers with the recorded outcomes.
    """

    def __init__(self):

        self.cf_numbers = {}   # file+season -> {NEXT_GAME_ID: weighted calcfunc value}
        self.end_d = {}        # season -> {NEXT_GAME_ID: combined prediction}

        self.results = {}      # season -> DataFrame built by format_results

        self.normcols = ['GAME_ID', 'NEXT_GAME_ID', 'PTS', 'MATCHUP']

        self.outcheck = {}     # season -> True once outcomes[season] was built in this run
        self.outcomes = {}     # season -> {NEXT_GAME_ID: [id, matchup, plus_minus, spread, o/u]}
        self.outcols = ['NEXT_PLUS_MINUS', 'NEXT_SPREAD', 'NEXT_O/U']

    # The dict/list defaults below are shared between calls; they are only read here.
    def run_model(self, ms, seasons, files, calcfunc, calccols, ecalcfunc, fweights={'avgs':0.2, 'last10':0.5, 'haavgs':0.3}, colweights=[0.4,0.25,0.2,0.15]):
        """Compute ``cf_numbers``, ``end_d`` and ``results`` for the given seasons.

        ms:         object exposing ``data_d`` (file+season -> DataFrame).
        files:      file prefixes; 'home*' triggers the home/away computation
                    (its 'away*' partner is read alongside), 'away*' is skipped,
                    anything else is handled as a two-rows-per-game table.
        calcfunc:   ``calcfunc(home_values, away_values, colweights)`` -> number.
        calccols:   columns handed to ``calcfunc``, in this order.
        ecalcfunc:  combines the list returned by ``get_edata`` into one number.
        fweights:   weight per file key ('home*'/'away*' pairs use 'ha' + suffix).
        """
        for s in seasons:
            # Rebuild the outcomes for this season on every run.
            self.outcheck[s] = False
            for f in files:
                if f[:4] == 'home':
                    self.ha_files_np(ms, s, f, calcfunc, calccols, colweights, fweights)
                elif f[:4] == 'away':
                    continue
                else:
                    self.norm_files_np(ms, s, f, calcfunc, calccols, colweights, fweights)
        self.apply_end(seasons, list(fweights.keys()), ecalcfunc)
        self.format_results(seasons)

    def norm_files_np(self, ms, s, f, calcfunc, calccols, colweights, fweights):
        """Fill ``cf_numbers[f+s]`` from a table holding two rows per NEXT_GAME_ID.

        Rows are ordered by NEXT_GAME_ID and walked in steps of two. Within a
        pair, the row whose NEXT_MATCHUP has '@' at index 4 is taken as the
        away row and the other one as the home row. On the first call for a
        season in a run, the home row's NEXT_* outcome columns are also stored
        in ``outcomes[s]``.
        """
        df = ms.data_d[f+s].sort_values('NEXT_MATCHUP')
        df = df.sort_values('NEXT_GAME_ID', kind='mergesort')
        ndata = df[calccols].to_numpy()
        hngid = df['NEXT_GAME_ID'].tolist()
        matchups = df['NEXT_MATCHUP'].tolist()
        assert(len(ndata)%2 == 0)

        # (home_row, away_row) positions for every game.
        pairs = []
        for i in range(0, len(hngid), 2):
            assert hngid[i] == hngid[i+1], "adjacent rows must belong to the same NEXT_GAME_ID"
            if matchups[i][4] == '@':
                pairs.append((i+1, i))
            else:
                pairs.append((i, i+1))

        if not self.outcheck[s]:
            pm = df['NEXT_PLUS_MINUS'].tolist()
            spread = df['NEXT_SPREAD'].tolist()
            ou = df['NEXT_O/U'].tolist()
            # Outcomes are recorded from the home row: home plus-minus, home spread.
            self.outcomes[s] = {
                hngid[h]: [hngid[h], matchups[h], pm[h], spread[h], ou[h]]
                for h, _ in pairs
            }
            self.outcheck[s] = True

        self.cf_numbers[f+s] = {
            hngid[h]: fweights[f] * calcfunc(ndata[h], ndata[a], colweights)
            for h, a in pairs
        }

    def ha_files_np(self, ms, s, f, calcfunc, calccols, colweights, fweights):
        """Fill ``cf_numbers['ha' + suffix + s]`` from the 'home*'/'away*' table pair.

        Both tables are sorted by NEXT_GAME_ID and must list the same ids in the
        same order; row i of the home table is matched with row i of the away
        table. The weight is ``fweights['ha' + suffix]``.
        """
        fs = f[4:]+s
        dfh = ms.data_d['home'+fs].sort_values('NEXT_GAME_ID')
        dfa = ms.data_d['away'+fs].sort_values('NEXT_GAME_ID')
        hnp = dfh[calccols].to_numpy()
        anp = dfa[calccols].to_numpy()
        hngid = dfh['NEXT_GAME_ID'].tolist()
        assert(len(hnp)==len(anp))
        assert(hngid==dfa['NEXT_GAME_ID'].tolist())
        weight = fweights['ha'+f[4:]]
        self.cf_numbers['ha'+fs] = {
            gid: weight * calcfunc(home, away, colweights)
            for gid, home, away in zip(hngid, hnp, anp)
        }

    def apply_end(self, seasons, efiles, ecfunc):
        """Store ``ecfunc(get_edata(s, gid))`` in ``end_d[s]`` for every game id of 'avgs'.

        ``efiles`` is accepted but not used: ``get_edata`` reads a fixed set of keys.
        """
        for s in seasons:
            self.end_d[s] = {
                gid: ecfunc(self.get_edata(s, gid))
                for gid in list(self.cf_numbers['avgs'+s])
            }

    def get_edata(self, s, i):
        """Return the [avgs, last10, haavgs] values of game id ``i`` in season ``s``."""
        return [self.cf_numbers['avgs'+s][i], self.cf_numbers['last10'+s][i], self.cf_numbers['haavgs'+s][i]]

    def format_results(self, seasons):
        """Build ``results[s]``: one row per recorded outcome plus a PREDICTION column.

        Predictions are looked up by GAME_ID, so the result does not depend on
        ``outcomes`` and ``end_d`` sharing the same key order.
        """
        for s in seasons:
            df1 = pd.DataFrame(list(self.outcomes[s].values()), columns = ['GAME_ID', 'MATCHUP', 'PLUS_MINUS', 'SPREAD', 'O/U'])
            df1['PREDICTION'] = df1['GAME_ID'].map(self.end_d[s])
            self.results[s] = df1
        
        
        
        
#     def ha_files(self, ms, s, f, calcfunc, calccols, colweights, fweights):
#         fs = f[4:]+s
#         out_d = {}
#         dfh = ms.data_d['home'+fs][self.normcols+calccols]
#         dfa = ms.data_d['away'+fs][self.normcols+calccols]
#         for ngid in ms.sgids[s]: 
#             hdata = dfh[dfh['NEXT_GAME_ID']==ngid]
#             adata = dfa[dfa['NEXT_GAME_ID']==ngid]
# #             out_d[int(hdata['NEXT_GAME_ID'])] = [hdata['PTS'], adata['PTS']]
#             out_d[hdata['NEXT_GAME_ID']] = [fweighst['ha'+f[4:]] * (calcfunc(hdata[calccols].to_numpy()[0], adata[calccols].to_numpy()[0], colweights))]
#         self.cf_numbers['ha'+fs] = out_d
                
                
        

In [None]:
# Seasons to load; each string is combined with a FILES prefix to form the
# CSV name that ModelStats.load_season reads.
SEASONS = [
    '2012-13', '2013-14', '2014-15', '2015-16',
    '2016-17', '2017-18', '2018-19', '2019-20',
]
# File prefixes loaded for every season.
FILES = ['avgs', 'last10', 'homeavgs', 'awayavgs']

# Columns handed to diff_calc as `calccols`: a team's own four values first,
# then the opponent counterparts in the same order.
# The order is shooting, turnovers, rebounding, free throws because diff_calc
# reads positions 0..3 as efg / tov / rb / ft and run_model's default
# colweights [0.4, 0.25, 0.2, 0.15] are applied by position. With the previous
# order (EFG, FTA, TOV, OREB) the 0.25 / 0.2 / 0.15 weights were applied to
# free throws / turnovers / rebounding respectively.
fourf = ["EFG_PCT",
        "TM_TOV_PCT",
        "OREB_PCT",
        "FTA_RATE",
        "OPP_EFG_PCT",
        "OPP_TOV_PCT",
        "OPP_OREB_PCT",
        "OPP_FTA_RATE"]

# Same keys as run_model's default `fweights`; not referenced elsewhere in the
# cells visible here.
E_FILES = ['avgs', 'last10', 'haavgs']

# from rothmodel / for rothmodel

# def diff_calc(home, away, weights):
#     efg_h = home[0] - away[0]
#     tov_h = home[1] - away[1]
#     rb_h  = home[2] - away[2]
#     ft_h  = home[3] - away[3]
#     efg_a = home[4] - away[4]
#     tov_a = home[5] - away[5]
#     rb_a  = home[6] - away[6]
#     ft_a  = home[7] - away[7]
#     efg_d =  (efg_h - efg_a) * weights[0]
#     tov_d =  (tov_h - tov_a) * weights[1]
#     rb_d  =  (rb_h  - rb_a)  * weights[2]
#     ft_d  =  (ft_h  - ft_a)  * weights[3]
#     total = (efg_d + tov_d + rb_d + ft_d) * 2
#     return total

def diff_calc(home, away, weights):
    efg_h = home[0] - home[4]
    tov_h = home[1] - home[5]
    rb_h  = home[2] - home[6]
    ft_h  = home[3] - home[7]
    efg_a = away[0] - away[4]
    tov_a = away[1] - away[5]
    rb_a  = away[2] - away[6]
    ft_a  = away[3] - away[7]
    efg_d =  (efg_h - efg_a) * weights[0]
    tov_d =  (tov_h - tov_a) * weights[1]
    rb_d  =  (rb_h  - rb_a)  * weights[2]
    ft_d  =  (ft_h  - ft_a)  * weights[3]
    total = (efg_d + tov_d + rb_d + ft_d) * 2
    return total

def end_calculation(game_data, offset=2):
    """Combine one game's three per-file values into the final prediction.

    game_data: indexable whose first three entries are summed (any further
               entries are ignored).
    offset:    constant added to the sum. Defaults to 2, the value that used
               to be hard-coded. NOTE(review): presumably a home-court edge --
               confirm.
    """
    return game_data[0] + game_data[1] + game_data[2] + offset
        
        
    
    
#     def end_calculation(seasons, pred_d):
#     end_pred = {}
#     for i in seasons:
#         new_pred = []
#         for j in range(len(pred_d['avgs'+i])):
#             prediction = pred_d['avgs'+i][j] + pred_d['last10'+i][j] + pred_d['haavgs'+i][j] + 2
#             new_pred.append(prediction)
#         end_pred[i] = new_pred
#     return end_pred

In [None]:
# Load every (season, file) table, then cut all tables down to comparable games.
ms = ModelStats()
for s in SEASONS:
    for f in FILES:
        ms.load_season(season=s, file=f)

ms.apply_rs_filters(x=10)
# Inspect one filtered table, ordered by the game each row is used to predict.
ms.data_d['avgs2017-18'].sort_values(by='NEXT_GAME_ID')

In [None]:
# Run the model over everything that was loaded, using the default file and
# column weights, then look at one season's result table.
m = Model()
m.run_model(
    ms=ms,
    seasons=ms.seasons,
    files=ms.files,
    calcfunc=diff_calc,
    calccols=fourf,
    ecalcfunc=end_calculation,
)
m.results['2012-13']

#### Plus minus + Spread  
    pm positive: home team won
    pm negative: away team won
    spread positive: home team is getting points (home underdog)
    spread negative: home team is giving points and must win by more than that many
    
    pm + spread = positive if the home-team bet hit
                = negative if the away-team bet hit
                
#### BET vs our prediction:  
    if our prediction + spread is >= 0: we bet the home team ('h')
    otherwise (our prediction is less than -spread): we bet the away team ('a')
        
        
    
        

In [None]:
# Grade the 2012-13 predictions against the spread.
# Work on a copy so re-running this cell never alters the model's stored results.
df = m.results['2012-13'].copy()

# PLUS_MINUS + SPREAD >= 0 -> the home side of the bet won ('h'), else away ('a').
df['WINNING_BET'] = np.where(df['PLUS_MINUS'] + df['SPREAD'] >= 0, 'h', 'a')
# Same rule with our predicted margin in place of the real one.
df['OUR_BET'] = np.where(df['SPREAD'] + df['PREDICTION'] >= 0, 'h', 'a')
df['HIT'] = df['WINNING_BET'] == df['OUR_BET']
# df['PM-PRED'] = df['PLUS_MINUS'] - df['PREDICTION']
# df['SPREAD+PRED'] = df['SPREAD'] + df['PREDICTION']

# The mean of a boolean column is hits / total. Unlike value_counts()[1] / [0]
# it also works when every bet hit (or every bet missed) and does not rely on
# integer lookups into a boolean index.
print(f"ACCURACY = {df['HIT'].mean()}")
df

In [None]:
# Mean predicted margin, split by whether our bet hit.
print(f"average pred for Hite: {df.loc[df['HIT']==True, 'PREDICTION'].mean()}")
print(f"average pred for Misses: {df.loc[df['HIT']==False, 'PREDICTION'].mean()}")

In [None]:
# Distribution of our predicted margin over the games where our bet hit.
# Title and axis labels added so the figure can be told apart from the
# "misses" histogram when skimming.
fig, ax = plt.subplots()
ax.hist(df[df['HIT']==True]['PREDICTION'], bins=10)
ax.set(title='PREDICTION distribution: hits', xlabel='PREDICTION', ylabel='games')
plt.show()

In [None]:
# Distribution of our predicted margin over the games where our bet missed.
# Title and axis labels added so the figure can be told apart from the
# "hits" histogram when skimming.
fig, ax = plt.subplots()
ax.hist(df[df['HIT']==False]['PREDICTION'], bins=10)
ax.set(title='PREDICTION distribution: misses', xlabel='PREDICTION', ylabel='games')
plt.show()

In [None]:
# Which file+season combinations have computed values.
list(m.cf_numbers)

In [None]:
# m.cf_numbers['haavgs2017-18']
# cf_numbers[...] is a dict keyed by NEXT_GAME_ID (and id 0 is filtered out
# upstream), so indexing it with [0] raises KeyError. Peek at the first
# (game id, value) entry instead.
next(iter(m.cf_numbers['avgs2017-18'].items()))

In [None]:
# Sanity check: predictions and outcomes list the same game ids in the same order.
list(m.end_d['2017-18'].keys()) == list(m.outcomes['2017-18'].keys())

In [None]:
# Game ids that have a recorded outcome for 2017-18.
[*m.outcomes['2017-18']]

In [None]:
# Manual rebuild of the 2017-18 result table: outcomes plus predictions, paired by position.
df1 = pd.DataFrame(
    list(m.outcomes['2017-18'].values()),
    columns=['GAME_ID', 'MATCHUP', 'PLUS_MINUS', 'SPREAD', 'O/U'],
)
df1 = df1.assign(PREDICTION=list(m.end_d['2017-18'].values()))
df1

In [None]:
# Look up the rows that feed one specific upcoming game.
df = ms.data_d['avgs2012-13'].sort_values(by='NEXT_GAME_ID')
df.loc[df['NEXT_GAME_ID'] == 21200213]

In [None]:
# Scratch data for trying out DataFrame construction from dicts of rows.
cols = ['pm', 'spread', 'o/u']
x = dict(zip((1, 2, 3), ([1, 2, 3], [4, 5, 6], [7, 8, 9])))

cols2 = ['PREDICTION']
y = {key: [11 * key] for key in (1, 2, 3)}

x.values()

In [None]:
# Scratch: build one frame per dict (dict keys become the index) and put them side by side.
df1 = pd.DataFrame(list(x.values()), index=list(x), columns=cols)
df2 = pd.DataFrame(list(y.values()), index=list(y), columns=cols2)
pd.concat((df1, df2), axis='columns')

In [None]:
# `x` is a plain dict keyed 1..3, so x['pm'] raised KeyError, and neither dict
# nor DataFrame has a .crosstab() method (pd.crosstab is a function that needs
# explicit row/column factors). Build the frame first, then derive the column.
x_df = pd.DataFrame(x.values(), index=x.keys(), columns=cols)
x_df['sdiff'] = x_df['pm'] - x_df['spread']
x_df

In [None]:
# Scratch check: remainder of 36 divided by 2 (0, i.e. even).
36 % 2

In [None]:
# Scratch check of the slice Model uses as f[4:]: drops the first four characters.
# Note: this rebinds the notebook-global name `s`.
s = 'homeavgs'
s[4:]

In [None]:
# NOTE(review): `xx` is not defined in any cell visible here; on a fresh kernel
# this line raises NameError unless `xx` is created elsewhere -- confirm or remove.
xx += [4]