In [210]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

import warnings # necessary b/c pandas & statsmodels datetime issue
warnings.simplefilter(action="ignore")

FEATURES TO IMPLEMENT

- HOME RECORD
- AWAY RECORD
- ELO

FEATURES ALREADY IMPLEMENTED IN DATA CLEANING NOTEBOOK
* FG MISSED
* FG(3) MISSED
* FG(2) COLUMNS
* OVERTIME COLUMN
* TURNOVERS FORCED
* SHOT ATTEMPTS BLOCKED

In [211]:
# Load the cleaned game-level table and discard its 'Unnamed: 0' column.
df = (
    pd.read_csv('./cleaned_data/team_data/cleaned_games_full.csv')
    .drop(columns='Unnamed: 0')
)

In [212]:
df.shape

(7380, 94)

In [213]:
# Columns that describe the game itself rather than one of the two teams.
gen_cols = ['GAMEID', 'DATE', 'SEASON', 'OTs']
# Split the remaining columns by the side they belong to, based on the
# '_AWAY' / '_HOME' marker in the column name.
away_cols = [name for name in df.columns if '_AWAY' in name]
home_cols = [name for name in df.columns if '_HOME' in name]


In [214]:
home_cols

['TEAM_HOME',
 'FG_HOME',
 'FGA_HOME',
 'FG_PCT_HOME',
 'FG(3)_HOME',
 'FGA(3)_HOME',
 'FG_PCT(3)_HOME',
 'FT_HOME',
 'FTA_HOME',
 'FT_PCT_HOME',
 'ORB_HOME',
 'DRB_HOME',
 'TRB_HOME',
 'AST_HOME',
 'STL_HOME',
 'BLK_HOME',
 'TOV_HOME',
 'PF_HOME',
 'PTS_HOME',
 'TS_PCT_HOME',
 'EFG_PCT_HOME',
 '3PA_R_HOME',
 'FT_R_HOME',
 'ORB_PCT_HOME',
 'DRB_PCT_HOME',
 'TRB_PCT_HOME',
 'AST_PCT_HOME',
 'STL_PCT_HOME',
 'BLK_PCT_HOME',
 'TOV_PCT_HOME',
 'O_RTG_HOME',
 'D_RTG_HOME',
 'WINS_RECORD_HOME',
 'LOSSES_RECORD_HOME',
 'WIN_PCT_HOME',
 'GAME_NO_HOME',
 'WIN_HOME',
 'LOSS_HOME',
 'FG(3)_MISSED_HOME',
 'FG_MISSED_HOME',
 'BLOCKED_ATTEMPTS_HOME',
 'TOV_FORCED_HOME',
 'FG(2)_HOME',
 'FGA(2)_HOME',
 'FG(2)_MISSED_HOME']

In [215]:
# Parse DATE into datetimes so later cells can filter games with date comparisons.
df['DATE'] = pd.to_datetime(df['DATE'])

Initial stages of my function to get a team's average stats over the N-game stretch prior to a given date. This function will be used when modeling the data itself, rather than on the dataset created after cleaning the data.

In [216]:
# Independent copy of the rows whose SEASON is 2014, used as a small sandbox for prototyping.
df_14 = df.loc[df['SEASON'] == 2014].copy()

In [217]:
# Indiana's 2014 games (home or away) played strictly before 2014-04-06.
is_ind_game = (df_14['TEAM_HOME'] == 'IND') | (df_14['TEAM_AWAY'] == 'IND')
is_before_cutoff = df_14['DATE'] < '2014-04-06'
ind_df_14 = df_14.loc[is_ind_game & is_before_cutoff].copy()

In [218]:
# Renumber the rows 0..N-1 and throw the old row labels away.
ind_df_14 = ind_df_14.reset_index(drop=True)

In [219]:
# Last ten rows of Indiana's pre-cutoff games.
# NOTE(review): "last" means last in row order — presumably chronological; confirm df is date-sorted.
final_ten = ind_df_14.tail(10).copy()

In [220]:
# Partition the ten-game window by the side Indiana played on.
played_at_home = final_ten['TEAM_HOME'] == 'IND'
played_away = final_ten['TEAM_AWAY'] == 'IND'
home = final_ten[played_at_home]
away = final_ten[played_away]

In [221]:
# Keep only the columns for Indiana's own side of each game, as independent copies.
home = home[home_cols].copy()
away = away[away_cols].copy()

In [222]:
# Remove the side marker so both frames share one set of column names.
home = home.rename(columns=lambda name: name.replace('_HOME', ''))
away = away.rename(columns=lambda name: name.replace('_AWAY', ''))

In [223]:
home.columns

Index(['TEAM', 'FG', 'FGA', 'FG_PCT', 'FG(3)', 'FGA(3)', 'FG_PCT(3)', 'FT',
       'FTA', 'FT_PCT', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'TS_PCT', 'EFG_PCT', '3PA_R', 'FT_R', 'ORB_PCT', 'DRB_PCT',
       'TRB_PCT', 'AST_PCT', 'STL_PCT', 'BLK_PCT', 'TOV_PCT', 'O_RTG', 'D_RTG',
       'WINS_RECORD', 'LOSSES_RECORD', 'WIN_PCT', 'GAME_NO', 'WIN', 'LOSS',
       'FG(3)_MISSED', 'FG_MISSED', 'BLOCKED_ATTEMPTS', 'TOV_FORCED', 'FG(2)',
       'FGA(2)', 'FG(2)_MISSED'],
      dtype='object')

In [224]:
away.columns

Index(['TEAM', 'FG', 'FGA', 'FG_PCT', 'FG(3)', 'FGA(3)', 'FG_PCT(3)', 'FT',
       'FTA', 'FT_PCT', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'TS_PCT', 'EFG_PCT', '3PA_R', 'FT_R', 'ORB_PCT', 'DRB_PCT',
       'TRB_PCT', 'AST_PCT', 'STL_PCT', 'BLK_PCT', 'TOV_PCT', 'O_RTG', 'D_RTG',
       'WINS_RECORD', 'LOSSES_RECORD', 'WIN_PCT', 'GAME_NO', 'WIN', 'LOSS',
       'FG(3)_MISSED', 'FG_MISSED', 'BLOCKED_ATTEMPTS', 'TOV_FORCED', 'FG(2)',
       'FGA(2)', 'FG(2)_MISSED'],
      dtype='object')

In [225]:
away.columns == home.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [226]:
# Stack Indiana's home and away rows into a single per-game table.
total = pd.concat([home,away])
# NOTE(review): the suffix below is spelled with the letter 'O' ('1O'), not the digit zero.
# The later drop of 'TEAM_LAST_1O_GAMES' uses the same spelling, so the two must be changed together.
total.columns = [x+'_LAST_1O_GAMES' for x in total.columns]
total

Unnamed: 0,TEAM_LAST_1O_GAMES,FG_LAST_1O_GAMES,FGA_LAST_1O_GAMES,FG_PCT_LAST_1O_GAMES,FG(3)_LAST_1O_GAMES,FGA(3)_LAST_1O_GAMES,FG_PCT(3)_LAST_1O_GAMES,FT_LAST_1O_GAMES,FTA_LAST_1O_GAMES,FT_PCT_LAST_1O_GAMES,...,GAME_NO_LAST_1O_GAMES,WIN_LAST_1O_GAMES,LOSS_LAST_1O_GAMES,FG(3)_MISSED_LAST_1O_GAMES,FG_MISSED_LAST_1O_GAMES,BLOCKED_ATTEMPTS_LAST_1O_GAMES,TOV_FORCED_LAST_1O_GAMES,FG(2)_LAST_1O_GAMES,FGA(2)_LAST_1O_GAMES,FG(2)_MISSED_LAST_1O_GAMES
68,IND,34,79,0.43,2,11,0.182,21,25,0.84,...,69.0,1,0,9,45,10,13,32,68,36
71,IND,30,81,0.37,7,14,0.5,17,21,0.81,...,72.0,1,0,7,51,9,18,23,67,44
74,IND,26,69,0.377,6,14,0.429,19,24,0.792,...,75.0,1,0,8,43,5,8,20,55,35
75,IND,39,79,0.494,5,18,0.278,18,24,0.75,...,76.0,0,1,13,40,4,5,34,61,27
67,IND,31,73,0.425,5,23,0.217,19,27,0.704,...,68.0,0,1,18,42,2,10,26,50,24
69,IND,27,74,0.365,6,15,0.4,11,14,0.786,...,70.0,0,1,9,47,8,12,21,59,38
70,IND,27,73,0.37,4,10,0.4,19,21,0.905,...,71.0,0,1,6,46,7,10,23,63,40
72,IND,28,79,0.354,2,13,0.154,20,25,0.8,...,73.0,0,1,11,51,4,8,26,66,40
73,IND,29,78,0.372,8,22,0.364,10,16,0.625,...,74.0,0,1,14,49,1,15,21,56,35
76,IND,33,78,0.423,6,11,0.545,22,28,0.786,...,77.0,1,0,5,45,5,11,27,67,40


In [227]:
# Remove the team label column, then average every remaining column over the stacked games.
total.drop('TEAM_LAST_1O_GAMES',axis=1, inplace=True)
total.mean()

FG_LAST_1O_GAMES                   30.400000
FGA_LAST_1O_GAMES                  76.300000
FG_PCT_LAST_1O_GAMES                0.398000
FG(3)_LAST_1O_GAMES                 5.100000
FGA(3)_LAST_1O_GAMES               15.100000
FG_PCT(3)_LAST_1O_GAMES             0.346900
FT_LAST_1O_GAMES                   17.600000
FTA_LAST_1O_GAMES                  22.500000
FT_PCT_LAST_1O_GAMES                0.779800
ORB_LAST_1O_GAMES                   9.200000
DRB_LAST_1O_GAMES                  32.200000
TRB_LAST_1O_GAMES                  41.400000
AST_LAST_1O_GAMES                  17.300000
STL_LAST_1O_GAMES                   5.400000
BLK_LAST_1O_GAMES                   4.800000
TOV_LAST_1O_GAMES                  13.100000
PF_LAST_1O_GAMES                   20.600000
PTS_LAST_1O_GAMES                  83.500000
TS_PCT_LAST_1O_GAMES                0.483700
EFG_PCT_LAST_1O_GAMES               0.431500
3PA_R_LAST_1O_GAMES                 0.198600
FT_R_LAST_1O_GAMES                  0.295400
ORB_PCT_LA

In [228]:
def get_avg_stats_last_n_games(year,team,date,n):
    """Average a team's stats over its most recent games before ``date``.

    Reads the notebook-level ``df``. Games are restricted to rows whose
    ``SEASON`` equals ``year`` so earlier seasons cannot leak in, then to rows
    where ``team`` is either ``TEAM_HOME`` or ``TEAM_AWAY`` and ``DATE`` is
    strictly before ``date``. The last ``n`` such rows (in ``df`` row order —
    presumably chronological; confirm) are used. Only the team's own side of
    each game contributes to the averages.

    Parameters
    ----------
    year : value compared for equality against ``df['SEASON']``.
    team : value compared for equality against ``TEAM_HOME`` / ``TEAM_AWAY``.
    date : exclusive upper bound compared against ``df['DATE']``.
    n : maximum number of games in the window.

    Returns
    -------
    pandas.Series
        Column means keyed ``<STAT>_LAST_{n}_GAMES`` (the mean of ``WIN`` is
        reported as ``WIN_PCT_LAST_{n}_GAMES``), plus ``WINS_LAST_{n}_GAMES``
        and ``LOSSES_LST_{n}_GAMES``. The win/loss counts are based on the
        number of games actually found, which can be smaller than ``n`` early
        in a season. All values are NaN when no game qualifies.
    """
    # SEPARATING HOME AND AWAY COLUMNS
    home_cols = [col for col in df.columns if '_HOME' in col]
    away_cols = [col for col in df.columns if '_AWAY' in col]

    # MAKING SURE PREVIOUS SEASONS DON'T LEAK INTO THE DATA
    season_games = df.loc[df['SEASON'] == year]
    involves_team = (season_games['TEAM_HOME'] == team) | (season_games['TEAM_AWAY'] == team)
    # GAMES BEFORE THE DATE, BOTH HOME AND AWAY, THEN THE FINAL N OF THEM
    recent = season_games.loc[involves_team & (season_games['DATE'] < date)].tail(n)

    # KEEP ONLY THE TEAM'S OWN SIDE OF EACH GAME
    home = recent.loc[recent['TEAM_HOME'] == team, home_cols].copy()
    away = recent.loc[recent['TEAM_AWAY'] == team, away_cols].copy()
    # REMOVING THE HOME AND AWAY COLUMN INDICATORS FOR A SUCCESSFUL CONCATENATION
    home.columns = [x.replace('_HOME', '') for x in home.columns]
    away.columns = [x.replace('_AWAY', '') for x in away.columns]
    total = pd.concat([home, away])

    # DROPPING UNNECESSARY COLUMNS
    total = total.drop(columns=['TEAM', 'LOSS', 'WINS_RECORD', 'LOSSES_RECORD', 'GAME_NO', 'WIN_PCT'])
    # THE MEAN OF THE 0/1 WIN COLUMN IS A WIN PERCENTAGE, SO NAME IT ACCORDINGLY
    total = total.rename(columns={'WIN': 'WIN_PCT'})
    total.columns = [f'{x}_LAST_{n}_GAMES' for x in total.columns]

    # The window may hold fewer than n games; count what is really there so
    # wins and losses are whole games rather than being scaled up to n.
    games_played = len(total)
    averages = total.mean()
    averages[f'WINS_LAST_{n}_GAMES'] = averages[f'WIN_PCT_LAST_{n}_GAMES'] * games_played
    # Key spelling ('LST') is kept as-is because callers may index by it.
    averages[f'LOSSES_LST_{n}_GAMES'] = games_played - averages[f'WINS_LAST_{n}_GAMES']
    return averages

In [229]:
def get_avg_home_stats_last_n_games(year,team,date,n):
    """Average a team's stats over its most recent home games before ``date``.

    Reads the notebook-level ``df``. Rows are limited to ``SEASON == year``,
    ``TEAM_HOME == team`` and ``DATE < date``; the last ``n`` of them (in
    ``df`` row order — presumably chronological; confirm) form the window.

    Parameters
    ----------
    year : value compared for equality against ``df['SEASON']``.
    team : value compared for equality against ``df['TEAM_HOME']``.
    date : exclusive upper bound compared against ``df['DATE']``.
    n : maximum number of home games in the window.

    Returns
    -------
    pandas.Series
        Column means keyed ``<STAT>_LAST_{n}_GAMES_HOME`` (the mean of ``WIN``
        is reported as ``WIN_PCT_LAST_{n}_GAMES_HOME``), plus
        ``WINS_LAST_{n}_GAMES_HOME`` and ``LOSSES_LST_{n}_GAMES_HOME``. The
        win/loss counts use the number of games actually found, which can be
        smaller than ``n``. All values are NaN when no game qualifies.
    """
    home_cols = [col for col in df.columns if '_HOME' in col]

    # Same-season home games strictly before the date, newest n only.
    season_games = df.loc[df['SEASON'] == year]
    is_prior_home_game = (season_games['TEAM_HOME'] == team) & (season_games['DATE'] < date)
    home = season_games.loc[is_prior_home_game, home_cols].tail(n).copy()

    home.columns = [x.replace('_HOME', '') for x in home.columns]
    home = home.drop(columns=['TEAM', 'LOSS', 'WINS_RECORD', 'LOSSES_RECORD', 'GAME_NO', 'WIN_PCT'])
    # The mean of the 0/1 WIN column is a win percentage, so name it accordingly.
    home = home.rename(columns={'WIN': 'WIN_PCT'})
    home.columns = [f'{x}_LAST_{n}_GAMES_HOME' for x in home.columns]

    # The window may hold fewer than n games; count what is really there so
    # wins and losses are whole games rather than being scaled up to n.
    games_played = len(home)
    averages = home.mean()
    averages[f'WINS_LAST_{n}_GAMES_HOME'] = averages[f'WIN_PCT_LAST_{n}_GAMES_HOME'] * games_played
    # Key spelling ('LST') is kept as-is because callers may index by it.
    averages[f'LOSSES_LST_{n}_GAMES_HOME'] = games_played - averages[f'WINS_LAST_{n}_GAMES_HOME']
    return averages

In [230]:
def get_avg_away_stats_last_n_games(year,team,date,n):
    """Average a team's stats over its most recent away games before ``date``.

    Reads the notebook-level ``df``. Rows are limited to ``SEASON == year``,
    ``TEAM_AWAY == team`` and ``DATE < date``; the last ``n`` of them (in
    ``df`` row order — presumably chronological; confirm) form the window.

    Parameters
    ----------
    year : value compared for equality against ``df['SEASON']``.
    team : value compared for equality against ``df['TEAM_AWAY']``.
    date : exclusive upper bound compared against ``df['DATE']``.
    n : maximum number of away games in the window.

    Returns
    -------
    pandas.Series
        Column means keyed ``<STAT>_LAST_{n}_GAMES_AWAY`` (the mean of ``WIN``
        is reported as ``WIN_PCT_LAST_{n}_GAMES_AWAY``), plus
        ``WINS_LAST_{n}_GAMES_AWAY`` and ``LOSSES_LST_{n}_GAMES_AWAY``. The
        win/loss counts use the number of games actually found, which can be
        smaller than ``n``. All values are NaN when no game qualifies.
    """
    away_cols = [col for col in df.columns if '_AWAY' in col]

    # Same-season away games strictly before the date, newest n only.
    season_games = df.loc[df['SEASON'] == year]
    is_prior_away_game = (season_games['TEAM_AWAY'] == team) & (season_games['DATE'] < date)
    away = season_games.loc[is_prior_away_game, away_cols].tail(n).copy()

    away.columns = [x.replace('_AWAY', '') for x in away.columns]
    away = away.drop(columns=['TEAM', 'LOSS', 'WINS_RECORD', 'LOSSES_RECORD', 'GAME_NO', 'WIN_PCT'])
    # The mean of the 0/1 WIN column is a win percentage, so name it accordingly.
    away = away.rename(columns={'WIN': 'WIN_PCT'})
    away.columns = [f'{x}_LAST_{n}_GAMES_AWAY' for x in away.columns]

    # The window may hold fewer than n games; count what is really there so
    # wins and losses are whole games rather than being scaled up to n.
    games_played = len(away)
    averages = away.mean()
    averages[f'WINS_LAST_{n}_GAMES_AWAY'] = averages[f'WIN_PCT_LAST_{n}_GAMES_AWAY'] * games_played
    # Key spelling ('LST') is kept as-is because callers may index by it.
    averages[f'LOSSES_LST_{n}_GAMES_AWAY'] = games_played - averages[f'WINS_LAST_{n}_GAMES_AWAY']
    return averages

In [231]:
# NOTE(review): '17-12-19' is an ambiguous date string (day/month/year order is left to pandas' parser).
# Confirm the intended date and prefer an ISO 'YYYY-MM-DD' string.
get_avg_away_stats_last_n_games(2019,'OKC','17-12-19',10)

FG_LAST_10_GAMES_AWAY                   42.1000
FGA_LAST_10_GAMES_AWAY                  93.4000
FG_PCT_LAST_10_GAMES_AWAY                0.4513
FG(3)_LAST_10_GAMES_AWAY                13.4000
FGA(3)_LAST_10_GAMES_AWAY               36.5000
FG_PCT(3)_LAST_10_GAMES_AWAY             0.3599
FT_LAST_10_GAMES_AWAY                   16.7000
FTA_LAST_10_GAMES_AWAY                  23.3000
FT_PCT_LAST_10_GAMES_AWAY                0.7213
ORB_LAST_10_GAMES_AWAY                  11.7000
DRB_LAST_10_GAMES_AWAY                  35.4000
TRB_LAST_10_GAMES_AWAY                  47.1000
AST_LAST_10_GAMES_AWAY                  22.7000
STL_LAST_10_GAMES_AWAY                   7.0000
BLK_LAST_10_GAMES_AWAY                   4.9000
TOV_LAST_10_GAMES_AWAY                  13.1000
PF_LAST_10_GAMES_AWAY                   25.5000
PTS_LAST_10_GAMES_AWAY                 114.3000
TS_PCT_LAST_10_GAMES_AWAY                0.5518
EFG_PCT_LAST_10_GAMES_AWAY               0.5226
3PA_R_LAST_10_GAMES_AWAY                

In [232]:
# NOTE(review): '17-12-19' is an ambiguous date string (day/month/year order is left to pandas' parser).
# Confirm the intended date and prefer an ISO 'YYYY-MM-DD' string.
get_avg_home_stats_last_n_games(2019,'OKC','17-12-19',10)

FG_LAST_10_GAMES_HOME                   41.0000
FGA_LAST_10_GAMES_HOME                  95.9000
FG_PCT_LAST_10_GAMES_HOME                0.4280
FG(3)_LAST_10_GAMES_HOME                12.0000
FGA(3)_LAST_10_GAMES_HOME               36.9000
FG_PCT(3)_LAST_10_GAMES_HOME             0.3294
FT_LAST_10_GAMES_HOME                   14.6000
FTA_LAST_10_GAMES_HOME                  21.7000
FT_PCT_LAST_10_GAMES_HOME                0.6778
ORB_LAST_10_GAMES_HOME                  15.3000
DRB_LAST_10_GAMES_HOME                  34.0000
TRB_LAST_10_GAMES_HOME                  49.3000
AST_LAST_10_GAMES_HOME                  23.2000
STL_LAST_10_GAMES_HOME                   7.1000
BLK_LAST_10_GAMES_HOME                   5.2000
TOV_LAST_10_GAMES_HOME                  13.1000
PF_LAST_10_GAMES_HOME                   18.6000
PTS_LAST_10_GAMES_HOME                 108.6000
TS_PCT_LAST_10_GAMES_HOME                0.5155
EFG_PCT_LAST_10_GAMES_HOME               0.4904
3PA_R_LAST_10_GAMES_HOME                

In [233]:
# NOTE(review): '17-12-19' is an ambiguous date string (day/month/year order is left to pandas' parser).
# Confirm the intended date and prefer an ISO 'YYYY-MM-DD' string.
get_avg_stats_last_n_games(2019,'OKC','17-12-19',10)

FG_LAST_10_GAMES                   43.3000
FGA_LAST_10_GAMES                  95.7000
FG_PCT_LAST_10_GAMES                0.4530
FG(3)_LAST_10_GAMES                14.2000
FGA(3)_LAST_10_GAMES               38.7000
FG_PCT(3)_LAST_10_GAMES             0.3648
FT_LAST_10_GAMES                   13.9000
FTA_LAST_10_GAMES                  19.2000
FT_PCT_LAST_10_GAMES                0.7197
ORB_LAST_10_GAMES                  13.8000
DRB_LAST_10_GAMES                  33.4000
TRB_LAST_10_GAMES                  47.2000
AST_LAST_10_GAMES                  26.4000
STL_LAST_10_GAMES                   7.8000
BLK_LAST_10_GAMES                   4.8000
TOV_LAST_10_GAMES                  11.5000
PF_LAST_10_GAMES                   20.5000
PTS_LAST_10_GAMES                 114.7000
TS_PCT_LAST_10_GAMES                0.5509
EFG_PCT_LAST_10_GAMES               0.5269
3PA_R_LAST_10_GAMES                 0.4033
FT_R_LAST_10_GAMES                  0.2009
ORB_PCT_LAST_10_GAMES              28.1900
DRB_PCT_LAS

In [234]:
df['DATE'][:-1]

0      2013-10-29
1      2013-10-29
2      2013-10-29
3      2013-10-30
4      2013-10-30
          ...    
7374   2019-04-10
7375   2019-04-10
7376   2019-04-10
7377   2019-04-10
7378   2019-04-10
Name: DATE, Length: 7379, dtype: datetime64[ns]

In [235]:
df.columns

Index(['TEAM_HOME', 'TEAM_AWAY', 'FG_HOME', 'FG_AWAY', 'FGA_HOME', 'FGA_AWAY',
       'FG_PCT_HOME', 'FG_PCT_AWAY', 'FG(3)_HOME', 'FG(3)_AWAY', 'FGA(3)_HOME',
       'FGA(3)_AWAY', 'FG_PCT(3)_HOME', 'FG_PCT(3)_AWAY', 'FT_HOME', 'FT_AWAY',
       'FTA_HOME', 'FTA_AWAY', 'FT_PCT_HOME', 'FT_PCT_AWAY', 'ORB_HOME',
       'ORB_AWAY', 'DRB_HOME', 'DRB_AWAY', 'TRB_HOME', 'TRB_AWAY', 'AST_HOME',
       'AST_AWAY', 'STL_HOME', 'STL_AWAY', 'BLK_HOME', 'BLK_AWAY', 'TOV_HOME',
       'TOV_AWAY', 'PF_HOME', 'PF_AWAY', 'PTS_HOME', 'PTS_AWAY', 'TS_PCT_HOME',
       'TS_PCT_AWAY', 'EFG_PCT_HOME', 'EFG_PCT_AWAY', '3PA_R_HOME',
       '3PA_R_AWAY', 'FT_R_HOME', 'FT_R_AWAY', 'ORB_PCT_HOME', 'ORB_PCT_AWAY',
       'DRB_PCT_HOME', 'DRB_PCT_AWAY', 'TRB_PCT_HOME', 'TRB_PCT_AWAY',
       'AST_PCT_HOME', 'AST_PCT_AWAY', 'STL_PCT_HOME', 'STL_PCT_AWAY',
       'BLK_PCT_HOME', 'BLK_PCT_AWAY', 'TOV_PCT_HOME', 'TOV_PCT_AWAY',
       'O_RTG_HOME', 'O_RTG_AWAY', 'D_RTG_HOME', 'D_RTG_AWAY',
       'WINS_RECORD_HOME', 

In [236]:
# Indiana's 2014 home games. The mask is built in one step and the result is
# copied, because a WINS_AT_HOME_HOME column is assigned to ind_home further
# down and that write should land on an independent frame, not a slice of df_14.
ind_home = df_14.loc[(df_14['SEASON'] == 2014) & (df_14['TEAM_HOME'] == 'IND')].copy()

In [237]:
# Home-game results in row order. Series.items() replaces Series.iteritems(),
# which no longer exists in pandas 2.x.
lst_ind = [result for _, result in ind_home['WIN_HOME'].items()]

In [238]:
list(ind_home['WIN_HOME']) == lst_ind

True

In [239]:
lst_ind

[1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0]

In [240]:
# Row labels of Indiana's home games, in the same order as lst_ind.
# Series.items() replaces Series.iteritems(), which no longer exists in pandas 2.x.
index_ind = [label for label, _ in ind_home['WIN_HOME'].items()]

In [241]:
# Map each home game's row label to the number of home wins recorded in the
# rows before it. Iterating over the data itself (instead of a fixed 41 games)
# keeps this working for any number of home games, and the running counter
# avoids re-summing the whole prefix at every step.
home_wins = {}
wins_so_far = 0
for index, won in zip(index_ind, lst_ind):
    home_wins[index] = wins_so_far
    wins_so_far += won

In [242]:
pd.Series(home_wins)

0        0
32       1
60       2
73       3
98       4
124      5
187      5
202      5
234      6
309      7
327      8
354      9
386     10
402     10
435     10
459     10
487     11
508     12
529     13
561     14
577     14
591     14
683     14
694     15
707     16
736     17
761     18
774     18
789     18
844     18
860     18
882     18
896     19
947     19
994     20
1019    21
1062    22
1099    23
1111    24
1148    24
1196    25
dtype: int64

In [243]:
# Attach the running win totals positionally; home_wins was filled in the same
# order as ind_home's rows.
ind_home['WINS_AT_HOME_HOME'] = list(home_wins.values())

In [244]:
ind_home[['WINS_AT_HOME_HOME']]

Unnamed: 0,WINS_AT_HOME_HOME
0,0
32,1
60,2
73,3
98,4
124,5
187,5
202,5
234,6
309,7


In [245]:
# Distinct home-team codes, in value_counts() order. Reading the index directly
# avoids Series.iteritems(), which no longer exists in pandas 2.x.
teams = df['TEAM_HOME'].value_counts().index.tolist()

In [246]:
len(teams)

30

In [247]:
# Distinct season values, in value_counts() order. Reading the index directly
# avoids Series.iteritems(), which no longer exists in pandas 2.x.
seasons = df['SEASON'].value_counts().index.tolist()

In [248]:
seasons

[2014, 2015, 2016, 2017, 2018, 2019]

In [249]:
# Running home/away records, keyed by df row label. For each game:
#   home_wins / home_losses -> home wins/losses the home team had in earlier
#                              rows of the same season
#   away_wins / away_losses -> away wins/losses the away team had in earlier
#                              rows of the same season
# "Earlier" means earlier in df row order — presumably chronological; confirm.
home_wins = {}
home_losses = {}
away_wins = {}
away_losses = {}
for season in seasons:
    season_df = df.loc[df['SEASON'] == season]
    for team in teams:
        # Masks are built from season_df itself so they match the frame being filtered.
        home = season_df.loc[season_df['TEAM_HOME'] == team]
        away = season_df.loc[season_df['TEAM_AWAY'] == team]
        # cumsum() includes the current game; subtracting the game's own result
        # leaves the total from strictly earlier rows. This works for any number
        # of games per team (no fixed 41) and avoids re-summing each prefix.
        home_wins.update((home['WIN_HOME'].cumsum() - home['WIN_HOME']).to_dict())
        home_losses.update((home['LOSS_HOME'].cumsum() - home['LOSS_HOME']).to_dict())
        away_wins.update((away['WIN_AWAY'].cumsum() - away['WIN_AWAY']).to_dict())
        away_losses.update((away['LOSS_AWAY'].cumsum() - away['LOSS_AWAY']).to_dict())

In [250]:
pd.Series(home_wins).sort_index()

0        0
1        0
2        0
3        0
4        0
        ..
7375    22
7376    24
7377    20
7378    25
7379    25
Length: 7380, dtype: int64

In [251]:
# Attach each running-record mapping to df as a column, matched on row label.
record_columns = {
    'H_TEAM_WINS_AT_HOME': home_wins,
    'H_TEAM_LOSSES_AT_HOME': home_losses,
    'A_TEAM_WINS_AT_AWAY': away_wins,
    'A_TEAM_LOSSES_AT_AWAY': away_losses,
}
for column_name, running_totals in record_columns.items():
    df[column_name] = pd.Series(running_totals).sort_index()
df

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,FG(2)_MISSED_HOME,FG(2)_MISSED_AWAY,GAMEID,DATE,SEASON,OTs,H_TEAM_WINS_AT_HOME,H_TEAM_LOSSES_AT_HOME,A_TEAM_WINS_AT_AWAY,A_TEAM_LOSSES_AT_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,27,47,201310290IND,2013-10-29,2014,0,0,0,0,0
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,26,29,201310290MIA,2013-10-29,2014,0,0,0,0,0
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,36,29,201310290LAL,2013-10-29,2014,0,0,0,0,0
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,39,34,201310300CLE,2013-10-30,2014,0,0,0,0,0
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,36,24,201310300TOR,2013-10-30,2014,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,PHI,CHI,52,45,93,95,0.559,0.474,12,9,...,27,35,201904100PHI,2019-04-10,2019,0,22,18,15,25
7376,SAS,DAL,41,37,88,91,0.466,0.407,8,11,...,31,28,201904100SAS,2019-04-10,2019,0,24,16,9,31
7377,DEN,MIN,39,39,87,91,0.448,0.429,10,13,...,25,33,201904100DEN,2019-04-10,2019,0,20,20,11,29
7378,LAC,UTA,54,47,106,106,0.509,0.443,12,14,...,36,38,201904100LAC,2019-04-10,2019,1,25,15,18,22


In [252]:
# Games already played at home / on the road, and the matching win rates.
# A team with no prior games gets 0 / 0, which yields NaN here.
home_games_played = df['H_TEAM_WINS_AT_HOME'] + df['H_TEAM_LOSSES_AT_HOME']
away_games_played = df['A_TEAM_WINS_AT_AWAY'] + df['A_TEAM_LOSSES_AT_AWAY']
df['HOME_GAMES_TEAM_HOME'] = home_games_played
df['AWAY_GAMES_TEAM_AWAY'] = away_games_played
df['HOME_GAME_WIN_RATE_HOME'] = df['H_TEAM_WINS_AT_HOME'].div(home_games_played)
df['AWAY_GAME_WIN_RATE_AWAY'] = df['A_TEAM_WINS_AT_AWAY'].div(away_games_played)

In [253]:
list(df.isna().sum())

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 180,
 180]

In [254]:
df

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,SEASON,OTs,H_TEAM_WINS_AT_HOME,H_TEAM_LOSSES_AT_HOME,A_TEAM_WINS_AT_AWAY,A_TEAM_LOSSES_AT_AWAY,HOME_GAMES_TEAM_HOME,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,2014,0,0,0,0,0,0,0,,
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,2014,0,0,0,0,0,0,0,,
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,2014,0,0,0,0,0,0,0,,
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,2014,0,0,0,0,0,0,0,,
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,2014,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,PHI,CHI,52,45,93,95,0.559,0.474,12,9,...,2019,0,22,18,15,25,40,40,0.550,0.375
7376,SAS,DAL,41,37,88,91,0.466,0.407,8,11,...,2019,0,24,16,9,31,40,40,0.600,0.225
7377,DEN,MIN,39,39,87,91,0.448,0.429,10,13,...,2019,0,20,20,11,29,40,40,0.500,0.275
7378,LAC,UTA,54,47,106,106,0.509,0.443,12,14,...,2019,1,25,15,18,22,40,40,0.625,0.450


In [255]:
# Row labels where the home win rate is missing.
h_index = df.index[df['HOME_GAME_WIN_RATE_HOME'].isnull()]

In [256]:
# Row labels where the away win rate is missing.
a_index = df.index[df['AWAY_GAME_WIN_RATE_AWAY'].isnull()]

In [257]:
# Pull the rows with a missing away / home win rate so their game counts can be inspected.
a_nulls = df.loc[a_index]
h_nulls = df.loc[h_index]

In [258]:
(h_nulls['HOME_GAMES_TEAM_HOME'] != 0).sum(), (a_nulls['AWAY_GAMES_TEAM_AWAY'] != 0).sum()

(0, 0)

In [259]:
# The null counts above show NaN only in the two win-rate columns, and only
# where the team had played 0 games (0 / 0). Fill just those columns so a
# missing value in any other column is not silently turned into 0.
rate_cols = ['HOME_GAME_WIN_RATE_HOME', 'AWAY_GAME_WIN_RATE_AWAY']
df[rate_cols] = df[rate_cols].fillna(0)

In [260]:
list(df.isna().sum())

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [261]:
df

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,SEASON,OTs,H_TEAM_WINS_AT_HOME,H_TEAM_LOSSES_AT_HOME,A_TEAM_WINS_AT_AWAY,A_TEAM_LOSSES_AT_AWAY,HOME_GAMES_TEAM_HOME,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,2014,0,0,0,0,0,0,0,0.000,0.000
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,2014,0,0,0,0,0,0,0,0.000,0.000
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,2014,0,0,0,0,0,0,0,0.000,0.000
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,2014,0,0,0,0,0,0,0,0.000,0.000
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,2014,0,0,0,0,0,0,0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,PHI,CHI,52,45,93,95,0.559,0.474,12,9,...,2019,0,22,18,15,25,40,40,0.550,0.375
7376,SAS,DAL,41,37,88,91,0.466,0.407,8,11,...,2019,0,24,16,9,31,40,40,0.600,0.225
7377,DEN,MIN,39,39,87,91,0.448,0.429,10,13,...,2019,0,20,20,11,29,40,40,0.500,0.275
7378,LAC,UTA,54,47,106,106,0.509,0.443,12,14,...,2019,1,25,15,18,22,40,40,0.625,0.450


In [262]:
# Persist the frame with the new home/away record columns.
# NOTE(review): to_csv writes the row index by default, so reloading this file yields an extra
# unnamed index column. Pass index=False only after checking what downstream readers expect.
df.to_csv('./cleaned_data/team_data/full_data_home_away_record_cols.csv')

#### CHANGE GET AVG STATS FUNCTION
- PREVIOUS RESULT (BOTH AWAY AND HOME)
- CURRENT RECORD
- HOME/AWAY RECORD LAST N GAMES
- WIN PCT LAST N GAMES
- TOTAL AWAY/HOME WIN/LOSSES TOTALS

In [263]:
df.columns

Index(['TEAM_HOME', 'TEAM_AWAY', 'FG_HOME', 'FG_AWAY', 'FGA_HOME', 'FGA_AWAY',
       'FG_PCT_HOME', 'FG_PCT_AWAY', 'FG(3)_HOME', 'FG(3)_AWAY',
       ...
       'SEASON', 'OTs', 'H_TEAM_WINS_AT_HOME', 'H_TEAM_LOSSES_AT_HOME',
       'A_TEAM_WINS_AT_AWAY', 'A_TEAM_LOSSES_AT_AWAY', 'HOME_GAMES_TEAM_HOME',
       'AWAY_GAMES_TEAM_AWAY', 'HOME_GAME_WIN_RATE_HOME',
       'AWAY_GAME_WIN_RATE_AWAY'],
      dtype='object', length=102)

In [264]:
# The eight record columns this notebook added to df (running wins/losses, games played, win rates).
added_cols = ['H_TEAM_WINS_AT_HOME', 'H_TEAM_LOSSES_AT_HOME',
       'A_TEAM_WINS_AT_AWAY', 'A_TEAM_LOSSES_AT_AWAY', 'HOME_GAMES_TEAM_HOME',
       'AWAY_GAMES_TEAM_AWAY', 'HOME_GAME_WIN_RATE_HOME',
       'AWAY_GAME_WIN_RATE_AWAY']

In [265]:
df[added_cols]

Unnamed: 0,H_TEAM_WINS_AT_HOME,H_TEAM_LOSSES_AT_HOME,A_TEAM_WINS_AT_AWAY,A_TEAM_LOSSES_AT_AWAY,HOME_GAMES_TEAM_HOME,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY
0,0,0,0,0,0,0,0.000,0.000
1,0,0,0,0,0,0,0.000,0.000
2,0,0,0,0,0,0,0.000,0.000
3,0,0,0,0,0,0,0.000,0.000
4,0,0,0,0,0,0,0.000,0.000
...,...,...,...,...,...,...,...,...
7375,22,18,15,25,40,40,0.550,0.375
7376,24,16,9,31,40,40,0.600,0.225
7377,20,20,11,29,40,40,0.500,0.275
7378,25,15,18,22,40,40,0.625,0.450


In [266]:
df

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,SEASON,OTs,H_TEAM_WINS_AT_HOME,H_TEAM_LOSSES_AT_HOME,A_TEAM_WINS_AT_AWAY,A_TEAM_LOSSES_AT_AWAY,HOME_GAMES_TEAM_HOME,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,2014,0,0,0,0,0,0,0,0.000,0.000
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,2014,0,0,0,0,0,0,0,0.000,0.000
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,2014,0,0,0,0,0,0,0,0.000,0.000
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,2014,0,0,0,0,0,0,0,0.000,0.000
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,2014,0,0,0,0,0,0,0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,PHI,CHI,52,45,93,95,0.559,0.474,12,9,...,2019,0,22,18,15,25,40,40,0.550,0.375
7376,SAS,DAL,41,37,88,91,0.466,0.407,8,11,...,2019,0,24,16,9,31,40,40,0.600,0.225
7377,DEN,MIN,39,39,87,91,0.448,0.429,10,13,...,2019,0,20,20,11,29,40,40,0.500,0.275
7378,LAC,UTA,54,47,106,106,0.509,0.443,12,14,...,2019,1,25,15,18,22,40,40,0.625,0.450


In [267]:
df['WIN_HOME']

0       1
1       0
2       1
3       1
4       1
       ..
7375    1
7376    0
7377    1
7378    1
7379    1
Name: WIN_HOME, Length: 7380, dtype: int64

In [268]:
def get_avg_stats_last_n_games(year,team,date,n):
    """Summarise a team's form over its most recent games of one season.

    Reads the notebook-level ``df`` (one row per game with ``*_HOME`` / ``*_AWAY``
    column pairs). Rows are assumed to be in chronological order, because the
    window is taken with ``tail(n)``.

    Parameters
    ----------
    year : value matched against ``df['SEASON']``; only that season is used so
        no previous-season data leaks in.
    team : team code matched against ``TEAM_HOME`` / ``TEAM_AWAY``.
    date : cutoff compared with ``df['DATE']``; only games strictly before it count.
    n : maximum number of most recent games in the window.

    Returns
    -------
    pandas.Series
        Mean of every per-game stat over the window (labels suffixed with
        ``_LAST_{n}_GAMES``), win/loss counts for the window, home/away splits,
        the running ``CURRENT_TOTAL_*`` records and ``TEAM``.
        The away-specific running records are NaN when the window holds no away game.

    Raises
    ------
    IndexError
        If the team has no game in the window, or no *home* game in the window.
        Callers in this notebook catch IndexError to skip such rows, so that
        contract is kept.
    """
    home_cols = [col for col in df.columns if '_HOME' in col]
    away_cols = [col for col in df.columns if '_AWAY' in col]

    # One mask built on df itself (season, team involvement, strictly earlier date)
    # instead of chaining .loc with a mask computed on a different frame.
    in_scope = (
        (df['SEASON'] == year)
        & ((df['TEAM_HOME'] == team) | (df['TEAM_AWAY'] == team))
        & (df['DATE'] < date)
    )
    window = df.loc[in_scope].tail(n)  # the team's last n games before the cutoff

    # Keep only the team's own side of each game (drops the opponent's columns).
    home = window.loc[window['TEAM_HOME'] == team, home_cols].copy()
    away = window.loc[window['TEAM_AWAY'] == team, away_cols].copy()

    home_wins = home['WIN_HOME'].sum()
    away_wins = away['WIN_AWAY'].sum()
    home_games = home_wins + home['LOSS_HOME'].sum()
    away_games = away_wins + away['LOSS_AWAY'].sum()
    home_win_pct = home['WIN_HOME'].mean()  # NaN when there is no home game
    away_win_pct = away['WIN_AWAY'].mean()  # NaN when there is no away game

    # Strip the side suffix so both halves share column names before stacking.
    home.columns = [x.replace('_HOME', '') for x in home.columns]
    away.columns = [x.replace('_AWAY', '') for x in away.columns]

    # sort_index restores game order; a plain concat puts every away game after
    # every home game, so "the last row" would not be the most recent game.
    total = pd.concat([home, away]).sort_index()
    if total.empty:
        raise IndexError(f'{team} has no games before {date} in season {year}')

    latest_game = total.iloc[-1]
    wins_amount = latest_game['WINS_RECORD']      # overall record at the most recent game
    losses_amount = latest_game['LOSSES_RECORD']

    if home.empty:
        raise IndexError(f'{team} has no home game in its last {n} games before {date}')
    home_wins_amount = home['H_TEAM_WINS_AT'].iloc[-1]     # home record at the last home game
    home_losses_amount = home['H_TEAM_LOSSES_AT'].iloc[-1]

    if away.empty:
        # No away game in the window: the away record is left missing.
        away_wins_amount = away_losses_amount = float('nan')
    else:
        away_wins_amount = away['A_TEAM_WINS_AT'].iloc[-1]     # away record at the last away game
        away_losses_amount = away['A_TEAM_LOSSES_AT'].iloc[-1]

    games_in_window = len(total)  # can be smaller than n early in a season

    # Identifiers and running records must not be averaged.
    total = total.drop(columns=['TEAM', 'LOSS', 'WINS_RECORD', 'LOSSES_RECORD', 'GAME_NO', 'WIN_PCT',
                                'H_TEAM_WINS_AT', 'H_TEAM_LOSSES_AT', 'HOME_GAMES_TEAM', 'HOME_GAME_WIN_RATE',
                                'A_TEAM_WINS_AT', 'A_TEAM_LOSSES_AT', 'AWAY_GAMES_TEAM', 'AWAY_GAME_WIN_RATE'])
    total = total.rename(columns={'WIN': 'WIN_PCT'})  # the mean of the 0/1 WIN flag is the win percentage
    total.columns = [x+f'_LAST_{n}_GAMES' for x in total.columns]
    total = total.mean()

    # Counts use the games actually in the window, not n, so a team with fewer
    # than n prior games is not credited with games it never played.
    total[f'WINS_LAST_{n}_GAMES'] = float(home_wins + away_wins)
    total[f'LOSSES_LAST_{n}_GAMES'] = games_in_window - total[f'WINS_LAST_{n}_GAMES']
    total[f'HOME_WINS_LAST_{n}_GAMES'] = home_wins
    total[f'AWAY_WINS_LAST_{n}_GAMES'] = away_wins
    total[f'HOME_GAMES_IN_LAST_{n}_GAMES'] = home_games
    total[f'AWAY_GAMES_IN_LAST_{n}_GAMES'] = away_games
    total[f'HOME_WIN_PCT_IN_LAST_{n}_GAMES'] = home_win_pct
    total[f'AWAY_WIN_PCT_IN_LAST_{n}_GAMES'] = away_win_pct
    total['CURRENT_TOTAL_WINS'] = wins_amount
    total['CURRENT_TOTAL_LOSSES'] = losses_amount
    total['CURRENT_TOTAL_AWAY_WINS'] = away_wins_amount
    total['CURRENT_TOTAL_AWAY_LOSSES'] = away_losses_amount
    total['CURRENT_TOTAL_HOME_WINS'] = home_wins_amount
    total['CURRENT_TOTAL_HOME_LOSSES'] = home_losses_amount
    total['TEAM'] = team
    return total

In [269]:
# Spot-check the helper on one team: turn the returned Series into a one-row
# frame, then list the away-related columns it produced.
okc_stats = get_avg_stats_last_n_games(2019, 'OKC', '17-12-19', 10)
okc_test = okc_stats.to_frame().T
[column for column in okc_test.columns if 'AWAY' in column]

['AWAY_WINS_LAST_10_GAMES',
 'AWAY_GAMES_IN_LAST_10_GAMES',
 'AWAY_WIN_PCT_IN_LAST_10_GAMES',
 'CURRENT_TOTAL_AWAY_WINS',
 'CURRENT_TOTAL_AWAY_LOSSES']

In [270]:
okc_test

Unnamed: 0,FG_LAST_10_GAMES,FGA_LAST_10_GAMES,FG_PCT_LAST_10_GAMES,FG(3)_LAST_10_GAMES,FGA(3)_LAST_10_GAMES,FG_PCT(3)_LAST_10_GAMES,FT_LAST_10_GAMES,FTA_LAST_10_GAMES,FT_PCT_LAST_10_GAMES,ORB_LAST_10_GAMES,...,AWAY_GAMES_IN_LAST_10_GAMES,HOME_WIN_PCT_IN_LAST_10_GAMES,AWAY_WIN_PCT_IN_LAST_10_GAMES,CURRENT_TOTAL_WINS,CURRENT_TOTAL_LOSSES,CURRENT_TOTAL_AWAY_WINS,CURRENT_TOTAL_AWAY_LOSSES,CURRENT_TOTAL_HOME_WINS,CURRENT_TOTAL_HOME_LOSSES,TEAM
0,43.3,95.7,0.453,14.2,38.7,0.3648,13.9,19.2,0.7197,13.8,...,4.0,0.5,0.75,49.0,33.0,22.0,18.0,25.0,15.0,OKC


In [271]:
get_avg_stats_last_n_games(2019,'OKC','17-12-19',10).index

Index(['FG_LAST_10_GAMES', 'FGA_LAST_10_GAMES', 'FG_PCT_LAST_10_GAMES',
       'FG(3)_LAST_10_GAMES', 'FGA(3)_LAST_10_GAMES',
       'FG_PCT(3)_LAST_10_GAMES', 'FT_LAST_10_GAMES', 'FTA_LAST_10_GAMES',
       'FT_PCT_LAST_10_GAMES', 'ORB_LAST_10_GAMES', 'DRB_LAST_10_GAMES',
       'TRB_LAST_10_GAMES', 'AST_LAST_10_GAMES', 'STL_LAST_10_GAMES',
       'BLK_LAST_10_GAMES', 'TOV_LAST_10_GAMES', 'PF_LAST_10_GAMES',
       'PTS_LAST_10_GAMES', 'TS_PCT_LAST_10_GAMES', 'EFG_PCT_LAST_10_GAMES',
       '3PA_R_LAST_10_GAMES', 'FT_R_LAST_10_GAMES', 'ORB_PCT_LAST_10_GAMES',
       'DRB_PCT_LAST_10_GAMES', 'TRB_PCT_LAST_10_GAMES',
       'AST_PCT_LAST_10_GAMES', 'STL_PCT_LAST_10_GAMES',
       'BLK_PCT_LAST_10_GAMES', 'TOV_PCT_LAST_10_GAMES', 'O_RTG_LAST_10_GAMES',
       'D_RTG_LAST_10_GAMES', 'WIN_PCT_LAST_10_GAMES',
       'FG(3)_MISSED_LAST_10_GAMES', 'FG_MISSED_LAST_10_GAMES',
       'BLOCKED_ATTEMPTS_LAST_10_GAMES', 'TOV_FORCED_LAST_10_GAMES',
       'FG(2)_LAST_10_GAMES', 'FGA(2)_LAST_10_GAMES

In [272]:
# Preview the labels with a '_HOME' suffix. Iterating the Series index directly
# avoids Series.iteritems(), which was removed in pandas 2.0; the values were unused.
[key+'_HOME' for key in get_avg_stats_last_n_games(2019,'OKC','17-12-19',10).index]

['FG_LAST_10_GAMES_HOME',
 'FGA_LAST_10_GAMES_HOME',
 'FG_PCT_LAST_10_GAMES_HOME',
 'FG(3)_LAST_10_GAMES_HOME',
 'FGA(3)_LAST_10_GAMES_HOME',
 'FG_PCT(3)_LAST_10_GAMES_HOME',
 'FT_LAST_10_GAMES_HOME',
 'FTA_LAST_10_GAMES_HOME',
 'FT_PCT_LAST_10_GAMES_HOME',
 'ORB_LAST_10_GAMES_HOME',
 'DRB_LAST_10_GAMES_HOME',
 'TRB_LAST_10_GAMES_HOME',
 'AST_LAST_10_GAMES_HOME',
 'STL_LAST_10_GAMES_HOME',
 'BLK_LAST_10_GAMES_HOME',
 'TOV_LAST_10_GAMES_HOME',
 'PF_LAST_10_GAMES_HOME',
 'PTS_LAST_10_GAMES_HOME',
 'TS_PCT_LAST_10_GAMES_HOME',
 'EFG_PCT_LAST_10_GAMES_HOME',
 '3PA_R_LAST_10_GAMES_HOME',
 'FT_R_LAST_10_GAMES_HOME',
 'ORB_PCT_LAST_10_GAMES_HOME',
 'DRB_PCT_LAST_10_GAMES_HOME',
 'TRB_PCT_LAST_10_GAMES_HOME',
 'AST_PCT_LAST_10_GAMES_HOME',
 'STL_PCT_LAST_10_GAMES_HOME',
 'BLK_PCT_LAST_10_GAMES_HOME',
 'TOV_PCT_LAST_10_GAMES_HOME',
 'O_RTG_LAST_10_GAMES_HOME',
 'D_RTG_LAST_10_GAMES_HOME',
 'WIN_PCT_LAST_10_GAMES_HOME',
 'FG(3)_MISSED_LAST_10_GAMES_HOME',
 'FG_MISSED_LAST_10_GAMES_HOME',
 'BLOCK

In [273]:
# Build one model row per game from each side's 5-game form.
# Games for which the helper raises IndexError are skipped.
inshallah = []
for row_label, game in df.iterrows():
    try:
        # 5-game form of both teams going into this game
        home = get_avg_stats_last_n_games(game['SEASON'], game['TEAM_HOME'], game['DATE'], 5)
        away = get_avg_stats_last_n_games(game['SEASON'], game['TEAM_AWAY'], game['DATE'], 5)
        # The home row keeps no away-split features and vice versa: they are not
        # relevant to modelling and would be confusing.
        home = home.drop(labels=[label for label in home.index if 'AWAY' in label])
        away = away.drop(labels=[label for label in away.index if 'HOME' in label])
        # Tag every feature with its side so the two Series can be stacked.
        home.index = [f'{label}_HOME' for label in home.index]
        away.index = [f'{label}_AWAY' for label in away.index]
        combined = pd.concat([home, away])
        # Game-level metadata copied straight from the source row.
        for column in ('GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE', 'SEASON'):
            combined[column] = game[column]
        combined['HOME_WIN/LABEL'] = game['WIN_HOME']  # y variable (home win)
        # Series -> single-row DataFrame, ready for pd.concat later.
        inshallah.append(combined.to_frame().T)
    except IndexError:
        continue

In [274]:
lag5 = pd.concat(inshallah)
lag5['SEASON'].value_counts()

2015    1144
2019    1143
2018    1142
2014    1136
2016    1134
2017    1121
Name: SEASON, dtype: int64

In [275]:
lag5.columns

Index(['FG_LAST_5_GAMES_HOME', 'FGA_LAST_5_GAMES_HOME',
       'FG_PCT_LAST_5_GAMES_HOME', 'FG(3)_LAST_5_GAMES_HOME',
       'FGA(3)_LAST_5_GAMES_HOME', 'FG_PCT(3)_LAST_5_GAMES_HOME',
       'FT_LAST_5_GAMES_HOME', 'FTA_LAST_5_GAMES_HOME',
       'FT_PCT_LAST_5_GAMES_HOME', 'ORB_LAST_5_GAMES_HOME',
       ...
       'CURRENT_TOTAL_AWAY_WINS_AWAY', 'CURRENT_TOTAL_AWAY_LOSSES_AWAY',
       'TEAM_AWAY', 'GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE',
       'SEASON', 'HOME_WIN/LABEL'],
      dtype='object', length=105)

In [276]:
lag5.isna().sum().sort_values(ascending=False)

AWAY_WIN_PCT_IN_LAST_5_GAMES_AWAY    231
CURRENT_TOTAL_AWAY_LOSSES_AWAY       231
CURRENT_TOTAL_AWAY_WINS_AWAY         231
PF_LAST_5_GAMES_AWAY                   0
STL_PCT_LAST_5_GAMES_AWAY              0
                                    ... 
FG(3)_MISSED_LAST_5_GAMES_HOME         0
WIN_PCT_LAST_5_GAMES_HOME              0
D_RTG_LAST_5_GAMES_HOME                0
O_RTG_LAST_5_GAMES_HOME                0
HOME_WIN/LABEL                         0
Length: 105, dtype: int64

In [277]:
lag5.reset_index(inplace=True)

In [278]:
# Row labels where each away-specific feature is missing.
null_a_losses = lag5.index[lag5['CURRENT_TOTAL_AWAY_LOSSES_AWAY'].isnull()]
null_a_wins = lag5.index[lag5['CURRENT_TOTAL_AWAY_WINS_AWAY'].isnull()]
null_a_pct = lag5.index[lag5['AWAY_WIN_PCT_IN_LAST_5_GAMES_AWAY'].isnull()]

In [279]:
(null_a_losses == null_a_wins).sum()

231

In [280]:
(null_a_losses == null_a_pct).sum()

231

In [281]:
missing = lag5.loc[null_a_losses].loc[null_a_wins].loc[null_a_pct]
missing['AWAY_GAMES_IN_LAST_5_GAMES_AWAY'].value_counts()

0.0    231
Name: AWAY_GAMES_IN_LAST_5_GAMES_AWAY, dtype: int64

In [282]:
lag5.fillna(0,inplace=True)

In [283]:
lag5.isnull().sum().sort_values(ascending=False)

index                        0
O_RTG_LAST_5_GAMES_AWAY      0
BLK_PCT_LAST_5_GAMES_AWAY    0
STL_PCT_LAST_5_GAMES_AWAY    0
AST_PCT_LAST_5_GAMES_AWAY    0
                            ..
WIN_PCT_LAST_5_GAMES_HOME    0
D_RTG_LAST_5_GAMES_HOME      0
O_RTG_LAST_5_GAMES_HOME      0
TOV_PCT_LAST_5_GAMES_HOME    0
HOME_WIN/LABEL               0
Length: 106, dtype: int64

In [284]:
lag5.drop('index', axis = 1, inplace=True)

In [285]:
lag5.columns

Index(['FG_LAST_5_GAMES_HOME', 'FGA_LAST_5_GAMES_HOME',
       'FG_PCT_LAST_5_GAMES_HOME', 'FG(3)_LAST_5_GAMES_HOME',
       'FGA(3)_LAST_5_GAMES_HOME', 'FG_PCT(3)_LAST_5_GAMES_HOME',
       'FT_LAST_5_GAMES_HOME', 'FTA_LAST_5_GAMES_HOME',
       'FT_PCT_LAST_5_GAMES_HOME', 'ORB_LAST_5_GAMES_HOME',
       ...
       'CURRENT_TOTAL_AWAY_WINS_AWAY', 'CURRENT_TOTAL_AWAY_LOSSES_AWAY',
       'TEAM_AWAY', 'GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE',
       'SEASON', 'HOME_WIN/LABEL'],
      dtype='object', length=105)

In [286]:
lag5.to_csv('./cleaned_data/data_to_model/data_last_5_games.csv')

In [287]:
# Same row construction as the 5-game version, with a 10-game window.
# Games for which the helper raises IndexError are skipped.
inshallah = []
for row_label, game in df.iterrows():
    try:
        # 10-game form of both teams going into this game
        home = get_avg_stats_last_n_games(game['SEASON'], game['TEAM_HOME'], game['DATE'], 10)
        away = get_avg_stats_last_n_games(game['SEASON'], game['TEAM_AWAY'], game['DATE'], 10)
        # Drop the opposite side's split features from each Series.
        home = home.drop(labels=[label for label in home.index if 'AWAY' in label])
        away = away.drop(labels=[label for label in away.index if 'HOME' in label])
        # Tag every feature with its side so the two Series can be stacked.
        home.index = [f'{label}_HOME' for label in home.index]
        away.index = [f'{label}_AWAY' for label in away.index]
        combined = pd.concat([home, away])
        # Game-level metadata copied straight from the source row.
        for column in ('GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE', 'SEASON'):
            combined[column] = game[column]
        combined['HOME_WIN/LABEL'] = game['WIN_HOME']  # y variable (home win)
        # Series -> single-row DataFrame, ready for pd.concat later.
        inshallah.append(combined.to_frame().T)
    except IndexError:
        continue

In [288]:
pd.concat(inshallah).isna().sum().sort_values(ascending=False)

AWAY_WIN_PCT_IN_LAST_10_GAMES_AWAY    47
CURRENT_TOTAL_AWAY_LOSSES_AWAY        47
CURRENT_TOTAL_AWAY_WINS_AWAY          47
PF_LAST_10_GAMES_AWAY                  0
STL_PCT_LAST_10_GAMES_AWAY             0
                                      ..
FG(3)_MISSED_LAST_10_GAMES_HOME        0
WIN_PCT_LAST_10_GAMES_HOME             0
D_RTG_LAST_10_GAMES_HOME               0
O_RTG_LAST_10_GAMES_HOME               0
HOME_WIN/LABEL                         0
Length: 105, dtype: int64

In [289]:
lag10 = pd.concat(inshallah)
# The only missing values are the away win percentage and the away win/loss
# totals, i.e. features that are undefined when there is no away game in the
# window. Fill them with 0, exactly as the 5-game dataset does; filling with 10
# would write an impossible win percentage and invented win/loss counts.
lag10.fillna(0,inplace=True)

In [290]:
lag10.to_csv('./cleaned_data/data_to_model/data_last_10_games.csv')

In [291]:
lag10['SEASON'].value_counts()

2015    1200
2014    1199
2019    1199
2017    1197
2016    1196
2018    1195
Name: SEASON, dtype: int64

In [292]:
lag5['SEASON'].value_counts()

2015    1144
2019    1143
2018    1142
2014    1136
2016    1134
2017    1121
Name: SEASON, dtype: int64

I believe a primary reason there are fewer rows in the 5-game-lag dataframe than in the 10-game-lag dataframe is the occurrence of five-game stretches with no home or away games, which can cause index errors (those rows are then skipped).

In this situation, I can either alter the function or use a larger sample size (i.e., 10) to counteract these extended stretches of home/away games.

In [293]:
len(df)

7380

In [294]:
len(lag5)

6820

In [295]:
len(lag10)

7186

## GET ELO FOR DATAFRAME


THE FOLLOWING FUNCTIONS WERE CREATED BY JOSH WEINER AND THIS PROJECT WAS INSPIRED BY HIS [BLOG-POST](https://towardsdatascience.com/predicting-the-outcome-of-nba-games-with-machine-learning-a810bb768f20)

In [296]:
# Reload the per-game data with the home/away record columns and rebuild the
# column groupings used by the helpers below.
df = pd.read_csv('./cleaned_data/team_data/full_data_home_away_record_cols.csv')
df = df.drop(columns='Unnamed: 0')  # index column written by an earlier to_csv
gen_cols = ['GAMEID','DATE','SEASON','OTs']
away_cols = [col for col in df.columns if '_AWAY' in col]
home_cols = [col for col in df.columns if '_HOME' in col]
df['DATE'] = pd.to_datetime(df['DATE'])

In [297]:
len(df)

7380

In [298]:
import math

In [299]:
#Home and road team win probabilities implied by Elo ratings and home court adjustment
def win_probs(home_elo, away_elo, home_court_advantage) :
    """Return ``(home_prob, away_prob)`` implied by the two Elo ratings.

    Each rating is mapped to ``10 ** (rating / 400)``; the home side's value is
    additionally multiplied by the same transform of ``home_court_advantage``.
    The two probabilities sum to 1.
    """
    away_strength = math.pow(10, away_elo/400)
    boosted_home = math.pow(10, home_court_advantage/400) * math.pow(10, home_elo/400)

    combined = away_strength + boosted_home
    return boosted_home / combined, away_strength / combined

    #odds the home team will win based on elo ratings and home court advantage

def home_odds_on(home_elo, away_elo, home_court_advantage) :
    """Return the home side's odds (home strength over away strength).

    Uses the same ``10 ** (x / 400)`` transform as ``win_probs`` and multiplies
    the home value by the home-court term before dividing by the away value.
    """
    home_strength, away_strength, court_boost = (
        math.pow(10, rating/400)
        for rating in (home_elo, away_elo, home_court_advantage)
    )
    return court_boost * home_strength / away_strength

    #this function determines the constant used in the elo rating, based on margin of victory and difference in elo ratings
def elo_k(MOV, elo_diff):
    """Return the K factor for an Elo update.

    ``MOV`` is the margin of victory and ``elo_diff`` the rating difference
    passed by the caller. When ``MOV`` is not positive, both values are negated
    before use. The base constant of 20 is scaled by
    ``(margin + 3) ** 0.8 / (7.5 + 0.006 * diff)``.
    """
    base_k = 20
    if MOV > 0:
        margin, diff = MOV, elo_diff
    else:
        margin, diff = -MOV, -elo_diff
    scale = (margin+3)**(0.8)/(7.5+0.006*(diff))
    return base_k*scale


    #updates the home and away teams elo ratings after a game 

def update_elo(home_score, away_score, home_elo, away_elo, home_court_advantage) :
    """Return ``(updated_home_elo, updated_away_elo)`` after one game.

    Each rating moves by ``k * (result - expected)``: ``expected`` comes from
    ``win_probs`` and ``k`` from ``elo_k`` (margin of victory and rating gap).
    The home side is credited with the win only if it scored strictly more points.
    """
    home_prob, away_prob = win_probs(home_elo, away_elo, home_court_advantage)

    margin = home_score - away_score
    home_win = 1 if margin > 0 else 0
    away_win = 1 - home_win

    k = elo_k(margin, home_elo - away_elo)

    return (
        home_elo + k * (home_win - home_prob),
        away_elo + k * (away_win - away_prob),
    )


    #takes into account prev season elo
def get_prev_elo(team, date, season, df, elo_df) :
    """Return the Elo rating ``team`` carries into a game played on ``date``.

    Parameters
    ----------
    team : team code matched against ``TEAM_HOME`` / ``TEAM_AWAY``.
    date : date of the upcoming game; only games strictly before it are considered.
    season : season of the upcoming game.
    df : game table with ``DATE``, ``TEAM_HOME``, ``TEAM_AWAY``, ``GAMEID``, ``SEASON``.
    elo_df : Elo table with ``GAMEID``, ``TEAM_ELO_AFTER_HOME``, ``TEAM_ELO_AFTER_AWAY``
        for every game already processed.

    Returns
    -------
    The post-game Elo of the team's most recent earlier game. If that game was
    in a different season, the rating is regressed: 75% of it plus 25% of 1505.

    Raises
    ------
    IndexError
        If the team has no earlier game in ``df``, or that game is missing from ``elo_df``.
    """
    # Filter on the ``date`` argument (not a notebook global), with one combined
    # mask so the boolean index always lines up with ``df``.
    involves_team = (df['TEAM_HOME'] == team) | (df['TEAM_AWAY'] == team)
    earlier_games = df[(df['DATE'] < date) & involves_team]
    prev_game = earlier_games.sort_values(by = 'DATE').tail(1).iloc[0]

    # Pick the rating column for the side the team played on in that game.
    side = 'HOME' if team == prev_game['TEAM_HOME'] else 'AWAY'
    elo_rating = elo_df.loc[elo_df['GAMEID'] == prev_game['GAMEID'], f'TEAM_ELO_AFTER_{side}'].values[0]

    if prev_game['SEASON'] != season :
        # New season: carry over 75% of the rating and regress 25% toward 1505.
        return (0.75 * elo_rating) + (0.25 * 1505)
    else :
        return elo_rating

In [300]:
# CREATING TWO DATAFRAMES DEDICATED TO ELO EDA
# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# (plus re-scanning df for every game) is quadratic. The rows of df must be in
# chronological order, which the previous per-row lookup also required.
HOME_COURT_ADVANTAGE = 92  # Elo points added to the home side in every call below

elo_records = []       # one dict per game        -> elo_df (ELO GAME BY GAME BASIS)
team_elo_records = []  # two dicts per game       -> teams_elo_df (ELO TEAM BY TEAM BASIS)
latest_elo = {}        # team -> (season, post-game Elo) of its most recent processed game


def _pregame_elo_with_carryover(latest, team, season):
    """Pre-game Elo: 1500 on first appearance, else the last post-game Elo,
    regressed 25% toward 1505 when the previous game was in another season."""
    if team not in latest:
        return 1500 # SET INITIAL ELO SCORE
    last_season, rating = latest[team]
    if last_season != season:
        return (0.75 * rating) + (0.25 * 1505)
    return rating


for index, row in df.iterrows():
    game_id = row['GAMEID']
    game_date = row['DATE']
    season = row['SEASON']
    h_team, a_team = row['TEAM_HOME'], row['TEAM_AWAY']
    h_score, a_score = row['PTS_HOME'], row['PTS_AWAY']
    h_ortg, a_ortg = row['O_RTG_HOME'], row['O_RTG_AWAY']
    h_drtg, a_drtg = row['D_RTG_HOME'], row['D_RTG_AWAY']
    h_no, a_no = row['GAME_NO_HOME'], row['GAME_NO_AWAY']

    h_team_elo_before = _pregame_elo_with_carryover(latest_elo, h_team, season) # GET PREVIOUS ELO
    a_team_elo_before = _pregame_elo_with_carryover(latest_elo, a_team, season) # GET PREVIOUS ELO

    h_team_elo_after, a_team_elo_after = update_elo(h_score, a_score, h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE) # UPDATING ELO
    odds_home_team = home_odds_on(h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE) # GETTING ODDS FOR HOME WIN
    probs_home_team, probs_away_team = win_probs(h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE) # GETTING HOME AND AWAY WIN PROBABILITIES OFF ELO

    # Remember each side's new rating for its next game.
    latest_elo[h_team] = (season, h_team_elo_after)
    latest_elo[a_team] = (season, a_team_elo_after)

    elo_records.append({'GAMEID': game_id, 'TEAM_HOME': h_team, 'TEAM_AWAY': a_team,
                        'TEAM_ELO_BEFORE_HOME': h_team_elo_before, 'TEAM_ELO_BEFORE_AWAY': a_team_elo_before,
                        'TEAM_ELO_AFTER_HOME': h_team_elo_after, 'TEAM_ELO_AFTER_AWAY': a_team_elo_after,
                        'ODDS_HOME': odds_home_team, 'PROBS_HOME': probs_home_team, 'PROBS_AWAY': probs_away_team})
    team_elo_records.append({'GAMEID': game_id, 'TEAM': h_team, 'ELO': h_team_elo_before, 'DATE': game_date,
                             'WHERE_PLAYED': 'Home', 'SEASON': season, 'SCORE': h_score, 'OPP_SCORE': a_score,
                             'O_RTG': h_ortg, 'D_RTG': h_drtg, 'GAME_NO': h_no})
    team_elo_records.append({'GAMEID': game_id, 'TEAM': a_team, 'ELO': a_team_elo_before, 'DATE': game_date,
                             'WHERE_PLAYED': 'Away', 'SEASON': season, 'SCORE': a_score, 'OPP_SCORE': h_score,
                             'O_RTG': a_ortg, 'D_RTG': a_drtg, 'GAME_NO': a_no})

elo_df = pd.DataFrame(elo_records, columns=['GAMEID', 'TEAM_HOME', 'TEAM_AWAY', 'TEAM_ELO_BEFORE_HOME', 'TEAM_ELO_BEFORE_AWAY',
                                            'TEAM_ELO_AFTER_HOME', 'TEAM_ELO_AFTER_AWAY', 'ODDS_HOME', 'PROBS_HOME', 'PROBS_AWAY'])
teams_elo_df = pd.DataFrame(team_elo_records, columns=['GAMEID', 'TEAM', 'ELO', 'DATE', 'WHERE_PLAYED', 'SEASON',
                                                       'SCORE', 'OPP_SCORE', 'O_RTG', 'D_RTG', 'GAME_NO'])

In [301]:
elo_df.to_csv('./cleaned_data/elo_dfs/elo_dataframe.csv')

In [302]:
teams_elo_df.to_csv('./cleaned_data/elo_dfs/teams_elo_dataframe.csv')

In [303]:
len(teams_elo_df)

14760

In [304]:
len(elo_df)

7380

In [305]:
elo_df.head()

Unnamed: 0,GAMEID,TEAM_HOME,TEAM_AWAY,TEAM_ELO_BEFORE_HOME,TEAM_ELO_BEFORE_AWAY,TEAM_ELO_AFTER_HOME,TEAM_ELO_AFTER_AWAY,ODDS_HOME,PROBS_HOME,PROBS_AWAY
0,201310290IND,IND,ORL,1500,1500,1507.692053,1492.307947,1.698244,0.629389,0.370611
1,201310290MIA,MIA,CHI,1500,1500,1508.625029,1491.374971,1.698244,0.629389,0.370611
2,201310290LAL,LAL,LAC,1500,1500,1509.082043,1490.917957,1.698244,0.629389,0.370611
3,201310300CLE,CLE,BRK,1500,1500,1504.687767,1495.312233,1.698244,0.629389,0.370611
4,201310300TOR,TOR,BOS,1500,1500,1505.731675,1494.268325,1.698244,0.629389,0.370611


In [306]:
# Copy the Elo features onto the game table. Assignment aligns on the row
# index, so df and elo_df must share it.
for elo_col in ('TEAM_ELO_BEFORE_HOME', 'TEAM_ELO_BEFORE_AWAY',
                'TEAM_ELO_AFTER_HOME', 'TEAM_ELO_AFTER_AWAY',
                'ODDS_HOME', 'PROBS_HOME', 'PROBS_AWAY'):
    df[elo_col] = elo_df[elo_col]

In [307]:
df.head()

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY,TEAM_ELO_BEFORE_HOME,TEAM_ELO_BEFORE_AWAY,TEAM_ELO_AFTER_HOME,TEAM_ELO_AFTER_AWAY,ODDS_HOME,PROBS_HOME,PROBS_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,0,0.0,0.0,1500,1500,1507.692053,1492.307947,1.698244,0.629389,0.370611
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,0,0.0,0.0,1500,1500,1508.625029,1491.374971,1.698244,0.629389,0.370611
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,0,0.0,0.0,1500,1500,1509.082043,1490.917957,1.698244,0.629389,0.370611
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,0,0.0,0.0,1500,1500,1504.687767,1495.312233,1.698244,0.629389,0.370611
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,0,0.0,0.0,1500,1500,1505.731675,1494.268325,1.698244,0.629389,0.370611


In [308]:
df.to_csv('./cleaned_data/team_data/full_data_with_elo_carryover.csv')

Now that I have the ELO ratings throughout the entire dataset, it's time to split the dataframe by season and attach win probabilities based on ELO to the dataframe. This should help the model significantly, because right now it has a cross-validation score of 0.58, which is 0.10 below where it should be.

Home-court advantage is set to 92 Elo points; the exact number was provided by FiveThirtyEight in this [article](https://fivethirtyeight.com/methodology/how-our-nba-predictions-work/).

In [309]:
# Start again from the record-columns file (without the carry-over Elo columns
# added above) before computing the per-season Elo variant.
df = pd.read_csv('./cleaned_data/team_data/full_data_home_away_record_cols.csv')
df = df.drop(columns='Unnamed: 0')  # index column written by an earlier to_csv
gen_cols = ['GAMEID','DATE','SEASON','OTs']
away_cols = [col for col in df.columns if '_AWAY' in col]
home_cols = [col for col in df.columns if '_HOME' in col]
df['DATE'] = pd.to_datetime(df['DATE'])

In [310]:
list(df.columns)

['TEAM_HOME',
 'TEAM_AWAY',
 'FG_HOME',
 'FG_AWAY',
 'FGA_HOME',
 'FGA_AWAY',
 'FG_PCT_HOME',
 'FG_PCT_AWAY',
 'FG(3)_HOME',
 'FG(3)_AWAY',
 'FGA(3)_HOME',
 'FGA(3)_AWAY',
 'FG_PCT(3)_HOME',
 'FG_PCT(3)_AWAY',
 'FT_HOME',
 'FT_AWAY',
 'FTA_HOME',
 'FTA_AWAY',
 'FT_PCT_HOME',
 'FT_PCT_AWAY',
 'ORB_HOME',
 'ORB_AWAY',
 'DRB_HOME',
 'DRB_AWAY',
 'TRB_HOME',
 'TRB_AWAY',
 'AST_HOME',
 'AST_AWAY',
 'STL_HOME',
 'STL_AWAY',
 'BLK_HOME',
 'BLK_AWAY',
 'TOV_HOME',
 'TOV_AWAY',
 'PF_HOME',
 'PF_AWAY',
 'PTS_HOME',
 'PTS_AWAY',
 'TS_PCT_HOME',
 'TS_PCT_AWAY',
 'EFG_PCT_HOME',
 'EFG_PCT_AWAY',
 '3PA_R_HOME',
 '3PA_R_AWAY',
 'FT_R_HOME',
 'FT_R_AWAY',
 'ORB_PCT_HOME',
 'ORB_PCT_AWAY',
 'DRB_PCT_HOME',
 'DRB_PCT_AWAY',
 'TRB_PCT_HOME',
 'TRB_PCT_AWAY',
 'AST_PCT_HOME',
 'AST_PCT_AWAY',
 'STL_PCT_HOME',
 'STL_PCT_AWAY',
 'BLK_PCT_HOME',
 'BLK_PCT_AWAY',
 'TOV_PCT_HOME',
 'TOV_PCT_AWAY',
 'O_RTG_HOME',
 'O_RTG_AWAY',
 'D_RTG_HOME',
 'D_RTG_AWAY',
 'WINS_RECORD_HOME',
 'WINS_RECORD_AWAY',
 'LOSSES_REC

In [311]:
seasons

[2014, 2015, 2016, 2017, 2018, 2019]

In [312]:
home_odds_on(1500, 1500, 92)

1.6982436524617444

In [313]:
win_probs(1500,1500,92)

(0.6293885472175023, 0.37061145278249774)

In [314]:
df['GAME_NO_HOME'].value_counts()

8.0     101
20.0    100
45.0     99
44.0     99
34.0     98
       ... 
74.0     80
7.0      80
28.0     80
50.0     79
19.0     77
Name: GAME_NO_HOME, Length: 82, dtype: int64

In [315]:
# Per-season Elo: every team restarts at 1500 in its first game of a season.
# Rows are collected in lists and converted once at the end: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row (plus re-scanning
# df for every game) is quadratic. The rows of df must be in chronological order.
HOME_COURT_ADVANTAGE = 92  # Elo points added to the home side in every call below

elo_records = []       # one dict per game  -> elo_df
team_elo_records = []  # two dicts per game -> teams_elo_df
latest_elo = {}        # team -> (season, post-game Elo) of its most recent processed game


def _elo_from_last_game(latest, team, season):
    """Post-game Elo of the team's previous game (regressed 25% toward 1505 if
    that game was in another season). Raises KeyError if no earlier game was seen."""
    last_season, rating = latest[team]
    if last_season != season:
        return (0.75 * rating) + (0.25 * 1505)
    return rating


for index, row in df.iterrows():
    game_id = row['GAMEID']
    game_date = row['DATE']
    season = row['SEASON']
    h_team, a_team = row['TEAM_HOME'], row['TEAM_AWAY']
    h_score, a_score = row['PTS_HOME'], row['PTS_AWAY']
    game_no_home, game_no_away = row['GAME_NO_HOME'], row['GAME_NO_AWAY']

    # ELO RESETS AT THE BEGINNING OF EACH SEASON (GAME NUMBER 1)
    if game_no_home == 1:
        h_team_elo_before = 1500
    else:
        h_team_elo_before = _elo_from_last_game(latest_elo, h_team, season)

    if game_no_away == 1:
        a_team_elo_before = 1500
    else:
        a_team_elo_before = _elo_from_last_game(latest_elo, a_team, season)

    h_team_elo_after, a_team_elo_after = update_elo(h_score, a_score, h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE)
    odds_home_team = home_odds_on(h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE)
    probs_home_team, probs_away_team = win_probs(h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE)

    # Remember each side's new rating for its next game.
    latest_elo[h_team] = (season, h_team_elo_after)
    latest_elo[a_team] = (season, a_team_elo_after)

    elo_records.append({'GAMEID': game_id, 'TEAM_HOME': h_team, 'TEAM_AWAY': a_team,
                        'TEAM_ELO_BEFORE_HOME': h_team_elo_before, 'TEAM_ELO_BEFORE_AWAY': a_team_elo_before,
                        'TEAM_ELO_AFTER_HOME': h_team_elo_after, 'TEAM_ELO_AFTER_AWAY': a_team_elo_after,
                        'ODDS_HOME': odds_home_team, 'PROBS_HOME': probs_home_team, 'PROBS_AWAY': probs_away_team})
    team_elo_records.append({'GAMEID': game_id, 'TEAM': h_team, 'ELO': h_team_elo_before, 'DATE': game_date,
                             'WHERE_PLAYED': 'Home', 'SEASON': season})
    team_elo_records.append({'GAMEID': game_id, 'TEAM': a_team, 'ELO': a_team_elo_before, 'DATE': game_date,
                             'WHERE_PLAYED': 'Away', 'SEASON': season})

elo_df = pd.DataFrame(elo_records, columns=['GAMEID', 'TEAM_HOME', 'TEAM_AWAY', 'TEAM_ELO_BEFORE_HOME', 'TEAM_ELO_BEFORE_AWAY',
                                            'TEAM_ELO_AFTER_HOME', 'TEAM_ELO_AFTER_AWAY', 'ODDS_HOME', 'PROBS_HOME', 'PROBS_AWAY'])
teams_elo_df = pd.DataFrame(team_elo_records, columns=['GAMEID', 'TEAM', 'ELO', 'DATE', 'WHERE_PLAYED', 'SEASON'])

In [316]:
elo_df['TEAM_ELO_BEFORE_AWAY'].value_counts()

1500.000000    85
1506.235734     5
1507.692053     4
1504.687767     4
1505.731675     3
               ..
1460.294409     1
1713.409002     1
1558.028887     1
1551.047674     1
1459.938484     1
Name: TEAM_ELO_BEFORE_AWAY, Length: 7264, dtype: int64

In [317]:
elo_df['TEAM_ELO_BEFORE_HOME'].value_counts()

1500.000000    95
1492.785063     5
1493.764266     4
1489.142957     3
1507.214937     3
               ..
1743.025829     1
1566.586534     1
1305.431636     1
1270.924426     1
1644.474755     1
Name: TEAM_ELO_BEFORE_HOME, Length: 7254, dtype: int64

In [318]:
85+95

180

In [319]:
# Copy the per-season Elo features onto the game table. Assignment aligns on
# the row index, so df and elo_df must share it.
for elo_col in ('TEAM_ELO_BEFORE_HOME', 'TEAM_ELO_BEFORE_AWAY',
                'TEAM_ELO_AFTER_HOME', 'TEAM_ELO_AFTER_AWAY',
                'ODDS_HOME', 'PROBS_HOME', 'PROBS_AWAY'):
    df[elo_col] = elo_df[elo_col]

In [320]:
list(df.isna().sum())

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [321]:
df['ODDS_HOME']

0       1.698244
1       1.698244
2       1.698244
3       1.698244
4       1.698244
          ...   
7375    6.741837
7376    3.202098
7377    3.598549
7378    0.929183
7379    4.912931
Name: ODDS_HOME, Length: 7380, dtype: float64

In [322]:
df.head()

Unnamed: 0,TEAM_HOME,TEAM_AWAY,FG_HOME,FG_AWAY,FGA_HOME,FGA_AWAY,FG_PCT_HOME,FG_PCT_AWAY,FG(3)_HOME,FG(3)_AWAY,...,AWAY_GAMES_TEAM_AWAY,HOME_GAME_WIN_RATE_HOME,AWAY_GAME_WIN_RATE_AWAY,TEAM_ELO_BEFORE_HOME,TEAM_ELO_BEFORE_AWAY,TEAM_ELO_AFTER_HOME,TEAM_ELO_AFTER_AWAY,ODDS_HOME,PROBS_HOME,PROBS_AWAY
0,IND,ORL,34,36,71,93,0.479,0.387,7,9,...,0,0.0,0.0,1500,1500,1507.692053,1492.307947,1.698244,0.629389,0.370611
1,MIA,CHI,37,35,72,83,0.514,0.422,11,7,...,0,0.0,0.0,1500,1500,1508.625029,1491.374971,1.698244,0.629389,0.370611
2,LAL,LAC,42,41,93,83,0.452,0.494,14,8,...,0,0.0,0.0,1500,1500,1509.082043,1490.917957,1.698244,0.629389,0.370611
3,CLE,BRK,35,33,84,82,0.417,0.402,5,9,...,0,0.0,0.0,1500,1500,1504.687767,1495.312233,1.698244,0.629389,0.370611
4,TOR,BOS,38,32,86,66,0.442,0.485,5,3,...,0,0.0,0.0,1500,1500,1505.731675,1494.268325,1.698244,0.629389,0.370611


In [323]:
df.to_csv('./cleaned_data/team_data/full_data_with_elo_per_season.csv')

In [324]:
df['H_TEAM_WINS_AT_HOME']

0        0
1        0
2        0
3        0
4        0
        ..
7375    22
7376    24
7377    20
7378    25
7379    25
Name: H_TEAM_WINS_AT_HOME, Length: 7380, dtype: int64

In [325]:
last = df.tail(1)
_ = last['WIN_HOME']
_

7379    1
Name: WIN_HOME, dtype: int64

In [326]:
def get_avg_stats_last_n_games(year, team, date, n, games=None):
    """Summarise a team's form over its last ``n`` games played before ``date``.

    The rows of ``games`` whose ``SEASON`` equals ``year``, that involve ``team``
    (as ``TEAM_HOME`` or ``TEAM_AWAY``) and whose ``DATE`` is strictly less than
    ``date`` are selected, and the last ``n`` of them (in row order) form the
    window. Row order is therefore expected to be chronological. For every game
    in the window the team's own side of the box score is used: the ``*_HOME``
    columns when it hosted, the ``*_AWAY`` columns when it travelled.

    Parameters
    ----------
    year
        Season label, compared for equality with the ``SEASON`` column.
    team
        Team label, compared for equality with ``TEAM_HOME`` / ``TEAM_AWAY``.
    date
        Exclusive upper bound, compared with ``<`` against the ``DATE`` column.
    n : int
        Maximum number of prior games in the window. Early in a season the
        window holds fewer than ``n`` games.
    games : pandas.DataFrame, optional
        Game-level frame to read. Defaults to the notebook-level ``df`` so
        existing four-argument calls behave as before.

    Returns
    -------
    pandas.Series
        * ``<STAT>_LAST_{n}_GAMES`` - mean of each per-game stat over the window
          (``WIN`` is reported as ``WIN_PCT_LAST_{n}_GAMES``).
        * ``WINS_LAST_{n}_GAMES`` / ``LOSSES_LAST_{n}_GAMES`` - wins and losses
          counted over the games actually in the window.
        * ``HOME_*`` / ``AWAY_*`` ``..._LAST_{n}_GAMES`` - wins, games and win
          rate split by venue inside the window.
        * ``CURRENT_TOTAL_WINS`` / ``CURRENT_TOTAL_LOSSES`` / ``ELO`` - the
          ``WINS_RECORD``, ``LOSSES_RECORD`` and ``TEAM_ELO_AFTER`` values on the
          most recent game in the window.
        * ``CURRENT_TOTAL_HOME_*`` / ``CURRENT_TOTAL_AWAY_*`` - the venue records
          on the most recent home / away game in the window. The away values
          are NaN when the window holds no away game.
        * ``TEAM`` - the ``team`` argument.

    Raises
    ------
    IndexError
        If the window holds no home game for ``team`` (which includes the case
        of no prior games at all). Callers rely on this to skip such games.
    """
    # Fall back to the notebook-level frame so existing calls keep working.
    if games is None:
        games = df

    home_cols = [col for col in games.columns if '_HOME' in col]
    away_cols = [col for col in games.columns if '_AWAY' in col]

    involves_team = (games['TEAM_HOME'] == team) | (games['TEAM_AWAY'] == team)
    in_scope = (games['SEASON'] == year) & involves_team & (games['DATE'] < date)
    # After the reset, the positional index is the team's game order within the season.
    window = games.loc[in_scope].reset_index(drop=True).tail(n)

    home = window.loc[window['TEAM_HOME'] == team, home_cols].copy()
    away = window.loc[window['TEAM_AWAY'] == team, away_cols].copy()

    if home.empty:
        raise IndexError(
            f'{team} has no home game among its last {n} games before {date} in season {year}'
        )

    home_wins = home['WIN_HOME'].sum()
    away_wins = away['WIN_AWAY'].sum()
    home_games = home_wins + home['LOSS_HOME'].sum()
    away_games = away_wins + away['LOSS_AWAY'].sum()
    home_win_pct = home['WIN_HOME'].mean()
    away_win_pct = away['WIN_AWAY'].mean()

    # Strip the venue suffix so both sides share one set of stat names.
    home.columns = [col.replace('_HOME', '') for col in home.columns]
    away.columns = [col.replace('_AWAY', '') for col in away.columns]

    # concat stacks every away game after the home games; sorting on the window's
    # index restores game order so the last row really is the most recent game.
    total = pd.concat([home, away]).sort_index()

    elo = total['TEAM_ELO_AFTER'].iloc[-1]
    wins_amount = total['WINS_RECORD'].iloc[-1]
    losses_amount = total['LOSSES_RECORD'].iloc[-1]

    home_wins_amount = home['H_TEAM_WINS_AT'].iloc[-1]
    home_losses_amount = home['H_TEAM_LOSSES_AT'].iloc[-1]

    # No away game in the window: report NaN (callers fill these in afterwards).
    if away.empty:
        away_wins_amount = float('nan')
        away_losses_amount = float('nan')
    else:
        away_wins_amount = away['A_TEAM_WINS_AT'].iloc[-1]
        away_losses_amount = away['A_TEAM_LOSSES_AT'].iloc[-1]

    games_in_window = len(total)
    wins_in_window = home_wins + away_wins

    # Season-to-date and bookkeeping columns are not per-game stats; drop them before
    # averaging. The season WIN_PCT goes first so the per-game WIN flag can take its name.
    stats = total.drop(columns=['TEAM', 'LOSS', 'WINS_RECORD', 'LOSSES_RECORD', 'GAME_NO', 'WIN_PCT','H_TEAM_WINS_AT', 'H_TEAM_LOSSES_AT',
       'HOME_GAMES_TEAM', 'HOME_GAME_WIN_RATE', 'A_TEAM_WINS_AT','A_TEAM_LOSSES_AT', 'AWAY_GAMES_TEAM', 'AWAY_GAME_WIN_RATE', 'TEAM_ELO_BEFORE',
       'TEAM_ELO_AFTER', 'ODDS', 'PROBS'])
    stats = stats.rename(columns={'WIN':'WIN_PCT'})
    stats.columns = [col+f'_LAST_{n}_GAMES' for col in stats.columns]
    summary = stats.mean()

    # Count over the games actually available: scaling the win rate by n overstates
    # both figures whenever the window is shorter than n.
    summary[f'WINS_LAST_{n}_GAMES'] = wins_in_window
    summary[f'LOSSES_LAST_{n}_GAMES'] = games_in_window - wins_in_window
    summary[f'HOME_WINS_LAST_{n}_GAMES'] = home_wins
    summary[f'AWAY_WINS_LAST_{n}_GAMES'] = away_wins
    summary[f'HOME_GAMES_IN_LAST_{n}_GAMES'] = home_games
    summary[f'AWAY_GAMES_IN_LAST_{n}_GAMES'] = away_games
    summary[f'HOME_WIN_PCT_IN_LAST_{n}_GAMES'] = home_win_pct
    summary[f'AWAY_WIN_PCT_IN_LAST_{n}_GAMES'] = away_win_pct
    summary['CURRENT_TOTAL_WINS'] = wins_amount
    summary['CURRENT_TOTAL_LOSSES'] = losses_amount
    summary['CURRENT_TOTAL_AWAY_WINS'] = away_wins_amount
    summary['CURRENT_TOTAL_AWAY_LOSSES'] = away_losses_amount
    summary['CURRENT_TOTAL_HOME_WINS'] = home_wins_amount
    summary['CURRENT_TOTAL_HOME_LOSSES'] = home_losses_amount
    summary['ELO'] = elo # ADDED ELO COLUMN
    summary['TEAM'] = team
    return summary

In [327]:
# Try the helper on one team: OKC's 10-game form in season 2019.
# NOTE(review): '17-12-19' is an ambiguous date string - confirm how it is parsed when it
# is compared against the datetime DATE column, and that it falls in the intended season.
get_avg_stats_last_n_games(2019,'OKC','17-12-19',10)

FG_LAST_10_GAMES                         43.3
FGA_LAST_10_GAMES                        95.7
FG_PCT_LAST_10_GAMES                    0.453
FG(3)_LAST_10_GAMES                      14.2
FGA(3)_LAST_10_GAMES                     38.7
FG_PCT(3)_LAST_10_GAMES                0.3648
FT_LAST_10_GAMES                         13.9
FTA_LAST_10_GAMES                        19.2
FT_PCT_LAST_10_GAMES                   0.7197
ORB_LAST_10_GAMES                        13.8
DRB_LAST_10_GAMES                        33.4
TRB_LAST_10_GAMES                        47.2
AST_LAST_10_GAMES                        26.4
STL_LAST_10_GAMES                         7.8
BLK_LAST_10_GAMES                         4.8
TOV_LAST_10_GAMES                        11.5
PF_LAST_10_GAMES                         20.5
PTS_LAST_10_GAMES                       114.7
TS_PCT_LAST_10_GAMES                   0.5509
EFG_PCT_LAST_10_GAMES                  0.5269
3PA_R_LAST_10_GAMES                    0.4033
FT_R_LAST_10_GAMES                

In [328]:
# Wrap the returned Series in a DataFrame and transpose it, giving one row whose columns
# are the feature labels.
okc_test = pd.DataFrame(get_avg_stats_last_n_games(2019,'OKC','17-12-19',10)).T

In [329]:
# List the feature names the helper produces.
okc_test.columns

Index(['FG_LAST_10_GAMES', 'FGA_LAST_10_GAMES', 'FG_PCT_LAST_10_GAMES',
       'FG(3)_LAST_10_GAMES', 'FGA(3)_LAST_10_GAMES',
       'FG_PCT(3)_LAST_10_GAMES', 'FT_LAST_10_GAMES', 'FTA_LAST_10_GAMES',
       'FT_PCT_LAST_10_GAMES', 'ORB_LAST_10_GAMES', 'DRB_LAST_10_GAMES',
       'TRB_LAST_10_GAMES', 'AST_LAST_10_GAMES', 'STL_LAST_10_GAMES',
       'BLK_LAST_10_GAMES', 'TOV_LAST_10_GAMES', 'PF_LAST_10_GAMES',
       'PTS_LAST_10_GAMES', 'TS_PCT_LAST_10_GAMES', 'EFG_PCT_LAST_10_GAMES',
       '3PA_R_LAST_10_GAMES', 'FT_R_LAST_10_GAMES', 'ORB_PCT_LAST_10_GAMES',
       'DRB_PCT_LAST_10_GAMES', 'TRB_PCT_LAST_10_GAMES',
       'AST_PCT_LAST_10_GAMES', 'STL_PCT_LAST_10_GAMES',
       'BLK_PCT_LAST_10_GAMES', 'TOV_PCT_LAST_10_GAMES', 'O_RTG_LAST_10_GAMES',
       'D_RTG_LAST_10_GAMES', 'WIN_PCT_LAST_10_GAMES',
       'FG(3)_MISSED_LAST_10_GAMES', 'FG_MISSED_LAST_10_GAMES',
       'BLOCKED_ATTEMPTS_LAST_10_GAMES', 'TOV_FORCED_LAST_10_GAMES',
       'FG(2)_LAST_10_GAMES', 'FGA(2)_LAST_10_GAMES

In [330]:
# Build one model row per game in df: the home side's and the away side's form over
# their previous 10 games, plus identifying columns and the label.
inshallah = []
skipped_games = 0
for key, value in df.iterrows():
    # The helper raises IndexError when it cannot find the prior games it needs.
    # Only those two calls are guarded, so an IndexError from the assembly code below
    # surfaces instead of silently dropping the game.
    try:
        home = get_avg_stats_last_n_games(value['SEASON'], value['TEAM_HOME'], value['DATE'], 10)
        away = get_avg_stats_last_n_games(value['SEASON'], value['TEAM_AWAY'], value['DATE'], 10)
    except IndexError:
        skipped_games += 1
        continue

    # Keep only the venue-specific features that match the side being described.
    home = home.drop(labels=[x for x in home.index if 'AWAY' in x])
    away = away.drop(labels=[x for x in away.index if 'HOME' in x])
    home.index = [i+'_HOME' for i in home.index]
    away.index = [i+'_AWAY' for i in away.index]

    combined = pd.concat([home, away])
    combined['ODDS_HOME'] = value['ODDS_HOME'] # ADDED HOME ODDS COLUMN
    for column in ('GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE', 'SEASON'):
        combined[column] = value[column]
    combined['HOME_WIN/LABEL'] = value['WIN_HOME']

    # Stored as a one-row DataFrame so the rows can be stacked with pd.concat afterwards.
    inshallah.append(pd.DataFrame(combined).T)

print(f'{len(inshallah)} games kept, {skipped_games} skipped (helper raised IndexError)')

In [331]:
# Stack the one-row frames into a single table. Each piece carries index label 0, so
# ignore_index=True is used to give every game its own row label.
elo_10 = pd.concat(inshallah, ignore_index=True)


In [332]:
# Missing values per column of elo_10, largest counts first.
elo_10.isna().sum().sort_values(ascending=False)

AWAY_WIN_PCT_IN_LAST_10_GAMES_AWAY    47
CURRENT_TOTAL_AWAY_LOSSES_AWAY        47
CURRENT_TOTAL_AWAY_WINS_AWAY          47
FG_LAST_10_GAMES_HOME                  0
BLK_PCT_LAST_10_GAMES_AWAY             0
                                      ..
FG(3)_MISSED_LAST_10_GAMES_HOME        0
WIN_PCT_LAST_10_GAMES_HOME             0
D_RTG_LAST_10_GAMES_HOME               0
O_RTG_LAST_10_GAMES_HOME               0
HOME_WIN/LABEL                         0
Length: 108, dtype: int64

In [333]:
# Replace every remaining NaN in elo_10 with 0, modifying the frame in place.
elo_10.fillna(0, inplace=True)

In [334]:
# Save the 10-game feature table for modelling. to_csv is called with its defaults, so
# the row index is written as an unnamed first column.
elo_10.to_csv('./cleaned_data/data_to_model/elo_10_games_full.csv')

In [335]:
# Display the 10-game feature table.
elo_10

Unnamed: 0,FG_LAST_10_GAMES_HOME,FGA_LAST_10_GAMES_HOME,FG_PCT_LAST_10_GAMES_HOME,FG(3)_LAST_10_GAMES_HOME,FGA(3)_LAST_10_GAMES_HOME,FG_PCT(3)_LAST_10_GAMES_HOME,FT_LAST_10_GAMES_HOME,FTA_LAST_10_GAMES_HOME,FT_PCT_LAST_10_GAMES_HOME,ORB_LAST_10_GAMES_HOME,...,ELO_AWAY,TEAM_AWAY,ODDS_HOME,GAMEID,GAME_NO_HOME,GAME_NO_AWAY,OTs,DATE,SEASON,HOME_WIN/LABEL
0,36.0,76.0,0.4740,9.0,25.0,0.3600,15.0,24.0,0.6250,14.0,...,1507.214937,DAL,1.716595,201311010HOU,2.0,2.0,0,2013-11-01,2014,1
0,43.0,83.0,0.5180,4.0,19.0,0.2110,14.0,18.0,0.7780,11.0,...,1492.962659,UTA,1.863367,201311010PHO,2.0,2.0,0,2013-11-01,2014,1
0,33.0,81.0,0.4070,8.0,21.0,0.3810,16.0,23.0,0.6960,11.0,...,1490.917957,LAC,1.732959,201311010SAC,2.0,3.0,0,2013-11-01,2014,0
0,38.5,91.0,0.4225,11.0,23.5,0.4635,17.0,24.5,0.7025,13.5,...,1506.235734,SAS,1.562900,201311010LAL,3.0,2.0,0,2013-11-01,2014,0
0,32.0,70.5,0.4540,8.5,20.0,0.4235,23.5,31.0,0.7605,8.0,...,1498.600276,CLE,1.880949,201311020IND,3.0,3.0,0,2013-11-02,2014,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,41.3,91.0,0.4552,10.2,28.7,0.3531,21.7,28.7,0.7545,12.4,...,1336.570066,CHI,6.741837,201904100PHI,82.0,82.0,0,2019-04-10,2019,1
0,42.9,89.7,0.4794,10.2,28.5,0.3551,14.6,18.7,0.7871,10.4,...,1447.956222,DAL,3.202098,201904100SAS,82.0,82.0,0,2019-04-10,2019,0
0,39.3,89.2,0.4425,9.0,29.7,0.3015,14.7,19.4,0.7530,9.9,...,1461.666686,MIN,3.598549,201904100DEN,82.0,82.0,0,2019-04-10,2019,1
0,41.6,84.6,0.4923,11.4,29.4,0.3862,21.2,26.8,0.7927,7.6,...,1627.484293,UTA,0.929183,201904100LAC,82.0,82.0,1,2019-04-10,2019,1


In [336]:
# Build one model row per game in df: the home side's and the away side's form over
# their previous 5 games, plus identifying columns and the label.
inshallah = []
skipped_games = 0
for key, value in df.iterrows():
    # The helper raises IndexError when it cannot find the prior games it needs.
    # Only those two calls are guarded, so an IndexError from the assembly code below
    # surfaces instead of silently dropping the game.
    try:
        home = get_avg_stats_last_n_games(value['SEASON'], value['TEAM_HOME'], value['DATE'], 5)
        away = get_avg_stats_last_n_games(value['SEASON'], value['TEAM_AWAY'], value['DATE'], 5)
    except IndexError:
        skipped_games += 1
        continue

    # Keep only the venue-specific features that match the side being described.
    home = home.drop(labels=[x for x in home.index if 'AWAY' in x])
    away = away.drop(labels=[x for x in away.index if 'HOME' in x])
    home.index = [i+'_HOME' for i in home.index]
    away.index = [i+'_AWAY' for i in away.index]

    combined = pd.concat([home, away])
    for column in ('ODDS_HOME', 'GAMEID', 'GAME_NO_HOME', 'GAME_NO_AWAY', 'OTs', 'DATE', 'SEASON'):
        combined[column] = value[column]
    combined['HOME_WIN/LABEL'] = value['WIN_HOME']

    # Stored as a one-row DataFrame so the rows can be stacked with pd.concat afterwards.
    inshallah.append(pd.DataFrame(combined).T)

print(f'{len(inshallah)} games kept, {skipped_games} skipped (helper raised IndexError)')

In [337]:
# Stack the one-row frames into a single table. Each piece carries index label 0, so
# ignore_index=True is used to give every game its own row label.
elo_5 = pd.concat(inshallah, ignore_index=True)
# Missing values per column, largest counts first.
elo_5.isna().sum().sort_values(ascending=False)

AWAY_WIN_PCT_IN_LAST_5_GAMES_AWAY    231
CURRENT_TOTAL_AWAY_LOSSES_AWAY       231
CURRENT_TOTAL_AWAY_WINS_AWAY         231
FG_LAST_5_GAMES_HOME                   0
BLK_PCT_LAST_5_GAMES_AWAY              0
                                    ... 
FG(3)_MISSED_LAST_5_GAMES_HOME         0
WIN_PCT_LAST_5_GAMES_HOME              0
D_RTG_LAST_5_GAMES_HOME                0
O_RTG_LAST_5_GAMES_HOME                0
HOME_WIN/LABEL                         0
Length: 108, dtype: int64

In [338]:
# Replace every remaining NaN in elo_5 with 0, modifying the frame in place.
elo_5.fillna(0, inplace=True)

In [339]:
# Re-run the missing-value count to confirm the fill left no NaN behind.
elo_5.isna().sum().sort_values(ascending=False)

FG_LAST_5_GAMES_HOME              0
TS_PCT_LAST_5_GAMES_AWAY          0
O_RTG_LAST_5_GAMES_AWAY           0
TOV_PCT_LAST_5_GAMES_AWAY         0
BLK_PCT_LAST_5_GAMES_AWAY         0
                                 ..
FG(3)_MISSED_LAST_5_GAMES_HOME    0
WIN_PCT_LAST_5_GAMES_HOME         0
D_RTG_LAST_5_GAMES_HOME           0
O_RTG_LAST_5_GAMES_HOME           0
HOME_WIN/LABEL                    0
Length: 108, dtype: int64

In [340]:
# Save the 5-game feature table for modelling. to_csv is called with its defaults, so
# the row index is written as an unnamed first column.
elo_5.to_csv('./cleaned_data/data_to_model/elo_5_games_full.csv')