In [None]:
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from statsmodels.tsa.statespace.sarimax import SARIMAX
from google.oauth2 import service_account
from datetime import datetime as date
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.metrics import r2_score,make_scorer,mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller

import joblib
import pickle
import pandas as pd
import pandas_gbq
import numpy as np
import xgboost
import lightgbm
import shap
import os

### Gathering Data and Feature Engineering

In [None]:
player_features= ["min_3gm_avg", "fgm_3gm_avg", "fga_3gm_avg", "fg%_3gm_avg", "3pm_3gm_avg", 
                   "3pa_3gm_avg", "3p%_3gm_avg", "ftm_3gm_avg", "fta_3gm_avg", "ft%_3gm_avg", 
                   "oreb_3gm_avg", "dreb_3gm_avg", "reb_3gm_avg", "ast_3gm_avg", "stl_3gm_avg", 
                   "blk_3gm_avg", "to_3gm_avg", "pf_3gm_avg", "pts_3gm_avg", "plus_mins_3gm_avg",
                   "pts_season", "pts_momentum", "min_season", "min_momentum", "fgm_season", "fgm_momentum", 
                    "fga_season", "fga_momentum", "fg%_season", "fg%_momentum", "3pm_season", "3pm_momentum", 
                    "3pa_season", "3pa_momentum", "3p%_season", "3p%_momentum", "ftm_season", "ftm_momentum", 
                    "fta_season", "fta_momentum", "ft%_season", "ft%_momentum", "oreb_season", "oreb_momentum", 
                    "dreb_season", "dreb_momentum", "reb_season", "reb_momentum", "ast_season", "ast_momentum", 
                    "stl_season", "stl_momentum", "blk_season", "blk_momentum", "to_season", "to_momentum", 
                    "pf_season", "pf_momentum", "plus_mins_season", "plus_mins_momentum"]

team_features = ["home","away","offrtg_3gm_avg", "defrtg_3gm_avg", "netrtg_3gm_avg", "ast%_3gm_avg", "ast_to_3gm_avg", 
                    "ast_ratio_3gm_avg", "oreb%_3gm_avg", "dreb%_3gm_avg", "reb%_3gm_avg", "tov%_3gm_avg", 
                    "efg%_3gm_avg", "ts%_3gm_avg", "pace_3gm_avg", "pie_3gm_avg",
                    "netrtg_season", "netrtg_momentum", "offrtg_season", "offrtg_momentum", "defrtg_season", "defrtg_momentum", 
                    "ast%_season", "ast%_momentum", "ast_to_season", "ast_to_momentum", "ast_ratio_season", "ast_ratio_momentum", 
                    "oreb%_season", "oreb%_momentum", "dreb%_season", "dreb%_momentum", "reb%_season", "reb%_momentum", 
                    "tov%_season", "tov%_momentum", "efg%_season", "efg%_momentum", "ts%_season", "ts%_momentum", 
                    "pace_season", "pace_momentum", "pie_season", "pie_momentum"]

#using shifted windows for rolling data to prevent data leakage
player_query = f""" 
SELECT player,team,game_id,game_date,matchup,pts,reb,ast,`3pm`, {','.join([f'`{player}`' for player in player_features])},season
from `capstone_data.player_modeling_data`
order by game_date asc
"""

team_query = f"""
SELECT team,game_id,game_date,home,away, {', '.join([f'`{team}`' for team in team_features])}
from `capstone_data.team_modeling_data`
order by game_date asc
"""


In [None]:
try:
    full_data = pd.read_csv('full_data.csv')

except:
    nba_player_data = pd.DataFrame(pandas_gbq.read_gbq(player_query,project_id='miscellaneous-projects-444203'))
    team_data = pd.DataFrame(pandas_gbq.read_gbq(team_query,project_id='miscellaneous-projects-444203'))
    opponent_data = team_data.rename(columns={
    col: ('matchup' if col == 'team' else 'game_id' if col == 'game_id' else f'opponent_{col}') for col in team_data.columns})
    full_data = nba_player_data.merge(team_data, on = ['game_id','team'], how = 'inner',suffixes=('','remove'))
    full_data = full_data.merge(opponent_data,on = ['game_id','matchup'],how = 'inner',suffixes=('','remove'))
    full_data.drop([column for column in full_data.columns if 'remove' in column],axis = 1 , inplace=True) 
    full_data.drop([column for column in full_data.columns if '_1' in column],axis = 1 , inplace=True)
    full_data.to_csv('full_data.csv',mode = 'x')

In [None]:
data_ordered = full_data.sort_values('game_date')

data_ordered.dropna(inplace=True)


#### Feature Engineering Ideas 

* (ratio of 3pa and fga and 3pm and 3pa) TS% for players efg% 
* for players assist_to_turnover ratio assist ratio, 
* rebound_cahnce, defesnive reb %, 
* ast_ratio_season * pace, 
* home * pts season - data pts 3pm avg,
* cold_streak pts_3gm_avg < pts_season boolean, 
* away difficulty away * opponent_defrtg_3gm_avg,
* home_performance = data_ordered[data_ordered["home"] == 1].groupby("team")["pts_season"].mean()
* away_performance = data_ordered[data_ordered["away"] == 1].groupby("team")["pts_season"].mean() these would be to see how the team performance changes 


In [None]:
data_ordered['pts_per_min_3gm'] = data_ordered['pts_3gm_avg']/data_ordered['min_3gm_avg']
data_ordered['pts_per_min_season'] = data_ordered['pts_season']/data_ordered['min_season']
data_ordered['pts_per_min_momentum'] = data_ordered['pts_per_min_3gm'] - data_ordered['pts_per_min_season']

data_ordered['3pm_per_min_3gm'] = data_ordered['3pm_3gm_avg']/data_ordered['min_3gm_avg']
data_ordered['3pm_per_min_season'] = data_ordered['3pm_season']/data_ordered['min_season']
data_ordered['3pm_per_min_momentum'] = data_ordered['3pm_per_min_3gm'] - data_ordered['3pm_per_min_season'] 

data_ordered['reb_per_min_3gm'] = data_ordered['reb_3gm_avg']/data_ordered['min_3gm_avg']
data_ordered['reb_per_min_season'] = data_ordered['reb_season']/data_ordered['min_season']
data_ordered['reb_per_min_momentum'] = data_ordered['3pm_per_min_3gm'] - data_ordered['reb_per_min_season']

home_performance = data_ordered[data_ordered['home'] == 1]
away_performance = data_ordered[data_ordered['away'] == 1]

In [None]:
# Ensure data is sorted correctly for chronological calculations
data_ordered = data_ordered.sort_values(by=['player', 'season', 'game_date'])

# Separate home and away games
home_performance = data_ordered[data_ordered['home'] == 1]
away_performance = data_ordered[data_ordered['home'] == 0]  # Fixed to align with `home` flag

# Compute season-to-date averages for home and away games (including game_id)
home_rolling = (
    home_performance.groupby(['player', 'season'])[['game_id', 'pts', 'reb', 'ast', '3pm']]
    .apply(lambda x: x.set_index('game_id').expanding().mean().shift(1))  # Prevent data leakage
    .reset_index()
)

away_rolling = (
    away_performance.groupby(['player', 'season'])[['game_id', 'pts', 'reb', 'ast', '3pm']]
    .apply(lambda x: x.set_index('game_id').expanding().mean().shift(1))
    .reset_index()
)

# Rename columns before merging
home_rolling = home_rolling.rename(columns={'pts': 'home_avg_pts', 'reb': 'home_avg_reb', 
                                            'ast': 'home_avg_ast', '3pm': 'home_avg_3pm'})
away_rolling = away_rolling.rename(columns={'pts': 'away_avg_pts', 'reb': 'away_avg_reb', 
                                            'ast': 'away_avg_ast', '3pm': 'away_avg_3pm'})

# Merge rolling averages back into `data_ordered`
data_ordered = data_ordered.merge(home_rolling[['player', 'game_id', 'home_avg_pts', 'home_avg_reb', 'home_avg_ast', 'home_avg_3pm']],
                                  on=['player', 'game_id'], how='left')

data_ordered = data_ordered.merge(away_rolling[['player', 'game_id', 'away_avg_pts', 'away_avg_reb', 'away_avg_ast', 'away_avg_3pm']],
                                  on=['player', 'game_id'], how='left')

# Fill missing values for early season games
for cat in ['pts', 'reb', 'ast', '3pm']:
    data_ordered[f'home_avg_{cat}'] = data_ordered[f'home_avg_{cat}'].fillna(0)
    data_ordered[f'away_avg_{cat}'] = data_ordered[f'away_avg_{cat}'].fillna(0)

    # Compute home vs. away performance difference conditionally
    data_ordered[f'{cat}_home_away_diff'] = (
        (data_ordered['home'] == 1) * (data_ordered[f'home_avg_{cat}'] - data_ordered[f'away_avg_{cat}']) +
        (data_ordered['home'] == 0) * (data_ordered[f'away_avg_{cat}'] - data_ordered[f'home_avg_{cat}'])
    )

# Drop unnecessary columns
data_ordered = data_ordered.drop(columns=[f'home_avg_{cat}' for cat in ['pts', 'reb', 'ast', '3pm']] + 
                                           [f'away_avg_{cat}' for cat in ['pts', 'reb', 'ast', '3pm']])



In [None]:
data_ordered.drop(columns = ['Unnamed: 0'],inplace=True)

In [None]:
data_ordered = data_ordered.groupby(['player','season']).apply(lambda x: x.iloc[3:]).reset_index(drop=True)

In [None]:
data_ordered.sort_values(by='game_date',inplace=True)

In [None]:
data_ordered['game_date'] = pd.to_datetime(data_ordered['game_date'])

In [None]:
data_ordered['days_ago'] = (data_ordered['game_date'].max() - data_ordered['game_date']).dt.days
data_ordered['time_decay_weight'] = 1 / (1 + np.log(1 + data_ordered['days_ago']))

In [None]:
# WILL APPLY POLYNOMIAL FEATURES IN THE FUTURE
# poly = PolynomialFeatures(2,include_bias=False,interaction_only=False)
# percentages = [feature for feature in data_ordered.columns if '%' in feature]

# for col in percentages:
#     transformed_data = poly.fit_transform(data_ordered[[col]])

#     column_names = [f"{col}_poly_{i+1}" for i in range(transformed_data.shape[1])]

#     poly_df = pd.DataFrame(transformed_data,columns=column_names,index=data_ordered)

# print(poly_df['opponent_ts%_momentum_poly_2'])

In [None]:
pd.set_option('display.max_columns',100000)

In [None]:
numeric_columns = data_ordered.select_dtypes(include=['number']).columns.tolist()
numeric_columns = [column for column in numeric_columns if column not in ['pts','reb','ast','blk','stl','3pm','game_id','game_date','days_ago','time_decay_weight']]

features = {feature:[] for feature in ['pts','reb','ast','3pm']}

In [None]:
for category in features.keys():
    print(category)
    for column in numeric_columns:
        correlation = pearsonr(data_ordered[column],data_ordered[category])
        if correlation[1] < .05 and abs(correlation[0]) > .3:
            print(column)
            print(f'correlation {correlation[0]} p_value {correlation[1]}')
            features[category].append(column)
    features[category].append('time_decay_weight')

In [None]:
# Checking spearmanr 

for category in features.keys():
    print(category)
    for column in numeric_columns:
        pearson_corr, pearson_p = pearsonr(data_ordered[column], data_ordered[category])
        spearman_corr, spearman_p = spearmanr(data_ordered[column], data_ordered[category])

        # If Spearman is high but Pearson is low, it suggests a non-linear relationship
        if abs(spearman_corr) > 0.3 and abs(pearson_corr) < 0.2:
            print(f"🚀 {column} likely has a non-linear relationship with {category}")


In [None]:
#These values appeared to have non-linear relationships applying transformations
data_ordered['ft%_season'] = np.log1p(data_ordered['ft%_season'])
data_ordered['stl_3gm_avg'] = np.log1p(data_ordered['stl_3gm_avg'])
data_ordered['stl_season'] = np.log1p(data_ordered['stl_season'])
data_ordered['to_season'] = data_ordered['to_season']**2 
data_ordered['to_3gm_avg'] = data_ordered['to_3gm_avg']**2 

In [None]:
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
saved_models = {category:{} for category in features.keys()}

#### SHAP
Applying shap to help reduce collinearity

### Linear Model

In [None]:
for category in features.keys():
    features[category] = [f for f in features[category] if f != category]


In [None]:
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]

for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    linear_model = LinearRegression()

    linear_model.fit(x_train,y_train)

    output = pd.DataFrame({'prediction':linear_model.predict(x_test), 'actual':y_test})
    print(category)
    print(r2_score(y_true=output['actual'],y_pred=output['prediction']))

    saved_models[category]['linear_model'] = linear_model

In [None]:
#Assessing for multicollinearity
mulitcol_pairs = {cat:[] for cat in features.keys()}
for category in features.keys():
    corr_matrix = data_ordered[features[category]].corr()
    high_corr_vars = np.where(abs(corr_matrix) > 0.8)
    high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) 
                    for x, y in zip(*high_corr_vars) if x != y and x < y]
    mulitcol_pairs[category].append(high_corr_pairs)
      # Drop one from each highly correlated pair


In [None]:
trimmed_feats = {cat:[] for cat in features.keys()}
for cat in mulitcol_pairs.keys():
    print(cat)
    for pairs in mulitcol_pairs[cat]:
        for x,y in pairs:
            cor_x = pearsonr(data_ordered[x],data_ordered[cat])
            cor_y = pearsonr(data_ordered[y],data_ordered[cat])

            if cor_x[1] < .05 and cor_x[0] > cor_y[0]:
                trimmed_feats[cat].append(x)
            elif cor_y[1] < .05 and cor_y[0] > cor_x[0]:
                trimmed_feats[cat].append(y)

In [None]:
for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]
    ridge_model = Ridge(alpha=1)

    ridge_model.fit(x_train,y_train)

    output = pd.DataFrame({'prediction':ridge_model.predict(x_test), 'actual':y_test})
    print(category)
    print(r2_score(y_true=output['actual'],y_pred=output['prediction']))

### Random Forrest 

In [None]:
scaler = StandardScaler()
numeric_columns.append('time_decay_weight')
scaled_data = scaler.fit_transform(data_ordered[numeric_columns])

scaled_data_df = pd.DataFrame(scaled_data,columns=numeric_columns)

split_index = int(len(data_ordered) * .80)

scaled_train_data = scaled_data_df.iloc[:split_index]
scaled_test_data = scaled_data_df[split_index:]

In [None]:
rand_forrest = RandomForestRegressor(n_estimators=100,criterion='squared_error',max_depth=10, min_samples_split=4,n_jobs=-1)

for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = scaled_train_data[features_list],train_data[category]
    x_test, y_test = scaled_test_data[features_list],test_data[category]

    rand_forrest.fit(x_train,y_train)

    y_pred = rand_forrest.predict(x_test)

    r2 = r2_score(y_test,y_pred)

    print(category)
    print(r2)


In [None]:
rand_forrest = RandomForestRegressor(n_estimators=100,criterion='squared_error',max_depth=10, min_samples_split=4,n_jobs=-1)

for category in features.keys():
    features_list = [f for f in features[category] if f != category]
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]

    rand_forrest.fit(x_train,y_train)

    y_pred = rand_forrest.predict(x_test)

    r2 = r2_score(y_test,y_pred)

    print(category)
    print(r2)
    saved_models[category]['Random_Forrest'] = rand_forrest

In [None]:
import matplotlib.pyplot as plt

plt.hist(data_ordered['time_decay_weight'], bins=30)
plt.title("Distribution of Time Decay Weights")
plt.xlabel("Decay Weight")
plt.ylabel("Count")
plt.show()

### XGboost

In [None]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(data_ordered[numeric_columns])

scaled_data_df = pd.DataFrame(scaled_data,columns=numeric_columns)

split_index = int(len(data_ordered) * .80)

scaled_train_data = scaled_data_df.iloc[:split_index]
scaled_test_data = scaled_data_df[split_index:]

In [None]:
param_grid = {'max_depth':[3,6,9],'learning_rate':[.01,.05,.1,.3],'booster':['gbtree','dart'],'subsample':[.5,.7,.9],'colsample_bytree':[.5,.7,.9],'n_estimators':[100,300,500]}
param_linear = {'booster':['gblinear'],'lambda':[0,.1,1,10],'alpha':[0,.1,1,10]}

In [None]:
xgb_regressor = xgboost.XGBRegressor()
mse_score = make_scorer(mean_squared_error,greater_is_better=False)
r2_scorer = make_scorer(r2_score)
scoring = {'MSE':mse_score,'r2':r2_scorer}
grid_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_grid,scoring = scoring,cv=tscv,n_jobs=1,verbose=0,refit='r2')
grid_linear_search = GridSearchCV(estimator=xgb_regressor,param_grid=param_linear,scoring = scoring,cv=tscv,n_jobs=3,verbose=0,refit='r2')


In [None]:
xg_features = [feature for feature in data_ordered.columns if data_ordered[feature].dtype == 'float' and feature not in features.keys()]

In [None]:
for category in features.keys():
    x_train,y_train = scaled_train_data[xg_features],train_data[category]
    x_test, y_test = scaled_test_data[xg_features],test_data[category]

    fit_params = {'eval_set':[(x_test,y_test)],'early_stopping_rounds':20,'verbose':False}

    grid_linear_search.estimator.set_params(eval_metric='rmse')


    grid_linear_search.fit(x_train,y_train)


    print(category)
    print(grid_linear_search.best_params_)
    print(grid_linear_search.best_score_)

    saved_models[category]['XGboost'] = grid_linear_search.best_estimator_

In [None]:
# for category in features.keys():
#     x_train,y_train = scaled_train_data[xg_features],train_data[category]
#     x_test, y_test = scaled_test_data[xg_features],test_data[category]

#     fit_params = {'eval_set':[(x_test,y_test)],'early_stopping_rounds':20,'verbose':False}

#     grid_search.estimator.set_params(eval_metric='rmse')


#     grid_search.fit(x_train,y_train)

#     print(category)
#     print(grid_search.best_params_)
#     print(grid_search.best_score_)

### LightGBM

In [None]:
light = lightgbm.LGBMRegressor(boosting_type='gbdt', n_estimators=500)  # Using hist for faster training

param_grid = {
    'num_leaves': [31, 50],  
    'learning_rate': [0.01, 0.1],  
    'max_depth': [-1, 10],  
}


In [None]:
light_grid_search = GridSearchCV(estimator=light,param_grid=param_grid,cv=tscv,verbose=0,n_jobs=4)

In [None]:
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered[split_index:]
for category in features.keys():
    x_train,y_train = train_data[features[category]],train_data[category]
    x_test,y_test = test_data[features[category]],test_data[category]

    light_grid_search.fit(x_train,y_train)

    best_model = light_grid_search.best_estimator_
    print(category)
    print("Best Parameters:", light_grid_search.best_params_)

    y_pred = best_model.predict(x_test)

    mse = mean_squared_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)

    saved_models[category]['lightgbm'] = best_model
    print(f'MSE: {mse}')
    print(f'R2: {r2}')

### Neural Network

In [None]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(data_ordered[numeric_columns])

scaled_data_df = pd.DataFrame(scaled_data,columns=numeric_columns)

split_index = int(len(data_ordered) * .80)

scaled_train_data = scaled_data_df.iloc[:split_index]
scaled_test_data = scaled_data_df[split_index:]

In [None]:
param_nn = {'activation':['identity','logistic','tanh','relu'],'solver':['lbfgs','sgd','adam'],'alpha':[.0001,.001,.01,.0005],'batch_size':[200,400,600],'learning_rate':['constant','invscaling','adaptive'],'max_iter':[500,1000]}

# nn_grid = GridSearchCV(estimator=MLPRegressor(),param_grid=param_nn,scoring=scoring,cv=tscv,n_jobs=-1,verbose=0,refit='r2')
nn_grid = RandomizedSearchCV(estimator=MLPRegressor(),param_distributions=param_nn,n_iter=30,scoring=scoring,cv=tscv,n_jobs=-1,verbose=1,refit='r2')

for category in features.keys():
    nn_features = [col for col in scaled_data_df.columns if col not in ['game_id','player','team','matchup','pts','reb','ast','3pm']]
    scaled_x_train,scaled_y_train = scaled_train_data[nn_features],train_data[category]
    scaled_x_test,scaled_y_test = scaled_test_data[nn_features],test_data[category]

    nn_grid.fit(scaled_x_train,scaled_y_train)

    best_model = nn_grid.best_estimator_

    y_pred = best_model.predict(scaled_x_test)

    mse = mean_squared_error(scaled_y_test,y_pred)
    r2 = r2_score(scaled_y_test,y_pred)
    print(category)
    print(r2)
    saved_models[category]['MLP'] = best_model

### SARIMAX 

In [128]:
# data_ordered = data_ordered.set_index('game_date')  # Now modifying original
split_index = int(len(data_ordered) * .80)

train_data = data_ordered.iloc[:split_index]
test_data = data_ordered.iloc[split_index:]

data_ordered_daily = data_ordered.resample('D').agg({
    'pts': 'sum',  
    'reb': 'sum',  
    'ast': 'sum',  
    '3pm': 'sum',  
    'game_id': 'count',  
    'player': lambda x: list(x),  
    'team': lambda x: list(x),  
    'matchup': lambda x: list(x)
})


for category in features.keys():
    # Aggregate statistics (Example: Total points scored in each game)
    x_train,y_train = train_data[features_list],train_data[category]
    x_test, y_test = test_data[features_list],test_data[category]

    auto_model = auto_arima(y_train,seasonal=False,trace=True,suppress_warnings=True,max_d=1,max_p=5,max_q=5)
    p,d,q = auto_model.order

    sarimax = SARIMAX(y_train,exog=x_train,order=(p,0,q),seasonal_order=(1,0,1,7))

    fitted_model=sarimax.fit()

    y_pred = fitted_model.forecast(steps=len(y_test),exog=x_test)

    print(f'{category}{r2_score(y_test,y_pred)}')
    saved_models[category]['sarimax'] = {
        'order': (p, d, q),
        'seasonal_order': (1, 0, 1, 7),
        'params': fitted_model.params,
        'exog_columns': list(features_list)
    }

Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=46.84 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=554657.603, Time=0.89 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=534828.508, Time=1.96 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=33.96 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=554655.603, Time=0.41 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=526527.360, Time=2.76 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=521995.217, Time=3.47 sec
 ARIMA(4,1,0)(0,0,0)[0] intercept   : AIC=519047.667, Time=4.40 sec
 ARIMA(5,1,0)(0,0,0)[0] intercept   : AIC=517275.259, Time=5.43 sec
 ARIMA(5,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=82.87 sec
 ARIMA(4,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=62.76 sec
 ARIMA(5,1,0)(0,0,0)[0]             : AIC=517273.259, Time=1.76 sec
 ARIMA(4,1,0)(0,0,0)[0]             : AIC=519045.667, Time=1.61 sec
 ARIMA(5,1,1)(0,0,0)[0]             : AIC=inf, Time=17.17 sec
 ARIMA(4,1,1)(0,0,0)[0]            

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


pts0.5414376988436713
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=44.89 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=422217.487, Time=0.94 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=401842.004, Time=1.14 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=35.02 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=422215.487, Time=0.43 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=393627.980, Time=2.89 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=389081.118, Time=3.48 sec
 ARIMA(4,1,0)(0,0,0)[0] intercept   : AIC=386329.588, Time=4.26 sec
 ARIMA(5,1,0)(0,0,0)[0] intercept   : AIC=384368.420, Time=5.36 sec
 ARIMA(5,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=87.39 sec
 ARIMA(4,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=82.50 sec
 ARIMA(5,1,0)(0,0,0)[0]             : AIC=384366.420, Time=1.72 sec
 ARIMA(4,1,0)(0,0,0)[0]             : AIC=386327.587, Time=1.67 sec
 ARIMA(5,1,1)(0,0,0)[0]             : AIC=inf, Time=18.04 sec
 ARIMA(4,1,1)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


reb-0.4917242367418173
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=46.65 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=384700.960, Time=0.92 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=364636.402, Time=1.12 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=24.33 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=384698.960, Time=0.43 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=356224.290, Time=2.78 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=351679.734, Time=3.40 sec
 ARIMA(4,1,0)(0,0,0)[0] intercept   : AIC=348879.295, Time=4.29 sec
 ARIMA(5,1,0)(0,0,0)[0] intercept   : AIC=346949.348, Time=5.49 sec
 ARIMA(5,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=84.82 sec
 ARIMA(4,1,1)(0,0,0)[0] intercept   : AIC=inf, Time=63.88 sec
 ARIMA(5,1,0)(0,0,0)[0]             : AIC=346947.348, Time=1.91 sec
 ARIMA(4,1,0)(0,0,0)[0]             : AIC=348877.295, Time=1.79 sec
 ARIMA(5,1,1)(0,0,0)[0]             : AIC=inf, Time=15.38 sec
 ARIMA(4,1,1

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


ast0.22619320767635187
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=9.92 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=291231.651, Time=0.36 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=279533.810, Time=0.72 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=283717.265, Time=1.00 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=273768.918, Time=1.05 sec
 ARIMA(3,0,0)(0,0,0)[0]             : AIC=269939.832, Time=1.14 sec
 ARIMA(4,0,0)(0,0,0)[0]             : AIC=267733.126, Time=1.45 sec
 ARIMA(5,0,0)(0,0,0)[0]             : AIC=266121.719, Time=1.94 sec
 ARIMA(5,0,1)(0,0,0)[0]             : AIC=inf, Time=29.43 sec
 ARIMA(4,0,1)(0,0,0)[0]             : AIC=inf, Time=22.27 sec
 ARIMA(5,0,0)(0,0,0)[0] intercept   : AIC=256179.219, Time=6.13 sec
 ARIMA(4,0,0)(0,0,0)[0] intercept   : AIC=256177.311, Time=4.93 sec
 ARIMA(3,0,0)(0,0,0)[0] intercept   : AIC=256175.720, Time=3.23 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=256183.000, Time=2.60 sec
 

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


3pm0.337992633521766


In [129]:
joblib.dump(saved_models,'models.pkl')

['models.pkl']