In [None]:
%load_ext autoreload
%autoreload 2

import sqlalchemy
import pandas as pd
import missingno as msno 
import matplotlib.pyplot as plt
import scipy
# %matplotlib widget

import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error,root_mean_squared_error,mean_absolute_error,r2_score
# from sklearn.metrics import mean_squared_error, mean_absolute_error,explained_variance_score,r2_score
import matplotlib.ticker as mtick
from CustomLibs.CustomFunctions import plot_corr_heatmap, value_to_float, fig_indexes, sqlcol
from config import Config
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from CustomLibs.MultiPipe import MultiPipe

from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

# Last date kept in the training/validation window; used below as an inclusive label-slice bound.
date_val_end=Config.TEST_DATE_CUTOFF
# The hold-out test window starts one calendar day after the validation cutoff.
date_test_start=pd.to_datetime(date_val_end) + pd.DateOffset(days=1)


# Shared SQLAlchemy engine built from the configured connection string; later cells open connections from it.
engine = sqlalchemy.create_engine(Config.CONN_STR)

In [None]:
# Read the daily moving-average table, index it by date and keep only rows
# up to (and including) the validation cutoff.
with engine.connect() as conn:
    df_access = pd.read_sql_table(
        table_name='Moving_Averages_By_Day',
        con=conn,
        schema='Silver',
    )
df_access = (
    df_access
    .set_index('Date')
    .loc[:date_val_end]
)

# Regression target used by every QC run below.
y = df_access['Pct_On_Site']

# Fresh MultiPipe container; pre-processors and QC sets are registered on it in later cells.
pds = MultiPipe()



In [None]:
# Columns whose names start with 'PctOnSite_seqlag', in table column order.
# NOTE(review): the cumulative slices below assume that order is ascending lag — confirm.
seqlags = [col for col in df_access.columns if col.startswith('PctOnSite_seqlag')]
X = df_access[seqlags]

# Upper bound on how many cumulative lag sets are scored (previously a bare 8 inside the range()).
max_seq_lags = 8

pds.AddPreProc(RobustScaler(), 'pp')
# Purge before re-adding so re-running this cell starts the QC set from scratch.
pds.PurgeQCSet('Sequential Lag QC')
pds.AddQCSet('pp', 'Sequential Lag QC')

# Score models built on the first 1, 2, ... lags; each run is recorded as 'Lag <n>'.
for lookback in range(1, min(len(seqlags), max_seq_lags) + 1):
    _ = pds.CalculateScores(
        'Sequential Lag QC',
        'pp',
        'Lag ' + str(lookback),
        X[seqlags[:lookback]],
        y,
    )

# mavgs=[x for x in df_access.columns if x.startswith('PctOnSite_ma')]
# X=df_access[mavgs]
# pds.AddQCSet('pp','Moving Average QC')
# for lookback in range(1,len(mavgs)+1):
#     label = 'movavg_' + str(lookback)
#     _ = pds.CalculateScores('Moving Average QC','pp',mavgs[lookback-1],X[mavgs[:lookback]],y)

# daylags=[x for x in df_access.columns if x.startswith('PctOnSite_SameDay')]
# X=df_access[daylags]
# pds.AddQCSet('pp','Same Day QC')
# for lookback in range(1,len(daylags)+1):
#     label = 'daylag_' + str(lookback)
#     _ = pds.CalculateScores('Same Day QC','pp',daylags[lookback-1],X[daylags[:lookback]],y)


# difflags=[x for x in df_access.columns if x.startswith('PctOnSite_diff')]
# X=df_access[seqlags[:4] +daylags[:4] +mavgs[:4] + difflags]
# pds.PurgeQCSet('Diff QC')
# pds.AddQCSet('pp','Diff QC')
# # _ = pds.CalculateScores('Diff QC','pp','3seq',X[seqlags[:3] ],y)
# _ = pds.CalculateScores('Diff QC','pp','3seq_3week',X[seqlags[:3] +daylags[:3] ],y)
# _ = pds.CalculateScores('Diff QC','pp','3seq_3week_1ma',X[seqlags[:3] +daylags[:3] +[mavgs[0]]],y)
# for lookback in range(1,len(difflags)+1):
#     label = 'diff_' + str(lookback)
#     _ = pds.CalculateScores('Diff QC','pp',difflags[lookback-1],X[seqlags[:4] +daylags[:4] +[mavgs[0]] +difflags[:lookback]],y)

In [None]:
# Select the metrics to chart; pds.active_metrics is read afterwards to size each figure.
_ = pds.GetScores(metric_keys=['R^2 Score','RMS Error','Mean Absolute Error'],verbose=False)


def _plot_qc_set(qc_name, image_path, title=None):
    """Draw one panel per active metric for `qc_name`, save the figure to `image_path`, return (fig, axs)."""
    n_panels = len(pds.active_metrics)
    fig, axs = plt.subplots(1, n_panels, figsize=(0.5 + 5 * n_panels, 4))
    pds.GraphScores(qc_name, axs)
    if title is not None:
        fig.suptitle(title, fontsize=12, fontweight='bold')
    fig.tight_layout()
    fig.savefig(image_path, format='png', bbox_inches='tight')
    return fig, axs


fig1, axs1 = _plot_qc_set(
    'Sequential Lag QC',
    './Output Files/Images/Feature Engineering/Seq_Lag_Metrics.png',
    title='Impact of Adding Auto-Regressive Lags on Regression Model Accuracy',
)
# Optional manual y-limits (kept disabled):
# axs1[0].set_ylim(0.015,0.03)
# axs1[1].set_ylim(0.02,0.04)
# axs1[2].set_ylim(0.9,1.0)

# NOTE(review): in this notebook only 'Sequential Lag QC' is registered by live code; the three
# sets below are created only in commented-out blocks — confirm GraphScores copes with that
# on a fresh kernel, or re-enable those blocks.
fig2, axs2 = _plot_qc_set(
    'Moving Average QC',
    './Output Files/Images/Feature Engineering/MovingAverage_Metrics.png',
)

fig3, axs3 = _plot_qc_set(
    'Same Day QC',
    './Output Files/Images/Feature Engineering/SameDay_Metrics.png',
)

fig4, axs4 = _plot_qc_set(
    'Diff QC',
    './Output Files/Images/Feature Engineering/Diff_Metrics.png',
)

In [None]:
# Both correlograms are drawn on the first-differenced target (the undifferenced
# variants are kept below, disabled).
y_diff = y.diff().dropna()

fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# _ = plot_acf(y,ax=axs[0][0])
# _ = plot_pacf(y,ax=axs[0][1])

# Left panel: ACF, right panel: PACF.
panel_specs = ((plot_acf, 'ACF Value'), (plot_pacf, 'PACF Value'))
for ax, (draw, y_label) in zip(axs, panel_specs):
    _ = draw(y_diff, ax=ax)
    ax.set_xlabel('Lags')
    _ = ax.set_ylabel(y_label)

fig.suptitle('Partial and Full Autocorrelation Functions', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig('./Output Files/Images/Data Exploration/acf_pacf.png', format='png', bbox_inches='tight')

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw target, then on its first difference.
# For each series the first two entries of the adfuller result are printed, one per line.
for series in (y, y.diff().dropna()):
    result = adfuller(series)
    for value in result[:2]:
        print(value)

In [None]:
# https://www.geeksforgeeks.org/complete-guide-to-sarimax-in-python/



from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
 
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from statsmodels.tsa.seasonal import seasonal_decompose

# Multiplicative decomposition of the target with a period of 5 observations
# (presumably one working week — confirm against the data calendar).
result = seasonal_decompose(y, model='multiplicative', period=5)
trend, seasonal, residual = (
    component.dropna()
    for component in (result.trend, result.seasonal, result.resid)
)

# Plot the decomposed components, one stacked panel each
plt.figure(figsize=(12,6))

panels = (
    (y, 'Original Series'),
    (trend, 'Trend'),
    (seasonal, 'Seasonal'),
    (residual, 'Residuals'),
)
for row, (panel_series, panel_label) in enumerate(panels, start=1):
    plt.subplot(4, 1, row)
    plt.plot(panel_series, label=panel_label)
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Load the preprocessed feature table; 'Date' arrives as an ordinary column and
# is promoted to the index by the cells that consume this frame.
with engine.connect() as conn:
    df_preproc = pd.read_sql_table(
        table_name='Preprocessed_Features',
        con=conn,
        schema='Silver',
    )


In [None]:
# Engineered target-history columns (prefix 'PctOnSite_') are excluded from the model frame.
drop_col = [col for col in df_preproc.columns if col.startswith('PctOnSite_')]

# Index by date BEFORE slicing: the date cutoff is a label on the 'Date' index, not on the
# default integer index that read_sql_table returns. Sorting makes the label slice well-defined
# and mirrors how sarimax_forecast builds the test window.
df = (
    df_preproc
    .set_index('Date')
    .sort_index()
    .loc[:date_val_end]
    .drop(columns=drop_col)
)

# Exogenous regressors come from the same frame as the target, so they are row-aligned with it
# and have the same columns that sarimax_forecast passes at prediction time.
train_exog = df.drop(columns=['Pct_On_Site'])

# `X` is the exogenous-array keyword in current pmdarima (it replaced `exogenous`).
SARIMAX_model = pm.auto_arima(df['Pct_On_Site'], X=train_exog,
                           start_p=1, start_q=1,
                           test='adf',
                           max_p=3, max_q=3, m=5,
                           start_P=0, seasonal=True,
                           d=None, D=1,
                           trace=False,
                           error_action='ignore',
                           suppress_warnings=True,
                           stepwise=True)

In [None]:
   
def sarimax_forecast(SARIMAX_model, periods=24):
    """Forecast the test window with a fitted model and plot it after the training series.

    Parameters
    ----------
    SARIMAX_model
        Fitted model exposing ``predict(n_periods=..., return_conf_int=True, X=...)``
        (the object returned by ``pm.auto_arima`` in this notebook).
    periods : int, default 24
        Maximum number of rows of the test window to forecast. If fewer rows exist
        on or after ``date_test_start``, only those rows are forecast.

    Raises
    ------
    ValueError
        If there are no rows to forecast.

    Notes
    -----
    Reads the notebook globals ``df_preproc``, ``drop_col``, ``date_test_start`` and ``df``.
    Displays a matplotlib figure and prints the last training date; returns nothing.
    """
    # Keep 'Date' as the index: it must not leak into the exogenous matrix as a column,
    # and the forecast has to share the date x-axis of the training series in the plot.
    forecast_df = (
        df_preproc.set_index('Date')
        .sort_index()
        .loc[date_test_start:]
        .drop(columns=drop_col)
        .head(periods)
    )

    # The number of forecast steps must equal the number of exogenous rows supplied.
    n_periods = len(forecast_df)
    if n_periods == 0:
        raise ValueError("No rows available to forecast on or after date_test_start")

    fitted, confint = SARIMAX_model.predict(n_periods=n_periods,
                                            return_conf_int=True,
                                            X=forecast_df.drop(columns='Pct_On_Site'))
    index_of_fc = forecast_df.index
    # Last date of the training frame, shown for orientation.
    print(max(df.index))

    # Make series for plotting. np.asarray discards any index the model attached to its
    # output so the values are aligned positionally with the forecast dates.
    confint = np.asarray(confint)
    fitted_series = pd.Series(np.asarray(fitted), index=index_of_fc)
    lower_series = pd.Series(confint[:, 0], index=index_of_fc)
    upper_series = pd.Series(confint[:, 1], index=index_of_fc)

    # Plot: training history, point forecast, and shaded confidence band.
    plt.figure(figsize=(15, 7))
    plt.plot(df["Pct_On_Site"], color='#1f76b4')
    plt.plot(fitted_series, color='darkgreen')
    plt.fill_between(lower_series.index,
                     lower_series,
                     upper_series,
                     color='k', alpha=.15)

    plt.title("SARIMAX - Forecast of Staff On Site")
    plt.show()

In [None]:
   
# Forecast and plot the hold-out window with the fitted model, requesting 80 periods.
sarimax_forecast(SARIMAX_model, periods=80)

In [None]:
%load_ext autoreload
%autoreload 2

from CustomLibs.CustomTransformers import SpikeRemover, DailyMeanImputer, filtered_transformer
from sklearn.pipeline import Pipeline
# from imblearn.pipeline import Pipeline 
# from imblearn          import FunctionSampler
from sklearn.compose import ColumnTransformer

def _build_spike_filter(step_name, cutvalue, cutmode, columns):
    """Return a ColumnTransformer that applies SpikeRemover to `columns` and passes the rest through."""
    return ColumnTransformer(
        transformers=[
            (step_name, SpikeRemover(cutvalue=cutvalue, cutmode=cutmode), columns)
        ],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )


# Load the raw feature table, index it by date and force string column names.
with engine.connect() as conn:
    df_raw = pd.read_sql_table('All_Raw_Features', conn, schema='Bronze')
df_raw = df_raw.set_index('Date')
df_raw.columns = [str(name) for name in df_raw.columns]

# Columns suffixed '_ori' are removed before cleaning.
drop_list = [name for name in df_raw.columns if name.endswith('_ori')]
df_raw = df_raw.drop(columns=drop_list)

# Threshold handed to SpikeRemover in 'zthresh' mode.
zthresh = 5

outlier_feature_list = [
    'Heat_Consumption',
    'Cold_Consumption',
    'Webex_Connections',
    'Webex_Total_Participants',
    'FTE_Count',
]
dezero_feature_list = [
    'VPN_cnxn',
    'Webex_Connections',
    'Webex_Total_Participants',
    'Webex_Maximum_Concurrent_Meetings',
    'Day_Electric_KWh',
    'Night_Electric_KWh',
]

# outlier_feature_list_ext = [x + '_onedayago' for x in outlier_feature_list] + [x + '_oneweekago' for x in outlier_feature_list] 
# dezero_feature_list_ext = [x + '_onedayago' for x in dezero_feature_list] + [x + '_oneweekago' for x in dezero_feature_list] 

despike_transformer = _build_spike_filter('5z_despike', zthresh, 'zthresh', outlier_feature_list)
dezero_transformer = _build_spike_filter('dezero', 0, 'value', dezero_feature_list)

# Two passes, each returning a DataFrame: the 'zthresh' filter first, then the 'value' filter on its output.
df_all_dspike = despike_transformer.set_output(transform='pandas').fit_transform(df_raw)
df_all_dzero = dezero_transformer.set_output(transform='pandas').fit_transform(df_all_dspike)

# Annotated correlation heatmap of the cleaned features.
args = dict(annot=True)
fig1, ax1 = plt.subplots(figsize=(25, 25))
plot_corr_heatmap(df=df_all_dzero, ax=ax1, **args)
ax1.tick_params(axis='both', which='major', labelsize=10)