In [None]:
%load_ext autoreload
%autoreload 2
# Import Libraries
import sqlalchemy
import pandas as pd
import missingno as msno 
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
from datetime import datetime
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
from CustomLibs.CustomFunctions import plot_corr_heatmap, value_to_float, fig_indexes, sqlcol,what_pct_train
from config import Config
from CustomLibs.CustomTransformers import SpikeRemover, DailyMeanImputer, filtered_transformer
from sklearn.pipeline import Pipeline
# from imblearn.pipeline import Pipeline 
# from imblearn          import FunctionSampler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, SplineTransformer, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, KFold,cross_validate
import scipy.stats
import statsmodels.api as sm

#import csv



from CustomLibs.MultiPipe import MultiPipe

# Project pipeline container; its CV attribute is set to an expanding-window
# time-series splitter: 4 folds, 30 samples per test fold, no gap between
# train and test, and no cap on the training window.
pds = MultiPipe()
pds.CV = TimeSeriesSplit(
    n_splits=4,
    test_size=30,
    gap=0,
    max_train_size=None,
)

# The validation period ends at the configured cutoff; the test period
# starts one calendar day later.
date_val_end = Config.TEST_DATE_CUTOFF
date_test_start = pd.to_datetime(date_val_end) + pd.DateOffset(days=1)

# Database engine shared by the cells that read tables.
engine = sqlalchemy.create_engine(Config.CONN_STR)


In [None]:
# --- Load -------------------------------------------------------------------
# Four utility tables are read from the Bronze schema; attendance is read from
# the Silver feature table and trimmed to the date and desk-usage columns.
with engine.connect() as conn:
    df_elec = pd.read_sql_table('ElectrictyUsage', conn, schema='Bronze')
    df_cold_heat = pd.read_sql_table('ColdAndHeat', conn, schema='Bronze')
    df_gas = pd.read_sql_table('GasUsage', conn, schema='Bronze')
    df_water = pd.read_sql_table('WaterUsage', conn, schema='Bronze')
    df_attendance = pd.read_sql_table('All_Raw_Features', conn, schema='Silver')[['Date', 'Actual_Desks_Used']]

# Index every frame by Date so they can be aligned on the index.
df_elec = df_elec.set_index('Date')
df_cold_heat = df_cold_heat.set_index('Date')
df_gas = df_gas.set_index('Date')
df_water = df_water.set_index('Date')
df_attendance = df_attendance.set_index('Date')

# --- Combine ----------------------------------------------------------------
# Outer-join the utilities one after another so a date present in any table
# is kept.
df_consumption = df_elec
for utility_frame in (df_cold_heat, df_gas, df_water):
    df_consumption = df_consumption.merge(utility_frame, how='outer', left_index=True, right_index=True)

# --- Select columns ---------------------------------------------------------
# Identifier columns (names ending in 'ID').
drop_id_cols = [col for col in df_consumption.columns if col.endswith('ID')]

# Columns whose share of non-null values is below 80 %.
coverage = df_consumption.count() / len(df_consumption)
not_enough_data = coverage[coverage < 0.8].index.to_list()

# Further columns removed by name.
other_drop_cols = [
    'Previous_year_(m³)',
    'Cold_Consumption_Previous_Year',
    'Heat_Consumption_Previous_Year',
    'Mean_Temputaure',
    'Energia_Invoice_(kWh)',
    'PRIVA_Electricity_Consumption_(kWh)',
]

df_consumption = df_consumption.drop(columns=not_enough_data + drop_id_cols + other_drop_cols)

# Move Mean_Temp to the last position among the remaining columns.
leading_cols = [col for col in df_consumption.columns if col != 'Mean_Temp']
df_consumption = df_consumption[leading_cols + ['Mean_Temp']]

# --- Derive calendar features, attach attendance, tidy names -----------------
df_consumption = (
    df_consumption
    .assign(
        # pandas dayofweek: Monday=0 ... Sunday=6, so >= 5 is Saturday/Sunday.
        Day_Number=lambda frame: frame.index.dayofweek,
        Week_End=lambda frame: frame['Day_Number'] >= 5,
    )
    # Left join: keep every consumption date, with or without attendance.
    .merge(df_attendance, how='left', left_index=True, right_index=True)
    .rename(columns={
        'Day_Time_8:00_to_23:00_NWQ': 'Day_Electric_KWh',
        'Night_Time_23:00_to_8:00_NWQ': 'Night_Electric_KWh',
        'NWQ_Gas_Consumption_(m³)': 'Gas_Consumption',
        'Water_consumption_(m³)': 'Water_Consumption',
    })
    # Force every column label to be a plain str.
    .rename(str, axis="columns")
)

df_consumption.head(20)

In [None]:
# Cut value handed to SpikeRemover in 'zthresh' mode.
zthresh = 3

# Columns cleaned by each step.
outlier_feature_list = ['Heat_Consumption', 'Cold_Consumption']
dezero_feature_list = ['Day_Electric_KWh', 'Night_Electric_KWh']

# NOTE(review): the step is labelled '5z_despike' although zthresh is 3 —
# either the label or the threshold looks stale; confirm which is intended.
despike_step = (
    '5z_despike',
    SpikeRemover(cutvalue=zthresh, cutmode='zthresh'),
    outlier_feature_list,
)
despike_transformer = ColumnTransformer(
    transformers=[despike_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Same project transformer, this time in 'value' mode with a cut value of 0
# (presumably removing zero readings — confirm against SpikeRemover).
dezero_step = (
    'dezero',
    SpikeRemover(cutvalue=0, cutmode='value'),
    dezero_feature_list,
)
dezero_transformer = ColumnTransformer(
    transformers=[dezero_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Apply despike first, then dezero, keeping DataFrame output.
# ColumnTransformer emits the listed columns first and the passthrough
# remainder after them, so each pass changes the column order.
for cleaner in (despike_transformer, dezero_transformer):
    df_consumption = cleaner.set_output(transform='pandas').fit_transform(df_consumption)

In [None]:
# Draw missingno's matrix plot for df_consumption to inspect where values are missing.
msno.matrix(df_consumption)

In [None]:
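# NOTE: disabled experiment, kept for reference only. It cross-validates a
# regression pipeline on weekend rows and uses the fitted pipeline to predict
# a '<column>_base' value for weekday rows.
# pds= MultiPipe()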

# to_be_interpolated=df_consumption.columns.to_list()[:-2]

# line_reg_pipe=Pipeline(
#     steps=[
#         ('std scale',SimpleImputer(strategy='mean')),
#         ('rob scale',RobustScaler()),
#         ('linear regression',RandomForestRegressor())
#     ]
# ).set_output(transform='pandas')

# CV = KFold(n_splits=5,shuffle=True,random_state=43)

# for y_lab in to_be_interpolated:
#     X= df_consumption.loc[df_consumption['Week_End']].dropna(subset=y_lab).drop(columns=y_lab)
#     y= df_consumption.loc[df_consumption['Week_End']][y_lab].dropna()
    
#     res = cross_validate(line_reg_pipe,X,y,scoring=pds.Scorers,cv=CV)
#     agg_scores={}
#     for metric in ['R^2 Score','Mean Absolute Error','RMS Error']:
#         agg_scores[metric] = (abs(np.mean(res["test_" + metric])),np.std(res["test_" + metric]),X.shape[1],X.shape[0])
    
#     print(f'{y_lab}: '.ljust(21,' ') + f"MAE = {agg_scores['Mean Absolute Error'][0]:.4f}" + u' \u00B1' + f"{agg_scores['Mean Absolute Error'][1]:.4f}", end=' ')
#     print(f"RMS = {agg_scores['RMS Error'][0]:.4f}" + u' \u00B1' + f"{agg_scores['RMS Error'][1]:.4f}", end=' ')
#     print(f"R2 = {agg_scores['R^2 Score'][0]:.4f}" + u' \u00B1' + f"{agg_scores['R^2 Score'][1]:.4f}")

#     line_reg_pipe.fit(X,y)
#     df_consumption.loc[~df_consumption['Week_End'],y_lab+'_base'] = line_reg_pipe.predict(df_consumption.loc[~df_consumption['Week_End']].drop(columns=y_lab))
#     df_consumption.loc[df_consumption['Week_End'],y_lab+'_base'] = df_consumption.loc[df_consumption['Week_End'],y_lab]
    # y_pred = line_reg_pipe.predict(df_consumption.loc[~df_consumption['Week_End']].drop(columns=y_lab))
    # print(y_pred)


In [None]:
# The first seven columns of the frame are the series plotted here and reused
# by later cells (assumes these are the consumption columns — TODO confirm).
to_be_interpolated = df_consumption.columns.to_list()[:7]

# Weekend rows only.
weekend_rows = df_consumption.loc[df_consumption['Week_End']]

# One stacked panel per series, sharing the date axis.
fig, axs = plt.subplots(len(to_be_interpolated), 1, figsize=(8, 20), sharex=True)
for panel, y_lab in zip(axs, to_be_interpolated):
    sns.lineplot(data=weekend_rows, y=y_lab, x=weekend_rows.index, ax=panel)
    panel.set_title(y_lab + ' - Weekend')

In [None]:
# Compare heat consumption on weekend days against weekdays on one axis.
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

is_weekend = df_consumption['Week_End']
day_groups = (
    (df_consumption.loc[is_weekend], 'Weekend'),
    (df_consumption.loc[~is_weekend], 'Weekday'),
)
for day_rows, day_label in day_groups:
    sns.lineplot(data=day_rows, y='Heat_Consumption', x=day_rows.index, ax=ax, label=day_label)

ax.set_title('Weekend versus Weekday usage for Heat Consumption')
ax.grid(visible=True, which='Major', axis='both')
ax.tick_params(axis='x', labelrotation=45, labelsize=10)

# Hide the y tick labels when the config asks for values to be masked.
if Config.MASK_VALUE:
    ax.set_yticklabels([])

fig.tight_layout()
fig.savefig(
    './Output Files/Images/Data Exploration/passive_consumption_correction.png',
    format='png',
    bbox_inches='tight',
)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# Build the "passive" baseline:
#   1. keep weekend readings only (weekday rows become NaN),
#   2. fill the gaps with a centred 7-row rolling mean of what remains,
#   3. fill anything still missing with 0.
is_weekday = ~df_consumption['Week_End']
df_passive = df_consumption[to_be_interpolated].mask(is_weekday)
rolling_mean = df_passive.rolling(window=7, min_periods=1, center=True).mean()
# df_passive.fillna(df_passive.interpolate().mean(),inplace=True)
df_passive = df_passive.fillna(rolling_mean).fillna(0)

# These two series are given a passive baseline of zero.
df_passive[['Kitchen_Usage', 'Water_Consumption']] = 0

# Baseline first, then the measured series, on the same axis.
for frame in (df_passive, df_consumption):
    sns.lineplot(data=frame, y='Heat_Consumption', x=df_consumption.index, ax=ax)

# sns.lineplot(x=df_consumption.index,y=df_consumption['Heat_Consumption']-df_passive['Heat_Consumption'])


In [None]:

# Regress each consumption series, net of its passive baseline, on the number
# of desks used, and annotate every panel with the OLS p-value of the slope.
#
# Changes from the previous version of this cell:
#   * the scipy.stats.linregress call was removed: it was fitted to the points
#     of the line regplot had already drawn, and none of its results were used;
#   * `p` now only ever holds the OLS p-value (it used to be rebound from the
#     Axes to the linregress p-value to the OLS p-value);
#   * the unused 'y_val' and 'y_val_ex_passive_pp' columns are no longer built;
#     the rows removed by dropna are unchanged;
#   * the passive baseline subset is computed once instead of per series.

mms = MinMaxScaler()  # not used in this cell; left defined in case other cells reference it

# One stand-alone figure per series so individual panels can be saved.
panels = [plt.subplots(1, 1, figsize=(6, 3)) for _ in to_be_interpolated]
figs = [panel_fig for panel_fig, _ in panels]
axs = [panel_ax for _, panel_ax in panels]

# Only rows with a desk count above zero are used (NaN counts compare False).
mask = df_consumption['Actual_Desks_Used'] > 0
passive_on_site = df_passive.loc[mask]

for i, consumption in enumerate(to_be_interpolated):
    panel_ax = axs[i]

    # Explicit copy so adding the derived column never touches df_consumption.
    df = df_consumption.loc[mask, [consumption, 'Actual_Desks_Used']].copy()
    df['y_val_ex_passive'] = df[consumption] - passive_on_site[consumption]
    df = df.dropna()

    # Scatter plus fitted line for visual inspection.
    sns.regplot(
        x=df['Actual_Desks_Used'],
        y=df['y_val_ex_passive'],
        ax=panel_ax,
        marker=".",
        color=".3",
        robust=False,
        logx=False,
        line_kws=dict(color="r"),
    )

    # OLS with an intercept; the slope's p-value is reported.
    est = sm.OLS(df['y_val_ex_passive'], sm.add_constant(df['Actual_Desks_Used']))
    est2 = est.fit()
    p = est2.pvalues["Actual_Desks_Used"]
    print(f'{consumption}: {p}')

    p_str = '< 0.001' if p < 0.001 else f'{p:.3f}'
    panel_ax.text(0.05, 0.95, f'p value : {p_str}', transform=panel_ax.transAxes, fontsize=10, va='top', ha='left')

    # Titles and labels are set after regplot, which writes its own axis labels.
    pretty_name = consumption.replace('_', ' ')
    panel_ax.set_title(pretty_name)
    panel_ax.set_ylabel(pretty_name)
    panel_ax.grid(visible=True, which='Major', axis='both')

    # Hide tick labels on both axes when the config asks for values to be masked.
    if Config.MASK_VALUE:
        panel_ax.set_yticklabels([])
        panel_ax.set_xticklabels([])

    panel_ax.set_xlabel('Number of Staff On Site')

# Only the panels at positions 0 and 2 of to_be_interpolated are exported.
for selection in [0, 2]:
    figs[selection].tight_layout()
    figs[selection].savefig('./Output Files/Images/Data Exploration/Consumption_Attendence_single_linear' + str(selection) + '.png', format='png', bbox_inches='tight')