In [None]:
%load_ext autoreload
%autoreload 2

## Common Library Definitions

import sqlalchemy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick

# Custom Library Definitions
from CustomLibs.CustomFunctions import plot_corr_heatmap, plot_permutation_importance, sqlcol,save_to_file,load_from_file
from config import Config
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler,SplineTransformer,OneHotEncoder
from datetime import datetime
from CustomLibs.CustomTransformers import filtered_transformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor
from CustomLibs.MultiPipe import MultiPipe
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,root_mean_squared_error,r2_score
from pprint import pprint
import xgboost as xgb

import scipy.stats

## SQL Store Definition
# Shared SQLAlchemy engine; the cells below open short-lived connections from it.
engine = sqlalchemy.create_engine(Config.CONN_STR)

# date_val_end is the last date of the fitting window (label slices of the form
# .loc[:date_val_end] include it); the held-out test window starts the next calendar day.
date_val_end=Config.TEST_DATE_CUTOFF
date_test_start=pd.to_datetime(date_val_end) + pd.DateOffset(days=1)




In [None]:
# Read the preprocessed feature table and the permutation-importance feature ranking
# from the Gold schema.
with engine.connect() as conn:
    df_preproc = pd.read_sql_table('Preprocessed_Features', conn, schema='Gold')
    df_feature_ranks = pd.read_sql_table('PermutationFeatureRanks', conn, schema='Gold')

# Force every column label to a plain string, then index the frame by date.
df_preproc = df_preproc.rename(columns=str).set_index('Date')

# The last remaining column is treated as the prediction target.
label_field = df_preproc.columns[-1]


# colorder = df_preproc.columns.tolist()
# for i,val in enumerate(colorder):
#     if val=='Month':
#         break
# neworder=colorder[:i] + ['Quarter'] + colorder[i+1:]
# df_preproc['Quarter']=(df_preproc.index.month//4)+1
# df_preproc=df_preproc[neworder]
# print(df_preproc.columns.tolist())
# df_preproc['Month']=(df_preproc.index.month//4)+1

# Chronological split: rows up to and including the cutoff are used for fitting,
# rows from the day after the cutoff onwards are held out for testing.
df_sorted = df_preproc.sort_index()
df_fit = df_sorted.loc[:date_val_end]
df_holdout = df_sorted.loc[date_test_start:]

X, y = df_fit.drop(columns=label_field), df_fit[label_field]
X_test, y_test = df_holdout.drop(columns=label_field), df_holdout[label_field]

# Feature names ordered by ascending 'Mean Rank'.
ranked_features = df_feature_ranks.sort_values('Mean Rank')
feat_ranks = ranked_features['Feature'].to_list()

# Load stored hyper-parameter sets. Each object is later unpacked into
# estimator.set_params(**...), so each is expected to be a dict of parameter name -> value.
# Presumably the _t20 / _t60 suffixes mark sets tuned on the top-20 / top-60 ranked
# features - TODO confirm.
rf_random_search_parameters = load_from_file('rf_random_search_parameters')
rf_grid_search_parameters = load_from_file('rf_grid_search_parameters')
rf_random_search_parameters_t60 = load_from_file('rf_random_search_parameters_t60')
rf_grid_search_parameters_t60 = load_from_file('rf_grid_search_parameters_t60')
rf_random_search_parameters_t20 = load_from_file('rf_random_search_parameters_t20')
rf_grid_search_parameters_t20 = load_from_file('rf_grid_search_parameters_t20')
xgb_gs1_parameters = load_from_file('xgb_gs1_parameters')
xgb_gs2_parameters = load_from_file('xgb_gs2_parameters')
xgb_gs3_parameters = load_from_file('xgb_gs3_parameters')
xgb_gs4_parameters = load_from_file('xgb_gs4_parameters')
xgb_gs5_parameters = load_from_file('xgb_gs5_parameters')
svr_gs1_parameters = load_from_file('svr_gs1_parameters')
# NOTE(review): the variable is named gs1 but the file loaded is 'svr_gs2_parameters_t20' -
# confirm which grid-search result is intended here.
svr_gs1_parameters_t20 = load_from_file('svr_gs2_parameters_t20')
ridge_gs1_parameters = load_from_file('ridge_gs1_parameters')
ridge_gs1_parameters_t20 = load_from_file('ridge_gs1_parameters_t20')

# print([type(x) for x in X.columns])

# Registry of named regressors held on a MultiPipe instance. All seeded estimators
# use random_state=43.
pds=MultiPipe()
pds.Regressors = {}
pds.Regressors['Random Forest Baseline']=RandomForestRegressor(random_state=43)
pds.Regressors['Random Forest RandomSearch']=RandomForestRegressor(random_state=43).set_params(**rf_random_search_parameters)
pds.Regressors['Random Forest GridSearch']=RandomForestRegressor(random_state=43).set_params(**rf_grid_search_parameters)
# pds.Regressors['Random Forest GridSearch 20 MF0.8']=RandomForestRegressor(random_state=43)
# The "... 20" entries registered here carry no tuned parameters: their pipelines in
# pds.QC_Set['Test Data']['pp'] are overwritten further down in this cell with versions
# built on the top-20 feature transformer.
pds.Regressors['Random Forest GridSearch 20']=RandomForestRegressor(random_state=43)
pds.Regressors['Random Forest Baseline 20']=RandomForestRegressor(random_state=43)
pds.Regressors['XGBoost Baseline'] = xgb.XGBRegressor()
# Parameter sets are applied in the order gs5, gs2, gs3, gs4, so for any key present in
# several sets the later call wins; xgb_gs1_parameters is not applied.
# NOTE(review): confirm that gs5 being applied first (lowest precedence) is intended.
pds.Regressors['XGBoost GS5']=xgb.XGBRegressor().set_params(**xgb_gs5_parameters).set_params(**xgb_gs2_parameters).set_params(**xgb_gs3_parameters).set_params(**xgb_gs4_parameters)
pds.Regressors['Linear Regression']=LinearRegression()
pds.Regressors['Linear Ridge']=Ridge().set_params(**ridge_gs1_parameters)
pds.Regressors['Linear Ridge 20']=Ridge()
# pds.Regressors['Linear Regression 30']=LinearRegression()
# max_iter is raised to 20000 for every LinearSVR in this registry.
pds.Regressors['SVR Baseline'] = LinearSVR(random_state=43,max_iter=20000)
pds.Regressors['SVR GridSearch']=LinearSVR(random_state=43,max_iter=20000).set_params(**svr_gs1_parameters)
pds.Regressors['SVR GridSearch_manual']=LinearSVR(random_state=43,max_iter=20000).set_params(**{'C': 0.1,'epsilon': 0,'loss': 'squared_epsilon_insensitive'})
pds.Regressors['SVR GridSearch 20']=LinearSVR(random_state=43,max_iter=20000)

# Transformers built from the ranked feature list: `tra` receives every ranked feature,
# `traN` only the first N entries of feat_ranks.
tra=filtered_transformer(feat_ranks)
tra20=filtered_transformer(feat_ranks[:20])
tra30=filtered_transformer(feat_ranks[:30])
tra40=filtered_transformer(feat_ranks[:40])
tra60=filtered_transformer(feat_ranks[:60])

# Register the full-feature transformer as pre-processor 'pp' and rebuild the
# 'Test Data' QC set from it.
# NOTE(review): presumably AddQCSet pairs 'pp' with every entry of pds.Regressors -
# confirm against MultiPipe.
pds.AddPreProc(tra,'pp')
pds.PurgeQCSet('Test Data')
pds.AddQCSet('pp','Test Data')

# Overwrite the "... 20" entries with pipelines that use the top-20 transformer and the
# parameter sets loaded for 20 features. The same tra20 instance is shared by all four
# pipelines, so it is re-fitted whenever any one of them is fitted.
# pds.QC_Set['Test Data']['pp']['Linear Regression 30']=make_pipeline(tra30,LinearRegression())
pds.QC_Set['Test Data']['pp']['Random Forest GridSearch 20']=make_pipeline(tra20,RandomForestRegressor(random_state=43).set_params(**rf_grid_search_parameters_t20))
pds.QC_Set['Test Data']['pp']['Random Forest Baseline 20']=make_pipeline(tra20,RandomForestRegressor(random_state=43))
# pds.QC_Set['Test Data']['pp']['Random Forest GridSearch 20 MF0.8']=make_pipeline(tra20,RandomForestRegressor(random_state=43).set_params(**rf_grid_search_parameters_t20).set_params(**{'max_features':0.6}))
# pds.QC_Set['Test Data']['pp']['Random Forest GridSearch 60']=make_pipeline(tra60,RandomForestRegressor(random_state=43).set_params(**rf_grid_search_parameters_t60))
# max_iter=20000 matches every other LinearSVR in this notebook; with the library default
# this pipeline was the only SVR allowed to stop early. A max_iter stored in the loaded
# parameter set still takes precedence because set_params is applied afterwards.
pds.QC_Set['Test Data']['pp']['SVR GridSearch 20']=make_pipeline(tra20,LinearSVR(random_state=43,max_iter=20000).set_params(**svr_gs1_parameters_t20))
pds.QC_Set['Test Data']['pp']['Linear Ridge 20']=make_pipeline(tra20,Ridge().set_params(**ridge_gs1_parameters_t20))

# print(feat_ranks)



In [None]:
# Show the repr of the transformer built from the full ranked feature list as the cell output.
tra

In [None]:
# Print the random-forest parameter set loaded for the 20-feature model,
# one "name: value" pair per line.
rf_parameters = [rf_grid_search_parameters_t20]
rf_param_lines = (
    f'{k}: {v}'
    for param_set in rf_parameters
    for k, v in param_set.items()
)
for line in rf_param_lines:
    print(line)

In [None]:
# Print the XGBoost parameter sets in the order they are applied to the 'XGBoost GS5'
# regressor, one "name: value" pair per line.
xgboost_parameters = [
    xgb_gs5_parameters,
    xgb_gs2_parameters,
    xgb_gs3_parameters,
    xgb_gs4_parameters,
]
xgb_param_lines = (
    f'{k}: {v}'
    for param_set in xgboost_parameters
    for k, v in param_set.items()
)
for line in xgb_param_lines:
    print(line)

In [None]:


# X_test=X_test[10:]
# y_test=y_test[10:]

# Initialise the results frame with the actual values and the 3-day moving-average baseline.
df_results = y_test.to_frame('Actual')
df_results['3 Day Mean'] = X_test['PctOnSite_ma3']

days = ['Monday','Tuesday','Wednesday','Thursday','Friday']

# Pipelines built on the top-20 transformer are fitted and scored on that column subset;
# every other pipeline receives the full feature matrix.
top20_keys = {
    'Random Forest GridSearch 20',
    'Random Forest GridSearch 20 MF0.8',
    'Random Forest Baseline 20',
    'Linear Ridge 20',
    'SVR GridSearch 20',
}
top20_features = feat_ranks[:20]

for key, pipe in pds.QC_Set['Test Data']['pp'].items():
    print(f'{key}')
    if key in top20_keys:
        X_fit, X_eval = X[top20_features], X_test[top20_features]
    else:
        X_fit, X_eval = X, X_test

    # Fit on the training window and predict the held-out window.
    pipe.fit(X_fit, y)
    y_pred = pd.Series(pipe.predict(X_eval), y_test.index)
    df_results[key] = y_pred

    mae = mean_absolute_error(y_test, y_pred)
    rms = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # ".4f" (precision), not "4f" (minimum width), so R2 is rounded like MAE and RMS.
    print(f'    Overall'.ljust(35,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')

    # Per-weekday breakdown. Day_Of_Week values 1..5 are matched to Monday..Friday;
    # NOTE(review): that encoding is assumed from the `days` list - confirm upstream.
    for day, day_name in enumerate(days, start=1):
        on_day = X_test['Day_Of_Week'] == day
        mae = mean_absolute_error(y_test.loc[on_day], y_pred.loc[on_day])
        rms = root_mean_squared_error(y_test.loc[on_day], y_pred.loc[on_day])
        r2 = r2_score(y_test.loc[on_day], y_pred.loc[on_day])
        print(f'    {day_name}'.ljust(35,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')

# Score the moving-average baseline on the same test window for comparison.
baseline = X_test['PctOnSite_ma3']
print(f'3 Day Mean'.ljust(28,' ') + f"MAE: {mean_absolute_error(y_test,baseline):.4f}, RMS: {root_mean_squared_error(y_test,baseline):.4f}, R2: {r2_score(y_test,baseline):.4f}")






## Aggregate Forecast

In [None]:
# Reload the preprocessed features together with the per-directorate moving averages.
with engine.connect() as conn:
    df_access_by_directorate = pd.read_sql_table('Moving_Averages_By_Directorate', conn, schema='Gold')
    df_preproc = pd.read_sql_table('Preprocessed_Features', conn, schema='Gold')

# The target is whatever column comes last in the preprocessed table as loaded.
label_field = df_preproc.columns[-1]

# 'PctOnSite_*' columns of the overall table are dropped so that the per-directorate columns
# of the same name take their place after the merge; 'PctOnSite_*' columns that exist
# only in the directorate table are not brought across.
pct_prefix = 'PctOnSite_'
feat_to_be_replaced = [col for col in df_preproc.columns if col.startswith(pct_prefix)]
feat_not_imported = [
    col for col in df_access_by_directorate.columns
    if col.startswith(pct_prefix) and col not in feat_to_be_replaced
]

# Suffix the overall attendance / booking columns so they cannot clash with the
# per-directorate columns brought in by the merge.
df_preproc = (
    df_preproc
    .drop(columns=feat_to_be_replaced)
    .rename(columns={'Pct_On_Site':'Pct_On_Site_Overall','Desks_Booked':'Desks_Booked_Overall'})
)
df_access_by_directorate = df_access_by_directorate.drop(
    columns=feat_not_imported + ['Day_Name','Desks_Used']
)

# One row per (Date, Directorate); the overall attendance column is removed from the result.
df_preproc_by_directorate = (
    df_preproc
    .merge(right=df_access_by_directorate, how='left', on='Date')
    .drop(columns='Pct_On_Site_Overall')
    .rename(str, axis="columns")
)

directorates = df_preproc_by_directorate['Directorate'].unique()
df_preproc_by_directorate = df_preproc_by_directorate.set_index('Date').sort_index()

df_grouped_by_directorate = df_preproc_by_directorate.groupby('Directorate')




In [None]:


# Pipelines listed here are fitted on the first N ranked features only; every other
# pipeline receives all columns except the label and the directorate name.
# NOTE(review): only 'Random Forest GridSearch 20' currently exists in the QC set, and the
# other "... 20" pipelines receive the full column set here, unlike in the overall
# evaluation cell - confirm that is intended.
n_top_features_by_key = {
    'Random Forest GridSearch 20': 20,
    'Linear Regression 30': 30,
    'SVR GridSearch 30': 30,
}

# Overall (all-directorate) rows for the test window: ground truth for the aggregated
# forecast and source of the desk-booking baseline. It does not change per model.
overall_test = df_preproc.set_index('Date').sort_index().loc[date_test_start:]

# NOTE: this loop rebinds the notebook-level names X, y, X_test and y_test.
for key, pipe in pds.QC_Set['Test Data']['pp'].items():
    print(f'{key}')
    y_pred_by_dir = []
    total_staff_by_dir = []
    n_top = n_top_features_by_key.get(key)

    for directorate, df_g in df_grouped_by_directorate:
        if n_top is None:
            X = df_g.loc[:date_val_end].drop(columns=[label_field,'Directorate'])
            X_test = df_g.loc[date_test_start:].drop(columns=[label_field,'Directorate'])
        else:
            X = df_g.loc[:date_val_end, feat_ranks[:n_top]]
            X_test = df_g.loc[date_test_start:, feat_ranks[:n_top]]

        y = df_g[label_field].loc[:date_val_end]
        y_test = df_g[label_field].loc[date_test_start:]

        # Re-fit the pipeline on this directorate's history and predict its test window.
        X.columns = X.columns.astype(str)
        pipe.fit(X, y)
        y_pred = pd.Series(pipe.predict(X_test), y_test.index)

        # Scale the predicted proportion by 'Directorate_Numbers' so that directorates
        # can be summed and divided by the total afterwards.
        staff = df_g.loc[date_test_start:, 'Directorate_Numbers']
        y_pred_by_dir.append(y_pred * staff)
        total_staff_by_dir.append(staff)

    y_test = overall_test['Pct_On_Site_Overall']
    # Positional sum across directorates: this relies on every directorate having the
    # same test dates in the same order.
    y_pred_sum_of_dir = pd.Series(np.sum(y_pred_by_dir,axis=0),index=y_pred_by_dir[0].index)
    total_staff = pd.Series(np.sum(total_staff_by_dir,axis=0),index=total_staff_by_dir[0].index)
    y_pred_pct = y_pred_sum_of_dir/total_staff
    df_results[key + ' ByDir'] = y_pred_pct.to_list()

    mae = mean_absolute_error(y_test, y_pred_pct)
    rms = root_mean_squared_error(y_test, y_pred_pct)
    r2 = r2_score(y_test, y_pred_pct)

    # ".4f" (precision), not "4f" (minimum width), so R2 is rounded like MAE and RMS.
    print(f'        Overall'.ljust(40,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')

# Desk bookings divided by the summed 'Directorate_Numbers', scored against actual attendance.
# Assigned directly: no placeholder column is needed first, and the np.NaN alias
# used for that placeholder no longer exists in NumPy 2.0.
df_results['Booked_Pct'] = overall_test['Desks_Booked_Overall']/total_staff

mae = mean_absolute_error(y_test, df_results['Booked_Pct'])
rms = root_mean_squared_error(y_test, df_results['Booked_Pct'])
r2 = r2_score(y_test, df_results['Booked_Pct'])

print(f'    Booked Accuracy'.ljust(40,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')
    

In [None]:
# Score the 'Booked_Pct' column against the overall actuals currently held in y_test.
booked_pct = df_results['Booked_Pct']
mae = mean_absolute_error(y_test, booked_pct)
rms = root_mean_squared_error(y_test, booked_pct)
r2 = r2_score(y_test, booked_pct)

# ".4f" (precision), not "4f" (minimum width), so R2 is rounded like MAE and RMS.
print(f'    Booked Accuracy'.ljust(40,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')
    

In [None]:
# Columns that are bookkeeping rather than model or baseline predictions. Selecting the
# prediction columns by name (instead of "everything after the first column") keeps
# this cell safe to re-run after the date columns below have been added.
bookkeeping_cols = {'Date','Actual','Week_Number','Week_Start','Day_Name','dummy'}
regressor_labels = [col for col in df_results.columns if col not in bookkeeping_cols]

# Move the Date index into a column once; on a re-run it is already a column and a
# second reset would only add a stray 'index' column.
if 'Date' not in df_results.columns:
    df_results.reset_index(inplace=True)

# Calendar helpers used by the per-day and per-week breakdowns.
df_results['Week_Number']=df_results['Date'].dt.isocalendar().week
df_results['Week_Start'] = df_results['Date'].dt.to_period('W').apply(lambda r: r.start_time)
df_results['Day_Name']=df_results['Date'].dt.day_name()

# Persist the predictions. The helper 'dummy' grouping column added by the metrics cell
# is left out so that a re-run writes the same table as the first run.
results_out = df_results.drop(columns='dummy', errors='ignore')
with engine.connect() as conn:
    results_out.to_sql('TestSet_Predictions',conn,schema='Gold',if_exists='replace',dtype=sqlcol(results_out),index=False)


In [None]:
def get_scores(df,args):
    """Score one or more prediction columns of ``df`` against a true-value column.

    Parameters
    ----------
    df : DataFrame holding the true values and the prediction columns.
    args : sequence whose first item is the name of the true-value column and whose
        second item is an iterable of prediction column names.

    Returns
    -------
    DataFrame with one row per prediction column and the columns 'MAE', 'RMS' and 'R^2'.
    """
    true_label, pred_labels = args[0], args[1]
    scorers = {
        'MAE': mean_absolute_error,
        'RMS': root_mean_squared_error,
        'R^2': r2_score,
    }

    # Outer keys become the columns of the returned frame, inner keys its index.
    result = {metric: {} for metric in scorers}
    for pred_label in pred_labels:
        for metric, scorer in scorers.items():
            result[metric][pred_label] = scorer(df[true_label], df[pred_label])

    return pd.DataFrame.from_dict(result)



score_args = ('Actual', regressor_labels)

# Metrics per weekday for every prediction column.
df_metrics_byday = df_results.groupby('Day_Name').apply(get_scores,score_args,include_groups=False)

# Metrics over the whole test set. A constant 'dummy' key puts every row into a single
# group so the same groupby/apply path can be reused. The column stays on df_results
# because the exclusion cell further down groups on it as well.
df_results['dummy'] = 1
df_metrics_all = (
    df_results.groupby('dummy')
    .apply(get_scores,score_args,include_groups=False)
    .reset_index()
    .drop(columns='dummy')
    .rename(columns={'level_1':'Model'})
    .set_index('Model')
)
display(df_metrics_all.sort_values('RMS').head(40))

# Columns carried into the weekly breakdown: ranks 1, 2 and 4 by overall RMS.
# NOTE(review): rank 3 is deliberately skipped by this selection, but the reason is not
# recorded here; it also requires at least four scored columns - TODO document.
ts_list = df_metrics_all.sort_values('RMS').index[:4].to_list()
ts_list = ts_list[:2]+[ts_list[3]]

df_metrics_byweek = df_results[ts_list+['3 Day Mean','Week_Start','Actual']].groupby('Week_Start').apply(get_scores,('Actual',ts_list+['3 Day Mean']),include_groups=False)

# Persist the overall metrics. The SQL column types are derived from the metrics frame
# itself (the same pattern used for 'TestSet_Metrics_excl'), not from df_results, whose
# columns do not appear in this table.
with engine.connect() as conn:
    df_metrics_all.reset_index().to_sql('TestSet_Metrics',conn,schema='Gold',if_exists='replace',index=False,dtype=sqlcol(df_metrics_all))



In [None]:
# Columns to plot: ranks 1, 2 and 4 by overall RMS plus the moving-average baseline.
# NOTE(review): rank 3 is skipped by this selection; the reason is not recorded here.
regressor_sublist = df_metrics_all.sort_values('RMS').index[:4].to_list()
regressor_sublist = regressor_sublist[:2]+[regressor_sublist[3]]+['3 Day Mean']

# Artists with these labels keep the dash pattern seaborn assigned; all others are drawn solid.
# NOTE(review): '_child6' / '_child8' are auto-generated matplotlib labels, presumably the
# baseline and actual lines - they depend on the number and order of plotted columns.
keep_dash_labels = ['_child8','_child6','3 Day Mean','Actual']

fig,axs = plt.subplots(2,1,figsize=(10,6),sharex=True)

for ax, day in zip(axs, ['Wednesday','Thursday']):
    df = df_results.loc[df_results['Day_Name']==day].set_index('Date')
    sns.lineplot(data=df[regressor_sublist+['Actual']], ax=ax, linestyle='-')
    ax.set_title(f'Predicted Versus Actual: {day}',fontsize=11)
    # Place the legend outside the plotting area, to the right.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(visible=True,which='major',axis='both')
    # Tick labels on the y axis are hidden; only the axis title is shown.
    ax.set_yticklabels([])
    ax.tick_params(axis='x', labelrotation=45, labelsize=10)
    ax.set_ylabel('Staff On Site')
fig.suptitle('Predicted versus Actual - Test Data',fontsize=12,fontweight='bold')
fig.tight_layout()

# Draw the model lines solid, both in the axes and in the legend.
for ax in axs:
    for line in ax.get_lines():
        if line.get_label() not in keep_dash_labels:
            line.set_linestyle('-')
    leg = ax.get_legend()
    for line in leg.get_lines():
        if line.get_label() not in keep_dash_labels:
            line.set_linestyle('-')

# Save only after the line styles are final, so the PNG matches the figure shown in the notebook.
fig.savefig('./Output Files/Images/Final/top_model_timeseries.png',format='png',bbox_inches='tight')

In [None]:
fig,axs = plt.subplots(3,1,figsize=(10,8),sharex=True)

# df_metrics_byweek is indexed by (week start, prediction column).
week_start = df_metrics_byweek.index.get_level_values(0)
model_name = df_metrics_byweek.index.get_level_values(1)

# One panel per metric, one line per prediction column.
for ax, metric in zip(axs, ['MAE','RMS','R^2']):
    sns.lineplot(data=df_metrics_byweek,x=week_start,y=metric,hue=model_name,ax=ax)
    # Place the legend outside the plotting area, to the right.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(visible=True,which='major',axis='y')
    ax.set_ylabel(f'{metric} Value',fontsize=11)
    ax.set_xlabel(f'Date',fontsize=11)
    ax.tick_params(axis='x', labelrotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.set_title(f'{metric} Calculated Per Week',fontsize=12)

axs[0].set_ylim(0.0,0.25)
axs[1].set_ylim(0.0,0.25)
axs[2].set_ylim(0.0,1.1)

# Dotted vertical markers drawn on every panel.
# NOTE(review): what the green and red dates represent is not recorded here - TODO document.
marked_dates = [
    (datetime(2023,10,2), 'tab:green'),
    (datetime(2023,10,30), 'tab:green'),
    (datetime(2024,1,1), 'tab:green'),
    (datetime(2023,11,13), 'tab:red'),
    (datetime(2023,10,9), 'tab:red'),
]
for ax in axs:
    for when, colour in marked_dates:
        ax.vlines(when, 0, 1, colors=colour, linestyles=':', label='')
fig.suptitle('Prediction Metrics By Week - Test Data',fontsize=12,fontweight='bold')
fig.tight_layout()
fig.savefig('./Output Files/Images/Final/top_model_weekly_err.png',format='png',bbox_inches='tight')


In [None]:

# Dates left out of this re-scoring, cast to datetimes explicitly: matching a datetime
# column against date strings with isin() is deprecated in recent pandas releases.
# NOTE(review): the reason for excluding these two dates is not recorded here - TODO document.
excluded_dates = pd.to_datetime(['2023-10-09','2023-10-12'])

# Same scoring as the overall metrics (relies on the constant 'dummy' column already
# present on df_results), with the excluded dates removed first.
df_metrics_all_ex1day = (
    df_results.loc[~df_results['Date'].isin(excluded_dates)]
    .groupby('dummy')
    .apply(get_scores,('Actual',regressor_labels),include_groups=False)
    .reset_index()
    .drop(columns='dummy')
    .rename(columns={'level_1':'Model'})
    .set_index('Model')
)
display(df_metrics_all_ex1day.sort_values('RMS').head(40))

with engine.connect() as conn:
    df_metrics_all_ex1day.reset_index().to_sql('TestSet_Metrics_excl',conn,schema='Gold',if_exists='replace',index=False,dtype=sqlcol(df_metrics_all_ex1day))

### Testing without Desks Booked

In [None]:

# Re-fit the 'XGBoost GS5' pipeline per directorate with the per-directorate
# 'Desks_Booked' column removed, then aggregate to an overall forecast.
# NOTE: this cell rebinds the notebook-level names X, y, X_test and y_test and re-fits
# the pipeline object stored in the QC set.
pipe = pds.QC_Set['Test Data']['pp']['XGBoost GS5']

# NOTE(review): 'Desks_Booked_Overall' is still a column of the grouped frame - confirm
# that the pipeline's transformer filters it out.
excluded_cols = [label_field,'Directorate','Desks_Booked']

y_pred_by_dir = []
total_staff_by_dir = []
for directorate, df_g in df_grouped_by_directorate:

    X = df_g.loc[:date_val_end].drop(columns=excluded_cols)
    X_test = df_g.loc[date_test_start:].drop(columns=excluded_cols)

    y = df_g[label_field].loc[:date_val_end]
    y_test = df_g[label_field].loc[date_test_start:]

    X.columns = X.columns.astype(str)
    pipe.fit(X,y)
    y_pred = pd.Series(pipe.predict(X_test),y_test.index)

    # Scale the predicted proportion by 'Directorate_Numbers' so that directorates can
    # be summed and divided by the total afterwards.
    staff = df_g.loc[date_test_start:,'Directorate_Numbers']
    y_pred_by_dir.append(y_pred * staff)
    total_staff_by_dir.append(staff)

y_test = df_preproc.set_index('Date')['Pct_On_Site_Overall'].sort_index().loc[date_test_start:]
# Positional sum across directorates: this relies on every directorate having the same
# test dates in the same order.
y_pred_sum_of_dir = pd.Series(np.sum(y_pred_by_dir,axis=0),index=y_pred_by_dir[0].index)
total_staff = pd.Series(np.sum(total_staff_by_dir,axis=0),index=total_staff_by_dir[0].index)
y_pred_pct = y_pred_sum_of_dir/total_staff

# Score with 2023-10-12 left out.
keep = ~(y_test.index=='2023-10-12')
mae = mean_absolute_error(y_test.loc[keep],y_pred_pct.loc[keep])
rms = root_mean_squared_error(y_test.loc[keep],y_pred_pct.loc[keep])
r2 = r2_score(y_test.loc[keep],y_pred_pct.loc[keep])

# ".4f" (precision), not "4f" (minimum width), so R2 is rounded like MAE and RMS.
print(f'        Overall'.ljust(40,' ') + f'MAE: {mae:.4f}, RMS: {rms:.4f}, R2: {r2:.4f}')
    