In [None]:
%load_ext autoreload
%autoreload 2
# Import Libraries
import sqlalchemy
import pandas as pd
import missingno as msno 
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
from datetime import datetime
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
from CustomLibs.CustomFunctions import plot_corr_heatmap, value_to_float, fig_indexes, sqlcol,what_pct_train
from config import Config
from CustomLibs.CustomTransformers import SpikeRemover, DailyMeanImputer, filtered_transformer
from sklearn.pipeline import Pipeline
# from imblearn.pipeline import Pipeline 
# from imblearn          import FunctionSampler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, SplineTransformer, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, KFold,cross_validate
import scipy.stats
import statsmodels.api as sm
#import csv



from CustomLibs.MultiPipe import MultiPipe

# Validation window ends at the configured cut-off; the test window starts the following day.
date_val_end = Config.TEST_DATE_CUTOFF
date_test_start = pd.to_datetime(date_val_end) + pd.DateOffset(days=1)

# Database engine built from the project connection string in Config.
engine = sqlalchemy.create_engine(Config.CONN_STR)

# Project pipeline container; its cross-validation scheme is a time-ordered
# split into 4 folds with 30-sample test sets, no gap and no cap on training size.
pds = MultiPipe()
pds.CV = TimeSeriesSplit(
    n_splits=4,
    test_size=30,
    gap=0,
    max_train_size=None,
)


In [None]:
def _cleaning_step(step_name, remover, columns):
    """Wrap `remover` in a ColumnTransformer that is applied to `columns` only.

    All other columns are passed through, and output column names are kept
    without the step-name prefix.
    """
    return ColumnTransformer(
        transformers=[(step_name, remover, columns)],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )


# NOTE(review): the step is labelled '5z' but the threshold passed is 3 -- confirm which is intended.
despike_transformer = _cleaning_step(
    '5z_despike',
    SpikeRemover(cutvalue=3, cutmode='zthresh'),
    ['Webex_Total_Participants'],
)

# Presumably removes exact-zero readings from both meeting metrics -- verify against SpikeRemover.
dezero_transformer = _cleaning_step(
    'dezero',
    SpikeRemover(cutvalue=0, cutmode='value'),
    ['Webex_Total_Participants', 'Average_Meeting_Duration_Times_(Minutes)'],
)

# Pull the Webex/VPN usage table and the on-site desk counts from the Bronze schema.
with engine.connect() as conn:
    df_webex = pd.read_sql_table('VPNUsage', conn, schema='Bronze').rename(columns={'Dates': 'Date'})
    df_attendance = pd.read_sql_table('All_Raw_Features', conn, schema='Bronze')[['Date', 'Actual_Desks_Used']]

# Parse the participant counts to floats and force every column label to str.
df_webex = (
    df_webex
    .assign(Webex_Total_Participants=lambda frame: frame['Webex_Total_Participants'].apply(value_to_float).astype(float))
    .rename(columns=str)
)

# Run the two cleaning transformers in sequence, keeping DataFrame output.
for cleaner in (despike_transformer, dezero_transformer):
    df_webex = cleaner.set_output(transform='pandas').fit_transform(df_webex)

# Index both frames by date so they can be joined on it.
df_webex = df_webex.set_index('Date')
df_attendance = df_attendance.set_index('Date')

# Calendar features derived from the date index.
# NOTE(review): Week_Number is the ISO week while Year is the calendar year;
# the two can disagree around New Year -- confirm this is acceptable downstream.
date_index = df_webex.index
df_webex = df_webex.assign(
    Day_Name=date_index.day_name(),
    Week_Number=date_index.isocalendar().week,
    Year=date_index.year,
)

# Attach desk usage (left join keeps every Webex date) and shorten the duration column name.
df_webex = (
    df_webex
    .merge(df_attendance, how='left', left_index=True, right_index=True)
    .rename(columns={'Average_Meeting_Duration_Times_(Minutes)': 'Avg_Meeting_Durations_Mins'})
)

# Keep Monday-Friday only (dayofweek 0-4).
df_webex = df_webex.assign(Day_Number=df_webex.index.dayofweek)
is_weekday = df_webex['Day_Number'] < 5
df_webex = df_webex.loc[is_weekday]

# Participants multiplied by average duration.
df_webex = df_webex.assign(
    Total_Meeting_Time=df_webex['Webex_Total_Participants'] * df_webex['Avg_Meeting_Durations_Mins']
)


# df_webex.head(40)

In [None]:
def _format_p_value(p_value):
    """Format a p-value for display: three decimals, or '< 0.001' when smaller than that."""
    return '< 0.001' if p_value < 0.001 else f'{p_value:.3f}'


def _plot_regression_panel(ax, data, y_col, title, y_label, x_col='Actual_Desks_Used'):
    """Draw a scatter plot with fitted line of `y_col` against `x_col` and annotate the slope p-value.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame containing both `x_col` and `y_col`.
    y_col : name of the response column.
    title : panel title.
    y_label : y-axis label.
    x_col : name of the predictor column.

    Returns
    -------
    The fitted statsmodels OLS results for `y_col ~ const + x_col`.
    """
    # Drop NaNs on the two modelled columns only. A frame-wide dropna() would also
    # discard rows with gaps in unrelated columns, so the annotated p-value would be
    # computed on a different sample than the line regplot draws.
    pair = data[[x_col, y_col]].dropna()

    sns.regplot(x=pair[x_col], y=pair[y_col], ax=ax, marker=".", color=".3",
                robust=False, logx=False, line_kws=dict(color="r"))
    ax.set_title(title)
    ax.set_ylabel(y_label)

    results = sm.OLS(pair[y_col], sm.add_constant(pair[x_col])).fit()
    ax.text(0.8, 0.15, f'p value : {_format_p_value(results.pvalues[x_col])}',
            transform=ax.transAxes, fontsize=10, va='top', ha='left')
    return results


# Tuesday-Thursday subset used by the last panel.
midweek = df_webex.loc[~df_webex['Day_Name'].isin(['Monday', 'Friday'])]

# One entry per panel: (data, response column, title, y-axis label).
panels = [
    (df_webex, 'Avg_Meeting_Durations_Mins',
     'Meeting Duration versus Numbers of Staff On Site', 'Meeting Duration'),
    (df_webex, 'Webex_Total_Participants',
     'Meeting Attendees versus Numbers of Staff On Site', 'Meeting Attendees'),
    (midweek, 'Webex_Total_Participants',
     'Meeting Attendees versus Numbers of Staff On Site (Excl. Monday,Friday)', 'Meeting Attendees'),
]

fig, axs = plt.subplots(len(panels), 1, figsize=(8, 7))

for ax, (panel_data, y_col, title, y_label) in zip(axs, panels):
    _plot_regression_panel(ax, panel_data, y_col, title, y_label)
    ax.grid(visible=True, which='major', axis='both')

    # Hide the tick values when the project is configured to mask real numbers.
    if Config.MASK_VALUE:
        ax.set_yticklabels([])
        ax.set_xticklabels([])

# The previous title was carried over from a feature-ranking figure and did not describe this plot.
fig.suptitle('Webex Meeting Metrics versus Staff On Site, Single Linear Regression', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig('./Output Files/Images/Data Exploration/meetings_Attendence_single_linear.png', format='png', bbox_inches='tight')

In [None]:
# Full OLS summary for meeting attendees regressed on desks in use.
# NaNs are dropped on the two modelled columns only, so gaps in unrelated
# features do not shrink the sample; `sm` is imported in the first cell.
ols_data = df_webex[['Webex_Total_Participants', 'Actual_Desks_Used']].dropna()
est = sm.OLS(ols_data['Webex_Total_Participants'], sm.add_constant(ols_data['Actual_Desks_Used']))
est2 = est.fit()
print(est2.summary(xname=['const','Actual_Desks_Used']))

In [None]:
# p-value of the Actual_Desks_Used coefficient from the most recent fit held in `est2`.
est2.pvalues.loc['Actual_Desks_Used']