## Packages

In [None]:
import pandas as pd
import numpy as np
from math import ceil
from itertools import product
## import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import sqlalchemy 
from sqlalchemy import create_engine, text

import sys
import os

## Make the sibling 'Functions' folder importable from this notebook
current_dir = os.getcwd()  ## Working directory the kernel was started in
sub_dir = os.path.abspath(os.path.join(current_dir, os.pardir, 'Functions'))
sys.path.append(sub_dir)

# Now you can import functions
from db_secrets import SQL_107

from visualisations import plot_prediction_error, plot_prediction_density_subplots

from helpers import aggregate_sites, keras_calculate_accuracy, keras_calculate_baseline_accuracy

In [None]:
# TensorFlow sequential model
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Suppress every warning category so cell output stays tidy.
# Note this also hides deprecation and numerical warnings raised by later cells.
import warnings
warnings.filterwarnings(action="ignore")

## Connection

In [None]:
## Load the SQL template and substitute the start-date placeholder in one step
with open("../Exploratory_Analysis/111_sql.sql", "r") as file:
    query_text = file.read().replace('REPLACE START DATE','2022-01-01')

In [None]:
## Create the engine from the connection string supplied by SQL_107()
engine = create_engine(SQL_107())

## Run the query inside a context manager so the connection is closed as soon
## as the result has been read into memory, instead of staying open for the
## lifetime of the kernel.
with engine.connect() as conn:
    ## Return data
    df_raw = pd.read_sql(query_text,conn)

## Wrangle

In [None]:
## Work on a deep copy so df_raw keeps the data exactly as the query returned it
df = df_raw.copy(deep=True)

## Optional: uncomment to develop on a reproducible 100,000-row subsample
#df = df.sample(n=100000, random_state=42)

In [None]:
## List the columns of the working copy before selecting a subset
df.columns

In [None]:
## Keep only the six fields used by the rest of the notebook
df = df.loc[:, ['Call Connect Time',
                'Outcome Location Name',
                'Bank Holiday',
                'In_Out_Hours',
                'Sub ICB Name',
                'Outcome Type']].copy()

In [None]:
## Snap each call to its nearest whole hour (this rounds, it does not floor,
## so times past the half hour move to the following hour)
df['Call Connect Time'] = df['Call Connect Time'].dt.round('h')


In [None]:
## Replaces low frequency sites with 'OTHER SITE'
## (aggregate_sites is applied to each location name individually)
df['Outcome Location Name'] = df['Outcome Location Name'].apply(aggregate_sites)

In [None]:
## Preview the first rows after column selection, hour rounding and site aggregation
df.head()

#### binary outcome

In [None]:
## Constant 1 per row so that calls can later be counted by summing this column
## (presumably one row per call; confirm against the SQL query)
df['Calls'] = 1

In [None]:
## Binary target: 0 when the outcome type is 'No UEC Contact', 1 for anything else
df['Outcome'] = df['Outcome Type'].ne('No UEC Contact').astype('int64')
df = df.drop(columns=['Outcome Type'])

# Reassemble data

#### ICB values

In [None]:
## One row per timestamp, one column per Sub ICB, each cell holding the summed
## 'Calls' value; combinations with no rows are filled with 0
df_call = df.pivot_table(index='Call Connect Time',
                         columns='Sub ICB Name',
                         values='Calls',
                         aggfunc='sum',
                         fill_value=0)

In [None]:
## Preview the per-timestamp call totals by Sub ICB
df_call.head()

#### Site values

In [None]:
## Keep rows whose binary Outcome is 1, then sum Outcome per hour and per site
df_site = (
    df.loc[df['Outcome'] == 1,
           ['Call Connect Time', 'Outcome Location Name', 'Outcome']]
      .groupby([pd.Grouper(key='Call Connect Time', freq='1h'),
                'Outcome Location Name'])
      .sum()
      .reset_index()
)

In [None]:
## Drop the pooled 'OTHER SITE' bucket so only individually named sites remain
df_site = df_site.loc[df_site['Outcome Location Name'] != 'OTHER SITE']

In [None]:
## Preview the hourly outcome totals per site
df_site.head()

### Extra time features

In [None]:
## Time-level features: one row per (rounded) timestamp.
## De-duplicating on the timestamp alone guarantees a unique key for the later
## merge. De-duplicating on all three columns would keep two rows for an hour
## whose calls carry different 'Bank Holiday' / 'In_Out_Hours' values (possible
## because times were rounded), and the merge would then duplicate every site
## row for that hour. The first value seen for each timestamp is kept.
df_times = (df[['Call Connect Time'
            ,'Bank Holiday'
            , 'In_Out_Hours']]
            .drop_duplicates(subset='Call Connect Time', keep='first'))

In [None]:
## Preview the timestamp-level flags
df_times.head()

### Assemble

In [None]:
# Distinct timestamps seen in the call data and distinct sites kept in df_site
unique_timestamps = df['Call Connect Time'].unique()
unique_sites = df_site['Outcome Location Name'].unique()

# Cross join: every timestamp paired with every site (timestamp-major order)
complete_index = pd.merge(
    pd.DataFrame({'Call Connect Time': unique_timestamps}),
    pd.DataFrame({'Outcome Location Name': unique_sites}),
    how='cross',
)


In [None]:
## Attach the timestamp-level flags to every (timestamp, site) pair
df = pd.merge(complete_index, df_times, how='left', on='Call Connect Time')

In [None]:
## Attach the per-Sub-ICB call totals for each timestamp.
## fillna(0) then replaces every remaining missing value in the frame with 0.
df = pd.merge(df, df_call, how='left', on='Call Connect Time').fillna(0)

In [None]:
## Attach the hourly Outcome total for each site; (timestamp, site) pairs with
## no matching row in df_site get an Outcome of 0
site_keys = ['Call Connect Time', 'Outcome Location Name']
df = pd.merge(df, df_site, how='left', on=site_keys).fillna(0)

In [None]:
## Preview the assembled frame (one row per timestamp and site)
df.head()

### date time

In [None]:
## Date time conversion to numeric.
## Cyclical fields are encoded as sin/cos pairs so that the end of each cycle
## sits next to its start (e.g. hour 23 next to hour 0).
df['year']    = df['Call Connect Time'].dt.year

df['month sin'] = np.sin(df['Call Connect Time'].dt.month * (2*np.pi/12))
df['month cos'] = np.cos(df['Call Connect Time'].dt.month * (2*np.pi/12))

df['YearDay sin'] = np.sin(df['Call Connect Time'].dt.day_of_year * (2*np.pi/365))
df['YearDay cos'] = np.cos(df['Call Connect Time'].dt.day_of_year * (2*np.pi/365))

## The parentheses around (weekday + 1) are required: `weekday+1 * k` is parsed
## as `weekday + k`, which feeds sin/cos an angle in whole radians rather than
## sevenths of a circle.
df['weekday sin'] = np.sin((df['Call Connect Time'].dt.weekday + 1) * (2*np.pi/7))  # dt.weekday: Monday=0, Sunday=6
df['weekday cos'] = np.cos((df['Call Connect Time'].dt.weekday + 1) * (2*np.pi/7))  # dt.weekday: Monday=0, Sunday=6

df['Hour sin'] = np.sin(df['Call Connect Time'].dt.hour * (2*np.pi/24))
df['Hour cos'] = np.cos(df['Call Connect Time'].dt.hour * (2*np.pi/24))

## The raw timestamp is no longer needed once its components are encoded
df = df.drop('Call Connect Time',axis=1)

In [None]:
## Binary encoding of the two text flags (values not in the mapping become NaN)
bool_mapping = {'Yes': 1, 'No': 0, 'In Hours': 1, 'Out of Hours': 0}

df['Is Bank Holiday'] = df['Bank Holiday'].map(bool_mapping)
df['In Hours'] = df['In_Out_Hours'].map(bool_mapping)

## The original text columns are replaced by their numeric versions
df = df.drop(columns=['Bank Holiday', 'In_Out_Hours'])

In [None]:
## One-hot encode the site: 'Outcome Location Name' is replaced by integer
## 'Site_<name>' columns appended after the existing columns
df = pd.get_dummies(df,
                    columns=['Outcome Location Name'],
                    prefix='Site',
                    dtype=int)

In [None]:
## Preview the fully numeric modelling frame
df.head()

In [None]:
## List the final feature columns (column order matters once converted to arrays)
df.columns

## Build a baseline model

Baseline split

In [None]:
## Target is the 'Outcome' column; predictors are every other column
base_y = df['Outcome']
base_X = df.drop(columns='Outcome')

## 75/25 split with a fixed seed
(base_X_train,
 base_X_test,
 base_y_train,
 base_y_test) = train_test_split(base_X, base_y, test_size=0.25, random_state=42)

In [None]:
## Put the training target back beside the training predictors (column-wise)
base_df = pd.concat((base_X_train, base_y_train), axis='columns')

In [None]:
## Grouping keys for the baseline: cyclical month / weekday / hour features
## followed by every one-hot site column
group_cols = (['month sin', 'month cos',
               'weekday sin', 'weekday cos',
               'Hour sin', 'Hour cos']
              + df.columns[df.columns.str.startswith('Site_')].to_list())

## Baseline prediction = mean training Outcome within each key combination
base_trained = (base_df
                .groupby(group_cols)['Outcome']
                .mean()
                .to_frame('Pred_Outcome'))

In [None]:
## Look up the baseline mean for every training and test row.
## The left join keeps the row order of the predictors but returns a fresh
## 0..n-1 index; key combinations absent from training give NaN.
base_y_pred_train = base_X_train.merge(base_trained, how='left', on=group_cols)['Pred_Outcome']
base_y_pred_test = base_X_test.merge(base_trained, how='left', on=group_cols)['Pred_Outcome']

## Split

In [None]:
## Same predictors / target as the baseline, converted to NumPy arrays for Keras
y = df['Outcome'].to_numpy()
X = df.drop(columns='Outcome').to_numpy()

## 75/25 split with the same seed as the baseline split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

## Scale

In [None]:
def scale_data(Xy_train, Xy_test,X_or_y = ['X','y']):
    """Scale data to the 0-1 range using the min and max of the training set.

    The scaler is fitted on the training data only and then applied unchanged
    to the test data, so test values can fall outside 0-1.

    Parameters
    ----------
    Xy_train, Xy_test : array-like
        Training and test data. For ``X_or_y='X'`` they are passed to the
        scaler as given; for ``X_or_y='y'`` they are first reshaped into a
        single column.
    X_or_y : {'X', 'y'}
        Which kind of data is being scaled. The list default only documents
        the two choices and is not itself valid, so a value must be supplied.

    Returns
    -------
    tuple
        ``(train_sc, test_sc)`` - the scaled training and test arrays.

    Raises
    ------
    ValueError
        If ``X_or_y`` is neither ``'X'`` nor ``'y'``.
    """

    # Validate first so a wrong (or omitted) choice fails with a clear message
    # instead of an UnboundLocalError at the return statement
    if X_or_y not in ('X', 'y'):
        raise ValueError(f"X_or_y must be 'X' or 'y', got {X_or_y!r}")

    if X_or_y == 'y':
        # MinMaxScaler expects 2-D input, so a target vector becomes one column
        Xy_train = np.asarray(Xy_train).reshape(-1, 1)
        Xy_test = np.asarray(Xy_test).reshape(-1, 1)

    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Fit on the training set only, then reuse the fitted scaler on the test set
    train_sc = sc.fit_transform(Xy_train)
    test_sc = sc.transform(Xy_test)

    return train_sc, test_sc

In [None]:
# Scale the predictors; the scaler is fitted on the training rows only
X_train, X_test = scale_data(Xy_train=X_train, Xy_test=X_test, X_or_y='X')

# The target is left unscaled; uncomment to scale it as well
#y_train, y_test = scale_data(y_train, y_test, X_or_y='y')

## Build a model

In [None]:
def make_net(number_features, 
             hidden_layers=3, 
             hidden_layer_neurones=128, 
             dropout=0.0, 
             learning_rate=0.003):
    
    """Make a TensorFlow/Keras fully connected regression net.

    Parameters
    ----------
    number_features : int
        Number of input features (width of the first layer's input).
    hidden_layers : int
        Number of Dense + Dropout pairs to stack.
    hidden_layer_neurones : int
        Units in each hidden Dense layer.
    dropout : float
        Dropout rate applied after every hidden layer.
    learning_rate : float
        Learning rate for the Adam optimiser.

    Returns
    -------
    A compiled Sequential model with a single linear output unit, trained on
    mean squared error and reporting mean absolute error.
    """
    
    # Clear Tensorflow 
    K.clear_session()
    
    # Set up neural net
    net = Sequential()
    
    # Add hidden layers using a loop
    for i in range(hidden_layers):
        # Only the first layer declares the input width; later layers take
        # their input size from the layer before them
        first_layer_kwargs = {'input_dim': number_features} if i == 0 else {}
        # Add fully connected layer with ReLu activation
        net.add(Dense(
            hidden_layer_neurones, 
            activation='relu',
            **first_layer_kwargs))
        # Add dropout layer
        net.add(Dropout(dropout))
    
    # Add final linear output (regression on a count, not a probability)
    net.add(Dense(1, activation='linear'))    

    # Compiling model
    opt = Adam(learning_rate=learning_rate)
    
    net.compile(loss='mse', 
                optimizer=opt, 
                metrics=['mae'])
    
    return net

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return (MAE, MSE, NRMSE, R^2) for one set of observations; NaN where undefined."""
    if y_true.size == 0:
        # No rows for this site in this split: nothing can be measured
        return np.nan, np.nan, np.nan, np.nan

    errors = y_pred - y_true
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)

    # NRMSE = RMSE / range of the observed values; undefined for a constant target
    y_range = np.max(y_true) - np.min(y_true)
    nrmse = np.sqrt(mse) / y_range if y_range > 0 else np.nan

    # R^2 = 1 - SS_residual / SS_total; undefined for a constant target
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - np.sum(errors ** 2) / ss_total if ss_total > 0 else np.nan

    return mae, mse, nrmse, r2


def calculate_site_accuracy(df,model, X_train, X_test, y_train, y_test):
    """Calculate accuracy at site level for the training and test data fits.

    Parameters
    ----------
    df : DataFrame
        Frame the arrays were built from; after dropping 'Outcome' its column
        order is used to locate each 'Site_' column inside the arrays.
    model : object
        Fitted model exposing ``predict(X)``.
    X_train, X_test : 2-D arrays
        Predictors; a row belongs to a site when that site's column equals 1.
    y_train, y_test : arrays
        Targets matching the rows of X_train / X_test.

    Returns
    -------
    DataFrame
        One row per site with MAE, MSE, NRMSE and R^2 for train and test.
        A metric is NaN when it is undefined: the site has no rows in that
        split, or (for NRMSE and R^2) its target is constant.
    """

    X_df = df.drop('Outcome',axis=1)
    site_columns = X_df.columns[X_df.columns.str.startswith('Site_')]

    splits = (('train', X_train, np.asarray(y_train)),
              ('test', X_test, np.asarray(y_test)))

    results = []

    for site in site_columns:

        # Get the column index for the site
        site_idx = X_df.columns.get_loc(site)
        print(f'{site}: col {site_idx}')

        metrics = {}
        for split_name, X_all, y_all in splits:
            # Rows whose one-hot site column is switched on
            site_rows = X_all[:, site_idx] == 1
            site_X = X_all[site_rows]
            site_y = y_all[site_rows].ravel()

            # Skip the model call when the site has no rows in this split
            if len(site_X) > 0:
                site_y_pred = model.predict(site_X).flatten()
            else:
                site_y_pred = np.empty(0)

            metrics[split_name] = _regression_metrics(site_y, site_y_pred)

        mae_train, mse_train, nrmse_train, r2_train = metrics['train']
        mae_test, mse_test, nrmse_test, r2_test = metrics['test']

        ## results
        results.append({'Site':site
                    ,'MAE train':mae_train
                    ,'MAE test':mae_test
                    ,'MSE train':mse_train
                    ,'MSE test':mse_test
                    ,'NRMSE train':nrmse_train
                    ,'NRMSE test':nrmse_test
                    ,'r2 train':r2_train
                    ,'r2 test':r2_test
                    })

    results = pd.DataFrame(results)

    return results

In [None]:
def plot_training(history_dict,measure='mae'):
    """Plot a training metric and its validation counterpart against epoch.

    Parameters
    ----------
    history_dict : dict
        Mapping of metric name to per-epoch values; must contain both
        ``measure`` and ``'val_' + measure``.
    measure : str
        Name of the metric to plot.
    """
    acc_values = history_dict[measure]
    val_acc_values = history_dict[f'val_{measure}']
    # Epochs are numbered from 1 for display
    epochs = range(1, len(acc_values) + 1)

    fig, ax = plt.subplots()

    # The x axis counts training epochs, not wall-clock time
    ax.set_xlabel('Epoch')
    ax.set_ylabel(measure)

    ax.plot(epochs, acc_values, color='blue', label=f'Training {measure}')
    ax.plot(epochs, val_acc_values, color='red', label=f'Test {measure}')
    ax.set_title(f'Training and validation {measure}')
    
    ax.legend()

    # plt.show() renders on both GUI and inline (notebook) backends, whereas
    # Figure.show() only works with a GUI backend
    plt.show()

In [None]:
## Three hidden layers of 128 units with 10% dropout; input width taken from
## the number of columns in the scaled training predictors
model = make_net(
    number_features=X_train.shape[1],
    hidden_layers=3,
    hidden_layer_neurones=128,
    dropout=0.10,
    learning_rate=0.003,
)

In [None]:
# Checkpoint callback: writes the model to disk only when it is the best so far
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.keras',
    save_best_only=True,
)

# Early stopping callback:
# stop after 20 epochs without improvement in validation loss and
# restore the weights from the epoch with the lowest validation loss
early_stopping_cb_loss = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

# Alternative (disabled): early stopping on validation accuracy
#early_stopping_cb_acc = keras.callbacks.EarlyStopping(
#    patience=5, restore_best_weights=True, monitor='val_accuracy')


### Train model (and store training info in history)
# NOTE(review): the test set doubles as the validation set here, so early
# stopping and checkpoint selection see the same data later used for scoring.
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[
        checkpoint_cb,
        early_stopping_cb_loss,
        #early_stopping_cb_acc,
    ],
)

## Accuracy

In [None]:
## Baseline accuracy: predictions are the training mean for each
## site / month / weekday / hour combination
keras_calculate_baseline_accuracy(
    base_y_pred_train,
    base_y_pred_test,
    base_y_train,
    base_y_test,
)


In [None]:
## Accuracy of the neural net on the same train/test split
## (metrics are whatever the helpers-module function reports)
keras_calculate_accuracy(model, X_train, X_test, y_train, y_test)

In [None]:
## Break the neural net's accuracy down by site
site_results = calculate_site_accuracy(
    df, model, X_train, X_test, y_train, y_test)

In [None]:
## Display the per-site metrics table
site_results

In [None]:
## Training vs validation mean absolute error across epochs
plot_training(history.history,measure='mae')

In [None]:
# Generate predictions (training set first, then test set), flattened to 1-D
y_pred_train, y_pred_test = (
    model.predict(features).flatten() for features in (X_train, X_test)
)

# Plot errors for both training and test data
plot_prediction_error(y_train, y_pred_train,
                      title='Training Data - Prediction Error Plot')
plot_prediction_error(y_test, y_pred_test,
                      title='Test Data - Prediction Error Plot')

In [None]:
# Recompute the predictions so this cell can be run on its own
y_pred_train, y_pred_test = (
    model.predict(features).flatten() for features in (X_train, X_test)
)

# Density plots of actual vs predicted values, train and test side by side
plot_prediction_density_subplots(
    y_train, y_pred_train, y_test, y_pred_test)


In [None]:
## Same density plots for the baseline predictions, for comparison with the net
plot_prediction_density_subplots(
    base_y_train,
    base_y_pred_train,
    base_y_test,
    base_y_pred_test,
)