In [None]:
#reproducibility
from numpy.random import seed
seed(1+347823)
import tensorflow as tf
tf.random.set_seed(1+63493)

import numpy as np
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
import os
import pandas as pd
import datetime
from scipy import stats
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import pickle
import innvestigate
tf.compat.v1.disable_eager_execution()

In [None]:

def load_RM_GW_and_HYRAS_Data(ID):
    '''
    read the GW-data file and the weekly HYRAS file belonging to ID and join them

    both files are looked up in fixed relative folders, use their first column as index
    and get the 'Date' column parsed as day-first dates; only rows whose index value
    occurs in both files are returned (inner join), GW columns first, HYRAS columns after
    '''
    # reader options shared by both input files
    csv_options = dict(parse_dates=['Date'], index_col=0, dayfirst=True,
                       decimal='.', sep=',')

    gw_file = "./01_GWdata/" + ID + '_GW-Data.csv'
    hyras_file = "./00_HYRAS/" + ID + '_Hyras_weekly.csv'

    gw_frame = pd.read_csv(gw_file, **csv_options)
    hyras_frame = pd.read_csv(hyras_file, **csv_options)

    return gw_frame.merge(hyras_frame, how='inner', left_index=True, right_index=True)

def split_data(data, GLOBAL_SETTINGS):
    '''
    cut the time series into a training, early stopping, optimization and testing part

    - optimization covers [opt_start, test_start), testing covers [test_start, test_end] (fixed dates)
    - of the rows before opt_start the first ~90 % are used for training, the rest for early stopping
      (the single row at the 90 % position itself is part of neither of the two)
    - the "_ext" variants additionally start seq_length rows earlier, i.e. they overlap with the
      preceding part, so that the model can make use of the full data
    '''
    seq_len = GLOBAL_SETTINGS["seq_length"]
    opt_start = GLOBAL_SETTINGS["opt_start"]
    test_start = GLOBAL_SETTINGS["test_start"]
    test_end = GLOBAL_SETTINGS["test_end"]
    stamps = data.index

    before_test = data[stamps < test_start]
    before_opt = data[stamps < opt_start]

    # row position that separates training from early stopping
    cut = round(0.9 * len(before_opt))
    TrainingData = before_opt[0:cut]
    StopData = before_opt[cut + 1:]
    StopData_ext = before_opt[cut + 1 - seq_len:]

    OptData = data[(stamps >= opt_start) & (stamps < test_start)]
    OptData_ext = pd.concat([before_opt.iloc[-seq_len:], OptData], axis=0)

    TestData = data[(stamps >= test_start) & (stamps <= test_end)]
    TestData_ext = pd.concat([before_test.iloc[-seq_len:], TestData], axis=0)

    return TrainingData, StopData, StopData_ext, OptData, OptData_ext, TestData, TestData_ext

def to_supervised(data, GLOBAL_SETTINGS):
    '''
    turn a 2-D array into (input sequence, target value) samples, based on code from machinelearningmastery.com

    column 0 supplies the targets, all remaining columns supply the inputs; every sample pairs
    seq_length consecutive input rows with the target value of the row right after them
    '''
    window = GLOBAL_SETTINGS["seq_length"]
    n_rows = len(data)
    # a window is only usable if the row following it (the target row) still exists
    starts = [first for first in range(n_rows) if first + window < n_rows]
    X = [data[first:first + window, 1:] for first in starts]
    Y = [data[first + window, 0] for first in starts]
    return np.array(X), np.array(Y)

def bayesOpt_function():
    '''placeholder target function without content, only needed so that the logs can be loaded into an optimizer'''
    return None


In [None]:

# table of all locations; its 'alias' column provides the ID used in every file name below
well_list = pd.read_csv("./locations.csv",sep=';',header = 0,encoding = 'Latin1')

for loc in [0]:#range(well_list.shape[0]): # loop all locations
    # reset both random generators for every location
    seed(1)
    tf.random.set_seed(1)
    
    ID = well_list.alias[loc]
    print(ID+": "+str(loc))
    
    # define bounds for hyperparameters for optimization, here the numbers x are translated into 2^x later
    pbounds = { 'densesize': (4,8),
                'batchsize': (4,9),
                'filters': (4,9)
                }
    
    BayesOptimizer = BayesianOptimization(
        f= bayesOpt_function, # function to be optimized (placeholder, nothing is optimized here)
        pbounds=pbounds, # value ranges within which to optimize
        random_state=1, 
        verbose = 0 # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent, verbose = 2 prints everything
        )
    
    # load the stored optimization log of this location into the optimizer
    load_logs(BayesOptimizer, logs=["./Results_XAI/logs_CNN_"+ID+".json"])
    
    #get best values from optimizer
    densesize = BayesOptimizer.max.get("params").get("densesize")
    batchsize = BayesOptimizer.max.get("params").get("batchsize")
    filters = BayesOptimizer.max.get("params").get("filters")
    
    # exponents are truncated to integers before being turned into powers of two
    densesize = 2**int(densesize)
    batchsize = 2**int(batchsize)
    filters = 2**int(filters)
    seqlength = 52

    #%%
    GLOBAL_SETTINGS = {
        'batch_size': batchsize, # best value from optimization
        'dense_size': densesize, # best value from optimization
        'filters': filters, # best value from optimization
        'clip_norm': True,
        'epochs': 3, #max number of training epochs to allow
        'patience': 30, # early stopping patience
        'learning_rate': 1e-3,
        'seq_length': seqlength, #length of the data input sequence for the model (here: 52 weeks = 1 year)
        'kernel_size': 3,
        'opt_start': pd.to_datetime('01012015', format='%d%m%Y'), 
        'test_start': pd.to_datetime('01012017', format='%d%m%Y'),
        'test_end': pd.to_datetime('31122020', format='%d%m%Y')
        }
        
    ## load data
    data = load_RM_GW_and_HYRAS_Data(ID)
    
    # if the record ends before the configured test end, shift all three dates:
    # testing covers the last 4*365 days, optimization the 2*365 days before that
    if GLOBAL_SETTINGS["test_end"] > data.index[-1]:
        GLOBAL_SETTINGS["test_end"] = data.index[-1]
        GLOBAL_SETTINGS["test_start"] = GLOBAL_SETTINGS["test_end"] - datetime.timedelta(days=(365*4))
        GLOBAL_SETTINGS["opt_start"] = GLOBAL_SETTINGS["test_start"] - datetime.timedelta(days=(365*2))
        
    #scale data
    # scaler handles all columns at once, scaler_gwl only 'GWL' (used to back-transform predictions)
    # NOTE(review): the sequencing takes column 0 as target, so 'GWL' is assumed to be the first column - confirm
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler_gwl = MinMaxScaler(feature_range=(0, 1))
    scaler_gwl.fit(pd.DataFrame(data['GWL']))
    data_n = pd.DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)

    #split data
    TrainingData, StopData, StopData_ext, OptData, OptData_ext, TestData, TestData_ext = split_data(data, GLOBAL_SETTINGS)
    TrainingData_n, StopData_n, StopData_ext_n, OptData_n, OptData_ext_n, TestData_n, TestData_ext_n = split_data(data_n, GLOBAL_SETTINGS)
    
    #sequence data
    X_train, Y_train = to_supervised(TrainingData_n.values, GLOBAL_SETTINGS)
    X_stop, Y_stop = to_supervised(StopData_ext_n.values, GLOBAL_SETTINGS) 
    X_opt, Y_opt = to_supervised(OptData_ext_n.values, GLOBAL_SETTINGS)
    X_test, Y_test = to_supervised(TestData_ext_n.values, GLOBAL_SETTINGS) 

    #load the stored models of all initializations and analyze each of them
    analysis_list=[]
    y_list=[]
    inimax = 20
    model_path = './Results_XAI/models/'+ID+"/"
    
    # everything below depends only on the data, not on the ensemble member,
    # so it is prepared once here instead of once per loaded model
    obs = np.array(TestData.GWL).reshape(-1,1)
    X_full, Y_full = to_supervised(data_n.values, GLOBAL_SETTINGS)
    X_investigate = X_full # relevant data for LRP analysis
    data_investigate = data.iloc[seqlength:,:].copy() # rows for which a prediction exists (first seqlength rows only serve as input)
    data_investigate_ext = data.copy()
    method = 'lrp.z' 
    
    for ini in range(inimax):
        print('load: '+str(ini))
        model = tf.keras.models.load_model(model_path+str(ini))
        
        # simulate the test period and transform back to the scale of the 'GWL' column
        y = model.predict(X_test)
        y = scaler_gwl.inverse_transform(y)
        
        # Nash-Sutcliffe efficiency of this member on the test period
        NSE = 1 - np.sum((obs-y)**2) / np.sum((obs-np.mean(obs))**2)
        
        #plot to see if any ensemble member shows unwanted behavior
        pyplot.figure(figsize=(3,1))
        pyplot.plot(TestData.index,TestData.GWL,color='k',label='observed')
        pyplot.plot(TestData.index,y,color='r',label='simulated')
        pyplot.legend()
        pyplot.title('NSE {} - '.format(str(np.round(NSE,2)))+ID+' - ini '+str(ini))
        pyplot.xlim([TestData.index[0],TestData.index[-1]])#
        pyplot.show()
        
        #%%apply LRP analysis
        analyzer = innvestigate.create_analyzer(method, model,reverse_verbose=True , neuron_selection_mode='all')
        analysis = analyzer.analyze(X_investigate)
        analysis_list.append(analysis)

        # make and save all predictions on the data
        y = model.predict(X_investigate)
        y = scaler_gwl.inverse_transform(y)
        y_list.append(y)
   
    #plot and save model fit on full time series
    # one column per ensemble member
    sim_members = np.zeros(shape = (y_list[0].shape[0],inimax))
    for i in range(inimax):
        sim_members[:,i] = y_list[i].reshape(-1,)
    sim_mean = np.nanmean(sim_members,axis = 1)
    # band between the 5 % and 95 % quantile of the members
    sim_uncertainty = [np.quantile(sim_members, 0.05, axis=1),np.quantile(sim_members, 0.95, axis=1)]
    
    pyplot.figure(figsize=(20,5))
    lb = sim_uncertainty[0]
    ub = sim_uncertainty[1]
    pyplot.fill_between(data_investigate.index, lb,
                ub, facecolor = (1,0.7,0,0.5),
                label ='90% confidence',linewidth = 1,
                edgecolor = (1,0.7,0,0.7))
    
    pyplot.plot(data_investigate.index, sim_mean, 'r', label ="simulated mean", linewidth = 1.7)
    pyplot.plot(data_investigate.index, data_investigate['GWL'], 'k', label ="observed", linewidth=1.7,alpha=0.9)
    pyplot.title(ID, size=17,fontweight = 'bold')
    pyplot.ylabel('GWL [m asl]', size=15)
    pyplot.xlabel('Date',size=15)
    pyplot.legend(fontsize=15,fancybox = False, framealpha = 1, edgecolor = 'k')
    pyplot.tight_layout()
    pyplot.grid(visible=True, which='major', color='#666666', alpha = 0.3, linestyle='-')
    pyplot.xticks(fontsize=16)
    pyplot.yticks(fontsize=16)
    pyplot.savefig('./Results_XAI/Full_'+ID+'_CNN.png', dpi=300,bbox_inches='tight')            
    pyplot.show()
    
    #dump results in pickle file
    # element-wise mean of the analysis results over all ensemble members
    analysis = analysis_list[0]
    for i in range(1,inimax):
        analysis = analysis+analysis_list[i]
    analysis = analysis/inimax
    
    dump = {}
    dump['data_investigate'] = data_investigate
    dump['data_investigate_ext'] = data_investigate_ext
    dump['analysis_list'] = analysis_list
    dump['analysis_mean'] = analysis
    dump['simulations'] = y_list
    file = './Results_XAI/analysis_'+ID+'_'+method.replace('.','')+'.pickle'
    with open(file, 'wb') as f:
        pickle.dump(dump, f)
