# Result figures for *Enhanced spatio-temporal electric load forecasts with less data using active deep learning*

---

## Overview
1. Import and test results
2. Numeric results
3. Space and time selection
4. Budget vs. accuracy
5. Training and validation losses
6. Validation against initial candidates
7. Query sequence importance
8. Exemplar predictions

In this notebook session, we summarize and visualize the experimental results of our manuscript that is entitled 'Active machine learning for spatio-temporal predictions'. First, we import the numeric results of four independently conducted experiments and check for their correct assignment (1.). Next, we create a summary of the numeric results that form table 1 of our manuscript (2.). These include the data and sensor usage rates, as well as the prediction test results of our experiments. Then, we create figures that compare training and validation losses for each prediction type, each active learning query variant, and each active learning query variable that can be used for encoding, clustering and calculating the embedding entropy of candidate data points (5.). The embedding entropy that we propose is used to query the most informative candidate data points from a large pool. From these figures, we choose two representative results which we present in our manuscript. In a next step, we compare the validation losses of our AL method against the initial candidate data pool to analyse biases of our models (6.). In these figures, we omit the training losses as these are already plotted in the previous set of figures. Next, we visualize validation losses during training for randomized training sequences of data points that were queried under each active learning query variable and variant (7.). Finally, we visually compare the performances of our initial model, as well as our passively and actively trained models, when predicting on five randomly chosen data points (8.). We start with selecting the respective set of experimental results that we want to compare to each other, and import a number of packages that we use throughout this notebook session.

In [28]:
### Set some parameters ###

# name of the results dataset (sub folder of the results directory) to process
profile_set = 'profiles_100'

# prediction types to consider
PRED_TYPE_LIST = ['spatial', 'temporal', 'spatio-temporal']

# parameter constellations to consider; each name encodes one delta flag
# and one valup flag
PARAMETER_LIST = [
    'delta0_valup0', 'delta0_valup1',
    'delta1_valup0', 'delta1_valup1',
]

# AL query variables to plot. The options named so far are 'X_t', 'X_s1',
# 'X_st', 'X_joint', 'X_(t,s)' and 'Y_(t,s)'; 'Y_hat_(t,s)' is selected here
# as well. 'X_t' and 'X_s1' are currently left out.
AL_VARIABLES = ['X_st', 'X_(t,s)', 'Y_hat_(t,s)', 'Y_(t,s)']

# AL query variants to plot
AL_VARIANTS = ['rnd d_c', 'max d_c', 'min d_c', 'avg d_c']

# sub figure titles 'a.' to 'x.' (24 entries)
fig_title_list = [letter + '.' for letter in 'abcdefghijklmnopqrstuvwx']

# width factor for figures
WIDTH_FACTOR = 8

# universal fontsize for figures
FONTSIZE = 20

### Import packages ###

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import math
import tensorflow as tf
import numpy as np
from PIL import Image
import random



### Provide import paths and file names ###

# path to results folder of the chosen profile set
path_to_results = '../results/' + profile_set + '/'

# path to initially trained encoders and prediction model
path_to_encoders = path_to_results + 'encoder weights/'

# path to numeric result values
path_to_values = path_to_results + 'values/'

# path to unseen test samples
path_to_samples = path_to_results + 'samples/'

# path to models
path_to_models = path_to_results + 'models/'

# individual file names
hyper_filename = 'hyper.csv'
results_filename = 'results.csv'
seqimportance_filename = 'sequence_importance.csv'
spacetime_selection_filename = 'spacetime_selection.csv'
budgetaccuracy_filename = 'budget_vs_accuracy.csv'

initial_model_filename = 'initial.h5'
PL_model_filename = 'PL.h5'

### Provide export paths and file names ###

# path to saving result figures
path_to_saving_figures = path_to_results + 'figures/'

# path to subfolder for saving spacetime point selection
path_to_saving_spacetime = path_to_saving_figures + 'spacetime selection/'

# path to subfolder for saving budgetvsaccuracy
path_to_saving_budgetvsaccuracy = path_to_saving_figures + 'budget vs accuracy/'

# path to subfolder for saving lossesvsunqueried
path_to_saving_lossesvsunqueried = path_to_saving_figures + 'losses vs remaining/'

# path to subfolder for saving lossesvsall
path_to_saving_lossesvsall = path_to_saving_figures + 'losses vs initial/'

# path to subfolder for saving seqimportance
path_to_saving_seqimportance = path_to_saving_figures + 'sequence importance/'

# path to subfolder for saving exemplar predictions point selection
path_to_exemplar_predictions = path_to_saving_figures + 'exemplar predictions/'

# create all export folders; exist_ok makes the cell safe to re-run and
# makedirs also creates missing parent folders
for figure_folder in (
    path_to_saving_figures,
    path_to_saving_spacetime,
    path_to_saving_budgetvsaccuracy,
    path_to_saving_lossesvsunqueried,
    path_to_saving_lossesvsall,
    path_to_saving_seqimportance,
    path_to_exemplar_predictions,
):
    os.makedirs(figure_folder, exist_ok=True)

## 1. Import and test results

Here, we import the results and the hyper parameters that were used in each of our experiments. We check whether all imported results are calculated on the exact same hyper parameters for the hypothesis test.

In [18]:
### Import the results ###

# One entry per (prediction type, parameter constellation) pair, appended in
# the order of the nested loops below: all parameters of the first prediction
# type come first, then those of the second, and so on.
hyper_list = []
results_list = []
seqimportance_list = []
spacetime_selection_list = []
budgetaccuracy_list = []

for pred_type in PRED_TYPE_LIST:

    for parameter in PARAMETER_LIST:

        # all CSV files of one experiment live in values/<parameter>/<pred_type>/
        path_to_experiment_values = (
            path_to_values
            + parameter
            + '/'
            + pred_type
            + '/'
        )

        path_to_hyper_parameters = path_to_experiment_values + hyper_filename
        # deliberately NOT named path_to_results: that name holds the results
        # folder of the profile set and must not be overwritten by a file path
        path_to_results_file = path_to_experiment_values + results_filename
        path_to_seqimportance = (
            path_to_experiment_values + seqimportance_filename
        )
        path_to_spacetime_selection = (
            path_to_experiment_values + spacetime_selection_filename
        )
        path_to_budgetaccuracy = (
            path_to_experiment_values + budgetaccuracy_filename
        )

        hyper_list.append(pd.read_csv(path_to_hyper_parameters))
        results_list.append(pd.read_csv(path_to_results_file))
        seqimportance_list.append(pd.read_csv(path_to_seqimportance))
        spacetime_selection_list.append(
            pd.read_csv(path_to_spacetime_selection)
        )
        budgetaccuracy_list.append(pd.read_csv(path_to_budgetaccuracy))
        
        
### Test correctness of results ###

# columns that are dropped before hyper parameter tables of consecutive
# experiments are compared with each other
experiment_specific_columns = [
    'pred_list_act_lrn',
    'upd_val_data_act_lrn',
    'red_cand_data_act_lrn'
]

# position of the currently checked experiment in hyper_list
index_counter = 0

for pred_type in PRED_TYPE_LIST:

    for parameter in PARAMETER_LIST:

        # read the two flags from fixed character positions of the name,
        # e.g. 'delta0_valup1' -> delta = 0, valup = 1
        delta = int(parameter[5])
        valup = int(parameter[12])

        df_hyper = hyper_list[index_counter]

        # first row of the stored flags must agree with the folder name
        flags_match = (
            df_hyper['red_cand_data_act_lrn'][0] == delta
            and df_hyper['upd_val_data_act_lrn'][0] == valup
        )
        if not flags_match:

            print(
                'Caution. Wrong assignment of results for',
                parameter,
                'and',
                pred_type
            )

        # from the second experiment on, compare against the previous table
        if index_counter > 0:

            current_shared = df_hyper.drop(
                experiment_specific_columns,
                axis=1
            )
            previous_shared = previous_df.drop(
                experiment_specific_columns,
                axis=1
            )

            if not current_shared.equals(previous_shared):

                print(
                    'Caution. Results were not calculated on same hyper parameters for',
                    parameter,
                    'and',
                    pred_type
                )

        previous_df = df_hyper
        index_counter += 1

        display(df_hyper)

Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatial,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatial,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatial,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatial,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatio-temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatio-temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatio-temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


Unnamed: 0.1,Unnamed: 0,public_access,test_sequence_importance,save_act_lrn_results,save_hyper_params,save_act_lrn_models,save_act_lrn_test_sample,pred_list_act_lrn,query_variants_act_lrn,query_variables_act_lrn,...,timestamp_data,time_encoding,spatial_features,histo_bins,grey_scale,down_scale_building_images,meteo_types,history_window_meteo,normalization,standardization
0,0,False,True,True,True,True,True,spatio-temporal,rnd d_c,X_t,...,15min,ORD,histogram,100.0,False,,air_density,24.0,True,True
1,1,,,,,,,,min d_c,X_s1,...,hour,,,,,,cloud_cover,,,
2,2,,,,,,,,max d_c,X_st,...,day,,,,,,precipitation,,,
3,3,,,,,,,,avg d_c,"X_(t,s)",...,month,,,,,,radiation_surface,,,
4,4,,,,,,,,,"Y_hat_(t,s)",...,,,,,,,radiation_toa,,,
5,5,,,,,,,,,"Y_(t,s)",...,,,,,,,snow_mass,,,
6,6,,,,,,,,,,...,,,,,,,snowfall,,,
7,7,,,,,,,,,,...,,,,,,,temperature,,,
8,8,,,,,,,,,,...,,,,,,,wind_speed,,,


## 2. Numeric results

In this section, we report the percentage of the data budget that is used, the percentage of novel sensors among the queried candidate data points that are selected, and the testing losses for each of our conducted experiments.

In [19]:
# create a counter over the list of numeric results; results_list is ordered
# like the nested loops below
result_index_counter = 0

# iterate over all considered prediction types
for pred_type in PRED_TYPE_LIST:

    # iterate over all considered parameter constellations
    for parameter in PARAMETER_LIST:

        # get results df corresponding to currently iterated parameter and pred_type
        result = results_list[result_index_counter]

        # increment result index counter
        result_index_counter += 1

        # take rows 2 to 6, label them by their 'Unnamed: 0' entry and flip
        # the table so that these labels become columns; streamtime_usage
        # is not part of the summary
        results_transformed = (
            result[2:7]
            .set_index('Unnamed: 0')
            .transpose()
            .drop(['streamtime_usage'], axis=1)
        )

        # accuracy in percent: 100 * (1 - test_loss / RF_loss), where the
        # ratio is capped at 1 so that the value never drops below 0
        loss_ratio = (
            results_transformed['test_loss'] / results_transformed['RF_loss']
        )
        results_transformed['accuracy'] = (
            100 * (1 - np.minimum(1, loss_ratio))
        ).round().astype(int)

        # express usage rates as whole percentages
        for usage_column in ('sensor_usage', 'budget_usage'):
            results_transformed[usage_column] = (
                (100 * results_transformed[usage_column]).round().astype(int)
            )

        print(parameter)
        display(results_transformed)

delta0_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatial None PL train,79,100,0.419932,0.260671,38
spatial None PL val,79,100,0.419932,0.260671,38
spatial X_t rnd d_c train,80,100,0.419932,0.232299,45
spatial X_t rnd d_c val,80,100,0.419932,0.232299,45
spatial X_t min d_c train,11,100,0.419932,0.276116,34
spatial X_t min d_c val,11,100,0.419932,0.276116,34
spatial X_t max d_c train,11,100,0.419932,0.274408,35
spatial X_t max d_c val,11,100,0.419932,0.274408,35
spatial X_t avg d_c train,11,100,0.419932,0.272273,35
spatial X_t avg d_c val,11,100,0.419932,0.272273,35


delta0_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatial None PL train,79,100,0.860448,0.358573,58
spatial None PL val,79,100,0.860448,0.358573,58
spatial X_t rnd d_c train,80,100,0.860448,0.332754,61
spatial X_t rnd d_c val,80,100,0.860448,0.332754,61
spatial X_t min d_c train,11,100,0.860448,0.404185,53
spatial X_t min d_c val,11,100,0.860448,0.404185,53
spatial X_t max d_c train,11,100,0.860448,0.391305,55
spatial X_t max d_c val,11,100,0.860448,0.391305,55
spatial X_t avg d_c train,11,100,0.860448,0.404222,53
spatial X_t avg d_c val,11,100,0.860448,0.404222,53


delta1_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatial None PL train,100,100,0.770282,0.212516,72
spatial None PL val,100,100,0.770282,0.212516,72
spatial X_t rnd d_c train,100,100,0.770282,0.220407,71
spatial X_t rnd d_c val,100,100,0.770282,0.220407,71
spatial X_t min d_c train,100,100,0.770282,0.216773,72
spatial X_t min d_c val,100,100,0.770282,0.216773,72
spatial X_t max d_c train,100,100,0.770282,0.208933,73
spatial X_t max d_c val,100,100,0.770282,0.208933,73
spatial X_t avg d_c train,100,100,0.770282,0.224505,71
spatial X_t avg d_c val,100,100,0.770282,0.224505,71


delta1_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatial None PL train,100,100,0.511404,0.15928,69
spatial None PL val,100,100,0.511404,0.15928,69
spatial X_t rnd d_c train,100,100,0.511404,0.154267,70
spatial X_t rnd d_c val,100,100,0.511404,0.154267,70
spatial X_t min d_c train,100,100,0.511404,0.170737,67
spatial X_t min d_c val,100,100,0.511404,0.170737,67
spatial X_t max d_c train,100,100,0.511404,0.151425,70
spatial X_t max d_c val,100,100,0.511404,0.151425,70
spatial X_t avg d_c train,100,100,0.511404,0.148923,71
spatial X_t avg d_c val,100,100,0.511404,0.148923,71


delta0_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
temporal None PL train,79,0,0.829827,0.54151,35
temporal None PL val,79,0,0.829827,0.54151,35
temporal X_t rnd d_c train,80,0,0.829827,0.56934,31
temporal X_t rnd d_c val,80,0,0.829827,0.56934,31
temporal X_t min d_c train,11,0,0.829827,0.679796,18
temporal X_t min d_c val,11,0,0.829827,0.679796,18
temporal X_t max d_c train,12,0,0.829827,0.658713,21
temporal X_t max d_c val,12,0,0.829827,0.658713,21
temporal X_t avg d_c train,11,0,0.829827,0.665498,20
temporal X_t avg d_c val,11,0,0.829827,0.665498,20


delta0_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
temporal None PL train,79,0,1.333063,1.449833,0
temporal None PL val,79,0,1.333063,1.449833,0
temporal X_t rnd d_c train,80,0,1.333063,1.42508,0
temporal X_t rnd d_c val,80,0,1.333063,1.42508,0
temporal X_t min d_c train,11,0,1.333063,1.360505,0
temporal X_t min d_c val,11,0,1.333063,1.360505,0
temporal X_t max d_c train,11,0,1.333063,1.320415,1
temporal X_t max d_c val,11,0,1.333063,1.320415,1
temporal X_t avg d_c train,11,0,1.333063,1.340398,0
temporal X_t avg d_c val,11,0,1.333063,1.340398,0


delta1_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
temporal None PL train,100,0,1.200163,1.118887,7
temporal None PL val,100,0,1.200163,1.118887,7
temporal X_t rnd d_c train,100,0,1.200163,1.283915,0
temporal X_t rnd d_c val,100,0,1.200163,1.283915,0
temporal X_t min d_c train,100,0,1.200163,1.108572,8
temporal X_t min d_c val,100,0,1.200163,1.108572,8
temporal X_t max d_c train,100,0,1.200163,1.420246,0
temporal X_t max d_c val,100,0,1.200163,1.420246,0
temporal X_t avg d_c train,100,0,1.200163,1.100335,8
temporal X_t avg d_c val,100,0,1.200163,1.100335,8


delta1_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
temporal None PL train,100,0,1.20138,1.086269,10
temporal None PL val,100,0,1.20138,1.086269,10
temporal X_t rnd d_c train,100,0,1.20138,1.027639,14
temporal X_t rnd d_c val,100,0,1.20138,1.027639,14
temporal X_t min d_c train,100,0,1.20138,1.056781,12
temporal X_t min d_c val,100,0,1.20138,1.056781,12
temporal X_t max d_c train,100,0,1.20138,1.217591,0
temporal X_t max d_c val,100,0,1.20138,1.217591,0
temporal X_t avg d_c train,100,0,1.20138,1.133066,6
temporal X_t avg d_c val,100,0,1.20138,1.133066,6


delta0_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatio-temporal None PL train,79,100,3.334547,1.316524,61
spatio-temporal None PL val,79,100,3.334547,1.316524,61
spatio-temporal X_t rnd d_c train,80,100,3.334547,1.40362,58
spatio-temporal X_t rnd d_c val,80,100,3.334547,1.40362,58
spatio-temporal X_t min d_c train,11,100,3.334547,1.541356,54
spatio-temporal X_t min d_c val,11,100,3.334547,1.541356,54
spatio-temporal X_t max d_c train,11,100,3.334547,1.565275,53
spatio-temporal X_t max d_c val,11,100,3.334547,1.565275,53
spatio-temporal X_t avg d_c train,11,100,3.334547,1.526509,54
spatio-temporal X_t avg d_c val,11,100,3.334547,1.526509,54


delta0_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatio-temporal None PL train,79,100,1.037565,0.483955,53
spatio-temporal None PL val,79,100,1.037565,0.483955,53
spatio-temporal X_t rnd d_c train,80,100,1.037565,0.499855,52
spatio-temporal X_t rnd d_c val,80,100,1.037565,0.499855,52
spatio-temporal X_t min d_c train,11,100,1.037565,0.599947,42
spatio-temporal X_t min d_c val,11,100,1.037565,0.599947,42
spatio-temporal X_t max d_c train,11,100,1.037565,0.597273,42
spatio-temporal X_t max d_c val,11,100,1.037565,0.597273,42
spatio-temporal X_t avg d_c train,11,100,1.037565,0.609783,41
spatio-temporal X_t avg d_c val,11,100,1.037565,0.609783,41


delta1_valup0


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatio-temporal None PL train,100,100,1.492703,1.120166,25
spatio-temporal None PL val,100,100,1.492703,1.120166,25
spatio-temporal X_t rnd d_c train,100,100,1.492703,1.167029,22
spatio-temporal X_t rnd d_c val,100,100,1.492703,1.167029,22
spatio-temporal X_t min d_c train,100,100,1.492703,1.138904,24
spatio-temporal X_t min d_c val,100,100,1.492703,1.138904,24
spatio-temporal X_t max d_c train,100,100,1.492703,1.076804,28
spatio-temporal X_t max d_c val,100,100,1.492703,1.076804,28
spatio-temporal X_t avg d_c train,100,100,1.492703,1.10555,26
spatio-temporal X_t avg d_c val,100,100,1.492703,1.10555,26


delta1_valup1


Unnamed: 0,budget_usage,sensor_usage,RF_loss,test_loss,accuracy
spatio-temporal None PL train,100,100,1.095341,0.508049,54
spatio-temporal None PL val,100,100,1.095341,0.508049,54
spatio-temporal X_t rnd d_c train,100,100,1.095341,0.516228,53
spatio-temporal X_t rnd d_c val,100,100,1.095341,0.516228,53
spatio-temporal X_t min d_c train,100,100,1.095341,0.477694,56
spatio-temporal X_t min d_c val,100,100,1.095341,0.477694,56
spatio-temporal X_t max d_c train,100,100,1.095341,0.475584,57
spatio-temporal X_t max d_c val,100,100,1.095341,0.475584,57
spatio-temporal X_t avg d_c train,100,100,1.095341,0.47566,57
spatio-temporal X_t avg d_c val,100,100,1.095341,0.47566,57


## 3. Space and time selection

In [20]:
%%capture 
# prevents figures from being printed out if used at the beginning of a cell

# hyper parameters for the plots of this section
n_iter_plot = 3
# number of grid points per axis of the surface drawn in create_plot
n_meshes_surface = 10

# load the building meta data of the chosen profile set
path_to_building_meta = ''.join(
    ['../data/private/', profile_set, '/meta/meta buildings.csv']
)
building_meta = pd.read_csv(path_to_building_meta)

def create_plot():
    """Draw the space-time selection of one iteration onto the current axes.

    Takes no arguments and returns nothing. It reads the following names
    from the enclosing (notebook) scope, which must be set before the call:

    - ``ax``: the 3D axes to draw on.
    - ``df``: table with 'building lat' and 'building long' columns.
    - ``time_data``: one time value per row of ``df``, used as height and
      as colour of the scattered points.
    - ``df_new_sensors``: table with the same two coordinate columns; its
      rows are marked with red crosses.
    - ``n_meshes_surface``: number of grid points per axis of the surface.
    """

    # get bound coordinates of all buildings
    min_lat = df['building lat'].min()
    max_lat = df['building lat'].max()
    min_long = df['building long'].min()
    max_long = df['building long'].max()

    # create evenly sized arrays and meshed grid of lats and longs
    lat_surface = np.linspace(
        min_lat,
        max_lat,
        num=n_meshes_surface
    )
    long_surface = np.linspace(
        min_long,
        max_long,
        num=n_meshes_surface
    )
    long_surface, lat_surface = np.meshgrid(long_surface, lat_surface)

    # the latest time point is the height of the surface and of the
    # markers for new sensors
    max_time_point = max(time_data)
    Z = np.full((len(lat_surface), 1), max_time_point)

    # data points in space and time, coloured by their time value
    ax.scatter(
        df['building long'],
        df['building lat'],
        time_data,
        c=time_data, alpha=0.7
    )
    # new sensors, marked with red crosses at the latest time point
    ax.scatter(
        df_new_sensors['building long'],
        df_new_sensors['building lat'],
        max_time_point,
        alpha=1, marker='x', c='r', s=100
    )
    # faint surface over the bounding box of all buildings
    ax.plot_surface(
        long_surface,
        lat_surface,
        Z,
        alpha=0.03
    )
    
    
def customize_plot(first_colname):
    
    """ Styles the current 3D subplot and sets its title.
    
    Reads the notebook globals ax (3D axes to style), iteration, plot_counter
    and fig_title_list, which must be set before calling.
    
    Arguments:
        first_colname: title of the subplot if iteration == 0. For all other
            iterations, fig_title_list[plot_counter-1] is used instead.
    """
    
    # set angle
    ax.view_init(30, 103)
    
    # Get rid of the panes by making them fully transparent. Uses ax.xaxis etc.
    # because the w_xaxis/w_yaxis/w_zaxis aliases were removed in Matplotlib 3.8
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_pane_color((1.0, 1.0, 1.0, 0.0))

    # Get rid of the ticks
    ax.set_xticks([]) 
    ax.set_yticks([]) 
    ax.set_zticks([])

    # Add the labels
    ax.set_xlabel('longitude' )
    ax.set_ylabel('latitude')
    ax.set_zlabel('time')

    # shift time (z) and lat (y) axes by swapping the first two plane pairs.
    # NOTE(review): _PLANES is a private Matplotlib attribute - confirm that
    # this still has the intended effect when upgrading Matplotlib
    tmp_planes = ax.zaxis._PLANES 
    shifted_planes = ( 
        tmp_planes[2], tmp_planes[3], 
        tmp_planes[0], tmp_planes[1], 
        tmp_planes[4], tmp_planes[5]
    )
    ax.zaxis._PLANES = shifted_planes
    ax.yaxis._PLANES = shifted_planes
    
    # set subplot titles
    if iteration == 0:
        ax.set_title(first_colname)
    else:
        ax.set_title(fig_title_list[plot_counter-1])
        

# Creates one figure per prediction type, parameter constellation, AL variable
# and AL variant. Each figure row shows one query iteration, with the AL
# selection in the left and the PL selection in the right column.

# set the fontsize for figures once, as it is the same for all of them
# NOTE(review): hard-coded to 16 instead of FONTSIZE, as in the original cell
mpl.rcParams.update({'font.size': 16})

# create a legend that is shared by all figures
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='b', markersize=10, label='queried data in space-time'),
    Line2D([0], [0], marker='X', color='w', markerfacecolor='r', markersize=15, label='new sensors required')
]

# create a counter over the list of numeric results
result_index_counter = 0

# iterate over all considered prediction types
for pred_type in PRED_TYPE_LIST:
    
    # iterate over all considered parameter constellations
    for parameter in PARAMETER_LIST:
        
        # get results df corresponding to currently iterated parameter and pred_type
        spacetime_result = spacetime_selection_list[result_index_counter]
        result_index_counter += 1
        
        for AL_variable in AL_VARIABLES:
            
            for AL_variant in AL_VARIANTS:
        
                # create figure
                fig = plt.figure(figsize=(16, 24))

                # subplot position; also read by customize_plot for titles
                plot_counter = 1
                
                # sensors queried in the previous iteration, per learning type
                previous_sensors = {'AL': set(), 'PL': set()}
                
                for iteration in range(n_iter_plot):
                    
                    # AL results go to the left, PL results to the right column
                    subplot_specs = [
                        (
                            'AL', 
                            '{} {} {}'.format(pred_type, AL_variable, AL_variant), 
                            'Active learning (AL) \n a.'
                        ),
                        (
                            'PL', 
                            '{} None PL'.format(pred_type), 
                            'Passive learning (PL) \n b.'
                        ),
                    ]
                    
                    for learning_type, colname_prefix, first_title in subplot_specs:
                        
                        # get data of currently iterated query iteration
                        space_data = spacetime_result[
                            '{} - iter {} spaces'.format(colname_prefix, iteration)
                        ]
                        time_data = spacetime_result[
                            '{} - iter {} times'.format(colname_prefix, iteration)
                        ]
                        
                        # new sensors are those of this iteration that were not
                        # part of the previous iteration of the same learning type
                        current_sensors = set(space_data)
                        new_sensors_list = list(
                            current_sensors - previous_sensors[learning_type]
                        )
                        previous_sensors[learning_type] = current_sensors
                        
                        # assign lat and long to building IDs. The names df,
                        # df_new_sensors, time_data and ax are read by create_plot
                        df = pd.DataFrame({'building ID':space_data})
                        df = df.merge(building_meta, on='building ID', how='left')
                        
                        df_new_sensors = pd.DataFrame({'building ID':new_sensors_list})
                        df_new_sensors = df_new_sensors.merge(building_meta, on='building ID', how='left')
                        
                        # temporal scatter plot
                        ax = fig.add_subplot(n_iter_plot, 2, plot_counter, projection='3d')
                        create_plot()
                        customize_plot(first_title)
                        plot_counter += 1

                # create saving paths 
                saving_path = (
                    path_to_saving_spacetime 
                    + ' '.join([pred_type, parameter, AL_variable, AL_variant])
                    + '.pdf'
                )

                # set layout tight
                fig.tight_layout()

                fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5,1))

                # save figures
                fig.savefig(saving_path, bbox_inches="tight")
                
                # release the figure, as one is created per loop pass and
                # would otherwise stay open until the kernel is shut down
                plt.close(fig)

## 4. Budget vs. accuracy

In [29]:
%%capture 
# prevents figures from being printed out if used at the beginning of the cell

# Creates one figure per AL variable and AL variant. Each figure has one row
# per prediction type and one column per delta, and compares the accuracy of
# AL (blue) against PL (red) over the data selection iterations.

# set the fontsize for figures
mpl.rcParams.update({'font.size': FONTSIZE})

# column titles that replace the titles of the first subplot row
cols = [
    'δ=0 \n a.', 
    'δ=1 \n b.'
]

# legend that is shared by all figures
legend_elements = [Line2D([0], [0], color='b', label='Active learning (AL)', markersize=FONTSIZE),
                   Line2D([0], [0], color='r', label='Passive learning (PL)', markersize=FONTSIZE),
                   Line2D([0], [0], color='w', label='% = budget usage', markersize=FONTSIZE)]

for AL_variable in AL_VARIABLES:
    for AL_variant in AL_VARIANTS:

        # squeeze=False keeps ax two-dimensional, such that ax[row, column]
        # also works if only a single prediction type is considered
        fig, ax = plt.subplots(
            len(PRED_TYPE_LIST), 
            2, 
            figsize=(20, 24),
            squeeze=False
        )

        # set plot_counter for subplot title setting
        plot_counter = 0

        # create a counter over the list of numeric results
        result_index_counter = 0

        # iterate over all considered prediction types
        for index_pred, pred_type in enumerate(PRED_TYPE_LIST):

            # iterate over all considered parameter constellations
            for parameter in PARAMETER_LIST:

                # parameter strings are parsed by character position,
                # e.g. 'delta0_valup1' yields delta = 0 and valup = 1
                delta = int(parameter[5])
                valup = int(parameter[12])

                # remember and increment result index counter
                result_index = result_index_counter
                result_index_counter += 1

                # skip cases where we validate against queried data too
                if valup == 0:
                    continue

                # get results df corresponding to currently iterated parameter and pred_type
                budgetaccuracy = budgetaccuracy_list[result_index]

                # column name prefixes of PL and currently iterated AL results
                PL_prefix = '{} None PL '.format(pred_type)
                AL_prefix = '{} {} {} '.format(pred_type, AL_variable, AL_variant)

                # get PL data usage and accuracy, each with a prepended 0
                PL_data = np.append(0, budgetaccuracy[PL_prefix + 'data'].values)
                PL_accuracy = np.append(0, budgetaccuracy[PL_prefix + 'accuracy'].values)

                # get AL data usage and accuracy, each with a prepended 0
                AL_data = np.append(0, budgetaccuracy[AL_prefix + 'data'].values)
                AL_accuracy = np.append(0, budgetaccuracy[AL_prefix + 'accuracy'].values)

                # delta chooses the subplot column
                subplot = ax[index_pred, delta]

                # plot AL accuracy
                subplot.plot(
                    AL_accuracy,
                    color='b'
                )

                # annotate data usage above every second step of AL
                for x in range(1, len(AL_accuracy), 2):
                    subplot.annotate(
                        str(AL_data[x])+'%',
                        (x, AL_accuracy[x]+5)
                    )

                # plot PL accuracy.
                # Note: Moved plotting down after plotting AL, in order to have legends aligned with height of plots
                subplot.plot(
                    PL_accuracy, 
                    color='r',
                )

                subplot.set_ylim(
                    top=100
                )

                # annotate data usage below every second step of PL
                for x in range(1, len(PL_accuracy), 2):
                    subplot.annotate(
                        str(PL_data[x])+'%',
                        (x, PL_accuracy[x]-5)
                    )

                # set subplot titles
                subplot.set_title(fig_title_list[plot_counter])
                plot_counter += 1
                
                # set y-axis labels on the left column
                ax[index_pred, 0].set_ylabel(
                    '{} \n prediction accuracy'.format(pred_type), 
                    fontsize=FONTSIZE
                )

                # set x-axis labels on the bottom row
                ax[-1, delta].set_xlabel(
                    'data selection \n iteration', 
                    fontsize=FONTSIZE
                )

        # set column titles
        for axes, col in zip(ax[0], cols):
            axes.set_title(col)

        # create saving paths 
        saving_path = (
            path_to_saving_budgetvsaccuracy 
            + AL_variable
            + ' '
            + AL_variant
            + '.pdf'
        )

        # set layout tight
        fig.tight_layout()

        fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.9,1.02))

        # save figures
        fig.savefig(saving_path, bbox_inches="tight")

        # release the figure, as one is created per AL variable and variant
        plt.close(fig)

## 5. Training and validation losses

For each prediction task, each query variable and each query variant, we create figures that allow us to compare their training and validation losses throughout the process of querying new candidate data points in each iteration of the algorithm that we propose.

In [22]:
%%capture 
# prevents figures from being printed out if used at the beginning of the cell

### Define a series of manual corrections for figures ###

class ManualFigureCorrections:
    
    """ Holds the settings of one manual correction of a figure's y-axis. """
    
    
    def __init__(
        self,
        pred_type, 
        AL_variable_list, 
        parameter, 
        column, 
        y_lim_bottom, 
        y_lim_top
    ):
        
        """ Stores each argument unchanged as an attribute of the same name. """
        
        # which results the correction applies to
        self.pred_type, self.AL_variable_list, self.parameter = (
            pred_type, AL_variable_list, parameter
        )
        
        # which subplot column to correct, and to which y-axis limits
        self.column, self.y_lim_bottom, self.y_lim_top = (
            column, y_lim_bottom, y_lim_top
        )
        
# manual y-axis limits per parameter constellation and subplot column. All of
# them refer to the spatio-temporal prediction type and the same AL variables
correction_list = [
    ManualFigureCorrections(
        'spatio-temporal', 
        ['X_st', 'X_(t,s)', 'Y_(t,s)'], 
        parameter, 
        column,
        y_lim_bottom,
        y_lim_top
    )
    for parameter, column, y_lim_bottom, y_lim_top in [
        ('delta0_valup1', 0, 0, 9),
        ('delta0_valup1', 1, 0.2, 2.0),
        ('delta1_valup1', 0, 0, 3),
        ('delta1_valup1', 1, 0, 1),
    ]
]




# Creates one figure per prediction type and parameter constellation. Each
# figure has one row per AL variable, with training losses in the left and
# validation losses in the right column. The PL losses are drawn as dashed
# blue lines into every row for comparison.

# set the fontsize for figures
mpl.rcParams.update({'font.size': FONTSIZE})

# column titles that replace the titles of the first subplot row
cols = [
    'Training losses \n a.', 
    'Validation losses \n b.'
]

# create a counter over the list of numeric results
result_index_counter = 0

# iterate over all considered prediction types
for pred_type in PRED_TYPE_LIST:
    
    # iterate over all considered parameter constellations
    for parameter in PARAMETER_LIST:
        
        # get results df corresponding to currently iterated parameter and pred_type
        result = results_list[result_index_counter]
        
        # increment result index counter
        result_index_counter += 1
        
        # create the column names for PL losses
        col_name_train = '{} None PL train'.format(pred_type)
        col_name_val = '{} None PL val'.format(pred_type)
        
        # get PL results. Rows 1, 2, 3, 5 and 6 of a column are read as
        # single values, rows from 9 onwards as the loss history
        PL_train = result[col_name_train][9:].dropna().values
        t_iter = int(result[col_name_train][1])

        PL_val = result[col_name_val][9:].dropna().values
        budget_usage = result[col_name_val][2]
        sensor_usage = result[col_name_val][3]
        RF_loss = result[col_name_val][5]
        PL_loss = result[col_name_val][6]
        
        # accuracy is the loss reduction relative to RF_loss, rounded to two decimals
        PL_accuracy = round(100* (1 - PL_loss/RF_loss))/100
        
        # create the figure legends for PL losses
        legend_PL_train = 'PL random: {}s'.format(t_iter)
        
        # the accuracy entry shows PL_accuracy and not the raw PL_loss, such
        # that it is comparable to the accuracy entries of the AL legends
        legend_PL_val = 'PL random:  {:.0%} data  {:.0%} sensors  {:.0%} accuracy'.format(
            budget_usage, 
            sensor_usage,
            PL_accuracy
        )
        
        # squeeze=False keeps ax two-dimensional, such that ax[row, column]
        # also works if only a single AL variable is considered
        fig, ax = plt.subplots(
            len(AL_VARIABLES), 
            2, 
            figsize=(
                20, 
                len(AL_VARIABLES) * WIDTH_FACTOR
            ),
            squeeze=False
        )
        
        # counter to increment subtitle of figures
        title_counter = 0
        
        # iterate over all AL variables
        for index_var, AL_variable in enumerate(AL_VARIABLES):
            
            ax_train, ax_val = ax[index_var, 0], ax[index_var, 1]
            
            # plot passive learning losses
            ax_train.plot(
                PL_train, 
                color='b', 
                linestyle='--', 
                label=legend_PL_train
            )
            ax_val.plot(
                PL_val, 
                color='b', 
                linestyle='--', 
                label=legend_PL_val
            )
                       
            # iterate over all AL variants
            for AL_variant in AL_VARIANTS:
                
                # create the column names for AL losses
                col_name_train = '{} {} {} train'.format(
                    pred_type, 
                    AL_variable, 
                    AL_variant
                )
                col_name_val = '{} {} {} val'.format(
                    pred_type, 
                    AL_variable, 
                    AL_variant
                )
                
                # get AL results, using the same row layout as for PL
                train_history = result[col_name_train][9:].dropna().values
                t_iter = int(result[col_name_train][1])

                val_history = result[col_name_val][9:].dropna().values
                budget = result[col_name_val][2]
                sensor = result[col_name_val][3]
                RF_loss = result[col_name_val][5]
                AL_loss = result[col_name_val][6]
                AL_accuracy = round(100* (1 - AL_loss/RF_loss))/100
                
                # create the legends
                legend_train = 'AL {}: {}s'.format(
                    AL_variant, 
                    t_iter
                )
                legend_val = 'AL {}:  {:.0%} data  {:.0%} sensors  {:.0%} accuracy'.format(
                    AL_variant, 
                    budget, 
                    sensor,
                    AL_accuracy
                )
                
                # plot AL losses
                ax_train.plot(
                    train_history, 
                    label=legend_train
                )
                ax_val.plot(
                    val_history, 
                    label=legend_val
                )

            # set legends
            for axes in (ax_train, ax_val):
                axes.legend(
                    loc='best', 
                    frameon=False,
                    fontsize=FONTSIZE-2
                )

            # set y-axis labels
            ax_train.set_ylabel(
                'L2 loss [kW²]', 
                fontsize=FONTSIZE+3
            )
            
            # NOTE: manual y-axis limits are currently disabled. Uncomment to
            # apply the entries of correction_list:
            #
            # for correction in correction_list:
            #
            #     if correction.pred_type != pred_type:
            #         continue
            #
            #     if correction.parameter != parameter:
            #         continue
            #
            #     if AL_variable in correction.AL_variable_list:
            #
            #         ax[index_var, correction.column].set_ylim(
            #             bottom=correction.y_lim_bottom,
            #             top=correction.y_lim_top
            #         )
            
            # set titles
            ax_train.set_title(fig_title_list[title_counter])
            ax_val.set_title(fig_title_list[title_counter+1])
            title_counter += 2
            
        # set x-axis labels on the bottom row
        for axes in ax[-1]:
            axes.set_xlabel(
                'epoch', 
                fontsize=FONTSIZE+3
            )

        # set column titles
        for axes, col in zip(ax[0], cols):
            axes.set_title(col)

        # create saving paths 
        saving_path = (
            path_to_saving_lossesvsunqueried 
            + pred_type 
            + ' ' 
            + parameter 
            + '.pdf'
        )

        # set layout tight
        fig.tight_layout()

        # save figures
        fig.savefig(saving_path)
        
        # release the figure, as one is created per loop pass
        plt.close(fig)

## 6. Validation losses against initial candidates

Here, we plot the validation losses against the initial candidate data pool, both for removing candidate data points once they are queried and for keeping them. The purpose is to see how each variant of our proposed algorithm deals with biases when the initial prediction model is extended with newly queried data points. To be concise, we only compare validation losses and omit training losses.

In [23]:
%%capture 
# prevents figures from being printed out if used at the beginning of the cell

### Define a set of manual corrections for figures ###

# manual y-axis limits per subplot column. Both entries refer to the
# spatio-temporal prediction type and are not bound to a parameter (None)
correction_list = [
    ManualFigureCorrections(
        'spatio-temporal', 
        ['X_st', 'X_(t,s)', 'Y_(t,s)'], 
        None, 
        column,
        y_lim_bottom,
        y_lim_top
    )
    for column, y_lim_bottom, y_lim_top in [
        (0, 0.5, 2.3),
        (1, 0.25, 2.4),
    ]
]


# Creates one figure per prediction type. Each figure has one row per AL
# variable and shows the validation losses for delta=1 in the left and for
# delta=0 in the right column. The PL validation loss is drawn as a dashed
# blue line into every subplot for comparison.

# set the fontsize for figures
mpl.rcParams.update({'font.size': FONTSIZE})

# column titles that replace the titles of the first subplot row
cols = [
    'Validation losses with δ=1 \n a.', 
    'Validation losses with δ=0 \n b.'
]

# create a counter over the list of numeric results
result_index_counter = 0
    
# iterate over all considered prediction types
for pred_type in PRED_TYPE_LIST:
    
    # squeeze=False keeps ax two-dimensional, such that ax[row, column]
    # also works if only a single AL variable is considered
    fig, ax = plt.subplots(
        len(AL_VARIABLES), 
        2, 
        figsize=(
            20, 
            len(AL_VARIABLES) * WIDTH_FACTOR
        ),
        squeeze=False
    )

    # iterate over all considered parameter constellations
    for parameter in PARAMETER_LIST:
    
        # parameter strings are parsed by character position,
        # e.g. 'delta0_valup1' yields delta = 0 and valup = 1
        delta = int(parameter[5])
        valup = int(parameter[12])

        # get results df corresponding to currently iterated parameter and pred_type
        result = results_list[result_index_counter]
        
        # increment result index counter
        result_index_counter += 1
        
        # skip results with valup == 1, as this section only plots
        # validation against the initial candidates (valup == 0)
        if valup == 1:
            continue
        
        # delta=1 is plotted on the left, delta=0 on the right column
        plot_column = 0 if delta == 1 else 1
            
        # create the column name for PL validation loss
        col_name_val = '{} None PL val'.format(pred_type)

        # get PL results. Rows 2, 3, 5 and 6 of a column are read as
        # single values, rows from 9 onwards as the loss history
        PL_val = result[col_name_val][9:].dropna().values
        budget_usage = result[col_name_val][2]
        sensor_usage = result[col_name_val][3]
        RF_loss = result[col_name_val][5]
        PL_loss = result[col_name_val][6]
        PL_accuracy = round(100* (1 - PL_loss/RF_loss))/100
        
        # create the figure legend for PL validation losses
        legend_PL_val = 'PL random {:.0%} data  {:.0%} sensors  {:.0%} accuracy'.format(
             budget_usage, 
             sensor_usage,
             PL_accuracy
        )
        
        title_counter = 0
        
        # iterate over all AL variables
        for index_var, AL_variable in enumerate(AL_VARIABLES):
            
            subplot = ax[index_var, plot_column]

            # plot passive learning validation losses
            subplot.plot(
                PL_val, 
                color='b', 
                linestyle='--', 
                label=legend_PL_val
            )

            # iterate over all AL variants of currently iterated AL variable
            for method in AL_VARIANTS:

                # create the column name for AL validation loss
                col_name_val = '{} {} {} val'.format(
                    pred_type, 
                    AL_variable, 
                    method
                )
                
                # get AL results, using the same row layout as for PL
                val_history = result[col_name_val][9:].dropna().values
                budget = result[col_name_val][2]
                sensor = result[col_name_val][3]
                RF_loss = result[col_name_val][5]
                AL_loss = result[col_name_val][6]
                AL_accuracy = round(100* (1 - AL_loss/RF_loss))/100
                
                # create the legends
                legend_val = 'AL {}  {:.0%} data  {:.0%} sensors  {:.0%} accuracy'.format(
                    method, 
                    budget, 
                    sensor, 
                    AL_accuracy
                )
          
                # plot AL validation losses
                subplot.plot(
                    val_history, 
                    label=legend_val
                )
                
            # set legend once after all lines of the subplot are drawn
            subplot.legend(
                loc='best', 
                frameon=False,
                fontsize=FONTSIZE-2
            )

            # set y-axis labels
            subplot.set_ylabel(
                'L2 loss [kW²]', 
                fontsize=FONTSIZE+3
            )
            
            # NOTE: manual y-axis limits are currently disabled. Uncomment to
            # apply the entries of correction_list:
            #
            # for correction in correction_list:
            #
            #     if correction.pred_type != pred_type:
            #         continue
            #
            #     if AL_variable in correction.AL_variable_list:
            #
            #         ax[index_var, correction.column].set_ylim(
            #             bottom=correction.y_lim_bottom,
            #             top=correction.y_lim_top
            #         )
                
            # set titles
            ax[index_var, 0].set_title(fig_title_list[title_counter])
            ax[index_var, 1].set_title(fig_title_list[title_counter+1])
            title_counter += 2
            
        # set x-axis labels on the bottom row
        ax[-1, plot_column].set_xlabel(
            'epoch', 
            fontsize=FONTSIZE+3
        )

    # set column titles
    for axes, col in zip(ax[0], cols):
        axes.set_title(col)

    # set layout tight
    fig.tight_layout()

    # create saving paths 
    saving_path = (
        path_to_saving_lossesvsall 
        + pred_type 
        + '.pdf'
    )

    # save figures
    fig.savefig(saving_path)
    
    # release the figure, as one is created per prediction type
    plt.close(fig)

## 7. Query sequence importance

Here, we compare the training and validation losses of our active learning models against learning from the same data but in a randomized sequence. 

In [24]:
%%capture 
# prevents figures from being printed out if used at the beginning of the cell

# set the fontsize for figures
# Creates two figures per prediction type, one for valup == 0 and one for
# valup == 1. Each figure has one row per AL variable and shows validation
# losses for delta=1 in the left and for delta=0 in the right column. Solid
# lines are the AL results, dashed lines the randomized sequence results.

# set the fontsize for figures
mpl.rcParams.update({'font.size': FONTSIZE})

# create list of custom lines for custom legend
custom_lines = [
    Line2D([0], [0], color='b', linestyle="--"),
    Line2D([0], [0], color='b')
]

# create color list for plots of same AL variant to have same color
color_list = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', 
    '#8c564b',  '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
]

# column titles that replace the titles of the first subplot row
cols = [
    'Validation losses with δ=1 \n a.', 
    'Validation losses with δ=0 \n b.'
]

# create a counter over the list of numeric results
result_index_counter = 0

# iterate over all considered prediction types
for pred_type in PRED_TYPE_LIST:
    
    # squeeze=False keeps the axes arrays two-dimensional, such that
    # ax[row, column] also works if only a single AL variable is considered
    fig_valup0, ax_valup0 = plt.subplots(
        len(AL_VARIABLES), 
        2, 
        figsize=(
            20, 
            len(AL_VARIABLES) * WIDTH_FACTOR
        ),
        squeeze=False
    )
    
    fig_valup1, ax_valup1 = plt.subplots(
        len(AL_VARIABLES), 
        2, 
        figsize=(
            20, 
            len(AL_VARIABLES) * WIDTH_FACTOR
        ),
        squeeze=False
    )
    
    # iterate over all considered parameter constellations
    for parameter in PARAMETER_LIST:

        # parameter strings are parsed by character position,
        # e.g. 'delta0_valup1' yields delta = 0 and valup = 1
        delta = int(parameter[5])
        valup = int(parameter[12])
        
        # get results dfs corresponding to currently iterated parameter and pred_type
        AL_result = results_list[
            result_index_counter
        ]
        seqimportance_result = seqimportance_list[
            result_index_counter
        ]
        
        # increment result index counter
        result_index_counter += 1
        
        # delta=1 is plotted on the left, delta=0 on the right column
        plot_column = 0 if delta == 1 else 1
        
        # valup chooses the figure to draw on
        ax = ax_valup0 if valup == 0 else ax_valup1
        
        title_counter = 0
        
        # iterate over all AL variables
        for index_var, AL_variable in enumerate(AL_VARIABLES):
            
            # iterate over all AL variants
            for index_method, AL_variant in enumerate(AL_VARIANTS):
                
                # create the column name for iterated validation loss
                col_name_val = '{} {} {} val'.format(
                    pred_type, 
                    AL_variable, 
                    AL_variant
                )
                
                # get validation losses for AL
                AL_val_history = (
                    AL_result[col_name_val][9:].dropna().values
                )
                
                # get validation losses with randomized sequence tests
                seqimportance_val_history = (
                    seqimportance_result[col_name_val][1:].dropna().values
                )
                
                # same color for both lines of one AL variant. Colors repeat
                # if there are more AL variants than entries in color_list
                color = color_list[index_method % len(color_list)]
                
                # plot iterated losses
                ax[index_var, plot_column].plot(
                    AL_val_history, 
                    color=color
                )
                ax[index_var, plot_column].plot(
                    seqimportance_val_history, 
                    color=color, 
                    linestyle="--"
                )
                
            # set y-axis labels
            ax[index_var, 0].set_ylabel(
                'L2 loss [kW²]', 
                fontsize=FONTSIZE+3
            )
                
            # set titles
            ax[index_var, 0].set_title(fig_title_list[title_counter])
            ax[index_var, 1].set_title(fig_title_list[title_counter+1])
            title_counter += 2
                
            # set legends
            for axes in ax[index_var]:
                axes.legend(
                    custom_lines, 
                    ['random sequence', 'AL sequence'], 
                    loc="best", 
                    frameon=False
                )
                
        # set x-axis labels on the bottom row
        for axes in ax[-1]:
            axes.set_xlabel(
                'epoch', 
                fontsize=FONTSIZE+3
            )
            
        # set column titles
        for axes, col in zip(ax[0], cols):
            axes.set_title(col)


    # create saving paths 
    saving_path_valup0 = (
        path_to_saving_seqimportance 
        + pred_type
        + ' valup0.pdf'
    )

    saving_path_valup1 = (
        path_to_saving_seqimportance 
        + pred_type 
        + ' valup1.pdf'
    )

    # set layout tight
    fig_valup0.tight_layout()
    fig_valup1.tight_layout()

    # save figures
    fig_valup0.savefig(saving_path_valup0)
    fig_valup1.savefig(saving_path_valup1)
    
    # release both figures, as a new pair is created per prediction type
    plt.close(fig_valup0)
    plt.close(fig_valup1)

## 8. Exemplar predictions

For each experiment, we have saved a sample of 1,000 randomly chosen data points that were not queried by our passive learning or active learning algorithm when these terminated. Here, we load five exemplar points at random from these samples for each of our conducted experiments and compare the predictions of each of our three models. The first column represents the performance of the initial model prior to being further trained on any data from the candidate data pool. The second column represents the performance of our passively trained benchmark model. The third column represents the performance of our actively trained prediction model.

What we ideally want to observe visually is that the prediction performance of our actively trained models is approximately as good as the performance of our passively trained models, as we are able to use fewer data points and sensors with our active learning models. 

In [25]:
%%capture 
# prevents figures from being printed out; must be placed at the beginning of the cell

# Plot exemplar predictions of the initial, passively trained (PL) and
# actively trained (AL) models. One figure is created and saved for every
# combination of prediction type, parameter constellation, AL variable and
# AL variant.

# choose how many random datapoints to predict and plot
N_DATAPOINTS = 5

# number of columns to plot data points. Keep this 3
# (one column each for the initial, PL and AL model)
n_cols = 3

# choose a different font size for these figures
FONTSIZE = 14

# set the fontsize
mpl.rcParams.update({'font.size': FONTSIZE})

# column names for the first row of subplots; identical for every figure
colname_list = [
    'Initial model \n a.',
    'Passive learning \n b.',
    'Active learning \n c.'
]

# iterate over all considered prediction types
for index_pred, pred_type in enumerate(PRED_TYPE_LIST):

    # iterate over all considered parameter constellations
    for index_param, parameter in enumerate(PARAMETER_LIST):

        # single digits read from fixed character positions of the
        # parameter string
        # NOTE(review): presumably names look like 'delta<d> valup<v>';
        # confirm against PARAMETER_LIST
        delta = int(parameter[5])
        valup = int(parameter[12])

        # provide paths to initial and PL models and samples
        path_to_initial_model = (
            path_to_models
            + parameter
            + '/'
            + pred_type
            + '/'
            + initial_model_filename
        )
        path_to_PL_model = (
            path_to_models
            + parameter
            + '/'
            + pred_type
            + '/'
            + PL_model_filename
        )
        path_to_PL_data = (
            path_to_samples
            + parameter
            + '/'
            + pred_type
            + '/'
            + 'PL_'
        )

        # import models and samples for PL
        initial_model = tf.keras.models.load_model(
            path_to_initial_model,
            compile=False
        )
        PL_model = tf.keras.models.load_model(
            path_to_PL_model,
            compile=False
        )

        X_t = np.load(path_to_PL_data + 'X_t.npy')
        # X_s is loaded as before but is not fed to the models below
        X_s = np.load(path_to_PL_data + 'X_s.npy')
        X_s1 = np.load(path_to_PL_data + 'X_s1.npy')
        X_st = np.load(path_to_PL_data + 'X_st.npy')
        Y_pl = np.load(path_to_PL_data + 'Y.npy')

        # make predictions; the initial model is evaluated on the PL sample
        initial_predictions = initial_model.predict(
            [X_t, X_s1, X_st]
        )
        PL_predictions = PL_model.predict(
            [X_t, X_s1, X_st]
        )

        # iterate over all AL variables
        for index_var, AL_variable in enumerate(AL_VARIABLES):

            # iterate over all AL variants
            for index_method, AL_variant in enumerate(AL_VARIANTS):

                # create figure
                fig, ax = plt.subplots(
                    N_DATAPOINTS,
                    n_cols,
                    sharex=True,
                    figsize=(16, N_DATAPOINTS * 4)
                )

                AL_model_filename = '{} {}.h5'.format(
                    AL_variable,
                    AL_variant
                )
                AL_sample_name = '{} {} '.format(
                    AL_variable,
                    AL_variant
                )

                # provide paths to AL model and samples
                path_to_AL_model = (
                    path_to_models
                    + parameter
                    + '/'
                    + pred_type
                    + '/'
                    + AL_model_filename
                )
                path_to_AL_data = (
                    path_to_samples
                    + parameter
                    + '/'
                    + pred_type
                    + '/'
                    + AL_sample_name
                )

                # import models and samples for AL; this overwrites the
                # PL feature arrays, which are no longer needed because
                # the PL predictions were already computed above
                AL_model = tf.keras.models.load_model(
                    path_to_AL_model,
                    compile=False
                )

                X_t = np.load(path_to_AL_data + 'X_t.npy')
                # X_s is loaded as before but is not fed to the model below
                X_s = np.load(path_to_AL_data + 'X_s.npy')
                X_s1 = np.load(path_to_AL_data + 'X_s1.npy')
                X_st = np.load(path_to_AL_data + 'X_st.npy')
                Y_al = np.load(path_to_AL_data + 'Y.npy')

                # make predictions
                AL_predictions = AL_model.predict(
                    [X_t, X_s1, X_st]
                )

                # draw random indices of the points to plot, independently
                # for each column
                # NOTE(review): np.random.choice samples with replacement
                # by default, so a point may appear twice in one column
                rnd_index_array_initial = np.random.choice(
                    np.arange(len(Y_pl)),
                    N_DATAPOINTS
                )
                rnd_index_array_PL = np.random.choice(
                    np.arange(len(Y_pl)),
                    N_DATAPOINTS
                )
                rnd_index_array_AL = np.random.choice(
                    np.arange(len(Y_al)),
                    N_DATAPOINTS
                )

                # (predictions, targets, random indices) for each column
                column_data = [
                    (initial_predictions, Y_pl, rnd_index_array_initial),
                    (PL_predictions, Y_pl, rnd_index_array_PL),
                    (AL_predictions, Y_al, rnd_index_array_AL),
                ]

                title_counter = 0

                # iterate over each row of figure
                for row in range(N_DATAPOINTS):

                    # plot the prediction first and the target second in
                    # every subplot, so line colors match across columns
                    for col, (predictions, targets, rnd_index_array) in (
                        enumerate(column_data)
                    ):
                        point_index = rnd_index_array[row]
                        predicted_line = ax[row, col].plot(
                            predictions[point_index]
                        )[0]
                        true_line = ax[row, col].plot(
                            targets[point_index]
                        )[0]

                    # set title
                    for col in range(n_cols):
                        ax[row, col].set_title(fig_title_list[title_counter])
                        title_counter += 1

                # add a figure legend; handles and labels are both passed
                # as keywords and as Line2D artists, so each label is
                # paired with the line it actually describes
                fig.legend(
                    handles=[true_line, predicted_line],
                    labels=['true load profile', 'predicted load profile'],
                    loc='upper center',
                    bbox_to_anchor=(0.8, 0.95)
                )

                # set col names (replaces the titles of the first row)
                for axes, colname in zip(ax[0], colname_list):
                    axes.set_title(colname)

                # set one y- and x-axis for all sub plots by overlaying a
                # frameless axes that only carries the shared labels
                shared_ax = fig.add_subplot(111, frame_on=False)
                shared_ax.tick_params(
                    labelcolor="none",
                    bottom=False,
                    left=False
                )
                shared_ax.set_xlabel(
                    'time [15-min]',
                    fontsize=FONTSIZE+3
                )
                shared_ax.set_ylabel(
                    'building electric consumption [kW]',
                    fontsize=FONTSIZE+3
                )

                filename = '{} {} {} delta{} valup{}.pdf'.format(
                    pred_type,
                    AL_variable,
                    AL_variant,
                    delta,
                    valup
                )

                saving_path = (
                    path_to_exemplar_predictions
                    + filename
                )

                fig.savefig(saving_path)

                # release the figure; otherwise every figure created in
                # these nested loops stays registered with pyplot
                plt.close(fig)

KeyboardInterrupt: 