In [1]:
import torch
import pandas as pd 
import os
import glob 
import numpy as np 

from utilities_DL import get_DataSet_and_invalid_dates,get_MultiModel_loss_args_emb_opts
from DL_class import MultiModelTrainer, Trainer, PI_object
from config import get_args
from save_results import build_results_df
from paths import folder_path,file_name,get_save_directory

Training and Hyper-parameter tuning with Ray is not possible


In [2]:
def class2str(label,dic_class2rpz):
    rpz = dic_class2rpz[label]
    d,h = rpz[0][0], rpz[1][0][0]
    days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
    return(f'{days[d]} - {h}:00')

def update_args_and_get_path(args,loss,embedding,transfer,multi_emb,str_path0,str_path1):
    args.time_embedding = embedding
    args.multi_embedding = multi_emb
    args.TE_transfer = transfer
    if loss == 'MSE':
        args.out_dim = 1
        args.loss_function_type = loss
    else:
        args.out_dim = 2
        args.loss_function_type = 'quantile'

    if embedding:
        path = f'save/{loss}/{str_path0}/TE_transfer_{args.TE_transfer}/Multi_Emb{args.multi_embedding}/{str_path1}/'
    else:
        path = f'save/{loss}/{str_path0}/no_embedding/' 
    #df = pd.read_csv(f'{path}results_by_fold.csv',index_col = 0)
    #epoch = df[df.fold == fold].sort_values('valid_loss').iloc[0,:].epoch
    return(args,path)

def get_list_object(args,loss,embedding,transfer,multi_emb,str_path0,str_path1):
    args,path = update_args_and_get_path(args,loss,embedding,transfer,multi_emb,str_path0,str_path1) 
    dataset,invalid_dates = get_DataSet_and_invalid_dates(folder_path,file_name,args.W,args.D,args.H,args.step_ahead,single_station = args.single_station)
    (Datasets,DataLoader_list,time_slots_labels,dic_class2rpz,dic_rpz2class,nb_words_embedding) =  dataset.split_K_fold(args,invalid_dates)
    # Load associated K_folds Models: 
    (loss_function,Model_list,Optimizer_list,args_embedding) = get_MultiModel_loss_args_emb_opts(args,nb_words_embedding,dic_class2rpz,n_vertex = len(Datasets[0].columns))
    
    return(Model_list,Datasets,DataLoader_list,time_slots_labels,args,path,dic_class2rpz,dic_rpz2class,Optimizer_list,args_embedding,loss_function)
    
def load_prediction(trainer,dataset,dataloader,args):
    data=  [[x,y,t] for x,y,t in dataloader['test']]
    X = torch.cat([x for x,_,_ in data]).to(args.device)
    Y = torch.cat([y for _,y,_ in data]).to(args.device)
    T = torch.cat([t for _,_,t in data]).to(args.device)
    Preds,Y,T = trainer.testing(dataset,False,'test',X, Y,T)
    return(Preds,Y,T)

def update_gain(Total_gain,args,Preds,Y,T,loss_function,fold):
    gain = pd.DataFrame()
    for k in range(168):
        mask = torch.where(T==k)
        preds,y = Preds[mask],Y[mask]
        if args.loss_function_type == 'quantile':
            pi = PI_object(preds,y,args.alpha,type_calib = 'classic')
            picp,mpiw = pi.picp,pi.mpiw
            loss = loss_function(y,preds)
        else:
            picp,mpiw = None, None
            loss = loss_function(y,preds)

        dict_row = dict(loss_function = args.loss_function_type,label = k,picp = picp, mpiw = mpiw, loss = loss.item(),fold = fold, TE = args.time_embedding, Transfer = args.TE_transfer, MultiEmb = args.multi_embedding)
        df_add = pd.DataFrame.from_records([dict_row])
        gain = pd.concat([gain, df_add])

    Total_gain = pd.concat([Total_gain, gain])
    return(Total_gain)

def load_K_models_and_get_metrics(Total_gain,args,loss,embedding,transfer,multi_emb,str_path0,str_path1):
    (Model_list,Datasets,DataLoader_list,time_slots_labels,args,path,dic_class2rpz,dic_rpz2class,Optimizer_list,args_embedding,loss_function) = get_list_object(args,loss,embedding,transfer,multi_emb,str_path0,str_path1)
    for fold in range(args.K_fold):
        model_path = f'{path}fold{fold}/best_model.pkl'
        model,optimizer,dataloader,dataset = Model_list[fold], Optimizer_list[fold],DataLoader_list[fold],Datasets[fold]

        # Load checkpoint 
        saved_checkpoint = torch.load(model_path)
        model.load_state_dict(saved_checkpoint['state_dict'])
        trainer = Trainer(dataset,model,dataloader,args,optimizer,loss_function,args_embedding  =args_embedding,dic_class2rpz = dic_class2rpz)

        (Preds,Y,T) = load_prediction(trainer,dataset,dataloader,args)
        Total_gain = update_gain(Total_gain,args,Preds,Y,T,loss_function,fold)
    return(Total_gain,dic_rpz2class,dic_class2rpz)

def agg_df(df):
    df_agg = df.groupby(['label','TE','Transfer','MultiEmb'],dropna = False).mean(numeric_only = True)
    df_agg = df_agg.reset_index() 
    return df_agg

def f_gain(row,ref,metric):
    if np.isnan(row[metric]):
        return(np.nan)
    else:
        return((1-row[metric]/ref)*100)

In [3]:
model_name = 'STGCN' #'CNN' 
args = get_args(model_name)

# Modif: 
args.H = 6
args.W = 1
args.D = 1
args.L =args.H+args.W+args.D
args.single_station = False
args.calendar_class = 3
args.embedding_dim = 3
args.type_calendar = 'tuple'
args.K_fold = 5

str_path1 = f'FC1_17_8_FC2_8_4/Emb_dim{args.embedding_dim}/Specific_lr_False/CalendarClass{args.calendar_class}/position_input'

Total_gain = pd.DataFrame()

loss = 'MSE' #'Quantile Loss'
for loss in ['Quantile Loss','MSE']:
    if loss == 'MSE':
        epoch = 300
        str_path0 = f'STGCN/K_fold{args.K_fold}/H6_D1_W1/graph_conv_sym_norm_lap/act_glu_Ks2/E{epoch}_lr0.0001_B64_train_valid_calib_0.60.20.5'
    else:
        epoch = 350
        str_path0 = f'STGCN/K_fold{args.K_fold}/H6_D1_W1/graph_conv/sym_norm_lap/act_glu_Ks2/optadamw/train_valid_calib_0.60.20.5/E{epoch}_lr0.0001_B64'
    for embedding in [True,False]:
        if embedding is True:
            for transfer in [True,False]:
                for multi_emb in [True,False]:
                    print(f'Embedding ON. Loss {loss} - Transfer {transfer} - Multi-Emb {multi_emb} \n')
                    Total_gain,dic_rpz2class,dic_class2rpz = load_K_models_and_get_metrics(Total_gain,args,loss,embedding,transfer,multi_emb,str_path0,str_path1)
             
        else:
            transfer,multi_emb = None,None
            print(f'Embedding OFF - Loss {loss} \n')
            Total_gain,dic_rpz2class,dic_class2rpz = load_K_models_and_get_metrics(Total_gain,args,loss,embedding,transfer,multi_emb,str_path0,str_path1)

Embedding ON. Loss Quantile Loss - Transfer True - Multi-Emb True 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding ON. Loss Quantile Loss - Transfer True - Multi-Emb False 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding ON. Loss Quantile Loss - Transfer False - Multi-Emb True 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding ON. Loss Quantile Loss - Transfer False - Multi-Emb False 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding OFF - Loss Quantile Loss 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding ON. Loss MSE - Transfer True - Multi-Emb True 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per hour: 4.0
Embedding ON. Loss MSE - Transfer True - Multi-Emb False 

coverage period: 2019-01-01 00:00:00 - 2020-01-01 00:00:00
Time-step per

In [9]:
df_gain_mean_fold = pd.DataFrame()
for label in range(len(Total_gain.label.unique())):
    for loss_function in ['quantile','MSE']:
        df = Total_gain[(Total_gain.label == label) &
                            (Total_gain.loss_function == loss_function)]
        df_agg = agg_df(df)
        
        for metric in ['mpiw','loss']:
            ref = df_agg[df_agg['TE'] == False][metric]
            new_columns = df_agg.apply(lambda row: f_gain(row,ref,metric),axis = 1)
            df_tmps = df_agg.copy()
            
            df_tmps[f'{metric}-gain'] = new_columns
            df_tmps['loss_function'] = loss_function
        df_gain_mean_fold = pd.concat([df_gain_mean_fold,df_tmps])
    
df_gain_mean_fold = df_gain_mean_fold.drop(columns = ['fold'])
df_gain_mean_fold['config'] = df_gain_mean_fold.apply(lambda row: f'transfer {row.Transfer}-MultiEmb {row.MultiEmb}' if row.TE else 'No TE',axis = 1)
df_gain_mean_fold = df_gain_mean_fold.drop(columns = ['TE','Transfer','MultiEmb'])    

In [49]:
df_gain_mean_fold

Unnamed: 0,label,picp,mpiw,loss,loss-gain,loss_function,config
0,0,0.869219,24.756232,24.354079,0.000000,quantile,No TE
1,0,0.915938,25.286232,25.040818,-2.819812,quantile,transfer False-MultiEmb False
2,0,0.885469,25.337924,25.355043,-4.110048,quantile,transfer False-MultiEmb True
3,0,0.912344,25.762361,25.320701,-3.969034,quantile,transfer True-MultiEmb False
4,0,0.913438,27.249672,26.772940,-9.932057,quantile,transfer True-MultiEmb True
...,...,...,...,...,...,...,...
0,167,,,2450.537720,0.000000,MSE,No TE
1,167,,,1870.018848,23.689449,MSE,transfer False-MultiEmb False
2,167,,,2083.946643,14.959618,MSE,transfer False-MultiEmb True
3,167,,,1912.974438,21.936544,MSE,transfer True-MultiEmb False


In [48]:
import bokeh 
from bokeh.plotting import figure, show, output_file, save,output_notebook
from bokeh.models import ColumnDataSource
from bokeh.palettes import Dark2
from bokeh.layouts import layout,row,column
for loss_function in df_gain_mean_fold.loss_function.unique():
    df_plot = df_gain_mean_fold[df_gain_mean_fold.loss_function == loss_function]
    df_plot = df_plot.pivot_table(index=  'label', columns = 'config',values = 'loss-gain')
    df_source=  df_plot.reset_index()
    [28,124,220,316,412,508,604,80,176,272,368,464,560,656]
    df_source['str'] = df_source.apply(lambda row : class2str(row.label,dic_class2rpz),axis = 1)
    source = ColumnDataSource(df_source)
    p = figure(title=f"Gain on {loss_function} Loss, for each Config and each Time Slot",
               x_axis_label='Class',
               y_axis_label='Gain (%)')

    quad_born = [[7,  20],[31,  44],[55,  68],[79,  92],[103, 116],[127, 140],[151,
       164]]
    for x1,x2 in quad_born:
        p.quad(left=x1, right=x2, bottom=min(df_gain_mean_fold['loss-gain'].min(), 0), top=df_gain_mean_fold['loss-gain'].max(), 
       fill_color='green', fill_alpha=0.2)
        
    for k,c in enumerate(df_plot.columns):
        p.line(x='label', y=c, source=source,color = Dark2[8][k],legend_label = c)

    # Configuration du HoverTool
    hover = HoverTool()
    hover.tooltips = [("Class: ", "@str")
                     ]
    p.add_tools(hover)
    p.legend.location = "top_right"
    
    output_notebook()
    show(p)

In [54]:
Total_gain.groupby(['loss_function','TE','Transfer','MultiEmb'],dropna = False).mean(numeric_only=True)[['loss']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,loss
loss_function,TE,Transfer,MultiEmb,Unnamed: 4_level_1
MSE,False,,,2356.981286
MSE,True,False,False,2054.566682
MSE,True,False,True,2089.052154
MSE,True,True,False,2112.243663
MSE,True,True,True,2198.838882
quantile,False,,,69.237646
quantile,True,False,False,68.43069
quantile,True,False,True,68.880643
quantile,True,True,False,69.075619
quantile,True,True,True,68.908869
