# Points à vérifier dans mon code: 

In [1]:
# ========= PERSONAL_INPUT =========
# >>>> 
# >>>> PB_1 : Il faut absolument pouvoir prendre en compte les invalid dates de tous les jeux de données.
# >>>> 1. Il faut le faire en amont, i.e. lorsqu'on load les données, on load aussi les invalid dates associées.
# >>>> 2. Puis on fait l'union des invalid dates.
# >>>> 3. Après ça, on fait le train/valid/test split et on normalise tout.
# >>>> 
# >>>> PB_2: Idem pour le coverage pour que ça corresponde à tout le monde.  
# >>>> 
# >>>> PB_3 : Génère des inputs de tailles différentes selon l'historique demandé.
# >>>> Cela ne va pas convenir si jamais je travaille avec du calendar class ou autre.
# >>>> 1. On peut donc enregistrer l'ensemble des invalid dates.
# >>>> 2. Créer les séquences/input_tensor associés.
# >>>> 3. Pour chaque donnée, on ne prend peut-être pas en compte les mêmes historiques.
# >>>>    Donc on peut faire une liste de dates 'D' correspondant à l'intersection de toutes les targets (df_verif[-1] ...?)
# >>>> 4. Puis on extrait l'ensemble des séquences de chaque Data sur D.


### Résultats de normalisation qu'on est censé conserver :
**Tackling Subway Data:**
- U_train min:  tensor(0.) U_train max:  tensor(1.)
- U_valid min:  tensor(0.0447) U_valid max:  tensor(4.1455)
- U_test min:  tensor(0.) U_test max:  tensor(4.1892)

**Tackling NetMob Data:** 
- U_train min:  tensor(0.) U_train max:  tensor(1.)
- U_valid min:  tensor(-0.1006) U_valid max:  tensor(1.0808)
- U_test min:  tensor(-0.1259) U_test max:  tensor(1.1083)

# Entraînement avec Subway pour voir si tout est bien conservé

In [1]:
from dataset import TrainValidTest_Split_Normalize,PersonnalInput
from loader import DictDataLoader
from trainer import Trainer 

from plotting_bokeh import generate_bokeh
from calendar_class import get_time_slots_labels
import os
import pandas as pd 
import torch
import pickle 
import numpy as np 

from paths import folder_path,file_name
from config import get_args
from utilities_DL import get_DataSet_and_invalid_dates, match_period_coverage_with_netmob,get_loss,load_model_and_optimizer,get_model_loss_args_emb_opts


def load_subway_in(folder_path, file_name, args, coverage):
    """Load the subway-in data and wrap it in a preprocessed ``PersonnalInput``.

    Args:
        folder_path: Forwarded to ``get_DataSet_and_invalid_dates``.
        file_name: Forwarded to ``get_DataSet_and_invalid_dates``.
        args: Configuration object; this function reads ``abs_path``, ``W``,
            ``D``, ``H``, ``step_ahead``, ``train_prop``, ``valid_prop`` and
            ``test_prop`` from it.
        coverage: Passed as ``coverage_period`` to the loader.

    Returns:
        Tuple ``(subway_ds, dataset, invalid_dates)``: the preprocessed
        ``PersonnalInput`` plus the two objects returned by the loader.
    """
    raw_dataset, bad_dates = get_DataSet_and_invalid_dates(
        args.abs_path, folder_path, file_name,
        args.W, args.D, args.H, args.step_ahead,
        single_station=False,
        coverage_period=coverage,
    )

    # Keyword arguments gathered once so the constructor call stays readable.
    input_kwargs = dict(
        tensor=raw_dataset.raw_values,
        dates=raw_dataset.df_dates,
        time_step_per_hour=raw_dataset.time_step_per_hour,
        Weeks=args.W,
        Days=args.D,
        historical_len=args.H,
        step_ahead=args.step_ahead,
        minmaxnorm=True,
        dims=[0],
    )
    wrapped = PersonnalInput(bad_dates, args, **input_kwargs)
    wrapped.preprocess(args.train_prop, args.valid_prop, args.test_prop)
    return wrapped, raw_dataset, bad_dates

def load_calendar(subway_ds):
    """Split the calendar-class label tensors with the subway split limits.

    For each calendar class 0..3, the label tensor returned by
    ``get_time_slots_labels`` is split into train/valid/test using the
    ``first_*_U`` / ``last_*_U`` limits stored on
    ``subway_ds.tensor_limits_keeper``. No normalizer is applied
    (``normalizer=None``).

    Args:
        subway_ds: Dataset passed to ``get_time_slots_labels``; must expose
            ``tensor_limits_keeper``.

    Returns:
        Tuple ``(train, valid, test, dic_class2rpz, dic_rpz2class,
        nb_words_embedding)`` where the first three items are dicts keyed by
        calendar class and the last three come from ``get_time_slots_labels``.
    """
    labels_by_class, dic_class2rpz, dic_rpz2class, nb_words_embedding = get_time_slots_labels(subway_ds)
    limits = subway_ds.tensor_limits_keeper

    splits = {'train': {}, 'valid': {}, 'test': {}}
    for calendar_class in range(4):
        splitter = TrainValidTest_Split_Normalize(
            labels_by_class[calendar_class],
            first_train=limits.first_train_U, last_train=limits.last_train_U,
            first_valid=limits.first_valid_U, last_valid=limits.last_valid_U,
            first_test=limits.first_test_U, last_test=limits.last_test_U,
        )
        train_ds, valid_ds, test_ds = splitter.split_normalize_tensor_datasets(normalizer=None)
        train_t, valid_t, test_t = train_ds.tensor, valid_ds.tensor, test_ds.tensor
        splits['train'][calendar_class] = train_t
        splits['valid'][calendar_class] = valid_t
        splits['test'][calendar_class] = test_t

    return (splits['train'], splits['valid'], splits['test'],
            dic_class2rpz, dic_rpz2class, nb_words_embedding)


def load_netmob_data(dataset,invalid_dates,args,save_folder,ref_subway = None):
    '''Load the NetMob tensors and wrap them in a preprocessed ``PersonnalInput``.

    One ``station_<id>.pt`` file is loaded from ``save_folder`` for every
    identifier in ``ref_subway.COD_TRG``. The per-station tensors are stacked
    on axis 0 and the first two axes are then swapped, giving the layout
    documented by the original author: NetMob tensor ``[T, N, C, H, W]``.
    Statistics are retrieved over ``dims = [0, 3, 4]``.

    If loading fails for any reason (including ``ref_subway`` being ``None``,
    its default), a random tensor of shape ``(dataset.length, 40, 2, 3, 3)``
    is used instead and the cause is printed.

    Args:
        dataset: Object providing ``length``, ``df_dates`` and
            ``time_step_per_hour``.
        invalid_dates: Forwarded to ``PersonnalInput``.
        args: Configuration object; reads ``W``, ``D``, ``H``, ``step_ahead``,
            ``train_prop``, ``valid_prop`` and ``test_prop``.
        save_folder: Path prefix prepended as-is to ``station_<id>.pt``
            (no separator is inserted).
        ref_subway: Object exposing ``COD_TRG``, an iterable of station
            identifiers. Defaults to ``None``.

    Returns:
        The preprocessed NetMob ``PersonnalInput``.
    '''
    try:
        station_tensors = [torch.load(f"{save_folder}station_{station}.pt") for station in ref_subway.COD_TRG]
        netmob_T = torch.stack(station_tensors)
        # Stacking puts stations on axis 0; swap the first two axes.
        netmob_T = netmob_T.permute(1,0,*range(2, netmob_T.dim()))

    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate; the best-effort fallback is kept.
        netmob_T = torch.randn(dataset.length,40,2,3,3)  # (7400,40,67,22,22)
        print("Load des données NetMob .pt impossible. Création d'un random Tensor")
        print(f'Cause: {type(err).__name__}: {err}')

    print('Init NetMob Dataset: ', netmob_T.size())
    print('Number of Nan Value: ',torch.isnan(netmob_T).sum())
    print('Total Number of Elements: ', netmob_T.numel(),'\n')

    NetMob_ds = PersonnalInput(invalid_dates,args, tensor = netmob_T, dates = dataset.df_dates,
                           time_step_per_hour = dataset.time_step_per_hour,Weeks = args.W, Days = args.D, historical_len = args.H,step_ahead = args.step_ahead,minmaxnorm = True,dims =[0,3,4])
    NetMob_ds.preprocess(args.train_prop,args.valid_prop,args.test_prop)

    return(NetMob_ds)


def add_contextual_data(dataset_names,args,subway_ds,NetMob_ds,dict_calendar_U_train,dict_calendar_U_valid,dict_calendar_U_test):
    """Attach the contextual tensors to ``subway_ds`` and build its dataloader.

    A ``calendar_<class>`` entry is always created for every key of
    ``dict_calendar_U_train``. A ``netmob`` entry is appended only when
    ``'netmob'`` is in ``dataset_names``. Each entry is a dict with the keys
    ``'train'``, ``'valid'`` and ``'test'``.

    Args:
        dataset_names: Container of dataset names; checked for ``'calendar'``
            and ``'netmob'``.
        args: Configuration object; reads ``calibration_calendar_class`` and,
            when ``'calendar'`` is requested, ``calendar_class``.
        subway_ds: Dataset that receives ``contextual_tensors`` and on which
            ``get_dataloader()`` is called.
        NetMob_ds: Object exposing ``U_train``, ``U_valid`` and ``U_test``;
            only read when ``'netmob'`` is requested.
        dict_calendar_U_train: Calendar tensors keyed by calendar class.
        dict_calendar_U_valid: Same keys, validation split.
        dict_calendar_U_test: Same keys, test split.

    Returns:
        Tuple ``(subway_ds, positions)`` where ``positions`` maps
        ``'calibration_calendar'`` (always), ``'calendar'`` and ``'netmob'``
        (when requested) to the index of the entry in ``contextual_tensors``.
    """
    # Calendar entries come first, in the key order of the train dict.
    contextual_tensors = {}
    for calendar_class in dict_calendar_U_train:
        contextual_tensors[f'calendar_{calendar_class}'] = {
            'train': dict_calendar_U_train[calendar_class],
            'valid': dict_calendar_U_valid[calendar_class],
            'test': dict_calendar_U_test[calendar_class],
        }

    def _position_of(entry_name):
        # Index of the entry in insertion order; raises ValueError if absent.
        return list(contextual_tensors).index(entry_name)

    positions = {
        'calibration_calendar': _position_of(f'calendar_{args.calibration_calendar_class}'),
    }

    if 'calendar' in dataset_names:
        positions['calendar'] = _position_of(f'calendar_{args.calendar_class}')

    if 'netmob' in dataset_names:
        contextual_tensors['netmob'] = {
            'train': NetMob_ds.U_train,
            'valid': NetMob_ds.U_valid,
            'test': NetMob_ds.U_test,
        }
        positions['netmob'] = _position_of('netmob')

    subway_ds.contextual_tensors = contextual_tensors
    subway_ds.get_dataloader()

    return subway_ds, positions

def load_everything(dataset_names,folder_path,file_name,args,coverage,data_folder_path):
    """Load every requested data source and wire it into the subway dataset.

    Steps, in order: load the subway-in data, split the calendar tensors
    (always needed for calibration), optionally load NetMob, then attach the
    contextual tensors and build the dataloader.

    Side effects on ``args``: sets ``time_embedding`` (True only when
    ``'calendar'`` is requested) and ``contextual_positions``.

    Args:
        dataset_names: Container of dataset names; checked for ``'calendar'``
            and ``'netmob'``.
        folder_path: Forwarded to ``load_subway_in``.
        file_name: Forwarded to ``load_subway_in``.
        args: Configuration object, mutated as described above.
        coverage: Forwarded to ``load_subway_in``.
        data_folder_path: Passed to ``load_netmob_data`` as its save folder.

    Returns:
        Tuple ``(args, subway_ds, positions, nb_words_embedding,
        dic_class2rpz)``. The last two are ``None`` when ``'calendar'`` is not
        requested.
    """
    subway_ds, dataset, invalid_dates = load_subway_in(folder_path, file_name, args, coverage)

    # Calendar tensors are always computed: they are used for calibration.
    (calendar_train, calendar_valid, calendar_test,
     dic_class2rpz, dic_rpz2class, nb_words_embedding) = load_calendar(subway_ds)

    # The calendar is a training input (time embedding) only when requested.
    use_calendar = 'calendar' in dataset_names
    if not use_calendar:
        dic_class2rpz, dic_rpz2class, nb_words_embedding = None, None, None
    args.time_embedding = use_calendar

    # NOTE(review): ref_subway is always None here, so load_netmob_data cannot
    # read ref_subway.COD_TRG and falls back to its random tensor. Confirm
    # which station reference should be passed.
    NetMob_ds = (
        load_netmob_data(dataset, invalid_dates, args, data_folder_path, ref_subway=None)
        if 'netmob' in dataset_names
        else None
    )

    subway_ds, positions = add_contextual_data(
        dataset_names, args, subway_ds, NetMob_ds,
        calendar_train, calendar_valid, calendar_test,
    )
    args.contextual_positions = positions

    return args, subway_ds, positions, nb_words_embedding, dic_class2rpz

def get_small_ds(small_ds,coverage,args):
    """Optionally shrink the run to a small debugging dataset.

    When ``small_ds`` is truthy, ``coverage`` is cut to its first 100 entries
    and ``args.W`` / ``args.D`` are set to 0. Otherwise both inputs are
    returned untouched.

    Args:
        small_ds: Flag enabling the reduction.
        coverage: Sliceable sequence of time slots.
        args: Configuration object; ``W`` and ``D`` are overwritten in place
            when the reduction is enabled.

    Returns:
        Tuple ``(coverage, args)``.
    """
    if not small_ds:
        return coverage, args

    shortened = coverage[:100]
    args.W = args.D = 0
    print('Seulement les 100 premiers time-slots sont utilisés.')
    return shortened, args

def evaluate_config(dataset_names,folder_path,file_name,args,coverage,data_folder_path):
    """Load the data, train a model and plot the result for station 0.

    Args:
        dataset_names: Forwarded to ``load_everything``.
        folder_path: Forwarded to ``load_everything``.
        file_name: Forwarded to ``load_everything``.
        args: Configuration object; also read here for ``loss_function_type``
            and, in the quantile case, ``alpha``, ``conformity_scores_type``
            and ``quantile_method``.
        coverage: Forwarded to ``load_everything``.
        data_folder_path: Forwarded to ``load_everything``.

    Returns:
        Tuple ``(trainer, model, args, pi, pi_cqr)`` where ``pi`` and
        ``pi_cqr`` are the two values returned by ``generate_bokeh``.
    """
    args, subway_ds, positions, nb_words_embedding, dic_class2rpz = load_everything(
        dataset_names, folder_path, file_name, args, coverage, data_folder_path
    )

    # Model, loss, optimizer and scheduler.
    n_vertex = subway_ds.raw_values.size(1)
    loss_function, model, optimizer, scheduler, args_embedding = get_model_loss_args_emb_opts(
        args, nb_words_embedding, dic_class2rpz, n_vertex=n_vertex
    )

    # NOTE(review): the scheduler created above is not forwarded; the Trainer
    # receives scheduler=None.
    # TODO (original author): in Trainer, when calibration_prop is not None,
    # modify the dataloader by adding a calibration set.
    trainer = Trainer(
        subway_ds, model, args, optimizer, loss_function,
        scheduler=None,
        args_embedding=args_embedding,
        dic_class2rpz=dic_class2rpz,
        show_figure=True,
        positions=positions,
    )

    # Original author's note: retrieves the conformity scores on I1, using
    # the estimates made previously.
    trainer.train_and_valid(mod=1000)

    # Conformal calibration only applies to the quantile loss
    # (original author's note: calibration for PI 90%).
    Q = (
        trainer.conformal_calibration(
            args.alpha,
            conformity_scores_type=args.conformity_scores_type,
            quantile_method=args.quantile_method,
        )
        if args.loss_function_type == 'quantile'
        else None
    )

    pi, pi_cqr = generate_bokeh(
        trainer, trainer.dataloader, trainer.dataset, Q, args, trainer.dic_class2rpz,
        station=0,
        show_figure=True,
        save_plot=False,
    )
    return trainer, model, args, pi, pi_cqr

'pynvml' is not available on this environment.


## Accès au Tensor Train et Contextual_tensors Train: 

In [13]:
# Inspect the training tensor and its NetMob contextual tensor.
# Requires `trainer` and `args` from a previous run; the 'netmob' entries only
# exist when 'netmob' was part of dataset_names for that run.
U_train = trainer.dataset.U_train
netmob_U_train = trainer.dataset.contextual_tensors['netmob']['train']
print('U_train size: ',U_train.size())
print('netmob_U_train size: ', netmob_U_train.size())

# Pull one batch from the train dataloader; each batch unpacks into three items.
x_b,y_b,contextual_b = next(iter(trainer.dataloader['train']))
# The NetMob batch is selected by its recorded position among the contextual items.
netmob_b = contextual_b[args.contextual_positions['netmob']]
print('Subway_in batch: ',x_b.size())
print('Netmob batch: ', netmob_b.size())

U_train size:  torch.Size([2934, 40, 8])
netmob_U_train size:  torch.Size([2934, 40, 2, 3, 3, 8])
Subway_in batch:  torch.Size([32, 40, 8])
Netmob batch:  torch.Size([32, 40, 2, 3, 3, 8])


# Génération d'un micro-ResNet pour extraire les informations contenues dans les images NetMob d'une seule station :

In [15]:
from dl_models.video_encoder import  ResNet,BasicBlock   
import torch
# Hyperparameters
# NOTE(review): c_out, kernel_size and padding are defined but not referenced
# anywhere else in this cell.
c_out = 1
kernel_size = (3,3,3)
padding = (1,0,0) # Keep the temporal dimension L while reducing H,W
Z_dimension = 32  # size of the encoder output, passed below as n_classes

# Init Inputs
# Random tensor standing in for NetMob data, laid out as (B,N,C,H,W,L).
B,N,C,H,W,L = 32, 40, 4, 22,22,6
netmob = torch.randn(B,N,C,H,W,L)
# Keep only station index 0 -> (B,C,H,W,L)
netmob_station_i = netmob[:,0,:,:,:,:]

block = BasicBlock
H_dims =  [128,128] #[128,256]  #[64,128,256,512]
layers =  [1,1] # [2,2]   # ResNet10 :  [1, 1, 1, 1], ResNet18 :[2, 2, 2, 2], ResNet50 : [3, 4, 6, 3],   # number of 3D conv blocks per layer
block_inplanes = H_dims
n_classes = Z_dimension
# NOTE(review): the input places L on the last axis — confirm this matches the
# axis order the project's ResNet expects for its temporal dimension.
encoder_model = ResNet(block,layers, block_inplanes,
                    n_input_channels=C,
                    conv1_t_size=L, #7 ?
                    conv1_t_stride=1,
                    no_max_pool=False,
                    shortcut_type='B',
                    widen_factor=1.0,
                    n_classes=n_classes
        )

output = encoder_model(netmob_station_i)
# Count trainable parameters only.
nb_total_param = sum(p.numel() for p in encoder_model.parameters() if p.requires_grad)
print('Model parameters: ',nb_total_param)
print('Inputs: ',netmob_station_i.size())
print('Outputs: ',output.size())

# N * nb_total_param is the parameter count if one encoder is used per station.
print(f'>>>> Il faudra donc {N*nb_total_param} paramètres pour extraire les informations contenues dans les NetMob Image')
print(f">>>> Rappel :le STGCN utilisé jusqu'alors ne contient que 300k paramètres.")


Model parameters:  1912642
Inputs:  torch.Size([32, 4, 22, 22, 6])
Outputs:  torch.Size([32, 32])
>>>> Il faudra donc 76505680 paramètres pour extraire les informations contenues dans les NetMob Image
>>>> Rappel :le STGCN utilisé jusqu'alors ne contient que 300k paramètres.


# Vérification nb paramètres du STGCN utilisé jusqu'alors:

In [3]:
# Disabled cell (guarded by `if False:`): rebuilds the STGCN setup without
# training. Nothing below runs unless the guard is changed.
if False:
    model_name = 'STGCN' #'CNN'
    args = get_args(model_name)
    data_folder_path = '../../../data/'
    dataset_names = ['subway_in','calendar'] # 'netmob'
    coverage = match_period_coverage_with_netmob(file_name
                                                )
    # Output dimension follows the loss: 1 for MSE, 2 for quantile.
    if args.loss_function_type == 'MSE': out_dim = 1
    elif args.loss_function_type == 'quantile': out_dim = 2
    else: raise NotImplementedError(f'loss function {args.loss_function_type} has not been implemented')
    args.out_dim = out_dim

    args,subway_ds, positions,nb_words_embedding,dic_class2rpz = load_everything(dataset_names,folder_path,file_name,args,coverage,data_folder_path)
    # Load Model, Optimizer, Scheduler: 
    loss_function,model,optimizer,scheduler,args_embedding = get_model_loss_args_emb_opts(args,nb_words_embedding,dic_class2rpz,n_vertex = subway_ds.raw_values.size(1))

# Génération d'exemples pour vérifier que les choix de configuration fonctionnent bien :
- entre **quantile/MSE**
- entre **['subway_in','calendar'], ['subway_in'], ['subway_in','calendar','netmob']** 
- entre CNN, STGCN ...

In [2]:
# Load config
model_name = 'STGCN' #'CNN'
args = get_args(model_name)
#args = get_args(model_name = model_name,learn_graph_structure = True)  # MTGNN


# Overrides applied on top of the loaded config:
args.K_fold = 1
args.epochs = 10
args.loss_function_type = 'MSE' # 'quantile'

args.track_pi = False
args.ray = False
args.calibration_calendar_class = 3
args.calendar_class = 3

# Derive the output dimension from the chosen loss: 1 for MSE, 2 for quantile.
if args.loss_function_type == 'MSE': out_dim = 1
elif args.loss_function_type == 'quantile': out_dim = 2
else: raise NotImplementedError(f'loss function {args.loss_function_type} has not been implemented')
args.out_dim = out_dim


small_ds = False
coverage = match_period_coverage_with_netmob(file_name)

# The data folder depends on whether CUDA is available
# (presumably GPU server vs. local machine — TODO confirm).
if torch.cuda.is_available():
    data_folder_path = '../../../data/' 
else:
    data_folder_path = '../../Data/'

# No-op while small_ds is False; otherwise truncates coverage and zeroes args.W / args.D.
(coverage,args) = get_small_ds(small_ds,coverage,args)
dataset_names = ['subway_in']#['subway_in','netmob','calendar']

# Load data, train, and plot with the configuration above.
trainer,model,args,pi,pi_cqr = evaluate_config(dataset_names,folder_path,file_name,args,coverage,data_folder_path)

Time-step per hour: 4.0
coverage period: 2019-03-16 00:00:00 - 2019-05-31 23:45:00

 Tackling Training Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

 Tackling Validation Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

 Tackling Testing Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

 Tackling Training Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

 Tackling Validation Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

 Tackling Testing Set
Values with issues:  0.000%
Regular Values that we have to set to 0:  0.000%

U size:  torch.Size([5662, 40, 8]) Utarget size:  torch.Size([5662, 40, 1])
U_train size:  torch.Size([2934, 40, 8]) Utarget_train size:  torch.Size([2934, 40, 1])
U_valid size:  torch.Size([978, 40, 8]) Utarget_valid size:  torch.Size([978, 40, 1])
U_test size:  torch.Size([979, 40, 8]) Utarget_test size: 



epoch: 0 
 min\epoch : 0.14
Estimated time for training: 1.3min 
Training Throughput:424.64 sequences per seconds
>>> Training complete in: 0:01:20.750871
>>> Training performance time: min 0.11921191215515137 avg 0.14791202545166016 seconds (+/- 0.01069698340345577)
>>> Loading performance time: min 0.00047898292541503906 avg 0.08679990727325966 seconds (+/- 0.13282556344057714)
>>> Forward performance time: 0.04197866880296362 seconds (+/- 0.004693872436842426)
>>> Backward performance time: 0.10812574430228838 seconds (+/- 0.008732274814871542)
>>> Plotting performance time: 1.3377931382921008e-05 seconds (+/- 3.202242381007544e-05)
>>> Saving performance time: 0.004718208312988281 seconds (+/- 0.00014261836707663353)
>>> PI-tracking performance time: 2.3312038845486113e-06 seconds (+/- 5.594548141146469e-07)
>>> Scheduler-update performance time: 2.013312445746528e-06 seconds (+/- 1.1847114443777877e-07)
>>> Validation time: 0:00:01.114929
Proportion of time consumed for Loading: 3