In [1]:
import sys
import os
import h5py
# Get Parent folder : 
current_path = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_path, '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from numpy import load
import pandas as pd
import pickle

DATA_FOLDER_PATH = f"{parent_dir}/../data/"

def load_df_speed(dataset_name,key):
    ''' Load the Flow / Speed dataset'''
    data = h5py.File(f"{DATA_FOLDER_PATH}/{dataset_name}/{dataset_name}.h5", 'r')

    axis0 = pd.Series(data[key]['axis0'][:].astype(str))
    axis1 = pd.Series(data[key]['axis1'][:].astype(str))
    df_metr_la = pd.DataFrame(data[key]['block0_values'][:], columns=axis0, index = pd.to_datetime(axis1.astype(int)/1_000_000_000,unit='s'))
    return(df_metr_la)


def load_df_w_adj_gaussian_kernel(dataset_name):
    ''' Load the Weighted Distance Matrix based on Gaussian Kernel Threshold'''
    sensor_ids,sensor_id_to_ind,adj_mx = pickle.load(open(f"{DATA_FOLDER_PATH}/{dataset_name}/adj/adj_mx.pkl",'rb'),encoding='latin1')
    df_adj = pd.DataFrame(adj_mx, index = sensor_ids, columns = sensor_ids)
    return(df_adj,sensor_id_to_ind)

## Open PEMS_d3
PEMS_d3 is a traffic speed dataset collected from Califor-
nia Transportation Agencies (CalTrans) Performance Mea-
surement System (PeMS) https://people.eecs.berkeley.edu/~varaiya/papers_ps.dir/MiningLoopDetectorData.pdf

It contains data of :
- XXX selected sensors 
- period of XXX
- Timestep XXX minutes
- total number of time slices :XXXX

In [2]:
file = 'PEMS_d3/PEMSd3'

path_npz = f"{parent_dir}/../data/{file}.npz"
data = load(path_npz)['data']
print('data shape: ',data.shape)


path_csv = f"{parent_dir}/../data/{file}.csv"
df_distance = pd.read_csv(path_csv)
print('Number of OD distance : ',df_distance.shape)

data shape:  (26208, 358, 1)
Number of OD distance :  (547, 3)


## Open PEMS-BAY
PEMS-BAY is a traffic speed dataset collected from Califor-
nia Transportation Agencies (CalTrans) Performance Mea-
surement System (PeMS) https://people.eecs.berkeley.edu/~varaiya/papers_ps.dir/MiningLoopDetectorData.pdf

It contains data of :
- 325 selected sensors 
- period of 4 months from 6 months from Jan 1st 2017
to May 31th 2017  . 
- Timestep 5 minutes
- total number of time slices : 52,116.

In [3]:
dataset_name = "PEMS_BAY"

df_pems_bay = load_df_speed(dataset_name,key = 'speed')
print('Traffic Speed df: ')
print('number of timestep: ',df_pems_bay.shape[0], 'number of sensors: ',df_pems_bay.shape[1])
print('Coverage period: ',df_pems_bay.index.min(),df_pems_bay.index.max())
display(df_pems_bay.head())

df_w_adj_k, sensor_id_to_ind = load_df_w_adj_gaussian_kernel(dataset_name)
print('Weighted Adjacency matrix with Gaussian Kernel Threshold: ')
display(df_w_adj_k.head())

# In case we need to save the Weighted Distance Adjacency Matrix : 
if False:
    df_w_adj_k.to_csv(f"{DATA_FOLDER_PATH}/{dataset_name}/adj/dist.csv")



Traffic Speed df: 
number of timestep:  52116 number of sensors:  325
Coverage period:  2017-01-01 00:00:00 2017-06-30 23:55:00


Unnamed: 0,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
2017-01-01 00:00:00,71.4,67.8,70.5,67.4,68.8,66.6,66.8,68.0,66.8,69.0,...,68.8,67.9,68.8,68.0,69.2,68.9,70.4,68.8,71.1,68.0
2017-01-01 00:05:00,71.6,67.5,70.6,67.5,68.7,66.6,66.8,67.8,66.5,68.2,...,68.4,67.3,68.4,67.6,70.4,68.8,70.1,68.4,70.8,67.4
2017-01-01 00:10:00,71.6,67.6,70.2,67.4,68.7,66.1,66.8,67.8,66.2,67.8,...,68.4,67.4,68.4,67.5,70.2,68.3,69.8,68.4,70.5,67.9
2017-01-01 00:15:00,71.1,67.5,70.3,68.0,68.5,66.7,66.6,67.7,65.9,67.8,...,68.5,67.5,68.5,67.5,70.4,68.7,70.2,68.4,70.8,67.6
2017-01-01 00:20:00,71.7,67.8,70.2,68.1,68.4,66.9,66.1,67.7,66.1,67.8,...,68.5,67.7,68.5,67.4,69.6,69.1,70.0,68.4,71.0,67.9


Weighted Adjacency matrix with Gaussian Kernel Threshold: 


Unnamed: 0,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
400001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400017,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400030,0.0,0.0,1.0,0.0,0.136553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400045,0.0,0.0,0.614808,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Open METR_LA
METR-LA is a traffic speed dataset collected from loop-
detectors located on the LA County road network https://www.worldpece.org/system/files/artifacts/media/pdf/p86-jagadish.pdf.

It contains data of :
- 207 selected sensors 
- period of 4 months from Mar to Jun in 2012 . 
- Timestep 5 minutes
- total number of time slices : 34,272.

In [6]:
#Open the H5 file in read mode
dataset_name = "METR_LA"
df_metr_la = load_df_speed(dataset_name,key = 'df')
print('Traffic Speed df: ')
print('number of timestep: ',df_metr_la.shape[0], 'number of sensors: ',df_metr_la.shape[1])
print('Coverage period: ',df_metr_la.index.min(),df_metr_la.index.max())
display(df_metr_la.head())

df_w_adj_k, sensor_id_to_ind = load_df_w_adj_gaussian_kernel(dataset_name)
print('Weighted Adjacency matrix with Gaussian Kernel threshold: ')
display(df_w_adj_k.head())

# In case we need to save the Weighted Distance Adjacency Matrix : 
if False:
    df_w_adj_k.to_csv(f"{DATA_FOLDER_PATH}/{dataset_name}/adj/dist.csv")
if False: 
    ''' Useless'''
    # 4106 Sensors : 
    df_dist = pd.read_csv(f"{data_path}/distances_la_2012.csv").pivot_table(index='from',columns='to')

    # Lat, Lon positions:
    df_latlon_sensor =  pd.read_csv(f"{data_path}/graph_sensor_locations.csv",index_col = 0)
    print('Lat/Lon positions of the METR-LR sensors: ')
    display(df_latlon_sensor.head())
    # Symmetr road adjacent file
    weighted_dist = pd.read_csv(f"{data_path}/W_metrla.csv")
    weighted_dist

Traffic Speed df: 
number of timestep:  34272 number of sensors:  207
Coverage period:  2012-03-01 00:00:00 2012-06-27 23:55:00


Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Weighted Adjacency matrix with Gaussian Kernel threshold: 


Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
773869,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.119804,0.0,0.0,0.0,0.0,0.0,0.0,0.0
767541,0.0,1.0,0.390955,0.0,0.0,0.0,0.0,0.390457,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
767542,0.0,0.717438,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717447,0.0,0.0,0.0,1.0,0.633722,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717446,0.0,0.0,0.0,0.626464,1.0,0.0,0.135197,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
    # GET PARAMETERS
from examples.benchmark import local_get_args,get_inputs,train_on_ds,keep_track_on_model_metrics,get_trial_id

dataset_names = ["subway_in"] # ["subway_in","calendar"] # ["subway_in"] # ['data_bidon'] # ['METR_LA'] # ['PEMS_BAY']
dataset_for_coverage = ['subway_in','netmob'] #  ['data_bidon','netmob'] #  ['subway_in','netmob']  # ['METR_LA'] # ['PEMS_BAY']
model_name = 'STGCN'

vision_model_name = None
save_folder = None
df_loss,df_results = pd.DataFrame(),pd.DataFrame()
modification = {'epochs' : 50, #100,
                }

# Tricky but here we net to set 'netmob' so that we will use the same period for every combination
args,folds,hp_tuning_on_first_fold = local_get_args(model_name,
                                                        dataset_names=dataset_names,
                                                        dataset_for_coverage=dataset_for_coverage,
                                                        modification = modification)
trial_id = get_trial_id(args,vision_model_name=None)
K_fold_splitter,K_subway_ds,dic_class2rpz = get_inputs(args,vision_model_name,folds)
ds = K_subway_ds[0]

trainer,df_loss = train_on_ds(model_name,ds,args,trial_id,save_folder,dic_class2rpz,df_loss)

'pynvml' is not available on this environment.
Training and Hyper-parameter tuning with Ray is not possible
40
>>>>Model: STGCN; K_fold = 6; Loss function: MSE 

Init Dataset:  torch.Size([7392, 40])
Number of Nan Value:  tensor(0)
Total Number of Elements:  295680 

>>>> Train/Valid/Test split method : similar_length_method

U size:  torch.Size([6238, 40, 7]) Utarget size:  torch.Size([6238, 40, 1])
U_train size:  torch.Size([3742, 40, 7]) Utarget_train size:  torch.Size([3742, 40, 1])
U_valid size:  torch.Size([1248, 40, 7]) Utarget_valid size:  torch.Size([1248, 40, 1])
U_test size:  torch.Size([1247, 40, 7]) Utarget_test size:  torch.Size([1247, 40, 1])
U_train min:  tensor(0.) U_train max:  tensor(10798.)
U_valid min:  tensor(0.) U_valid max:  tensor(1405.)
U_test min:  tensor(0.) U_test max:  tensor(1352.)

Fold n°0

Init Dataset:  torch.Size([2174, 40])
Number of Nan Value:  tensor(0)
Total Number of Elements:  86960 

>>>> Train/Valid/Test split method : similar_length_method






start training
epoch: 0 
 min\epoch : 0.34
Estimated time for training: 20.0min 

Training Throughput:71.08 sequences per seconds
>>> Training complete in: 0:19:53.714104
>>> Training performance time: min 0.26783013343811035 avg 0.44363903999328613 seconds (+/- 0.04688018171300918)
>>> Loading performance time: min 0.0006327629089355469 avg 0.2448604087629141 seconds (+/- 0.32081756734145433)
>>> Forward performance time: 0.1327694403779609 seconds (+/- 0.01883652474958572)
>>> Backward performance time: 0.31460278761440413 seconds (+/- 0.042101356867416916)
>>> Plotting performance time: 3.814697265625e-06 seconds (+/- 1.3467366057928637e-05)
>>> Saving performance time: 2.7961448669433593 seconds (+/- 0.8433884886537226)
>>> PI-tracking performance time: 1.8100349270567603e-06 seconds (+/- 6.406834634813536e-07)
>>> Scheduler-update performance time: 1.445108530472736e-06 seconds (+/- 5.605049815599951e-07)
>>> Validation time: 0:00:00.852057
Proportion of time consumed for Loading

In [5]:
Preds,Y_true,T_labels = trainer.testing(ds.normalizer, training_mode = 'test')



In [31]:
from plotting.TS_analysis import plot_subway_patterns,drag_selection_box
from build_inputs.preprocess_subway_15 import get_trigram_correspondance
from bokeh.palettes import Set3_12 as palette
from bokeh.plotting import show,output_notebook,save,output_file
from bokeh.layouts import column
import torch 

df_correspondance = get_trigram_correspondance()
Metro_B_stations = ['Charpennes','Brotteaux','Part-Dieu' ,'Place Guichard', 'Saxe - Gambetta',
       'Jean Macé','Place Jean Jaurès','Debourg','Stade de Gerland',"Gare d'Oullins"]
Metro_B_TRG = list(df_correspondance.set_index('Station').loc[Metro_B_stations].values.reshape(-1))

# Get df_True Volume: 
spatial_units = ds.spatial_unit[ds.spatial_unit.isin(Metro_B_TRG)]
metro_b_indices = list(spatial_units.index)
df_true = pd.DataFrame(Y_true[:,metro_b_indices,0],columns = spatial_units.values,index = ds.tensor_limits_keeper.df_verif_test.iloc[:,-1])

# Get df Predicted Volume : 
df_prediction = pd.DataFrame(Preds[:,metro_b_indices,0],columns = spatial_units.values,index = ds.tensor_limits_keeper.df_verif_test.iloc[:,-1])


if False: 
       # Ok mais compliqué pour rien 
       width=1500
       height=400
       p1 = plot_subway_patterns(df_true,Metro_B_TRG,palette,width,height,title=f'Real Trafic Volume by stations')
       p2 = plot_subway_patterns(df_prediction,Metro_B_TRG,palette,width,height,title=f'Predicted Trafic Volume by stations')
       select = drag_selection_box(df_prediction,p1,p2,width=width,height=height//3)
       output_notebook()
       grid = column(p1,p2,select)
       show(grid)

from bokeh.plotting import figure
from bokeh.models import Legend
from bokeh.models import BoxAnnotation
from datetime import timedelta
from utils.specific_event import rugby_matches

range = 3*60
kick_off_time,match_times = rugby_matches(df_true.index,range)
station = 'GER'
width = 1500
height = 400

def plot_single_point_prediction(df_true,df_prediction,station,title = '',kick_off_time = [], range = None,width=1500,height=400,show=False):
       legend_it = []
       p = figure(x_axis_type="datetime", title= title,
                     width=1500,height=400)

       c = p.line(x=df_true.index, line_width = 2.5, y=df_true[station], alpha=0.8,  legend_label = f'{station}',color = 'blue')
       legend_it.append(('True', [c]))

       c = p.line(x=df_prediction.index, line_width = 2.5, y=df_prediction[station], alpha=0.8,  legend_label = f'{station}',color = 'red')
       legend_it.append(('Prediction', [c]))


       # Add rugby matches :
       for kick_time in kick_off_time:
              box = BoxAnnotation(left=kick_time - timedelta(minutes=1) , right=kick_time+ timedelta(minutes=1) ,
                                   fill_alpha=0.3, fill_color='darkgray')
              p.add_layout(box)
              # Ajouter une box verticale pour la période de +/- 'range'
              box = BoxAnnotation(left=kick_time - timedelta(minutes=range), right=kick_time + timedelta(minutes=range),
                                   fill_alpha=0.3, fill_color='lightgray')
              p.add_layout(box)


       p.xaxis.major_label_orientation = 1.2  # Pour faire pivoter les labels des x
       legend = Legend(items=legend_it)
       p.add_layout(legend, 'right')

       if show:
              output_notebook()
              show(p)

       return p

def plot_prediction_error(df_true,df_prediction,station,metrics =['mae','mse','mape'], title = '',width=1500,height=400,show=False, min_flow = 20):
       legend_it = []
       p = figure(x_axis_type="datetime", title= title,
                     width=1500,height=400)
       
       def f_error(predict,real,metric):
              real = torch.tensor(real).reshape(-1)
              predict = torch.tensor(predict).reshape(-1)

              mask = real>min_flow
              error = torch.full(real.shape, -1.0)  # Remplir avec -1 par défaut
              if metric == 'mape':
                     error[mask] = 100 * (torch.abs(real[mask] - predict[mask]) / real[mask]) 

              elif metric == 'mae':
                     err = torch.abs(real[mask] - predict[mask])
                     error[mask] = 100 * err/err.max()
              elif metric == 'mse':
                     err = (real[mask] - predict[mask])**2
                     error[mask] = 100 * err/err.max()
              else:
                     raise NotImplementedError
              
              return(error)
       
       for k,metric in enumerate(metrics):
              error = f_error(predict= df_prediction[station],real= df_true[station],metric = metric)
              df_error = pd.DataFrame(error.numpy(), index = df_true.index, columns = [station])
              
              c = p.line(x=df_error.index, line_width = 2.5, y=df_error[station], alpha=0.8,color = palette[k+2])
              legend_it.append((metric, [c]))

       p.xaxis.major_label_orientation = 1.2  # Pour faire pivoter les labels des x
       legend = Legend(items=legend_it)
       legend.click_policy="hide"
       p.add_layout(legend, 'right')

       if show:
              output_notebook()
              show(p)

       return p

p1 = plot_single_point_prediction(df_true,df_prediction,station,title= 'Trafic Volume Prediction around at "Stade du Lou Gerland" subway station ',kick_off_time=kick_off_time, range=range,width=width,height = height,show = False)
p2 = plot_prediction_error(df_true,df_prediction,station,metrics =['mae','mse','mape'],title = 'Prediction Error',width=1500,height=400,show=False,min_flow = 20)
select = drag_selection_box(df_true,p1,p2,width=width,height=height//3)
output_notebook()
grid = column(p1,p2,select)
show(grid)

  real = torch.tensor(real).reshape(-1)
  predict = torch.tensor(predict).reshape(-1)


In [23]:
ds.tensor_limits_keeper.df_verif_test[96*2:]

Unnamed: 0,t-96,t-6,t-5,t-4,t-3,t-2,t-1,t+0
5949,2019-05-15 23:15:00,2019-05-16 21:45:00,2019-05-16 22:00:00,2019-05-16 22:15:00,2019-05-16 22:30:00,2019-05-16 22:45:00,2019-05-16 23:00:00,2019-05-16 23:15:00
5950,2019-05-15 23:30:00,2019-05-16 22:00:00,2019-05-16 22:15:00,2019-05-16 22:30:00,2019-05-16 22:45:00,2019-05-16 23:00:00,2019-05-16 23:15:00,2019-05-16 23:30:00
5951,2019-05-15 23:45:00,2019-05-16 22:15:00,2019-05-16 22:30:00,2019-05-16 22:45:00,2019-05-16 23:00:00,2019-05-16 23:15:00,2019-05-16 23:30:00,2019-05-16 23:45:00
6026,2019-05-16 18:30:00,2019-05-17 17:00:00,2019-05-17 17:15:00,2019-05-17 17:30:00,2019-05-17 17:45:00,2019-05-17 18:00:00,2019-05-17 18:15:00,2019-05-17 18:30:00
6027,2019-05-16 18:45:00,2019-05-17 17:15:00,2019-05-17 17:30:00,2019-05-17 17:45:00,2019-05-17 18:00:00,2019-05-17 18:15:00,2019-05-17 18:30:00,2019-05-17 18:45:00
...,...,...,...,...,...,...,...,...
7386,2019-05-30 22:30:00,2019-05-31 21:00:00,2019-05-31 21:15:00,2019-05-31 21:30:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00
7387,2019-05-30 22:45:00,2019-05-31 21:15:00,2019-05-31 21:30:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00
7388,2019-05-30 23:00:00,2019-05-31 21:30:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00,2019-05-31 23:00:00
7389,2019-05-30 23:15:00,2019-05-31 21:45:00,2019-05-31 22:00:00,2019-05-31 22:15:00,2019-05-31 22:30:00,2019-05-31 22:45:00,2019-05-31 23:00:00,2019-05-31 23:15:00


In [7]:
predict= df_prediction[station]
real= df_true[station]
real = torch.tensor(real).reshape(-1)
predict = torch.tensor(predict).reshape(-1)
mask = real>0
error = 100*(torch.abs(real[mask] - predict[mask])/real[mask])


  real = torch.tensor(real).reshape(-1)
  predict = torch.tensor(predict).reshape(-1)


In [15]:
 df_true.index

DatetimeIndex(['2019-05-14 03:15:00', '2019-05-14 03:30:00',
               '2019-05-14 03:45:00', '2019-05-14 04:00:00',
               '2019-05-14 04:15:00', '2019-05-14 04:30:00',
               '2019-05-14 04:45:00', '2019-05-14 05:00:00',
               '2019-05-14 05:15:00', '2019-05-14 05:30:00',
               ...
               '2019-05-31 21:15:00', '2019-05-31 21:30:00',
               '2019-05-31 21:45:00', '2019-05-31 22:00:00',
               '2019-05-31 22:15:00', '2019-05-31 22:30:00',
               '2019-05-31 22:45:00', '2019-05-31 23:00:00',
               '2019-05-31 23:15:00', '2019-05-31 23:30:00'],
              dtype='datetime64[ns]', name='t+0', length=1247, freq=None)

In [14]:
pd.DataFrame(error.reshape(-1).numpy(), index = df_true.index, columns = [station])

ValueError: Shape of passed values is (1037, 1), indices imply (1247, 1)