In [1]:
import sys
import os
import h5py
# Make the parent folder importable: resolve it from the working directory
# and put it first on sys.path unless it is already listed.
current_path = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_path, os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

from numpy import load
import pandas as pd
import pickle

# Root data folder, located one level above `parent_dir`. The loaders below read
# from `{DATA_FOLDER_PATH}/{dataset_name}/...`. The value keeps its trailing slash,
# so those f-strings produce a double slash in the final path.
DATA_FOLDER_PATH = f"{parent_dir}/../data/"

def load_df_speed(dataset_name,key):
    '''Load the Flow / Speed dataset of `dataset_name` as a DataFrame.

    Reads ``{DATA_FOLDER_PATH}/{dataset_name}/{dataset_name}.h5`` and builds the
    frame from the ``axis0``, ``axis1`` and ``block0_values`` datasets stored
    under the HDF5 group `key`.

    Args:
        dataset_name: Name of the dataset folder, also the stem of the .h5 file.
        key: Name of the HDF5 group holding the table (e.g. 'speed' or 'df').

    Returns:
        pd.DataFrame whose columns are the `axis0` labels (as strings) and whose
        index is the `axis1` values converted to datetimes.
    '''
    h5_path = f"{DATA_FOLDER_PATH}/{dataset_name}/{dataset_name}.h5"

    # The context manager closes the file even if a read fails. Every dataset is
    # copied into memory with `[:]` before the handle is released.
    with h5py.File(h5_path, 'r') as h5_file:
        group = h5_file[key]
        axis0 = pd.Series(group['axis0'][:].astype(str))
        axis1 = pd.Series(group['axis1'][:].astype(str))
        values = group['block0_values'][:]

    # axis1 is handled as integer nanoseconds: dividing by 1e9 yields the seconds
    # expected by `unit='s'`.
    timestamps = pd.to_datetime(axis1.astype(int)/1_000_000_000,unit='s')
    df_speed = pd.DataFrame(values, columns=axis0, index = timestamps)
    return(df_speed)


def load_df_w_adj_gaussian_kernel(dataset_name):
    '''Load the Weighted Distance Matrix based on Gaussian Kernel Threshold.

    Reads ``{DATA_FOLDER_PATH}/{dataset_name}/adj/adj_mx.pkl``, a pickle holding
    the 3-tuple ``(sensor_ids, sensor_id_to_ind, adj_mx)``.

    Args:
        dataset_name: Name of the dataset folder containing ``adj/adj_mx.pkl``.

    Returns:
        A tuple ``(df_adj, sensor_id_to_ind)`` where `df_adj` is the adjacency
        matrix as a DataFrame labelled on both axes by `sensor_ids`, and
        `sensor_id_to_ind` is the second element of the pickled tuple, returned
        unchanged.
    '''
    adj_path = f"{DATA_FOLDER_PATH}/{dataset_name}/adj/adj_mx.pkl"

    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The `with` block guarantees the file handle is closed after loading.
    with open(adj_path,'rb') as pkl_file:
        sensor_ids,sensor_id_to_ind,adj_mx = pickle.load(pkl_file,encoding='latin1')

    df_adj = pd.DataFrame(adj_mx, index = sensor_ids, columns = sensor_ids)
    return(df_adj,sensor_id_to_ind)

## Open PEMS_d3
PEMS_d3 is a traffic speed dataset collected by the California Transportation Agencies (CalTrans) Performance Measurement System (PeMS): https://people.eecs.berkeley.edu/~varaiya/papers_ps.dir/MiningLoopDetectorData.pdf

It contains data of :
- 358 selected sensors
- period of XXX (TODO: fill in)
- Timestep XXX minutes (TODO: fill in)
- total number of time slices: 26,208

In [2]:
file = 'PEMS_d3/PEMSd3'

# Build both paths from the shared DATA_FOLDER_PATH constant instead of
# re-deriving the data folder from `parent_dir` by hand.
path_npz = os.path.join(DATA_FOLDER_PATH, f"{file}.npz")
# `load` keeps the .npz archive open until it is closed; the context manager
# releases the handle once the 'data' array has been read into memory.
with load(path_npz) as npz_archive:
    data = npz_archive['data']
print('data shape: ',data.shape)


# CSV stored next to the .npz archive, with the same stem.
path_csv = os.path.join(DATA_FOLDER_PATH, f"{file}.csv")
df_distance = pd.read_csv(path_csv)
print('Number of OD distance : ',df_distance.shape)

data shape:  (26208, 358, 1)
Number of OD distance :  (547, 3)


## Open PEMS-BAY
PEMS-BAY is a traffic speed dataset collected by the California Transportation Agencies (CalTrans) Performance Measurement System (PeMS): https://people.eecs.berkeley.edu/~varaiya/papers_ps.dir/MiningLoopDetectorData.pdf

It contains data of :
- 325 selected sensors 
- period of 6 months, from Jan 1st 2017 to Jun 30th 2017
- Timestep 5 minutes
- total number of time slices : 52,116.

In [3]:
dataset_name = "PEMS_BAY"

# Speed measurements: one row per timestep, one column per sensor.
df_pems_bay = load_df_speed(dataset_name, key='speed')
print('Traffic Speed df: ')
print(
    'number of timestep: ', len(df_pems_bay.index),
    'number of sensors: ', len(df_pems_bay.columns),
)
print('Coverage period: ', df_pems_bay.index.min(), df_pems_bay.index.max())
display(df_pems_bay.head())

# Sensor-to-sensor weighted adjacency matrix and the sensor-id lookup.
df_w_adj_k, sensor_id_to_ind = load_df_w_adj_gaussian_kernel(dataset_name)
print('Weighted Adjacency matrix with Gaussian Kernel Threshold: ')
display(df_w_adj_k.head())

# Optional export of the Weighted Distance Adjacency Matrix (disabled by default).
if False:
    df_w_adj_k.to_csv(f"{DATA_FOLDER_PATH}/{dataset_name}/adj/dist.csv")



Traffic Speed df: 
number of timestep:  52116 number of sensors:  325
Coverage period:  2017-01-01 00:00:00 2017-06-30 23:55:00


Unnamed: 0,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
2017-01-01 00:00:00,71.4,67.8,70.5,67.4,68.8,66.6,66.8,68.0,66.8,69.0,...,68.8,67.9,68.8,68.0,69.2,68.9,70.4,68.8,71.1,68.0
2017-01-01 00:05:00,71.6,67.5,70.6,67.5,68.7,66.6,66.8,67.8,66.5,68.2,...,68.4,67.3,68.4,67.6,70.4,68.8,70.1,68.4,70.8,67.4
2017-01-01 00:10:00,71.6,67.6,70.2,67.4,68.7,66.1,66.8,67.8,66.2,67.8,...,68.4,67.4,68.4,67.5,70.2,68.3,69.8,68.4,70.5,67.9
2017-01-01 00:15:00,71.1,67.5,70.3,68.0,68.5,66.7,66.6,67.7,65.9,67.8,...,68.5,67.5,68.5,67.5,70.4,68.7,70.2,68.4,70.8,67.6
2017-01-01 00:20:00,71.7,67.8,70.2,68.1,68.4,66.9,66.1,67.7,66.1,67.8,...,68.5,67.7,68.5,67.4,69.6,69.1,70.0,68.4,71.0,67.9


Weighted Adjacency matrix with Gaussian Kernel Threshold: 


Unnamed: 0,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
400001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400017,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400030,0.0,0.0,1.0,0.0,0.136553,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400045,0.0,0.0,0.614808,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Open METR_LA
METR-LA is a traffic speed dataset collected from loop detectors located on the LA County road network: https://www.worldpece.org/system/files/artifacts/media/pdf/p86-jagadish.pdf

It contains data of :
- 207 selected sensors 
- period of 4 months, from Mar 1st 2012 to Jun 27th 2012
- Timestep 5 minutes
- total number of time slices : 34,272.

In [6]:
# Load the METR-LA speed table from its H5 file (opened read-only by the loader).
dataset_name = "METR_LA"
df_metr_la = load_df_speed(dataset_name,key = 'df')
print('Traffic Speed df: ')
print('number of timestep: ',df_metr_la.shape[0], 'number of sensors: ',df_metr_la.shape[1])
print('Coverage period: ',df_metr_la.index.min(),df_metr_la.index.max())
display(df_metr_la.head())

# Sensor-to-sensor weighted adjacency matrix and the sensor-id lookup.
df_w_adj_k, sensor_id_to_ind = load_df_w_adj_gaussian_kernel(dataset_name)
print('Weighted Adjacency matrix with Gaussian Kernel threshold: ')
display(df_w_adj_k.head())

# In case we need to save the Weighted Distance Adjacency Matrix : 
if False:
    df_w_adj_k.to_csv(f"{DATA_FOLDER_PATH}/{dataset_name}/adj/dist.csv")
if False: 
    # Auxiliary METR-LA files, marked "useless" by the author and disabled.
    # NOTE(review): `data_path` was never defined in this notebook, so enabling
    # this branch raised NameError. Assuming the CSVs sit in the dataset
    # folder -- TODO confirm the actual location.
    data_path = f"{DATA_FOLDER_PATH}/{dataset_name}"

    # 4106 Sensors : 
    df_dist = pd.read_csv(f"{data_path}/distances_la_2012.csv").pivot_table(index='from',columns='to')

    # Lat, Lon positions:
    df_latlon_sensor =  pd.read_csv(f"{data_path}/graph_sensor_locations.csv",index_col = 0)
    print('Lat/Lon positions of the METR-LA sensors: ')
    display(df_latlon_sensor.head())
    # Symmetric road adjacency file (per the original note)
    weighted_dist = pd.read_csv(f"{data_path}/W_metrla.csv")
    # A bare expression inside an `if` block is not rendered by the notebook;
    # display it explicitly.
    display(weighted_dist.head())

Traffic Speed df: 
number of timestep:  34272 number of sensors:  207
Coverage period:  2012-03-01 00:00:00 2012-06-27 23:55:00


Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Weighted Adjacency matrix with Gaussian Kernel threshold: 


Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
773869,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.119804,0.0,0.0,0.0,0.0,0.0,0.0,0.0
767541,0.0,1.0,0.390955,0.0,0.0,0.0,0.0,0.390457,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
767542,0.0,0.717438,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717447,0.0,0.0,0.0,1.0,0.633722,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717446,0.0,0.0,0.0,0.626464,1.0,0.0,0.135197,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
