1. Generate metadata (adjacency matrix, road info, cat2index, timestamps) from the raw JSON files, and convert each JSON file into an .npy file of features
2. Use the timestamps to find chunks of consecutive data, then split the chunks into train, val and test sets
3. Find the mean and std of the train set
4. Split the npy indices into samples using a sliding window, and save them as new files in 3 main directories (train, val, test)
5. The dataloader will read the sample files

In [None]:
%matplotlib inline 
import preprocessing_utils
import speedband_dataset
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
# Input directory of raw JSON files and output directory for everything this notebook generates.
raw_trunc_dir = Path("/home/jovyan/scratch/downtown")
process_dir = Path("/home/jovyan/scratch/downtown_processed")

# The idea is to convert each raw JSON file into an .npy file holding a 2D array (roads x features),
# while also generating the timestamps and the other metadata.
# The .npy files are named by a running index starting from 0.

In [3]:
# Step 1 of the workflow: process the raw JSON in raw_trunc_dir and write the results to process_dir.
# NOTE(review): presumably this writes both the per-file .npy features and the metadata that
# load_metadata reads back — confirm against preprocessing_utils.processed_large.
preprocessing_utils.processed_large(raw_trunc_dir, process_dir)

In [4]:
# Load the metadata stored under process_dir.
# NOTE(review): presumably A is the adjacency matrix and metadata the road info — confirm.
# timestamps is looked up below with the stringified sample index as key.
A, metadata, cat2index, timestamps = preprocessing_utils.load_metadata(process_dir)

In [5]:
# Step 2 of the workflow: group the sample indices into chunks of consecutive timestamps.
# NOTE(review): 9 is presumably the largest gap (in minutes) still treated as consecutive; the
# chunk boundaries printed below fall on gaps of roughly 9-11 minutes — confirm the unit.
data_chunks = preprocessing_utils.find_consecutive_chunks(timestamps, 9)

In [6]:
# Report the total number of timestamps, then the first/last timestamp and length of every chunk.
print(len(timestamps))
for chunk in data_chunks:
    first_idx, last_idx = chunk[0], chunk[-1]
    # timestamps is keyed by the sample index rendered as a string.
    print(timestamps[f"{first_idx}"], timestamps[f"{last_idx}"], len(chunk))

49618
Mon_Mar_8_2021_22:10:08 Sun_Mar_14_2021_01:25:09 1480
Sun_Mar_14_2021_01:36:20 Sun_Mar_14_2021_01:36:20 1
Sun_Mar_14_2021_02:21:22 Sun_Mar_14_2021_02:55:09 8
Sun_Mar_14_2021_03:20:22 Sun_Apr_4_2021_02:30:09 6039
Sun_Apr_4_2021_03:35:09 Mon_Apr_5_2021_03:40:17 290
Fri_Apr_30_2021_15:05:08 Sun_May_2_2021_00:25:09 401
Sun_May_2_2021_01:20:12 Sun_May_2_2021_01:25:08 2
Sun_May_2_2021_01:35:09 Sun_May_2_2021_07:26:21 71
Sun_May_2_2021_07:50:10 Sun_May_23_2021_01:00:09 5967
Sun_May_23_2021_01:10:08 Sun_Jul_4_2021_00:35:08 12090
Sun_Jul_4_2021_04:40:10 Tue_Aug_17_2021_00:10:08 12619
Tue_Aug_17_2021_00:20:10 Sun_Sep_12_2021_06:25:08 7562
Sun_Sep_12_2021_06:35:09 Sun_Sep_12_2021_06:40:17 2
Sun_Sep_12_2021_06:50:08 Wed_Sep_15_2021_22:25:07 1052
Wed_Sep_15_2021_22:34:14 Wed_Sep_22_2021_23:55:10 2034


In [7]:
# Sliding-window sizes, in timesteps.
# NOTE(review): the chunk boundaries printed above are about 5 minutes apart (e.g. the
# two-sample chunks span 5 minutes), so these windows cover roughly 60 and 30 minutes rather
# than the 40 and 20 minutes previously noted here — confirm the sampling interval.
num_timesteps_input = 12 # ~60 minutes at a 5-minute sampling interval
num_timesteps_output = 6 # ~30 minutes at a 5-minute sampling interval

In [8]:
# A chunk is kept only if it holds at least one full input window plus one output window.
min_chunk_len = num_timesteps_input + num_timesteps_output
# Slice assignment filters the existing list object in place instead of rebinding the name.
data_chunks[:] = [run for run in data_chunks if len(run) >= min_chunk_len]
for run in data_chunks:
    print(len(run))
# Share of the remaining data held by each chunk; used to pick the train/val/test split.
proportions = preprocessing_utils.chunk_len_proportion(data_chunks)
print(proportions)

1480
6039
290
401
71
5967
12090
12619
7562
1052
2034
[0.029835702046164703, 0.12174175990323556, 0.005846184860397137, 0.00808386251385949, 0.0014313073278903337, 0.12029029331720592, 0.24372543090414273, 0.2543896784598327, 0.15244431004939019, 0.0212075395625441, 0.041003931055337166]


In [65]:
# The first seven chunks form the training set; flatten their sample indices into one list.
train_idxs = [
    sample_idx
    for chunk_no in range(7)
    for sample_idx in data_chunks[chunk_no]
]
# Mean and std over the training samples only; the result is displayed as the cell output.
preprocessing_utils.mean_std(process_dir / "features", train_idxs)

HBox(children=(FloatProgress(value=0.0, max=26338.0), HTML(value='')))




(array([ 4.43166567,  2.55106077,  0.08281905,  4.0460931 , 11.53766421]),
 array([1.6744557 , 1.55828848, 0.04885751, 1.98377927, 6.93358581]))

In [9]:
# Mean and standard deviation values copied by hand from the output of
# preprocessing_utils.mean_std on the training chunks (one entry per feature, five in total).
# They must be regenerated whenever the training split or the feature set changes.
means = [4.43166567, 2.55106077, 0.08281905, 4.0460931, 11.53766421]
stds = [1.6744557, 1.55828848, 0.04885751, 1.98377927, 6.93358581]

In [14]:
# Step 4 of the workflow: cut every chunk into sliding-window samples and write them into one
# directory per split. The window sizes come from num_timesteps_input / num_timesteps_output
# rather than repeated literals, so they cannot drift from the minimum chunk length used when
# the short chunks were filtered out.
features_dir = process_dir / "features"
dataset_dir = process_dir / "dataset"

# Chunks 0-6 are the training set (the same chunks the mean/std were computed on),
# chunk 7 is validation and the remaining chunks are the test set.
split_chunks = {
    "train": data_chunks[:7],
    "val": data_chunks[7:8],
    "test": data_chunks[8:],
}
for split_name, chunks in split_chunks.items():
    preprocessing_utils.generate_samples(
        chunks,
        num_timesteps_input,
        num_timesteps_output,
        dataset_dir / split_name,
        features_dir,
    )

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1463.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6022.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=273.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=384.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5950.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12073.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12602.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=7545.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1035.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2017.0), HTML(value='')))





In [25]:
# The notebook only does `import speedband_dataset`, so the class has to be reached through the
# module; the bare name SpeedbandDataset raises NameError when the notebook is run top to bottom.
# NOTE(review): assumes SpeedbandDataset is defined in speedband_dataset — confirm.
# NOTE(review): despite its name, `test` is built from the *train* split directory.
test = speedband_dataset.SpeedbandDataset(process_dir/"dataset"/"train")