In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import sys
import numpy as np
import pandas as ps

from torch.utils.data import DataLoader

from joblib import Parallel, delayed

from l5kit.geometry import transform_points
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv
from l5kit.visualization import draw_trajectory

from utils import traj_geometry_from_item

from tqdm import tqdm

In [3]:
# Settings handed to build_rasterizer(cfg, dm) and MyDataset(cfg, zarr, rasterizer)
# in the cells below.
cfg = dict(
    format_version=4,
    model_params=dict(
        history_num_frames=10,
        history_step_size=1,
        history_delta_time=0.1,
        future_num_frames=50,
        future_step_size=1,
        future_delta_time=0.1,
    ),
    raster_params=dict(
        raster_size=[384, 384],
        pixel_size=[0.5, 0.5],
        ego_center=[0.25, 0.5],
        map_type="py_semantic",
        satellite_map_key="aerial_map/aerial_map.png",
        semantic_map_key="semantic_map/semantic_map.pb",
        dataset_meta_key="meta.json",
        filter_agents_threshold=0.5,
    ),
    # NOTE: the DataLoader created later in this notebook passes literal
    # batch_size / num_workers arguments instead of the batch_size /
    # num_workers / shuffle entries declared here.
    train_data_loader=dict(
        key="scenes/train.zarr",
        batch_size=12,
        shuffle=True,
        num_workers=4,
    ),
)

In [4]:
class MyDataset(AgentDataset):
    """AgentDataset variant whose items are reduced to three values per sample."""

    def __getitem__(self, index):
        """Return ``(track_id, timestamp, size)`` for the sample at ``index``.

        The full sample comes from ``AgentDataset.__getitem__``; ``size`` is
        element 0 of whatever ``traj_geometry_from_item(sample)`` returns.
        """
        item = super().__getitem__(index)
        track_id = item["track_id"]
        timestamp = item["timestamp"]
        size = traj_geometry_from_item(item)[0]
        return track_id, timestamp, size

In [5]:
# Data manager pointed at the local "../data" folder.
dm = LocalDataManager("../data")
rasterizer = build_rasterizer(cfg, dm)

# Open the training scenes. The zarr key is read from cfg (where it is already
# declared) instead of being repeated here as a second string literal.
zarr = ChunkedDataset(dm.require(cfg["train_data_loader"]["key"])).open()

# Items of `dataset` are (track_id, timestamp, size) triples, see MyDataset.
dataset = MyDataset(cfg, zarr, rasterizer)

In [6]:
# Replace the dataset's `rasterizer` attribute with a stub that ignores its four
# positional arguments and returns a 2x2 array of zeros.
# NOTE(review): presumably meant to skip expensive rasterization, since only
# (track_id, timestamp, size) are kept from each sample — TODO confirm that the
# dataset actually reads `self.rasterizer` at sample time and calls it directly;
# if the rasterizer was captured at construction (or is invoked through a
# method), this assignment has no effect on sampling.
dataset.rasterizer = lambda a, b, c, d: np.zeros((2, 2))

In [7]:
# Sanity check: show the first item as a list of [track_id, timestamp, size].
[*dataset[0]]

[1, 1572643685901838786, 13.683506216882]

In [8]:
# def sample_information(sample):
#     return (
#         sample["track_id"],
#         sample["timestamp"],
#         traj_geometry_from_item(sample)[0]
#     )

In [9]:
# results = []
# for elem in tqdm(dataset):
#     results.append(sample_information(elem))

In [10]:
# sizes = Parallel(n_jobs=2)(
#     delayed(sample_information)(item)
#     for item in tqdm(dataset, desc="computing sizes")
# )

In [11]:
# from multiprocessing import Pool

# results = []
# with Pool(processes=30) as pool:
#     for res in tqdm(pool.imap(func=sample_information, iterable=dataset), total=len(dataset)):
#         results.append(res)

In [12]:
# Walk the dataset in its default (unshuffled) order with 16 worker processes.
loader = DataLoader(dataset, num_workers=16, batch_size=16)

# One NumPy array per batch for each of the three fields the dataset yields;
# the lists are concatenated in the next cell.
track_ids, timestamps, sizes = [], [], []

for batch in tqdm(loader):
    batch_track_ids, batch_timestamps, batch_sizes = batch
    # .copy() gives each stored array its own buffer, independent of the
    # tensor handed out by the loader.
    track_ids.append(batch_track_ids.numpy().copy())
    timestamps.append(batch_timestamps.numpy().copy())
    sizes.append(batch_sizes.numpy().copy())

100%|██████████| 1406045/1406045 [30:20:23<00:00, 12.87it/s]   


In [13]:
# Join the per-batch arrays end to end, giving one array per field.
track_ids_arr = np.concatenate(track_ids, axis=0)
timestamps_arr = np.concatenate(timestamps, axis=0)
sizes_arr = np.concatenate(sizes, axis=0)

# Display the three shapes side by side so a length mismatch is easy to spot.
tuple(arr.shape for arr in (track_ids_arr, timestamps_arr, sizes_arr))

((22496709,), (22496709,), (22496709,))

In [14]:
# Assemble the three collected arrays into one table, one column per field.
result_df = ps.DataFrame(
    dict(track_id=track_ids_arr, timestamp=timestamps_arr, size=sizes_arr)
)

print(result_df.shape)
result_df.head()

(22496709, 3)


Unnamed: 0,track_id,timestamp,size
0,1,1572643685901838786,13.683506
1,1,1572643686001682106,13.598927
2,1,1572643686101481026,13.07382
3,1,1572643686201245346,11.914124
4,1,1572643686301017666,10.300927


In [15]:
# Write the table to a CSV file in the working directory, omitting the index column.
result_df.to_csv(path_or_buf="train_zarr_sizes.csv", index=False)