### Basic Installation

In [None]:
!python3 -m pip install gcsfs waymo-open-dataset-tf-2-11-0==1.6.1
!python3 -m pip install "notebook>=5.3" "ipywidgets>=7.5"

### Python Version

In [2]:
# Prints the version of the `python` executable found on the shell PATH.
# NOTE(review): this is not necessarily the interpreter the kernel runs on — confirm.
!python --version

Python 3.9.18


### Install TensorFlow

In [None]:
!pip install tensorflow

### Initial Setup

In [None]:
#@title Initial setup
from typing import Optional
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import tensorflow as tf
import dask.dataframe as dd

# Root directory of the dataset; `read` below globs `<dataset_dir>/<tag>/<file_name>`.
dataset_dir = '/scratch/gpfs/ms90/dataset_prep'

# NOTE(review): not referenced by any cell visible here — presumably a segment
# context name kept for ad-hoc inspection; confirm or remove.
context_name = '10017090168044687777_6380_000_6400_000'

def read(tag: str, file_name: str) -> dd.DataFrame:
  """Creates a Dask DataFrame for the component specified by its tag.

  Args:
    tag: Name of the component sub-folder under `dataset_dir`
      (e.g. 'camera_image').
    file_name: File name, or glob pattern, to look up inside that sub-folder.

  Returns:
    A lazy Dask DataFrame over every parquet file matching the pattern.

  Raises:
    FileNotFoundError: If nothing matches `<dataset_dir>/<tag>/<file_name>`.
  """
  pattern = f'{dataset_dir}/{tag}/{file_name}'
  paths = tf.io.gfile.glob(pattern)
  # Fail here with the offending pattern instead of handing an empty list to
  # dd.read_parquet, where the resulting error does not say what was missing.
  if not paths:
    raise FileNotFoundError(f'No parquet files match {pattern!r}')
  print(paths)
  return dd.read_parquet(paths)


### Image Processing - UNIFIED

In [None]:
from PIL import Image
import io
import os
import itertools

base_folder_path = '/scratch/gpfs/ms90/dataset_prep'

# The segmentation folder drives the loop: every file listed there is read
# from both 'camera_segmentation' and 'camera_image' under the same name.
folder_path = os.path.join(base_folder_path, 'camera_segmentation')

output_base_path = os.path.join(base_folder_path, 'waymo_images_4k_new')
output_seg_path = os.path.join(base_folder_path, 'waymo_segmaps_4k_new')

# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(output_base_path, exist_ok=True)
os.makedirs(output_seg_path, exist_ok=True)

# Values of 'key.camera_name' that are kept from both tables.
camera_ids = [1, 2, 3]

# Rows are sampled with a step of (row count // stride_divisor), so at least
# `stride_divisor` rows are exported whenever a file has that many.
stride_divisor = 4


def _save_by_dimension(image, root_dir, out_name):
    """Saves `image` as `<root_dir>/<width>x<height>/<out_name>`, creating the folder if needed."""
    width, height = image.size
    target_dir = os.path.join(root_dir, f"{width}x{height}")
    os.makedirs(target_dir, exist_ok=True)
    image.save(os.path.join(target_dir, out_name))


# Sorted so that repeated runs visit the files in the same order.
files = sorted(os.listdir(folder_path))

for file_name in files:
    # Skip hidden entries (names starting with a dot).
    if file_name.startswith('.'):
        continue

    cam_image_df = read('camera_image', file_name)
    cam_seg_df = read('camera_segmentation', file_name)
    img_df = cam_image_df.loc[cam_image_df['key.camera_name'].isin(camera_ids)]
    seg_df = cam_seg_df.loc[cam_seg_df['key.camera_name'].isin(camera_ids)]

    # islice rejects a step of 0, which is what the integer division yields
    # for fewer than `stride_divisor` rows; clamp so small files export every row.
    seg_row_count = seg_df.shape[0].compute()
    stride = max(1, seg_row_count // stride_divisor)

    for i, seg_row in itertools.islice(seg_df.iterrows(), 0, None, stride):
        # Assumes the index value is a string using ';' as separator — TODO confirm.
        img_key = i.replace(';', '_')
        image_cam_id = seg_row['key.camera_name']
        out_name = f'cam_img_k={img_key}_c={image_cam_id}.png'

        # Find the camera image taken by the same camera, in the same segment,
        # at the same timestamp as this segmentation label.
        matching_img_df = img_df.loc[
            (img_df['key.camera_name'] == image_cam_id) &
            (img_df['key.segment_context_name'] == seg_row['key.segment_context_name']) &
            (img_df['key.frame_timestamp_micros'] == seg_row['key.frame_timestamp_micros'])
        ]

        # Look the image up before writing anything, so a label without a
        # matching image is skipped instead of raising StopIteration and
        # leaving a segmentation map with no counterpart on disk.
        match = next(iter(matching_img_df.iterrows()), None)
        if match is None:
            print(f'No camera image found for segmentation row {i!r}; skipping.')
            continue
        _, img_row = match

        cam_seg_raw = seg_row['[CameraSegmentationLabelComponent].panoptic_label']
        with Image.open(io.BytesIO(cam_seg_raw)) as seg_im:
            _save_by_dimension(seg_im, output_seg_path, out_name)

        cam_img_raw = img_row['[CameraImageComponent].image']
        with Image.open(io.BytesIO(cam_img_raw)) as cam_im:
            _save_by_dimension(cam_im, output_base_path, out_name)