In [2]:
pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (42.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyarrow.parquet as pq

# Read the whole parquet file into an in-memory Arrow table.
parquet_file = "/home/Waymo/1005081002024129653_5313_150_5333_150.parquet"
table = pq.read_table(parquet_file)

# Report how many rows the table holds, followed by its schema.
num_rows = table.num_rows
print(f"Number of rows (frames): {num_rows}")

print(table.schema)

# Pull out the binary image column and count the rows whose payload is
# truthy once converted to a Python object (i.e. neither None nor b"").
image_column_name = "[CameraImageComponent].image"
image_column = table.column(image_column_name)

total_images = sum(
    1 for row_index in range(num_rows) if image_column[row_index].as_py()
)

print(f"Total number of images in the parquet file: {total_images}")

Number of rows (frames): 995
index: string
key.segment_context_name: string
key.frame_timestamp_micros: int64
key.camera_name: int8
[CameraImageComponent].image: binary
[CameraImageComponent].pose.transform: fixed_size_list<item: double>[16]
  child 0, item: double
[CameraImageComponent].velocity.linear_velocity.x: float
[CameraImageComponent].velocity.linear_velocity.y: float
[CameraImageComponent].velocity.linear_velocity.z: float
[CameraImageComponent].velocity.angular_velocity.x: double
[CameraImageComponent].velocity.angular_velocity.y: double
[CameraImageComponent].velocity.angular_velocity.z: double
[CameraImageComponent].pose_timestamp: double
[CameraImageComponent].rolling_shutter_params.shutter: double
[CameraImageComponent].rolling_shutter_params.camera_trigger_time: double
[CameraImageComponent].rolling_shutter_params.camera_readout_done_time: double
-- schema metadata --
pandas: '{"column_indexes": [{"field_name": null, "metadata": {"encoding"' + 3109
Total number of image

In [4]:
import os
import pyarrow.parquet as pq
import cv2
import tensorflow as tf

# Input and output locations. The previous values ("/homeWaymo",
# "/homeWaymo_Images") were missing a path separator and did not match the
# "/home/Waymo/..." file inspected earlier or the "/home/Waymo_Images" folder
# that the de-duplication step reads from.
WAYMO_DATA_DIR = "/home/Waymo"
OUTPUT_DIR = "/home/Waymo_Images"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that the
# image_NNNNNN numbering is the same on every run.
parquet_files = sorted(
    os.path.join(WAYMO_DATA_DIR, f)
    for f in os.listdir(WAYMO_DATA_DIR)
    if f.endswith(".parquet")
)

image_column_name = "[CameraImageComponent].image"

image_count = 0    # images actually written to OUTPUT_DIR
skipped_count = 0  # rows that were empty, failed to decode, or failed to write

for parquet_file in parquet_files:
    # Only the image column is used below, so skip loading every other column.
    table = pq.read_table(parquet_file, columns=[image_column_name])
    image_column = table.column(image_column_name)
    num_rows = table.num_rows

    for i in range(num_rows):
        try:
            image_bytes = image_column[i].as_py()
            if not image_bytes:
                print(f"Warning: No image data found in row {i} of {parquet_file}")
                skipped_count += 1
                continue

            # decode_jpeg yields RGB; OpenCV expects BGR when writing.
            image = tf.image.decode_jpeg(image_bytes).numpy()
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            # Number files by successful writes so the sequence has no gaps.
            image_name = f"image_{image_count + 1:06d}.jpg"
            image_path = os.path.join(OUTPUT_DIR, image_name)

            # cv2.imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(image_path, image):
                print(f"Error writing {image_path} (row {i} of {parquet_file})")
                skipped_count += 1
                continue

            image_count += 1
            if image_count % 100 == 0:
                print(f"Extracted {image_count} images...")

        except tf.errors.InvalidArgumentError as e:
            print(f"Error decoding image {i} from {parquet_file}: {e}")
            skipped_count += 1
        except Exception as e:
            # Best-effort extraction: report the row and keep going.
            print(f"An error occurred processing image {i} from {parquet_file}: {e}")
            skipped_count += 1

print(f"Finished extracting {image_count} images.")
if skipped_count:
    print(f"Skipped {skipped_count} rows (empty, undecodable, or unwritable).")

Extracted 100 images...
Extracted 200 images...
Extracted 300 images...
Extracted 400 images...
Extracted 500 images...
Extracted 600 images...
Extracted 700 images...
Extracted 800 images...
Extracted 900 images...
Extracted 1000 images...
Extracted 1100 images...
Extracted 1200 images...
Extracted 1300 images...
Extracted 1400 images...
Extracted 1500 images...
Extracted 1600 images...
Extracted 1700 images...
Extracted 1800 images...
Extracted 1900 images...
Extracted 2000 images...
Extracted 2100 images...
Extracted 2200 images...
Extracted 2300 images...
Extracted 2400 images...
Extracted 2500 images...
Extracted 2600 images...
Extracted 2700 images...
Extracted 2800 images...
Extracted 2900 images...
Extracted 3000 images...
Extracted 3100 images...
Extracted 3200 images...
Extracted 3300 images...
Extracted 3400 images...
Extracted 3500 images...
Extracted 3600 images...
Extracted 3700 images...
Extracted 3800 images...
Extracted 3900 images...
Extracted 4000 images...
Extracted

In [1]:
import hashlib
import os
import shutil

import cv2

# Folder holding the extracted frames, and the folder that will receive one
# representative file per distinct hash.
image_folder = "/home/Waymo_Images"
unique_images_folder = "/home/Waymo_unique_images"
os.makedirs(unique_images_folder, exist_ok=True)

# Maps each difference hash seen so far to the filename that first produced it.
hashes = dict()

def dhash(image, size=8):
    """Return the difference hash of ``image`` as a Python integer.

    The image is resized with ``cv2.resize`` to ``(size + 1, size)`` and every
    element is compared with its left-hand neighbour along the second axis.
    Each comparison that is True contributes ``2 ** position`` to the result,
    where ``position`` is the index of that comparison in the flattened
    comparison array.
    """
    shrunk = cv2.resize(image, (size + 1, size))
    brighter_than_left = shrunk[:, 1:] > shrunk[:, :-1]

    fingerprint = 0
    for position, is_set in enumerate(brighter_than_left.flatten()):
        if is_set:
            fingerprint += 2 ** position
    return fingerprint

# Walk the extracted frames in sorted order so that, among images sharing a
# hash, the one kept is always the same (the first by filename) rather than
# whichever os.listdir happens to return first.
for filename in sorted(os.listdir(image_folder)):
    img_path = os.path.join(image_folder, filename)
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)

    # cv2.imread returns None instead of raising when a file cannot be decoded.
    if img is None:
        print(f"Warning: could not read {img_path}; skipping")
        continue

    img_hash = dhash(img)

    if img_hash not in hashes:
        hashes[img_hash] = filename
        new_img_path = os.path.join(unique_images_folder, filename)
        # Copy the original file instead of re-encoding the decoded pixels:
        # this avoids another lossy JPEG pass, and a failed copy raises rather
        # than failing silently.
        shutil.copy2(img_path, new_img_path)

print(f"Unique images saved in {unique_images_folder}")


Unique images saved in /home/anirudh/Waymo_unique_images
