In [None]:
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path

In [10]:
# Episode-metadata parquet file to inspect.
path = "/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-000.parquet"

# Load with Arrow-backed dtypes (dtype_backend="pyarrow") so the nested list
# columns can be read without crashing.
df = pd.read_parquet(
    path,
    engine="pyarrow",
    dtype_backend="pyarrow",
)

In [13]:
# List every column of `df` together with its dtype, one "name: dtype" line each.
for col, col_dtype in df.dtypes.items():
    print(f"{col}: {col_dtype}")


stats/observation.images.zed2i_left/q10: list<element: list<element: list<element: double>>>[pyarrow]
stats/episode_index/q99: list<element: double>[pyarrow]
stats/timestamp/min: list<element: double>[pyarrow]
stats/frame_index/std: list<element: double>[pyarrow]
stats/frame_index/q90: list<element: double>[pyarrow]
stats/task_index/std: list<element: double>[pyarrow]
stats/frame_index/count: list<element: int64>[pyarrow]
stats/observation.images.zed2i_left/q99: list<element: list<element: list<element: double>>>[pyarrow]
stats/observation.images.zed2i_right/count: list<element: int64>[pyarrow]
stats/action/mean: list<element: double>[pyarrow]
stats/observation.state/max: list<element: double>[pyarrow]
stats/observation.images.zedm_right/count: list<element: int64>[pyarrow]
stats/observation.images.zedm_left/max: list<element: list<element: list<element: double>>>[pyarrow]
stats/task_index/q01: list<element: double>[pyarrow]
data/file_index: int64[pyarrow]
stats/episode_index/std: list

In [48]:

dataset_path = Path("/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3")  # Update this path
# Sorted so files are processed (and logged) in a reproducible order; glob()
# itself yields entries in arbitrary directory order.
meta_files = sorted(dataset_path.glob("meta/episodes/**/*.parquet"))

n_cols_updated = 0

for p in meta_files:
    # 1. Read with 'dtype_backend="pyarrow"' to handle nested list columns without crashing
    df = pd.read_parquet(p, engine='pyarrow', dtype_backend='pyarrow')

    # 2. Cast every column whose name ends in "_index"
    for col in df.columns:
        if col.endswith("_index"):
            # We cast to "int64[pyarrow]" to match the backend we used for loading
            # (This is the PyArrow equivalent of the "Int64" nullable type)
            print(f"{p}: Casting {col} to int64[pyarrow]")
            df[col] = df[col].astype("int64[pyarrow]")
            n_cols_updated += 1

    # 3. Save it back to disk, replacing the original file.
    # The frame is written to a sibling temp file first and then moved over `p`,
    # so an interrupted or failed write cannot leave a truncated metadata file
    # in place of the original. The ".tmp" name does not match the "*.parquet"
    # glob above, so a leftover temp file is never picked up as input.
    # To keep the original and save a copy instead, write to a different path
    # than `p` here.
    tmp_path = p.with_name(p.name + ".tmp")
    try:
        df.to_parquet(tmp_path, engine='pyarrow')
        tmp_path.replace(p)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)

print(f"Updated {n_cols_updated} columns in {len(meta_files)} files.")
print("Done! Columns cast and file saved.")


/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting data/chunk_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting episode_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting meta/episodes/file_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting data/file_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting dataset_to_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting meta/episodes/chunk_index to int64[pyarrow]
/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-003.parquet: Casting dataset_from_index to int64[pyarrow]
/home/finn/.cache/h

In [32]:
import pyarrow.parquet as pq

# The two parquet files whose schemas are compared ("File 1" and "File 2").
path1, path2 = (
    "/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-0/meta/episodes/chunk-000/file-000.parquet",
    "/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1_old/meta/episodes/chunk-000/file-002.parquet",
)

def get_schema_dict(path):
    """Map each column of a parquet file to the string form of its type.

    Only the schema is read (via ``pq.read_schema``), not the data.

    Args:
        path: Location of the parquet file.

    Returns:
        A dict ``{column_name: str(type)}``. If anything raises while
        reading, the error is printed and an empty dict is returned instead.
    """
    try:
        parquet_schema = pq.read_schema(path)
        column_types = {}
        for column_name in parquet_schema.names:
            column_types[column_name] = str(parquet_schema.field(column_name).type)
    except Exception as e:
        # Best-effort: report the problem and let the caller see "no columns".
        print(f"Error reading {path}: {e}")
        return {}
    return column_types

schema1 = get_schema_dict(path1)
schema2 = get_schema_dict(path2)

# Get all unique columns from both files
all_cols = sorted(set(schema1) | set(schema2))

# Table layout: the first column is 50 characters wide, split into a fixed
# 12-character marker gutter plus a 38-character column name, so that rows
# with and without the "Mismatch -> " marker line up with the header and the
# 116-character separator (50 + 3 + 30 + 3 + 30).
print(f"{'':<12}{'Column':<38} | {'File 1 Type':<30} | {'File 2 Type':<30}")
print("-" * 116)

for col in all_cols:
    # A column absent from one file is shown as "MISSING" for that file.
    type1 = schema1.get(col, "MISSING")
    type2 = schema2.get(col, "MISSING")

    # Highlight differences
    marker = "Mismatch -> " if type1 != type2 else ""

    # Prints every column, matching or not; the marker is padded so that
    # unmarked rows keep the same width as marked ones.
    print(f"{marker:<12}{col:<38} | {type1:<30} | {type2:<30}")


Column                                             | File 1 Type                    | File 2 Type                   
--------------------------------------------------------------------------------------------------------------------
data/chunk_index                       | int64                          | int64                         
data/file_index                        | int64                          | int64                         
dataset_from_index                     | int64                          | int64                         
dataset_to_index                       | int64                          | int64                         
episode_index                          | int64                          | int64                         
length                                 | int64                          | int64                         
meta/episodes/chunk_index              | int64                          | int64                         
meta/episodes/file_index       

In [51]:
# Reference file that every candidate file is compared against. Its path and
# schema do not depend on the loop variable, so they are read once up front
# instead of once per iteration.
path1 = "/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-0/meta/episodes/chunk-000/file-000.parquet"
schema1 = get_schema_dict(path1)

# Candidate files file-000 .. file-021 of chunk-000.
# NOTE(review): 22 is hard-coded; presumably the number of files in the chunk — confirm.
for i in range(0, 22):
    path2 = f"/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3/meta/episodes/chunk-000/file-{i:03d}.parquet"
    schema2 = get_schema_dict(path2)

    all_cols = sorted(set(schema1) | set(schema2))

    for col in all_cols:
        # A column absent from one file is reported as "MISSING" for that file.
        type1 = schema1.get(col, "MISSING")
        type2 = schema2.get(col, "MISSING")

        # Only differing columns are printed; matching columns are silent, so
        # no output at all means every file agrees with the reference.
        if type1 != type2:
            print(f"{path2}: Mismatch -> {col:<38} | {type1:<30} | {type2:<30}")
            

In [44]:
import pandas as pd
from pathlib import Path

# Path to your dataset's episode metadata files
dataset_path = Path("/home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1")  # Update this path
# Sorted so files are processed (and logged) in a reproducible order; glob()
# itself yields entries in arbitrary directory order.
meta_files = sorted(dataset_path.glob("meta/episodes/**/*.parquet"))

for p in meta_files:
    # Read with pyarrow backend (which handles the complex types)
    df = pd.read_parquet(p, dtype_backend="pyarrow")

    # Write back (pandas will infer standard schema from the data).
    # The frame goes to a sibling temp file first and is then moved over `p`,
    # so a failed or interrupted write cannot leave a truncated file in place
    # of the original. The ".tmp" name does not match the "*.parquet" glob.
    tmp_path = p.with_name(p.name + ".tmp")
    try:
        df.to_parquet(tmp_path)
        tmp_path.replace(p)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)
    print(f"Fixed {p}")

Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-003.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-008.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-000.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-018.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-010.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-007.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-011.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-014.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-000/file-005.parquet
Fixed /home/finn/.cache/huggingface/lerobot/F-Fer/ur-task3-1/meta/episodes/chunk-0