# Fog-X Demo

In this demo, we show how to use Fog-X to collect and manage your robotics learning dataset. 

In [1]:
import fog_x 

# Create the Fog-X dataset handle that every later cell in this demo operates on.
# NOTE(review): presumably an existing dataset at `path` is reopened rather than
# overwritten — confirm, since that determines whether re-running appends data.
dataset = fog_x.dataset.Dataset(
    name="demo_ds",
    path="~/test_dataset",
)

## Loading From Existing Open-X/RT-X datasets

In [2]:
# Import episodes from the "berkeley_autolab_ur5" Open-X/RT-X dataset.
# `split` looks like a TFDS-style slice, so "train[:3]" should select the first
# three training episodes — TODO confirm.
# NOTE(review): the saved metadata output further down lists episode ids 0–49,
# which suggests the dataset already held episodes from earlier runs.
dataset.load_rtx_episodes(
    name="berkeley_autolab_ur5",
    split="train[:3]",
)

2024-04-10 04:06:01.675729: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-10 04:06:02.506547: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-10 04:06:02.506679: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-10 04:06:02.683658: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-10 04:06:03.039133: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-10 04:06:03.044046: I tensorflow/core/platform/cpu_feature_guard.cc:1

### Trajectory Metadata and Data

Fog-X makes a distinction between trajectory metadata and the actual data. 
* **Metadata**: information that is consistent across a certain trajectory, such as language command, tags
* **Data**: data for individual steps within a trajectory

In [3]:
# Per-episode metadata table; the bare name on the last line displays it as the cell output.
trajectory_metadata = dataset.get_episode_info()
trajectory_metadata

episode_id,Finished,feature_gripper_closedness_action_type,feature_gripper_closedness_action_shape,gripper_closedness_action_count,feature_rotation_delta_type,feature_rotation_delta_shape,rotation_delta_count,feature_terminate_episode_type,feature_terminate_episode_shape,terminate_episode_count,feature_world_vector_type,feature_world_vector_shape,world_vector_count,feature_is_first_type,feature_is_first_shape,is_first_count,feature_is_last_type,feature_is_last_shape,is_last_count,feature_is_terminal_type,feature_is_terminal_shape,is_terminal_count,feature_hand_image_type,feature_hand_image_shape,hand_image_count,feature_image_type,feature_image_shape,image_count,feature_image_with_depth_type,feature_image_with_depth_shape,image_with_depth_count,feature_natural_language_embedding_type,feature_natural_language_embedding_shape,natural_language_embedding_count,feature_natural_language_instruction_type,feature_natural_language_instruction_shape,natural_language_instruction_count,feature_robot_state_type,feature_robot_state_shape,robot_state_count,feature_reward_type,feature_reward_shape,reward_count
i64,bool,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64,str,str,f64
0,true,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""float32""","""(480, 640, 1)""",71.0,"""float32""","""(512,)""",71.0,"""string""","""()""",71.0,"""float32""","""(15,)""",71.0,"""float32""","""()""",71.0
1,true,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""float32""","""(480, 640, 1)""",76.0,"""float32""","""(512,)""",76.0,"""string""","""()""",76.0,"""float32""","""(15,)""",76.0,"""float32""","""()""",76.0
2,true,"""float32""","""()""",81.0,"""float32""","""(3,)""",81.0,"""float32""","""()""",81.0,"""float32""","""(3,)""",81.0,"""bool""","""()""",81.0,"""bool""","""()""",81.0,"""bool""","""()""",81.0,"""uint8""","""(480, 640, 3)""",81.0,"""uint8""","""(480, 640, 3)""",81.0,"""float32""","""(480, 640, 1)""",81.0,"""float32""","""(512,)""",81.0,"""string""","""()""",81.0,"""float32""","""(15,)""",81.0,"""float32""","""()""",81.0
3,true,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""float32""","""(480, 640, 1)""",71.0,"""float32""","""(512,)""",71.0,"""string""","""()""",71.0,"""float32""","""(15,)""",71.0,"""float32""","""()""",71.0
4,true,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""float32""","""(480, 640, 1)""",76.0,"""float32""","""(512,)""",76.0,"""string""","""()""",76.0,"""float32""","""(15,)""",76.0,"""float32""","""()""",76.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
46,true,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""float32""","""(480, 640, 1)""",76.0,"""float32""","""(512,)""",76.0,"""string""","""()""",76.0,"""float32""","""(15,)""",76.0,"""float32""","""()""",76.0
47,true,"""float32""","""()""",81.0,"""float32""","""(3,)""",81.0,"""float32""","""()""",81.0,"""float32""","""(3,)""",81.0,"""bool""","""()""",81.0,"""bool""","""()""",81.0,"""bool""","""()""",81.0,"""uint8""","""(480, 640, 3)""",81.0,"""uint8""","""(480, 640, 3)""",81.0,"""float32""","""(480, 640, 1)""",81.0,"""float32""","""(512,)""",81.0,"""string""","""()""",81.0,"""float32""","""(15,)""",81.0,"""float32""","""()""",81.0
48,true,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""float32""","""()""",71.0,"""float32""","""(3,)""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""bool""","""()""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""uint8""","""(480, 640, 3)""",71.0,"""float32""","""(480, 640, 1)""",71.0,"""float32""","""(512,)""",71.0,"""string""","""()""",71.0,"""float32""","""(15,)""",71.0,"""float32""","""()""",71.0
49,true,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""float32""","""()""",76.0,"""float32""","""(3,)""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""bool""","""()""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""uint8""","""(480, 640, 3)""",76.0,"""float32""","""(480, 640, 1)""",76.0,"""float32""","""(512,)""",76.0,"""string""","""()""",76.0,"""float32""","""(15,)""",76.0,"""float32""","""()""",76.0


In [4]:
# Step-level data for ALL trajectories.
# The data is loaded lazily, so only the data that is actively used is brought into memory.
all_step_data = dataset.get_step_data()
# Use .describe() to get a summary of the information.
all_step_data.describe() 

: 

### Lazy Loading Step Data
All the step data is loaded on demand to save memory. The two cells below let you compare the loading time of lazy loading against loading all the data from disk.

In [None]:
# Time fetching the step data of a few individual episodes.
# NOTE(review): presumably this returns a lazy frame by default, given the
# as_lazy_frame=False variant timed in the next cell — confirm.
%timeit dataset.get_step_data_by_episode_ids([1,2,3])

In [None]:
# Same query with as_lazy_frame=False, for comparison against the previous timing.
%timeit dataset.get_step_data_by_episode_ids([1,2,3], as_lazy_frame=False)

## Data Analytics and Management


### Example 1: Add new Episode information metadata and Filter

Suppose another person collects a second set of data and you want to keep track of who collected what.


In [None]:
# Load another two episodes ("train[3:5]") and attach extra per-episode metadata
# (collector and custom_tag) so they can be told apart from the earlier ones.
dataset.load_rtx_episodes(
    name="berkeley_autolab_ur5",
    split="train[3:5]",
    additional_metadata={"collector": "User 2", "custom_tag": "Partition_2"},
)

Now the metadata table looks like this:

In [None]:
# Show only the episode id and the two custom metadata columns.
dataset.get_episode_info().select(["episode_id", "collector", "custom_tag"])

In [None]:
episode_info = dataset.get_episode_info()
# Filter on a collector value that no loaded episode was tagged with, then pass
# the filtered metadata to read_by.
# NOTE(review): presumably this yields no episodes — confirm how read_by behaves
# on an empty selection.
metadata = episode_info.filter(episode_info["collector"] == "User_Do_No_Exist")
episodes = dataset.read_by(metadata)

In [None]:
# Select the episodes tagged "Partition_2" and read their data.
# Relies on `episode_info` assigned in the previous cell.
metadata = episode_info.filter(episode_info["custom_tag"] == "Partition_2")
episodes = dataset.read_by(metadata)
# Display the returned collection together with a summary of its first element.
episodes, episodes[0].describe()

### Example 2: Extract and search natural language instructions from step data

Existing Open-X datasets store the natural language instruction on every step, which is inefficient and adds management complexity. This example shows
1. how to extract the natural language instruction from existing Open-X datasets
2. how to search it for keywords or a **regex**

In [None]:
# Describe the query on the lazy step table first: only the episode id and the
# language column are of interest here.
instruction_query = dataset.get_step_data().select(
    "episode_id", "natural_language_instruction"
)
# Nothing is evaluated until collect() is called on the lazy frame.
id_to_language_instruction = instruction_query.collect()

# Display one row per distinct natural_language_instruction, in first-seen order.
# https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.unique.html
id_to_language_instruction.unique(
    subset=["natural_language_instruction"],
    maintain_order=True,
)

In [None]:
# Lazy frame over the entire step-level dataset.
all_step_data = dataset.get_step_data()

# The language instruction is stored on every step, so collapse the steps to a
# single row per episode. The instruction is the same for all steps of an
# episode, which is why simply keeping the last one is enough.
instruction_per_episode = (
    all_step_data
    .select("episode_id", "natural_language_instruction")
    .group_by("episode_id")
    .last()
)
# The lazy query only runs once collect() is called.
id_to_language_instruction = instruction_per_episode.collect()

# Attach the instruction to the per-episode metadata table.
episode_info_table = dataset.get_episode_info()
episode_metadata = episode_info_table.join(id_to_language_instruction, on="episode_id")

In [None]:
import polars as pl 
# Decode the byte-string instructions to text and store them in a new "decoded"
# column. Passing return_dtype explicitly tells polars the output is a string
# column instead of leaving the dtype to inference.
episode_metadata = episode_metadata.with_columns(
    episode_metadata["natural_language_instruction"]
    .map_elements(lambda x: x.decode("utf-8"), return_dtype=pl.Utf8)
    .alias("decoded")
)

# Keep only the episodes whose instruction mentions "green" or "red".
# str.contains interprets the pattern as a regular expression by default.
result = episode_metadata.filter(
    pl.col("decoded").str.contains("green|red")
)
print(result.select(["episode_id", "decoded"]))

We use polars as the backend for data processing and management. This example demonstrates its capability and flexibility. Please refer to https://docs.pola.rs/py-polars/html/reference/lazyframe/index.html for all the available interfaces.

## Use, Export and Share

### As Open-X dataset 
In tensorflow rlds dataset format

In [None]:
# Export the dataset in the "rtx" (Open-X) format.
# NOTE(review): the export destination is not visible in this notebook — confirm
# where the files are written.
dataset.export(format="rtx")

### Huggingface dataset 

In [None]:
import datasets

# Obtain the dataset as a Hugging Face dataset object and print its summary.
huggingface_ds = dataset.get_as_huggingface_dataset()

print(f"Hugging face dataset: {huggingface_ds}")

### Pytorch Dataset

In [None]:
import torch 

# Build a PyTorch dataset from the Fog-X dataset, driven by the pandas view of
# the metadata.
episode_metadata_df = dataset.get_metadata_as_pandas_df()
pytorch_ds = dataset.pytorch_dataset_builder(metadata=episode_metadata_df)

# Draw samples in random order, two per batch. The identity collate_fn hands
# each batch back as the plain list of samples instead of trying to stack them.
loader = torch.utils.data.DataLoader(
    pytorch_ds,
    batch_size=2,
    collate_fn=lambda x: x,
    sampler=torch.utils.data.RandomSampler(pytorch_ds),
)

for data in loader:
    print(data)


In [None]:
# Enable IPython's autoreload extension; mode 2 reloads modules before each cell runs.
# NOTE(review): this is the last cell, so it has no effect during a fresh
# top-to-bottom run; it is conventionally placed in the first cell.
%load_ext autoreload
%autoreload 2