In [2]:
!pwd

/Users/dangminh/Desktop/SpatialRGPT


In [3]:
# Apply the project's monkey patch for `transformers.image_transforms.normalize`.
# Assumes transformer_normalize_monkey_patch.py is in llava/train/ and that the
# repository root is importable (e.g. it is the current working directory).
# NOTE(review): what the patch actually changes is defined in that module and is
# not visible from this notebook -- confirm there before relying on it.
from llava.train.transformer_normalize_monkey_patch import patch_normalize_preprocess
patch_normalize_preprocess() # Apply the patch
print("Applied transformers.image_transforms.normalize patch.")

  from .autonotebook import tqdm as notebook_tqdm


Applied transformers.image_transforms.normalize patch.


In [None]:
import torch
import os
import json
from PIL import Image
import numpy as np # For potential visualization/debug
import matplotlib.pyplot as plt # For visualization

# --- Configuration ---
# Every path below is relative, so it is resolved against the notebook's
# current working directory.
_DATASET_ROOT = "PhysicalAI-Spatial-Intelligence-Warehouse"
_SAMPLE_SPLIT_DIR = f"{_DATASET_ROOT}/train_sample"

# Converted JSONL annotations fed to the dataset class.
PROCESSED_JSONL_PATH = f"{_DATASET_ROOT}/formatted_dataset/train_aicity_srgpt.jsonl"
# Base folders for the AI City RGB frames and their depth maps.
RGB_IMAGE_BASE_DIR = f"{_SAMPLE_SPLIT_DIR}/images"
DEPTH_IMAGE_BASE_DIR = f"{_SAMPLE_SPLIT_DIR}/depths"
# Checkpoint that supplies the tokenizer and image processor.
SPATIAL_RGPT_MODEL_PATH = "checkpoints/SpatialRGPT-VILA1.5-8B"

# Ensure you are in the SpatialRGPT directory for relative imports if not installed as a package
# Or add to sys.path if needed:
# import sys
# sys.path.insert(0, '/content/SpatialRGPT') # Adjust if your SpatialRGPT clone is elsewhere

from llava.train.args import DataArguments # Assuming this is how DataArguments is defined
from llava.model.builder import load_pretrained_model # To get tokenizer and image_processor
from llava.data.aicity_dataset import AICityLazySpatialDataset # Your custom class
from llava.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_MASK_TOKEN, DEFAULT_DEPTH_TOKEN
from llava.mm_utils import get_model_name_from_path

# --- 1. Load Tokenizer and Image Processor (from the base model) ---
print(f"Loading tokenizer and image_processor from {SPATIAL_RGPT_MODEL_PATH}...")
model_name = get_model_name_from_path(SPATIAL_RGPT_MODEL_PATH)

# Loader options kept in one place: no base model, no 8-bit / 4-bit quantisation.
_loader_options = {
    "model_path": SPATIAL_RGPT_MODEL_PATH,
    "model_name": model_name,
    "model_base": None,
    "load_8bit": False,
    "load_4bit": False,
}
tokenizer, model, image_processor, context_len = load_pretrained_model(**_loader_options)

# Only the tokenizer and image processor are used below; drop the model
# reference and ask torch to release its cached CUDA memory.
del model
torch.cuda.empty_cache()
print("Tokenizer and Image Processor loaded.")

# --- 2. Setup DataArguments ---
# These arguments are usually passed via command line in train.py.
# Here a small stand-in object carrying the same attribute names is built by hand.
class SimpleDataArgs:
    """Hand-built stand-in for the training-time data arguments.

    Args:
        processor: Image processor to expose as ``image_processor``. When it is
            omitted (``None``), the notebook-level ``image_processor`` variable
            is used, so the original zero-argument ``SimpleDataArgs()`` call
            keeps working.

    Attributes set here: ``image_processor``, ``image_aspect_ratio``,
    ``is_multimodal``, ``mm_use_im_start_end``, ``image_grid_pinpoints``,
    ``use_rle_masks`` and ``vflan_no_system_prompt``.
    """

    def __init__(self, processor=None):
        # Crucial: assign the loaded processor. The global is only looked up
        # when no explicit processor was supplied.
        self.image_processor = image_processor if processor is None else processor
        # Or 'resize', consistent with 3_sft.sh or desired setting. 'pad' is often safer.
        self.image_aspect_ratio = 'pad'
        self.is_multimodal = True
        self.mm_use_im_start_end = False  # Match 3_sft.sh
        # Add other data_args here if AICityLazySpatialDataset or the preprocess
        # functions depend on them (e.g. fields of train.py's DataArguments).
        self.image_grid_pinpoints = None
        self.use_rle_masks = True  # Assuming RLE masks are wanted
        self.vflan_no_system_prompt = True  # From 3_sft.sh, passed to preprocess

data_args = SimpleDataArgs()

# --- 3. Instantiate your AICityLazySpatialDataset ---
print(f"Instantiating AICityLazySpatialDataset with JSONL: {PROCESSED_JSONL_PATH}")

# Constructor arguments gathered in one mapping so the call itself stays short.
_dataset_kwargs = {
    "data_path": PROCESSED_JSONL_PATH,
    "rgb_image_folder": RGB_IMAGE_BASE_DIR,
    "depth_image_folder": DEPTH_IMAGE_BASE_DIR,
    "tokenizer": tokenizer,
    "data_args": data_args,
}

try:
    aicity_dataset = AICityLazySpatialDataset(**_dataset_kwargs)
    print(f"Dataset instantiated. Number of samples: {len(aicity_dataset)}")
except Exception as err:
    # Best effort: report the failure with its traceback and leave a None
    # marker so later cells can tell that no dataset is available.
    print(f"Error instantiating dataset: {err}")
    import traceback
    traceback.print_exc()
    aicity_dataset = None

# --- 4. Inspect a Few Samples ---
# For the first few samples: fetch the processed item via __getitem__, print
# shapes/dtypes, decode the token ids and labels, and sanity-check the image,
# depth and mask tensors against the raw JSONL entry.
import traceback

# Compare against None explicitly. The dataset supports len(), so a plain
# truthiness test would treat an EMPTY dataset as "missing" and skip the
# "No samples" message below.
if aicity_dataset is not None:
    num_samples_to_check = min(3, len(aicity_dataset))
    if num_samples_to_check == 0:
        print("No samples in the dataset to check.")

    for i in range(num_samples_to_check):
        print(f"\n----------- Checking Sample Index: {i} -----------")
        # Captured before the risky __getitem__ call so the error report below
        # never has to index the dataset again (which could raise a second time).
        sample_id = None
        try:
            original_json_sample = aicity_dataset.list_data_dict[i] # Get the raw entry from JSONL
            sample_id = original_json_sample.get('id')
            sample_data_dict = aicity_dataset[i] # This calls __getitem__

            print(f"Original Sample ID from JSONL: {sample_id}")
            print("Sample Data Dictionary Keys:", sample_data_dict.keys())

            # Check input_ids and labels
            print(f"  input_ids shape: {sample_data_dict['input_ids'].shape}, dtype: {sample_data_dict['input_ids'].dtype}")
            decoded_input_ids = tokenizer.decode(sample_data_dict['input_ids'], skip_special_tokens=True)
            print(f"  Decoded input_ids (snippet): {decoded_input_ids[:500]}...") # Print a longer snippet

            # Verify <image>\n and <mask> <depth> substitution
            # NOTE(review): decoding uses skip_special_tokens=True; if the image
            # placeholder is handled as a special token by the tokenizer this
            # warning may fire even for correct samples -- confirm.
            if not decoded_input_ids.strip().startswith(DEFAULT_IMAGE_TOKEN + "\n"):
                print(f"  WARNING: Decoded input_ids DO NOT start with '{DEFAULT_IMAGE_TOKEN}\\n'")
            if "<mask> <depth>" not in decoded_input_ids:
                # The placeholders may already have been turned into special tokens,
                # so fall back to the first human turn of the raw JSONL entry.
                original_human_q = ""
                for conv_turn in original_json_sample.get('conversations') or []:
                    if conv_turn.get('from') == 'human':
                        original_human_q = conv_turn.get('value') or ""
                        break
                if "<mask> <depth>" not in original_human_q: # This checks pre-tokenization from your JSONL
                    print(f"  WARNING: '<mask> <depth>' not found in human question from JSONL: {original_human_q[:100]}")

            print(f"  labels shape: {sample_data_dict['labels'].shape}, dtype: {sample_data_dict['labels'].dtype}")
            # Only positions that are not IGNORE_INDEX are decoded.
            valid_labels_indices = sample_data_dict['labels'] != IGNORE_INDEX
            valid_labels = sample_data_dict['labels'][valid_labels_indices]
            print(f"  Decoded valid labels (snippet): {tokenizer.decode(valid_labels[:50], skip_special_tokens=True)}...")

            # Check image tensor (.get so a missing key is reported like a None value)
            image_tensor = sample_data_dict.get('image')
            if image_tensor is not None:
                print(f"  image tensor shape: {image_tensor.shape}, dtype: {image_tensor.dtype}")
                assert image_tensor.ndim == 4, "Image tensor should be [1, C, H, W]"
                assert image_tensor.shape[1] == 3, "Image tensor should have 3 channels"
            else:
                print("  ERROR: image tensor is None!")

            # Check depths tensor
            depths_tensor = sample_data_dict.get('depths')
            if depths_tensor is not None:
                print(f"  depths tensor shape: {depths_tensor.shape}, dtype: {depths_tensor.dtype}")
                assert depths_tensor.ndim == 4, "Depth tensor should be [1, C, H, W]"
                # For SigLIP/CLIP processors, depth is usually converted to 3 channels.
                assert depths_tensor.shape[1] == 3, "Depth tensor should have 3 channels after processing"
            else:
                print("  ERROR: depths tensor is None!")

            # Check masks tensor
            masks_tensor = sample_data_dict.get('masks')
            # `or []` also covers an explicit null 'rle' value in the JSONL entry.
            source_rles = original_json_sample.get('rle') or []
            if masks_tensor is not None:
                print(f"  masks tensor shape: {masks_tensor.shape}, dtype: {masks_tensor.dtype}")
                num_rle_original = len(source_rles)
                assert masks_tensor.shape[0] == num_rle_original, \
                    f"Mismatch in mask count: tensor has {masks_tensor.shape[0]}, RLEs in source: {num_rle_original}"
                assert masks_tensor.ndim == 3, "Masks tensor should be [num_masks, H_proc, W_proc]"
            elif source_rles: # If RLEs were present but masks tensor is None
                print("  ERROR: masks tensor is None, but RLEs were present in the source JSONL!")
            else: # No RLEs in source, so masks tensor being None is OK
                print("  masks tensor: None (as expected, no RLEs in source JSONL for this sample)")

        except Exception as e:
            # Failed assertions land here as well; report and move on to the next sample.
            print(f"ERROR checking sample {i} (ID: {sample_id}): {e}")
            traceback.print_exc()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'accelerate'

In [None]:
import random

# Visual spot-check: show the processed RGB image and depth map of one random
# sample side by side in a single figure.
if aicity_dataset is None or len(aicity_dataset) == 0:
    # The dataset cell sets aicity_dataset to None when instantiation failed.
    print("No dataset samples available to visualize.")
else:
    # randrange excludes the upper bound; randint(0, len(...)) could return
    # len(...) itself and index one past the end of the dataset.
    i = random.randrange(len(aicity_dataset))
    sample_data_dict = aicity_dataset[i]

    if sample_data_dict.get('depths') is not None:
        # First entry of the leading dimension, moved to channels-last for imshow.
        # .float() lets half / bfloat16 tensors be converted to NumPy.
        depth_to_show = sample_data_dict['depths'][0].float().cpu().numpy().transpose(1, 2, 0)
        # Assuming the processor normalised the values; denormalise here if needed.
        # A 3-channel depth map is presumably a duplicated single channel, so one
        # channel is shown; a single-channel map is squeezed the same way because
        # imshow does not accept an (H, W, 1) array.
        if depth_to_show.shape[2] in (1, 3):
            depth_to_show = depth_to_show[:, :, 0]
        image_to_show = sample_data_dict['image'][0].float().cpu().numpy().transpose(1, 2, 0)

        # One figure with two axes and a single show() call. Calling plt.show()
        # between two plt.subplot() calls would render the panels as separate figures.
        fig, (ax_rgb, ax_depth) = plt.subplots(1, 2, figsize=(10, 5))
        ax_rgb.imshow(image_to_show)
        ax_rgb.set_title(f"Processed RGB - Sample {i}")
        ax_depth.imshow(depth_to_show, cmap='gray')
        ax_depth.set_title(f"Processed Depth - Sample {i}")
        plt.show()
    else:
        print(f"Sample {i} has no 'depths' tensor to visualize.")