In [2]:
!pwd

/Users/dangminh/Desktop/SpatialRGPT


In [1]:
import torch
import os
import json
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt # For optional visualization
import sys

# --- Configuration: Update these paths for your local machine ---
# All relative paths below are resolved against the kernel's current working
# directory at the time the cell runs.
# NOTE(review): the `!pwd` output recorded in this notebook already ends in
# ".../SpatialRGPT"; if that is the working directory, "./SpatialRGPT" resolves
# to a nested ".../SpatialRGPT/SpatialRGPT" folder — confirm this is intended.
SPATIAL_RGPT_REPO_PATH = "./SpatialRGPT"  # Path to your cloned SpatialRGPT repository
PROCESSED_JSONL_PATH = "./test_data/aicity_srgpt_sample.jsonl" # Path to your small test JSONL
# Folder holding the RGB images referenced by the JSONL records.
RGB_IMAGE_BASE_DIR = "./test_data/aicity_sample_data/train/images/"
# Folder holding the depth images that accompany the RGB images.
DEPTH_IMAGE_BASE_DIR = "./test_data/aicity_sample_data/train/depths/"
# Model path for tokenizer & image_processor (can be a local path if you downloaded, or HF ID)
BASE_MODEL_PATH_FOR_CONFIG = "a8cheng/SpatialRGPT-VILA1.5-8B" 

# Add SpatialRGPT to Python path for imports
# Inserted at position 0 so the repository's `llava` package takes precedence
# over any other `llava` importable from the environment.
sys.path.insert(0, os.path.abspath(SPATIAL_RGPT_REPO_PATH))

# Project imports are guarded so a wrong repository path produces a readable
# message. ModuleNotFoundError is a subclass of ImportError, so a missing
# third-party dependency pulled in by these modules is reported here as well.
try:
    from llava.train.args import DataArguments
    from llava.model.builder import load_pretrained_model
    from llava.data.aicity_dataset import AICityLazySpatialDataset # Your custom class
    from llava.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN
    from llava.mm_utils import get_model_name_from_path
    from llava.train.transformer_normalize_monkey_patch import patch_normalize_preprocess
except ImportError as e:
    print(f"Error importing from LLaVA/SpatialRGPT. Ensure SPATIAL_RGPT_REPO_PATH is correct and it's on sys.path: {e}")
    # sys.exit raises SystemExit, aborting the rest of this cell/script.
    sys.exit(1)

def create_dummy_data_if_not_exists():
    """Create a tiny placeholder dataset on disk when the test paths are missing.

    Side effects (existing files are never overwritten):
      * ensures the directory of ``PROCESSED_JSONL_PATH`` and the
        ``RGB_IMAGE_BASE_DIR`` / ``DEPTH_IMAGE_BASE_DIR`` folders exist;
      * writes a two-record JSONL file to ``PROCESSED_JSONL_PATH`` if absent;
      * writes one 100x100 blue RGB PNG and one 100x100 grayscale depth PNG
        (``<base>_depth.png``) per dummy record if absent.

    Image creation is best-effort: a failure for one image is printed and the
    remaining images are still attempted.
    """
    # Single source of truth for the dummy image names, used both in the JSONL
    # records and for the image files so the two cannot drift apart.
    dummy_base_names = [f"dummy_rgb_{i}" for i in range(2)]

    # os.makedirs("") raises FileNotFoundError, so only create the JSONL's
    # parent directory when the path actually has a directory component.
    jsonl_dir = os.path.dirname(PROCESSED_JSONL_PATH)
    if jsonl_dir:
        os.makedirs(jsonl_dir, exist_ok=True)
    os.makedirs(RGB_IMAGE_BASE_DIR, exist_ok=True)
    os.makedirs(DEPTH_IMAGE_BASE_DIR, exist_ok=True)

    if not os.path.exists(PROCESSED_JSONL_PATH):
        print(f"Creating dummy JSONL: {PROCESSED_JSONL_PATH}")
        # NOTE(review): the "counts" strings are structural placeholders, not
        # real run-length encodings — confirm the dataset class tolerates them.
        dummy_data = [
            {
                "id": "dummy_sample_0", "image_base_filename": dummy_base_names[0],
                "conversations": [{"from": "human", "value": "<image>\nTest question with <mask> <depth>"}, {"from": "gpt", "value": "Test answer."}],
                "rle": [{"size": [100,100], "counts": "RLE0"}] # Simplified RLE for structure
            },
            {
                "id": "dummy_sample_1", "image_base_filename": dummy_base_names[1],
                "conversations": [{"from": "human", "value": "<image>\nAnother <mask> <depth> and <mask> <depth>"}, {"from": "gpt", "value": "Another answer."}],
                "rle": [{"size": [100,100], "counts": "RLE1"}, {"size": [100,100], "counts": "RLE2"}]
            }
        ]
        # Explicit encoding keeps the file identical across platforms.
        with open(PROCESSED_JSONL_PATH, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(entry) + '\n' for entry in dummy_data)

    for i, base_name in enumerate(dummy_base_names):
        try:
            img_path = os.path.join(RGB_IMAGE_BASE_DIR, f"{base_name}.png")
            depth_path = os.path.join(DEPTH_IMAGE_BASE_DIR, f"{base_name}_depth.png")
            if not os.path.exists(img_path):
                Image.new('RGB', (100, 100), color='blue').save(img_path)
                print(f"Created dummy RGB image: {img_path}")
            if not os.path.exists(depth_path):
                Image.new('L', (100, 100), color=128).save(depth_path) # Grayscale
                print(f"Created dummy Depth image: {depth_path}")
        except Exception as e:
            # Deliberately best-effort: report and keep going with the next image.
            print(f"Could not create dummy image {i}: {e}")

def _load_tokenizer_and_image_processor(model_path):
    """Return ``(tokenizer, image_processor)`` for ``model_path``.

    The builder also returns the full model; it is discarded immediately
    because only the tokenizer and image processor are needed here.

    The model name is derived with ``get_model_name_from_path`` and passed by
    keyword. NOTE(review): this assumes the builder takes a ``model_name``
    parameter (as the upstream VILA/LLaVA builders do) — confirm against this
    repository's ``llava/model/builder.py``.
    """
    tokenizer, model_temp, image_processor, _ = load_pretrained_model(
        model_path,
        model_name=get_model_name_from_path(model_path),
        model_base=None,
        load_8bit=False,
        load_4bit=False,
        trust_remote_code=True,
    )
    del model_temp  # Free up memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return tokenizer, image_processor


def _tensor_summary(label, tensor):
    """Format shape, dtype and value range of ``tensor`` as one report line.

    Values are cast to float before taking min/max so integer and boolean
    tensors format the same way; an empty tensor reports ``n/a`` instead of
    raising.
    """
    if tensor.numel() == 0:
        value_range = "min: n/a, max: n/a"
    else:
        as_float = tensor.float()
        value_range = f"min: {as_float.min().item():.2f}, max: {as_float.max().item():.2f}"
    return f"  {label} shape: {tensor.shape}, dtype: {tensor.dtype}, {value_range}"


def _check_sample(dataset, tokenizer, index):
    """Print diagnostics for ``dataset[index]`` and assert the expected tensor layout.

    Raises:
        AssertionError: when the image, depth or mask tensor does not have the
            expected rank / leading dimensions.
        Exception: anything raised by the dataset's ``__getitem__``.
    """
    sample_data_dict = dataset[index]  # Calls __getitem__
    original_json_sample = dataset.list_data_dict[index]

    print(f"Original Sample ID from JSONL: {original_json_sample.get('id')}")
    print(f"Original 'image_base_filename': {original_json_sample.get('image_base_filename')}")
    print("Sample Data Dictionary Keys:", sample_data_dict.keys())

    # Check input_ids and labels
    input_ids = sample_data_dict['input_ids']
    labels = sample_data_dict['labels']
    print(f"  input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}")

    decoded_input_ids = robust_decode_local(tokenizer, input_ids)
    print(f"  Decoded input_ids (snippet): {decoded_input_ids[:300]}...")  # First 300 chars

    # The prompt markers are checked on the raw JSONL text rather than on the
    # decoded ids, because the image placeholder is not decodable text.
    first_human_turn_jsonl = next(
        (
            turn.get("value", "")
            for turn in original_json_sample.get("conversations", [])
            if turn.get("from") == "human"
        ),
        "",
    )

    # Only leading whitespace is stripped: stripping the tail as well would
    # remove the very newline this check is looking for when the turn is
    # exactly "<image>\n".
    if not first_human_turn_jsonl.lstrip().startswith(DEFAULT_IMAGE_TOKEN + "\n"):
        print(f"  WARNING (JSONL Check): First human turn in JSONL DOES NOT start with '{DEFAULT_IMAGE_TOKEN}\\n'. Found: '{first_human_turn_jsonl[:50]}...'")
    if "<mask> <depth>" not in first_human_turn_jsonl:
        print(f"  WARNING (JSONL Check): '<mask> <depth>' not found in human question from JSONL. Found: '{first_human_turn_jsonl[:100]}...'")

    print(f"  labels shape: {labels.shape}, dtype: {labels.dtype}")
    # Only positions that contribute to the loss are decoded.
    valid_labels = labels[labels != IGNORE_INDEX]
    decoded_valid_labels = robust_decode_local(tokenizer, valid_labels)
    print(f"  Decoded valid labels (snippet): {decoded_valid_labels[:200]}...")

    # Check image tensor
    img_tensor = sample_data_dict.get('image')
    if img_tensor is not None:
        print(_tensor_summary("image tensor", img_tensor))
        assert img_tensor.ndim == 4 and img_tensor.shape[0] == 1 and img_tensor.shape[1] == 3, "Image tensor shape error"
    else:
        print("  ERROR: image tensor is None!")

    # Check depths tensor
    depth_tensor = sample_data_dict.get('depths')
    if depth_tensor is not None:
        print(_tensor_summary("depths tensor", depth_tensor))
        assert depth_tensor.ndim == 4 and depth_tensor.shape[0] == 1 and depth_tensor.shape[1] == 3, "Depth tensor shape error"
    else:
        print("  ERROR: depths tensor is None!")

    # Check masks tensor
    masks_tensor = sample_data_dict.get('masks')
    if masks_tensor is not None:
        print(_tensor_summary("masks tensor", masks_tensor))
        num_rle_original = len(original_json_sample.get('rle', []))
        assert masks_tensor.shape[0] == num_rle_original, f"Mask count mismatch: {masks_tensor.shape[0]} vs {num_rle_original}"
        assert masks_tensor.ndim == 3, "Masks tensor shape error"  # [num_masks, H_proc, W_proc]
    elif original_json_sample.get('rle'):
        print("  ERROR: masks tensor is None, but RLEs present in JSONL!")
    else:
        print("  masks tensor: None (no RLEs in JSONL)")


def main():
    """Sanity-check ``AICityLazySpatialDataset`` on a small JSONL sample.

    Steps: apply the normalisation patch, load the tokenizer and image
    processor, create dummy data if the configured paths are missing,
    instantiate the dataset and inspect up to three samples. Failures are
    printed (with a traceback) rather than raised, so a problem in one sample
    does not hide problems in the next.
    """
    import traceback

    # Apply the normalization patch for consistency with training
    patch_normalize_preprocess()
    print("Applied transformers.image_transforms.normalize patch.")

    # 1. Load Tokenizer and Image Processor (minimal parts of the model)
    print(f"Loading tokenizer and image_processor from {BASE_MODEL_PATH_FOR_CONFIG}...")
    try:
        tokenizer, image_processor = _load_tokenizer_and_image_processor(BASE_MODEL_PATH_FOR_CONFIG)
        print("Tokenizer and Image Processor loaded successfully.")
    except Exception as e:
        print(f"Error loading tokenizer/image_processor from {BASE_MODEL_PATH_FOR_CONFIG}: {e}")
        print("Please ensure the model path is correct and you have an internet connection if it's a Hugging Face ID.")
        # The hint above only covers path/connectivity problems; the traceback
        # shows the real cause for everything else.
        traceback.print_exc()
        return

    # 2. Setup DataArguments
    # Mimic DataArguments from llava.train.args or what AICityLazySpatialDataset expects
    class MinimalDataArgs:
        def __init__(self, processor):
            self.image_processor = processor
            self.image_aspect_ratio = 'pad'  # Critical: Test with 'pad' or 'resize'
            self.is_multimodal = True
            self.mm_use_im_start_end = False  # As per 3_sft.sh
            # Add any other attributes from DataArguments your dataset class or preprocess functions use
            self.image_grid_pinpoints = None
            self.use_rle_masks = True  # Assuming your dataset class uses this
            self.vflan_no_system_prompt = True  # if your preprocess expects this

    data_args = MinimalDataArgs(image_processor)

    # Create dummy data if your test paths don't exist
    create_dummy_data_if_not_exists()

    # 3. Instantiate your AICityLazySpatialDataset
    print(f"\nInstantiating AICityLazySpatialDataset with JSONL: {PROCESSED_JSONL_PATH}")
    try:
        aicity_dataset = AICityLazySpatialDataset(
            data_path=PROCESSED_JSONL_PATH,
            rgb_image_folder=RGB_IMAGE_BASE_DIR,
            depth_image_folder=DEPTH_IMAGE_BASE_DIR,
            tokenizer=tokenizer,
            data_args=data_args
        )
        print(f"Dataset instantiated. Number of samples: {len(aicity_dataset)}")
    except Exception as e:
        print(f"ERROR instantiating AICityLazySpatialDataset: {e}")
        traceback.print_exc()
        return

    # 4. Inspect a Few Samples
    if len(aicity_dataset) == 0:
        print("No samples in dataset to check.")
        return

    num_samples_to_check = min(3, len(aicity_dataset))
    print(f"\nWill check {num_samples_to_check} samples...")

    for i in range(num_samples_to_check):
        print(f"\n----------- Checking Sample Index: {i} -----------")
        try:
            _check_sample(aicity_dataset, tokenizer, i)
        except Exception as e:
            # Includes AssertionError from the shape checks; report and move on.
            print(f"ERROR during sample {i} (ID: {aicity_dataset.list_data_dict[i].get('id')}) check: {e}")
            traceback.print_exc()

def robust_decode_local(tokenizer, token_ids_tensor):
    """Decode token IDs to text, neutralising IDs the tokenizer cannot handle.

    Negative IDs are sentinels rather than vocabulary entries (for example the
    -200 image placeholder, or the label ignore index). Each one is replaced
    by the tokenizer's pad token ID, or by 0 when the tokenizer has no pad
    token, before decoding with ``skip_special_tokens=True``.

    Args:
        tokenizer: object exposing ``pad_token_id`` and
            ``decode(ids, skip_special_tokens=...)``.
        token_ids_tensor: token IDs as a tensor (any rank), NumPy array or
            list. Multi-dimensional input is flattened in row-major order, so
            a batch is decoded as one concatenated sequence.

    Returns:
        The string returned by ``tokenizer.decode``.
    """
    # reshape(-1) handles 0-dim, 1-dim and batched input alike; the previous
    # squeeze(0) left nested lists for batches larger than one.
    flat_ids = torch.as_tensor(token_ids_tensor).reshape(-1).tolist()

    pad_id = tokenizer.pad_token_id
    replacement_id = pad_id if pad_id is not None else 0

    # Every negative ID is replaced, since none of them is a vocabulary entry
    # the tokenizer could decode.
    safe_ids = [replacement_id if token_id < 0 else token_id for token_id in flat_ids]

    return tokenizer.decode(safe_ids, skip_special_tokens=True)


# Run the sanity check when this file is executed as a script or when the cell
# is run in a notebook kernel (where `__name__` is "__main__" as well); it is
# skipped when the code is imported as a module.
if __name__ == "__main__":
    main()

Updated sys.path to include: /Users/dangminh/Desktop/SpatialRGPT
Current working directory: /Users/dangminh/Desktop/SpatialRGPT


ModuleNotFoundError: No module named 'flash_attn'

In [None]:
import random

# This cell needs a dataset object at notebook (global) scope. main() keeps its
# dataset in a local variable, so on a fresh kernel the name does not exist;
# fail with an actionable message instead of a bare NameError.
if "aicity_dataset" not in globals():
    raise NameError(
        "aicity_dataset is not defined at notebook scope; build an "
        "AICityLazySpatialDataset and assign it to `aicity_dataset` before "
        "running this cell."
    )

# random.randint includes its upper bound, so randint(0, len(ds)) can return an
# index one past the end; randrange(len(ds)) draws from 0 .. len(ds) - 1.
i = random.randrange(len(aicity_dataset))
sample_data_dict = aicity_dataset[i]
if sample_data_dict.get('depths') is not None:
    # [0] drops the leading dimension; transpose puts channels last, the layout
    # imshow expects. .float() lets half-precision tensors convert to NumPy.
    depth_to_show = sample_data_dict['depths'][0].float().cpu().numpy().transpose(1, 2, 0)
    # Show a single channel: covers both a duplicated 3-channel depth map and a
    # 1-channel one (imshow rejects arrays shaped (H, W, 1)).
    depth_to_show = depth_to_show[:, :, 0]

    rgb_to_show = sample_data_dict['image'][0].float().cpu().numpy().transpose(1, 2, 0)
    # imshow clips float RGB data to [0, 1]; min-max rescale so a normalised
    # image is displayed in full. This is for display only.
    rgb_min, rgb_max = rgb_to_show.min(), rgb_to_show.max()
    if rgb_max > rgb_min:
        rgb_to_show = (rgb_to_show - rgb_min) / (rgb_max - rgb_min)

    # One figure with two axes and a single show(): calling plt.show() between
    # the two subplots rendered them as separate figures.
    fig, (ax_rgb, ax_depth) = plt.subplots(1, 2, figsize=(10, 5))
    ax_rgb.imshow(rgb_to_show)
    ax_rgb.set_title(f"Processed RGB - Sample {i}")
    ax_depth.imshow(depth_to_show, cmap='gray')
    ax_depth.set_title(f"Processed Depth - Sample {i}")
    plt.show()