In [None]:
import argparse
import os
import re
import json
from PIL import Image
from tqdm import tqdm
import torch
from g2vlm_utils import load_model_and_tokenizer, build_transform, process_conversation

# Loaded model bundles keyed by checkpoint path. Callers such as a per-frame
# video loop invoke process_conversation many times with the same checkpoint;
# caching avoids reloading the weights on every call.
_MODEL_BUNDLE_CACHE = {}


def _get_model_bundle(model_path):
    """Return (model, tokenizer, new_token_ids, dino_transform) for ``model_path``, loading it once."""
    if model_path not in _MODEL_BUNDLE_CACHE:
        # The ViT transform returned by the loader was never used by this
        # notebook (the transform from build_transform is passed instead).
        model, tokenizer, new_token_ids, _vit_image_transform, dino_transform = (
            load_model_and_tokenizer(model_path=model_path)
        )
        total_params = sum(p.numel() for p in model.parameters()) / 1e9
        print(f'[test] total_params: {total_params}B')
        _MODEL_BUNDLE_CACHE[model_path] = (model, tokenizer, new_token_ids, dino_transform)
    return _MODEL_BUNDLE_CACHE[model_path]


def process_conversation(model_path, image_path=None, user_question=None):
    """
    Ask a vision-language model a question about a single image.

    The model bundle for ``model_path`` is loaded on first use and reused on
    later calls. The image is opened, converted to RGB, packed into a
    conversation together with the prompt, and passed to
    ``model.chat_with_recon``.

    Args:
        model_path (str): Path to the pre-trained model checkpoint.
        image_path (str | None): Path to the input image file. If None, the
            bundled example 'examples/25_0.jpg' is used.
        user_question (str | None): Prompt sent to the model verbatim. If None,
            a built-in depth-estimation question is used, followed by a
            "single word or phrase" post-prompt.

    Returns:
        The value returned by ``model.chat_with_recon`` (the model's response;
        presumably a string -- confirm against the model implementation).

    Example:
        >>> response = process_conversation(
        ...     model_path='path/to/model',
        ...     image_path='examples/25_0.jpg',
        ...     user_question='What is the depth of the object?'
        ... )
        >>> print(response)
    """
    # This function has the same name as the g2vlm_utils helper imported at the
    # top of the notebook and therefore shadows it. Import the helper under an
    # alias so the call below reaches the helper instead of recursing into this
    # function with the wrong arguments.
    # NOTE(review): the helper is assumed to take (images, text) and return
    # (images, conversation), as implied by the original call site -- confirm.
    from g2vlm_utils import process_conversation as _build_conversation

    model, tokenizer, new_token_ids, dino_transform = _get_model_bundle(model_path)
    image_transform = build_transform(pixel=768)

    if user_question is not None:
        templated_question = user_question
    else:
        question = "If the table (red point) is positioned at 2.6 meters, estimate the depth of the clothes (blue point).  Calculate or judge based on the 3D center points of these objects. The unit is meter. Submit your response as one numeric value only."
        post_prompt = "Please answer the question using a single word or phrase."
        templated_question = "\n" + question + "\n" + post_prompt

    img_path = image_path if image_path is not None else 'examples/25_0.jpg'

    # Log the prompt that is actually sent, not the unused default.
    print(templated_question)

    # convert() returns an independent, fully loaded image, so the source file
    # can be closed right away (callers may delete it immediately afterwards).
    with Image.open(img_path) as source_image:
        images = [source_image.convert('RGB')]
    images, conversation = _build_conversation(images, templated_question)

    response = model.chat_with_recon(
        tokenizer,
        new_token_ids,
        image_transform,
        dino_transform,
        images=images,
        prompt=conversation,
        max_length=100,
    )
    return response

In [None]:
import cv2
import os
import pandas as pd

def process_videos(video_folder, model_path, output_excel, frame_step=3):
    """
    Load videos, sample frames, query the model, and save navigation instructions.

    Every file in ``video_folder`` whose name ends in .mp4, .avi or .mov
    (case-insensitive) is processed in sorted filename order. For each video,
    every ``frame_step``-th frame is written to a scratch JPEG, passed to
    ``process_conversation`` with a fixed navigation prompt, and the responses
    are joined with spaces into one string. One row per video
    (columns "video" and "instructions") is written to ``output_excel``.

    A video that cannot be opened, or whose frames cannot be read or encoded,
    still produces a row; its instructions simply omit the missing frames.

    Args:
        video_folder (str): Path to folder containing videos.
        model_path (str): Path to the pre-trained model checkpoint.
        output_excel (str): Path to save the Excel file with instructions.
        frame_step (int): Sample every ``frame_step``-th frame. Defaults to 3.

    Raises:
        ValueError: If ``frame_step`` is smaller than 1.
    """
    import tempfile  # local import: only this function needs a scratch directory

    if frame_step < 1:
        raise ValueError("frame_step must be a positive integer")

    results = []

    # Prompt for navigation instructions
    prompt = (
        """You are an assistive navigation system for a visually impaired user. Analyze the provided video from the user's forward perspective. 
        Identify all the immediate, high-risk obstructions. 
        State the obstruction's location using the 12-hour clock face. 
        Process the provided video and generate a single, actionable safety alert."""
    )

    # Scratch frames live in a private temp directory so they never clutter the
    # working directory and are removed even if inference raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Sorted so the output rows are in a deterministic order.
        for video_file in sorted(os.listdir(video_folder)):
            if not video_file.lower().endswith((".mp4", ".avi", ".mov")):
                continue

            video_path = os.path.join(video_folder, video_file)
            cap = cv2.VideoCapture(video_path)
            instructions = []

            try:
                if cap.isOpened():
                    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                else:
                    print(f"Could not open video {video_path}; no frames processed.")
                    frame_count = 0

                for i in range(0, frame_count, frame_step):
                    cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                    ret, frame = cap.read()
                    if not ret:
                        continue

                    # Save frame temporarily; skip it if encoding/writing failed.
                    frame_path = os.path.join(scratch_dir, f"temp_frame_{i}.jpg")
                    if not cv2.imwrite(frame_path, frame):
                        continue

                    try:
                        # Query the model
                        response = process_conversation(
                            model_path=model_path,
                            image_path=frame_path,
                            user_question=prompt
                        )
                    finally:
                        # Remove the frame right away to keep scratch usage small.
                        os.remove(frame_path)
                    instructions.append(response)
            finally:
                # Always release the capture handle, even when inference fails.
                cap.release()

            # Store aggregated instructions for this video
            results.append({
                "video": video_file,
                "instructions": " ".join(instructions)  # combine responses
            })

    # Save all results to Excel
    df = pd.DataFrame(results)
    df.to_excel(output_excel, index=False)
    print(f"Saved navigation instructions to {output_excel}")




In [None]:

# Example usage: process every video in `video_folder` and write the collected
# model responses to `output_excel`.
# NOTE: `model_path` below is a placeholder; point it at a real checkpoint
# before running this cell.
video_folder = "../../G2VLM/videos"   # folder with your videos (relative to the working directory)
model_path = "path/to/model"          # replace with actual model checkpoint
output_excel = "navigation_instructions.xlsx"  # destination spreadsheet
process_videos(video_folder, model_path, output_excel)