In [1]:
!pip install torch torchvision torchaudio transformers Pillow

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import time
import os

# --- Configuration ---
# Values a reader is expected to tune; everything below reads these module-level names.

# Checkpoint identifier handed to `from_pretrained` for both the processor and the model.
MODEL_ID = "aryamanpathak/blip-vqa-abo"
# Image file opened for the benchmark; the driver refuses to run if this path does not exist.
IMAGE_PATH = "1.jpg"  # <<< IMPORTANT: Replace with your image file name
# Question text passed to the processor together with the image.
QUESTION = "what is color of car"
# Number of timed `generate` calls whose durations are averaged (the warm-up call is not counted).
NUM_INFERENCE_RUNS = 5 # Number of times to run inference for averaging time

# --- Helper Functions ---
def get_gpu_memory_usage_gb():
    """Read PyTorch's CUDA memory counters and convert them from bytes to GiB.

    Returns:
        tuple: ``(allocated, reserved, max_allocated)`` taken from
        ``torch.cuda.memory_allocated``, ``torch.cuda.memory_reserved`` and
        ``torch.cuda.max_memory_allocated``, each divided by ``1024**3``.
        When CUDA is not available the result is ``(0, 0, 0)``.
    """
    if not torch.cuda.is_available():
        return 0, 0, 0

    bytes_per_gib = 1024**3
    return (
        torch.cuda.memory_allocated() / bytes_per_gib,
        torch.cuda.memory_reserved() / bytes_per_gib,  # total held by PyTorch's allocator
        torch.cuda.max_memory_allocated() / bytes_per_gib,
    )

def reset_gpu_memory_stats():
    """Reset the CUDA peak-memory counters and release PyTorch's cached blocks.

    Does nothing when CUDA is not available. Returns ``None``.
    """
    if not torch.cuda.is_available():
        return

    # Start a fresh "peak" measurement window, then drop cached allocations.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

def print_memory_usage(stage=""):
    """Print the current and peak allocated GPU memory, labelled with ``stage``.

    Args:
        stage: Free-form label inserted into the report header.

    When CUDA is not available a single notice line is printed instead.
    """
    if not torch.cuda.is_available():
        print("GPU not available. Memory usage not tracked.")
        return

    # The reserved figure is fetched but deliberately not reported (it can be noisy).
    allocated_gb, _reserved_gb, peak_gb = get_gpu_memory_usage_gb()
    report_lines = (
        f"GPU Memory Usage ({stage}):",
        f"  - Currently Allocated: {allocated_gb:.3f} GB",
        f"  - Peak Allocated during this stage: {peak_gb:.3f} GB",
    )
    for report_line in report_lines:
        print(report_line)

def _cuda_synchronize():
    """Wait for queued CUDA work to finish; a no-op when CUDA is unavailable."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def benchmark_model(model_name, precision_dtype_str, torch_dtype, use_cache_for_generation, image_path, question, num_runs=None):
    """Load a BLIP VQA checkpoint, time repeated ``generate`` calls and print a report.

    The report contains the load time, the average wall-clock time of the timed
    ``generate`` calls, the decoded answer from the first timed call, and GPU
    memory figures after loading, after the inference loop and after cleanup.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for the processor and the model.
        precision_dtype_str: Label printed in the header (e.g. "FP32"); display only.
        torch_dtype: Value forwarded as ``torch_dtype`` to the model's ``from_pretrained``;
            ``None`` loads the model without that argument.
        use_cache_for_generation: Forwarded as ``use_cache`` to ``model.generate``.
        image_path: Path of the image to open; it is converted to RGB.
        question: Question text given to the processor alongside the image.
        num_runs: Number of timed ``generate`` calls to average over. Defaults to the
            module-level ``NUM_INFERENCE_RUNS`` when omitted.

    Returns:
        None. All results are printed. Load/input-preparation errors are reported
        with a printed message and an early return rather than an exception.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (an average over zero runs is undefined).
    """
    if num_runs is None:
        num_runs = NUM_INFERENCE_RUNS
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    print(f"\n--- Benchmarking: {model_name} ---")
    print(f"Precision: {precision_dtype_str}, KV Cache for Generation: {use_cache_for_generation}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    reset_gpu_memory_stats()

    # Pre-bind everything the `finally` clause clears so it is safe on every exit path.
    model = processor = inputs = outputs = None
    try:
        # 1. Load Model and Processor
        print("Loading model and processor...")
        time_load_start = time.perf_counter()
        try:
            processor = BlipProcessor.from_pretrained(model_name)
            load_kwargs = {} if torch_dtype is None else {"torch_dtype": torch_dtype}
            model = BlipForQuestionAnswering.from_pretrained(model_name, **load_kwargs)
            model.to(device)
            model.eval()  # Set to evaluation mode
        except Exception as e:
            print(f"Error loading model: {e}")
            return
        time_load_end = time.perf_counter()
        print(f"Model and processor loaded in {time_load_end - time_load_start:.2f} seconds.")
        print_memory_usage("After Model Load")

        # 2. Prepare Inputs
        try:
            # `convert` returns a new image, so the file handle can be closed right away.
            with Image.open(image_path) as image_file:
                raw_image = image_file.convert('RGB')
            inputs = processor(images=raw_image, text=question, return_tensors="pt").to(device)
        except FileNotFoundError:
            print(f"ERROR: Image file not found at '{image_path}'. Please check the path.")
            return
        except Exception as e:
            print(f"Error preparing inputs: {e}")
            return

        # 3. Inference
        print(f"Running inference {num_runs} times...")
        generate_kwargs = {"use_cache": use_cache_for_generation, "max_new_tokens": 20}
        total_inference_time = 0.0
        generated_answer = ""

        with torch.no_grad():
            # Warm-up run: keeps one-time setup costs out of the timed runs.
            _cuda_synchronize()
            model.generate(**inputs, **generate_kwargs)
            _cuda_synchronize()

            # Restart the peak-memory window so it covers only the timed loop.
            reset_gpu_memory_stats()

            for i in range(num_runs):
                # Synchronize on both sides so the timer brackets the GPU work,
                # not just the asynchronous kernel launches.
                _cuda_synchronize()
                inference_start_time = time.perf_counter()
                outputs = model.generate(**inputs, **generate_kwargs)
                _cuda_synchronize()
                total_inference_time += time.perf_counter() - inference_start_time

                if i == 0:  # Decode only once to see the answer
                    generated_answer = processor.decode(outputs[0], skip_special_tokens=True)

        average_inference_time = total_inference_time / num_runs
        print(f"Average inference time: {average_inference_time:.4f} seconds per run.")
        print(f"Generated Answer: {generated_answer}")
        print_memory_usage("After Inference Loop")
    finally:
        # 4. Clean up (best effort, on success and on every early exit alike):
        # drop this frame's references, then reset peak stats and empty the CUDA cache.
        model = processor = inputs = outputs = None
        reset_gpu_memory_stats()

    print("Model and data unloaded.")
    print_memory_usage("After Cleanup")
    print("--------------------------------------")


if __name__ == "__main__":
    if os.path.exists(IMAGE_PATH):
        # Run 1: FP32 with the generation cache disabled (`use_cache=False`).
        benchmark_model(
            MODEL_ID,
            "FP32",
            None,
            use_cache_for_generation=False,
            image_path=IMAGE_PATH,
            question=QUESTION,
        )

        # Run 2: FP32 with the generation cache enabled.
        benchmark_model(
            MODEL_ID,
            "FP32",
            None,
            use_cache_for_generation=True,
            image_path=IMAGE_PATH,
            question=QUESTION,
        )

        # Run 3: FP16 with the generation cache enabled; only attempted when CUDA is present.
        if not torch.cuda.is_available():
            print("\nSkipping FP16 benchmark as CUDA is not available. FP16 provides most benefits on GPU.")
        else:
            benchmark_model(
                MODEL_ID,
                "FP16",
                torch.float16,
                use_cache_for_generation=True,
                image_path=IMAGE_PATH,
                question=QUESTION,
            )

        # Closing notes, emitted one per line.
        print(
            "\n--- Notes ---",
            f"1. Image used: '{IMAGE_PATH}', Question: '{QUESTION}'",
            f"2. Inference time is averaged over {NUM_INFERENCE_RUNS} runs.",
            "3. Peak GPU memory usage is specific to the measured stage (model load or inference loop).",
            "4. If your GPU supports bfloat16, you can try `torch.bfloat16` for `torch_dtype` as another option.",
            "5. Memory numbers can fluctuate slightly. For precise memory profiling, dedicated tools might be needed.",
            sep="\n",
        )
    else:
        # Without the input image there is nothing to benchmark.
        print(
            f"ERROR: Image file '{IMAGE_PATH}' does not exist. Please create or replace it.",
            "The script will not run the benchmarks.",
            sep="\n",
        )

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



--- Benchmarking: aryamanpathak/blip-vqa-abo ---
Precision: FP32, KV Cache for Generation: False
Loading model and processor...


preprocessor_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Model and processor loaded in 14.79 seconds.
GPU Memory Usage (After Model Load):
  - Currently Allocated: 1.438 GB
  - Peak Allocated during this stage: 1.438 GB
Running inference 5 times...
Average inference time: 0.0914 seconds per run.
Generated Answer: red
GPU Memory Usage (After Inference Loop):
  - Currently Allocated: 1.448 GB
  - Peak Allocated during this stage: 1.497 GB
Model and data unloaded.
GPU Memory Usage (After Cleanup):
  - Currently Allocated: 0.008 GB
  - Peak Allocated during this stage: 0.008 GB
--------------------------------------

--- Benchmarking: aryamanpathak/blip-vqa-abo ---
Precision: FP32, KV Cache for Generation: True
Loading model and processor...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Model and processor loaded in 3.20 seconds.
GPU Memory Usage (After Model Load):
  - Currently Allocated: 1.446 GB
  - Peak Allocated during this stage: 1.446 GB
Running inference 5 times...
Average inference time: 0.1149 seconds per run.
Generated Answer: red
GPU Memory Usage (After Inference Loop):
  - Currently Allocated: 1.448 GB
  - Peak Allocated during this stage: 1.498 GB
Model and data unloaded.
GPU Memory Usage (After Cleanup):
  - Currently Allocated: 0.008 GB
  - Peak Allocated during this stage: 0.008 GB
--------------------------------------

--- Benchmarking: aryamanpathak/blip-vqa-abo ---
Precision: FP16, KV Cache for Generation: True
Loading model and processor...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Model and processor loaded in 3.59 seconds.
GPU Memory Usage (After Model Load):
  - Currently Allocated: 0.752 GB
  - Peak Allocated during this stage: 0.752 GB
Running inference 5 times...
Average inference time: 0.0640 seconds per run.
Generated Answer: red
GPU Memory Usage (After Inference Loop):
  - Currently Allocated: 0.753 GB
  - Peak Allocated during this stage: 0.777 GB
Model and data unloaded.
GPU Memory Usage (After Cleanup):
  - Currently Allocated: 0.008 GB
  - Peak Allocated during this stage: 0.008 GB
--------------------------------------

--- Notes ---
1. Image used: '1.jpg', Question: 'what is color of car'
2. Inference time is averaged over 5 runs.
3. Peak GPU memory usage is specific to the measured stage (model load or inference loop).
4. If your GPU supports bfloat16, you can try `torch.bfloat16` for `torch_dtype` as another option.
5. Memory numbers can fluctuate slightly. For precise memory profiling, dedicated tools might be needed.
