# Multi-Image Attentional Pooling Test

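This notebook smoke-tests the new multi-image attentional pooling functionality of `InternVitClassifier` with dummy (random) data to ensure that the forward pass runs without runtime errors. It requires a CUDA GPU and access to the InternVL checkpoint configured below.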

In [1]:
import os

import numpy as np
import torch

from epsclassifiers.intern_vit_classifier.intern_vit_classifier import InternVitClassifier

# Report the runtime environment up front so a missing GPU is obvious
# before any model is loaded.
cuda_available = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name()}")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA device: NVIDIA A100-SXM4-80GB


In [2]:
# Configuration
# Checkpoint location. The default is the path this notebook was originally run
# with; set the INTERN_VL_CHECKPOINT_DIR environment variable to point at a
# different checkpoint without editing the notebook.
INTERN_VL_CHECKPOINT_DIR = os.environ.get(
    "INTERN_VL_CHECKPOINT_DIR",
    "/mnt/data/intervl_weights/no_labels/internvl3_chimera_20250609_233409_1e-5_epsilon_all_0608/checkpoint-24934",
)
INTERN_VIT_OUTPUT_DIM = 3200  # 3200 for InternVL 26B model
NUM_CLASSES = 1
BATCH_SIZE = 8  # Required due to BatchNorm1d limitation
IMAGE_SIZE = 448  # Standard input size for InternVL
NUM_CHANNELS = 3

## Test 1: Single Image with Attentional Pooling (Baseline)

In [3]:
print("=== Test 1: Single Image with Attentional Pooling ===")

# Create model for single image input
model_single = InternVitClassifier(
    num_classes=NUM_CLASSES,
    intern_vl_checkpoint_dir=INTERN_VL_CHECKPOINT_DIR,
    intern_vit_output_dim=INTERN_VIT_OUTPUT_DIM,
    multi_image_input=False,
    use_attentional_pooling=True,
)

# Switch to eval mode, then move to the GPU and cast to bfloat16
model_single.eval().to('cuda').to(torch.bfloat16)

# Create dummy single image data: (batch_size, channels, height, width)
dummy_images = torch.randn(BATCH_SIZE, NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).to('cuda').to(torch.bfloat16)

# Test forward pass
with torch.no_grad():
    output = model_single(dummy_images)

print(f"Input shape: {dummy_images.shape}")
print(f"Output keys: {output.keys()}")
print(f"Output shape: {output['output'].shape}")
print(f"Embeddings shape: {output['embeddings'].shape}")
print(f"Attention weights shape: {output['attention_weights'].shape}")

# Verify the shapes instead of only printing them, so that a regression makes
# this cell fail rather than silently reporting success.
assert output['output'].shape == (BATCH_SIZE, NUM_CLASSES), \
    f"Unexpected output shape: {tuple(output['output'].shape)}"
assert output['embeddings'].shape == (BATCH_SIZE, INTERN_VIT_OUTPUT_DIM), \
    f"Unexpected embeddings shape: {tuple(output['embeddings'].shape)}"
assert output['attention_weights'] is not None, \
    "Attention weights should be returned when attentional pooling is enabled"
assert output['attention_weights'].shape[0] == BATCH_SIZE, \
    f"Unexpected attention weights shape: {tuple(output['attention_weights'].shape)}"
print("✅ Single image test passed!")

# Clear memory
del model_single, dummy_images, output
torch.cuda.empty_cache()
print("🧹 Memory cleared\n")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


=== Test 1: Single Image with Attentional Pooling ===
trainable params: 40,370,176 || all params: 7,653,191,168 || trainable%: 0.5275


Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.41it/s]


INFO: InternVitClassifier will NOT be using multi image input and will NOT be using tile splitting
INFO: InternVitClassifier will be using attentive pooling
Input shape: torch.Size([8, 3, 448, 448])
Output keys: dict_keys(['output', 'embeddings', 'last_hidden_state', 'attention_weights'])
Output shape: torch.Size([8, 1])
Embeddings shape: torch.Size([8, 3200])
Attention weights shape: torch.Size([8, 1, 1024])
✅ Single image test passed!
🧹 Memory cleared



## Test 2: Multi-Image with Fixed Number of Images

In [4]:
print("=== Test 2: Multi-Image with Fixed Number (2 images) ===")

NUM_MULTI_IMAGES = 2

# Create model for multi-image input with fixed number
model_multi_fixed = InternVitClassifier(
    num_classes=NUM_CLASSES,
    intern_vl_checkpoint_dir=INTERN_VL_CHECKPOINT_DIR,
    intern_vit_output_dim=INTERN_VIT_OUTPUT_DIM,
    multi_image_input=True,
    num_multi_images=NUM_MULTI_IMAGES,
    use_attentional_pooling=True,
)

# Switch to eval mode, then move to the GPU and cast to bfloat16
model_multi_fixed.eval().to('cuda').to(torch.bfloat16)

# Create dummy multi-image data: (batch_size, num_images, channels, height, width)
dummy_multi_images = torch.randn(BATCH_SIZE, NUM_MULTI_IMAGES, NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).to('cuda').to(torch.bfloat16)

# Test forward pass
with torch.no_grad():
    output = model_multi_fixed(dummy_multi_images)

print(f"Input shape: {dummy_multi_images.shape}")
print(f"Output keys: {output.keys()}")
print(f"Output shape: {output['output'].shape}")
print(f"Embeddings shape: {output['embeddings'].shape}")
print(f"Attention weights shape: {output['attention_weights'].shape}")

# Verify the shapes instead of only printing them, so that a regression makes
# this cell fail rather than silently reporting success.
assert output['output'].shape == (BATCH_SIZE, NUM_CLASSES), \
    f"Unexpected output shape: {tuple(output['output'].shape)}"
# With attentional pooling the embedding keeps the single-image width.
assert output['embeddings'].shape == (BATCH_SIZE, INTERN_VIT_OUTPUT_DIM), \
    f"Unexpected embeddings shape: {tuple(output['embeddings'].shape)}"
assert output['attention_weights'] is not None, \
    "Attention weights should be returned when attentional pooling is enabled"
assert output['attention_weights'].shape[0] == BATCH_SIZE, \
    f"Unexpected attention weights shape: {tuple(output['attention_weights'].shape)}"
# The attention axis spans the tokens of all images, so it must split evenly
# across the images.
assert output['attention_weights'].shape[-1] % NUM_MULTI_IMAGES == 0, \
    f"Attention length {output['attention_weights'].shape[-1]} is not a multiple of {NUM_MULTI_IMAGES} images"
print("✅ Multi-image fixed test passed!")

# Clear memory
del model_multi_fixed, dummy_multi_images, output
torch.cuda.empty_cache()
print("🧹 Memory cleared\n")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


=== Test 2: Multi-Image with Fixed Number (2 images) ===
trainable params: 40,370,176 || all params: 7,653,191,168 || trainable%: 0.5275


Loading checkpoint shards: 100%|██████████| 6/6 [00:01<00:00,  3.03it/s]


INFO: InternVitClassifier will be using multi image input of size 2
INFO: InternVitClassifier will be using attentive pooling
Input shape: torch.Size([8, 2, 3, 448, 448])
Output keys: dict_keys(['output', 'embeddings', 'last_hidden_state', 'attention_weights'])
Output shape: torch.Size([8, 1])
Embeddings shape: torch.Size([8, 3200])
Attention weights shape: torch.Size([8, 1, 2048])
✅ Multi-image fixed test passed!
🧹 Memory cleared



## Test 3: Multi-Image with Variable Number of Images

In [5]:
print("=== Test 3: Multi-Image with Variable Number ===")

# Create model for multi-image input with variable number
model_multi_var = InternVitClassifier(
    num_classes=NUM_CLASSES,
    intern_vl_checkpoint_dir=INTERN_VL_CHECKPOINT_DIR,
    intern_vit_output_dim=INTERN_VIT_OUTPUT_DIM,
    multi_image_input=True,
    num_multi_images=None,  # Variable number
    use_attentional_pooling=True,
)

# Switch to eval mode, then move to the GPU and cast to bfloat16
model_multi_var.eval().to('cuda').to(torch.bfloat16)

# Create dummy variable multi-image data as a list (one entry per sample) of
# lists of image tensors. This is a single batch of two samples:
# sample 1 has 2 images, sample 2 has 4 images.
sample_1_images = [torch.randn(NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).to('cuda').to(torch.bfloat16) for _ in range(2)]
sample_2_images = [torch.randn(NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).to('cuda').to(torch.bfloat16) for _ in range(4)]
dummy_var_images = [sample_1_images, sample_2_images]
num_samples = len(dummy_var_images)

# Test forward pass
with torch.no_grad():
    output = model_multi_var(dummy_var_images)

print(f"Input batch 1 images: {len(dummy_var_images[0])}")
print(f"Input batch 2 images: {len(dummy_var_images[1])}")
print(f"Input image shapes: {[img.shape for img in dummy_var_images[0]]}")
print(f"Output keys: {output.keys()}")
print(f"Output shape: {output['output'].shape}")
print(f"Embeddings shape: {output['embeddings'].shape}")
print(f"Attention weights shape: {output['attention_weights'].shape}")

# Verify the shapes instead of only printing them, so that a regression makes
# this cell fail rather than silently reporting success. The batch dimension
# is the number of samples, not the number of images.
assert output['output'].shape == (num_samples, NUM_CLASSES), \
    f"Unexpected output shape: {tuple(output['output'].shape)}"
assert output['embeddings'].shape == (num_samples, INTERN_VIT_OUTPUT_DIM), \
    f"Unexpected embeddings shape: {tuple(output['embeddings'].shape)}"
assert output['attention_weights'] is not None, \
    "Attention weights should be returned when attentional pooling is enabled"
assert output['attention_weights'].shape[0] == num_samples, \
    f"Unexpected attention weights shape: {tuple(output['attention_weights'].shape)}"
print("✅ Multi-image variable test passed!")

# Clear memory
del model_multi_var, sample_1_images, sample_2_images, dummy_var_images, output
torch.cuda.empty_cache()
print("🧹 Memory cleared\n")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


=== Test 3: Multi-Image with Variable Number ===
trainable params: 40,370,176 || all params: 7,653,191,168 || trainable%: 0.5275


Loading checkpoint shards: 100%|██████████| 6/6 [00:01<00:00,  3.31it/s]


INFO: InternVitClassifier will be using multi image input of size None
INFO: InternVitClassifier will be using attentive pooling
Input batch 1 images: 2
Input batch 2 images: 4
Input image shapes: [torch.Size([3, 448, 448]), torch.Size([3, 448, 448])]
Output keys: dict_keys(['output', 'embeddings', 'last_hidden_state', 'attention_weights'])
Output shape: torch.Size([2, 1])
Embeddings shape: torch.Size([2, 3200])
Attention weights shape: torch.Size([2, 1, 4096])
✅ Multi-image variable test passed!
🧹 Memory cleared



## Test 4: Multi-Image without Attentional Pooling (Comparison)

In [6]:
print("=== Test 4: Multi-Image without Attentional Pooling ===")

# Fixed number of images per sample for this test. Defined in this cell so the
# model and the dummy data cannot drift apart and the cell does not rely on
# state from an earlier cell.
NUM_MULTI_IMAGES = 2

# Create model for multi-image input without attentional pooling
model_multi_no_attn = InternVitClassifier(
    num_classes=NUM_CLASSES,
    intern_vl_checkpoint_dir=INTERN_VL_CHECKPOINT_DIR,
    intern_vit_output_dim=INTERN_VIT_OUTPUT_DIM,
    multi_image_input=True,
    num_multi_images=NUM_MULTI_IMAGES,  # Use fixed number for this test
    use_attentional_pooling=False,  # No attentional pooling
)

# Switch to eval mode, then move to the GPU and cast to bfloat16
model_multi_no_attn.eval().to('cuda').to(torch.bfloat16)

# Create dummy multi-image data for this test: (batch_size, num_images, channels, height, width)
dummy_multi_images_no_attn = torch.randn(BATCH_SIZE, NUM_MULTI_IMAGES, NUM_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).to('cuda').to(torch.bfloat16)

# Test forward pass
with torch.no_grad():
    output = model_multi_no_attn(dummy_multi_images_no_attn)

print(f"Input shape: {dummy_multi_images_no_attn.shape}")
print(f"Output keys: {output.keys()}")
print(f"Output shape: {output['output'].shape}")
print(f"Embeddings shape: {output['embeddings'].shape}")
print(f"Attention weights: {output['attention_weights']}")

# Verify the results instead of only printing them, so that a regression makes
# this cell fail rather than silently reporting success.
assert output['output'].shape == (BATCH_SIZE, NUM_CLASSES), \
    f"Unexpected output shape: {tuple(output['output'].shape)}"
# Without attentional pooling the embedding width scales with the number of
# images (2 x 3200 = 6400 in the recorded run).
assert output['embeddings'].shape == (BATCH_SIZE, NUM_MULTI_IMAGES * INTERN_VIT_OUTPUT_DIM), \
    f"Unexpected embeddings shape: {tuple(output['embeddings'].shape)}"
assert output['attention_weights'] is None, \
    "Attention weights should be None when attentional pooling is disabled"
print("✅ Multi-image without attentional pooling test passed!")

# Clear memory
del model_multi_no_attn, dummy_multi_images_no_attn, output
torch.cuda.empty_cache()
print("🧹 Memory cleared\n")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


=== Test 4: Multi-Image without Attentional Pooling ===
trainable params: 40,370,176 || all params: 7,653,191,168 || trainable%: 0.5275


Loading checkpoint shards: 100%|██████████| 6/6 [00:01<00:00,  3.64it/s]


INFO: InternVitClassifier will be using multi image input of size 2
Input shape: torch.Size([8, 2, 3, 448, 448])
Output keys: dict_keys(['output', 'embeddings', 'last_hidden_state', 'attention_weights'])
Output shape: torch.Size([8, 1])
Embeddings shape: torch.Size([8, 6400])
Attention weights: None
✅ Multi-image without attentional pooling test passed!
🧹 Memory cleared



## Test Summary

In [7]:
# Final recap: one line per scenario exercised above, then a closing banner.
summary_lines = (
    "=== Test Summary ===",
    "✅ All tests completed successfully!",
    "✅ Single image attentional pooling works",
    "✅ Multi-image with fixed number works",
    "✅ Multi-image with variable number works",
    "✅ Multi-image without attentional pooling works (comparison)",
    "\n🎉 Multi-image attentional pooling implementation is ready!",
)
for summary_line in summary_lines:
    print(summary_line)

=== Test Summary ===
✅ All tests completed successfully!
✅ Single image attentional pooling works
✅ Multi-image with fixed number works
✅ Multi-image with variable number works
✅ Multi-image without attentional pooling works (comparison)

🎉 Multi-image attentional pooling implementation is ready!
