In [1]:
!pip install --upgrade transformers accelerate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import os
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image
import gc

2025-07-17 06:47:07.322034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752734827.524720      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752734827.583175      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# CUDA config
import os
import torch
import gc
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()
gc.collect()

# Model ID for Qwen2-VL-7B
model_id = "Qwen/Qwen2-VL-7B"

print("Loading model and processor...")

# Load model using accelerate + device_map="auto"
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# Load processor
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True
)

print(f"Model loaded on device: {model.device}")
print(f"Processor tokenizer vocab size: {len(processor.tokenizer)}")

# Load & resize image
image_path = "/kaggle/input/qwen-7b/Image 2.jpeg"
image = Image.open(image_path).convert('RGB')
print(f"Original image size: {image.size}")

# Conservative resize
max_size = 224  # Even smaller for testing
if max(image.size) > max_size:
    ratio = max_size / max(image.size)
    new_size = tuple(int(dim * ratio) for dim in image.size)
    image = image.resize(new_size, Image.Resampling.LANCZOS)

print(f"Resized image size: {image.size}")

def extract_text_v1(image, prompt, model, processor):
    """Method 1: Using chat template (original approach)"""
    print("\n=== Method 1: Chat Template ===")
    
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]
    
    try:
        text = processor.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )
        print(f"Chat template result: '{text[:100]}...'")
        
        if not text or len(text.strip()) == 0:
            return "❌ Chat template produced empty text"
            
    except Exception as e:
        print(f"Chat template error: {e}")
        return f"❌ Chat template failed: {e}"
    
    # Process inputs
    inputs = processor(
        text=[text],
        images=[image],
        return_tensors="pt",
        padding=True
    )
    
    print(f"Input IDs shape: {inputs['input_ids'].shape}")
    print(f"Input IDs content: {inputs['input_ids']}")
    
    if inputs['input_ids'].shape[1] == 0:
        return "❌ Tokenization produced empty input_ids"
    
    # Move to device
    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
    
    # Generate
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
    
    # Decode
    input_token_len = inputs["input_ids"].shape[1]
    response_ids = generated_ids[:, input_token_len:]
    
    if response_ids.shape[1] == 0:
        return "❌ No tokens generated"
    
    decoded = processor.batch_decode(response_ids, skip_special_tokens=True)
    return decoded[0].strip() if decoded else "❌ Decoding failed"

def extract_text_v2(image, prompt, model, processor):
    """Method 2: Direct processor call without chat template"""
    print("\n=== Method 2: Direct Processing ===")
    
    try:
        # Try direct processing without chat template
        inputs = processor(
            images=image,
            text=prompt,
            return_tensors="pt",
            padding=True
        )
        
        print(f"Direct input IDs shape: {inputs['input_ids'].shape}")
        
        if inputs['input_ids'].shape[1] == 0:
            return "❌ Direct processing produced empty input_ids"
        
        # Move to device
        inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
        
        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
            )
        
        # Decode
        input_token_len = inputs["input_ids"].shape[1]
        response_ids = generated_ids[:, input_token_len:]
        
        if response_ids.shape[1] == 0:
            return "❌ No tokens generated"
        
        decoded = processor.batch_decode(response_ids, skip_special_tokens=True)
        return decoded[0].strip() if decoded else "❌ Decoding failed"
        
    except Exception as e:
        print(f"Direct processing error: {e}")
        return f"❌ Direct processing failed: {e}"

def extract_text_v3(image, prompt, model, processor):
    """Method 3: Manual tokenization approach"""
    print("\n=== Method 3: Manual Tokenization ===")
    
    try:
        # Manually tokenize text
        text_tokens = processor.tokenizer.encode(prompt, return_tensors="pt")
        print(f"Text tokens shape: {text_tokens.shape}")
        
        if text_tokens.shape[1] == 0:
            return "❌ Text tokenization failed"
        
        # Process image separately
        image_inputs = processor.image_processor(image, return_tensors="pt")
        
        # Combine inputs manually
        inputs = {
            "input_ids": text_tokens,
            "pixel_values": image_inputs["pixel_values"]
        }
        
        # Move to device
        inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
        
        print(f"Manual input IDs shape: {inputs['input_ids'].shape}")
        
        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
            )
        
        # Decode
        input_token_len = inputs["input_ids"].shape[1]
        response_ids = generated_ids[:, input_token_len:]
        
        if response_ids.shape[1] == 0:
            return "❌ No tokens generated"
        
        decoded = processor.tokenizer.decode(response_ids[0], skip_special_tokens=True)
        return decoded.strip() if decoded else "❌ Decoding failed"
        
    except Exception as e:
        print(f"Manual tokenization error: {e}")
        return f"❌ Manual tokenization failed: {e}"

def debug_processor(processor):
    """Debug processor state"""
    print("\n=== Processor Debug Info ===")
    print(f"Tokenizer type: {type(processor.tokenizer)}")
    print(f"Has chat_template: {hasattr(processor.tokenizer, 'chat_template')}")
    if hasattr(processor.tokenizer, 'chat_template'):
        print(f"Chat template: {processor.tokenizer.chat_template[:200] if processor.tokenizer.chat_template else 'None'}")
    print(f"Vocab size: {len(processor.tokenizer)}")
    print(f"Pad token: {processor.tokenizer.pad_token}")
    print(f"EOS token: {processor.tokenizer.eos_token}")
    
    # Test basic tokenization
    test_text = "What text do you see?"
    tokens = processor.tokenizer.encode(test_text)
    print(f"Test tokenization of '{test_text}': {tokens}")

def simple_test():
    """Simple test without image"""
    print("\n=== Simple Text-Only Test ===")
    test_prompt = "Hello, how are you?"
    
    try:
        inputs = processor.tokenizer(test_prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        print(f"Text-only input shape: {inputs['input_ids'].shape}")
        
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
            )
        
        response_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
        decoded = processor.tokenizer.decode(response_ids[0], skip_special_tokens=True)
        print(f"Text-only response: '{decoded}'")
        
    except Exception as e:
        print(f"Text-only test failed: {e}")

# Run all diagnostics
debug_processor(processor)
simple_test()

# Try all methods
prompt = "What text do you see in this image?"

methods = [extract_text_v1, extract_text_v2, extract_text_v3]
method_names = ["Chat Template", "Direct Processing", "Manual Tokenization"]

for method, name in zip(methods, method_names):
    print(f"\n{'='*50}")
    print(f"TESTING: {name}")
    print('='*50)
    
    try:
        result = method(image, prompt, model, processor)
        print(f"✅ {name} Result: {result}")
        
        if result and not result.startswith("❌"):
            print(f"\n🎉 SUCCESS with {name}!")
            break
            
    except Exception as e:
        print(f"❌ {name} Exception: {e}")
        import traceback
        traceback.print_exc()

# Memory cleanup
torch.cuda.empty_cache()
gc.collect()

Loading model and processor...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Model loaded on device: cpu
Processor tokenizer vocab size: 151657
Original image size: (1440, 1920)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Resized image size: (168, 224)

=== Processor Debug Info ===
Tokenizer type: <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>
Has chat_template: True
Chat template: {% if messages is string %}{{ messages }}{% else %}{% for content in messages %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}<|vision_start|><|image_pad|><|vision
Vocab size: 151657
Pad token: <|endoftext|>
EOS token: <|im_end|>
Test tokenization of 'What text do you see?': [3838, 1467, 653, 498, 1490, 30]

=== Simple Text-Only Test ===
Text-only input shape: torch.Size([1, 6])


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Text-only response: ' I'm a 15 year old girl and I'm trying to learn how to play the piano. I've been taking lessons for about 2 years now'

TESTING: Chat Template

=== Method 1: Chat Template ===
Chat template result: '...'
✅ Chat Template Result: ❌ Chat template produced empty text

TESTING: Direct Processing

=== Method 2: Direct Processing ===
Direct input IDs shape: torch.Size([1, 9])


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Direct processing error: Image features and image tokens do not match: tokens: 0, features 48
✅ Direct Processing Result: ❌ Direct processing failed: Image features and image tokens do not match: tokens: 0, features 48

TESTING: Manual Tokenization

=== Method 3: Manual Tokenization ===
Text tokens shape: torch.Size([1, 9])
Manual input IDs shape: torch.Size([1, 9])
Manual tokenization error: 'NoneType' object is not iterable
✅ Manual Tokenization Result: ❌ Manual tokenization failed: 'NoneType' object is not iterable


151