In [1]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
from PIL import Image
import requests
from io import BytesIO
import os


In [2]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
from PIL import Image
import requests
from io import BytesIO
import os

class H2OTextExtractor:
    def __init__(self, model_path='h2oai/h2ovl-mississippi-800m', use_flash_attention=False):
        """
        Initialize the H2O Vision-Language model for text extraction
        """
        self.model_path = model_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        print(f"Loading model: {model_path}")
        print(f"Using device: {self.device}")
        
        # Set up the model configuration
        self.config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        
        # Configure attention implementation
        if use_flash_attention and torch.cuda.is_available():
            try:
                self.config.llm_config._attn_implementation = 'flash_attention_2'
                print("Using Flash Attention 2")
            except Exception as e:
                print(f"Flash Attention failed: {e}")
                print("Falling back to eager attention")
                try:
                    self.config.llm_config._attn_implementation = 'eager'
                except:
                    pass
        else:
            # Use eager attention or default
            try:
                if hasattr(self.config, 'llm_config'):
                    self.config.llm_config._attn_implementation = 'eager'
                print("Using eager attention")
            except Exception as e:
                print(f"Could not set attention implementation: {e}")
                print("Using default attention")
        
        # Load model with error handling
        try:
            self.model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                config=self.config,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            ).eval()
        except Exception as e:
            print(f"Error loading with config: {e}")
            print("Trying to load without custom config...")
            # Fallback: load without custom config
            self.model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            ).eval()
        
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, 
            trust_remote_code=True, 
            use_fast=False
        )
        
        # Generation configuration
        self.generation_config = dict(
            max_new_tokens=2048,
            do_sample=False,  # Set to False for deterministic text extraction
            temperature=0.0,
            top_p=1.0
        )
        
        print("Model loaded successfully!")
    
    def save_image_locally(self, image_path):
        """
        Save image locally if it's a URL, or return the path if it's already local
        """
        if image_path.startswith(('http://', 'https://')):
            try:
                response = requests.get(image_path)
                response.raise_for_status()
                
                # Create temp directory if it doesn't exist
                os.makedirs('./temp_images', exist_ok=True)
                
                # Save image with a temporary name
                temp_path = './temp_images/temp_image.jpg'
                with open(temp_path, 'wb') as f:
                    f.write(response.content)
                
                return temp_path
            except Exception as e:
                print(f"Error downloading image: {e}")
                return None
        else:
            # Check if local file exists
            if os.path.exists(image_path):
                return image_path
            else:
                print(f"Local file not found: {image_path}")
                return None
    
    def extract_text(self, image_path, prompt="Read the text in the image."):
        """
        Extract text from image using the H2O VL model
        """
        # Handle image path (download if URL, verify if local)
        local_image_path = self.save_image_locally(image_path)
        if local_image_path is None:
            return "Error: Could not load image"
        
        try:
            # Format the question with image token
            question = f'<image>\n{prompt}'
            
            # Use the model's chat method
            response, history = self.model.chat(
                self.tokenizer, 
                local_image_path, 
                question, 
                self.generation_config, 
                history=None, 
                return_history=True
            )
            
            # Clean up temporary image if it was downloaded
            if image_path.startswith(('http://', 'https://')) and os.path.exists(local_image_path):
                try:
                    os.remove(local_image_path)
                except:
                    pass
            
            return response.strip()
            
        except Exception as e:
            # Clean up temporary image on error
            if image_path.startswith(('http://', 'https://')) and local_image_path and os.path.exists(local_image_path):
                try:
                    os.remove(local_image_path)
                except:
                    pass
            return f"Error during text extraction: {e}"
    
    def extract_with_custom_prompt(self, image_path, custom_prompt):
        """
        Extract text with a custom prompt for specific needs
        """
        return self.extract_text(image_path, custom_prompt)
    
    def chat_with_image(self, image_path, question, history=None):
        """
        Have a conversation about an image (supports follow-up questions)
        """
        local_image_path = self.save_image_locally(image_path)
        if local_image_path is None:
            return "Error: Could not load image", history
        
        try:
            # Format question with image token if it's the first question
            if history is None:
                formatted_question = f'<image>\n{question}'
            else:
                formatted_question = question
            
            response, new_history = self.model.chat(
                self.tokenizer,
                local_image_path if history is None else None,  # Only pass image for first question
                formatted_question,
                self.generation_config,
                history=history,
                return_history=True
            )
            
            # Clean up temporary image if it was downloaded
            if image_path.startswith(('http://', 'https://')) and os.path.exists(local_image_path):
                try:
                    os.remove(local_image_path)
                except:
                    pass
            
            return response.strip(), new_history
            
        except Exception as e:
            if image_path.startswith(('http://', 'https://')) and local_image_path and os.path.exists(local_image_path):
                try:
                    os.remove(local_image_path)
                except:
                    pass
            return f"Error during chat: {e}", history
    
    def batch_extract(self, image_paths, prompt="Read the text in the image."):
        """
        Extract text from multiple images
        """
        results = []
        for i, image_path in enumerate(image_paths):
            print(f"Processing image {i+1}/{len(image_paths)}: {image_path}")
            result = self.extract_text(image_path, prompt)
            results.append({
                'image_path': image_path,
                'extracted_text': result
            })
        return results
    
    def text_only_chat(self, question, history=None):
        """
        Pure text conversation without images
        """
        try:
            response, new_history = self.model.chat(
                self.tokenizer,
                None,  # No image
                question,
                self.generation_config,
                history=history,
                return_history=True
            )
            return response.strip(), new_history
        except Exception as e:
            return f"Error during text chat: {e}", history

def main():
    # Initialize the extractor without Flash Attention to avoid compatibility issues
    print("Initializing H2O Text Extractor...")
    extractor = H2OTextExtractor(use_flash_attention=False)
    
    # Example 1: Basic text extraction
    print("=== Example 1: Basic Text Extraction ===")
    # You can use a URL or local file path
    test_image = "/kaggle/input/try-bad-handwritting/Image 2.jpeg"
    
    result = extractor.extract_text(test_image)
    print(f"Extracted text: {result}")
    
    # Example 2: Custom prompts for different extraction needs
    print("\n=== Example 2: Custom Prompts ===")
    custom_prompts = [
        "What text can you see in this image?",
        "Please transcribe all visible text from this document.",
        "Extract any numbers, dates, and important information from this image.",
        "Describe the layout and read all text elements in order.",
        "What is the total amount shown in this receipt?"
    ]
    
    for prompt in custom_prompts:
        print(f"\nPrompt: {prompt}")
        result = extractor.extract_with_custom_prompt(test_image, prompt)
        print(f"Result: {result}")
    
    # Example 3: Conversational interface with follow-up questions
    print("\n=== Example 3: Conversational Interface ===")
    question1 = "What type of document is this?"
    response1, history = extractor.chat_with_image(test_image, question1)
    print(f"Q: {question1}")
    print(f"A: {response1}")
    
    # Follow-up question using the same history
    question2 = "Can you tell me the total amount?"
    response2, history = extractor.chat_with_image(test_image, question2, history)
    print(f"Q: {question2}")
    print(f"A: {response2}")
    
    # Example 4: Text-only conversation
    print("\n=== Example 4: Text-Only Chat ===")
    text_question = "Hello, how are you?"
    text_response, _ = extractor.text_only_chat(text_question)
    print(f"User: {text_question}")
    print(f"Assistant: {text_response}")
    
    # Example 5: Batch processing (uncomment and provide real image paths)
    print("\n=== Example 5: Batch Processing ===")
    # image_list = [
    #     "path/to/image1.jpg",
    #     "path/to/image2.png",
    #     "https://example.com/image3.jpg"
    # ]
    # batch_results = extractor.batch_extract(image_list, "Extract all text from this image:")
    # for result in batch_results:
    #     print(f"Image: {result['image_path']}")
    #     print(f"Text: {result['extracted_text']}\n")

# Utility functions for common text extraction tasks
def create_safe_extractor():
    """Create an extractor with safe settings"""
    return H2OTextExtractor(use_flash_attention=False)

def extract_document_text(image_path):
    """Quick function to extract text from document images"""
    extractor = create_safe_extractor()
    return extractor.extract_text(
        image_path, 
        "Please transcribe all the text from this document image, maintaining the original structure and formatting as much as possible."
    )

def extract_handwritten_text(image_path):
    """Specialized function for handwritten text"""
    extractor = create_safe_extractor()
    return extractor.extract_text(
        image_path,
        "Please carefully read and transcribe any handwritten text in this image."
    )


if __name__ == "__main__":
    main()

Initializing H2O Text Extractor...
Loading model: h2oai/h2ovl-mississippi-800m
Using device: cuda


config.json: 0.00B [00:00, ?B/s]

configuration_h2ovl_chat.py: 0.00B [00:00, ?B/s]

configuration_intern_vit.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- configuration_h2ovl_chat.py
- configuration_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Using eager attention


modelling_h2ovl_chat.py: 0.00B [00:00, ?B/s]

2025-07-17 06:49:56.370711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752734996.572796      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752734996.634742      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


image_process.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- image_process.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_intern_vit.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- modeling_intern_vit.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/h2oai/h2ovl-mississippi-800m:
- modelling_h2ovl_chat.py
- image_process.py
- modeling_intern_vit.py
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


FlashAttention is not installed.


model.safetensors:   0%|          | 0.00/1.65G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Model loaded successfully!
=== Example 1: Basic Text Extraction ===




Extracted text: ATS
- Growth will be at least
device of
- Higher massings
- High Raw / FTE

=== Example 2: Custom Prompts ===

Prompt: What text can you see in this image?
Result: ATS
- Growth will be at least
- device of
- Higher Montana
- High Raw / FTE

Prompt: Please transcribe all visible text from this document.
Result: ATS
- Growth will be at least
device of
- Higher massings
- High Raw / FTE

Prompt: Extract any numbers, dates, and important information from this image.
Result: 11

Prompt: Describe the layout and read all text elements in order.
Result: ATS
- Growth will be at least
device of
- Higher massing
- High Raw / FTE

Prompt: What is the total amount shown in this receipt?
Result: $10.00

=== Example 3: Conversational Interface ===
Q: What type of document is this?
A: ATS
Q: Can you tell me the total amount?
A: I'm sorry, but I cannot provide the total amount as I am an AI language model and do not have access to the current financial information.

=== Example 4: Text-