# Diário de Lisboa - OCR Processing on Google Colab

**Purpose**: Process newspaper scans with DOTS OCR using Colab's free GPU

**Setup**:
1. Runtime > Change runtime type > **GPU (T4)** > Save
2. Upload newspaper scans to Google Drive
3. Run all cells

**Note**: Free Colab has a 12-hour session limit and ~15GB of VRAM

## 1. Setup & GPU Check

In [None]:
import torch
import sys
from datetime import datetime

# Record when the run started and which interpreter / PyTorch build is in use.
print(f"Execution started: {datetime.now()}")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

# Pick the compute device: the first CUDA GPU when one is visible, CPU otherwise.
# `device` is left as a notebook-level variable.
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"\n✅ GPU Available: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; shown here in decimal gigabytes.
    print(f"   GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")
    device = torch.device('cuda')
else:
    print("\n❌ WARNING: GPU not available, OCR will be VERY slow")
    print("   Go to: Runtime > Change runtime type > GPU")
    device = torch.device('cpu')

print(f"\nDevice: {device}")

## 2. Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the paths configured below resolve.
drive.mount('/content/drive')

# Configure paths - CHANGE THESE TO MATCH YOUR DRIVE STRUCTURE
DRIVE_BASE = '/content/drive/MyDrive/diario-lisboa'
DATA_DIR = f'{DRIVE_BASE}/data'  # Where your newspaper scans are
OUTPUT_DIR = f'{DRIVE_BASE}/ocr_output'  # Where OCR results will be saved

# Echo the configuration so a wrong path is noticed before the long-running cells.
print("✅ Google Drive mounted")
print(f"   Data directory: {DATA_DIR}")
print(f"   Output directory: {OUTPUT_DIR}")

## 3. Install Dependencies

In [None]:
# Install DOTS OCR and dependencies
print("Installing dependencies (this takes ~3-5 minutes)...")

# Install PyTorch with CUDA support (usually pre-installed in Colab)
!pip install -q torch torchvision torchaudio

# Install transformers and dependencies
!pip install -q transformers>=4.45.0 accelerate>=0.21.0
!pip install -q qwen-vl-utils Pillow tqdm

# Install DOTS OCR - we'll install dependencies manually to avoid flash-attn
print("\nInstalling DOTS OCR dependencies...")
!pip install -q tikzplotlib timm einops

# Clone and install DOTS OCR without flash-attn
print("Installing DOTS OCR (without flash-attention)...")
!git clone -q https://github.com/rednote-hilab/dots.ocr.git /tmp/dots_ocr 2>&1 | grep -v "Cloning"

# Create a custom setup without flash-attn
import os
os.chdir('/tmp/dots_ocr')

# Install without dependencies first, then we control what gets installed
!pip install -q --no-deps -e .

print("\n‚úÖ All dependencies installed (flash-attention skipped for Colab compatibility)")

## 4. Download DOTS OCR Model

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
import warnings

# Hugging Face repository id of the OCR model.
MODEL_ID = "rednote-hilab/dots.ocr"

print(f"Loading DOTS OCR model: {MODEL_ID}")
print("This will download ~8-10GB on first run (cached for future runs)")
print("Please wait...\n")

# Suppress flash attention warnings
warnings.filterwarnings('ignore', message='.*flash attention.*')

# Load model (without flash_attention_2 for Colab compatibility).
# NOTE(review): attn_implementation is not passed, so the library default is
# used - confirm that it resolves to a non-flash backend on Colab.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # let accelerate decide where the weights are placed
    trust_remote_code=True,  # the repository ships its own modelling code
)

# Load processor - use Qwen2VLProcessor directly to avoid video processor issue
try:
    from transformers import Qwen2VLProcessor
    processor = Qwen2VLProcessor.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )
except Exception as exc:
    # Deliberate best-effort fallback: the class may be missing from the
    # installed transformers version, or loading it may fail. `Exception`
    # (not a bare `except`) keeps Ctrl-C / kernel interrupts working, and the
    # reason is reported instead of being silently discarded.
    print(f"Qwen2VLProcessor unavailable ({type(exc).__name__}: {exc}); "
          "falling back to AutoProcessor")
    processor = AutoProcessor.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )

print("\n✅ Model loaded successfully")
print(f"   Model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")
# Device of the first parameter; other parameters may live elsewhere under device_map="auto".
print(f"   Model device: {next(model.parameters()).device}")

## 5. Define OCR Processing Functions

In [None]:
import json
from pathlib import Path
from qwen_vl_utils import process_vision_info

def get_ocr_prompt():
    """Return the layout-extraction instruction sent to the model with each page.

    The text asks for every layout element's bounding box, category and
    untranslated Portuguese content, in reading order, as a single JSON object.
    """
    # One entry per prompt line; empty strings are the blank separator lines.
    prompt_lines = (
        "Please output the layout information from this Portuguese newspaper page image, including each layout element's bbox, its category, and the corresponding text content within the bbox.",
        "",
        "1. Bbox format: [x1, y1, x2, y2]",
        "",
        "2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].",
        "",
        "3. Text Extraction & Formatting Rules:",
        "    - Picture: For the 'Picture' category, the text field should be omitted.",
        "    - Formula: Format its text as LaTeX.",
        "    - Table: Format its text as HTML.",
        "    - All Others (Text, Title, etc.): Format their text as Markdown.",
        "",
        "4. Constraints:",
        "    - The output text must be the original Portuguese text from the image, with no translation.",
        "    - All layout elements must be sorted according to human reading order.",
        "",
        "5. Final Output: The entire output must be a single JSON object.",
    )
    return "\n".join(prompt_lines)


def process_image(image_path, model, processor, max_new_tokens=24000):
    """Run OCR on a single newspaper scan and return a result record.

    Args:
        image_path: Location of the scan (str or Path); passed to the model as
            a string.
        model: Loaded generation model; must provide ``generate`` and ``device``.
        processor: Matching processor providing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        max_new_tokens: Upper bound on the number of generated tokens. Defaults
            to 24000, the value that used to be hard-coded.

    Returns:
        dict with ``image_path``, ``timestamp`` (ISO format) and ``success``.
        On success it also has ``text`` (the decoded model output); on failure
        it has ``error`` (the exception message). Exceptions derived from
        ``Exception`` are reported in the dict rather than raised, so one bad
        page does not abort a batch.
    """
    # Imported here so the function does not depend on another cell's import.
    from datetime import datetime

    try:
        prompt = get_ocr_prompt()

        # Single-turn chat: the page image followed by the instruction text.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": str(image_path)},
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Prepare inputs
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the model's own device instead of assuming "cuda": this keeps
        # the CPU fallback working and respects device_map placement.
        inputs = inputs.to(model.device)

        # Generate OCR output
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; strip the prompt tokens so
        # only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        return {
            "image_path": str(image_path),
            "text": output_text[0],
            "timestamp": datetime.now().isoformat(),
            "success": True
        }

    except Exception as e:
        # Best-effort by design: record the failure and let the caller continue.
        return {
            "image_path": str(image_path),
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
            "success": False
        }

# Confirmation that this cell ran and the OCR helper functions are defined.
print("✅ OCR functions defined")

## 6. Find Images to Process

In [None]:
from pathlib import Path
import os

from collections import Counter

# File suffixes treated as newspaper scans (compared case-insensitively, so
# .jpg, .JPG and .Jpg are all accepted and no file is matched twice).
IMAGE_SUFFIXES = {'.jpg', '.jpeg', '.png'}
PREVIEW_LIMIT = 20  # Maximum number of directory entries to print

# Check if directory exists
data_path = Path(DATA_DIR)

if not data_path.exists():
    print(f"❌ Directory does not exist: {DATA_DIR}")
    print("\nPlease check:")
    print("1. Did you mount Google Drive in Cell 2?")
    print("2. Is the path correct? Update DRIVE_BASE in Cell 2")
    print("3. Did you upload images to Google Drive?")
    print("\nTrying to list parent directory...")
    # Listing the parent helps to spot a misspelled folder name.
    parent = data_path.parent
    if parent.exists():
        print(f"\nContents of {parent}:")
        for item in parent.iterdir():
            print(f"  {'[DIR]' if item.is_dir() else '[FILE]'} {item.name}")
else:
    print(f"✓ Directory exists: {DATA_DIR}")

    # Show what's in the directory. The listing is taken once and reused
    # instead of re-reading the directory for every count.
    print(f"\nContents of {DATA_DIR}:")
    entries = list(data_path.iterdir())
    if not entries:
        print("  (empty directory)")
    else:
        for item in entries[:PREVIEW_LIMIT]:
            print(f"  {'[DIR]' if item.is_dir() else '[FILE]'} {item.name}")
        if len(entries) > PREVIEW_LIMIT:
            print(f"  ... and {len(entries) - PREVIEW_LIMIT} more items")

    # Find all newspaper scans (supports multiple formats) in one recursive
    # walk; the suffix is tested first so is_file() is only called on candidates.
    print("\nSearching for images...")
    image_files = sorted(
        candidate
        for candidate in data_path.glob('**/*')
        if candidate.suffix.lower() in IMAGE_SUFFIXES and candidate.is_file()
    )

    formats = Counter(f.suffix.lower() for f in image_files)
    for suffix, count in sorted(formats.items()):
        print(f"  Found {count} *{suffix} files")

    print(f"\nTotal: {len(image_files)} images found")

    if not image_files:
        print("\n❌ No images found!")
        print("\nExpected structure:")
        print(f"  {DATA_DIR}/")
        print("  └── YYYY/")
        print("      └── MM/")
        print("          └── DD/")
        print("              ├── image1.jpg")
        print("              └── image2.png")
        print("\nOr flat structure:")
        print(f"  {DATA_DIR}/")
        print("  ├── image1.jpg")
        print("  └── image2.png")
    else:
        # Show format breakdown
        print(f"\n   Image formats: {dict(formats)}")

        print(f"\nFirst image: {image_files[0]}")
        print(f"Last image: {image_files[-1]}")

        # Estimate processing time
        minutes_per_image = 0.5  # Rough estimate for T4 GPU
        total_minutes = len(image_files) * minutes_per_image
        print(f"\nEstimated processing time: {total_minutes/60:.1f} hours")

        if total_minutes > 600:  # 10 hours
            print("\n⚠️  WARNING: This will take >10 hours")
            print("   Consider processing in batches or limiting the number of images")

## 7. Process Images (with Progress Tracking)

In [None]:
from tqdm.notebook import tqdm
import os
import time

# CONFIGURATION
BATCH_LIMIT = 100  # Process only first N images (change to None for all)
SAVE_EVERY = 10    # Save progress every N images


def _output_stem(image_path, data_dir):
    """Return the base name used for an image's output files.

    Images directly inside ``data_dir`` keep their plain stem. Images in
    subfolders get the folder names prefixed (``1974/04/25/page1.jpg`` ->
    ``1974_04_25_page1``), so same-named scans from different folders do not
    overwrite each other's results.
    """
    image_path = Path(image_path)
    try:
        relative = image_path.relative_to(data_dir)
    except ValueError:
        # Not located under data_dir: fall back to the bare stem.
        return image_path.stem
    return "_".join(relative.with_suffix("").parts)


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON, keeping non-ASCII text readable."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


# Limit images if specified
images_to_process = image_files[:BATCH_LIMIT] if BATCH_LIMIT else image_files

print(f"Processing {len(images_to_process)} images...")
print(f"Output will be saved to: {OUTPUT_DIR}\n")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_dir = Path(OUTPUT_DIR)
batch_path = output_dir / "batch_results.json"

# Process images
results = []
start_time = time.time()

for idx, image_path in enumerate(tqdm(images_to_process, desc="Processing")):
    # Process image (failures are returned as records, not raised)
    result = process_image(image_path, model, processor)
    results.append(result)

    # Save individual result; failed pages appear only in batch_results.json
    if result["success"]:
        image_name = _output_stem(image_path, DATA_DIR)

        # Save JSON
        _write_json(output_dir / f"{image_name}_ocr.json", result)

        # Save text only
        with open(output_dir / f"{image_name}_ocr.txt", 'w', encoding='utf-8') as f:
            f.write(result.get('text', ''))

    # Save progress periodically so an interrupted session keeps a summary
    if (idx + 1) % SAVE_EVERY == 0:
        _write_json(batch_path, results)

    # Memory management
    if (idx + 1) % 50 == 0:
        torch.cuda.empty_cache()

# Final save
_write_json(batch_path, results)

# Statistics
elapsed_time = time.time() - start_time
successful = sum(1 for r in results if r["success"])
failed = len(results) - successful

print("\n" + "="*60)
print("PROCESSING COMPLETE")
print("="*60)
print(f"Total images: {len(results)}")
print(f"Successful: {successful}")
print(f"Failed: {failed}")
print(f"Total time: {elapsed_time/60:.1f} minutes")
# Guard against division by zero when there was nothing to process.
if results:
    print(f"Average time per image: {elapsed_time/len(results):.1f} seconds")
print(f"\nResults saved to: {OUTPUT_DIR}")

## 8. View Sample Results

In [None]:
# Show the first successful OCR result as a quick sanity check of the output.
if successful <= 0:
    print("No successful results to display")
else:
    # First record in processing order whose "success" flag is set.
    sample_result = next(entry for entry in results if entry["success"])

    # One print call with a newline separator emits the same lines as
    # separate print statements would.
    print(
        "Sample OCR Result:",
        "=" * 60,
        f"Image: {sample_result['image_path']}",
        "\nExtracted Text (first 500 characters):",
        "-" * 60,
        sample_result['text'][:500],
        "-" * 60,
        sep="\n",
    )

## 9. Cleanup & Download Results

In [None]:
import gc

# Free GPU memory.
# Drop the notebook-level references first. pop() is used instead of `del` so
# that re-running this cell after the names are gone does not raise NameError.
for _name in ("model", "processor"):
    globals().pop(_name, None)

# Collect garbage BEFORE emptying the CUDA cache: empty_cache() only returns
# cached blocks that are no longer in use, so unreachable objects that still
# hold tensors must be collected first.
gc.collect()
torch.cuda.empty_cache()

print("✅ GPU memory cleared")
print(f"\n📁 All results saved to Google Drive: {OUTPUT_DIR}")
print("\nYou can now:")
print("1. Access results from Google Drive on any device")
print("2. Download the ocr_output folder to your local machine")
print("3. Run this notebook again to process more images")
print(f"\nExecution completed: {datetime.now()}")

## Optional: Zip Results for Easy Download

In [None]:
# Uncomment to create a ZIP file of all results
# !cd {DRIVE_BASE} && zip -r ocr_output.zip ocr_output/
# print(f"✅ Created: {DRIVE_BASE}/ocr_output.zip")
# print("Download this file from Google Drive")