In [None]:
# Step 1: Install dependencies
!pip install surya-ocr transformers datasets accelerate -q

# Check GPU
import torch
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NOT FOUND'}")
print(f"CUDA: {torch.version.cuda}")

In [None]:
# Step 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set your data path (update this to your actual path)
DATA_PATH = "/content/drive/MyDrive/OCR_Bangla/training/data"

import os
print(f"\nChecking data path: {DATA_PATH}")

# Images are expected in the 'raw' sub-folder. Check it separately so a
# partially uploaded data folder gives a readable message instead of a
# FileNotFoundError from os.listdir.
raw_dir = os.path.join(DATA_PATH, 'raw')
if not os.path.exists(DATA_PATH):
    print("✗ Data path NOT found!")
    print("  Please upload your training/data folder to Google Drive")
elif not os.path.isdir(raw_dir):
    print("✗ Data path found, but the 'raw' image folder is missing!")
    print(f"  Expected images in: {raw_dir}")
else:
    print("✓ Data path found!")
    print(f"  Images: {len(os.listdir(raw_dir))} files")

In [None]:
# Step 3: Load training data
import json
from pathlib import Path
from PIL import Image
from datasets import Dataset, DatasetDict

def load_nid_dataset(data_path):
    """Load NID OCR training data from a folder and split it 90/10.

    Expected layout under ``data_path``::

        raw/                     image files
        annotations/labels.txt   one "<image name><TAB><text>" entry per line

    Lines without a tab are ignored. If an image name occurs more than once,
    the last entry wins. Entries whose image is missing from ``raw/`` or whose
    text is blank are dropped.

    Args:
        data_path: Root folder of the training data (str or Path).

    Returns:
        dict with keys "train" and "val", each a ``Dataset`` whose rows have
        "image_path" (str) and "text" (str).

    Raises:
        FileNotFoundError: if ``annotations/labels.txt`` does not exist.
    """
    import random

    data_path = Path(data_path)
    images_dir = data_path / "raw"
    labels_file = data_path / "annotations" / "labels.txt"

    # Load labels.
    # 'utf-8-sig' reads plain UTF-8 unchanged but also strips a leading BOM.
    # With plain 'utf-8' a BOM stays glued to the first image name, the file
    # lookup below fails, and that sample is silently lost.
    labels = {}
    with open(labels_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            name, tab, text = line.strip().partition('\t')
            if tab:
                labels[name] = text

    # Keep only entries that have both an image on disk and non-blank text.
    samples = []
    for img_name, text in labels.items():
        img_path = images_dir / img_name
        if img_path.exists() and text.strip():
            samples.append({
                "image_path": str(img_path),
                "text": text.strip(),
            })

    print(f"Loaded {len(samples)} samples")

    # Split 90/10.
    # A private Random(42) yields the same permutation as seeding the global
    # generator with 42, without resetting random state for the rest of the
    # notebook.
    random.Random(42).shuffle(samples)
    split_idx = int(len(samples) * 0.9)

    train_data = samples[:split_idx]
    val_data = samples[split_idx:]

    print(f"Train: {len(train_data)}, Val: {len(val_data)}")

    return {
        "train": Dataset.from_list(train_data),
        "val": Dataset.from_list(val_data),
    }

# Build the train/val splits from the Drive folder
dataset = load_nid_dataset(DATA_PATH)

# Preview the first (up to) five training labels, truncated to 50 characters
print("\nSample data:")
train_split = dataset["train"]
for idx in range(min(5, len(train_split))):
    label = train_split[idx]['text']
    print(f"  {label[:50]}...")

In [None]:
# Step 4: Initialize Surya model
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor

# Instantiate the shared foundation model first, then wrap it in the
# recognition predictor used by the later cells.
print("Loading Surya OCR model...")
foundation = FoundationPredictor()
rec_predictor = RecognitionPredictor(foundation)

# Status glyph repaired: it was mis-encoded and printed as garbage characters.
print("✓ Model loaded!")

In [None]:
# Step 5: Test model BEFORE fine-tuning
from PIL import Image

# Test on first few training images
# NOTE(review): Surya releases that ship FoundationPredictor document the call
# as `rec_predictor(images, det_predictor=...)`; there is no language-list
# argument, and line boxes must come from a detector (or explicit bboxes).
# Confirm against the installed version.
from surya.detection import DetectionPredictor

det_predictor = DetectionPredictor()

print("Testing model BEFORE fine-tuning:")
print("="*60)

for i, sample in enumerate(dataset["train"]):
    if i >= 5:
        break

    # The context manager releases the file handle once prediction is done.
    with Image.open(sample["image_path"]) as img:
        results = rec_predictor([img], det_predictor=det_predictor)

    # One result per input image; join its recognised lines into one string.
    pred_text = " ".join([line.text for line in results[0].text_lines]) if results else ""

    print(f"\n[{i+1}] Ground truth: {sample['text']}")
    print(f"    Prediction:   {pred_text}")
    # Exact-match check after trimming surrounding whitespace.
    match = "✓" if pred_text.strip() == sample['text'].strip() else "✗"
    print(f"    Match: {match}")

In [None]:
# Step 6: Run Fine-tuning using official Surya script
import subprocess
import sys

# Save dataset in HuggingFace format
OUTPUT_DIR = "/content/surya_bangla_finetuned"
DATASET_DIR = "/content/bangla_nid_dataset"

# Create local dataset copy
import os
import shutil

os.makedirs(f"{DATASET_DIR}/images", exist_ok=True)

# Copy the training images next to the metadata file and record one row per
# image. The ImageFolder loader links rows to files through a `file_name`
# column (path relative to metadata.jsonl); without that column the folder
# cannot be loaded as an image dataset after upload.
train_data = []
for sample in dataset["train"]:
    src = sample["image_path"]
    rel_path = f"images/{os.path.basename(src)}"
    shutil.copy(src, f"{DATASET_DIR}/{rel_path}")
    train_data.append({
        "file_name": rel_path,
        "text": sample["text"]
    })

# Save metadata (one JSON object per line; keep Bangla text unescaped)
with open(f"{DATASET_DIR}/metadata.jsonl", "w", encoding="utf-8") as f:
    for item in train_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Dataset prepared at {DATASET_DIR}")
print(f"Total samples: {len(train_data)}")

In [None]:
# Step 7: Find and run official Surya fine-tune script
import surya
from pathlib import Path

surya_path = Path(surya.__file__).parent
finetune_script = surya_path / "scripts" / "finetune_ocr.py"

print(f"Surya path: {surya_path}")
print(f"Finetune script exists: {finetune_script.exists()}")

if finetune_script.exists():
    # Read the script to understand its arguments
    !head -100 "{finetune_script}"

In [None]:
# Step 8: Alternative - Custom training loop
# Use this if official script doesn't work with local dataset

from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# Custom training function
def train_surya_bangla(dataset, rec_predictor, num_epochs=10, batch_size=8, lr=5e-5):
    """
    Skeleton of a fine-tuning loop for Surya OCR.

    This is a simplified demonstration: it walks the training split in
    batches and opens each image, but performs no forward pass, loss
    computation, backward pass or optimizer step, so the model weights are
    not changed. Full fine-tuning requires access to the internal model
    structure.

    Args:
        dataset: Mapping with a "train" entry whose rows provide
            "image_path" and "text".
        rec_predictor: Predictor object; its ``model`` attribute is used.
            (Assumes the network is exposed as ``.model`` - TODO confirm
            against the installed Surya version.)
        num_epochs: Number of passes over the training split.
        batch_size: Number of samples per batch.
        lr: Learning rate handed to AdamW.

    Returns:
        The same model object, in the train/eval mode it had on entry.
    """
    from contextlib import ExitStack

    print(f"\nStarting training...")
    print(f"  Epochs: {num_epochs}")
    print(f"  Batch size: {batch_size}")
    print(f"  Learning rate: {lr}")

    # Get the underlying model and remember its mode, so inference after this
    # call is not left in train mode.
    model = rec_predictor.model
    was_training = getattr(model, "training", False)
    model.train()

    # Setup optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # The sample list does not change between epochs; build it once.
    samples = list(dataset["train"])

    try:
        for epoch in range(num_epochs):
            for i in tqdm(range(0, len(samples), batch_size), desc=f"Epoch {epoch+1}"):
                batch_samples = samples[i:i+batch_size]

                # ExitStack closes every image of the batch when the batch is
                # done, even if a later step raises.
                with ExitStack() as stack:
                    images = [
                        stack.enter_context(Image.open(s["image_path"]))
                        for s in batch_samples
                    ]
                    texts = [s["text"] for s in batch_samples]

                    # Forward pass (simplified - actual implementation varies)
                    # This is a placeholder - real fine-tuning needs model internals
                    optimizer.zero_grad()

                    # TODO: tokenize `texts`, run the model on `images` with
                    # labels, then call loss.backward() and optimizer.step().

            print(f"Epoch {epoch+1}/{num_epochs} complete")
    finally:
        model.train(was_training)

    return model

# Point readers at the supported training route, since the loop above is only
# a placeholder. Warning glyph repaired: it was mis-encoded and printed as
# garbage characters.
print("\n⚠️  For full Surya fine-tuning, use the official script:")
print("python -m surya.scripts.finetune_ocr --help")
print("\nOr contact hi@datalab.to for their internal training stack.")

In [None]:
# Step 9: Upload dataset to HuggingFace Hub for official fine-tuning
from huggingface_hub import notebook_login, HfApi

# Authenticate this session with the HuggingFace Hub
notebook_login()

# Target dataset repository (change 'your-username' to your actual username)
DATASET_REPO = "your-username/bangla-nid-ocr"  # UPDATE THIS!

# Hub client referenced by the upload call printed below
api = HfApi()

# The upload is not executed here; the exact call is printed for the user to run
upload_call = f"api.upload_folder(folder_path='{DATASET_DIR}', repo_id='{DATASET_REPO}', repo_type='dataset')"
print(
    "\nTo upload your dataset to HuggingFace:",
    "1. Update DATASET_REPO with your username",
    f"2. Run: {upload_call}",
    sep="\n",
)

In [None]:
# Step 10: Run official Surya fine-tuning (after uploading dataset)

# The command below is left commented out. UNCOMMENT and UPDATE it after
# uploading the dataset to HuggingFace:
#   - `--dataset_name` is hard-coded; replace it with the DATASET_REPO value
#     set in Step 9.
#   - `--output_dir` is hard-coded to the same path as OUTPUT_DIR from Step 6;
#     the commented snippets in Steps 11 and 12 read from that path, so keep
#     them in sync if you change it.

# !python -m surya.scripts.finetune_ocr \
#     --output_dir /content/surya_bangla_finetuned \
#     --dataset_name your-username/bangla-nid-ocr \
#     --per_device_train_batch_size 8 \
#     --gradient_checkpointing true \
#     --max_sequence_length 512 \
#     --num_train_epochs 50 \
#     --learning_rate 5e-5 \
#     --save_steps 500 \
#     --logging_steps 50

# Reminder shown when the cell runs with the command still commented out.
print("Update DATASET_REPO and uncomment the command above to start training!")

In [None]:
# Step 11: Test AFTER fine-tuning
# Run this after training completes

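# Load fine-tuned model
# NOTE(review): confirm that the installed Surya version's RecognitionPredictor
# accepts a `model_path` argument before relying on the snippet below.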
# rec_predictor_finetuned = RecognitionPredictor(
#     foundation,
#     model_path="/content/surya_bangla_finetuned/final"
# )

# Test on same images
# print("Testing model AFTER fine-tuning:")
# ... same testing code as Step 5 ...

In [None]:
# Step 12: Download fine-tuned model
# Run after training to save model to Google Drive

# !cp -r /content/surya_bangla_finetuned "/content/drive/MyDrive/OCR_Bangla/models/"
# print("Model saved to Google Drive!")

## 📋 Summary

### What this notebook does:
1. Loads your NID training data (images + Bangla labels)
2. Tests Surya OCR **before** fine-tuning
3. Prepares dataset in HuggingFace format
4. Runs fine-tuning with official Surya script
5. Tests Surya OCR **after** fine-tuning
6. Saves fine-tuned model to Google Drive

### To use the fine-tuned model locally:
```python
from surya.recognition import RecognitionPredictor
from surya.foundation import FoundationPredictor

foundation = FoundationPredictor()
rec = RecognitionPredictor(
    foundation,
    model_path="models/surya_bangla_finetuned"
)

results = rec([image], ["bn", "en"])
```

### Tips:
- More training data = better results
- Aim for 500+ labeled samples
- Include variety in NID conditions
- Training takes ~2 hours on T4 GPU