In [None]:
# GPU required: True
# curl -fsSL https://ollama.com/install.sh | sh
# ollama pull deepseek-ocr

In [None]:
# ! pip install -q pymupdf ollama tqdm pillow

In [None]:
# convert pdf into high-quality images and save them with page numbers

import fitz  # PyMuPDF
import os
from PIL import Image
import io

def convert_pdf_to_images(pdf_path, output_folder="dilmah_report_images", dpi=300):
    """
    Render every page of a PDF to a PNG image and return the saved file paths.

    Pages are written into ``output_folder`` as ``page_<n>.png`` using 1-based
    page numbers.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        output_folder: Directory that receives the PNG files. It is created
            when it does not exist yet.
        dpi: Render resolution. The page is scaled by ``dpi / 72`` in both
            directions (72 is the PDF point-per-inch base). Defaults to 300.

    Returns:
        A list with the path of every saved image, in page order.
        If the PDF cannot be opened, a string of the form
        ``"Error opening PDF: <reason>"`` is returned instead (kept for
        backward compatibility), so callers should check the result type.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # 1. Create the output directory.
    # exist_ok=True avoids a crash if the folder appears between the check
    # and the creation; the check is only kept to decide whether to report it.
    folder_existed = os.path.exists(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    if not folder_existed:
        print(f"Created folder: {output_folder}")

    # 2. Open the PDF
    try:
        doc = fitz.open(pdf_path)
        print(f"Processing: {pdf_path} ({len(doc)} pages)")
    except Exception as e:
        return f"Error opening PDF: {e}"

    image_paths = []

    # try/finally guarantees the document handle is released even when
    # rendering or saving one of the pages fails.
    try:
        # The scale matrix is identical for every page, so build it once.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        # 3. Iterate through pages
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Render page to a pixmap (image) without an alpha channel
            pix = page.get_pixmap(matrix=matrix, alpha=False)

            # 4. Define filename with page number (1-based indexing)
            filename = f"page_{page_num + 1}.png"
            filepath = os.path.join(output_folder, filename)

            # 5. Save the file
            pix.save(filepath)
            image_paths.append(filepath)

            # Progress message every 5 pages
            if (page_num + 1) % 5 == 0:
                print(f"Done: {page_num + 1} pages...")
    finally:
        doc.close()

    print(f"\n‚úÖ Success! {len(image_paths)} images saved in '{output_folder}'")
    return image_paths

# --- EXECUTION ---
# Ensure you have uploaded "Annual Report - Dilmah.pdf" to your Colab files / appropriate directory
# pdf_name = "Annual Report - Dilmah.pdf"
# saved_images = convert_pdf_to_images(pdf_name)

In [None]:
# --- OCR STEP ---
# Requires the page images produced by the previous step (default folder: dilmah_report_images)
# Extracts text from each image using Ollama's DeepSeek-OCR model
# and writes one Markdown (.md) file per page image

import ollama
import os
import glob
from tqdm import tqdm

def _page_sort_key(image_path):
    """Sort key: numbered pages first (numeric order), other files after, by name."""
    stem = os.path.splitext(os.path.basename(image_path))[0]
    try:
        # Filenames like "page_12" sort by the integer after the last underscore
        return (0, int(stem.split('_')[-1]), stem)
    except ValueError:
        # No numeric suffix (e.g. "cover.png"): keep the file instead of crashing
        return (1, 0, stem)


def run_ollama_ocr(input_folder="dilmah_report_images", output_folder="ollama_results"):
    """
    Run the ``deepseek-ocr`` Ollama model on every PNG in a folder.

    Each ``<name>.png`` in ``input_folder`` is sent to ``ollama.chat`` and the
    returned message content is written to ``<name>.md`` in ``output_folder``.
    A failure on one image is reported and the remaining images are still
    processed.

    Args:
        input_folder: Directory containing the page images (``*.png``).
            Files named like ``page_1.png`` are processed in numeric order;
            PNG files without a numeric suffix are processed afterwards.
        output_folder: Directory that receives the Markdown files. It is
            created when it does not exist yet.

    Returns:
        None. Progress and errors are printed.
    """
    # 1. Get images (sorted by page number)
    image_paths = sorted(
        glob.glob(os.path.join(input_folder, "*.png")),
        key=_page_sort_key,
    )

    if not image_paths:
        print(f"‚ùå No images found in {input_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)

    print(f"üìù Processing {len(image_paths)} pages with DeepSeek-OCR...")

    # Specific prompt trigger for DeepSeek-OCR to output Markdown
    # (the grounding tag is the one the original notebook relies on).
    # It is the same for every page, so it is defined once outside the loop.
    prompt = "<|grounding|>Convert the document to markdown."

    failed_images = []

    # 2. Iterate and process
    for img_path in tqdm(image_paths):
        try:
            response = ollama.chat(
                model='deepseek-ocr',
                messages=[{
                    'role': 'user',
                    'content': prompt,
                    'images': [img_path]
                }]
            )

            # 3. Save result
            extracted_text = response['message']['content']

            base_name = os.path.basename(img_path)
            txt_filename = os.path.splitext(base_name)[0] + ".md"
            save_path = os.path.join(output_folder, txt_filename)

            with open(save_path, "w", encoding="utf-8") as f:
                f.write(extracted_text)

        except Exception as e:
            # Best effort: report the failing page and continue with the rest
            failed_images.append(img_path)
            print(f"\n‚ö†Ô∏è Error on {img_path}: {e}")

    # Make swallowed per-page errors visible in the final summary
    if failed_images:
        print(f"\n{len(failed_images)} of {len(image_paths)} pages failed: {failed_images}")

    print(f"\n‚úÖ Done! Markdown files saved in '{output_folder}'")

# --- EXECUTION ---
# Ensure your images are in the folder from the previous step
# run_ollama_ocr()