In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from langchain.schema import Document
from typing import List
import os

In [2]:
class PDFImageOCRLoader:
    """
    LangChain-style loader that extracts text from a PDF by rendering each
    page to an image and running Tesseract OCR on it.

    Every page yields one ``Document`` whose ``page_content`` is the stripped
    OCR text and whose metadata holds the source path, the 1-indexed page
    number and the total page count.
    """

    def __init__(self, file_path: str, tesseract_config: str = "--oem 3 --psm 6"):
        """
        Initialize the PDF Image OCR Loader

        Args:
            file_path (str): Path to the PDF file
            tesseract_config (str): Tesseract configuration string forwarded
                to ``pytesseract.image_to_string``

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
        """
        self.file_path = file_path
        self.tesseract_config = tesseract_config

        # Fail at construction time rather than on the first load() call
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"PDF file not found: {file_path}")

    def load(self) -> List[Document]:
        """
        Load PDF and extract text from each page image using OCR

        Returns:
            List[Document]: One LangChain Document per page, in page order

        Raises:
            Exception: If opening, rendering or OCR fails; the original error
                is attached as ``__cause__``
        """
        documents = []

        try:
            # Open PDF with PyMuPDF
            pdf_document = fitz.open(self.file_path)
            try:
                # Page count and render matrix do not change between pages
                total_pages = len(pdf_document)
                mat = fitz.Matrix(2.0, 2.0)  # Scale factor for better OCR quality

                for page_num in range(total_pages):
                    page = pdf_document.load_page(page_num)

                    # Render the page to a pixmap, then hand it to PIL as PNG bytes
                    pix = page.get_pixmap(matrix=mat)
                    img_data = pix.tobytes("png")
                    image = Image.open(io.BytesIO(img_data))

                    # Perform OCR using pytesseract
                    extracted_text = pytesseract.image_to_string(
                        image,
                        config=self.tesseract_config
                    ).strip()

                    metadata = {
                        "source": self.file_path,
                        "page": page_num + 1,  # 1-indexed page numbers
                        "total_pages": total_pages
                    }

                    documents.append(
                        Document(
                            page_content=extracted_text,
                            metadata=metadata
                        )
                    )

                    print(f"Processed page {page_num + 1}/{total_pages}")
            finally:
                # Release the file handle even when rendering or OCR raises
                pdf_document.close()

        except Exception as e:
            # Same exception type and message as before; chaining keeps the root cause
            raise Exception(f"Error processing PDF: {str(e)}") from e

        return documents

In [3]:
# Usage example
def main(pdf_path: str = "/home/vedant/Desktop/glimpsify/most_info_frame_extractor/video2pdf/archives/docloader/tioppr.pdf") -> list:
    """
    Example usage of the PDF Image OCR Loader

    Args:
        pdf_path (str): Path to the PDF to OCR. The default is the original
            example file; pass another path to run the demo on a different PDF.

    Returns:
        list: The loaded documents, or an empty list if anything fails
            (the error is printed instead of raised).
    """
    try:
        # Method 1: Custom OCR Loader
        print("Method 1: Using custom OCR loader...")
        loader = PDFImageOCRLoader(pdf_path)
        documents = loader.load()

        print(f"Loaded {len(documents)} documents")

        # Display a short preview of every page
        for doc in documents:
            print(f"\n--- Page {doc.metadata['page']} ---")
            print(f"Content length: {len(doc.page_content)} characters")
            print(f"First 200 characters: {doc.page_content[:200]}...")
            print(f"Metadata: {doc.metadata}")

        return documents

    except Exception as e:
        # Best-effort demo: report the problem and return an empty result
        print(f"Error: {str(e)}")
        return []

if __name__ == "__main__":
    # Prerequisites (Python packages):
    # pip install langchain pymupdf pytesseract pillow
    #
    # pytesseract also needs the Tesseract OCR binary installed:
    # - Ubuntu/Debian: sudo apt-get install tesseract-ocr
    # - macOS: brew install tesseract
    # - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki

    # Run the demo and keep the result in a module-level name
    documents = main()

Method 1: Using custom OCR loader...
Processed page 1/5
Processed page 2/5
Processed page 3/5
Processed page 4/5
Processed page 5/5
Loaded 5 documents

--- Page 1 ---
Content length: 35 characters
First 200 characters: ' ©
Introduction e
Machine Learning...
Metadata: {'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/video2pdf/archives/docloader/tioppr.pdf', 'page': 1, 'total_pages': 5}

--- Page 2 ---
Content length: 208 characters
First 200 characters: ll aa ( had
@ FT Machin: tn nan
_ —» le
> AAAAAASO
> F 4 _- -c*ee@s
» - os -
x >
A e wh 28
- Wien aban » To
; : ~ >
= ~*
re | : - Pes
POPC eeeeee oeeeee
ee i
a i a i i i i a ~*
ywauvwe Ss i i i i we W...
Metadata: {'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/video2pdf/archives/docloader/tioppr.pdf', 'page': 2, 'total_pages': 5}

--- Page 3 ---
Content length: 173 characters
First 200 characters: Machine Learning {
What you'll learn
' * Core concepts of ML
* A bit of ML history
* Statistical tec

In [10]:
from langchain_community.document_loaders import DirectoryLoader
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API key for OpenAIEmbeddings — confirm.
load_dotenv()

True

In [2]:
# Directory containing the extracted frame images to load.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
extracted_frames_dir = r"/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames"

In [3]:
extracted_frames_dir

'/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames'

In [4]:
# Prerequisite (run once if missing): %pip install "unstructured[image]"

In [5]:
# Load every .jpg matched by the recursive glob under the frames directory.
# NOTE(review): the default loader backend presumably needs `unstructured[image]` to OCR images — confirm.
loader = DirectoryLoader(extracted_frames_dir, glob="**/*.jpg")
docs = loader.load()
# Show how many documents were loaded
len(docs)

5

In [6]:
docs

[Document(metadata={'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames/frame_5785.jpg'}, page_content=''),
 Document(metadata={'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames/frame_3382.jpg'}, page_content="Machine Learning\n\ntevened)\n\n\n\n»\n\nWhat you'll learn\n\n* Core concepts of ML\n\n* Abit of ML history\n\n+ Statistical techniques including regression, classification, clustering and more"),
 Document(metadata={'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames/frame_178.jpg'}, page_content='Introduction\n\nMachine Learning\n\ney'),
 Document(metadata={'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/33/data/tioppr_extracted_frames/frame_1691.jpg'}, page_content=''),
 Document(metadata={'source': '/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiment

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store.
# NOTE(review): presumably authenticates via OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [9]:
# from langchain_core.vectorstores import InMemoryVectorStore
#
# vector_store = InMemoryVectorStore(embeddings)

from langchain_chroma import Chroma

# Chroma collection that embeds with `embeddings` and is configured with a local persist directory
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally; remove if persistence is not needed
)

In [11]:
# ids = vector_store.add_documents(documents=docs)

In [12]:
def test_query(query: str) -> None:
    results = vector_store.similarity_search_with_score(
        query
    )

    for res, score in results:
        print("="*50)
        print(f"* [SIM={score:3f}] {res.page_content}")


In [13]:
# test_query("What will I learn?")

* [SIM=1.104641] Machine Learning

tevened)



»

What you'll learn

* Core concepts of ML

* Abit of ML history

+ Statistical techniques including regression, classification, clustering and more
* [SIM=1.466313] Introduction

Machine Learning

ey
* [SIM=1.765806] 
* [SIM=1.765806] 
