In [1]:
pip install pytesseract pdf2image PyPDF2 Pillow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import sys
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
import re
import tkinter as tk
from tkinter import filedialog, scrolledtext, messagebox, ttk
from PIL import Image, ImageTk

In [3]:
class PDFOCRSearchApp:
    def __init__(self, root):
        self.root = root
        self.root.title("PDF OCR Text Search")
        self.root.geometry("900x700")
        self.root.configure(bg="#f0f0f0")
        
        self.pdf_path = None
        self.extracted_text = ""
        self.ocr_text = ""
        self.full_text = ""
        self.current_page = 0
        self.total_pages = 0
        self.search_results = []
        self.current_result = -1
        self.page_images = []
        
        self.setup_ui()

In [None]:
def setup_ui(self):
        # Top frame for controls
        control_frame = tk.Frame(self.root, bg="#f0f0f0")
        control_frame.pack(fill=tk.X, padx=10, pady=10)
        
        # Load PDF button
        self.load_btn = tk.Button(control_frame, text="Load PDF", command=self.load_pdf, 
                                bg="#4CAF50", fg="white", font=("Arial", 10, "bold"))
        self.load_btn.pack(side=tk.LEFT, padx=5)
        
        # OCR button
        self.ocr_btn = tk.Button(control_frame, text="Run OCR", command=self.run_ocr,
                               bg="#2196F3", fg="white", font=("Arial", 10, "bold"), state=tk.DISABLED)
        self.ocr_btn.pack(side=tk.LEFT, padx=5)
        
        # Search frame
        search_frame = tk.Frame(control_frame, bg="#f0f0f0")
        search_frame.pack(side=tk.LEFT, padx=10, fill=tk.X, expand=True)

         # Search entry
        self.search_var = tk.StringVar()
        self.search_entry = tk.Entry(search_frame, textvariable=self.search_var, font=("Arial", 10))
        self.search_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        
        # Search button
        self.search_btn = tk.Button(search_frame, text="Search", command=self.search_text,
                                 bg="#FF9800", fg="white", font=("Arial", 10, "bold"), state=tk.DISABLED)
        self.search_btn.pack(side=tk.LEFT, padx=5)
        
        # Navigation buttons frame
        nav_frame = tk.Frame(control_frame, bg="#f0f0f0")
        nav_frame.pack(side=tk.RIGHT)

        # Previous result button
        self.prev_btn = tk.Button(nav_frame, text="Prev", command=self.prev_result,
                               bg="#9E9E9E", fg="white", font=("Arial", 10, "bold"), state=tk.DISABLED)
        self.prev_btn.pack(side=tk.LEFT, padx=2)
        
        # Next result button
        self.next_btn = tk.Button(nav_frame, text="Next", command=self.next_result,
                               bg="#9E9E9E", fg="white", font=("Arial", 10, "bold"), state=tk.DISABLED)
        self.next_btn.pack(side=tk.LEFT, padx=2)
        
        # Results label
        self.result_label = tk.Label(control_frame, text="Results: 0/0", bg="#f0f0f0", font=("Arial", 10))
        self.result_label.pack(side=tk.RIGHT, padx=10)

        # Main content frame with notebook
        self.notebook = ttk.Notebook(self.root)
        self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
        
        # Tab for PDF viewer
        self.viewer_tab = tk.Frame(self.notebook, bg="white")
        self.notebook.add(self.viewer_tab, text="PDF View")
        
        # PDF viewer canvas
        self.canvas_frame = tk.Frame(self.viewer_tab)
        self.canvas_frame.pack(fill=tk.BOTH, expand=True)
        
        self.canvas = tk.Canvas(self.canvas_frame, bg="white")
        self.canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

        # Scrollbar for canvas
        self.scrollbar = tk.Scrollbar(self.canvas_frame, orient=tk.VERTICAL, command=self.canvas.yview)
        self.scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.canvas.configure(yscrollcommand=self.scrollbar.set)

        # PDF page navigation
        self.page_frame = tk.Frame(self.viewer_tab, bg="#f0f0f0")
        self.page_frame.pack(fill=tk.X, padx=5, pady=5)
        
        self.prev_page_btn = tk.Button(self.page_frame, text="Previous Page", command=self.prev_page,
                                    state=tk.DISABLED, bg="#9E9E9E", fg="white")
        self.prev_page_btn.pack(side=tk.LEFT, padx=5)
        
        self.page_label = tk.Label(self.page_frame, text="Page: 0/0", bg="#f0f0f0")
        self.page_label.pack(side=tk.LEFT, padx=10)
        
        self.next_page_btn = tk.Button(self.page_frame, text="Next Page", command=self.next_page,
                                    state=tk.DISABLED, bg="#9E9E9E", fg="white")
        self.next_page_btn.pack(side=tk.LEFT, padx=5)
        
        # Tab for text content
        self.text_tab = tk.Frame(self.notebook)
        self.notebook.add(self.text_tab, text="Text Content")

        # Text area for displaying content
        self.text_area = scrolledtext.ScrolledText(self.text_tab, wrap=tk.WORD, font=("Courier", 11))
        self.text_area.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
        
        # Status bar
        self.status_bar = tk.Label(self.root, text="Ready", bd=1, relief=tk.SUNKEN, anchor=tk.W)
        self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)

        def load_pdf(self):
            # """Load a PDF file and extract text if possible"""
            self.pdf_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
        
            if not self.pdf_path:
                return
        
            try:
                # Reset variables
                self.extracted_text = ""
                self.ocr_text = ""
                self.full_text = ""
                self.page_images = []

                # Update status
                self.status_bar.config(text=f"Loading PDF: {os.path.basename(self.pdf_path)}")
                self.root.update()

                # Load PDF with PyPDF2
                pdf = PdfReader(self.pdf_path)
                self.total_pages = len(pdf.pages)

                # Extract text from PDF
                for i, page in enumerate(pdf.pages):
                    text = page.extract_text()
                    if text.strip():
                        self.extracted_text += f"\n--- Page {i+1} ---\n{text}\n"

                # Convert PDF to images for display and potential OCR
                self.page_images = convert_from_path(self.pdf_path)

                # Display first page
                self.current_page = 0
                self.display_page(self.current_page)

                # Update page navigation
                self.page_label.config(text=f"Page: {self.current_page + 1}/{self.total_pages}")
                self.prev_page_btn.config(state=tk.DISABLED)
                self.next_page_btn.config(state=tk.NORMAL if self.total_pages > 1 else tk.DISABLED)

                # Update text area
                if self.extracted_text.strip():
                    self.full_text = self.extracted_text
                    self.text_area.delete(1.0, tk.END)
                    self.text_area.insert(tk.END, self.full_text)
                    self.status_bar.config(text=f"Text extracted successfully from {os.path.basename(self.pdf_path)}")
                    self.search_btn.config(state=tk.NORMAL)
                else:
                    self.text_area.delete(1.0, tk.END)
                    self.text_area.insert(tk.END, "No text could be extracted automatically. Try running OCR.")
                    self.status_bar.config(text="No text extracted. PDF might be scanned - try OCR.")
                
                # Enable OCR button
                self.ocr_btn.config(state=tk.NORMAL)
            
            except Exception as e:
                messagebox.showerror("Error", f"Failed to load PDF: {str(e)}")
                self.status_bar.config(text="Error loading PDF")

        def display_page(self, page_index):
        # """Display the specified page image on the canvas"""
            if not self.page_images or page_index >= len(self.page_images):
                return
            
            # Clear canvas
            self.canvas.delete("all")
        
            # Get image and resize if needed
            img = self.page_images[page_index]
            canvas_width = self.canvas.winfo_width()
        
            # Initial canvas size might be small, set a minimum
            if canvas_width < 100:
                canvas_width = 600
        
            # Calculate scale to fit width
            scale = canvas_width / img.width
            new_height = int(img.height * scale)

            # Resize image
            resized_img = img.resize((canvas_width, new_height), Image.LANCZOS)
        
            # Convert to PhotoImage
            photo_img = ImageTk.PhotoImage(resized_img)
        
            # Store reference to prevent garbage collection
            self.photo_img = photo_img
        
            # Display on canvas
            self.canvas.create_image(0, 0, anchor=tk.NW, image=photo_img)
            self.canvas.config(scrollregion=(0, 0, canvas_width, new_height))

            # Update current page
            self.current_page = page_index
            self.page_label.config(text=f"Page: {self.current_page + 1}/{self.total_pages}")
        
            # Update page buttons
            self.prev_page_btn.config(state=tk.NORMAL if page_index > 0 else tk.DISABLED)
            self.next_page_btn.config(state=tk.NORMAL if page_index < self.total_pages - 1 else tk.DISABLED)

        def prev_page(self):
        # """Navigate to previous page"""
            if self.current_page > 0:
                self.display_page(self.current_page - 1)
    
        def next_page(self):
        # """Navigate to next page"""
            if self.current_page < self.total_pages - 1:
                self.display_page(self.current_page + 1)
    
        def run_ocr(self):
        # """Perform OCR on PDF pages"""
            if not self.page_images:
                messagebox.showerror("Error", "No PDF loaded")
                return        
            
            try:
                self.status_bar.config(text="Running OCR, please wait...")
                self.root.update()
            
                # Reset OCR text
                self.ocr_text = ""

            # Process each page with OCR
            for i, img in enumerate(self.page_images):
                # Update status to show progress
                self.status_bar.config(text=f"OCR processing page {i+1}/{len(self.page_images)}...")
                self.root.update()
                
                # Perform OCR
                text = pytesseract.image_to_string(img)
                self.ocr_text += f"\n--- Page {i+1} ---\n{text}\n"

            

SyntaxError: expected 'except' or 'finally' block (3746754071.py, line 219)