In [8]:
import os
from pdf2image import convert_from_path
import pytesseract
import pytesseract

# Set the path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust this path

# Rest of your code...
from PIL import Image
import cv2
import numpy as np
from typing import List, Dict, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PDFOCRParser:
    def __init__(self, tesseract_path: Optional[str] = None, dpi: int = 300):
        """
        Initialize the PDF OCR Parser
        
        Args:
            tesseract_path: Path to tesseract executable (optional)
            dpi: DPI for PDF to image conversion (default: 300)
        """
        self.dpi = dpi
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            
    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess the image to improve OCR accuracy
        
        Args:
            image: Input image as numpy array
            
        Returns:
            Preprocessed image as numpy array
        """
        # Convert to grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image
            
        # Apply thresholding to get black and white image
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        
        # Reduce noise
        denoised = cv2.fastNlMeansDenoising(thresh)
        
        # Deskew image if needed
        angle = self._get_skew_angle(denoised)
        if abs(angle) > 0.5:
            rotated = self._rotate_image(denoised, angle)
        else:
            rotated = denoised
            
        return rotated
    
    def _get_skew_angle(self, image: np.ndarray) -> float:
        """
        Calculate skew angle of the image
        
        Args:
            image: Input image
            
        Returns:
            Skew angle in degrees
        """
        # Find all non-zero points in the image
        coords = np.column_stack(np.where(image > 0))
        
        # Calculate angle
        angle = cv2.minAreaRect(coords)[-1]
        
        if angle < -45:
            angle = 90 + angle
            
        return -angle
    
    def _rotate_image(self, image: np.ndarray, angle: float) -> np.ndarray:
        """
        Rotate the image by given angle
        
        Args:
            image: Input image
            angle: Rotation angle in degrees
            
        Returns:
            Rotated image
        """
        # Get image dimensions
        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        
        # Perform rotation
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), 
                                flags=cv2.INTER_CUBIC,
                                borderMode=cv2.BORDER_REPLICATE)
        
        return rotated
    
    def extract_text_from_image(self, image: np.ndarray) -> str:
        """
        Extract text from a single image using Tesseract OCR
        
        Args:
            image: Input image as numpy array
            
        Returns:
            Extracted text as string
        """
        try:
            # Preprocess the image
            processed_image = self.preprocess_image(image)
            
            # Extract text using Tesseract
            custom_config = r'--oem 3 --psm 6'
            text = pytesseract.image_to_string(processed_image, config=custom_config)
            
            return text.strip()
            
        except Exception as e:
            logger.error(f"Error in OCR processing: {str(e)}")
            return ""
    
    def parse_pdf(self, pdf_path: str, output_path: Optional[str] = None) -> Dict[int, str]:
        """
        Parse PDF file and extract text using OCR
        
        Args:
            pdf_path: Path to input PDF file
            output_path: Path to save extracted text (optional)
            
        Returns:
            Dictionary with page numbers as keys and extracted text as values
        """
        try:
            logger.info(f"Processing PDF: {pdf_path}")
            
            POPPLER_PATH = r"C:\Program Files\Poppler\poppler-24.08.0\Library\bin"
            
            # Convert PDF to images
            images = convert_from_path(pdf_path, dpi=self.dpi, poppler_path=POPPLER_PATH)
            
            # Process each page
            results = {}
            for i, image in enumerate(images, start=1):
                logger.info(f"Processing page {i}/{len(images)}")
                
                # Convert PIL Image to numpy array
                image_np = np.array(image)
                
                # Extract text from the page
                text = self.extract_text_from_image(image_np)
                results[i] = text
                
                # Save intermediate results
                if output_path:
                    page_output = os.path.join(output_path, f"page_{i}.txt")
                    os.makedirs(output_path, exist_ok=True)
                    with open(page_output, 'w', encoding='utf-8') as f:
                        f.write(text)
            
            # Save complete results if output path is provided
            if output_path:
                complete_output = os.path.join(output_path, "complete.txt")
                with open(complete_output, 'w', encoding='utf-8') as f:
                    for page_num, text in sorted(results.items()):
                        f.write(f"=== Page {page_num} ===\n")
                        f.write(text)
                        f.write("\n\n")
            
            return results
            
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise

In [9]:
def main():
    # Example usage
    parser = PDFOCRParser()
    
    # Configure paths
    pdf_path = r"C:\Users\asua\DataScience Exercises\Downloads\B16_1_1.pdf"
    output_dir = r"scanned_output"
    
    try:
        # Process the PDF
        results = parser.parse_pdf(pdf_path, output_dir)
        
        # Print results
        for page_num, text in results.items():
            print(f"\nPage {page_num}:")
            print(text)
            
    except Exception as e:
        logger.error(f"Failed to process PDF: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Processing PDF: C:\Users\asua\DataScience Exercises\Downloads\B16_1_1.pdf
INFO:__main__:Processing page 1/4
INFO:__main__:Processing page 2/4
INFO:__main__:Processing page 3/4
INFO:__main__:Processing page 4/4



Page 1:
ay Coe Ska fa ae 4
Se ZAT ARE eegs
RpEES ER EE Ba &
28: BESS OLR RS
Re ee Pg sa zg eg
SoS = 3 8 2 ea
msi Por bbaoge
SS: ir P SESE x
SSiiiiii Bese e
SRP iii i BS ee a
QPpbaeriiPias te
Sipiidi:i:p:i gio os
Ri PibiiEiiEss o
A) Pi Gi:id tiie is:p Ss
PF of 3 fo: toy ft hanes pet
RA SSRSSESKBY
RSRSSGsesesss
, ggebe Pegsees SERGE RFF
cages BREar? BE Pasar
Sicbiaelg etrectar sibs tiis
» | SS ey 26 4. oP eo oe Se eS wo
jE SaEPELESER GPE CiPeebiees 1
5 , 2 2 2 4 sR yy
St 2&8 S. oe gH - a8 wee OO Fad B
-25 O58 3 @ oc Ba eB ae
coger Pecedlng idsbaegise: =
Few. fg = 2 2 Sopot es zy 2
TREE pi praiiie ee ]eeh Es :
Seiseeeibegztsg eT pS 2geb ie < 3
. fd & s : Pad
PERESEGEREGE CEE ER ERE e fas Dm 2
* ERE & eee tear ese 3s (ss An. « @ Ss
a6 (8 8 @ 2 28g & wee Be >. si >
Pigs Bay. Oe e et & f Seog eck PB ss o 8 @ Be
= SLEGEP Fee kes t Fa SESE EEL PS Fs > a C
E SBeges Pee ReSSGg bree. FERE5 é
PohEPS Ebel eggs i LE PeSa pf By Z
eSee Ss SRSRPESES 2 832 45 A
P wt oe 5 SERS Fe go dA ford Bo fa
SESTEE

In [6]:
%pip install pytesseract

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
from pdf2image import convert_from_path
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust this path
from PIL import Image
import logging
from typing import Dict, Optional
import tempfile

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PDFOCRParser:
    def __init__(self, 
                 tesseract_path: Optional[str] = None, 
                 dpi: int = 400,
                 batch_size: int = 10):
        """
        Initialize the PDF OCR Parser
        
        Args:
            tesseract_path: Path to tesseract executable (optional)
            dpi: DPI for PDF to image conversion (default: 400)
            batch_size: Number of pages to process at once (default: 10)
        """
        self.dpi = dpi
        self.batch_size = batch_size
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
        else:
            # Default Tesseract path for Windows
            pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    def extract_text_from_image(self, image: Image.Image) -> str:
        """
        Extract text from a single image using Tesseract OCR
        Optimized for digital PDFs with technical content
        """
        try:
            # Configure Tesseract for optimal text recognition
            custom_config = r'''--oem 3 --psm 6 
                -c preserve_interword_spaces=1
                -c textord_min_linesize=3
                -c textord_separate_tables=1'''
            
            text = pytesseract.image_to_string(
                image,
                config=custom_config,
                lang='eng'
            )
            return text.strip()
            
        except Exception as e:
            logger.error(f"Error in OCR processing: {str(e)}")
            return ""

    def get_pdf_info(self, pdf_path: str) -> dict:
        """Get information about the PDF file"""
        from pdf2image.pdf2image import pdfinfo_from_path
        try:
            return pdfinfo_from_path(pdf_path)
        except Exception as e:
            logger.error(f"Error getting PDF info: {str(e)}")
            raise

    def process_batch(self, 
                     pdf_path: str, 
                     start_page: int, 
                     end_page: int,
                     output_path: Optional[str] = None,) -> Dict[int, str]:
        """
        Process a batch of PDF pages
        """
        POPPLER_PATH = r"C:\Program Files\Poppler\poppler-24.08.0\Library\bin"
        
        try:
            # Create temporary directory for processing
            with tempfile.TemporaryDirectory() as temp_dir:
                # Convert PDF pages to image
                
                images = convert_from_path(
                    pdf_path,
                    poppler_path=POPPLER_PATH,
                    dpi=self.dpi,
                    first_page=start_page,
                    last_page=end_page,
                    output_folder=temp_dir,
                    grayscale=False,  # Keep color for digital PDFs
                    thread_count=4,

                )
                
                results = {}
                for i, image in enumerate(images, start=start_page):
                    logger.info(f"Processing page {i}")
                    
                    # Extract text
                    text = self.extract_text_from_image(image)
                    results[i] = text
                    
                    # Save intermediate results if requested
                    if output_path:
                        page_output = os.path.join(output_path, f"page_{i}.txt")
                        os.makedirs(output_path, exist_ok=True)
                        with open(page_output, 'w', encoding='utf-8') as f:
                            f.write(text)
                    
                    # Clean up
                    image.close()
                
                return results
                
        except Exception as e:
            logger.error(f"Error processing batch: {str(e)}")
            return {}

    def parse_pdf(self, pdf_path: str, output_path: Optional[str] = None) -> Dict[int, str]:
        """
        Parse PDF file and extract text using OCR with batch processing
        """
        try:
            logger.info(f"Processing PDF: {pdf_path}")
            
            # Get PDF info
            pdf_info = self.get_pdf_info(pdf_path)
            total_pages = pdf_info["Pages"]
            logger.info(f"Total pages: {total_pages}")
            
            # Process in batches
            all_results = {}
            for start_page in range(1, total_pages + 1, self.batch_size):
                end_page = min(start_page + self.batch_size - 1, total_pages)
                logger.info(f"Processing batch: pages {start_page} to {end_page}")
                
                batch_results = self.process_batch(
                    pdf_path, 
                    start_page, 
                    end_page,
                    output_path
                )
                all_results.update(batch_results)
            
            # Save complete results if output path is provided
            if output_path:
                complete_output = os.path.join(output_path, "complete.txt")
                with open(complete_output, 'w', encoding='utf-8') as f:
                    for page_num, text in sorted(all_results.items()):
                        f.write(f"\nPage {page_num}:\n")
                        f.write(text)
                        f.write("\n\n")
            
            return all_results
            
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise


In [None]:
def main():
    # Example usage for digital PDFs
    parser = PDFOCRParser(
        dpi=400,  # Higher DPI for better text recognition
        batch_size=5  # Smaller batch size for better memory management
    )
    
    # Configure paths
    pdf_path = r"C:\Users\asua\DataScience Exercises\Downloads\B16_1_1.pdf"
    output_dir = r"scanned_output"
    
    try:
        # Process the PDF
        results = parser.parse_pdf(pdf_path, output_dir)
        
        # Print results
        for page_num, text in results.items():
            print(f"\nPage {page_num}:")
            print(text)
            
    except Exception as e:
        logger.error(f"Failed to process PDF: {str(e)}")

if __name__ == "__main__":
    main()

In [1]:
import os
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import logging
from typing import Dict, Optional
import tempfile

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class PDFOCRParser:
    def __init__(self, 
                 tesseract_path: Optional[str] = None, 
                 poppler_path: str = r"C:\Program Files\Poppler\poppler-24.08.0\Library\bin",
                 dpi: int = 400,
                 batch_size: int = 10):
        """
        Initialize the PDF OCR Parser
        
        Args:
            tesseract_path: Path to tesseract executable (optional)
            poppler_path: Path to poppler binaries
            dpi: DPI for PDF to image conversion (default: 400)
            batch_size: Number of pages to process at once (default: 10)
        """
        self.dpi = dpi
        self.batch_size = batch_size
        self.poppler_path = poppler_path
        
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def extract_text_from_image(self, image: Image.Image) -> str:
        """
        Extract text from a single image using Tesseract OCR
        Optimized for digital PDFs with technical content
        """
        try:
            # Configure Tesseract for optimal text recognition
            custom_config = r'''--oem 3 --psm 6 
                -c preserve_interword_spaces=1
                -c textord_min_linesize=3
                -c textord_separate_tables=1'''
            
            text = pytesseract.image_to_string(
                image,
                config=custom_config,
                lang='eng'
            )
            return text.strip()
            
        except Exception as e:
            logger.error(f"Error in OCR processing: {str(e)}")
            return ""

    def get_pdf_info(self, pdf_path: str) -> dict:
        """Get information about the PDF file"""
        from pdf2image.pdf2image import pdfinfo_from_path
        try:
            return pdfinfo_from_path(pdf_path, poppler_path=self.poppler_path)
        except Exception as e:
            logger.error(f"Error getting PDF info: {str(e)}")
            raise

    def process_batch(self, 
                     pdf_path: str, 
                     start_page: int, 
                     end_page: int,
                     output_path: Optional[str] = None) -> Dict[int, str]:
        """
        Process a batch of PDF pages
        """
        try:
            # Create temporary directory for processing
            with tempfile.TemporaryDirectory() as temp_dir:
                # Convert PDF pages to images
                images = convert_from_path(
                    pdf_path,
                    poppler_path=self.poppler_path,
                    dpi=self.dpi,
                    first_page=start_page,
                    last_page=end_page,
                    output_folder=temp_dir,
                    grayscale=False,  # Keep color for digital PDFs
                    thread_count=4
                )
                
                results = {}
                for i, image in enumerate(images, start=start_page):
                    logger.info(f"Processing page {i}")
                    
                    # Extract text
                    text = self.extract_text_from_image(image)
                    results[i] = text
                    
                    # Save intermediate results if requested
                    if output_path:
                        page_output = os.path.join(output_path, f"page_{i}.txt")
                        os.makedirs(output_path, exist_ok=True)
                        with open(page_output, 'w', encoding='utf-8') as f:
                            f.write(text)
                    
                    # Clean up
                    image.close()
                
                return results
                
        except Exception as e:
            logger.error(f"Error processing batch: {str(e)}")
            return {}

    def parse_pdf(self, pdf_path: str, output_path: Optional[str] = None) -> Dict[int, str]:
        """
        Parse PDF file and extract text using OCR with batch processing
        """
        try:
            logger.info(f"Processing PDF: {pdf_path}")
            
            # Get PDF info
            pdf_info = self.get_pdf_info(pdf_path)
            total_pages = pdf_info["Pages"]
            logger.info(f"Total pages: {total_pages}")
            
            # Process in batches
            all_results = {}
            for start_page in range(1, total_pages + 1, self.batch_size):
                end_page = min(start_page + self.batch_size - 1, total_pages)
                logger.info(f"Processing batch: pages {start_page} to {end_page}")
                
                batch_results = self.process_batch(
                    pdf_path, 
                    start_page, 
                    end_page,
                    output_path
                )
                all_results.update(batch_results)
            
            # Save complete results if output path is provided
            if output_path:
                complete_output = os.path.join(output_path, "complete.txt")
                with open(complete_output, 'w', encoding='utf-8') as f:
                    for page_num, text in sorted(all_results.items()):
                        f.write(f"\nPage {page_num}:\n")
                        f.write(text)
                        f.write("\n\n")
            
            return all_results
            
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise

In [2]:
def main():
    # Example usage for digital PDFs
    parser = PDFOCRParser(
        dpi=400,  # Higher DPI for better text recognition
        batch_size=5  # Smaller batch size for better memory management
    )
    
    # Configure paths
    pdf_path = r"C:\Users\asua\DataScience Exercises\Downloads\B16_1_1.pdf"
    output_dir = r"scanned_output"
    
    try:
        # Process the PDF
        results = parser.parse_pdf(pdf_path, output_dir)
        
        # Print results
        for page_num, text in results.items():
            print(f"\nPage {page_num}:")
            print(text)
            
    except Exception as e:
        logger.error(f"Failed to process PDF: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Processing PDF: C:\Users\asua\DataScience Exercises\Downloads\B16_1_1.pdf
INFO:__main__:Total pages: 4
INFO:__main__:Processing batch: pages 1 to 4
INFO:__main__:Processing page 1
INFO:__main__:Processing page 2
INFO:__main__:Processing page 3
INFO:__main__:Processing page 4



Page 1:
IN SEARCH OF Jyotish                             |  7                           THE SPECIAL LAGNaS                                  "
|                      16.5.1.4 Othér Relations ...scucueceessncnnnetnensmnninnnnenneneneennee: 888                  a                                                                                                                                        |
|               16.5.2 Using KGrakas a8 Lagnd...essssssssessssesersssesssnnescsnssesnssresesee B92                                                    16. 1  |
     165.22 Karaka Kundali. TT ses    ,      THE VISESA LAGNAS      1
           16.6 DHANA LAGNAS....0......cccccccceeesecccenseeceresenceeseseasseresserseserseeeersreerecess 400                                                        i? «                                                   a
:             16.6.1 Indes Lag sessosssersssnevieresinesvnesnesineetnssinsesnseussiserneeinsseiseees 401                                              