In [None]:
! pip install PyMuPDF

In [10]:
# imports
import datetime
import logging
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional

import fitz  # PyMuPDF

In [2]:
# Configure logging
# FileHandler opens its file as soon as it is constructed, so the log
# directory has to exist before basicConfig() is called.
os.makedirs('./logs', exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('./logs/pdf_extraction.log', encoding='utf-8'),
        # Console records go to stdout. The stream is chosen here because
        # basicConfig() attaches handlers to the root logger: the named logger
        # below has no handlers of its own, so patching `logger.handlers`
        # afterwards would have no effect.
        logging.StreamHandler(sys.stdout),
    ]
)
# Create a logger; its records propagate to the root handlers configured above.
logger = logging.getLogger(__name__)

In [None]:
# Testing logging
# Emits a single INFO record through the notebook's logger as a quick check
# that the logging configuration is active.
logger.info("Testing logs.")

In [5]:
def extract_text_from_pdf(pdf_path: str) -> Optional[str]:
    """
    Extract text from a single PDF file using PyMuPDF.

    Every page's text is prefixed with a ``--- Page N ---`` separator line
    (1-based page number) and the pages are concatenated in document order.
    Progress is reported both on stdout and through the module logger.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        Optional[str]: Extracted text or None if extraction fails
    """
    try:
        print(f"🚀 Starting text extraction from: {pdf_path}")
        logger.info(f"🚀 Starting text extraction from: {pdf_path}")

        # The context manager closes the document even when a page raises
        # part-way through, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            # Collect the per-page chunks and join once instead of growing a
            # string page by page.
            page_chunks = [
                f"\n--- Page {page_num + 1} ---\n{doc.load_page(page_num).get_text()}"
                for page_num in range(page_count)
            ]
        text_content = "".join(page_chunks)

        print(f"✅ Successfully extracted text from {pdf_path} ({page_count} pages)")
        logger.info(f"✅ Successfully extracted text from {pdf_path} ({page_count} pages)")
        return text_content

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None
        # so batch callers can continue with the next file.
        print(f"❌ Failed to extract text from {pdf_path}: {str(e)}")
        logger.error(f"❌ Failed to extract text from {pdf_path}: {str(e)}")
        return None

In [None]:
# Test the function with a sample PDF file
pdf_path = os.path.abspath("../src/first_batch/IG03056_V2.pdf")
extracted_text = extract_text_from_pdf(pdf_path)

# Each run writes into its own timestamped folder under ./outputs.
folder_path = os.path.abspath(f"./outputs/{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")
file_path = os.path.join(folder_path, "extracted_text.txt")
os.makedirs(folder_path, exist_ok=True)
with open(file_path, "w", encoding="utf-8") as f:
    # A placeholder is written when extraction failed so the run still leaves a file behind.
    f.write(extracted_text if extracted_text else "No text extracted.")

# Log the real destination, and do not report success when only the placeholder was written.
if extracted_text:
    logger.info(f"Text extraction completed and saved to {file_path}")
else:
    logger.warning(f"No text extracted from {pdf_path}; placeholder written to {file_path}")

2025-08-06 10:33:57,055 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf
2025-08-06 10:33:57,259 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf (101 pages)


🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf (101 pages)


2025-08-06 10:33:57,264 - INFO - Text extraction completed and saved to outputs/extracted_text.txt


In [24]:
def extract_text_from_directory(directory_path: str, output_dir: str = "extracted_texts") -> Dict[str, str]:
    """
    Extract text from all PDF files in a directory.

    Only files directly inside ``directory_path`` are considered (no
    recursion). The ``.pdf`` extension is matched case-insensitively and files
    are processed in sorted order. For every PDF that yields text, a
    ``<stem>.txt`` file is written to ``output_dir`` (UTF-8). A failure on one
    file is reported on stdout and does not stop the batch.

    Args:
        directory_path (str): Path to directory containing PDF files
        output_dir (str): Directory to save extracted text files; created,
            including missing parent directories, if it does not exist

    Returns:
        Dict[str, str]: Dictionary mapping PDF filenames to extracted text.
            Empty when ``directory_path`` is not an existing directory or
            contains no PDF files.
    """
    directory = Path(directory_path)
    output_path = Path(output_dir)

    # A missing path and a path that points at a regular file are both unusable.
    if not directory.is_dir():
        print(f"Directory does not exist: {directory_path}")
        return {}

    # Create output directory if it doesn't exist; parents=True lets nested
    # targets such as ./outputs/extracted_texts work on a fresh tree.
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Output directory created/verified: {output_path}")

    # Find all PDF files in the directory. The suffix is compared
    # case-insensitively so "*.PDF" is picked up on case-sensitive file
    # systems too, and sorting makes the processing order deterministic.
    pdf_files = sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )
    print(f"Found {len(pdf_files)} PDF files in {directory_path}")

    extracted_texts: Dict[str, str] = {}
    successful_extractions = 0

    for pdf_file in pdf_files:
        try:
            print(f"Processing: {pdf_file.name}")

            # Extract text from PDF
            extracted_text = extract_text_from_pdf(str(pdf_file))

            if not extracted_text:
                print(f"Failed to extract text from: {pdf_file.name}")
                continue

            # Store in dictionary
            extracted_texts[pdf_file.name] = extracted_text

            # Save to text file
            output_file = output_path / f"{pdf_file.stem}.txt"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(extracted_text)

            successful_extractions += 1
            print(f"Text saved to: {output_file}")

        except Exception as e:
            # Keep going: one unreadable or unwritable file must not abort the batch.
            print(f"Error processing {pdf_file.name}: {str(e)}")

    print(f"Extraction complete: {successful_extractions}/{len(pdf_files)} files processed successfully")
    return extracted_texts

In [25]:
# Run the batch extraction over the first batch of PDFs.
# Both paths are resolved against the current working directory.
input_directory = os.path.abspath("../src/first_batch")
output_directory = os.path.abspath("./outputs/extracted_texts")
# extracted_texts maps each PDF filename to its extracted text.
extracted_texts = extract_text_from_directory(input_directory, output_directory)
# Record the number of successfully extracted files in the log as well.
logger.info(f"Extraction complete: {len(extracted_texts)} files processed successfully")

2025-08-06 10:36:48,911 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf


Output directory created/verified: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts
Found 17 PDF files in e:\Stage\RAG-CSEE\src\first_batch
Processing: IG03056_V2.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf


2025-08-06 10:36:49,140 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf (101 pages)
2025-08-06 10:36:49,143 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03057_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03056_V2.pdf (101 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG03056_V2.txt
Processing: IG03057_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03057_V1.pdf


2025-08-06 10:36:49,398 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03057_V1.pdf (60 pages)
2025-08-06 10:36:49,400 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03058_V1.pdf
2025-08-06 10:36:49,572 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03058_V1.pdf (76 pages)
2025-08-06 10:36:49,573 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03096_V2.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03057_V1.pdf (60 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG03057_V1.txt
Processing: IG03058_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03058_V1.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03058_V1.pdf (76 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG03058_V1.txt
Processing: IG03096_V2.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03096_V2.pdf


2025-08-06 10:36:50,654 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03096_V2.pdf (258 pages)
2025-08-06 10:36:50,659 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03245_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03096_V2.pdf (258 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG03096_V2.txt
Processing: IG03245_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG03245_V1.pdf


2025-08-06 10:36:51,258 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03245_V1.pdf (170 pages)
2025-08-06 10:36:51,261 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33276_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG03245_V1.pdf (170 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG03245_V1.txt
Processing: IG33276_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33276_V1.pdf


2025-08-06 10:37:34,920 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33276_V1.pdf (2280 pages)
2025-08-06 10:37:34,940 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33282_V6.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33276_V1.pdf (2280 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG33276_V1.txt
Processing: IG33282_V6.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33282_V6.pdf


2025-08-06 10:37:36,046 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33282_V6.pdf (356 pages)
2025-08-06 10:37:36,050 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33305V_2_18122020.pdf
2025-08-06 10:37:36,116 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33305V_2_18122020.pdf (14 pages)
2025-08-06 10:37:36,118 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG90502_V3_20122022_fil.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33282_V6.pdf (356 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG33282_V6.txt
Processing: IG33305V_2_18122020.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG33305V_2_18122020.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG33305V_2_18122020.pdf (14 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG33305V_2_18122020.txt
Processing: IG90502_V3_20122022_fil.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG90502_V3_20122022_fil.pdf


2025-08-06 10:37:36,647 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG90502_V3_20122022_fil.pdf (70 pages)
2025-08-06 10:37:36,649 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG91844_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG90502_V3_20122022_fil.pdf (70 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG90502_V3_20122022_fil.txt
Processing: IG91844_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG91844_V1.pdf


2025-08-06 10:37:37,655 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG91844_V1.pdf (217 pages)
2025-08-06 10:37:37,660 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG93267_V2.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG91844_V1.pdf (217 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG91844_V1.txt
Processing: IG93267_V2.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG93267_V2.pdf


2025-08-06 10:37:38,139 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG93267_V2.pdf (212 pages)
2025-08-06 10:37:38,144 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG93269_V1.pdf
2025-08-06 10:37:38,330 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG93269_V1.pdf (76 pages)
2025-08-06 10:37:38,332 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IN00180_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG93267_V2.pdf (212 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG93267_V2.txt
Processing: IG93269_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IG93269_V1.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IG93269_V1.pdf (76 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IG93269_V1.txt
Processing: IN00180_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IN00180_V1.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IN00180_V1.pdf (62 pages)


2025-08-06 10:37:38,363 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IN00180_V1.pdf (62 pages)
2025-08-06 10:37:38,365 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IN03737_V1.pdf
2025-08-06 10:37:38,554 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IN03737_V1.pdf (147 pages)
2025-08-06 10:37:38,558 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\LFRP Principes TVM430 - II.SF 17843-oo4973 V01.pdf


Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IN00180_V1.txt
Processing: IN03737_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\IN03737_V1.pdf
✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\IN03737_V1.pdf (147 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\IN03737_V1.txt
Processing: LFRP Principes TVM430 - II.SF 17843-oo4973 V01.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\LFRP Principes TVM430 - II.SF 17843-oo4973 V01.pdf


2025-08-06 10:37:41,474 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\LFRP Principes TVM430 - II.SF 17843-oo4973 V01.pdf (773 pages)
2025-08-06 10:37:41,481 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\OP00580_V4.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\LFRP Principes TVM430 - II.SF 17843-oo4973 V01.pdf (773 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\LFRP Principes TVM430 - II.SF 17843-oo4973 V01.txt
Processing: OP00580_V4.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\OP00580_V4.pdf


2025-08-06 10:37:41,830 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\OP00580_V4.pdf (84 pages)
2025-08-06 10:37:41,832 - INFO - 🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\RRG00507_V1.pdf


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\OP00580_V4.pdf (84 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\OP00580_V4.txt
Processing: RRG00507_V1.pdf
🚀 Starting text extraction from: e:\Stage\RAG-CSEE\src\first_batch\RRG00507_V1.pdf


2025-08-06 10:37:42,244 - INFO - ✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\RRG00507_V1.pdf (158 pages)
2025-08-06 10:37:42,248 - INFO - Extraction complete: 17 files processed successfully


✅ Successfully extracted text from e:\Stage\RAG-CSEE\src\first_batch\RRG00507_V1.pdf (158 pages)
Text saved to: e:\Stage\RAG-CSEE\scripts\outputs\extracted_texts\RRG00507_V1.txt
Extraction complete: 17/17 files processed successfully
