In [1]:
import fitz  # PyMuPDF
import os
from PIL import Image
import pytesseract
import io
import json
import re

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        full_text.append(text)
    return full_text

def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for img in page.get_images(full=True):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            images.append(image_bytes)
    return images

def ocr_images(images):
    extracted_texts = []
    for image_bytes in images:
        image = Image.open(io.BytesIO(image_bytes))
        text = pytesseract.image_to_string(image)
        extracted_texts.append(text)
    return extracted_texts

def clean_text(text):
    # Basic text cleaning
    text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with a single newline
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()
    return text

def text_to_json(texts):
    data = {
        "pages": []
    }
    for i, text in enumerate(texts):
        cleaned_text = clean_text(text)
        page_data = {
            "page_number": i + 1,
            "content": cleaned_text
        }
        data["pages"].append(page_data)
    return data

def save_json(data, json_path):
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def process_pdf_directory(pdf_directory, json_output_directory):
    if not os.path.exists(json_output_directory):
        os.makedirs(json_output_directory)

    file_count = 0

    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(json_output_directory, json_filename)

            # Extract text and images
            texts = extract_text_from_pdf(pdf_path)
            images = extract_images_from_pdf(pdf_path)

            # Perform OCR on images if necessary
            if images:
                ocr_texts = ocr_images(images)
                texts.extend(ocr_texts)

            # Convert to JSON format
            data = text_to_json(texts)
            save_json(data, json_path)

            file_count += 1
            print(f"Processed file: {filename}")

    print(f"Total files processed: {file_count}")

# Update the Tesseract executable path if necessary
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Example usage
pdf_directory = r"C:\Users\EstifanosT\AwashRagSystem\SourceDocuments"
json_output_directory = r"C:\Users\EstifanosT\AwashRagSystem\ConvertedToJson"

process_pdf_directory(pdf_directory, json_output_directory)



Processed file: Asset Classification and Provisioning.pdf
Processed file: Awash Bank Access Control Policy.pdf
Processed file: Bank Corporate Governance.pdf
Processed file: Credit Procedure.pdf
Processed file: Data Center Access and Operations Procedure Manual.pdf
Processed file: directive-for-the-establishment-of-and-operation-of-credit-reference-bureau-no-crb_02_2019.pdf
Processed file: FCP-01-2020.pdf
Processed file: FXD012024-FOREIGN-EXCHANGE-1-1.pdf
Processed file: Information Classification Policy.pdf
Processed file: Information Systems Asset Management Policy.pdf
Processed file: Interest Free Banking Operations Procedure Manual (Revised 2022).pdf
Processed file: Limits on Birr and Foreign Currency in the Territory of Ethiopia.pdf
Processed file: ONPS-04-2021.pdf
Processed file: Requirnment for person with Significant Influence.pdf
Processed file: sbb-52-12.pdf
Processed file: sbb-53-12.pdf
Processed file: SBB-64-2016.pdf
Processed file: SBB-83-2022.pdf
Processed file: SBB_87_202