In [2]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os
import json

def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF, falling back to OCR.

    For each page the embedded text layer is read first. When that layer is
    empty or whitespace-only, the page is rendered to a PNG image and handed
    to Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one string per page, in page order.
    """
    doc = fitz.open(pdf_path)
    text_by_page = []

    # Close the document even if rendering or OCR raises, so the file handle
    # is not leaked while a whole folder of PDFs is being processed.
    try:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            page_text = page.get_text("text")

            if not page_text.strip():
                # No usable text layer on this page: rasterize it and run OCR.
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                page_text = pytesseract.image_to_string(img)

            text_by_page.append(page_text)
    finally:
        doc.close()

    return text_by_page

def structure_pdf_content(document_id, title, text_by_page):
    """
    Build the dictionary describing one document, with one section per page.

    Args:
        document_id: Identifier stored under "document_id".
        title: Title stored under "title".
        text_by_page: Iterable of page texts, in page order.

    Returns:
        A dict with "document_id", "title" and "sections". Each section holds
        a 1-based "section_id" and "page_number" (both equal to the page's
        position), a "section_title" of the form "Section N", and the page
        text with surrounding whitespace removed under "content".
    """
    def build_section(number, raw_text):
        # Section id and page number are the same 1-based position.
        return {
            "section_id": number,
            "section_title": f"Section {number}",
            "page_number": number,
            "content": raw_text.strip(),
        }

    return {
        "document_id": document_id,
        "title": title,
        "sections": [
            build_section(number, raw_text)
            for number, raw_text in enumerate(text_by_page, start=1)
        ],
    }

def save_as_json(document_structure, output_folder, filename):
    """
    Save the structured document as a UTF-8 JSON file.

    The output folder (including missing parents) is created when needed.
    Non-ASCII characters are written as-is rather than as \\uXXXX escapes.

    Args:
        document_structure: JSON-serializable object to write.
        output_folder: Destination folder; an empty string means the current
            working directory.
        filename: Name of the JSON file inside ``output_folder``.
    """
    # An empty folder means "current directory"; os.makedirs("") would raise.
    if output_folder:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

    output_path = os.path.join(output_folder, filename)
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(document_structure, json_file, indent=4, ensure_ascii=False)

    print(f"Saved JSON file: {output_path}")

def process_pdfs(pdf_folder, output_folder):
    """
    Convert every PDF directly inside ``pdf_folder`` to a structured JSON file.

    Each PDF's text is extracted (with OCR fallback), wrapped in the document
    structure, and saved as ``<pdf name without extension>.json`` in
    ``output_folder``. Subfolders are not descended into.

    Args:
        pdf_folder: Folder whose PDF files are processed.
        output_folder: Folder that receives the JSON files.
    """
    # sorted() gives a reproducible processing order across runs and platforms.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        # A directory whose name happens to end in ".pdf" is not a document.
        if not os.path.isfile(pdf_path):
            continue

        # Extract text from PDF (with OCR fallback)
        text_by_page = extract_text_from_pdf(pdf_path)

        # Create structured content (this is customizable based on document content)
        document_id = os.path.splitext(pdf_file)[0]
        title = f"{document_id} Directive"  # Example title, customize as needed
        structured_content = structure_pdf_content(document_id, title, text_by_page)

        # Save the JSON file
        json_filename = f"{document_id}.json"
        save_as_json(structured_content, output_folder, json_filename)

# Example usage
# NOTE: these are top-level statements, so the whole folder conversion runs as
# soon as this cell/module is executed.
# The paths are raw strings so the backslashes in the Windows paths stay literal.
pdf_folder = r"D:\NBEDirectivesAssist\National Bank of Ethiopia Directives"  # Folder containing your PDF files
output_folder = r"D:\NBEDirectivesAssist\ConvertedToJson"  # Folder to save the JSON files
process_pdfs(pdf_folder, output_folder)


Saved JSON file: D:\NBEDirectivesAssist\Converted\Asset Classification and Provisioning Directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Bank Corporate Governance.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Financial Consumer Protection Directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Foreign Exchange Directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Limits on Birr and Foreign Currency in the Territory of Ethiopia.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Payment Authorization Directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Related banking Transaction Directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Requirnment for person with Significant Influence directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\Supervision of Banking business directive.json
Saved JSON file: D:\NBEDirectivesAssist\Converted\The Establishment of and operation of credit reference bureau Directive.json
