In [2]:
import os
import json

def generate_level_document(json_data):
    """
    Takes a JSON data (as a dictionary) that contains a pdf_name and sections,
    and returns a string document summarizing the number of sections at each level.
    
    Parameters:
      json_data (dict): The JSON data containing keys "pdf_name" and "sections".
      
    Returns:
      str: A document with the pdf_name and counts of each level.
    """
    # Get the PDF name; default to "Unknown PDF" if not present
    pdf_name = json_data.get("pdf_name", "Unknown PDF")
    
    # Get the sections dictionary; default to an empty dict if not present
    sections = json_data.get("sections", {})
    
    # Create a dictionary to count sections per level
    level_counts = {}
    for key, section in sections.items():
        # Each section is expected to have a "level" attribute.
        level = section.get("level")
        if level is not None:
            level_counts[level] = level_counts.get(level, 0) + 1

    # Build the output document as a string
    document_lines = [f"PDF Name: {pdf_name}", "Level counts:"]
    for level in sorted(level_counts.keys()):
        document_lines.append(f"  Level {level}: {level_counts[level]} section(s)")
    
    return "\n".join(document_lines)


def process_all_json_files(start_folder):
    """
    Recursively processes all .json files in the given folder and its subfolders.
    For each JSON file found, it loads the JSON, generates a level document, and
    adds information about the subfolder (relative to the start folder).
    
    Parameters:
      start_folder (str): The root folder path to search for JSON files.
      
    Returns:
      str: A combined document containing the results for all JSON files.
    """
    documents = []
    for root, dirs, files in os.walk(start_folder):
        for file in files:
            if file.lower().endswith(".json"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        json_data = json.load(f)
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                
                # Generate the document for this JSON file
                doc = generate_level_document(json_data)
                # Determine subfolder relative to the start_folder
                rel_path = os.path.relpath(root, start_folder)
                documents.append(f"File: {file} (Subfolder: {rel_path})\n{doc}")
    
    return "\n\n".join(documents)


if __name__ == "__main__":
    # Ask the user for the folder path; default to "dataset/extraction_results_renamed"
    folder_path = input("Enter the folder path (default: dataset/extraction_results_renamed): ").strip()
    if not folder_path:
        folder_path = "dataset/extraction_results_renamed"
    
    # Process all JSON files in the given folder and print the result
    combined_document = process_all_json_files(folder_path)
    print("\n=== Processing Results ===\n")
    print(combined_document)



=== Processing Results ===

File: ISO45001_ISO14001_POLICY_SSD0003_R01_EN.json (Subfolder: SSD)
PDF Name: ISO45001_ISO14001_POLICY_SSD0003_EN
Level counts:
  Level 0: 1 section(s)
  Level 1: 1 section(s)

File: MORT015_MORT017_MORT022_JDFT049_JDFT050_ZONE_2_DEU0019_R07_EN.json (Subfolder: DEU)
PDF Name: MORT015_MORT017_MORT022_JDFT049_JDFT050_ZONE_2_DEU0019_EN
Level counts:
  Level 0: 1 section(s)
  Level 1: 1 section(s)

File: X1-IS-DI-03_DEU0122_R00_EN.json (Subfolder: DEU)
PDF Name: X1-IS-DI-03_DEU0122_EN
Level counts:
  Level 0: 1 section(s)
  Level 1: 1 section(s)

File: X1-NIS-RLI-01_DEU0140_R00_EN.json (Subfolder: DEU)
PDF Name: X1-NIS-RLI-01_DEU0140_EN
Level counts:
  Level 0: 1 section(s)
  Level 1: 1 section(s)

File: D5263_D5264_DEU0032_R07_EN.json (Subfolder: DEU)
PDF Name: D5263_D5264_DEU0032_EN
Level counts:
  Level 0: 1 section(s)
  Level 1: 1 section(s)

File: D9410_D9420_D9510_D9520_DEU0043_R05_EN.json (Subfolder: DEU)
PDF Name: D9410_D9420_D9510_D9520_DEU0043_EN
Leve