In [34]:
import json

In [35]:
# Parse input.json (read as UTF-8 text) into `data`, which later cells consume.
with open("input.json", mode="r", encoding="utf-8") as file:
    data = json.loads(file.read())

In [36]:
def process_node(node):
    """
    Wrap a node, and every node below it, in a uniform envelope.

    The returned dict always has exactly these keys, in this order:
        "tag"      -- the fixed placeholder string "UNK"
        "node"     -- the result of node.copy()
        "children" -- a list holding the wrapped form of each child

    Children are taken from node["children"], but only when that key is
    present and its value is a list; in every other case the "children"
    list of the result is empty.

    Note that the copy stored under "node" keeps every key of the input,
    including its own "children" entry when there is one.
    NOTE(review): nodes are presumably dicts (so the copy is shallow) --
    confirm against the input schema.
    """
    # Take the copy first so that the caller's node is never the object
    # stored in the result.
    snapshot = node.copy()

    wrapped_children = []
    if "children" in node:
        raw_children = node["children"]
        # Anything other than a list is treated as "no children".
        if isinstance(raw_children, list):
            for child in raw_children:
                wrapped_children.append(process_node(child))

    return {"tag": "UNK", "node": snapshot, "children": wrapped_children}

In [37]:
# Extract the "document" entry from the first node under "nodes" that has one.
document_data = {}

for node in data.get("nodes", {}).values():
    if "document" in node:
        document_data = node["document"]
        break  # Only the first "document" found is used

# Stop here with a clear message when nothing usable was found. Otherwise
# `formatted_data` would stay undefined (NameError in the save cell on a fresh
# kernel) or, worse, keep a stale value from an earlier run of this notebook.
if not document_data:
    raise ValueError(
        'No non-empty "document" entry found under "nodes" in the loaded JSON'
    )

# Process the document recursively
formatted_data = process_node(document_data)

In [38]:
# Write the wrapped document tree to disk as JSON indented by 4 spaces.
with open("formatted_document.json", mode="w", encoding="utf-8") as output_file:
    json.dump(formatted_data, fp=output_file, indent=4)

print("Extracted 'document' and saved in recursive structured format to formatted_document.json")

Extracted 'document' and saved in recursive structured format to formatted_document.json
