In [8]:
from langchain.schema import Document
import json

def extract_documents(node: dict, path_acc: str = "") -> list:
    """Flatten a nested summary tree into a flat list of LangChain Documents.

    The tree is walked depth-first (a node first, then its children in order).
    Two kinds of Document are emitted:

    * one per node whose ``final_summary`` carries a non-null ``summary``;
      the page content is the summary text, and
    * one per entry of ``summaries`` on nodes whose ``type`` is
      ``"PYTHON_FILE"``; the page content is the summary, a blank line,
      then the code.

    Args:
        node: Dict describing one tree node. Keys read here are ``name``,
            ``type``, ``path``, ``final_summary``, ``summaries`` and
            ``children``. All are optional; ``summaries`` and ``children``
            may also be an explicit ``None`` (JSON ``null``).
        path_acc: Slash-joined names of the ancestors of ``node``. It is
            only threaded through the recursion; the ``path`` stored in
            Document metadata always comes from ``node["path"]``.

    Returns:
        A list of ``Document`` objects for ``node`` followed by those of its
        descendants, in traversal order.

    Raises:
        KeyError: If an entry of ``summaries`` lacks ``name``, ``summary``
            or ``code``.
    """
    documents = []
    name = node.get("name", "")
    current_path = f"{path_acc}/{name}".strip("/")

    # Node-level summary (file or directory). An empty/missing final_summary
    # or a null summary produces no Document.
    final = node.get("final_summary")
    if final and final.get("summary") is not None:
        documents.append(Document(
            page_content=final["summary"],
            metadata={
                "type": node.get("type"),
                "name": final.get("name", ""),
                "path": node.get("path"),
                "code": final.get("code", ""),
                "summary": final["summary"],
            }
        ))

    # Function-level summaries inside PYTHON_FILEs
    if node.get("type") == "PYTHON_FILE":
        # .get() mirrors the node-level Document above, so a file node without
        # a "path" yields path=None instead of raising KeyError.
        curr_path = node.get("path")
        # `or []` covers both a missing key and an explicit JSON null.
        for func in node.get("summaries") or []:
            documents.append(Document(
                page_content=f"{func['summary']}\n\n{func['code']}",
                metadata={
                    "type": "FUNCTION",
                    "name": func["name"],
                    "path": curr_path,
                    "code": func["code"],
                    "summary": func["summary"],
                }
            ))

    # Recurse into children; `or []` again tolerates "children": null.
    for child in node.get("children") or []:
        documents.extend(extract_documents(child, current_path))

    return documents

# Load the nested summary tree from disk and flatten it into Documents.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default; it also matches the UTF-8 used when writing
# langchain_docs.json.
with open("summary_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

documents = extract_documents(data)

In [9]:
# Reduce every Document to a plain dict so the json module can encode it.
serializable_docs = [
    dict(page_content=entry.page_content, metadata=entry.metadata)
    for entry in documents
]

# Persist the records as JSON with a 2-space indent.
with open("langchain_docs.json", mode="w", encoding="utf-8") as f:
    json.dump(serializable_docs, fp=f, indent=2)