In [None]:
import json
import logging
import os
from pathlib import Path

from langchain.schema import Document

from medical_graph_rag.data_processing.batch_processor import PMCBatchProcessor
from medical_graph_rag.data_processing.document_processor import DocumentProcessor

In [None]:
%%nbqa_ignore F704
# NOTE(review): `load_dotenv` is not imported in the import cell visible here —
# presumably `from dotenv import load_dotenv` (python-dotenv); confirm and add it
# to the import cell so this cell runs on a fresh kernel.
load_dotenv()


async def main(
    query: str = "",
    max_results: int = 10000,
    output_path: str = "results.json",
    email: "str | None" = None,
):
    """Search PubMed, fetch details for the hits and save them as JSON.

    Every argument defaults to the value that used to be hard-coded in this
    cell, so a bare ``await main()`` behaves as before.

    Args:
        query: Search term forwarded to ``search_pubmed``.
            NOTE(review): the default is the empty string the cell originally
            sent — confirm an empty query is intended.
        max_results: Forwarded to ``search_pubmed`` as ``max_results``.
        output_path: Destination forwarded to ``save_to_json``.
        email: Address passed to ``PubMedEntrezDownloader``. When omitted, the
            ``ENTREZ_EMAIL`` environment variable is used if it is set and
            non-empty; otherwise the address previously hard-coded here.

    Returns:
        None. The fetched articles are only written through ``save_to_json``.
    """
    # Resolve the address outside the source where possible, so a personal
    # e-mail does not have to live in the notebook.
    contact = email or os.environ.get("ENTREZ_EMAIL") or "olandechris@gmail.com"

    # NOTE(review): `PubMedEntrezDownloader` is not imported in the import cell
    # visible here — confirm its module and import it there.
    downloader = PubMedEntrezDownloader(contact)
    pmids = await downloader.search_pubmed(query, max_results=max_results)
    articles = await downloader.fetch_article_details(pmids)
    downloader.save_to_json(articles, output_path)


# Top-level `await` (the reason for the F704 ignore on this cell): runs the
# `main` coroutine defined just above.
await main()

In [None]:
# Notebook-level processors; `batch_processor` is used by the processing cell below.
# NOTE(review): batch_size=50 is set here, but the next cell passes batch_size=30 to
# `process_pmc_file_async` — presumably the per-call value wins; confirm which is intended.
doc_processor = DocumentProcessor()
batch_processor = PMCBatchProcessor(
    document_processor=doc_processor,
    batch_size=50,
    max_concurrent_batches=4,
    retry_attempts=3,
)

In [None]:
%%nbqa_ignore F704

# Run the notebook-level `batch_processor` over the input file and keep the returned
# value in `results`.
# NOTE(review): this reads from "../data/input/...", while the `main` coroutine defined
# further down reads the same file name from "../data/..." — confirm which location is correct.
results = await batch_processor.process_pmc_file_async(
    file_path="../data/input/research20250605_002659.json", max_docs=1000, batch_size=30
)
# print(f"Generated {results['processing_summary']['total_chunks']} chunks")

In [None]:
# Read the chunk file and wrap every entry in a langchain `Document`.
# NOTE(review): the file sits in the directory that the batch-processing `main`
# further down writes to via `save_results`; presumably it is produced by that
# step, so on a fresh Run All it must already exist — confirm.
data_path = Path("../data/output/processed_pmc_data/pmc_chunks.json")
with data_path.open(encoding="utf-8") as f:
    data = json.loads(f.read())

# One Document per entry under the "documents" key, preserving order.
documents = [
    Document(page_content=entry["content"], metadata=entry["metadata"])
    for entry in data["documents"]
]

In [None]:
%%nbqa_ignore F704

# Configure logging at INFO level and create the module-level logger that the
# `main` coroutine below writes its progress and error messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("MainScript")


async def main(
    file_path: str = "../data/research20250605_002659.json",
    output_directory: str = "../data/output/processed_pmc_data",
):
    """Run the PMC batch pipeline on one input file and save the results.

    Builds a fresh ``DocumentProcessor`` / ``PMCBatchProcessor`` pair (local to
    this call), processes ``file_path``, saves the results under
    ``output_directory`` and logs a short summary.

    Args:
        file_path: Input file handed to ``process_pmc_file_async``. The default
            is the path previously hard-coded here.
            NOTE(review): an earlier cell reads the same file name from
            "../data/input/" — confirm which location is correct and pass it
            explicitly if needed.
        output_directory: Directory handed to ``save_results``.

    Returns:
        None. Any exception raised while processing or saving is logged with
        its traceback and not re-raised, so the notebook keeps running.
    """
    doc_processor = DocumentProcessor(embeddings_model="BAAI/bge-small-en-v1.5")

    batch_processor = PMCBatchProcessor(
        document_processor=doc_processor,
        batch_size=96,
        max_concurrent_batches=3,
        retry_attempts=2,
        retry_delay=1.0,
        inter_batch_delay=0.1,
    )

    logger.info(f"Starting batch processing of {file_path}")

    try:
        processing_results = await batch_processor.process_pmc_file_async(
            file_path=file_path
        )

        batch_processor.save_results(
            processing_results, output_directory, save_batch_details=True
        )

        # The summary dict supplies the counts and timings reported below.
        summary = processing_results["processing_summary"]
        logger.info(
            f"Processing complete: {summary['total_documents']:,} docs → {summary['total_chunks']:,} chunks ({summary['processing_time']:.1f}s)"
        )
        logger.info(f"Success rate: {summary['success_rate']:.1f}%")

    except Exception as e:
        # Deliberate best-effort: report the failure with traceback instead of
        # aborting the notebook run.
        logger.error(f"An error occurred during batch processing: {e}", exc_info=True)



# Top-level `await` (the reason for the F704 ignore on this cell): runs the
# batch-processing `main` coroutine defined just above.
await main()

In [None]:
%%nbqa_ignore F704

# NOTE(review): `Main` is not imported in the import cell visible here — confirm
# its module and import it there.
# NOTE(review): this rebinds `main`, which earlier cells define as a coroutine
# function; re-running an earlier `await main()` cell after this one would try to
# call the `Main` instance instead. Consider a distinct name.
main = Main()
# `documents` is the list built from the chunk JSON in an earlier cell.
await main.process_documents(documents)
# `query` is unpacked into three values; their exact contents are not visible here.
response, path, content = await main.query("How are brain tumors detected?")