In [35]:
from PubMedDownloader import PubMedEntrezDownloader
from datetime import datetime
import os
from dotenv import load_dotenv
load_dotenv()

downloader = PubMedEntrezDownloader(email="olandechris@gmail.com", api_key = os.getenv("PUBMED_API_KEY"))

pmids = downloader.advanced_search(
    #query="Diabetes treatment",
    #mesh_terms=["Music"],
    #title_words=,
    #journal = ,
    #publication_types=["Journal Article", "Review"],
    #languages=["eng"],
    date_from="2023/01/01",
    date_to = "2025/06/01",
    max_results=3000
)

articles = downloader.fetch_article_details(pmids)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
downloader.save_to_json(articles, f"research{timestamp}.json")

Searching PubMed with query:  AND 2023/01/01[PDAT]:2025/06/01[PDAT]
Found 3919060 total articles, retrieving 3000 IDs
Fetching batch 1/30 (100 articles)...
Fetching batch 2/30 (100 articles)...
Fetching batch 3/30 (100 articles)...
Fetching batch 4/30 (100 articles)...
Fetching batch 5/30 (100 articles)...
Fetching batch 6/30 (100 articles)...
Fetching batch 7/30 (100 articles)...
Fetching batch 8/30 (100 articles)...
Fetching batch 9/30 (100 articles)...
Fetching batch 10/30 (100 articles)...
Fetching batch 11/30 (100 articles)...
Fetching batch 12/30 (100 articles)...
Fetching batch 13/30 (100 articles)...
Fetching batch 14/30 (100 articles)...
Fetching batch 15/30 (100 articles)...
Fetching batch 16/30 (100 articles)...
Fetching batch 17/30 (100 articles)...
Fetching batch 18/30 (100 articles)...
Fetching batch 19/30 (100 articles)...
Fetching batch 20/30 (100 articles)...
Fetching batch 21/30 (100 articles)...
Fetching batch 22/30 (100 articles)...
Fetching batch 23/30 (100 article

In [None]:
# To avoid reloading the kernel, i monkey patch the function 
#INFO: we can use the function from the module later, will need to use caching
def get_article_statistics(articles):
        """Generate basic statistics about downloaded articles"""
        if not articles:
            return {}
        
        df = pd.DataFrame(articles)
        
        stats = {
            'total_articles': len(articles),
            'articles_with_abstracts': len(df[df['abstract'].str.len() > 0]),
            'date_range': {
                'earliest': df['year'].min(),
                'latest': df['year'].max()
            },
            'top_journals': df['journal'].value_counts().head(10).to_dict(),
            'publication_types': df['publication_types'].value_counts().head(10).to_dict(),
            'articles_per_year': df['year'].value_counts().sort_index().to_dict()
        }
        
        return stats

# Monkey patch it
import types
downloader.get_article_statistics = types.MethodType(get_article_statistics, downloader)


In [41]:
get_article_statistics(articles)

{'total_articles': 2998,
 'articles_with_abstracts': 2452,
 'date_range': {'earliest': '', 'latest': '2025'},
 'top_journals': {'Scientific reports': 34,
  'PloS one': 33,
  'Methods in molecular biology (Clifton, N.J.)': 26,
  'International journal of biological macromolecules': 24,
  'Cureus': 20,
  'Optics letters': 18,
  'Advanced science (Weinheim, Baden-Wurttemberg, Germany)': 17,
  'Journal of the American Chemical Society': 17,
  'Small (Weinheim an der Bergstrasse, Germany)': 16,
  'The Journal of organic chemistry': 15},
 'publication_types': {'Journal Article': 2216,
  'Journal Article; Review': 266,
  'Letter': 115,
  'Editorial': 71,
  'Published Erratum': 56,
  'Case Reports; Journal Article': 54,
  'News': 22,
  'Journal Article; Retraction Notice': 20,
  'English Abstract; Journal Article': 18,
  'Journal Article; Case Reports': 13},
 'articles_per_year': {'': 12, '2024': 10, '2025': 2976}}

In [1]:
from document_processor import DocumentProcessor
from batchprocessor import PMCBatchProcessor
from typing import Optional, Dict, Any
def create_pmc_processor(
    cohere_api_key: Optional[str] = None,
    batch_size: int = 50,
    max_concurrent_batches: int = 2,
    max_api_calls_per_minute: int = 100
) -> PMCBatchProcessor:

    
    # Create document processor
    doc_processor = DocumentProcessor(
        cohere_api_key=cohere_api_key,
        breakpoint_threshold_type="gradient",
        #breakpoint_threshold_amount=95
    )
    
    # Create PMC batch processor
    pmc_processor = PMCBatchProcessor(
        document_processor=doc_processor,
        batch_size=batch_size,
        max_concurrent_batches=max_concurrent_batches,
        max_api_calls_per_minute=max_api_calls_per_minute
    )
    
    return pmc_processor


def pmc_progress_callback(completed_batches: int, total_batches: int, batch_result: Dict[str, Any]):
    """Progress callback function for PMC processing"""
    percentage = (completed_batches / total_batches) * 100
    
    if batch_result["success"]:
        print(f" Progress: {completed_batches}/{total_batches} batches "
              f"({percentage:.1f}%) - Batch {batch_result['batch_num']}: "
              f"{batch_result['original_count']} docs → {batch_result['chunk_count']} chunks")
    else:
        print(f" Progress: {completed_batches}/{total_batches} batches "
              f"({percentage:.1f}%) - Batch {batch_result['batch_num']} failed")




In [2]:
processor = create_pmc_processor(
    batch_size=96,  
    max_concurrent_batches=10,  
    max_api_calls_per_minute=2000 
)
    
# Process the PMC file
results = processor.process_pmc_file(
    file_path="research20250605_002659.json",
    max_docs=None,  # Process all documents
    progress_callback=pmc_progress_callback
)
    
# Save results
processor.save_results(
    results=results,
    output_dir="output/pmc_chunks",
    save_batch_details=True
)
    
print("\nProcessing Summary:")
print(f"Total documents: {results['processing_summary']['total_documents']}")
print(f"Total chunks: {results['processing_summary']['total_chunks']}")
print(f"Success rate: {results['processing_summary']['success_rate']:.1f}%")
print(f"Processing time: {results['processing_summary']['processing_time']:.2f} seconds")
print(f"Average time per batch: {results['processing_summary']['avg_time_per_batch']:.2f} seconds")
print(f"Documents per second: {results['processing_summary']['docs_per_second']:.2f}")

INFO:document_processor:Initialized DocumentProcessor with semantic chunking
INFO:document_processor:Embeddings model: embed-english-v3.0
INFO:document_processor:Breakpoint threshold type: gradient
INFO:batchprocessor:Loading PMC data from research20250605_002659.json
INFO:batchprocessor:Loaded 2998 PMC documents from research20250605_002659.json
INFO:batchprocessor:Found 2452 documents with valid abstracts
INFO:batchprocessor:Creating 26 batches of size 96
INFO:batchprocessor:Processing 2452 documents in 26 batches
INFO:batchprocessor:Max concurrent batches: 10
INFO:batchprocessor:Processing batch 1/26 (96 documents, attempt 1)
INFO:batchprocessor:Processing batch 2/26 (96 documents, attempt 1)
INFO:batchprocessor:Processing batch 3/26 (96 documents, attempt 1)
INFO:batchprocessor:Processing batch 4/26 (96 documents, attempt 1)
INFO:batchprocessor:Processing batch 5/26 (96 documents, attempt 1)
INFO:batchprocessor:Processing batch 6/26 (96 documents, attempt 1)
INFO:batchprocessor:Pro

 Progress: 1/26 batches (3.8%) - Batch 3: 96 docs → 182 chunks


INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "H

 Progress: 2/26 batches (7.7%) - Batch 10: 96 docs → 181 chunks


INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "H

 Progress: 3/26 batches (11.5%) - Batch 2: 96 docs → 192 chunks


INFO:document_processor:Processing file: temp_batch_13_1749132294.1389005.json
INFO:document_processor:Loading documents...
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Created 192 semantic chunks from 96 documents
INFO:batchprocessor:Processing batch 14/26 (96 documents, attempt 1)
INFO:batchprocessor: Batch 6 completed: 96 docs → 192 chunks
INFO:document_processor:Processing file: temp_batch_14_1749132297.0965922.json
INFO:document_processor:Loading documents...
INFO:document_processor:Loaded 96 documents


 Progress: 4/26 batches (15.4%) - Batch 6: 96 docs → 192 chunks


INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Filtered out 2 chunks smaller than 50 characters
INFO:document_processor:Created 188 semantic chunks from 96 documents
INFO:batchprocessor:Processing batch 15/26 (96 documents, attempt 1)
INFO:batchpr

 Progress: 5/26 batches (19.2%) - Batch 7: 96 docs → 188 chunks


INFO:document_processor:Processing file: temp_batch_15_1749132299.723882.json
INFO:document_processor:Loading documents...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:document_processor:Created 191 semantic chunks from 96 documents
INFO:document_processor:Filtered out 1 chunks smaller than 50 characters
INFO:batchprocessor:Processing batch 16/26 (96 documents, attempt 1)
INFO:document_processor:Created 191 semantic chunks from 96 documents
INFO:batchprocessor: Batch 5 completed: 96 docs → 191 chunks
INFO:batchprocessor:Processing batch 17/26 (96 documents, attempt 1)
INFO:document_processor:Created 188 semantic chunks from 96 documents
INFO:batchprocessor:Processing batch 18/26 (96 documents, attempt 1)
INFO:document_processor:Processing file: temp_batch_17_1749132300.9308517.json


 Progress: 6/26 batches (23.1%) - Batch 5: 96 docs → 191 chunks


INFO:document_processor:Processing file: temp_batch_16_1749132300.8522236.json
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Loading documents...
INFO:document_processor:Processing file: temp_batch_18_1749132301.00746.json
INFO:document_processor:Loading documents...
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Loading documents...
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:batchprocessor: Batch 8 completed: 96 docs → 191 chunks
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"


 Progress: 7/26 batches (26.9%) - Batch 8: 96 docs → 191 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:batchprocessor: Batch 1 completed: 96 docs → 188 chunks
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"


 Progress: 8/26 batches (30.8%) - Batch 1: 96 docs → 188 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 9/26 batches (34.6%) - Batch 4: 96 docs → 189 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 10/26 batches (38.5%) - Batch 9: 96 docs → 186 chunks


INFO:document_processor:Processing file: temp_batch_20_1749132314.3793352.json
INFO:document_processor:Loading documents...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Loaded 96 documents
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/

 Progress: 11/26 batches (42.3%) - Batch 11: 96 docs → 188 chunks


INFO:document_processor:Loaded 96 documents
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Requ

 Progress: 12/26 batches (46.2%) - Batch 12: 96 docs → 194 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Requ

 Progress: 13/26 batches (50.0%) - Batch 14: 96 docs → 188 chunks


INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "H

 Progress: 14/26 batches (53.8%) - Batch 17: 96 docs → 187 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Created 190 semantic chunks from 96 documents
INFO:batchprocessor:Processing batch 25/26 (96 documents, attempt 1)
INFO:batchprocessor: Batch 16 completed: 96 docs → 190 chunks
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"

 Progress: 15/26 batches (57.7%) - Batch 16: 96 docs → 190 chunks


INFO:document_processor:Loading documents...
INFO:document_processor:Loaded 96 documents
INFO:document_processor:Performing semantic chunking...
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1

 Progress: 16/26 batches (61.5%) - Batch 15: 96 docs → 194 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 17/26 batches (65.4%) - Batch 19: 96 docs → 187 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 18/26 batches (69.2%) - Batch 18: 96 docs → 190 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Filtered out 1 chunks smaller than 50 characters
INFO:document_processor:Created 192 semantic chunks from 96 documents
INFO:batchprocessor: Batch 13 completed: 96 docs → 192 ch

 Progress: 19/26 batches (73.1%) - Batch 13: 96 docs → 192 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Filtered out 2 chunks smaller than 50 characters
INFO:document_processor:Created 191 semantic chunks from 96 documents
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:batchprocessor: Batch 20 completed: 96 docs → 191 chunks
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200

 Progress: 20/26 batches (76.9%) - Batch 20: 96 docs → 191 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 21/26 batches (80.8%) - Batch 26: 52 docs → 105 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 22/26 batches (84.6%) - Batch 21: 96 docs → 190 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 23/26 batches (88.5%) - Batch 22: 96 docs → 190 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 24/26 batches (92.3%) - Batch 24: 96 docs → 186 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://ap

 Progress: 25/26 batches (96.2%) - Batch 23: 96 docs → 191 chunks


INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
INFO:document_processor:Filtered out 1 chunks smaller than 50 characters
INFO:document_processor:Created 190 semantic chunks from 96 documents
INFO:batchprocessor: Batch 25 completed: 96 docs → 190 chunks
INFO:batchprocessor:PMC processing complete in 858.26 seconds
INFO:batchprocessor:Success rate: 100.0%
INFO:batchprocessor:Total chunks created: 4833


 Progress: 26/26 batches (100.0%) - Batch 25: 96 docs → 190 chunks


INFO:batchprocessor:Saved 4833 chunks to output/pmc_chunks/pmc_semantic_chunks.json
INFO:batchprocessor:Saved processing log to output/pmc_chunks/processing_log.json



Processing Summary:
Total documents: 2452
Total chunks: 4833
Success rate: 100.0%
Processing time: 858.26 seconds
Average time per batch: 33.01 seconds
Documents per second: 2.86


In [1]:
import json
from langchain_core.documents import Document

with open("output/pmc_chunks/pmc_semantic_chunks.json", 'r', encoding='utf-8') as f:
    data = json.load(f)
    

documents = []

# Extract documents from the JSON structure
for doc_data in data.get('documents', []):
    # Create Document with content and metadata
    doc = Document(
        page_content=doc_data['content'],
        metadata=doc_data.get('metadata', {})
    )
    documents.append(doc)

In [6]:
from knowledge_graph import KnowledgeGraph
kg = KnowledgeGraph(
    batch_size=5,           
    entity_batch_size=500,  
    rel_batch_size=200,      
    max_concurrent=15        
)

# Process your documents
#await kg.create_graph_from_documents(documents)
await kg.load_or_create_graph(documents) # Load the previously created graph

Loading existing graph...
Found 26307 nodes and 24066 relationships
Entity types: ['Medical_Conditions', 'Anatomy', 'Medications', 'Methods', 'Equipment', 'Procedures', 'Demographics', 'Institutions', 'Locations', 'Researchers', 'Medical_Condition', 'Time_Periods', 'Method', 'Proteins', 'Protein', 'Journals', 'Procedure', 'Chemical', 'Substance', 'Materials', 'Microorganism', 'Gene', 'Enzyme', 'Medication', 'Studies', 'Cell_Line', 'Material', 'Genes', 'Organism', 'Measurements', 'Biological_Process', 'Chemical_Compound', 'Location', 'Datasets', 'Biomarker', 'Field', 'Microorganisms', 'Cell_Lines', 'Mutation', 'Measure', 'Dataset', 'Database', 'Study', 'Model', 'Cell_Type', 'Events', 'Chemical_Compounds', 'Property', 'Genome', 'Phenomenon', 'Amino_Acid', 'Food', 'Metabolite', 'Genetic_Variant', 'Models', 'None', 'Breeds', 'Metabolites', 'Receptors', 'Genetics', 'Genetic_Mutation', 'Industry', 'Genetic_Material', 'Substances', 'Pathway', 'Stimuli', 'Event', 'Toxin', 'Chemical_Element', '

True

In [8]:
kg.visualize_graph()

GraphWidget(layout=Layout(height='800px', width='100%'))