In [1]:
import spacy
from scispacy.abbreviation import AbbreviationDetector
import re


In [2]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("jonathansuru/pharmacopeia_pdf")

In [3]:
def clean_text(text):
    """Clean and preprocess table-like text to extract disease mentions."""
    # Remove table formatting and LaTeX-like patterns
    text = re.sub(r'\|\s*\d+\s*\|', ' ', text)  # Remove table cell numbers
    text = re.sub(r'\$\\begin\{aligned\}.*?\\end\{aligned\}\$', ' ', text)  # Remove LaTeX
    text = re.sub(r'<br>', ' ', text)  # Replace HTML breaks
    text = re.sub(r'\|\s*:--:\s*\|', ' ', text)  # Remove table formatting
    text = re.sub(r'\|\s*\|', ' ', text)  # Remove empty cells
    return text



In [4]:
def extract_diseases(text, use_multiple_models=True):
    """
    Extract disease mentions from text using multiple SciSpacy models.

    Args:
        text (str): The input text to analyze
        use_multiple_models (bool): Whether to use multiple models for better coverage

    Returns:
        dict: Dictionary with disease entities and their sources
    """
    # Clean the text to improve entity recognition
    cleaned_text = clean_text(text)

    # Store all detected disease entities
    diseases = {}

    # BC5CDR model - good for disease and chemical recognition
    nlp_bc5cdr = spacy.load("en_ner_bc5cdr_md")
    nlp_bc5cdr.add_pipe("abbreviation_detector")

    doc = nlp_bc5cdr(cleaned_text)
    for ent in doc.ents:
        if ent.label_ == "DISEASE":
            diseases[ent.text] = {"source": "BC5CDR", "label": "DISEASE"}

    # Add abbreviation resolutions
    for abrv in doc._.abbreviations:
        if abrv.text not in diseases and abrv._.long_form.text in diseases:
            diseases[abrv.text] = {"source": "BC5CDR", "label": "DISEASE",
                                   "abbreviation_of": abrv._.long_form.text}

    # If using multiple models for better coverage
    if use_multiple_models:
        # BioNLP13CG model for broader biomedical entity recognition
        nlp_bionlp = spacy.load("en_ner_bionlp13cg_md")
        nlp_bionlp.add_pipe("abbreviation_detector")

        doc = nlp_bionlp(cleaned_text)
        for ent in doc.ents:
            if ent.label_ == "DISORDER" and ent.text not in diseases:
                diseases[ent.text] = {"source": "BioNLP13CG", "label": "DISORDER"}

    # Add common disease patterns not covered by models
    disease_patterns = [
            r"(east coast fever)",
            r"(theileriosis)",
            r"(measles)",
            r"(cough)",
            r"(diarrhoea)"
    ]

    for pattern in disease_patterns:
        matches = re.finditer(pattern, cleaned_text, re.IGNORECASE)
        for match in matches:
            disease = match.group(0)
            if disease not in diseases:
                diseases[disease] = {"source": "Pattern", "label": "DISEASE"}

    return diseases



In [5]:
data = []
results = []

In [6]:
ds = ds['train']

In [7]:
from tqdm import tqdm

for i in tqdm(ds['text']):
    result = extract_diseases(i)
    if result:
        data.append(i)
        results.append(result)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
  global_matches = self.global_matcher(doc)
  1%|          | 74/10003 [14:26<32:16:37, 11.70s/it]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1075f64e0>>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/OpenPharm/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 

In [26]:
len(data)

15

In [None]:
from tqdm import tqdm
import concurrent.futures
from functools import partial

def process_text(text):
    result = extract_diseases(text)
    if result:
        return (text, result)
    return None

# Use ThreadPoolExecutor instead of ProcessPoolExecutor
def parallel_extract_diseases(texts, max_workers=None):
    data = []
    results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Use tqdm to track progress
        processed_results = list(tqdm(
                executor.map(process_text, texts),
                total=len(texts)
        ))

        # Filter out None results and separate into data and results
        for item in processed_results:
            if item:
                data.append(item[0])
                results.append(item[1])

    return data, results

# Run the accelerated extraction
data, results = parallel_extract_diseases(ds['text'])

  0%|          | 13/10003 [07:41<80:22:30, 28.96s/it] 