# Development script for testing PICO extraction from clinical trials data

In [1]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" # suppresses Huggingface warning of storing data rather than symlinking it
from transformers import AutoTokenizer, AutoModelForPreTraining, pipeline
import pandas as pd

import sys
sys.path.append('../../')
# load functions for import of clinicaltrials.gov data written previously
from app.data.loader import load_trials_json, extract_from_clinicaltrials

# extractive summarization with textrank:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

#import nltk
#nltk.download('punkt_tab') # needed for first time usage? Is being downloaded into user folder --> TODO: check out if works with dockerization and if it works with 'punkt' only

In [None]:
# Load the kamalkraj/BioELECTRA-PICO checkpoint as a token-classification (NER) pipeline.
# The helper functions further down read an 'entity_group' key from every returned entity,
# so keep aggregation_strategy set when swapping in another model (the un-aggregated output
# recorded later in this notebook only carries an 'entity' key).
# TODO: clean up C:\Users\USER\.cache\huggingface\hub, deleting unneeded models
# Earlier approach (loading tokenizer and model separately), kept for reference:
# tokenizer = AutoTokenizer.from_pretrained("kamalkraj/BioELECTRA-PICO")
# model = AutoModelForPreTraining.from_pretrained("kamalkraj/BioELECTRA-PICO")
ner_pipeline = pipeline("token-classification", 
                        model="kamalkraj/BioELECTRA-PICO",
                        aggregation_strategy="simple")

                

Some weights of ElectraForPreTraining were not initialized from the model checkpoint at kamalkraj/BioELECTRA-PICO and are newly initialized: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [28]:
# Testing mini-models for NER
#ner_pipeline = pipeline("token-classification", model="nlpie/bio-mobilebert")      # only has LABEL_0, LABEL_1 as labels, not PICO
#ner_pipeline = pipeline("token-classification", model="nlpie/compact-biobert")  # only has LABEL_0, LABEL_1 as labels, not PICO
#ner_pipeline = pipeline("token-classification", model="nlpie/bio-distilbert-cased")  # only has LABEL_0, LABEL_1 as labels, not PICO
#ner_pipeline = pipeline("token-classification", model="nlpie/bio-tinybert")  # only has LABEL_0, LABEL_1 as labels, not PICO
# ner_pipeline = pipeline("token-classification", model="nlpie/distil-biobert") # only has LABEL_0, LABEL_1 as labels, not PICO
#ner_pipeline = pipeline("token-classification", model="nlpie/tiny-biobert")  # only has LABEL_0, LABEL_1 as labels, not PICO



config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/55.6M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nlpie/tiny-biobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/55.6M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[{'entity': 'LABEL_0',
  'score': np.float32(0.53955513),
  'index': 1,
  'word': 'This',
  'start': 0,
  'end': 4},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52265954),
  'index': 2,
  'word': 'will',
  'start': 5,
  'end': 9},
 {'entity': 'LABEL_1',
  'score': np.float32(0.55467814),
  'index': 3,
  'word': 'be',
  'start': 10,
  'end': 12},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52197886),
  'index': 4,
  'word': 'a',
  'start': 13,
  'end': 14},
 {'entity': 'LABEL_1',
  'score': np.float32(0.53868324),
  'index': 5,
  'word': 'double',
  'start': 15,
  'end': 21},
 {'entity': 'LABEL_0',
  'score': np.float32(0.50593114),
  'index': 6,
  'word': 'blind',
  'start': 22,
  'end': 27},
 {'entity': 'LABEL_1',
  'score': np.float32(0.51071393),
  'index': 7,
  'word': ',',
  'start': 27,
  'end': 28},
 {'entity': 'LABEL_1',
  'score': np.float32(0.5679425),
  'index': 8,
  'word': 'place',
  'start': 29,
  'end': 34},
 {'entity': 'LABEL_1',
  'score': np.float32(0.53768367),

In [None]:
# Extractive summarization model (disabled; its call signature takes a dict of sentences,
# see the commented-out call in the summarization cell below)
#sum_pipeline = pipeline("summarization",
#  model = "NotXia/pubmedbert-bio-ext-summ",
#  tokenizer = AutoTokenizer.from_pretrained("NotXia/pubmedbert-bio-ext-summ"),
#  trust_remote_code = True,
#  device = 0
#)

# Abstractive summarization model
# NOTE: the recorded download progress shows model weights of about 3.13G for this checkpoint
sum_pipeline = pipeline("summarization", model="L-macc/autotrain-Biomedical_sc_summ-1217846148")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# Load and process data from clinicaltrials.gov
# The project root defaults to the original checkout location below, but can be overridden
# through the BIOMED_EXTRACTOR_ROOT environment variable so the notebook also runs on
# machines with a different directory layout.
PROJECT_ROOT = os.environ.get(
    'BIOMED_EXTRACTOR_ROOT',
    os.path.expanduser('~/Documents/github/biomed_extractor'),
)

# Data directory at top level
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')

df_json = load_trials_json(filepath = DATA_DIR, filename ='example_trials.json')
#print(df_json.head())
# Flatten the raw records into one row per trial with the text columns used below
mydf = extract_from_clinicaltrials(df_json)
mydf.head()

Loaded 100 records from example_trials.json


Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00105105,The purpose of this study is to evaluate the e...,"This will be a double blind, placebo controlle...",diagnosis of alzheimer's disease; women must h...,women with an intact uterus; a clinically sign...,Mifepristone,effects on cognition
1,NCT00160147,This is a 10-week study with bifeprunox and pl...,,diagnosis of dementia of the alzheimer's type,history of seizure disorder; clinically signif...,bifeprunox; Placebo,Brief Psychiatric Rating Scale (BPRS) Total Score
2,NCT00299988,The overall goal of this double-blind Phase II...,Abnormal processing of the beta-amyloid protei...,1. diagnosis of probable alzheimer's disease (...,1. non-alzheimer dementia.2. active renal dise...,Intravenous Immunoglobulin; Placebo,ADAS-Cog; CGIC
3,NCT00334568,Clinical features in patients with the familia...,,meets the national institute of neurological a...,has a history of or suffers from claustrophobi...,Rosiglitazone XR (extended release) oral table...,Change in global and regional cerebral glucose...
4,NCT00362024,MK0952 is a phosphodiesterase type IV (PDE4) i...,,"male or females; age \>/= 55 years, with mild-...",patients must not be living in nursing home or...,MK0952; Comparator: Placebo,


In [4]:
# Use the detailed description of the first trial as sample text for the experiments below
mytext = mydf['detailedDescription'][0]
# Naive sentence split on '. ': sentences separated by a line break instead of a space
# stay glued together (visible in the printed list)
mytext_list = mytext.split('. ')
print(mytext_list[:5])  # print first 5 sentences

["This will be a double blind, placebo controlled study of C-1073 to evaluate the effects on cognition in patients with mild to moderate Alzheimer's disease who are already receiving an acetylcholinesterase inhibitor and have been on a stable dose for at least 12 weeks", 'Patients will be randomized (1:1) to either daily dosing with 300 mg C-1073 or a placebo for 16 weeks', 'Patients will continue the stable daily dose of acetylcholinesterase inhibitor throughout the study.\n\nVisits will be weekly at the beginning of the study, then every two weeks, and every 4 weeks after week 12', 'Assessments during these visits may include cognition and behavior, depression, safety, as well as physical exams, clinical laboratory tests, EKG and adverse event reporting.']


In [22]:
# Call variant for the (disabled) extractive model, which takes the pre-split sentence list:
#res_sum = sum_pipeline({"sentences": mytext_list}, strategy="count", strategy_args=2)
# Abstractive summary of the sample text.
# NOTE(review): do_sample=True makes generation stochastic, so the summary may differ between runs.
res_sum = sum_pipeline(mytext, max_length=50, min_length=25, do_sample=True)
res_sum

[{'summary_text': 'the purpose of this study was to evaluate the effects of acetylcholinesterase inhibitors on cognition in patients with mild to moderate dementia who are already receiving an acetoylcholesterase inhibitor and have been'}]

In [5]:
# Extractive summary of the sample text with sumy's TextRank summarizer
parser = PlaintextParser.from_string(mytext, Tokenizer("english"))
summarizer = TextRankSummarizer()
summary = summarizer(parser.document, sentences_count=2)  # number of sentences in summary

# Print each selected sentence on its own line
for sentence in summary:
    print(sentence)

This will be a double blind, placebo controlled study of C-1073 to evaluate the effects on cognition in patients with mild to moderate Alzheimer's disease who are already receiving an acetylcholinesterase inhibitor and have been on a stable dose for at least 12 weeks.
Visits will be weekly at the beginning of the study, then every two weeks, and every 4 weeks after week 12.


In [21]:
# Run the currently loaded NER pipeline on the raw sample text and display the returned entities
res_ner = ner_pipeline(mytext)
res_ner

[{'entity': 'LABEL_0',
  'score': np.float32(0.59156156),
  'index': 1,
  'word': 'This',
  'start': 0,
  'end': 4},
 {'entity': 'LABEL_0',
  'score': np.float32(0.55473566),
  'index': 2,
  'word': 'will',
  'start': 5,
  'end': 9},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5352636),
  'index': 3,
  'word': 'be',
  'start': 10,
  'end': 12},
 {'entity': 'LABEL_1',
  'score': np.float32(0.52000993),
  'index': 4,
  'word': 'a',
  'start': 13,
  'end': 14},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5830644),
  'index': 5,
  'word': 'double',
  'start': 15,
  'end': 21},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5304068),
  'index': 6,
  'word': 'blind',
  'start': 22,
  'end': 27},
 {'entity': 'LABEL_0',
  'score': np.float32(0.52235097),
  'index': 7,
  'word': ',',
  'start': 27,
  'end': 28},
 {'entity': 'LABEL_1',
  'score': np.float32(0.50336933),
  'index': 8,
  'word': 'place',
  'start': 29,
  'end': 34},
 {'entity': 'LABEL_1',
  'score': np.float32(0.50228494),
 

In [6]:
# Helper functions. compose_trial_text builds the text the models run PICO extraction/summarization on: the brief summary, followed by the detailed description when one is present
def compose_trial_text(row):
    """
    Build the free text of one trial from its summary fields.

    row: mapping-like record (e.g. a DataFrame row) with the keys
         'briefSummary' and 'detailedDescription'.
    Returns: the brief summary followed by the detailed description, joined by a
             single space. Fields that are missing (None/NaN), not strings, or
             blank are skipped, so the result is always a string ('' if both
             fields are unusable).
    """
    parts = []
    for column in ('briefSummary', 'detailedDescription'):
        value = row[column]
        # Missing values may arrive as NaN floats; concatenating those with a string
        # would raise a TypeError, so only real, non-blank strings are kept.
        if isinstance(value, str) and value.strip():
            parts.append(value)
    return " ".join(parts)

def _iter_chunk_spans(n_chars, chunk_char_length, overlap):
    """Yield (start, end) character spans that cover n_chars with overlapping windows."""
    if chunk_char_length <= 0:
        raise ValueError("chunk_char_length must be positive")
    # An overlap as large as the window would never advance the start position
    if not 0 <= overlap < chunk_char_length:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_char_length")
    start = 0
    while start < n_chars:
        end = min(start + chunk_char_length, n_chars)
        yield start, end
        if end == n_chars:
            break
        start = end - overlap  # overlap for entities at borders


def chunk_text_by_chars(text, chunk_char_length=1200, overlap=100):
    """
    Split text into consecutive character windows.

    text: string to split.
    chunk_char_length: maximum number of characters per chunk (must be > 0).
    overlap: number of characters shared between neighbouring chunks
             (must be >= 0 and smaller than chunk_char_length).
    Yields: the chunk strings in order; nothing for an empty text.
    Raises: ValueError (on first iteration) for invalid sizes.
    """
    for start, end in _iter_chunk_spans(len(text), chunk_char_length, overlap):
        yield text[start:end]


def run_ner_on_long_text(text, ner_pipeline, chunk_char_length=1200, overlap=100):
    """
    Run a NER pipeline chunk by chunk over a long text.

    text: string to analyse.
    ner_pipeline: callable taking a string and returning a list of entity dicts.
    chunk_char_length, overlap: window settings, see chunk_text_by_chars.
    Returns: one flat list with the entities of all chunks. Each entity is a copy of
             the pipeline's dict whose 'start'/'end' values (when present and not
             None) are shifted by the chunk's start, so they index into the full
             text rather than into the individual chunk. Entities inside the
             overlap region can be reported by both neighbouring chunks.
    """
    entities = []
    for chunk_start, chunk_end in _iter_chunk_spans(len(text), chunk_char_length, overlap):
        for ent in ner_pipeline(text[chunk_start:chunk_end]):
            shifted = dict(ent)  # copy so the pipeline's own result is not modified
            for key in ('start', 'end'):
                if shifted.get(key) is not None:
                    shifted[key] = shifted[key] + chunk_start
            entities.append(shifted)
    return entities

def clean_population_entities(pop_entities, min_len=8, keywords=None):
    """
    Filter population entities down to those that mention a demographic term.

    pop_entities: iterable of entity strings.
    min_len: minimum length (after trimming) an entity needs to be kept.
    keywords: optional iterable of terms to look for; defaults to a built-in list
              of demographic words. Pass additional terms (e.g. diagnosis words)
              here if they should qualify an entity as well.
    Returns: the kept entities, de-duplicated, sorted and joined with '; '.

    Keywords are matched as whole words. Plain substring matching would let 'men'
    accept unrelated phrases such as "treatment" or "management".
    """
    if keywords is None:
        keywords = ['patients', 'subjects', 'adults', 'children', 'individuals', 'men', 'women',
                    'male', 'males', 'female', 'females', 'participants']
    kw_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(kw.lower()) for kw in keywords) + r')\b'
    )
    result = set()
    for ent in pop_entities:
        ent_clean = ent.strip().strip(';,')
        if len(ent_clean) < min_len:
            continue
        if kw_pattern.search(ent_clean.lower()):
            result.add(ent_clean)
    # Set membership removes exact duplicates; sorting keeps the output stable
    return '; '.join(sorted(result))

def merge_entities(entities):
    """
    Merge directly adjacent entities of the same entity_group into one phrase.

    entities: list of dicts, each with 'entity_group', 'word', 'start', 'end'
              ('start'/'end' are character offsets).
    Returns: list of dicts with exactly those four keys, ordered by 'start'.
             Two entities are merged only when they share the entity_group and
             the second one starts exactly where the first one ends; their words
             are joined with a single space. An empty input yields an empty list.
    """
    if not entities:
        return []

    merged = []
    # Sort by character start position so neighbours in the text are neighbours in the loop
    for ent in sorted(entities, key=lambda x: x['start']):
        last = merged[-1] if merged else None
        if (last is not None and
                ent['entity_group'] == last['entity_group'] and
                ent['start'] == last['end']):   # must be adjacent
            # Extend the phrase collected so far
            last['word'] += ' ' + ent['word']
            last['end'] = ent['end']
        else:
            # Start a new phrase; copy only the fields used downstream
            merged.append({
                'entity_group': ent['entity_group'],
                'word': ent['word'],
                'start': ent['start'],
                'end': ent['end']
            })
    return merged

def extract_pico_from_merged_entities(entities):
    """
    Group entity words by PICO element.

    entities: iterable of dicts with 'entity_group' and 'word'.
    Returns: dict with the keys 'participants', 'intervention', 'comparator' and
             'outcome'. Each value holds the distinct words of that group
             (group names compared case-insensitively), sorted and joined with
             '; ', or '' when the group has no words. Other groups are ignored.
    """
    pico_keys = ("participants", "intervention", "comparator", "outcome")
    words_by_key = {key: set() for key in pico_keys}
    for entity in entities:
        bucket = words_by_key.get(entity['entity_group'].lower())
        if bucket is not None:
            bucket.add(entity['word'])
    # join multiple same-type entities per doc
    return {key: "; ".join(sorted(words_by_key[key])) for key in pico_keys}

import re
from difflib import SequenceMatcher

# Unspecific intervention terms: deduplicate_intervention_entities drops normalized
# entities that are exactly equal to one of these
GENERIC_WORDS = {'medications', 'treatment', 'antipsychotics', 'antidepressants'}

def normalize_intervention(ent):
    """
    Reduce an intervention string to a canonical form for comparison.

    The text is lower-cased and trimmed, then the substitutions below are applied
    in order: drop parenthesised parts, turn punctuation into spaces, collapse
    whitespace, and cut everything from the first standalone 'or' and from the
    first standalone 'and' onwards. The result is trimmed once more.
    """
    substitutions = (
        (r"\(([^\)]*)\)", ""),
        (r"[^\w\s]", " "),
        (r"\s+", " "),
        (r"\bor\b.*$", ""),
        (r"\band\b.*$", ""),
    )
    normalized = ent.lower().strip()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def is_substring_duplicate(e, deduped):
    """
    Tell whether e contains, or is contained in, a different entry of deduped.

    An entry identical to e does not count as a duplicate here.
    """
    return any(
        e != existing and (e in existing or existing in e)
        for existing in deduped
    )

def deduplicate_intervention_entities(entities, threshold=0.85):
    """
    Normalize intervention entities and drop near-duplicates.

    entities: iterable of raw intervention strings.
    threshold: SequenceMatcher ratio above which two normalized entities are
               treated as the same intervention.
    Returns: the surviving normalized entities, sorted and joined with '; '.

    Raw entities of at most 2 characters (after trimming) are skipped, as are
    entities that normalize to an empty string or to one of GENERIC_WORDS.
    Candidates are checked in input order against the entities kept so far.
    """
    candidates = []
    for raw in entities:
        if len(raw.strip()) <= 2:
            continue
        normalized = normalize_intervention(raw)
        if normalized and normalized not in GENERIC_WORDS:
            candidates.append(normalized)

    kept = []
    for candidate in candidates:
        if is_substring_duplicate(candidate, kept):
            continue
        too_similar = any(
            SequenceMatcher(None, candidate, prior).ratio() > threshold
            for prior in kept
        )
        if too_similar:
            continue
        kept.append(candidate)
    return "; ".join(sorted(kept))

def summarize_textRank(text, sentences_count=2, language="english"):
    """
    Extractive summary of a text using sumy's TextRank summarizer.

    text: text to summarize.
    sentences_count: number of sentences requested for the summary.
    language: language name passed to sumy's Tokenizer.
    Returns: the selected sentences joined with single spaces; '' when text is
             missing (None/NaN), not a string, or blank.
    """
    # Missing descriptions may arrive as None/NaN; there is nothing to summarize then
    if not isinstance(text, str) or not text.strip():
        return ''
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count=sentences_count)
    return ' '.join(str(sentence) for sentence in summary)



In [7]:
def extract_comparator(interventions):
    """
    Returns the first comparator keyword (in the order of the list below) that
    occurs in the interventions string, else ''.

    Keywords are matched case-insensitively as whole words, with an optional
    plural 's' ("placebos", "controls"). Plain substring matching would report
    'control' for "controlled release" and 'sham' for "shampoo".
    Non-string input (e.g. NaN) yields ''.
    """
    if not isinstance(interventions, str):
        return ''
    comparators = [
        'placebo', 'standard care', 'usual care', 'sham', 'control',
        'standard therapy', 'vehicle', 'observation only', 'waitlist',
        'best supportive care'
    ]
    interventions_lower = interventions.lower()
    for comp in comparators:
        if re.search(r'\b' + re.escape(comp) + r's?\b', interventions_lower):
            return comp
    return ''



def remove_comparator_terms(interventions):
    """
    Drop entries that are exactly a comparator keyword from a ';'-separated string.

    Entries are trimmed; empty entries and entries whose lower-cased text equals
    one of the comparator keywords are removed. The remaining entries are joined
    with '; '. Non-string input is returned unchanged.
    """
    if not isinstance(interventions, str):
        return interventions
    comparators = [
        'placebo', 'standard care', 'usual care', 'sham', 'control',
        'standard therapy', 'vehicle', 'observation only', 'waitlist',
        'best supportive care'
    ]
    kept = []
    for part in interventions.split(';'):
        term = part.strip()
        if not term:
            continue
        if term.lower() in comparators:
            continue
        kept.append(term)
    return '; '.join(kept)

def clean_outcomes(outcome_str):
    """
    Normalize a ';'-separated outcome string.

    outcome_str: string of outcomes separated by ';'.
    Returns: lower-cased outcomes with the characters [ ] ( ) * \\ / : # removed,
             whitespace collapsed, surrounding ' .;,-' stripped, entries of at
             most one character dropped and duplicates removed (first occurrence
             wins, order preserved), joined with '; '. Non-string input yields ''.
    """
    if not isinstance(outcome_str, str):
        return ''
    # Delete brackets, parentheses, asterisks, backslashes, slashes, colons and hashes
    # (commas are kept)
    cleaned = re.sub(r'[\[\]()*\\/:#]', '', outcome_str)
    cleaned = re.sub(r'\s+', ' ', cleaned)   # Multiple spaces to one
    cleaned = cleaned.replace(' ,', ',').replace(' ;', ';')
    cleaned = cleaned.lower().strip()
    # Split on semicolon
    outcomes = [o.strip(' .;,-') for o in cleaned.split(';')]
    # Remove empty/one-character entries, deduplicate (dict keeps insertion order) and join
    outcomes = list(dict.fromkeys(o for o in outcomes if len(o) > 1))
    return '; '.join(outcomes)

In [8]:
def process_trials(df, ner_pipeline):
    """
    Run PICO extraction and summarization over every trial in df.

    df: DataFrame with the columns 'briefSummary' and 'detailedDescription';
        'inclusion_criteria' is used when present.
    ner_pipeline: callable returning entity dicts with 'entity_group', 'word',
                  'start' and 'end' for a given text.
    Returns: the same DataFrame object. The columns 'population_extracted',
             'intervention_extracted', 'outcome_extracted', 'comparator_extracted'
             and 'summary_extracted' are added to df in place.
    """
    pop, intervention, outcome, summary = [], [], [], []
    for _, row in df.iterrows():
        main_text = compose_trial_text(row)
        inclusion_text = row.get("inclusion_criteria", "")
        # Missing criteria arrive as NaN/None; str() would turn them into the literal
        # text "nan"/"None" and run the model on that, so treat them as empty instead.
        if not isinstance(inclusion_text, str):
            inclusion_text = ""

        # Only run NER on desired fields for each PICO
        main_entities = merge_entities(run_ner_on_long_text(main_text, ner_pipeline))
        inclusion_entities = merge_entities(run_ner_on_long_text(inclusion_text, ner_pipeline))

        pico_main = extract_pico_from_merged_entities(main_entities)
        pico_inc = extract_pico_from_merged_entities(inclusion_entities)

        # Population: from inclusion_criteria, Intervention/Outcome: from main text
        cleaned_population = clean_population_entities(pico_inc["participants"].split(";")) if pico_inc["participants"] else ""

        pop.append(cleaned_population)
        raw_interventions = pico_main["intervention"].split(";") if pico_main["intervention"] else []
        cleaned_intervention = deduplicate_intervention_entities(raw_interventions)
        intervention.append(cleaned_intervention)

        outcome.append(pico_main["outcome"])

        summary2sent = summarize_textRank(main_text)
        summary.append(summary2sent)

    df["population_extracted"] = pop
    df["intervention_extracted"] = intervention
    df["outcome_extracted"] = outcome

    # clean up columns
    df['outcome_extracted'] = df['outcome_extracted'].apply(clean_outcomes)
    # The comparator is taken from the extracted interventions, before comparator terms are removed from them
    df['comparator_extracted'] = df['intervention_extracted'].apply(extract_comparator)
    # Clean the intervention_extracted column by removing found comparators
    df['intervention_extracted'] = df['intervention_extracted'].apply(remove_comparator_terms)
    # add main text summary to the dataframe
    df["summary_extracted"] = summary

    return df

In [9]:
# Run the full extraction over all loaded trials. process_trials adds its result columns to
# mydf itself and returns that same DataFrame, so ner_res and mydf are the same object.
# NOTE(review): the helpers read an 'entity_group' key from every entity; the KeyError recorded
# below presumably stems from ner_pipeline being one of the un-aggregated test models at that
# time - confirm by re-running the BioELECTRA-PICO loading cell first.
ner_res = process_trials(mydf, ner_pipeline)
ner_res.head()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyError: 'entity_group'

In [42]:
# Print the extracted outcomes of the first trials for manual inspection.
# head() caps the listing at the number of available rows, and enumerate numbers the
# trials by position, so the loop neither fails on shorter results nor depends on the
# DataFrame's index labels.
N_TRIALS_TO_SHOW = 52
for i, outcome_text in enumerate(ner_res['outcome_extracted'].head(N_TRIALS_TO_SHOW), start=1):
    print(f"Trial {i}:")
    print(f"{outcome_text}")
    #print("-" * 40)

Trial 1:
cognition and behavior, depression, safety, as well as physical exams, clinical laboratory tests, ekg and adverse event reporting.
Trial 2:

Trial 3:
cognitive, behavioral and functional measures; safety, efficacy
Trial 4:

Trial 5:

Trial 6:

Trial 7:

Trial 8:
123; brain regional distribution volumes; plasma; time to the peak uptake and amplitude of the peak uptake; tissue
Trial 9:
oxidative damage
Trial 10:
brain regional distribution volumes; peak uptake
Trial 11:
amplitude of; brain regional distribution volumes; peak uptake
Trial 12:
efficacy; safety
Trial 13:

Trial 14:
efficacy; safety
Trial 15:
assessment of patient accommodation status and caregiver burden ( apas carb ); cognitive decline; effectiveness; mini - mental state examination ( mmse ) ; the disability assessment in dementia ( dad ) ; and; mortality rate; safety; vital status and institutionalization status, adverse events, vital signs, weight, physical and neurologic examinations
Trial 16:

Trial 17:
cognit