# Compare self-trained models on GOLD dataset

In [9]:
import os
# Set before the transformers import below.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" # suppresses the Hugging Face warning about storing data rather than symlinking it
from transformers import AutoTokenizer, AutoModelForPreTraining, pipeline

import sys
# Add the directory two levels up to the import path so the `app` package imports below resolve.
sys.path.append('../../')
# load functions for import of clinicaltrials.gov data written previously
from app.data.loader import load_trials_json, extract_from_clinicaltrials
from app.nlp.pipelines import load_ner_trained_pipeline
# NOTE(review): star import hides where names come from; this notebook relies on it for
# process_trials_for_PICO, process_trials_for_retrained_PICO and normalize_intervention
# (presumably defined in app.nlp.utils -- confirm and import them explicitly).
from app.nlp.utils import * # custom functions required for NER and summarization
from app.nlp.evaluate_model import elements_from_cell, substring_partial_overlap, evaluate_ner_model_partial_overlap
from rouge_score import rouge_scorer # library for ROUGE score calculation

In [10]:
# Root of the local checkout; all model folders below are resolved against it.
PROJECT_ROOT = os.path.expanduser('~/Documents/github/biomed_extractor')

# Baseline: publicly available PICO tagger pulled from the Hugging Face hub.
hf_baseline_options = {
    "model": "kamalkraj/BioELECTRA-PICO",
    "aggregation_strategy": "simple",
}
ner_pipeline_hf = pipeline("token-classification", **hf_baseline_options)

# Self-trained models, stored in folders relative to the project root.
model_dirs = [
    os.path.join(PROJECT_ROOT, relative_model_dir)
    for relative_model_dir in (
        'app/model/nlpie_compact_biobert_PICO',
        'app/model/nlpie_bio-mobilebert_PICO',
        'app/model/dmis-lab_biobert-v1.1',
    )
]

# One pipeline per folder, kept in the same order as model_dirs.
ner_pipelines = []
for trained_model_dir in model_dirs:
    ner_pipelines.append(load_ner_trained_pipeline(model_dir=trained_model_dir))


Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


In [11]:
# Load the gold-standard trials exported from clinicaltrials.gov and extract
# the text fields used as model input.
# PROJECT_ROOT is re-declared here so this cell can be run on its own.
PROJECT_ROOT = os.path.expanduser('~/Documents/github/biomed_extractor')

# Annotated data lives under <project root>/data/annotated.
# Pass the path components separately: a literal 'data\\annotated' is only a
# nested path on Windows; on POSIX it would name a single directory called
# "data\annotated" and the file would not be found.
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'annotated')

df_json = load_trials_json(filepath=DATA_DIR, filename='ctg-studies_for_gold.json')
mydf_manual_annotation = extract_from_clinicaltrials(df_json)
mydf_manual_annotation.head()

Loaded 20 records from ctg-studies_for_gold.json


Unnamed: 0,nctId,briefSummary,detailedDescription,inclusion_criteria,exclusion_criteria,intervention_name_clean,outcomes_name
0,NCT00667810,This is a study to evaluate the efficacy and s...,,"diagnosis of probable alzheimer disease (ad), ...",significant neurological disease other than ad...,bapineuzumab; placebo,The Change From Baseline in the Alzheimer's Di...
1,NCT00676143,This is a study to evaluate the efficacy and s...,,"diagnosis of probable ad, with mmse score of 1...",significant neurological disease other than ad...,bapineuzumab; placebo,Change From Baseline in Alzheimer's Disease As...
2,NCT01561430,The purpose of this Phase 1/Phase 2 study is t...,,meets criteria for mci due to ad or mild adall...,participant in another drug or device study; h...,LY2886721; Placebo,Change From Baseline to 12 Weeks in Cerebrospi...
3,NCT01900665,To test the idea that solanezumab will slow th...,,meets national institute of neurological and c...,does not have a reliable caregiver who is in f...,Solanezumab; Placebo,Change From Baseline in Alzheimer's Disease As...
4,NCT02565511,The purpose of this study was to test whether ...,The study (also known as the Generation Study ...,consented to receive disclosure of their risk ...,any disability that prevented the participant ...,CAD106 Immunotherapy; Placebo to CAD106; CNP52...,Time to Event (Diagnosis of Mild Cognitive Imp...


In [12]:
# Run every NER pipeline on the gold dataset and collect one prediction
# frame per model, keyed by model name.

# Raw extraction columns -> column names shared with the gold standard.
PICO_COLUMN_RENAMES = {
    'population_extracted': 'population',
    'intervention_extracted': 'intervention',
    'comparator_extracted': 'comparator',
    'outcome_extracted': 'outcome',
    'summary_extracted': 'summary',
}
# Columns kept for evaluation, in display order.
PREDICTION_COLUMNS = ['nctId', 'population', 'intervention', 'comparator', 'outcome', 'summary']
# Display name of the Hugging Face baseline model.
HF_MODEL_NAME = "BioELECTRA-PICO"


def _standardize_predictions(raw_predictions):
    """Return a sorted, renamed copy of a raw prediction frame.

    Parameters
    ----------
    raw_predictions : pandas.DataFrame
        Frame containing ``nctId`` and the ``*_extracted`` columns listed in
        ``PICO_COLUMN_RENAMES``.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``nctId`` and restricted to ``PREDICTION_COLUMNS``.
        The input frame is left untouched (no in-place sort/rename).
    """
    return (
        raw_predictions
        .sort_values(by=['nctId'])
        .rename(columns=PICO_COLUMN_RENAMES)
        .loc[:, PREDICTION_COLUMNS]
        .copy()
    )


# Names are derived from the model folders, so the pairing below must be in sync.
assert len(model_dirs) == len(ner_pipelines), (
    "model_dirs and ner_pipelines are out of sync; re-run the pipeline-loading cell"
)

# Self-trained models are processed first (same order as before), each keyed by
# its folder name so a label can never drift away from the model it belongs to.
custom_results = {}
for model_dir, ner_pipeline in zip(model_dirs, ner_pipelines):
    model_label = os.path.basename(os.path.normpath(model_dir))
    custom_results[model_label] = _standardize_predictions(
        process_trials_for_retrained_PICO(mydf_manual_annotation, ner_pipeline)
    )

# Hugging Face baseline, listed first in the final dictionary.
hf_result = _standardize_predictions(
    process_trials_for_PICO(mydf_manual_annotation, ner_pipeline_hf)
)
ner_results_dict = {HF_MODEL_NAME: hf_result, **custom_results}

# Kept for backward compatibility with cells that use the parallel lists.
model_names = list(ner_results_dict)
ner_results = list(ner_results_dict.values())

# Peek into results
for name, df in ner_results_dict.items():
    print(f"{name} predictions:")
    print(df.head())


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


BioELECTRA-PICO predictions:
         nctId                                         population  \
0  NCT00667810  diagnosis of probable alzheimer disease ( ad )...   
1  NCT00676143  diagnosis of probable ad, with mmse score of 1...   
2  NCT01561430  absence of dementia; clinical dementia rating ...   
3  NCT01900665  ) result consistent with the presence of amylo...   
4  NCT02565511  homozygous apoe4 genotype.; male or female, ag...   

     intervention comparator  \
0    bapineuzumab              
1    bapineuzumab              
2                              
3     solanezumab    placebo   
4  cad106; cnp520    placebo   

                                             outcome  \
0                                   efficacy; safety   
1                                   efficacy; safety   
2                                                      
3                                  cognitive decline   
4  and body weight loss; cognitive function, brai...   

                          

In [14]:
# Load the manually curated gold-standard annotations and bring them into the
# same textual form as the automatic extractions.
def _join_if_list(value):
    """Collapse a list into a '; '-separated string; pass other values through."""
    if isinstance(value, list):
        return '; '.join(str(item) for item in value)
    return value


df_gold = load_trials_json(filepath=DATA_DIR, filename='gold_standard.json')
for pico_col in ["population", "intervention", "comparator", "outcome"]:
    flattened = df_gold[pico_col].apply(_join_if_list)
    # lowercase to align with the automatic extraction
    df_gold[pico_col] = flattened.str.lower()
df_gold.sort_values(by=['doc_id'], inplace=True)
df_gold["intervention"] = df_gold["intervention"].apply(normalize_intervention)
df_gold

Loaded 20 records from gold_standard.json


Unnamed: 0,doc_id,summary,population,intervention,comparator,outcome
12,NCT00667810,[This is a study to evaluate the efficacy and ...,patients with mild to moderate alzheimer disease,bapineuzumab,placebo,the change from baseline in the alzheimer's di...
11,NCT00676143,[This is a study to evaluate the efficacy and ...,patients with mild to moderate alzheimer disease,bapineuzumab,placebo,change from baseline in alzheimer's disease as...
13,NCT01561430,[The purpose of this Phase 1/Phase 2 study is ...,participants with mild cognitive impairment (m...,ly2886721,placebo,change from baseline to 12 weeks in cerebrospi...
19,NCT01900665,[To test the idea that solanezumab will slow t...,participants with mild ad,solanezumab,placebo,change from baseline in alzheimer's disease as...
3,NCT02565511,[The purpose of this study was to test whether...,cognitively unimpaired apoe4 homozygotes (hms)...,cad106 immunotherapy cnp520 alum,placebo,time to event (diagnosis of mild cognitive imp...
7,NCT02769000,[Evaluate safety and toxicity/adverse events a...,patients with early alzheimer's dementia,radiation 10 gy radiation 20 gy,,ctcae toxicity grading
14,NCT02783573,[The main purpose of this study is to evaluate...,participants with mild alzheimer's disease (ad...,lanabecestat,placebo,change from baseline in alzheimer´s disease as...
9,NCT02791191,[The main purpose of this study is to evaluate...,participants with mild alzheimer's disease (ad...,ly3202626,placebo,change from baseline in f-av-1451 positron emi...
15,NCT02884492,[This study is being done to learn about tau t...,elderly subjects with different clinical and b...,18f thk 5351,,18f-thk-5351 standardized uptake value ratio
10,NCT02972658,[This study is an extension of study I8D-MC-AZ...,participants with early alzheimer's disease de...,lanabecestat,,change from baseline analysis on the 13-item a...


In [15]:
# Score every model's predictions against the gold standard.
pico_cols = ["population", "intervention", "comparator", "outcome"]

# Keyword arguments shared by all evaluation calls.
summary_eval_options = {
    'summary_gold_col': 'summary',
    'summary_pred_col': 'summary',
    'add_rouge': True,
}

evaluation_tables = {}
for model_name, pred_df in ner_results_dict.items():
    # Only the second return value (the evaluation table) is kept.
    _, evaluation_table = evaluate_ner_model_partial_overlap(
        df_gold, pred_df, pico_cols, **summary_eval_options
    )
    evaluation_tables[model_name] = evaluation_table

# Print all evaluation tables, one model after another.
for name, table in evaluation_tables.items():
    print(f"Evaluation for {name}:", table, sep="\n")

Evaluation for BioELECTRA-PICO:
                 precision    recall        f1
element                                       
population        0.378636  0.722026  0.477297
intervention      0.794156  0.829167  0.759756
comparator        0.600000  0.566667  0.575000
outcome           0.362562  0.149262  0.155473
SUMMARY_ROUGE-1   0.798815  0.908080  0.833012
SUMMARY_ROUGE-2   0.768481  0.863371  0.798287
SUMMARY_ROUGE-L   0.779970  0.881646  0.812388
Evaluation for nlpie_compact_biobert_PICO:
                 precision    recall        f1
element                                       
population        0.830010  0.855816  0.820786
intervention      0.573728  0.866667  0.613520
comparator        0.950000  0.950000  0.950000
outcome           0.901699  0.974242  0.926972
SUMMARY_ROUGE-1   0.798815  0.908080  0.833012
SUMMARY_ROUGE-2   0.768481  0.863371  0.798287
SUMMARY_ROUGE-L   0.779970  0.881646  0.812388
Evaluation for nlpie_bio-mobilebert_PICO:
                 precision    recall 

# **Comparison of BERT-based PICO NER Models**

### **1. Population**
- **Precision/Recall/F1:**  
  - All three self-trained models perform well, with F1 scores between 0.82–0.91 (the BioELECTRA-PICO baseline reaches only 0.48).
  - **bio-mobilebert** has the highest F1 (0.91), suggesting slightly better capture of eligible populations from trial texts.
  - All three achieve high recall (0.86–0.96) and acceptable precision (0.82–0.91).
- **Interpretation:**  
  Models are robust at extracting population, with *bio-mobilebert* having an edge.

### **2. Intervention**
- **Precision/Recall/F1:**  
  - F1 hovers in the 0.57–0.62 range: good recall (~0.83–0.87), but lower precision (0.54–0.61).
  - **dmis-lab_biobert-v1.1** is strongest on precision (0.61) and has the highest F1 (0.62), though only marginally better than the others.
- **Interpretation:**  
  All models successfully recall interventions but tend to over-predict them (false positives), resulting in modest precision. *dmis-lab_biobert* may be the best balanced.

### **3. Comparator**  
- **Precision/Recall/F1:**  All identical (0.95). **Remember**: this isn't a test of the models, but of the comparator extraction function from cleaned intervention metadata.
- **Interpretation:** Do not use this column to differentiate models.

### **4. Outcome**
- **Precision/Recall/F1:**  
  - All models nearly identical and very strong (precision ~0.90, recall ~0.97, F1 ~0.93).
- **Interpretation:**  
  All are highly dependable for outcome extraction, with no practical difference detected in these results.

## **Strengths and Weaknesses**

| Model                  | Key Strength                                 | Weakness                                      |
|------------------------|----------------------------------------------|-----------------------------------------------|
| nlpie_compact_biobert  | Balanced on population/intervention/outcome  | Slightly lower on population F1               |
| nlpie_bio-mobilebert   | Strongest on population extraction           | Slightly lower on intervention precision/F1   |
| dmis-lab_biobert-v1.1  | Highest intervention F1 & strong population  | None significant in these results             |

## **Practical Guidance**

- **For most purposes:** Any of these models will deliver good outcome extraction, high-quality population extraction, and moderately reliable intervention extraction, though users should expect some false positives in interventions.
- **If population extraction is mission critical:** *nlpie_bio-mobilebert* edges out the others.
- **If a small advantage in intervention precision/F1 matters:** *dmis-lab_biobert-v1.1* is preferred.
- **Outcome extraction:** All are essentially tied.

## **Summary Table: Best by Metric**

| PICO Element    | Top Model                        | Notes                                   |
|-----------------|----------------------------------|-----------------------------------------|
| Population      | nlpie_bio-mobilebert             | F1 = 0.91, recall = 0.96                |
| Intervention    | dmis-lab_biobert-v1.1            | F1 = 0.62, precision = 0.61             |
| Outcome         | All (technical tie)              | F1 = 0.93                               |

## **Final Recommendation**

**All three BERT-based models are strong performers for PICO NER extraction.**  
- **nlpie_bio-mobilebert**: Ideal if maximizing population capture is most important.
- **dmis-lab_biobert-v1.1**: Slight edge in intervention extraction.
- In practice, the differences are minor; any of the three is suitable for robust clinical trial PICO element extraction.

**(Comparator extraction scores cannot be used for model selection in this context.)**

