## Getting Started with Cantemist

## Preparation

Go to the root folder of the repository and run the following commands:

`xmen dict examples/conf/cantemist.yaml --code examples/dicts/cantemist.py`


This command creates the Cantemist ICD-O JSONL file.

`xmen index examples/conf/cantemist.yaml --all`

This command creates the indices used for candidate generation (both TF-IDF and SapBERT).

By default, all files are written to `~/.cache/xmen`, unless configured otherwise.

In [1]:
#!xmen index examples/conf/cantemist.yaml --all  # run from the repository root

In [2]:
#!git clone https://github.com/bigscience-workshop/biomedical

## Loading Knowledge Base and Dataset

In [5]:
# Name of the xMEN variant; it selects the cache sub-directory and the
# knowledge-base file name used in the cells below.
# NOTE(review): presumably has to match the name configured when running
# `xmen dict` / `xmen index` -- confirm.
variant = "4_cantemist_custom"

In [6]:
from pathlib import Path
# Per-variant cache directory under ~/.cache/xmen; the knowledge base and the
# index are read from here later in the notebook.
base_path = Path.home() / f".cache/xmen/{variant}"

In [7]:
#print(base_path)

In [6]:
#!pip install ./biomedical/

In [8]:
# Load the Cantemist subset through the BigBIO dataloader script (local checkout)
from datasets import load_dataset
#dataset = load_dataset("bigbio/cantemist", "cantemist_bigbio_kb",download_mode="force_redownload")
# trust_remote_code has to be a keyword argument: as a quoted positional string it
# was passed as load_dataset's third positional parameter (data_dir) and never
# enabled execution of the loader script.
dataset = load_dataset(
    '../../biomedical/bigbio/biodatasets/cantemist/cantemist.py',
    'cantemist_bigbio_kb',
    trust_remote_code=True,
)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 501
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 300
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 500
    })
})

In [11]:
from xmen import load_kb
# Load the knowledge base from the variant's JSONL dictionary in the cache directory.
kb = load_kb(base_path / f"{variant}.jsonl")
#kb = load_kb(base_path / "cantemist.jsonl")

In [12]:
# Sanity check: collect every gold code of the training split that is absent
# from the knowledge base. A comprehension keeps loop variables out of the
# notebook namespace (the original loop rebound the builtin `id`).
missing_codes = [
    code['db_id']
    for doc in dataset['train']
    for entity in doc['entities']
    for code in entity['normalized']
    if code['db_id'] not in kb.cui_to_entity
]
# (number of missing annotations, number of distinct missing codes)
len(missing_codes), len(set(missing_codes))

(0, 0)

In [13]:
missing_codes

[]

# Candidate Generation

We use the pre-computed indices to retrieve the 64 most similar concepts for each mention.

In [14]:
# Generate candidates with Ensemble of TF-IDF + SapBERT
from xmen.linkers import default_ensemble
# The ensemble is built from the pre-computed indices in the "index" folder of the cache directory.
linker = default_ensemble(base_path / "index")

In [17]:
# Retrieve the top 64 candidate concepts per mention, for the test split only.
candidates = linker.predict_batch(dataset['test'], top_k=64, batch_size=128)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [26]:
from xmen.evaluation import evaluate
# Score the raw candidate ranking against the gold test annotations.
evaluate(dataset['test'], candidates)

{'strict': {'precision': 0.34104046242774566,
  'recall': 0.34085281980742776,
  'fscore': 0.34094661529994497,
  'ptp': 1239,
  'fp': 2394,
  'rtp': 1239,
  'fn': 2396,
  'n_docs_system': 300,
  'n_annos_system': 3635,
  'n_docs_gold': 300,
  'n_annos_gold': 3635}}

In [27]:
from xmen.evaluation import evaluate_at_k
# Prints Recall@k for k = 1 ... 64; the return value is discarded via `_`.
_ = evaluate_at_k(dataset['test'], candidates)

Recall@1 0.34085281980742776
Recall@2 0.4687757909215956
Recall@4 0.5881705639614856
Recall@8 0.6134800550206327
Recall@16 0.6451169188445667
Recall@32 0.6640990371389272
Recall@64 0.6940852819807428


In [28]:
# Mentions whose best candidate scores below CUTOFF additionally receive the
# generic code 8000/6 (Neoplasia metastásica) as their top-ranked candidate.
CUTOFF = 0.8

def add_manual_concepts(document, cutoff=None):
    """Prepend the generic concept 8000/6 to low-confidence candidate lists.

    Args:
        document: Mapping with an "entities" list; every entity carries a
            "normalized" list of candidate dicts that have a "score" key.
            The first candidate is compared against the threshold
            (assumes the list is ordered best-first -- TODO confirm).
        cutoff: Score threshold. ``None`` (default) uses the module-level
            ``CUTOFF`` as it is at call time.

    Returns:
        ``{"entities": [...]}`` with one entry per input entity. Entities whose
        first candidate scores below the threshold, or that have no candidates
        at all, get the manual 8000/6 concept inserted at position 0.
        The input document is not modified.
    """
    if cutoff is None:
        # Resolved here rather than in the signature so that re-assigning
        # CUTOFF in the notebook still takes effect without redefining the function.
        cutoff = CUTOFF

    entities = []
    for e in document["entities"]:
        # Work on a copy so the caller's candidate list is left untouched.
        norm = list(e["normalized"])
        # An empty list has no top score to compare (norm[0] would raise
        # IndexError); treat it like a low-confidence mention.
        if not norm or norm[0]["score"] < cutoff:
            norm.insert(0,
                {'db_id': '8000/6',
                 'db_name': 'UMLS',
                 'score': 1.0,
                 'predicted_by': ['manual']})
        entities.append({**e, "normalized": norm})
    return {"entities" : entities}

In [29]:
# Apply the cutoff rule to every test document; the function is passed
# directly, no lambda wrapper is needed.
candidates_manual = candidates.map(add_manual_concepts)

In [30]:
# Re-evaluate after inserting the generic concept.
# NOTE(review): top_k_predictions=1 presumably restricts scoring to the
# first-ranked candidate of each mention -- confirm against xmen.evaluation.
evaluate(dataset['test'], candidates_manual, top_k_predictions=1)

{'strict': {'precision': 0.5150013762730525,
  'recall': 0.5147180192572215,
  'fscore': 0.5148596587782059,
  'ptp': 1871,
  'fp': 1762,
  'rtp': 1871,
  'fn': 1764,
  'n_docs_system': 300,
  'n_annos_system': 3635,
  'n_docs_gold': 300,
  'n_annos_gold': 3635}}

In [31]:
# Recall@k after inserting the generic concept; the result is printed and the return value discarded.
_ = evaluate_at_k(dataset['test'], candidates_manual)

Recall@1 0.5147180192572215
Recall@2 0.5928473177441541
Recall@4 0.7400275103163686
Recall@8 0.8236588720770289
Recall@16 0.8563961485557083
Recall@32 0.8745529573590096
Recall@64 0.902888583218707


In [32]:
from xmen.evaluation import error_analysis

# Before re-ranking
# Per-mention error analysis table (one row per gold annotation in the
# displayed output), restricted to tasks=['nen'].
ea_df = error_analysis(dataset['test'], candidates_manual, tasks=['nen'])

In [33]:
ea_df
#convert into csv file:
#ea_df.to_csv('all_errors', sep=',', index=False, encoding='utf-8')

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,129,139,[tumoración],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",4,0.630540,8000/6,1.000000,cc_onco1197
1,2,False,354,370,[carcinoma ductal],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8500/3'}",2,0.752837,8000/6,1.000000,cc_onco1197
2,1,False,439,450,[metastásica],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,0.825001,8000/6,0.825001,cc_onco1197
3,1,False,1587,1599,[metastásicos],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,1.000000,8000/6,1.000000,cc_onco1197
4,1,False,1699,1706,[tumoral],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",2,0.697819,8000/6,1.000000,cc_onco1197
...,...,...,...,...,...,...,...,...,...,...,...,...
3630,1,False,288,297,[neoplasia],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",2,0.775603,8000/3,0.816536,cc_onco1263
3631,6,False,434,468,[adenocarcinoma de recto de grado 1],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8140/31'}",-1,,8000/6,1.000000,cc_onco1263
3632,1,False,495,502,[cT3N0M0],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,1.000000,8000/6,1.000000,cc_onco1263
3633,2,False,847,864,[adenocarcinoma G3],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8140/33'}",8,0.768803,8140/333,0.835836,cc_onco1263


In [34]:
# Rows with pred_index == -1; in the displayed output these have no pred_index_score.
# NOTE(review): presumably -1 means the gold concept is not among the
# retrieved candidates -- confirm against xmen.evaluation.error_analysis.
ea_df.query("pred_index == -1")

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
7,1,False,2718,2729,[satelitosis],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8001/1'}",-1,,8000/6,1.000000,cc_onco1197
25,7,False,2217,2275,[infiltración ósea por tumoración compatible c...,MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '9530/6'}",-1,,8000/6,1.000000,cc_onco448
26,4,False,2300,2328,[invasión ósea por meningioma],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '9530/6'}",-1,,8000/6,1.000000,cc_onco448
27,11,False,2898,2970,[meningioma transicional del seno cavernoso gr...,MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '9537/61'}",-1,,8000/6,1.000000,cc_onco448
53,3,False,31,48,[síndrome de Lynch],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/3'}",-1,,8000/6,1.000000,cc_onco1367
...,...,...,...,...,...,...,...,...,...,...,...,...
3612,3,False,4025,4042,[quiste de Tailgut],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/0'}",-1,,8000/6,1.000000,cc_onco24
3615,3,False,4471,4502,[hamartoma quístico retrorrectal],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/0'}",-1,,8000/6,1.000000,cc_onco24
3618,4,False,5027,5078,[carcinoma sólido-papilar moderadamente difere...,MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8050/32'}",-1,,8452/32,0.857614,cc_onco24
3626,6,False,7210,7266,[hamartoma quístico retrorrectal con degenerac...,MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/3'}",-1,,8000/6,1.000000,cc_onco24


In [35]:
# Select the rows whose gold concept is the generic code 8000/6 and whose
# pred_index is -1. In the displayed output these are mentions where another
# ICD-O code was ranked first with a score above CUTOFF, so the generic code
# was not inserted.
ea_df[ea_df.gold_concept.map(lambda g: g["db_id"] == "8000/6") & (ea_df.pred_index == -1)]#.pred_index.value_counts()

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
205,2,True,3813,3826,[LOE hepáticas],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.805464,cc_onco465
215,2,True,4677,4690,[LOE hepáticas],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.805464,cc_onco465
218,2,True,4942,4955,[LOE hepáticas],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.805464,cc_onco465
839,2,True,3086,3099,[LOE hepáticas],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.805464,cc_onco1174
1453,2,True,3616,3627,[RC hepática],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.806711,cc_onco641
2998,2,True,3909,3922,[LOE hepáticas],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",-1,,8170/0,0.805464,cc_onco770


# Reranking

## Using a Pre-trained Model for Reranking

When no or little annotated training data is available, a pre-trained model can provide good re-ranking performance.
Here, we use a cross-encoder that was trained on an automatically translated version of MedMentions (see the notebook [01_Translation.ipynb](01_Translation.ipynb))

In [80]:
#from xmen.reranking.cross_encoder import *
#ce_dataset = CrossEncoderReranker.prepare_data(candidates_manual, dataset["test"], kb)

In [81]:
# CE re-ranker, pre-trained on Spanish translation of MedMentions
#rr = CrossEncoderReranker.load("phlobo/xmen-es-ce-medmentions", device=0)

In [82]:
# Predict on test set
#prediction = rr.rerank_batch(candidates_manual, ce_dataset, allow_nil=False)

In [83]:
#evaluate(dataset['test'], prediction)

## Training a Fully-supervised Model

Finally, with enough training data, we can also train a fully supervised cross-encoder to learn a better task-specific ranking.

The cross-encoder is trained with batches of 64 candidates. 
We keep the checkpoint that maximizes accuracy@1 on the validation set.

In [36]:
# Generate candidates for every split of the DatasetDict (train, test and
# validation), since the supervised re-ranker needs more than the test split.
candidates_all = linker.predict_batch(dataset, top_k=64, batch_size=128)

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [37]:
# Apply the cutoff rule to every document of every split; the function is
# passed directly, no lambda wrapper is needed.
candidates_manual_all = candidates_all.map(add_manual_concepts)

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [38]:
# Import only the names this notebook uses instead of a wildcard import, so it
# stays visible where they come from and the namespace is not polluted.
from xmen.reranking.cross_encoder import CrossEncoderReranker, CrossEncoderTrainingArgs

# Build the cross-encoder inputs for all splits from the candidates, the gold
# dataset and the knowledge base; the result is indexed by split name below.
ce_dataset_all = CrossEncoderReranker.prepare_data(candidates_manual_all, dataset, kb)

Context length: 128
Use NIL values: True


  0%|          | 0/6396 [00:00<?, ?it/s]

  0%|          | 0/6396 [00:00<?, ?it/s]

  0%|          | 0/6396 [00:00<?, ?it/s]

  0%|          | 0/3635 [00:00<?, ?it/s]

  0%|          | 0/3635 [00:00<?, ?it/s]

  0%|          | 0/3635 [00:00<?, ?it/s]

  0%|          | 0/6001 [00:00<?, ?it/s]

  0%|          | 0/6001 [00:00<?, ?it/s]

  0%|          | 0/6001 [00:00<?, ?it/s]

In [None]:
#rr_fs = CrossEncoderReranker()
#args = CrossEncoderTrainingArgs(num_train_epochs=10, model_name="PlanTL-GOB-ES/roberta-base-biomedical-clinical-es")
#rr_fs.fit(args, ce_dataset_all["train"].dataset, ce_dataset_all["validation"].dataset, show_progress_bar=True)

model_name := PlanTL-GOB-ES/roberta-base-biomedical-clinical-es
num_train_epochs := 10
fp16 := True
label_smoothing := False
rank_regularization := 1.0
train_layers := None
softmax_loss := True
random_seed := 42
learning_rate := 2e-05


Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: 

2024-07-02 14:20:11 - Use pytorch device: cuda


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6396 [00:00<?, ?it/s]

2024-07-02 14:34:19 - EntityLinkingEvaluator: Evaluating the model on eval dataset after epoch 0:
2024-07-02 14:42:49 - Accuracy: 0.803032827862023
2024-07-02 14:42:49 - Accuracy @ 5: 0.9166805532411265
2024-07-02 14:42:49 - Accuracy @ 64: 0.9775037493751042
2024-07-02 14:42:49 - Baseline Accuracy: 0.5114147642059657
2024-07-02 14:42:49 - Save model to ./output/cross_encoder


Iteration:   0%|          | 0/6396 [00:00<?, ?it/s]

In [None]:
# Load a previously trained cross-encoder checkpoint from disk (no training happens in this cell).
# NOTE(review): the training log above saves to ./output/cross_encoder, while this
# path points to ../cross_encoder_model_2024_07_02 -- presumably a copy of that
# checkpoint; confirm. device=0 presumably selects the first GPU -- confirm.
rr_fs = CrossEncoderReranker.load('../cross_encoder_model_2024_07_02', device=0)

In [47]:
# Predict on test set
# Re-rank the test-split candidates with the loaded cross-encoder, using the
# matching test split of the prepared cross-encoder dataset.
prediction_fs = rr_fs.rerank_batch(candidates_manual_all["test"], ce_dataset_all["test"])

Batches:   0%|          | 0/3635 [00:00<?, ?it/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [48]:
# Score the re-ranked predictions against the gold test annotations.
evaluate(dataset['test'], prediction_fs)

{'strict': {'precision': 0.8728691129731292,
  'recall': 0.8310866574965612,
  'fscore': 0.8514656144306652,
  'ptp': 3021,
  'fp': 440,
  'rtp': 3021,
  'fn': 614,
  'n_docs_system': 300,
  'n_annos_system': 3461,
  'n_docs_gold': 300,
  'n_annos_gold': 3635}}

In [49]:
# Recall@k of the re-ranked predictions; the result is printed and the return value discarded.
_ = evaluate_at_k(dataset['test'], prediction_fs)

Recall@1 0.8310866574965612
Recall@2 0.8530949105914718
Recall@4 0.8693259972489684
Recall@8 0.874828060522696
Recall@16 0.8797799174690509
Recall@32 0.8847317744154057
Recall@64 0.8891334250343879


In [50]:
# After re-ranking
# Same error analysis as for candidates_manual, now on the re-ranked
# predictions, so the two tables can be compared.
ea_df_reranked = error_analysis(dataset['test'], prediction_fs, tasks=['nen'])

In [51]:
ea_df_reranked

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,129,139,[tumoración],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",0,0.056426,8000/1,0.056426,cc_onco1197
1,2,False,354,370,[carcinoma ductal],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8500/3'}",0,0.055484,8500/3,0.055484,cc_onco1197
2,1,False,439,450,[metastásica],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,0.055176,8000/6,0.055176,cc_onco1197
3,1,False,1587,1599,[metastásicos],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,0.066785,8000/6,0.066785,cc_onco1197
4,1,False,1699,1706,[tumoral],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",0,0.043706,8000/1,0.043706,cc_onco1197
...,...,...,...,...,...,...,...,...,...,...,...,...
3630,1,False,288,297,[neoplasia],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/1'}",0,0.046987,8000/1,0.046987,cc_onco1263
3631,6,False,434,468,[adenocarcinoma de recto de grado 1],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8140/31'}",-1,,8000/6,0.020674,cc_onco1263
3632,1,False,495,502,[cT3N0M0],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8000/6'}",0,0.076871,8000/6,0.076871,cc_onco1263
3633,2,False,847,864,[adenocarcinoma G3],MORFOLOGIA_NEOPLASIA,"{'db_name': 'eCIE-O-3.1', 'db_id': '8140/33'}",0,0.018867,8140/33,0.018867,cc_onco1263
