In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# UMLS API key used by cui2tui; os.getenv yields None when the variable is unset.
UMLS_API_KEY = os.getenv("UMLS_API_KEY")
# Gold-standard annotations file, read below with load_golds(PATH).
PATH = "./standard/golds_v4.json"
# Prefix prepended to every baseline result file name passed to dump_results/load_results.
PATH_BASELINES = "results/baselines/"

In [None]:
from utils import dump_results, load_results, load_cache, dump_cache
from tqdm import tqdm 

# Load TUIS

In [None]:
import pandas as pd

In [None]:
# CSV of semantic types; its "TUI" and "Name" columns are read in the cells below.
# NOTE(review): "promts" looks like a misspelling of "prompts" — confirm the directory name on disk.
PATH_TUIS = "promts/tuis_desc.csv"

In [None]:
# Semantic-type table; the "TUI" and "Name" columns are extracted from it below.
tuis_df = pd.read_csv(PATH_TUIS)

In [None]:
# TUI codes as a set: used as a membership filter on the TUI of each predicted entity.
TUIS = set(tuis_df["TUI"].to_list())
# Semantic-type names in file order: passed to GLiNER as its label list.
TUIS_LABELS = tuis_df["Name"].to_list()

## Load Cache

In [None]:
# File backing the CUI -> TUI cache (presumably a pickle, given the extension).
# Note the path points outside this project directory.
PATH_CACHE = "../my-wiki-annot/cache_cuis_tuis.pkl"

In [None]:
# CUI -> TUI lookup table consulted before calling the UMLS API; mutated by
# get_ents / get_ents_biofalcon and written back by dump_cache further down.
CACHE_CUIS = load_cache(PATH_CACHE)
# Number of cached entries.
print(len(CACHE_CUIS))

# Load Golds

In [None]:
from utils import load_golds

In [None]:
# Gold annotations; iterated below with .items(), each value providing "text" and "sings".
golds = load_golds(PATH)

In [None]:
import requests

def parse_tui(semanticResult):
    """Return the final path segment of the entry's ``'uri'`` value.

    Args:
        semanticResult: mapping with a ``'uri'`` string.

    Returns:
        Everything after the last ``/`` in the URI (the whole string when it
        contains no ``/``).
    """
    _, _, last_segment = semanticResult['uri'].rpartition("/")
    return last_segment
    
def cui2tui(cui):
    """Look up the first semantic type of a UMLS concept.

    Sends a GET request to ``https://uts-ws.nlm.nih.gov/content/current/CUI/<cui>``
    authenticated with the module-level ``UMLS_API_KEY`` and returns the
    identifier parsed (via ``parse_tui``) from the first entry of the
    response's ``result['semanticTypes']`` list.

    Args:
        cui: concept identifier string, e.g. ``'C0042776'``.

    Returns:
        The parsed TUI string, or ``None`` if the request or the parsing
        fails for any reason (the error is printed, not raised).
    """
    version = 'current'
    uri = "https://uts-ws.nlm.nih.gov"
    # NOTE(review): NLM's published examples put a "/rest" prefix before
    # "/content" — confirm this path resolves as intended.
    path = '/content/' + version + '/CUI/' + cui

    try:
        # The timeout keeps a single stalled request from hanging a whole run.
        r = requests.get(uri + path, params={'apiKey': UMLS_API_KEY}, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        semantic_types = r.json()['result']['semanticTypes']
        # Only the first semantic type is considered.
        return parse_tui(semantic_types[0])
    except Exception as except_error:
        # Best effort: report the problem and signal failure with None.
        print(except_error)
    return None

# Scispacy

### Models 

```
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz

pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_scibert-0.5.4.tar.gz

```

In [None]:
import spacy
import scispacy
from scispacy.linking import EntityLinker

In [None]:
# Load the small scispaCy model and append the UMLS entity linker.
# NOTE(review): per the scispaCy docs, resolve_abbreviations only has an effect when an
# abbreviation detector is also in the pipeline; none is added here — confirm.
nlp_small = spacy.load("en_core_sci_sm")
nlp_small.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
# Cell output: the pipeline listing from the model metadata.
nlp_small.meta["pipeline"]

In [None]:
# Load the medium scispaCy model and append the UMLS entity linker.
# NOTE(review): per the scispaCy docs, resolve_abbreviations only has an effect when an
# abbreviation detector is also in the pipeline; none is added here — confirm.
nlp_md = spacy.load("en_core_sci_md")
nlp_md.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
# Cell output: the pipeline listing from the model metadata.
nlp_md.meta["pipeline"]

In [None]:
# Load the large scispaCy model and append the UMLS entity linker.
# NOTE(review): per the scispaCy docs, resolve_abbreviations only has an effect when an
# abbreviation detector is also in the pipeline; none is added here — confirm.
nlp_lg = spacy.load("en_core_sci_lg")
nlp_lg.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
# Cell output: the pipeline listing from the model metadata.
nlp_lg.meta["pipeline"]

In [None]:
# Load the SciBERT-based scispaCy model and append the UMLS entity linker.
# NOTE(review): per the scispaCy docs, resolve_abbreviations only has an effect when an
# abbreviation detector is also in the pipeline; none is added here — confirm.
nlp_bert = spacy.load("en_core_sci_scibert")
nlp_bert.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})
# Cell output: the pipeline listing from the model metadata.
nlp_bert.meta["pipeline"]

### Sample

In [None]:
# Sample passage (subconjunctival bleeding). The following cells reassign `text`,
# so on a top-to-bottom run this value is replaced before it is used.
text = """
Subconjunctival bleeding initially appears bright red underneath the transparent bulbar conjunctiva. Later, the bleeding may spread 
and become green or yellow as the hemoglobin is metabolized. It usually disappears within two weeks. The affected eye may feel dry, 
rough, or scratchy, but the condition is not usually painful."""

In [None]:
# Sample sentence (renal artery stenosis). The next cell reassigns `text`,
# so on a top-to-bottom run this value is replaced before it is used.
text = "Most cases of renal artery stenosis are asymptomatic, and the main problem is high blood pressure that cannot be controlled with medication"

In [None]:
# Sample passage (madarosis). This is the assignment the demo cells below use.
# A stray trailing "n" after the final period was removed from the literal.
text = "Madarosis is not a critical or severe condition. The main symptom and sign of madarosis is the loss of hair from the eyelids, eyebrows, or eyelashes."

In [None]:
# Length of the sample text in characters.
print(len(text)) 

In [None]:
# Run the small pipeline on the sample text; `doc` feeds the get_ents demo below.
doc = nlp_small(text)

In [None]:
def get_ents(doc):
    """Return the lower-cased text of entities whose linked concept has an accepted TUI.

    For every entity in ``doc.ents`` that has at least one ``kb_ents``
    candidate, the first element of the first candidate is taken as the CUI.
    Its TUI comes from ``CACHE_CUIS`` or, when missing there, from
    ``cui2tui``. The entity is kept when that TUI is in ``TUIS``.

    Args:
        doc: object exposing ``.ents``, whose items provide ``.text`` and
            ``._.kb_ents`` (presumably a spaCy ``Doc`` produced by one of the
            pipelines with the scispaCy linker — confirm).

    Returns:
        List of lower-cased entity strings, in document order.

    Side effects:
        Adds successfully resolved CUIs to the module-level ``CACHE_CUIS``.
    """
    ents = []
    for ent in doc.ents:
        # Entities without any linked candidate cannot be typed.
        if len(ent._.kb_ents) == 0:
            continue

        cui = str(ent._.kb_ents[0][0])

        tui = CACHE_CUIS[cui] if cui in CACHE_CUIS else None
        if tui is None:
            tui = cui2tui(cui)
            # cui2tui returns None on any failure. Caching that would make a
            # transient error permanent once the cache is dumped to disk, so
            # only real answers are stored (and stale None entries are retried).
            if tui is not None:
                CACHE_CUIS[cui] = tui

        if tui in TUIS:
            ents.append(ent.text.lower())
    return ents

In [None]:
# Manual check of the UMLS lookup for one CUI (performs a live HTTP request).
cui2tui('C0042776')

In [None]:
# Entities of the sample doc that pass the TUI filter.
get_ents(doc)

## Scispacy small

In [None]:
# Baseline: run the small scispaCy pipeline over every gold document and
# store the filtered predictions next to the gold labels.
results_scipacy_sm = []
for ds, val in tqdm(golds.items()):
    gold_text = val["text"]
    pred_doc = nlp_small(gold_text)
    pred_ents = get_ents(pred_doc)
    record = {"text": gold_text, "y_pred": pred_ents, "y_true": val["sings"]}
    results_scipacy_sm.append(record)

In [None]:
# Write the small-model results to PATH_BASELINES + "sci-small.json".
dump_results(results_scipacy_sm, PATH_BASELINES + "sci-small.json")

## Scispacy medium

In [None]:
# Baseline: run the medium scispaCy pipeline over every gold document and
# store the filtered predictions next to the gold labels.
results_scipacy_md = []
for ds, val in tqdm(golds.items()):
    gold_text = val["text"]
    pred_doc = nlp_md(gold_text)
    pred_ents = get_ents(pred_doc)
    record = {"text": gold_text, "y_pred": pred_ents, "y_true": val["sings"]}
    results_scipacy_md.append(record)

In [None]:
# Write the medium-model results to PATH_BASELINES + "sci-md.json".
dump_results(results_scipacy_md, PATH_BASELINES + "sci-md.json")

## Scispacy large

In [None]:
# Baseline: run the large scispaCy pipeline over every gold document and
# store the filtered predictions next to the gold labels.
results_scipacy_lg = []
for ds, val in tqdm(golds.items()):
    gold_text = val["text"]
    pred_doc = nlp_lg(gold_text)
    pred_ents = get_ents(pred_doc)
    record = {"text": gold_text, "y_pred": pred_ents, "y_true": val["sings"]}
    results_scipacy_lg.append(record)

In [None]:
# Write the large-model results to PATH_BASELINES + "sci-lg.json".
dump_results(results_scipacy_lg, PATH_BASELINES + "sci-lg.json")

## Scibert

In [None]:
# Baseline: run the SciBERT-based scispaCy pipeline over every gold document
# and store the filtered predictions next to the gold labels.
results_scipacy_bert = []
for ds, val in tqdm(golds.items()):
    gold_text = val["text"]
    pred_doc = nlp_bert(gold_text)
    pred_ents = get_ents(pred_doc)
    record = {"text": gold_text, "y_pred": pred_ents, "y_true": val["sings"]}
    results_scipacy_bert.append(record)

In [None]:
# Write the SciBERT-model results to PATH_BASELINES + "sci-bert.json".
dump_results(results_scipacy_bert, PATH_BASELINES + "sci-bert.json")

# BioFalcon

In [None]:
import requests

headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}

def biofalcon(text, timeout=None):
    """Send ``text`` to the BioFalcon service and return its UMLS entities.

    POSTs a JSON object ``{"text": ...}`` (UTF-8 encoded) to the BioFalcon
    endpoint using the module-level ``headers``.

    Args:
        text: string to annotate.
        timeout: optional value forwarded to ``requests.post``; ``None``
            (the default) waits indefinitely, as before.

    Returns:
        The value stored under ``'entities_UMLS'`` in the JSON response, or
        an empty list when that value is missing or empty, or when the HTTP
        status is not 200 (in which case the response object is printed).
    """
    import json  # local import: only needed to serialise the request body

    url = 'https://labs.tib.eu/sdm/biofalcon/api?mode=long&k=1'
    # json.dumps escapes quotes, backslashes and control characters, which
    # plain string concatenation would turn into an invalid JSON body.
    # ensure_ascii=False keeps non-ASCII characters as real UTF-8 text.
    payload = json.dumps({"text": text}, ensure_ascii=False)
    r = requests.post(url, data=payload.encode('utf-8'), headers=headers, timeout=timeout)

    if r.status_code != 200:
        print(str(r))
        return []

    response = r.json()
    return response.get('entities_UMLS') or []

In [None]:
# Demo call on the sample text, with newlines flattened to spaces before sending.
results = biofalcon(text.replace("\n", " "))
results

In [None]:
def get_ents_biofalcon(results):
    """Return the lower-cased BioFalcon mentions whose first CUI has an accepted TUI.

    For every ``(mention, cuis)`` pair with a non-empty ``cuis``, the first
    CUI is resolved to a TUI through ``CACHE_CUIS`` or, when missing there,
    through ``cui2tui``. The mention is kept when that TUI is in ``TUIS``.

    Args:
        results: iterable of two-item entries ``(mention, cuis)``, as
            returned by ``biofalcon``.

    Returns:
        List of lower-cased mention strings, in input order.

    Side effects:
        Adds successfully resolved CUIs to the module-level ``CACHE_CUIS``.
    """
    ents = []
    for ent, cuis in results:
        # Mentions without any CUI cannot be typed.
        if len(cuis) == 0:
            continue

        cui = str(cuis[0])

        tui = CACHE_CUIS[cui] if cui in CACHE_CUIS else None
        if tui is None:
            tui = cui2tui(cui)
            # cui2tui returns None on any failure. Caching that would make a
            # transient error permanent once the cache is dumped to disk, so
            # only real answers are stored (and stale None entries are retried).
            if tui is not None:
                CACHE_CUIS[cui] = tui

        if tui in TUIS:
            ents.append(ent.lower())
    return ents

In [None]:
# Mentions from the demo response that pass the TUI filter.
get_ents_biofalcon(results)

In [None]:
# Baseline: send every gold document (newlines flattened to spaces) to
# BioFalcon and store the filtered predictions next to the gold labels.
results_biofalcon = []
for ds, val in tqdm(golds.items()):
    flat_text = val["text"].replace("\n", " ")
    pred_doc = biofalcon(flat_text)
    pred_ents = get_ents_biofalcon(pred_doc)
    record = {"text": val["text"], "y_pred": pred_ents, "y_true": val["sings"]}
    results_biofalcon.append(record)

In [None]:
# Inspect the record produced for the last gold document.
results_biofalcon[-1]

In [None]:
# Write the BioFalcon results to PATH_BASELINES + "biofalcon.json".
dump_results(results_biofalcon, PATH_BASELINES + "biofalcon.json")

### Dump cache tuis

In [None]:
# Write the CUI -> TUI cache (including entries added during this run) back to PATH_CACHE.
dump_cache(CACHE_CUIS, PATH_CACHE)

# GLiNER


### Available Models on Hugging Face
- [x] [GLiNER-Base](https://huggingface.co/urchade/gliner_base) (CC BY NC 4.0)
- [x] [GLiNER-Multi](https://huggingface.co/urchade/gliner_multi) (CC BY NC 4.0)
- [x] [GLiNER-small](https://huggingface.co/urchade/gliner_small) (CC BY NC 4.0)
- [x] [GLiNER-small-v2](https://huggingface.co/urchade/gliner_smallv2) (Apache)
- [x] [GLiNER-medium](https://huggingface.co/urchade/gliner_medium) (CC BY NC 4.0)
- [x] [GLiNER-medium-v2](https://huggingface.co/urchade/gliner_mediumv2) (Apache)
- [x] [GLiNER-large](https://huggingface.co/urchade/gliner_large) (CC BY NC 4.0)
- [x] [GLiNER-large-v2](https://huggingface.co/urchade/gliner_largev2) (Apache)

In [None]:
!pip install gliner

In [None]:
from gliner import GLiNER

In [None]:
# Load the "urchade/gliner_base" checkpoint.
GLiNER_base = GLiNER.from_pretrained("urchade/gliner_base")

In [None]:
def get_ents_gliNER(model, labels, text: str, threshold: float = 0.5):
    """Return the normalised entity strings ``model`` predicts in ``text``.

    Args:
        model: object with a ``predict_entities(text, labels, threshold=...)``
            method returning an iterable of mappings with a ``"text"`` entry.
        labels: label list forwarded unchanged to the model.
        text: input string.
        threshold: value forwarded to ``predict_entities``; defaults to the
            previously hard-coded 0.5.

    Returns:
        List with each predicted entity's text, lower-cased and stripped of
        surrounding whitespace, in prediction order.
    """
    entities = model.predict_entities(text, labels, threshold=threshold)
    return [entity["text"].lower().strip() for entity in entities]


def eval_gliNER(model, labels, golds, threshold: float = 0.5):
    """Run ``get_ents_gliNER`` over every gold entry.

    Args:
        model: model forwarded to ``get_ents_gliNER``.
        labels: label list forwarded to ``get_ents_gliNER``.
        golds: mapping whose values provide ``"text"`` and ``"sings"``.
        threshold: value forwarded to ``get_ents_gliNER`` (default 0.5).

    Returns:
        One dict per gold entry with ``"text"``, ``"y_pred"`` and
        ``"y_true"``.
    """
    results = []
    for ds, val in tqdm(golds.items()):
        pred = get_ents_gliNER(model, labels, val["text"], threshold=threshold)
        # "text" is stored so these records have the same keys as the
        # scispaCy and BioFalcon baseline records.
        results.append({"text": val["text"], "y_pred": pred, "y_true": val["sings"]})

    return results

In [None]:
# Show the label list that is passed to GLiNER.
TUIS_LABELS

In [None]:
# Demo: run the base GLiNER model on one passage and print each predicted
# span with its label. Note this reassigns the notebook-level `text`.
text = """
The symptoms of cytomegalovirus retinitis have it usually starting in one eye (and also have the possibility of retinal detachment), presenting as:
Blurred vision
Blind spots
Specks in your vision """

#labels = ["Sign and Symptom"]

entities = GLiNER_base.predict_entities(text=text, labels=TUIS_LABELS, threshold=0.5)

for entity in entities:
    print(entity["text"], "=>", entity["label"])

# GLiNER Base

In [None]:
# Evaluate the base GLiNER model on every gold document.
results_base = eval_gliNER(GLiNER_base, TUIS_LABELS, golds)

In [None]:
# Write the base-model results to PATH_BASELINES + "GLiNER_base.json".
dump_results(results_base, PATH_BASELINES + "GLiNER_base.json")

# GLiNER Medium

In [None]:
# Load the "urchade/gliner_medium" checkpoint.
GLiNER_medium = GLiNER.from_pretrained("urchade/gliner_medium")

In [None]:
# Evaluate the medium GLiNER model on every gold document.
results_med = eval_gliNER(GLiNER_medium, TUIS_LABELS, golds)

In [None]:
# Write the medium-model results to PATH_BASELINES + "GLiNER_medium.json".
dump_results(results_med, PATH_BASELINES + "GLiNER_medium.json")

In [None]:
# Load the "urchade/gliner_mediumv2" checkpoint.
GLiNER_medium_v2 = GLiNER.from_pretrained("urchade/gliner_mediumv2")

In [None]:
# Evaluate the medium-v2 GLiNER model on every gold document.
results_md2 = eval_gliNER(GLiNER_medium_v2, TUIS_LABELS, golds)

In [None]:
# Write the medium-v2 results to PATH_BASELINES + "GLiNER_medium_v2.json".
dump_results(results_md2, PATH_BASELINES + "GLiNER_medium_v2.json")

# GLiNER Large

In [None]:
# Load the "urchade/gliner_large" checkpoint.
GLiNER_large = GLiNER.from_pretrained("urchade/gliner_large")

In [None]:
# Evaluate the large GLiNER model on every gold document. This previously
# passed GLiNER_medium_v2, so the "large" results duplicated the medium-v2 run.
results_large = eval_gliNER(GLiNER_large, TUIS_LABELS, golds)

In [None]:
# Write the large-model results to PATH_BASELINES + "GLiNER_large.json".
dump_results(results_large, PATH_BASELINES + "GLiNER_large.json")

# Eval

In [None]:
from metrics import eval_results

In [None]:
# Result file stems to score; ".json" and PATH_BASELINES are added when loading.
# NOTE(review): no cell in this notebook writes "metamap.json" — presumably it is
# produced elsewhere; confirm the file exists before running the loop below.
files_baselines = ["biofalcon", "metamap", "sci-small", "sci-md", "sci-lg", "sci-bert", "GLiNER_base", "GLiNER_medium", "GLiNER_medium_v2", "GLiNER_large"]

In [None]:
for f in files_baselines:
    print(f)
    data = load_results(PATH_BASELINES + f + ".json")

    results = [ ( set(e["y_true"]), set(e["y_pred"])) for e in data]

    bad = eval_results(results)