1. Compare the following spaCy models at extracting medical entities, including drug trade names:

* **en_core_web_trf**: English transformer pipeline. Components: transformer, tagger, parser, ner, attribute_ruler, lemmatizer.
* **en_ner_bc5cdr_md**: A spaCy NER model trained on the BC5CDR corpus.
* **en_core_sci_lg**: A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors.
* **en_core_sci_md**: A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors.
* **en_core_sci_sm**: A full spaCy pipeline for biomedical data.



In [None]:
# Download en_core_web_trf, the only general-purpose (non-biomedical) pipeline in the comparison.
!python -m spacy download en_core_web_trf

In [None]:
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

In [None]:
import re
import pandas as pd
import spacy
import scispacy

In [None]:
# Load the five pipelines under comparison, in order, binding each one to a
# variable named after its package.
(
    en_core_web_trf,
    en_ner_bc5cdr_md,
    en_core_sci_lg,
    en_core_sci_md,
    en_core_sci_sm,
) = map(
    spacy.load,
    (
        'en_core_web_trf',
        'en_ner_bc5cdr_md',
        'en_core_sci_lg',
        'en_core_sci_md',
        'en_core_sci_sm',
    ),
)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
# Language code handed to find_med_ents for every pipeline.
source_lang_code = 'en'

# Package name -> loaded pipeline, in comparison order. The names are reused as
# keys (and therefore column headers) when the results are tabulated.
models_with_names = dict([
    ('en_core_web_trf', en_core_web_trf),
    ('en_ner_bc5cdr_md', en_ner_bc5cdr_md),
    ('en_core_sci_lg', en_core_sci_lg),
    ('en_core_sci_md', en_core_sci_md),
    ('en_core_sci_sm', en_core_sci_sm),
])

In [None]:
# Evaluation fixture for the model comparison. 'test_list' holds the strings every
# pipeline is run on; later cells add one key per pipeline to this same dict and
# turn it into a DataFrame, so all lists stored here must stay the same length.
# NOTE(review): the entries look like drug trade names (some with a ® mark), generic
# names in parentheses (some split across entries, leaving unbalanced parentheses),
# a few disease terms, and unrelated control words at the end - confirm the intent
# before editing or de-duplicating them.
test_dict = {'test_list': [
    'Adakveo',
    'crizanlizumab',
    'Afinitor Disperz®/Votubia®',
    '(everolimus)',
    'Afinitor®/Votubia®',
    'everolimus',
    'Oncology',
    'Aimovig',
    'erenumab',
    'Arzerra',
    '(ofatumumab)',
    'Oncology',
    'Azorga',
    '(brinzolamide',
    'timolol)',
    'Beovu®',
    'assumption',
    'pneumonia',
    'coronary heart disease',
    '(brolucizumab)',
    'Cibacen®',
    '(benazepril hydrochloride)',
    'Ciprodex®',
    '(ciprofloxacin, dexamethasone)',
    'Comtan',
    '(entacapone)',
    'Cosentyx®',
    '(secukinumab)',
    'Diovan HCT/Co-Diovan',
    '(valsartan, hydrochlorothiazide)',
    'Diovan®',
    '(valsartan)',
    'Duotrav',
    '(travoprost, timolol)',
    'Durezol®',
    '(difluprednate)',
    'Egaten®',
    '(triclabendazole)',
    'Entresto',
    '(sacubitril, valsartan)',
    'Eucreas®',
    '(vildagliptin, metformin)',
    'Exelon®',
    '(rivastigmine)',
    'Exforge HCT',
    '(valsartan',
    'amlodipine besylate',
    'hydrochlorothiazide',
    'Exforge®',
    '(valsartan',
    'amlodipine besylate)',
    'Exjade',
    '(deferasirox)',
    'Extavia®',
    '(interferon beta-1b)',
    'Fabhalta®',
    '(iptacopan)',
    'Farydak',
    '(panobinostat)',
    'Femara®',
    '(letrozole)',
    'Focalin®',
    '(dexmethylphenidate HCl',
    'dexmethylphenidate extended release)',
    'Focalin XR',
    '(dexmethylphenidate HCl',
    'dexmethylphenidate extended release)',
    'Galvus®',
    '(vildagliptin)',
    'Gilenya®',
    '(fingolimod)',
    'cat',
    'bread',
    'Toshiba',
    'Da Vinci',
    'train',
    'proton synchrotron'
    ]}

In [None]:
def find_med_ents(test_list, source_lang_code, model):
    """Run ``model`` on every text in ``test_list`` and collect the entities found.

    Args:
        test_list: Iterable of texts; each one is processed on its own.
        source_lang_code: Kept in the signature for callers; not used by this function.
        model: Callable that takes a text and returns an object exposing ``.ents``.

    Returns:
        A list with one item per input text, in input order, holding that
        text's ``.ents`` value unchanged.
    """
    return [model(text).ents for text in test_list]

In [None]:
# Run every pipeline over the same test strings and file its entities in test_dict
# under the pipeline's name, next to the 'test_list' inputs, for side-by-side comparison.
for model, pipeline in models_with_names.items():
    test_dict[model] = find_med_ents(test_dict['test_list'], source_lang_code, pipeline)

In [None]:
# Tabulate test_dict (one column per key, one row per test string) and write it
# to a spreadsheet without the index column.
df = pd.DataFrame(data=test_dict)
df.to_excel(excel_writer='med_ents_spacy.xlsx', index=False)

2. Extract medical entities from the detailed translation memory using the best model.

In [None]:
# Pipeline used for the extraction below.
# NOTE(review): the notebook does not record why en_core_sci_md was judged best -
# presumably from inspecting med_ents_spacy.xlsx; confirm and document the rationale.
best_model = en_core_sci_md

In [None]:
# Load the detailed translation memory. Each row contains a source segment, a target segment,
# a client name, and an order number; the source segments are read from the 'sourceText' column below.
# Note: this rebinds df, replacing the model-comparison table created earlier.
df = pd.read_excel('detailed_translation_memory.xlsx')

In [None]:
# Extract medical entities from the source segments: collect all of them in med_ents and
# record each segment's own entities in the 'Medical entities' column for future statistical analysis.
med_ents = []
# Explicit object dtype lets each cell hold a Python list.
df['Medical entities'] = pd.Series('', index=df.index, dtype=object)

# Iterate over (row label, text) pairs so every result is written back to the row it
# came from, instead of assuming the index is 0..n-1.
for row_label, source in df['sourceText'].items():
  # Non-text cells (e.g. empty Excel cells, which pandas reads as NaN) cannot be passed
  # to the pipeline; give those rows an empty entity list rather than aborting the run.
  med_ents_in_segment = list(best_model(source).ents) if isinstance(source, str) else []
  med_ents.extend(med_ents_in_segment)
  df.at[row_label, 'Medical entities'] = med_ents_in_segment

len(med_ents)

1040

In [None]:
# Remove duplicate entries and entities that are likely to be irrelevant or erroneous:
# keep only texts longer than 3 characters that consist solely of Latin/Cyrillic letters,
# spaces, parentheses, slashes, and dashes. First-seen order is preserved.
allowed_chars = re.compile(r"^[а-яa-zА-ЯA-Z \(\)\/—–-]+$")  # compiled once, reused for every entity
clean_med_ents = []
seen_ents = set()  # constant-time duplicate check instead of scanning the result list

for ent in med_ents:
  ent_text = str(ent)
  if (len(ent_text) > 3 and ent_text not in seen_ents
      and allowed_chars.search(ent_text)):
    seen_ents.add(ent_text)
    clean_med_ents.append(ent_text)

len(clean_med_ents)

435

In [None]:
# Write both results to disk without the index column: the translation memory with its
# per-row 'Medical entities' column, and the cleaned entity list (single 'med_ents' column)
# for use with GPT.
df.to_excel(excel_writer='detailed_TM_with_ents.xlsx', index=False)

clean_list_of_ents = pd.DataFrame({'med_ents': clean_med_ents})
clean_list_of_ents.to_excel(excel_writer='med_ents_detailed_TM.xlsx', index=False)