# BioLaySum Part III: Extra Definition Retrieval

Modify the dataset by retrieving definitions from the Merriam-Webster online dictionary and substituting them for the terms they define in each article.


In [None]:
# Install the scraping (requests, beautifulsoup4) and NLP (spacy, transformers) packages.
!pip install requests beautifulsoup4
!pip install spacy transformers
# Download the large English spaCy model that spacy.load('en_core_web_lg') expects.
# The installer output asks for a kernel/runtime restart before the model is loaded.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### Save a file with the modified text

In [None]:
# Mount Google Drive at /content/drive; the dataset folder used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Keywords whose articles are kept by keyword_articles. Only membership of the key
# is checked there; the value (480) is not read in this part of the notebook —
# presumably an article count for that keyword, TODO confirm.
keyword_list={'structural biology and molecular biophysics': 480}

In [None]:
import json
import os
import re
import time
from urllib.parse import quote

import requests
import spacy
import tqdm
from bs4 import BeautifulSoup
# Load the large English spaCy pipeline once at module level; modify_text reuses
# it to find named entities.
nlp = spacy.load('en_core_web_lg')

def get_definition(term, timeout=10):
    """Look up ``term`` on Merriam-Webster and return its short description.

    The dictionary page for the term is fetched and the ``content`` of its
    ``<meta name="description">`` tag is read. Everything after the first
    " of " is returned (presumably the descriptions read
    "The meaning of TERM is ..." — TODO confirm against a live page).

    Args:
        term: Word or phrase to look up. It is lower-cased and percent-encoded
            before being placed in the URL path.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted description text. ``term`` is returned unchanged when
        the request fails, the status code is not 200, the page has no
        description meta tag, or the extracted text is empty, so callers can
        detect "no definition found" by comparing the result with ``term``.
    """
    # Percent-encode the term so spaces, "/", "?" or "#" cannot alter the URL path.
    url = f"https://www.merriam-webster.com/dictionary/{quote(term.lower(), safe='')}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort lookup: a network failure is treated like "not found"
        # instead of aborting the whole dataset conversion.
        return term
    if response.status_code != 200:
        return term

    soup = BeautifulSoup(response.text, 'html.parser')
    meta_tag = soup.find('meta', attrs={'name': 'description'})
    if not meta_tag or 'content' not in meta_tag.attrs:
        return term

    content = meta_tag['content']
    _, marker, definition = content.partition(" of ")
    if not marker:
        # No " of " marker: keep the whole description. (Slicing at
        # find() + 4 here would silently drop its first three characters.)
        return content or term
    # Never hand back an empty string: the caller would delete the term.
    return definition or term


def _entity_pattern(entity):
    """Return a regex fragment matching ``entity`` only as a whole word/phrase."""
    # Only anchor an edge with a word-boundary lookaround when that edge is
    # itself a word character; otherwise the lookaround could wrongly reject
    # entities that start or end with punctuation.
    head = r"(?<!\w)" if re.match(r"\w", entity) else ""
    tail = r"(?!\w)" if re.search(r"\w$", entity) else ""
    return head + re.escape(entity) + tail


def modify_text(text, max_definition_words=5):
    """Replace ORG entities in ``text`` with short dictionary definitions.

    spaCy (the module-level ``nlp`` pipeline) is run on ``text``; every
    distinct entity labelled "ORG" is looked up with ``get_definition`` and,
    when a definition is found, each whole-word occurrence of the entity is
    replaced by the definition truncated to ``max_definition_words`` words.

    Args:
        text: The article text to rewrite.
        max_definition_words: Maximum number of words kept from each
            definition.

    Returns:
        The rewritten text, or ``text`` unchanged when no entity has a
        definition.
    """
    doc = nlp(text)
    ent_words = {ent.text for ent in doc.ents if ent.label_ == "ORG"}

    replacements = {}
    # Sorted so lookups and the resulting text are reproducible; plain set
    # iteration order can differ between interpreter runs.
    for w in sorted(ent_words):
        definition = get_definition(w)
        if definition == w:
            # get_definition fell back to the term itself: leave the entity
            # untouched instead of truncating its own name.
            continue
        replacements[w] = truncate_text(definition, max_words=max_definition_words)

    if not replacements:
        return text

    # Longest entities first so an entity that contains a shorter one is
    # matched as a whole. A single pass also guarantees that text inserted
    # as a definition is never itself rewritten by a later replacement.
    ordered = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(_entity_pattern(w) for w in ordered))
    return pattern.sub(lambda match: replacements[match.group(0)], text)
def truncate_text(text, max_words=512):
    """Keep at most the first ``max_words`` whitespace-separated words of ``text``.

    The words are re-joined with single spaces, so runs of whitespace
    (including newlines) in the input are collapsed.
    """
    words = text.split()
    kept = words[:max_words]
    return ' '.join(kept)

def keyword_articles(file_path, save_path, keyword_list):
    """Collect definition-augmented articles for selected keywords.

    Reads a JSON Lines file where each record may carry ``keywords``,
    ``article`` and ``lay_summary`` fields. For every record that has at
    least one keyword in ``keyword_list``, the article is truncated with
    ``truncate_text``, rewritten with ``modify_text``, and stored together
    with its lay summary under each matching keyword.

    Args:
        file_path: Path of the input ``.jsonl`` file.
        save_path: Path of the JSON file to write.
        keyword_list: Container of keywords to keep; only membership
            (``keyword in keyword_list``) is used.

    Output:
        ``save_path`` receives a JSON object of the form
        ``{keyword: {'article': [...], 'lay_summary': [...]}}``.
    """
    keyword_data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        print('Converting...')
        for line in tqdm.tqdm(f):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            entry = json.loads(line)
            keywords = entry.get('keywords') or []  # extract keywords (null-safe)
            article = entry.get('article') or ''  # extract article (null-safe)
            lay_summary = entry.get('lay_summary', '')  # extract lay_summary

            matched_keywords = [keyword for keyword in keywords if keyword in keyword_list]
            if not matched_keywords:
                continue

            # Truncation and definition lookup depend only on the article, so
            # do them once per record rather than once per matching keyword
            # (modify_text performs network requests).
            truncated_article = truncate_text(article)
            modified_article = modify_text(truncated_article)
            if modified_article != truncated_article:
                print('modified')

            for keyword in matched_keywords:
                bucket = keyword_data.setdefault(keyword, {'article': [], 'lay_summary': []})
                bucket['article'].append(modified_article)
                bucket['lay_summary'].append(lay_summary)

    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(keyword_data, f, indent=4)
    print(f'Finished, please check file {save_path}')

# Input and output locations inside the mounted Google Drive folder.
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
# NOTE(review): the input is the validation split (eLife_val.jsonl) while the
# output file name says "train" — confirm which split is intended.
file_path = os.path.join(folder_dir, 'eLife_val.jsonl')
save_path = os.path.join(folder_dir, 'eLife_train_modified_structural_test.json')

# Build the definition-augmented subset for the keywords in keyword_list.
keyword_articles(file_path, save_path,keyword_list)



Converting...


6it [00:02,  2.78it/s]

modified


19it [00:06,  3.23it/s]

modified


57it [00:09,  7.21it/s]

modified


65it [00:13,  4.60it/s]

modified


67it [00:17,  3.10it/s]

modified


70it [00:19,  2.91it/s]

modified


78it [00:19,  3.78it/s]

modified


90it [00:21,  4.47it/s]

modified


99it [00:24,  4.29it/s]

modified


110it [00:27,  3.89it/s]

modified


112it [00:31,  2.47it/s]

modified


115it [00:34,  2.02it/s]

modified


130it [00:35,  3.52it/s]

modified


133it [00:39,  2.54it/s]

modified


134it [00:40,  2.28it/s]

modified


141it [00:45,  1.89it/s]

modified


142it [00:48,  1.21it/s]

modified


152it [00:52,  2.02it/s]

modified


170it [00:54,  3.72it/s]

modified


205it [00:58,  5.91it/s]

modified


210it [00:59,  5.79it/s]

modified


221it [01:03,  4.34it/s]

modified


227it [01:07,  3.16it/s]

modified


241it [01:09,  3.48it/s]

modified





Finished, please check file /content/drive/My Drive/NLP Final Project/biolaysumm2024_data/eLife_train_modified_structural_test.json
