### NER

In [4]:
import spacy
from spacy import displacy

def extract_named_entities(abstract_text):
    """
    Args:
        abstract_text (str): The text of the abstract.

    Returns:
        list: A list of dictionaries, where each dictionary represents a named entity
              and contains the 'text' of the entity and its 'label'.
    """
    # Load the English language model from spaCy
    # You might need to download it first if you haven't already:
    # python -m spacy download en_core_web_sm
    #nlp = spacy.load("en_core_web_sm")
    nlp = spacy.load("en_core_web_lg")

    doc = nlp(abstract_text)
    displacy.render(doc, style="ent", jupyter=True)

    # Extract named entities
    named_entities = []
    for ent in doc.ents:
        named_entities.append({"text": ent.text, "label": ent.label_})

    # named_entities = []
    # for ent in doc.ents:
    #     if ent.label_ in ["ORG", "PERSON"]:  # Extract only organizations and people
    #         named_entities.append({"text": ent.text, "label": ent.label_})

    return named_entities

abstract = """
The study investigates the efficacy of a novel drug, Aliprex, in treating patients with Alzheimer's disease. 
The research was conducted at the University of California, San Francisco (UCSF) and involved 100 participants. 
Preliminary results indicate a significant improvement in cognitive function among patients receiving Aliprex compared to the placebo group. 
The research team, led by Dr. Emily Carter, plans to publish the full findings in the New England Journal of Medicine.
"""

entities = extract_named_entities(abstract)

Entity: Aliprex, Label: ORG
Entity: the University of California, Label: ORG
Entity: San Francisco, Label: GPE
Entity: UCSF, Label: ORG
Entity: 100, Label: CARDINAL
Entity: Aliprex, Label: ORG
Entity: Emily Carter, Label: PERSON
Entity: the New England Journal of Medicine, Label: ORG


In [14]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = "dslim/bert-base-NER"  # Replace with the actual path
tokenizer_path = "dslim/bert-base-NER" # Replace with the actual path

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

text = "Apple Inc. plans to open a new store in San Francisco by January 2024. Tim Cook, the CEO, announced the news yesterday."
ner_results = ner_pipeline(text)

for entity in ner_results:
    print(entity)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'B-ORG', 'score': 0.9996086, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}
{'entity': 'I-ORG', 'score': 0.99942136, 'index': 2, 'word': 'Inc', 'start': 6, 'end': 9}
{'entity': 'B-LOC', 'score': 0.99934715, 'index': 11, 'word': 'San', 'start': 40, 'end': 43}
{'entity': 'I-LOC', 'score': 0.99942625, 'index': 12, 'word': 'Francisco', 'start': 44, 'end': 53}
{'entity': 'B-PER', 'score': 0.9997869, 'index': 18, 'word': 'Tim', 'start': 71, 'end': 74}
{'entity': 'I-PER', 'score': 0.99977297, 'index': 19, 'word': 'Cook', 'start': 75, 'end': 79}


In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline as hf_pipeline  # To avoid naming conflict

# Specify the pre-trained model
model_name = "dslim/bert-base-NER"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

text = "Angela Merkel visited the White House in Washington, D.C. on Tuesday."

inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits


import torch
predictions = torch.argmax(logits, dim=2)

# Map token ids to label names
id2label = model.config.id2label
predicted_labels = [id2label[prediction.item()] for prediction in predictions[0]]

# Align tokens and labels
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Combine sub-word tokens and their labels to form entities
entities = []
current_entity = None
for token, label in zip(tokens, predicted_labels):
    if token.startswith("##"):  # Handle sub-word tokens
        if current_entity:
            current_entity["word"] += token[2:]
    elif label.startswith("B-"):
        if current_entity:
            entities.append(current_entity)
        current_entity = {"entity": label[2:], "word": token}
    elif label.startswith("I-"):
        if current_entity and label[2:] == current_entity["entity"]:
            current_entity["word"] += " " + token
        else:
            # Handle cases where I- tag follows a different or no B- tag
            if current_entity:
                entities.append(current_entity)
            current_entity = {"entity": label[2:], "word": token}
    else:  # "O" label (outside of any entity)
        if current_entity:
            entities.append(current_entity)
            current_entity = None

if current_entity:
    entities.append(current_entity)

for entity in entities:
    print(entity)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'PER', 'word': 'Angela Merkel'}
{'entity': 'LOC', 'word': 'White House'}
{'entity': 'LOC', 'word': 'Washington , D . C .'}
