In [4]:
# pip install transformers

In [None]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

def load_ner_model():
    """Load pre-trained BERT model for Named Entity Recognition."""
    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForTokenClassification.from_pretrained(model_name)
    return tokenizer, model

def predict_ner(text, tokenizer, model):
    """Predict named entities in the given text."""
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
    inputs = tokenizer.encode(text, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(inputs).logits
    
    predictions = torch.argmax(outputs, dim=2).numpy()[0]
    id2label = model.config.id2label
    
    entities = [(token, id2label[predictions[i]]) for i, token in enumerate(tokens)]
    return entities

def display_results(entities):
    """Display the predicted entities in a readable format."""
    print("\nNamed Entity Recognition Results:\n")
    for token, label in entities:
        print(f"{token:15} -> {label}")

if __name__ == "__main__":
    text = "Elon Musk is the CEO of Tesla and SpaceX, and he lives in California."
    tokenizer, model = load_ner_model()
    entities = predict_ner(text, tokenizer, model)
    display_results(entities)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Named Entity Recognition Results:

[CLS]           -> O
El              -> I-PER
##on            -> I-PER
Mu              -> I-PER
##sk            -> I-PER
is              -> O
the             -> O
CEO             -> O
of              -> O
Te              -> I-ORG
##sla           -> I-ORG
and             -> O
Space           -> I-ORG
##X             -> I-ORG
,               -> O
and             -> O
he              -> O
lives           -> O
in              -> O
California      -> I-LOC
.               -> O
[SEP]           -> O


In [2]:
# pip install spacy

In [3]:
from spacy import displacy

def visualize_ner(text, entities):
    """Visualize named entity recognition using spaCy's displaCy."""
    ents = []
    start = 0
    for token, label in entities:
        ents.append({"start": start, "end": start + len(token), "label": label})
        start += len(token) + 1  # Adjust for space

    doc = {"text": text, "ents": ents, "title": "Named Entity Recognition"}
    displacy.render(doc, style="ent", manual=True, jupyter=True)

# Example usage in Jupyter Notebook
visualize_ner(text, entities)

In [10]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner = pipeline("ner", model=model, tokenizer=tokenizer) # Named Entity Recognition

data = "Elon Musk is the CEO of Tesla and SpaceX, and he lives in California."
results = ner(data)
results

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'B-PER',
  'score': 0.99209243,
  'index': 1,
  'word': 'El',
  'start': 0,
  'end': 2},
 {'entity': 'B-PER',
  'score': 0.8453797,
  'index': 2,
  'word': '##on',
  'start': 2,
  'end': 4},
 {'entity': 'I-PER',
  'score': 0.99850273,
  'index': 3,
  'word': 'Mu',
  'start': 5,
  'end': 7},
 {'entity': 'I-PER',
  'score': 0.9700687,
  'index': 4,
  'word': '##sk',
  'start': 7,
  'end': 9},
 {'entity': 'B-ORG',
  'score': 0.99847037,
  'index': 9,
  'word': 'Te',
  'start': 24,
  'end': 26},
 {'entity': 'I-ORG',
  'score': 0.9957432,
  'index': 10,
  'word': '##sla',
  'start': 26,
  'end': 29},
 {'entity': 'I-ORG',
  'score': 0.7222855,
  'index': 11,
  'word': 'and',
  'start': 30,
  'end': 33},
 {'entity': 'B-ORG',
  'score': 0.97088474,
  'index': 12,
  'word': 'Space',
  'start': 34,
  'end': 39},
 {'entity': 'I-ORG',
  'score': 0.9989818,
  'index': 13,
  'word': '##X',
  'start': 39,
  'end': 40},
 {'entity': 'B-LOC',
  'score': 0.9996055,
  'index': 19,
  'word': 'C

In [11]:
entities = predict_ner(data, tokenizer, model)
entities

[('[CLS]', 'O'),
 ('El', 'B-PER'),
 ('##on', 'B-PER'),
 ('Mu', 'I-PER'),
 ('##sk', 'I-PER'),
 ('is', 'O'),
 ('the', 'O'),
 ('CEO', 'O'),
 ('of', 'O'),
 ('Te', 'B-ORG'),
 ('##sla', 'I-ORG'),
 ('and', 'I-ORG'),
 ('Space', 'B-ORG'),
 ('##X', 'I-ORG'),
 (',', 'O'),
 ('and', 'O'),
 ('he', 'O'),
 ('lives', 'O'),
 ('in', 'O'),
 ('California', 'B-LOC'),
 ('.', 'O'),
 ('[SEP]', 'O')]

In [6]:
visualize_ner(data, [(entity["word"], entity["entity"]) for entity in results])

In [19]:
# Import necessary libraries
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, random_split 
from tqdm import tqdm 
import torch
# Assuming a predefined set of entity types
entity_types = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
# Set num_labels
num_labels = len(entity_types)
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
# Define batch_size
batch_size = 32  # Adjust as needed
# Define learning rate
learning_rate = 5e-5  # Adjust as needed
# Sample data from CoNLL-2003 (Replace this with your dataset) 
train_dataset_sample = [
    {"text": "John works at Google in New York.", "labels": {"entities": [(0, 4, "PERSON"), (17, 22, "ORG"), (26, 34, "GPE")]}},
    {"text": "Apple Inc. is a technology company.", "labels": {"entities": [(0, 10, "ORG")]}},     # Add more samples as needed
]
def tokenize_and_format_data(dataset, tokenizer):
    tokenized_data = []
    for sample in dataset:
        text = sample["text"]
        entities = sample["labels"]["entities"]
        # Tokenize the input text using the BERT tokenizer

        tokens =  tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
        # Initialize labels for each token as 'O' (Outside)
        labels = ['O'] * len(tokens)
        # Update labels for entity spans
        for start, end, entity_type in entities:
            # Tokenize the prefix to get the correct offset
            prefix_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[:start])))
            start_token = len(prefix_tokens)
            # Tokenize the entity to get its length
            entity_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text[start:end])))
            end_token = start_token + len(entity_tokens) - 1
            labels[start_token] = f"B-{entity_type}"

    for i in range(start_token + 1, end_token +1):
        labels[i] = f"I-{entity_type}"

        # Convert tokens and labels to input IDs and label IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [entity_types.index(label) for label in labels]
        # Pad input_ids and label_ids to the maximum sequence length
        padding_length = tokenizer.model_max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        label_ids += [entity_types.index('O')] * padding_length
    tokenized_data.append({'input_ids': input_ids, 'labels': label_ids
        })
    # Convert tokenized data to PyTorch dataset
    dataset = TensorDataset(
        torch.tensor([item['input_ids'] for item in tokenized_data]),               
        torch.tensor([item['labels'] for item in tokenized_data])
    )
    return dataset
# Prepare data for fine-tuning
train_data = tokenize_and_format_data(train_dataset_sample, tokenizer) 
train_dataloader = DataLoader(train_data, batch_size=batch_size)
# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=learning_rate) 
num_epochs = 15  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc="Training"):
        inputs, labels = batch
        # Unpack the tuple
        outputs = model(inputs, labels=labels)
        loss =  outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
# Save the fine-tuned model for later use model.save_pretrained('fine_tuned_ner_model')
model.save_pretrained('fine_tuned_ner_model')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 0/1 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Training: 100%|██████████| 1/1 [00:04<00:00,  4.73s/it]
Training: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.52s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
Training: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Training: 100%

In [20]:
data = "Elon Musk is the CEO of Tesla and SpaceX, and he lives in California."

entities = predict_ner(data, tokenizer, model)
entities

[('[CLS]', 'LABEL_0'),
 ('el', 'LABEL_0'),
 ('##on', 'LABEL_0'),
 ('mu', 'LABEL_0'),
 ('##sk', 'LABEL_0'),
 ('is', 'LABEL_0'),
 ('the', 'LABEL_0'),
 ('ceo', 'LABEL_0'),
 ('of', 'LABEL_0'),
 ('tesla', 'LABEL_0'),
 ('and', 'LABEL_0'),
 ('space', 'LABEL_0'),
 ('##x', 'LABEL_0'),
 (',', 'LABEL_2'),
 ('and', 'LABEL_0'),
 ('he', 'LABEL_0'),
 ('lives', 'LABEL_0'),
 ('in', 'LABEL_0'),
 ('california', 'LABEL_0'),
 ('.', 'LABEL_6'),
 ('[SEP]', 'LABEL_4')]