<a href="https://colab.research.google.com/github/BEKKAMSHIVA/NER-LLM_FINETUNNING/blob/main/nlp_finetunning_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing the Hugging Face datasets library for loading and processing NLP datasets
!pip install datasets




In [None]:
# Load CoNLL-2003 NER dataset from Hugging Face
from datasets import load_dataset
data = load_dataset("eriktks/conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for eriktks/conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/eriktks/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] Y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Displaying the structure of the loaded dataset (DatasetDict with train, validation, and test splits)

data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Importing NumPy for numerical operations
import numpy as np

# Importing BERT tokenizer for fast tokenization of input text
from transformers import BertTokenizerFast

# Importing data collator to dynamically pad batches during token classification
from transformers import DataCollatorForTokenClassification

# Importing a pre-trained model architecture suitable for token classification tasks (e.g., NER)
from transformers import AutoModelForTokenClassification


In [None]:
# Show the first example from the training data
data["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Show the list of NER tag names used in the dataset
data["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
#The ner_tags column in the dataset uses the following labels:

# O - Outside any named entity (just a regular word)

#B-PER – Beginning of a person’s name

#I-PER – Inside a person’s name

#B-ORG – Beginning of an organization name

#I-ORG – Inside an organization name

#B-LOC – Beginning of a location name (e.g., country, city)

#I-LOC – Inside a location name

#B-MISC – Beginning of a miscellaneous entity (e.g., events, nationalities)

#I-MISC – Inside a miscellaneous entity

In [None]:
# Show the general description of the dataset
data["train"].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [None]:
# Load the BERT tokenizer (bert-base-uncased) to convert text into tokens
# This tokenizer will also be used later for processing new/unseen data
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Show the first training example with words and their tag values (POS, chunk, NER)
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
#tokens: These are the words in the sentence.

#pos_tags: Part-of-speech tags (as numbers).

#chunk_tags: Chunking labels (as numbers).

#ner_tags: Named Entity Recognition labels (as numbers).

In [None]:
example_text = data['train'][0]  # Get the first example from training data

tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)
# Tokenize the list of words from the sentence

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# Convert token IDs back into readable tokens

word_ids = tokenized_input.word_ids()
# Get the word index each token came from

print(word_ids)  # Showing which word each token belongs to


[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


In [None]:
# Turn token IDs into actual words (or subwords)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# Show the list of tokens (including special start and end tokens)
tokens


['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [None]:
#[CLS] is a special token marking the start.

#[SEP] marks the end.

#The rest are the tokens produced from your sentence.

In [None]:
len(example_text['ner_tags']), len(tokenized_input["input_ids"])

(9, 11)

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    # Tokenize the input words (split into words already)
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []  # To store the new labels that match the tokenized inputs

    for i, label in enumerate(examples["ner_tags"]):  # Loop over each example
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs for this example

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens like [CLS] and [SEP], we ignore them in loss calculation
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                # First token of a word, use its label
                label_ids.append(label[word_idx])

            else:
                # For subword tokens, use the same label if label_all_tokens is True
                label_ids.append(label[word_idx] if label_all_tokens else -100)

            previous_word_idx = word_idx  # Update previous word index

        labels.append(label_ids)  # Add this example's aligned labels

    tokenized_inputs["labels"] = labels  # Add aligned labels to the tokenized input
    return tokenized_inputs  # Return the final tokenized input with aligned labels


In [None]:
# Get the 5th record from the training set as a small dataset (not just a single example)
data['train'][4:5]


{'id': ['4'],
 'tokens': [['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.']],
 'pos_tags': [[22,
   27,
   21,
   35,
   12,
   22,
   22,
   27,
   16,
   21,
   22,
   22,
   38,
   15,
   22,
   24,
   20,
   37,
   21,
   15,
   24,
   16,
   15,
   22,
   15,
   12,
   16,
   21,
   38,
   17,
   7]],
 'chunk_tags': [[11,
   11,
   12,
   13,
   11,
   12,
   12,
   11,
   12,
   12,
   12,
   12,
   21,
   13,
   11,
   12,
   21,
   22,
   11,
   13,
   11,
   1,
   13,
   11,
   17,
   11,
   12,
   12,
   21,
   1,
   0]],
 'ner_tags': [[5,
   0,
   0,
   0,
   0,
   3,
   4,
   0,
   0,
   0,
   1,
   2,
   0,
   0,
   0,
   0,
   0,


In [None]:
# Tokenize the 5th training example and align its labels with the tokens
q = tokenize_and_align_labels(data['train'][4:5])
q  # Print the tokenized input along with aligned labels


{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [None]:
# Print each token with its matched label (aligned nicely using underscores)
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]), q["labels"][0]):
    print(f"{token:_<40} {label}")


[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [None]:
## Applying on entire data
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# View the first tokenized and label-aligned training example
tokenized_datasets['train'][0]


{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

In [None]:
# Load the pre-trained BERT model for token classification (NER task)
# 'num_labels=9' because our dataset has 9 unique named entity tags
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#Defining Training Arguments

In [None]:
from transformers import TrainingArguments, Trainer


args = TrainingArguments(
"test-ner",
# Replacing 'evaluation_strategy' with 'evaluation_strategy'
eval_steps=1000,  # Evaluation every 1000 steps
# Replacing 'learning_rate' with 'learning_rate'
learning_rate=2e-5,
# Replacing 'per_device_train_batch_size' with 'per_device_train_batch_size' or 'train_batch_size' if using TPUs
per_device_train_batch_size=16,
# Replacing 'per_device_eval_batch_size' with 'per_device_eval_batch_size' or 'eval_batch_size' if using TPUs
per_device_eval_batch_size=16,
# Replacing 'num_train_epochs' with 'num_train_epochs'
num_train_epochs=3,
# Replacing 'weight_decay' with 'weight_decay'
weight_decay=0.01,
)

In [None]:
# Data Collator for Token Classification
# This collator dynamically pads inputs and labels to the longest sequence in a batch.
# It's necessary for token classification tasks like NER, where input and label lengths must align.
data_collator = DataCollatorForTokenClassification(tokenizer)



In [None]:
# Installing SeqEval: A specialized library for evaluating sequence labeling tasks (like NER).
# It calculates metrics such as precision, recall, and F1-score based on the full sequence of entity labels.
!pip install seqeval




In [None]:
!pip install evaluate
# Importing the evaluation library
# 'seqeval' is used to compute precision, recall, and F1-score for NER tasks
from evaluate import load
metric = load("seqeval")

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:

example = data['train'][0]

In [None]:
label_list = data["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Looping through the list of NER tag sequences in the example and printing each sequence of entity labels.
for i in example["ner_tags"]:
    print(i)



3
0
7
0
0
0
7
0
0


In [None]:
# Converting numeric NER tag IDs to their corresponding string labels using the label list.

labels = [label_list[i] for i in example["ner_tags"]]
labels


['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
# Computing evaluation metrics (precision, recall, F1-score) by comparing predicted labels with reference (true) labels.
# Here, we are using the same labels for both to test the metric setup (ideal scores expected).
metric.compute(predictions=[labels], references=[labels])

{'MISC': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    # Get the index of the highest prediction score for each token
    pred_logits = np.argmax(pred_logits, axis=2)
    # Note: No need for softmax since the highest score index represents the predicted label

    # Convert predicted indices to label names, ignoring padding (-100)
    predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    # Convert true label indices to label names, ignoring padding (-100)
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    # Compute evaluation metrics using seqeval
    results = metric.compute(predictions=predictions, references=true_labels)

    # Return standard evaluation metrics
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [None]:
#  Initializing the Trainer
# This sets up the Hugging Face Trainer class, which handles the full training and evaluation loop.
trainer = Trainer(
    model,                      # The BERT model for token classification
    args,                       # Training arguments (like batch size, learning rate, epochs, etc.)
    train_dataset=tokenized_datasets["train"],     # Preprocessed training data
    eval_dataset=tokenized_datasets["validation"], # Preprocessed validation data for evaluation
    data_collator=data_collator,                   # Dynamically pads inputs and labels to the same length
    tokenizer=tokenizer,                           # Tokenizer used for processing inputs
    compute_metrics=compute_metrics                # Function to compute evaluation metrics (F1, Precision, Recall)
)


  trainer = Trainer(


In [None]:
trainer = Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshivabekkam7[0m ([33mshivabekkam7-innomatics-research-labs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.2164
1000,0.0701


Step,Training Loss
500,0.2164
1000,0.0701
1500,0.0462
2000,0.0348
2500,0.0257


TrainOutput(global_step=2634, training_loss=0.0757505012988682, metrics={'train_runtime': 20427.9597, 'train_samples_per_second': 2.062, 'train_steps_per_second': 0.129, 'total_flos': 1020143109346326.0, 'train_loss': 0.0757505012988682, 'epoch': 3.0})

In [None]:
import pandas as pd

# Create a dictionary of your TrainOutput data
train_output_data = {
    "Metric": [
        "Global Step",
        "Training Loss",
        "Train Runtime (s)",
        "Samples/sec",
        "Steps/sec",
        "Total FLOPs",
        "Epochs Completed"
    ],
    "Value": [
        2634,
        0.0757505012988682,
        20427.9597,
        2.062,
        0.129,
        1020143109346326.0,
        3.0
    ]
}

# Convert to DataFrame
df_train_output = pd.DataFrame(train_output_data)




In [None]:
df_train_output

Unnamed: 0,Metric,Value
0,Global Step,2634.0
1,Training Loss,0.0757505
2,Train Runtime (s),20427.96
3,Samples/sec,2.062
4,Steps/sec,0.129
5,Total FLOPs,1020143000000000.0
6,Epochs Completed,3.0


In [None]:
trainer.evaluate()

{'eval_loss': 0.05878867208957672,
 'eval_precision': 0.9330682571239232,
 'eval_recall': 0.9450721557221166,
 'eval_f1': 0.9390318457177792,
 'eval_accuracy': 0.9858134621189254,
 'eval_runtime': 447.7184,
 'eval_samples_per_second': 7.259,
 'eval_steps_per_second': 0.456,
 'epoch': 3.0}

In [None]:
## Save model
model.save_pretrained("ner_model")

In [None]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [None]:
id2label

{'0': 'O',
 '1': 'B-PER',
 '2': 'I-PER',
 '3': 'B-ORG',
 '4': 'I-ORG',
 '5': 'B-LOC',
 '6': 'I-LOC',
 '7': 'B-MISC',
 '8': 'I-MISC'}

In [None]:
label2id

{'O': '0',
 'B-PER': '1',
 'I-PER': '2',
 'B-ORG': '3',
 'I-ORG': '4',
 'B-LOC': '5',
 'I-LOC': '6',
 'B-MISC': '7',
 'I-MISC': '8'}

In [None]:
import json

In [None]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
from transformers import pipeline

In [None]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


example = "raju is eating an apple at apple office which is in india"

ner_results = nlp(example)



Device set to use cpu


In [None]:
ner_results

[{'entity': 'B-PER',
  'score': np.float32(0.99409735),
  'index': 1,
  'word': 'raju',
  'start': 0,
  'end': 4},
 {'entity': 'B-MISC',
  'score': np.float32(0.5776646),
  'index': 5,
  'word': 'apple',
  'start': 18,
  'end': 23},
 {'entity': 'B-ORG',
  'score': np.float32(0.9748317),
  'index': 7,
  'word': 'apple',
  'start': 27,
  'end': 32},
 {'entity': 'B-LOC',
  'score': np.float32(0.99857783),
  'index': 12,
  'word': 'india',
  'start': 52,
  'end': 57}]

In [None]:
from transformers import pipeline

# Assuming your model and tokenizer are already defined and fine-tuned
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

# New example text
example = "Elon Musk founded SpaceX and lives in the United States."

# Run NER on the text
ner_results_2 = nlp(example)

# Print the raw NER results
print(ner_results_2)

Device set to use cpu


[{'entity': 'B-PER', 'score': np.float32(0.9978083), 'index': 1, 'word': 'el', 'start': 0, 'end': 2}, {'entity': 'B-PER', 'score': np.float32(0.9984432), 'index': 2, 'word': '##on', 'start': 2, 'end': 4}, {'entity': 'I-PER', 'score': np.float32(0.9982162), 'index': 3, 'word': 'mu', 'start': 5, 'end': 7}, {'entity': 'I-PER', 'score': np.float32(0.9970349), 'index': 4, 'word': '##sk', 'start': 7, 'end': 9}, {'entity': 'B-ORG', 'score': np.float32(0.9885953), 'index': 6, 'word': 'space', 'start': 18, 'end': 23}, {'entity': 'B-ORG', 'score': np.float32(0.9950062), 'index': 7, 'word': '##x', 'start': 23, 'end': 24}, {'entity': 'B-LOC', 'score': np.float32(0.99479586), 'index': 12, 'word': 'united', 'start': 42, 'end': 48}, {'entity': 'I-LOC', 'score': np.float32(0.99002105), 'index': 13, 'word': 'states', 'start': 49, 'end': 55}]


In [None]:
ner_results_2

[{'entity': 'B-PER',
  'score': np.float32(0.9978083),
  'index': 1,
  'word': 'el',
  'start': 0,
  'end': 2},
 {'entity': 'B-PER',
  'score': np.float32(0.9984432),
  'index': 2,
  'word': '##on',
  'start': 2,
  'end': 4},
 {'entity': 'I-PER',
  'score': np.float32(0.9982162),
  'index': 3,
  'word': 'mu',
  'start': 5,
  'end': 7},
 {'entity': 'I-PER',
  'score': np.float32(0.9970349),
  'index': 4,
  'word': '##sk',
  'start': 7,
  'end': 9},
 {'entity': 'B-ORG',
  'score': np.float32(0.9885953),
  'index': 6,
  'word': 'space',
  'start': 18,
  'end': 23},
 {'entity': 'B-ORG',
  'score': np.float32(0.9950062),
  'index': 7,
  'word': '##x',
  'start': 23,
  'end': 24},
 {'entity': 'B-LOC',
  'score': np.float32(0.99479586),
  'index': 12,
  'word': 'united',
  'start': 42,
  'end': 48},
 {'entity': 'I-LOC',
  'score': np.float32(0.99002105),
  'index': 13,
  'word': 'states',
  'start': 49,
  'end': 55}]