### Load the Dataset and processing it

In [2]:
from datasets import load_dataset
dataset =load_dataset("unimelb-nlp/wikiann", "en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [4]:
# Re-splitting the dataset into 30,000 train, 5,000 validation, and 5,000 test samples
from datasets import concatenate_datasets, DatasetDict

# Combine train/validation/test if needed
full_dataset = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])

# First split: 75% train, 25% temp (val + test)
train_temp = full_dataset.train_test_split(test_size=0.25, seed=42)
train_dataset = train_temp['train']  # 30,000
temp_dataset = train_temp['test']    # 10,000

# Second split: half val, half test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = val_test_split['train']  # 5,000
test_dataset = val_test_split['test']         # 5,000

new_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})


In [5]:
for split in new_dataset:
    print(split, len(new_dataset[split]))


train 30000
validation 5000
test 5000


In [6]:
new_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 5000
    })
})

In [7]:
new_dataset["train"][0]["tokens"]

['It',
 'partnered',
 'with',
 'George',
 'Hacker’s',
 'Alcohol',
 'Policies',
 'Project',
 'at',
 'the',
 'Center',
 'for',
 'Science',
 'in',
 'the',
 'Public',
 'Interest',
 '.']

In [8]:
new_dataset["train"][0]["ner_tags"]

[0, 0, 0, 1, 2, 3, 4, 4, 0, 0, 3, 4, 4, 4, 4, 4, 4, 0]

In [9]:
ner_feature = dataset["train"].features["ner_tags"]
ner_feature

List(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']))

In [10]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [11]:
words = new_dataset["train"][0]["tokens"]
labels = new_dataset["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

It partnered with George Hacker’s Alcohol Policies Project at the Center for   Science in    the   Public Interest . 
O  O         O    B-PER  I-PER    B-ORG   I-ORG    I-ORG   O  O   B-ORG  I-ORG I-ORG   I-ORG I-ORG I-ORG  I-ORG    O 




```
# This is formatted as code
```

### Load the Model and tokenizer

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
tokenizer.is_fast

True

In [14]:
inputs = tokenizer(new_dataset["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'It',
 'partnered',
 'with',
 'George',
 'Ha',
 '##cker',
 '’',
 's',
 'Al',
 '##co',
 '##hol',
 'Pol',
 '##ici',
 '##es',
 'Project',
 'at',
 'the',
 'Center',
 'for',
 'Science',
 'in',
 'the',
 'Public',
 'Interest',
 '.',
 '[SEP]']

In [15]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 None]

In [16]:
# Function to align labels with tokenized inputs
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [17]:
labels = new_dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 1, 2, 3, 4, 4, 0, 0, 3, 4, 4, 4, 4, 4, 4, 0]
[-100, 0, 0, 0, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 0, 0, 3, 4, 4, 4, 4, 4, 4, 0, -100]


In [18]:
# Function to tokenize and align labels for the entire dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [19]:
tokenized_datasets = new_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=new_dataset["train"].column_names,
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    1,    2,    2,    2,    2,    3,    4,    4,
            4,    4,    4,    4,    0,    0,    3,    4,    4,    4,    4,    4,
            4,    0, -100],
        [-100,    1,    2,    2,    2,    2,    2,    2, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100]])

In [22]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 0, 0, 3, 4, 4, 4, 4, 4, 4, 0, -100]
[-100, 1, 2, 2, 2, 2, 2, 2, -100]


In [23]:
# Install the Hugging Face evaluation package (and seqeval if not already)
!pip install -q evaluate seqeval

In [24]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script: 0.00B [00:00, ?B/s]

In [25]:
labels = new_dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'O']

In [26]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'PER': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

In [27]:
# Define the compute_metrics function
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [28]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [29]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6}

In [30]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [31]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
model.config.num_labels

7

In [33]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
# Define training arguments
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

In [35]:
# Create Trainer instance
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [None]:
!pip install wandb weave

In [36]:
wandb login

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbeshoyarnest01[0m ([33mbeshoyarnest01-minia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2821,0.242135,0.791352,0.838687,0.814332,0.925921
2,0.1919,0.252427,0.816335,0.843255,0.829577,0.928884
3,0.1307,0.290411,0.824858,0.849822,0.837154,0.931114


TrainOutput(global_step=11250, training_loss=0.21853640492757162, metrics={'train_runtime': 887.0159, 'train_samples_per_second': 101.464, 'train_steps_per_second': 12.683, 'total_flos': 1178207672138112.0, 'train_loss': 0.21853640492757162, 'epoch': 3.0})

In [44]:
import wandb
api = wandb.Api()
run = api.run("/beshoyarnest01-minia-university/huggingface/runs/c260tqdy")
print(run.history())


    _step    _runtime  train/loss  train/grad_norm  train/learning_rate  \
0       0   44.303056      0.5564        10.528369         1.911289e-05   
1       1   77.600776      0.3457         7.125881         1.822400e-05   
2       2  114.743025      0.3075        20.965637         1.733511e-05   
3       3  150.682874      0.2971        10.373394         1.644622e-05   
4       4  185.927770      0.2809         2.490146         1.555733e-05   
5       5  223.853115      0.2856         5.471761         1.466844e-05   
6       6  258.512129      0.2821         6.857677         1.377956e-05   
7       7  287.816948         NaN              NaN                  NaN   
8       8  322.460120      0.2355         2.629109         1.289067e-05   
9       9  358.348351      0.1929         2.452043         1.200178e-05   
10     10  393.806532      0.1856         2.135631         1.111289e-05   
11     11  429.395157      0.1975         3.968515         1.022400e-05   
12     12  464.486464    

In [45]:
import wandb
api = wandb.Api()

run = api.run("beshoyarnest01-minia-university/huggingface/c260tqdy")
if run.state == "finished":
    for i, row in run.history().iterrows():
      print(row["_timestamp"], row["accuracy"])

In [41]:
trainer.push_to_hub(commit_message="Training complete")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ned-ner/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...03143.041f5617ed87.2189.0: 100%|##########| 11.7kB / 11.7kB            

  ...ned-ner/model.safetensors:   8%|7         | 33.5MB /  431MB            

CommitInfo(commit_url='https://huggingface.co/7beshoyarnest/bert-finetuned-ner/commit/05ef31215d7bf89338cd83352461710e4723472a', commit_message='Training complete', commit_description='', oid='05ef31215d7bf89338cd83352461710e4723472a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/7beshoyarnest/bert-finetuned-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='7beshoyarnest/bert-finetuned-ner'), pr_revision=None, pr_num=None)

#### Save the model and Tokenizer Locally

In [46]:
# Save trained model and tokenizer locally
# This saves the model weights and tokenizer files into `save_dir` so you can load them later.
import os
save_dir = "/content/Named_Entity_Recognition_model/"

# Prefer Trainer.save_model as it also saves the best model when using Trainer
try:
    trainer.save_model(save_dir)
except Exception:
    # fallback to explicit save
    model.save_pretrained(save_dir)

# Save tokenizer explicitly
tokenizer.save_pretrained(save_dir)

print(f"Model and tokenizer saved to: {save_dir}")
print(os.listdir("/content"))

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ned-ner/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...03143.041f5617ed87.2189.0: 100%|##########| 11.7kB / 11.7kB            

  ...ned-ner/model.safetensors:   8%|7         | 33.5MB /  431MB            

No files have been modified since last commit. Skipping to prevent empty commit.


Model and tokenizer saved to: /content/Named_Entity_Recognition_model/
['.config', 'wandb', 'Named_Entity_Recognition_model', 'bert-finetuned-ner', 'sample_data']


In [47]:
!ls -lh /content

total 16K
drwxr-xr-x 6 root root 4.0K Dec 23 15:38 bert-finetuned-ner
drwxr-xr-x 2 root root 4.0K Dec 23 15:42 Named_Entity_Recognition_model
drwxr-xr-x 1 root root 4.0K Dec  9 14:42 sample_data
drwxr-xr-x 3 root root 4.0K Dec 23 15:19 wandb


In [None]:
import shutil
from google.colab import files

shutil.make_archive("/content/Named_Entity_Recognition_model/", 'zip', save_dir)
files.download("/content/Named_Entity_Recognition_model/.zip")


In [2]:
# loading the saved model and tokenizer from local directory
from transformers import AutoModelForTokenClassification, AutoTokenizer

save_dir = r"F:\end_to_end_AI_Projects\Named_Entity_Recognition(NER)\Named_Entity_Recognition_model"  # where you unzipped it

local_tokenizer = AutoTokenizer.from_pretrained(save_dir, local_files_only=True)
local_model = AutoModelForTokenClassification.from_pretrained(save_dir, local_files_only=True)
local_model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [3]:
from transformers import pipeline

ner_pipe = pipeline(
    "token-classification",
    model=local_model,
    tokenizer=local_tokenizer,
    aggregation_strategy="simple"
)

ner_pipe("I live in New York")


Device set to use cuda:0


[{'entity_group': 'LOC',
  'score': 0.9902638,
  'word': 'New York',
  'start': 10,
  'end': 18}]

In [51]:
metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

for key, value in metrics.items():
    print(f"{key}: {value}")


eval_loss: 0.3167576789855957
eval_precision: 0.8304585612116113
eval_recall: 0.8565229968180503
eval_f1: 0.8432894268422927
eval_accuracy: 0.9296537562840756
eval_runtime: 23.699
eval_samples_per_second: 210.98
eval_steps_per_second: 26.372
epoch: 3.0


In [53]:
# Get predictions on test set
import numpy as np

pred_output = trainer.predict(tokenized_datasets["test"])
logits, label_ids = pred_output.predictions, pred_output.label_ids
preds = np.argmax(logits, axis=-1)

# Convert to label names, ignoring -100
true_labels = [[label_names[l] for l in label if l != -100] for label in label_ids]
pred_labels = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(preds, label_ids)
]

results = metric.compute(predictions=pred_labels, references=true_labels)
for key, value in results.items():
    print(f"{key}: {value}")

LOC: {'precision': np.float64(0.8455518945634267), 'recall': np.float64(0.8856773080241588), 'f1': np.float64(0.8651495996628741), 'number': np.int64(2318)}
ORG: {'precision': np.float64(0.7645299145299145), 'recall': np.float64(0.771786022433132), 'f1': np.float64(0.768140832975526), 'number': np.int64(2318)}
PER: {'precision': np.float64(0.8802369868810833), 'recall': np.float64(0.913081650570676), 'f1': np.float64(0.8963585434173669), 'number': np.int64(2278)}
overall_precision: 0.8304585612116113
overall_recall: 0.8565229968180503
overall_f1: 0.8432894268422927
overall_accuracy: 0.9296537562840756


In [54]:
import json

records = []
for tok_ids, aligned_labels, pred in zip(
    tokenized_datasets["test"]["input_ids"],
    tokenized_datasets["test"]["labels"],
    preds
):
    tokens = tokenizer.convert_ids_to_tokens(tok_ids)
    # Keep only tokens with aligned labels (skip -100)
    true = [label_names[l] for l in aligned_labels if l != -100]
    predicted = [label_names[p] for (p, l) in zip(pred, aligned_labels) if l != -100]
    records.append({"tokens": tokens, "true": true, "pred": predicted})
with open("ner_test_predictions.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

In [56]:
from transformers import pipeline
token_classifier = pipeline(
    "token-classification",
    model=trainer.model,      # or from_pretrained(save_dir)
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0  # set if using GPU; omit/adjust on CPU
)
results = token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
for entity in results:
    print(entity)

Device set to use cuda:0


{'entity_group': 'PER', 'score': np.float32(0.98799956), 'word': 'Sylvain', 'start': 11, 'end': 18}
{'entity_group': 'ORG', 'score': np.float32(0.9896188), 'word': 'Hugging Face', 'start': 33, 'end': 45}
{'entity_group': 'LOC', 'score': np.float32(0.9631331), 'word': 'Brooklyn', 'start': 49, 'end': 57}


In [57]:
from sklearn.metrics import classification_report, confusion_matrix
# flatten token-level arrays, removing -100 labels
y_true = []
y_pred = []
for lab_arr, pred_arr in zip(label_ids, preds):
    for l,p in zip(lab_arr, pred_arr):
        if l != -100:
            y_true.append(label_names[l])
            y_pred.append(label_names[p])
print(classification_report(y_true, y_pred, digits=4))
# Confusion matrix (careful: many labels; visualize with heatmap)
cm = confusion_matrix(y_true, y_pred, labels=list(label_names))

              precision    recall  f1-score   support

       B-LOC     0.8888    0.9103    0.8994      2318
       B-ORG     0.8442    0.8158    0.8297      2318
       B-PER     0.9143    0.9324    0.9233      2278
       I-LOC     0.8896    0.9143    0.9018      7271
       I-ORG     0.8623    0.8366    0.8493      8913
       I-PER     0.9100    0.9281    0.9189      8020
           O     0.9838    0.9792    0.9815     25374

    accuracy                         0.9297     56492
   macro avg     0.8990    0.9024    0.9006     56492
weighted avg     0.9296    0.9297    0.9295     56492



In [62]:
# using the saved model and tokenizer from huggingface hub
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("7beshoyarnest/bert-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("7beshoyarnest/bert-finetuned-ner")

In [64]:
from transformers import pipeline

token_classifier = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0  # set to -1 for CPU or change if no GPU available
)

# quick sanity check
results = token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
for entity in results:
    print(entity)

Device set to use cuda:0


{'entity_group': 'PER', 'score': np.float32(0.98799956), 'word': 'Sylvain', 'start': 11, 'end': 18}
{'entity_group': 'ORG', 'score': np.float32(0.9896188), 'word': 'Hugging Face', 'start': 33, 'end': 45}
{'entity_group': 'LOC', 'score': np.float32(0.9631331), 'word': 'Brooklyn', 'start': 49, 'end': 57}
