# Named Entity Recognition (NER) Setup

In [1]:

import json
import os

import pandas as pd
import torch
import transformers
from datasets import load_dataset, ClassLabel, Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification


## Load Data

In [3]:

# Load the annotated corpus; the file is read as JSON Lines (one JSON object
# per line). The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open('./NER_v1.0.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Define a function to extract labels
def extract_labels(data):
    """Collect the distinct entity labels used in the annotated records.

    Args:
        data: Iterable of records; each record's ``'labels'`` entry is an
            iterable of ``(start, end, label)`` triples.

    Returns:
        A list that starts with the outside tag ``'O'`` followed by the
        remaining distinct labels in sorted order.
    """
    labels_set = {
        label
        for item in data
        for _start, _end, label in item['labels']
    }
    # 'O' is always emitted first. Remove it from the collected set so that a
    # corpus which annotates 'O' explicitly does not produce a duplicate entry
    # (a duplicate would make the label <-> index mappings disagree).
    labels_set.discard('O')
    return ['O'] + sorted(labels_set)

# Build the ordered label vocabulary, then derive both lookup directions from it
labels = extract_labels(data)
label_to_index = {name: position for position, name in enumerate(labels)}
index_to_label = dict(enumerate(labels))


## Display Labels and Mappings

In [4]:

# Show the label vocabulary and both lookup tables, one per line
for caption, value in (
    ("Labels:", labels),
    ("Label to Index Mapping:", label_to_index),
    ("Index to Label Mapping:", index_to_label),
):
    print(caption, value)


Labels: ['O', 'LOC', 'MISC', 'ORG', 'PER']
Label to Index Mapping: {'O': 0, 'LOC': 1, 'MISC': 2, 'ORG': 3, 'PER': 4}
Index to Label Mapping: {0: 'O', 1: 'LOC', 2: 'MISC', 3: 'ORG', 4: 'PER'}


## Prepare Dataset

In [5]:

# Function to convert data into model-ready format
def convert_to_model_format(data, label_to_index):
    """Convert character-span annotations into word-level tokens and tag ids.

    Each record's text is split on whitespace. Words inside an annotated span
    receive that span's label id; every other word receives the id of ``'O'``.

    Args:
        data: Iterable of records with a ``'text'`` string and a ``'labels'``
            list of ``(start, end, label)`` character spans. The records are
            not modified.
        label_to_index: Mapping from label name to integer id. It must contain
            ``'O'`` and every label that is used by a processed span.

    Returns:
        A ``(token_list, ner_tag_list)`` pair of parallel lists holding, per
        record, the list of word tokens and the list of label ids.

    Raises:
        KeyError: If ``'O'`` or a processed span's label is missing from
            ``label_to_index``.
    """
    token_list = []
    ner_tag_list = []

    for item in data:
        text = item['text']
        outside_id = label_to_index['O']
        # sorted() builds a new list, so the caller's annotations keep their
        # original order (list.sort() would reorder them in place).
        annotations = sorted(item['labels'], key=lambda span: span[0])
        tokens = []
        labels = []
        last_end = 0

        for start, end, label in annotations:
            # A span that lies entirely inside text already consumed (nested
            # or duplicate annotation) would emit the same words twice.
            if end <= last_end:
                continue
            # A partially overlapping span only contributes its unseen tail;
            # the earlier span keeps the shared region.
            start = max(start, last_end)

            # Words between the previous entity and this one are 'O'.
            # str.split() with no arguments already ignores surrounding
            # whitespace, so no strip() is needed.
            gap_words = text[last_end:start].split()
            tokens.extend(gap_words)
            labels.extend([outside_id] * len(gap_words))

            # Words of the entity itself all carry the entity's label id.
            entity_words = text[start:end].split()
            tokens.extend(entity_words)
            labels.extend([label_to_index[label]] * len(entity_words))

            # max() keeps the cursor from moving backwards on an inverted
            # span (end < start), which would re-emit the gap text.
            last_end = max(start, end)

        # Remaining words after the last entity are 'O'.
        tail_words = text[last_end:].split()
        tokens.extend(tail_words)
        labels.extend([outside_id] * len(tail_words))

        token_list.append(tokens)
        ner_tag_list.append(labels)

    return token_list, ner_tag_list

# Turn every annotated record into parallel word-token / tag-id lists
tokens, tags = convert_to_model_format(data=data, label_to_index=label_to_index)


## Model Setup and Training

In [6]:

# Load a pre-trained model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    # Store the label names in the model config so saved checkpoints report
    # entity names instead of generic LABEL_<n> placeholders.
    id2label=index_to_label,
    label2id=label_to_index,
)


def tokenize_and_align_labels(batch):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    The first sub-token of every word keeps the word's tag id. Special tokens
    and the remaining sub-tokens of a word are set to -100 so the loss ignores
    them.

    Args:
        batch: Mapping with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of tag-id lists), as produced by a batched
            ``Dataset.map`` call.

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)
    aligned_labels = []
    for row, word_tags in enumerate(batch['ner_tags']):
        previous_word = None
        row_labels = []
        for word_index in encoded.word_ids(batch_index=row):
            if word_index is None or word_index == previous_word:
                row_labels.append(-100)
            else:
                row_labels.append(word_tags[word_index])
            previous_word = word_index
        aligned_labels.append(row_labels)
    encoded['labels'] = aligned_labels
    return encoded


# load_dataset() resolves a dataset by name or path; it cannot wrap in-memory
# Python lists. Build the dataset directly from the converted tokens and tags.
ner_dataset_raw = Dataset.from_dict({'tokens': tokens, 'ner_tags': tags})
ner_dataset_encoded = ner_dataset_raw.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['tokens', 'ner_tags'],
)

# Hold out a fixed 10% for evaluation instead of evaluating on the training
# data; the seed keeps the split reproducible across kernel restarts.
ner_splits = ner_dataset_encoded.train_test_split(test_size=0.1, seed=42)

# Pads input ids and labels to the longest sequence in each batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

# Define a simple Trainer
# NOTE: with PyTorch, Trainer requires `accelerate>=0.21.0` to be installed in
# the kernel environment.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ner_splits['train'],
    eval_dataset=ner_splits['test'],
    data_collator=data_collator,
)

# Run training
trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Define a function to predict the NER tags for new text
def predict_ner(text, model, tokenizer, label_to_index):
    """Predict a label for every tokenizer token of ``text``.

    Args:
        text: Input string to tag.
        model: Token-classification model. It is called with the tokenizer
            output as keyword arguments and must return an object exposing
            ``logits``. The model is switched to evaluation mode.
        tokenizer: Tokenizer used to encode ``text`` and to map ids back to
            token strings.
        label_to_index: Mapping from label name to class index; it is inverted
            to translate predicted class indices into label names.

    Returns:
        A list of ``(token, label)`` pairs for the first (only) sequence, one
        pair per tokenizer token, including any special tokens the tokenizer
        inserts.
    """
    # Invert the mapping that was passed in rather than relying on a
    # notebook-level `index_to_label` global.
    index_to_label = {index: label for label, index in label_to_index.items()}

    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move the tensors to the model's device (the model may be on a GPU after
    # training while the tokenizer output is created on the CPU).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Get model predictions; eval mode disables dropout and no_grad avoids
    # building an autograd graph during inference.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)

    # Convert predictions to labels
    predicted_label_indices = predictions[0].tolist()  # [0] selects the first sentence
    predicted_labels = [index_to_label[idx] for idx in predicted_label_indices]

    # Recover the token strings so each one can be paired with its label
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    return list(zip(tokens, predicted_labels))

# Sample sentence used as a smoke test for the tagger
test_text = "Example sentence to be processed by the NER model."

# Tag the sample sentence with the model and tokenizer loaded above
predicted_ner_tags = predict_ner(
    text=test_text,
    model=model,
    tokenizer=tokenizer,
    label_to_index=label_to_index,
)

# Show the (token, label) pairs
print(predicted_ner_tags)
