In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Directory both the model and the tokenizer are loaded from
# (per the original note: where trainer.save_model wrote them)
model_path = "ner_model"

# Inference is pinned to the CPU
device = torch.device("cpu")

# Restore the tokenizer and the token-classification model from disk
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Place the model on the chosen device and switch it to evaluation mode
model.to(device)
model.eval()

# Read CoNLL-formatted data
def read_conll_file(file_path):
    """Parse a CoNLL-style file into sentences of whitespace-split rows.

    Each non-blank line is one token row; its whitespace-separated columns
    are kept as a list of strings. One or more blank (or whitespace-only)
    lines end the current sentence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentences. Each sentence is a list of rows and each row
        is the list of column strings from one line. An empty file yields
        an empty list.
    """
    data = []
    current_sentence = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if columns:
                current_sentence.append(columns)
            elif current_sentence:
                # Blank line: close the sentence. Runs of blank lines are
                # collapsed so no empty sentence or empty row is emitted.
                data.append(current_sentence)
                current_sentence = []
    # The last sentence may not be followed by a blank line.
    if current_sentence:
        data.append(current_sentence)
    return data

# Parse the training split into sentences of token rows
train_data = read_conll_file('resource/eng.train')

# Distinct values of the fourth column (index 3) across all token rows, sorted
label_list = sorted({row[3] for sent in train_data for row in sent})
# Map each label to its position in the sorted list
label_map = dict(zip(label_list, range(len(label_list))))

# Named Entity Recognition function
def extract_named_entities(sentence):
    """Return the decoded tokens of `sentence` that the model tags as entities.

    The sentence is tokenized with the module-level `tokenizer` and run
    through the module-level `model`; each token's prediction is the class
    id with the highest logit. Tokens are decoded one id at a time, so the
    result is a flat list of tokenizer pieces, not merged entity spans.

    Args:
        sentence: Text to tag.

    Returns:
        A list of strings, one per kept token, in input order. A token is
        kept when its predicted class id is neither 0 nor label_map['O']
        and it is not one of the tokenizer's special tokens.
    """
    # Tokenize input sentence and move the tensors to the model's device
    tokenized_input = tokenizer(sentence, return_tensors="pt").to(model.device)

    # Inference only: skip autograd bookkeeping for the forward pass
    with torch.no_grad():
        outputs = model(**tokenized_input)

    # First (only) batch item, converted to plain Python ints
    token_ids = tokenized_input["input_ids"][0].tolist()
    predicted_labels = outputs.logits.argmax(-1)[0].tolist()

    # NOTE(review): class id 0 is excluded in addition to label_map['O'],
    # exactly as before. With the sorted label_map, id 0 is not 'O', so this
    # may drop a real entity class -- confirm against the label mapping used
    # at training time before removing it.
    non_entity_ids = {0, label_map['O']}
    # Special tokens added by the tokenizer are never part of the input text
    special_ids = set(tokenizer.all_special_ids)

    return [
        tokenizer.decode([token_id])
        for token_id, label in zip(token_ids, predicted_labels)
        if label not in non_entity_ids and token_id not in special_ids
    ]

# Example: run the extractor on a single sample sentence
sentence = "John Smith graduated from MIT in 2010."

# Collect the tokens returned by extract_named_entities and print them
named_entities = extract_named_entities(sentence)
print("Named Entities - Example 1:", named_entities)

Named Entities - Example 1: ['John', 'Smith', 'MIT']


In [1]:
# Shell command: redirect the output of `conda list` into installed_modules.txt
!conda list > installed_modules.txt
