In [1]:
import numpy as np
import pandas as pd
import torch
import re
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, AdamW
from gensim.utils import tokenize

In [47]:
class PipelineNER:

    def __init__(self):
        self.tokenizer = None
        self.model = None
        self._setup()

    def _setup(self, model_name='roberta-base'):
        self.tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)
        self.model = RobertaForTokenClassification.from_pretrained(
            model_name, num_labels = 2,
            output_attentions = False, output_hidden_states = False)
        
    def _load_model(self, model_path: str):
        checkpoint = torch.load(model_path)
        # self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.load_state_dict(checkpoint)
        self.model.eval()
        print(f"Model loaded from {model_path}")

    def _preprocess_sentence(self, sentence: str):
        tokenized_inputs = self.tokenizer(
            sentence, padding="max_length", truncation=True,
            return_tensors="pt"
        )
        
        input_ids = tokenized_inputs["input_ids"]
        attention_masks = tokenized_inputs["attention_mask"]
        
        return input_ids, attention_masks
    
    def __call__(self, text: str):
        '''acquire prediction for text'''
        input_ids, attention_masks = self._preprocess_sentence(text)

        input_ids = input_ids.to(self.model.device)
        attention_masks = attention_masks.to(self.model.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_masks)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2).squeeze().cpu().numpy()
    
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.squeeze().cpu().numpy())
        labels = [self.model.config.id2label[label_id] for label_id in predictions]
        
        results = [{"token": token, "label": label} for token, label in zip(tokens, labels)]
        
        return results

In [48]:
model_path = '/kaggle/input/roberta-ner-mountain/pytorch/default/1/model_weights.pth'
pipeline = PipelineNER()
pipeline._load_model(model_path)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(model_path)


Model loaded from /kaggle/input/roberta-ner-mountain/pytorch/default/1/model_weights.pth


In [49]:
sample_sentence = "We are going to the Mount Aconcagua."
pipeline(sample_sentence)

[{'token': '<s>', 'label': 'LABEL_0'},
 {'token': 'ĠWe', 'label': 'LABEL_0'},
 {'token': 'Ġare', 'label': 'LABEL_0'},
 {'token': 'Ġgoing', 'label': 'LABEL_0'},
 {'token': 'Ġto', 'label': 'LABEL_0'},
 {'token': 'Ġthe', 'label': 'LABEL_1'},
 {'token': 'ĠMount', 'label': 'LABEL_1'},
 {'token': 'ĠA', 'label': 'LABEL_0'},
 {'token': 'con', 'label': 'LABEL_0'},
 {'token': 'c', 'label': 'LABEL_0'},
 {'token': 'ag', 'label': 'LABEL_0'},
 {'token': 'ua', 'label': 'LABEL_0'},
 {'token': '.', 'label': 'LABEL_0'},
 {'token': '</s>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 {'token': '<pad>', 'label': 'LABEL_0'},
 