In [17]:
from transformers import BertTokenizerFast, BertForTokenClassification
import torch

In [18]:
# Fetch the pre-trained tokenizer and token-classification model from Hugging Face
tokenizer = BertTokenizerFast.from_pretrained("Darebal/mountain-names-ner")
model = BertForTokenClassification.from_pretrained("Darebal/mountain-names-ner")

# Label index -> human-readable BIO tag, used to decode the model's predictions.
# NOTE(review): assumed to match the label order used in training — confirm
# against model.config.id2label.
id2tag = {
    0: 'O',
    1: 'B-MOUNTAIN',
    2: 'I-MOUNTAIN',
}


The model classifies each token into one of 3 classes: <br>
O - Not part of a mountain name <br>
B-MOUNTAIN - Beginning of a mountain name, or a single-token mountain name <br>
I-MOUNTAIN - Subsequent tokens (words or subwords) of a mountain name <br>
Some words are split into subwords, e.g. Denali into 'den' and '##ali', which are labeled as <br>
Token - den, label - B-MOUNTAIN <br>
Token - ##ali, label - I-MOUNTAIN <br>
So the model correctly classifies Denali as a mountain, but the name is split into 2 subwords that are tagged as beginning and subsequent <br>
This is because the tokenizer does not have these words in its vocabulary, so it is expected that they are split <br>
I could have added the mountain names to the tokenizer vocabulary, but I decided not to, because adding them could have caused problems: the pre-trained embeddings would have been changed, so the model would have needed more training

In [19]:
# Example inputs for the mountain-name tagger, one sentence per entry.
sentences = [
    "The highest peak in the world is Everest.",
    "Kilimanjaro is a dormant volcano in Tanzania.",
    "Fuji is a famous symbol of Japan.",
    "The Appalachian stretch across the eastern United States.",
    "Denali, also known as Mount McKinley, is the tallest mountain in North America.",
    "Hikers often visit the Andes in South America.",
    "I saw a beautiful sunrise over the Rockies last summer.",
    "The Himalayas are home to many of the world's tallest peaks.",
    "Elbrus is considered the highest point in Europe.",
    "The Matterhorn is one of the most famous mountains in the Alps.",
    "The Everest is the tallest mountain in the world.",
    (
        "The Sierra de los Cuchumatanes is a mountain range in Guatemala, "
        "known for its stunning landscapes and rich cultural heritage."
    ),
]


def make_inference(model, tokenizer, sentences, id2tag):
    """
    Print the predicted tag of every token in each sentence.

    Each sentence is tokenized and passed through the model on its own. For
    every token position the highest-scoring label index is mapped through
    ``id2tag`` and printed as ``Token - <token>, label - <tag>``. The first
    and last positions of each sequence are not printed (presumably the
    tokenizer's [CLS]/[SEP] special tokens — verify for other tokenizers).

    Args:
        model: The pre-trained BERT model for token classification. It is
            switched to evaluation mode via ``model.eval()``.
        tokenizer: The tokenizer corresponding to the model.
        sentences: A list of sentences to perform inference on. A single
            string is accepted as well and treated as one sentence.
        id2tag: A dictionary mapping model output label IDs to human-readable tags.

    Returns:
        None. The function prints out the token and its predicted label for each sentence.

    Raises:
        KeyError: If a printed position is predicted with a label ID that is
            missing from ``id2tag``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(sentences, str):
        sentences = [sentences]

    model.eval()
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        # Select the single batch row explicitly. An argument-less squeeze()
        # would also collapse a length-1 sequence dimension and turn the
        # lists below into scalars.
        token_ids = inputs['input_ids'][0].tolist()
        predictions = outputs.logits.argmax(dim=-1)[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(token_ids)

        # Drop the first and last positions before mapping to tags, so only
        # the tokens that are actually printed need an entry in id2tag.
        for token, label_id in list(zip(tokens, predictions))[1:-1]:
            print(f'Token - {token}, label - {id2tag[label_id]}')
        print('\n')

# Run the tagger over every example sentence; the per-token labels are printed, not returned.
make_inference(model, tokenizer, sentences, id2tag)

Token - the, label - O
Token - highest, label - O
Token - peak, label - O
Token - in, label - O
Token - the, label - O
Token - world, label - O
Token - is, label - O
Token - everest, label - B-MOUNTAIN
Token - ., label - O


Token - ki, label - B-MOUNTAIN
Token - ##lim, label - I-MOUNTAIN
Token - ##an, label - I-MOUNTAIN
Token - ##jar, label - I-MOUNTAIN
Token - ##o, label - I-MOUNTAIN
Token - is, label - O
Token - a, label - O
Token - dormant, label - O
Token - volcano, label - O
Token - in, label - O
Token - tanzania, label - O
Token - ., label - O


Token - fuji, label - B-MOUNTAIN
Token - is, label - O
Token - a, label - O
Token - famous, label - O
Token - symbol, label - O
Token - of, label - O
Token - japan, label - O
Token - ., label - O


Token - the, label - O
Token - appalachian, label - B-MOUNTAIN
Token - stretch, label - O
Token - across, label - O
Token - the, label - O
Token - eastern, label - O
Token - united, label - O
Token - states, label - O
Token - ., label - O


To

Inference was run on validation sentences and on randomly generated sentences <br>
The results show that the model distinguishes mountain names from other names well, including mountain names that are made up of several words <br>
However, it did not recognize one mountain, Mount McKinley, possibly due to imperfect training <br>