# ⬅️ Importing libraries

In [None]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from transformers import BertTokenizer
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

# 📥 Loading dataset

In [None]:
# Location of the semicolon-delimited CSV holding the labelled sentences.
data_path = '/content/bert_ner_moountain_dataset.csv'

# `on_bad_lines="warn"` is the replacement for the deprecated
# `error_bad_lines=False`: rows with the wrong number of fields are skipped
# and each skipped row is reported instead of aborting the load.
df = pd.read_csv(data_path, delimiter=";", on_bad_lines="warn")



  df = pd.read_csv('/content/bert_ner_moountain_dataset.csv', delimiter=";",  error_bad_lines=False)
Skipping line 1726: expected 2 fields, saw 3



In [None]:
# Peek at five randomly chosen rows to check the columns and label values.
df.sample(n=5)

Unnamed: 0,label,sentence
2699,O,The Museum of Folk Architecture and Life in Lv...
1343,Mountain,"Gyala Peri, the highest peak in Southeast Asia..."
2966,O,"South Africa is a desire for an expedition, wi..."
2762,O,"Thailand is beckoning me for an expedition, wi..."
1619,O,The rooftop bar provided a stunning panorama o...


# ⬇️ Load model weights

In [None]:
# Tokenizer belonging to the same checkpoint name the model is built from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every sentence of the dataset in one call, padded/truncated and
# returned as PyTorch tensors.
tokenized_inputs = tokenizer(
    df["sentence"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Wrap the encoded sentences together with their numeric labels
# ("Mountain" -> 1, "O" -> 0).
# NOTE(review): `CustomDataset` is neither defined nor imported in the visible
# part of this notebook — confirm it is available on a fresh kernel, otherwise
# this cell raises NameError.
dataset = CustomDataset(
    tokenized_inputs,
    labels=df["label"].map({"Mountain": 1, "O": 0}).tolist(),
)

In [None]:
# BERT with a two-class classification head; the fine-tuned weights are loaded
# into it in a later cell.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# `map_location="cpu"` lets a checkpoint that was saved from a GPU be read on a
# CPU-only runtime; without it torch.load fails when CUDA is unavailable.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source.
model.load_state_dict(torch.load('/content/model.pth', map_location="cpu"))

<All keys matched successfully>

# 🎯 Accuracy

In [None]:
def accuracy(df, model, tokenizer):
    """
    Measure sentence-level classification accuracy of the model on df.

    Every sentence is tokenized and classified on its own. Class index 1 is
    read as "Mountain", any other index as "O", and the result is compared
    with the row's label.

    [df] - DataFrame with 'sentence' and 'label' columns
    [model] - BERT model that will predict on sentences
    [tokenizer] - tokenizer used for that model

    Prints the accuracy as a percentage with two decimals and returns that
    percentage as a float. An empty DataFrame gives 0.0 instead of raising
    ZeroDivisionError.
    """
    total = len(df)
    correct = 0

    # Walk the two columns side by side; this avoids building a Series object
    # for every row the way iterrows() does.
    for sentence, label in zip(df['sentence'], df['label']):
        tokenized_inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            predictions = torch.argmax(outputs.logits, dim=1)

        predicted_label = "Mountain" if predictions.item() == 1 else "O"
        if predicted_label == label:
            correct += 1

    # Named `score` so the local value does not shadow this function's name;
    # the guard keeps an empty frame from dividing by zero.
    score = correct / total * 100 if total else 0.0
    print(f"Accuracy: {score:.2f}%")
    return score

# Evaluate the loaded model on the whole dataset. The function prints its own
# report, so the trailing semicolon keeps the cell from echoing anything else.
accuracy(df=df, model=model, tokenizer=tokenizer);

Accuracy: 85.12%


Before saying anything about accuracy, it is worth looking at the structure of the dataset and how the model was trained.

The dataset consists of `label` and `sentence` columns only, so each label applies to a whole sentence and nothing indicates which word in the sentence the tag refers to. That is why the model is trained to predict a single label for the entire sentence.

For the same reason, accuracy is measured in the same way: by predicting one label for the entire sentence.

This approach has a number of disadvantages, including low accuracy and the need for additional dataset rows that carry the label "O".

This approach was chosen by mistake. However, it is only an alternative version of the working code, and one of the earliest ones, so it still has a right to exist.

# ⚙️ Predictions

Unlike during training, at prediction time we can assign a label to each word of a sentence by classifying every word separately, as demonstrated in the `predict_sentence` function.

In [None]:
def predict_sentence(sentence_x, model, tokenizer):
    """
    Label every space-separated token of a sentence as "Mountain" or ".".

    The sentence is split on single spaces and each non-empty token is
    classified as a separate input, all in one batch. Class index 1 maps to
    "Mountain", anything else to ".".

    [sentence_x] - String sentence to predict
    [model] - BERT model that will predict on sentence
    [tokenizer] - tokenizer used for that model

    Returns a list with one label per element of sentence_x.split(" "), in
    the same order, so it can be zipped with that split. Empty or
    whitespace-only tokens (from an empty string, doubled or trailing spaces)
    are always labelled "." and are never sent to the model.
    """
    words = sentence_x.split(" ")
    predicted_labels = ["."] * len(words)

    # Positions that actually contain text; an empty token has nothing to
    # classify, so it keeps the default "." label.
    filled = [pos for pos, word in enumerate(words) if word.strip()]
    if not filled:
        return predicted_labels

    tokenized_inputs = tokenizer(
        [words[pos] for pos in filled],
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**tokenized_inputs)
        predictions = torch.argmax(outputs.logits, dim=1)

    # argmax yields class indices, so compare against the class id directly
    # rather than thresholding as if it were a probability.
    for pos, pred in zip(filled, predictions.tolist()):
        if pred == 1:
            predicted_labels[pos] = "Mountain"
    return predicted_labels


# Sample sentences used to demonstrate per-word prediction.
sentences_to_predict = [
    "Some sentence about cool mountain called Hoverla!",
    "Quantum has a cool task to make NER model that recognizes mountains!",
    "Especially Everest"
]

# Print each sentence on its own line as "word(label) word(label) ...".
for sentence_x in sentences_to_predict:
  predicted_labels = predict_sentence(sentence_x, model, tokenizer)
  tagged_words = zip(sentence_x.split(" "), predicted_labels)
  print("".join(f"{word}({tag}) " for word, tag in tagged_words))

Some(.) sentence(.) about(.) cool(.) mountain(Mountain) called(.) Hoverla!(Mountain) 
Quantum(.) has(.) a(Mountain) cool(.) task(.) to(Mountain) make(.) NER(Mountain) model(.) that(.) recognizes(Mountain) mountains!(Mountain) (Mountain) 


# ⚔️ Try by yourself!

In [None]:
# Put your own sentence here and run the cell.
your_sentence = ""

predicted_labels = predict_sentence(your_sentence, model, tokenizer)
# The loop must sit at top level: indenting it after a plain statement is an
# IndentationError.
for word, label in zip(your_sentence.split(" "), predicted_labels):
    print(f"{word}({label}) ", end="")
# Finish the output line, as the demo loop does.
print("")