In [10]:
!pip install transformers -q

## Imports

In [25]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from transformers import BertTokenizer
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

In [13]:
data_path = '/content/bert_ner_moountain_dataset.csv'
df = pd.read_csv(data_path)

In [14]:
df.sample(5)

Unnamed: 0,sentence,label
1,I went hiking in the beautiful mountain range.,Mountain
77,Rock climbing is an adventurous sport.,Mountain
20,The train passed through a tunnel in the mount...,Mountain
61,Mountain villages often have a close-knit comm...,Mountain
7,I prefer beaches over mountains.,


In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs = tokenizer(df["sentence"].tolist(), padding=True, truncation=True, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
class CustomDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_inputs["input_ids"][idx],
            "attention_mask": self.tokenized_inputs["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx])
        }

In [18]:
dataset = CustomDataset(tokenized_inputs, labels=df["label"].map({"Mountain": 1, "None": 0}).tolist())

In [22]:
dataset[0]

{'input_ids': tensor([ 101, 1996, 4164, 2001, 2218, 1999, 1037, 2312, 3346, 1012,  102,    0,
            0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
 'labels': tensor(0)}

In [24]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def train(num_epochs=3):
  """
  [num_epochs] 2-4 is recomended for fine-tuning bert
  """
  dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

  optimizer = optim.AdamW(model.parameters(), lr=2e-5)
  criterion = torch.nn.CrossEntropyLoss()

  for epoch in range(num_epochs):
      model.train()
      total_loss = 0

      for batch in dataloader:
          optimizer.zero_grad()

          inputs = batch["input_ids"]
          attention_mask = batch["attention_mask"]
          labels = batch["labels"]

          outputs = model(inputs, attention_mask=attention_mask)
          loss = criterion(outputs.logits, labels)
          total_loss += loss.item()

          loss.backward()
          optimizer.step()

      average_loss = total_loss / len(dataloader)
      print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {average_loss}")

In [31]:
train()

Epoch 1/3 - Average Loss: 0.6794589161872864
Epoch 2/3 - Average Loss: 0.6117525547742844
Epoch 3/3 - Average Loss: 0.5206429734826088


In [32]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [33]:
def validation_run():
  correct_predictions = 0
  total_samples = 0
  # WARNING, RUNNING ON TRAINING DATASET FOR TEST PURPOSES
  dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

  with torch.no_grad():
      for batch in dataloader:
          inputs = batch["input_ids"]
          attention_mask = batch["attention_mask"]
          labels = batch["labels"]

          outputs = model(inputs, attention_mask=attention_mask)
          predictions = torch.argmax(outputs.logits, dim=1)

          correct_predictions += (predictions == labels).sum().item()
          total_samples += labels.size(0)

  accuracy = correct_predictions / total_samples
  print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 91.67%


In [79]:

def predict_sentences(sentence_x, model, tokenizer):
    sentence= []
    for word in sentence_x.split(" "):
        tokenized_inputs = tokenizer(word, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            predicted_labels = ["Mountain" if pred >= 0.5 else "None" for pred in predictions.tolist()]
        sentence += predicted_labels

    return sentence[1:][:-1]


sentences_to_predict = [
    "Next month im going on expedition to Ukraine",
    "Ukraine, Germany, England, London, will be at work at 5, Pacific OCean",
    "I love everest and alps, the're beautiful!"
]

for sentence_x in sentences_to_predict:
  predicted_labels = predict_sentences(sentence_x, model, tokenizer)
  print(f"sentence_x: {sentence_x}\n labels: {predicted_labels}\n")


sentence_x: Next month im going on expedition to Ukraine
 labels: ['None', 'None', 'None', 'None', 'Mountain', 'None']

sentence_x: Ukraine, Germany, England, London, will be at work at 5, Pacific OCean
 labels: ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None']

sentence_x: I love everest and alps, the're beautiful!
 labels: ['None', 'Mountain', 'None', 'Mountain', 'None']

