In [1]:
import pandas as pd
import torch

from scripts.inference_utils import load_model, predict

In [2]:
# Read the evaluation set from disk; the cells below use its "sentence" and "labels" columns
test_data = pd.read_csv("data/test_dataset.csv")

# One raw sentence string per row, in file order
test_sentences = test_data["sentence"].tolist()

# Each row stores its labels as one whitespace-separated string; turn that into a list per sentence.
# pandas reads a blank CSV cell as NaN (a float, which has no .split()), so a row without
# any labels is mapped to an empty list instead of raising AttributeError.
test_labels = [
    [] if pd.isna(labels) else labels.split()
    for labels in test_data["labels"]
]

In [3]:
# Where the saved model and its tokenizer live on disk
# (point these at your own directories if they are stored elsewhere)
model_path, tokenizer_path = './model', './tokenizer'

# load_model comes from scripts/inference_utils.py and hands back
# the model together with its tokenizer
model, tokenizer = load_model(model_path, tokenizer_path)

In [4]:
# Prefer the GPU when CUDA is usable, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model onto the chosen device. Leaving the call as the cell's last
# expression makes the notebook print the model architecture below.
# NOTE(review): the model contains Dropout layers and no cell shown here calls
# model.eval() - confirm load_model/predict put the model in inference mode.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [5]:
# Query the model once per test sentence; every call yields the tokens it flagged for that sentence
predictions = [
    predict(text, model, tokenizer, device)
    for text in test_sentences
]

# Store one readable string per row in the dataframe: the flagged tokens joined
# by single spaces, or a fixed placeholder when nothing was flagged
test_data['predicted_mountains'] = [
    ' '.join(tokens) if tokens else "No mountain detected"
    for tokens in predictions
]

# Preview the first rows: input sentence next to its prediction
test_data.loc[:, ['sentence', 'predicted_mountains']].head()

Unnamed: 0,sentence,predicted_mountains
0,Mount Everest is the tallest mountain.,Everest
1,I hiked in the Alps and the Pyrenees last summer.,Alps Pyrenees
2,Kilimanjaro is in Tanzania.,Kilimanjaro
3,This sentence does not mention a mountain.,No mountain detected
4,"Denali, also known as Mount McKinley, is in Al...",Denali McKinley
5,The Andes are a stunning mountain range in Sou...,Andes SouthAmerica
6,I visited Table Mountain in South Africa last ...,Table SouthAfrica
7,Mount Fuji is an active volcano in Japan.,Fuji
8,Rocky Mountains extend across the United State...,Rocky States Canada
9,Mount Elbrus is the highest peak in Europe.,Elbrus
