In [1]:
!pip install -q transformers torch pandas tqdm scikit-learn

In [2]:
import torch
import warnings
import pandas as pd
from tqdm import tqdm
from transformers import logging
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForTokenClassification

warnings.filterwarnings("ignore")
logging.set_verbosity_error()


# Dataset class to load and process the mountain named entity recognition data
class MountainNERDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = {
            'O': 0,
            'B-LOCATION': 1,
            'I-LOCATION': 2,
            '<s>': 3,
            '</s>': 4
        }

    def __len__(self):
        """
        Return the number of samples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieve and process a specific sample from the dataset by index.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            dict: Processed input IDs, attention mask, and label tensor for the sample.
        """
        sentence = self.data.iloc[idx]['Sentence']
        labels = self.data.iloc[idx]['Labels'].split()
        label_ids = [self.label_map[label] for label in labels]
        encoding = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        padded_labels = torch.tensor(label_ids + [0] * (self.max_length - len(label_ids)))
        padded_labels = padded_labels[:self.max_length]
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': padded_labels
        }


def train_epoch(model, dataloader, optimizer, device):
    """
    Train the model for one epoch on the given dataloader.

    Args:
        model (RobertaForTokenClassification): The token classification model.
        dataloader (DataLoader): DataLoader for training data.
        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
        device (torch.device): Device (CPU or GPU) for computation.

    Returns:
        float: Average training loss for the epoch.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc='Training'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)


# Function for evaluating the model's performance
def evaluate(model, dataloader, device):
    """
    Evaluate the model's performance on the given dataloader.

    Args:
        model (RobertaForTokenClassification): The token classification model.
        dataloader (DataLoader): DataLoader for validation data.
        device (torch.device): Device (CPU or GPU) for computation.

    Returns:
        tuple[float, str]: Validation accuracy and detailed classification report.
    """
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2)
            for pred, label, mask in zip(predictions, labels, attention_mask):
                valid_mask = mask.bool()
                all_predictions.extend(pred[valid_mask].cpu().numpy())
                all_labels.extend(label[valid_mask].cpu().numpy())
    accuracy = accuracy_score(all_labels, all_predictions)
    report = classification_report(all_labels, all_predictions)
    return accuracy, report


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    dataset = 'mountain_sentences_dataset.csv'
    num_epochs = 35

    df = pd.read_csv(dataset)
    model_name = 'roberta-base'
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForTokenClassification.from_pretrained(model_name, num_labels=5).to(device)
    dataset = MountainNERDataset(df, tokenizer)

    train_size = int(0.7 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    best_accuracy = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        train_loss = train_epoch(model, train_dataloader, optimizer, device)
        print(f"Training loss: {train_loss:.4f}")
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation accuracy: {accuracy:.4f}")
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_model_weights.bin')
            print("Best model saved!")


if __name__ == "__main__":
    main()


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Epoch 1/35


Training: 100%|██████████| 14/14 [00:05<00:00,  2.48it/s]


Training loss: 0.4154


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.51it/s]


Validation accuracy: 0.7995
Best model saved!

Epoch 2/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.32it/s]


Training loss: 0.0868


Evaluation: 100%|██████████| 6/6 [00:00<00:00, 10.52it/s]


Validation accuracy: 0.9033
Best model saved!

Epoch 3/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.30it/s]


Training loss: 0.0496


Evaluation: 100%|██████████| 6/6 [00:00<00:00, 10.20it/s]


Validation accuracy: 0.9300
Best model saved!

Epoch 4/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.28it/s]


Training loss: 0.0285


Evaluation: 100%|██████████| 6/6 [00:00<00:00, 10.24it/s]


Validation accuracy: 0.9683
Best model saved!

Epoch 5/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.27it/s]


Training loss: 0.0139


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.85it/s]


Validation accuracy: 0.9746
Best model saved!

Epoch 6/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.19it/s]


Training loss: 0.0049


Evaluation: 100%|██████████| 6/6 [00:00<00:00, 10.20it/s]


Validation accuracy: 0.9850
Best model saved!

Epoch 7/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.18it/s]


Training loss: 0.0021


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.98it/s]


Validation accuracy: 0.9821

Epoch 8/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.24it/s]


Training loss: 0.0029


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.95it/s]


Validation accuracy: 0.9825

Epoch 9/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.22it/s]


Training loss: 0.0051


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.77it/s]


Validation accuracy: 0.9808

Epoch 10/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.18it/s]


Training loss: 0.0024


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.92it/s]


Validation accuracy: 0.9854
Best model saved!

Epoch 11/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.17it/s]


Training loss: 0.0013


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.93it/s]


Validation accuracy: 0.9867
Best model saved!

Epoch 12/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.11it/s]


Training loss: 0.0009


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.84it/s]


Validation accuracy: 0.9867

Epoch 13/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.11it/s]


Training loss: 0.0006


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.90it/s]


Validation accuracy: 0.9846

Epoch 14/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.13it/s]


Training loss: 0.0006


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.71it/s]


Validation accuracy: 0.9867

Epoch 15/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.13it/s]


Training loss: 0.0004


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.56it/s]


Validation accuracy: 0.9858

Epoch 16/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.12it/s]


Training loss: 0.0003


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.73it/s]


Validation accuracy: 0.9858

Epoch 17/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.09it/s]


Training loss: 0.0003


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.64it/s]


Validation accuracy: 0.9867

Epoch 18/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.07it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.38it/s]


Validation accuracy: 0.9854

Epoch 19/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.06it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.55it/s]


Validation accuracy: 0.9854

Epoch 20/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.04it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.65it/s]


Validation accuracy: 0.9854

Epoch 21/35


Training: 100%|██████████| 14/14 [00:04<00:00,  3.01it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.38it/s]


Validation accuracy: 0.9850

Epoch 22/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.99it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.25it/s]


Validation accuracy: 0.9862

Epoch 23/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.97it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.11it/s]


Validation accuracy: 0.9837

Epoch 24/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.96it/s]


Training loss: 0.0013


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.17it/s]


Validation accuracy: 0.9875
Best model saved!

Epoch 25/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.97it/s]


Training loss: 0.0112


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.02it/s]


Validation accuracy: 0.9850

Epoch 26/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.94it/s]


Training loss: 0.0040


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.16it/s]


Validation accuracy: 0.9837

Epoch 27/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.94it/s]


Training loss: 0.0008


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  9.09it/s]


Validation accuracy: 0.9833

Epoch 28/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.90it/s]


Training loss: 0.0010


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.80it/s]


Validation accuracy: 0.9837

Epoch 29/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.92it/s]


Training loss: 0.0008


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.87it/s]


Validation accuracy: 0.9833

Epoch 30/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.89it/s]


Training loss: 0.0006


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.92it/s]


Validation accuracy: 0.9850

Epoch 31/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.88it/s]


Training loss: 0.0004


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.80it/s]


Validation accuracy: 0.9833

Epoch 32/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.87it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.84it/s]


Validation accuracy: 0.9858

Epoch 33/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.83it/s]


Training loss: 0.0002


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.69it/s]


Validation accuracy: 0.9850

Epoch 34/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.84it/s]


Training loss: 0.0005


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.85it/s]


Validation accuracy: 0.9871

Epoch 35/35


Training: 100%|██████████| 14/14 [00:04<00:00,  2.86it/s]


Training loss: 0.0001


Evaluation: 100%|██████████| 6/6 [00:00<00:00,  8.65it/s]


Validation accuracy: 0.9867


In [4]:
import torch
import warnings
from transformers import logging
from transformers import RobertaTokenizer, RobertaForTokenClassification

warnings.filterwarnings("ignore")
logging.set_verbosity_error()



def predict_labels(model, tokenizer, sentences, label_map_inv, device):
    """
    Predict labels for input sentences using a fine-tuned RoBERTa token classification model.
    It identifies mountain names  as entities and groups them based on 'B-LOCATION' and 'I-LOCATION' labels.

    Args:
        model (RobertaForTokenClassification): The fine-tuned token classification model.
        tokenizer (RobertaTokenizer): Tokenizer corresponding to the RoBERTa model.
        sentences (list[str]): List of input sentences for prediction.
        label_map_inv (dict): Mapping of label IDs to label names.
        device (torch.device): Device (CPU or GPU) for computation.

    Returns:
        list[tuple[str, list[str], list[str], list[str]]]: A list of tuples containing:
            - Sentence (str): The input sentence.
            - Tokens (list[str]): Tokens from the sentence.
            - Labels (list[str]): Predicted labels for the tokens.
            - Mountains (list[str]): Detected mountain names in the sentence.
    """

    model.eval()
    predictions = []

    with torch.no_grad():
        for sentence in sentences:

            inputs = tokenizer(
                sentence,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ).to(device)

            outputs = model(**inputs)
            pred_labels = torch.argmax(outputs.logits, dim=2)[0]

            valid_tokens = inputs['input_ids'][0][inputs['attention_mask'][0] == 1]
            tokens = tokenizer.convert_ids_to_tokens(valid_tokens)
            labels = [label_map_inv[label.item()] for label in pred_labels[inputs['attention_mask'][0] == 1]]

            mountains = []
            current = []

            for token, label in zip(tokens, labels):
                if label == 'B-LOCATION':
                    if current:
                        mountains.append(''.join(current).replace('Ġ', ' ').strip())
                    current = [token]
                elif label == 'I-LOCATION' and current:
                    current.append(token)
                elif current:
                    mountains.append(''.join(current).replace('Ġ', ' ').strip())
                    current = []

            if current:
                mountains.append(''.join(current).replace('Ġ', ' ').strip())

            predictions.append((sentence, tokens, labels, mountains))

    return predictions


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    test_data = 'sentences.txt'
    preloaded_model = 'best_model_weights.bin'

    label_map_inv = {0: 'O', 1: 'B-LOCATION', 2: 'I-LOCATION', 3: '<s>', 4: '</s>'}
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=5)
    if device == torch.device('cpu'):
        model.load_state_dict(torch.load(preloaded_model, weights_only=True, map_location=torch.device('cpu')))
    elif device == torch.device('cuda'):
        model.load_state_dict(torch.load(preloaded_model, weights_only=True))

    model.to(device)

    with open(test_data, 'r') as file:
        test_sentences = [line.strip() for line in file.readlines()]

    predictions = predict_labels(model, tokenizer, test_sentences, label_map_inv, device)

    for sentence, tokens, labels, mountains in predictions:
        print(f"\nSentence: {sentence}")
        print(f"Tokens: {tokens}")
        print(f"Labels: {labels}")
        print(f"Detected mountains: {mountains}")


if __name__ == "__main__":
    main()


Using device: cuda

Sentence: The weather conditions on Mount Washington are notoriously harsh.
Tokens: ['<s>', 'The', 'Ġweather', 'Ġconditions', 'Ġon', 'ĠMount', 'ĠWashington', 'Ġare', 'Ġnotoriously', 'Ġharsh', '.', '</s>']
Labels: ['<s>', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', '</s>']
Detected mountains: ['Washington']

Sentence: In Iraq, Halgurd stands as the highest peak, drawing adventurers from around the region.
Tokens: ['<s>', 'In', 'ĠIraq', ',', 'ĠHal', 'g', 'urd', 'Ġstands', 'Ġas', 'Ġthe', 'Ġhighest', 'Ġpeak', ',', 'Ġdrawing', 'Ġadventurers', 'Ġfrom', 'Ġaround', 'Ġthe', 'Ġregion', '.', '</s>']
Labels: ['<s>', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '</s>']
Detected mountains: ['Halgurd']

Sentence: Vietnam's Fansipan, known as the 'Roof of Indochina,' is a popular trekking destination.
Tokens: ['<s>', 'V', 'iet', 'nam', "'s", 'ĠFans', 'ip', 'an', ',', 'Ġknown', 'Ġas', 'Ġthe',