In [None]:
!pip install pytorch-crf datasets seqeval tqdm wandb GPUtil

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... done
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec

In [None]:
import logging
import os
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset as HFDataset
from huggingface_hub import HfFolder, HfApi
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import XLMRobertaTokenizerFast,XLMRobertaModel

from google.colab import drive
drive.mount('/content/drive')

# Module-wide logger; INFO level so progress messages show up in the notebook output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Tag vocabulary: a tag's position in this tuple is its integer class id
tag_to_id = {
    tag: class_id
    for class_id, tag in enumerate((
        'B-geo', 'O', 'B-gpe', 'B-per', 'I-per', 'B-tim',
        'B-org', 'I-org', 'B-art', 'I-art', 'I-tim',
        'B-eve', 'I-eve', 'I-geo', 'I-gpe', 'B-nat', 'I-nat',
    ))
}
# Reverse lookup: class id -> tag string
id_to_tag = dict(zip(tag_to_id.values(), tag_to_id.keys()))


Mounted at /content/drive


In [None]:
class NERDataset(Dataset):
    """Token-classification dataset that aligns word-level tags to tokenizer sub-tokens.

    Each item is one sentence, given as a list of words with a parallel list of
    tag strings. The sentence is tokenized with padding/truncation to
    ``max_len`` and every sub-token receives the tag id of the word it came
    from; positions that belong to no word (``word_ids()`` yields ``None``)
    are labelled ``-100``.

    Args:
        texts: Indexable collection of sentences, each a list of words.
        tags: Indexable collection of tag-string lists, parallel to ``texts``.
        tokenizer: Callable tokenizer accepting ``is_split_into_words=True``
            whose result exposes ``word_ids()`` and ``items()``.
        tag_to_id: Mapping from tag string to integer class id.
        max_len: Length every encoded sentence is padded/truncated to.
    """

    def __init__(self, texts, tags, tokenizer, tag_to_id, max_len):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.tag_to_id = tag_to_id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns:
            dict: the tokenizer's tensors with the leading batch dimension
            removed, plus ``'labels'`` holding one id per token position.

        Raises:
            Exception: any failure is logged with the item index and re-raised
            unchanged (e.g. ``KeyError`` for a tag missing from ``tag_to_id``).
        """
        try:
            words = self.texts[idx]
            tags = self.tags[idx]

            encoding = self.tokenizer(
                words,
                is_split_into_words=True,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='pt'
            )

            # Every sub-token inherits its word's tag; non-word positions get -100.
            label_ids = [
                -100 if word_idx is None else self.tag_to_id[tags[word_idx]]
                for word_idx in encoding.word_ids()
            ]

            # Drop only the batch dimension added by return_tensors='pt'. A bare
            # squeeze() would also collapse a length-1 sequence into a 0-d tensor
            # that no longer lines up with `labels`.
            item = {key: val.squeeze(0) for key, val in encoding.items()}
            item['labels'] = torch.tensor(label_ids, dtype=torch.long)

            return item
        except Exception as e:
            logger.error(f"Error processing item {idx}: {e}")
            raise

In [None]:
class XLMRobertaBiLSTM(nn.Module):
    """XLM-RoBERTa encoder followed by a bidirectional LSTM and a linear tag classifier.

    Args:
        num_labels: Number of output classes per token.
        dropout: Dropout probability applied after the encoder, after the
            LSTM, and between stacked LSTM layers.
        lstm_hidden_size: Hidden size of each LSTM direction; the classifier
            input is twice this value because the LSTM is bidirectional.
        lstm_num_layers: Number of stacked LSTM layers. The default of 2
            matches the layout of existing checkpoints.
    """

    def __init__(self, num_labels, dropout=0.1, lstm_hidden_size=256, lstm_num_layers=2):
        super().__init__()
        self.roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large')
        hidden_size = self.roberta.config.hidden_size

        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            bidirectional=True,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is meaningless
            # (and triggers a warning) for a single-layer LSTM.
            dropout=dropout if lstm_num_layers > 1 else 0.0
        )
        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask passed to the encoder.
            labels: Optional per-token class ids; positions equal to ``-100``
                are excluded from the loss.

        Returns:
            dict: ``{'logits': ...}``, plus ``'loss'`` when ``labels`` is not
            ``None``. The loss is the mean cross-entropy over labelled
            positions, or a zero that stays attached to the graph when no
            position is labelled.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # First element of the encoder output: the per-token hidden states.
        sequence_output = self.dropout(outputs[0])

        lstm_output, _ = self.lstm(sequence_output)
        lstm_output = self.dropout(lstm_output)

        logits = self.classifier(lstm_output)

        if labels is None:
            return {'logits': logits}

        flat_logits = logits.view(-1, logits.shape[-1])
        flat_labels = labels.view(-1)
        active = flat_labels != -100

        if bool(active.any()):
            loss = nn.CrossEntropyLoss()(flat_logits[active], flat_labels[active])
        else:
            # A mean over zero labelled positions would be NaN; return a zero
            # that still depends on the logits so backward() remains valid.
            loss = flat_logits.sum() * 0.0

        return {'loss': loss, 'logits': logits}


In [None]:
class ModelTester:
    """Load a fine-tuned ``XLMRobertaBiLSTM`` checkpoint from the Hugging Face Hub and tag sentences with it.

    Args:
        model_id: Hub repository id that contains ``pytorch_model.bin``.
        token: Access token forwarded to ``hf_hub_download``.
        tag_to_id: Mapping from tag string to integer class id.
        device: Device the weights and the inputs are moved to.
        tokenizer: Callable tokenizer that accepts pre-split words and whose
            output exposes ``word_ids(batch_index)``.
    """

    def __init__(self, model_id, token, tag_to_id, device, tokenizer):
        self.model_id = model_id
        self.token = token
        self.tag_to_id = tag_to_id
        self.id_to_tag = {v: k for k, v in tag_to_id.items()}
        self.device = device
        self.tokenizer = tokenizer
        self.model = self._load_model()

    def _load_model(self):
        """Build the network, download the checkpoint and load its weights.

        Returns:
            The model on ``self.device`` in eval mode.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            model = XLMRobertaBiLSTM(num_labels=len(self.tag_to_id))

            model_path = hf_hub_download(
                repo_id=self.model_id,
                filename="pytorch_model.bin",
                token=self.token
            )

            # weights_only=True restricts unpickling to tensors and plain
            # containers, so a downloaded file cannot execute arbitrary code.
            state_dict = torch.load(
                model_path, map_location=self.device, weights_only=True
            )
            model.load_state_dict(state_dict)

            model = model.to(self.device)
            model.eval()
            logger.info(f"Successfully loaded model from {self.model_id}")
            return model
        except Exception as e:
            logger.error(f"Error loading model from HuggingFace: {str(e)}")
            raise

    def predict_sentence(self, words):
        """Predict one tag and one confidence per word of a single sentence.

        A word's tag is taken from whichever of its sub-tokens has the highest
        softmax probability; that probability is the word's confidence. A word
        that ends up with no sub-token (for example because it was truncated
        away) gets ``'O'`` with confidence ``0.0`` while the remaining words
        keep their real predictions.

        Args:
            words: Sequence of words; items are converted with ``str`` before
                tokenization.

        Returns:
            tuple: ``(tags, confidences)``, two lists of ``len(words)`` items.
            If tokenization or inference raises, the error is logged and every
            word gets ``'O'`` / ``0.0``.
        """
        words = list(words)
        if not words:
            return [], []

        try:
            # Tokenize the sentence
            encoding = self.tokenizer(
                [str(word) for word in words],
                is_split_into_words=True,
                return_tensors='pt',
                padding=True,
                truncation=True
            )

            # Move to device
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Inference only: do not record an autograd graph.
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            probs = torch.nn.functional.softmax(logits, dim=-1)[0]
            predictions = torch.argmax(logits, dim=2)[0]

            # word index -> (class id, probability) of its most confident sub-token
            best_per_word = {}
            for idx, word_idx in enumerate(encoding.word_ids(0)):
                if word_idx is None:
                    continue
                prob = float(torch.max(probs[idx]))
                if word_idx not in best_per_word or prob > best_per_word[word_idx][1]:
                    best_per_word[word_idx] = (int(predictions[idx]), prob)

            if len(best_per_word) != len(words):
                logger.warning(f"Mismatch in sentence predictions: got {len(best_per_word)} predictions for {len(words)} words")

            word_predictions = []
            word_confidences = []
            for word_idx in range(len(words)):
                if word_idx in best_per_word:
                    pred_id, prob = best_per_word[word_idx]
                    word_predictions.append(self.id_to_tag[pred_id])
                    word_confidences.append(prob)
                else:
                    # No sub-token survived for this word; fall back for it alone.
                    word_predictions.append('O')
                    word_confidences.append(0.0)

            return word_predictions, word_confidences

        except Exception as e:
            logger.error(f"Error in predict_sentence: {str(e)}")
            return ['O'] * len(words), [0.0] * len(words)

    def predict_tags(self, test_loader, test_df):
        """Predict a tag for every row of ``test_df``.

        Rows are grouped into sentences by the ``'Sentence #'`` column and each
        sentence's ``'Word'`` values are tagged together. Results are written
        back by row position, so the returned lists follow the row order of
        ``test_df`` regardless of how the sentence ids sort.

        Args:
            test_loader: Unused; accepted for interface compatibility.
            test_df: DataFrame with ``'Sentence #'``, ``'Word'`` and ``'Tag'``
                columns.

        Returns:
            tuple: ``(predicted_tags, true_tags, confidences)``, each a list
            with one entry per row of ``test_df``.

        Raises:
            AssertionError: if some row received no prediction (for example a
                row whose ``'Sentence #'`` is missing).
        """
        logger.info("Starting prediction...")
        n_rows = len(test_df)
        all_predictions = [None] * n_rows
        all_confidences = [None] * n_rows

        words_column = test_df['Word'].tolist()
        # Integer row positions of each sentence. Writing results back through
        # these positions keeps predictions aligned with the rows even when the
        # sentence ids sort differently from the file order
        # (e.g. "Sentence: 10" sorts before "Sentence: 2").
        sentence_positions = test_df.groupby('Sentence #', sort=False).indices

        for positions in tqdm(sentence_positions.values(), desc="Processing sentences"):
            sentence_words = [words_column[pos] for pos in positions]
            predictions, confidences = self.predict_sentence(sentence_words)
            for pos, tag, confidence in zip(positions, predictions, confidences):
                all_predictions[pos] = tag
                all_confidences[pos] = confidence

        # Get true labels
        true_labels = test_df['Tag'].tolist()

        # Verify every row received a prediction
        n_predicted = sum(tag is not None for tag in all_predictions)
        assert n_predicted == n_rows, \
            f"Mismatch between predictions ({n_predicted}) and words ({n_rows})"

        # Log prediction distribution
        tag_counts = {}
        for tag in all_predictions:
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

        logger.info("\nPrediction distribution:")
        for tag, count in tag_counts.items():
            percentage = (count / len(all_predictions)) * 100
            logger.info(f"{tag}: {count} ({percentage:.2f}%)")

        return all_predictions, true_labels, all_confidences

In [None]:
# Evaluation script: tag the test set, save per-word predictions and write a metrics report.

# Hub repository holding the fine-tuned checkpoint, and the Drive folder with the data/results
MODEL_ID = "Debk/Ben_NER_xlm-roberta-large_BiLSTM"
DATA_DIR = '/content/drive/MyDrive/NER_Dataset'


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN (without a NumPy warning) when it is empty."""
    return float(np.mean(values)) if len(values) else float('nan')


# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

# Initialize tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large')

# Load test data; sort=False keeps sentences in file order
test_df = pd.read_csv(f'{DATA_DIR}/b-ner-test.csv')
sentence_groups = test_df.groupby('Sentence #', sort=False)
test_sentences = sentence_groups['Word'].apply(list).values
test_tags = sentence_groups['Tag'].apply(list).values
num_sentences = len(test_sentences)

# Create test dataset
test_dataset = NERDataset(test_sentences, test_tags, tokenizer, tag_to_id, max_len=400)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize tester with tokenizer.
# The access token is read from the HF_TOKEN environment variable instead of being
# hard-coded; when it is unset, None is passed and huggingface_hub falls back to its
# own stored credentials / anonymous access.
tester = ModelTester(
    model_id=MODEL_ID,
    token=os.environ.get("HF_TOKEN"),
    tag_to_id=tag_to_id,
    device=device,
    tokenizer=tokenizer
)
predicted_tags, true_tags, confidences = tester.predict_tags(test_loader, test_df)

# Add predictions and confidences to DataFrame
test_df['BiLSTM_Roberta_Pred'] = predicted_tags
test_df['confidence'] = confidences

# Save results with confidence
test_df.to_csv(f'{DATA_DIR}/BiLSTM_Roberta_result_test.csv', index=False)

# Calculate metrics. zero_division=0 yields the same values as the default for
# classes that are never predicted / never present, but without the warning.
accuracy = accuracy_score(true_tags, predicted_tags)
precision, recall, f1, _ = precision_recall_fscore_support(
    true_tags, predicted_tags, average='macro', zero_division=0
)

# Timestamp for the report
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Calculate confidence statistics, split by whether the prediction was right
is_correct = [pred == true for pred, true in zip(predicted_tags, true_tags)]
avg_confidence = _mean_or_nan(confidences)
avg_confidence_correct = _mean_or_nan(
    [conf for conf, ok in zip(confidences, is_correct) if ok]
)
avg_confidence_incorrect = _mean_or_nan(
    [conf for conf, ok in zip(confidences, is_correct) if not ok]
)

results_text = f"""
Model: XLM-RoBERTa-Large with BiLSTM
Model ID: {MODEL_ID}
Test Date: {current_time}
Number of Test Sentences: {num_sentences}

Metrics (Macro):
---------------
Accuracy: {accuracy:.4f}
Precision: {precision:.4f}
Recall: {recall:.4f}
F1 Score: {f1:.4f}

Confidence Analysis:
-------------------
Average Confidence: {avg_confidence:.4f}
Average Confidence (Correct Predictions): {avg_confidence_correct:.4f}
Average Confidence (Incorrect Predictions): {avg_confidence_incorrect:.4f}
"""

# Save metrics
with open(f'{DATA_DIR}/BiLSTM_Roberta_result_test.txt', 'w', encoding='utf-8') as f:
    f.write(results_text)

logger.info("Testing completed. Results saved to files.")
print(results_text)

  state_dict = torch.load(model_path, map_location=self.device)


Processing sentences:   0%|          | 0/33 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: XLM-RoBERTa-Large with BiLSTM
Model ID: Debk/Ben_NER_xlm-roberta-large_BiLSTM
Test Date: 2025-01-14 08:40:43
Number of Test Sentences: 33

Metrics (Macro):
---------------
Accuracy: 0.0067
Precision: 0.0005
Recall: 0.0769
F1 Score: 0.0010

Confidence Analysis:
-------------------
Average Confidence: 0.0674
Average Confidence (Correct Predictions): 0.0674
Average Confidence (Incorrect Predictions): 0.0674

