In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm
import logging
from datetime import datetime

# Set up logging
# Configure the root logger at INFO level and create this notebook's logger;
# the `logger` name is used by the NERTester class for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Colab-specific: mount Google Drive at /content/drive so that the dataset and
# result paths under /content/drive/MyDrive used later in the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub using a token taken from the HF_TOKEN
# environment variable. A credential must never be written into the notebook
# itself, because notebooks (and their outputs) are routinely shared.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Without a token only public Hub repositories can be downloaded.
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

# Path of the training split on the mounted drive.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
file_path= '/content/drive/MyDrive/NER_Dataset/b-ner-train.csv'

# Mapping from NER tag name to integer id.
# NOTE(review): NERTester takes its label mapping from the model config, not
# from this dict - verify that both agree on the tag names.
tag_to_id = {
    'B-geo': 0, 'O': 1, 'B-gpe': 2, 'B-per': 3, 'I-per': 4, 'B-tim': 5,
    'B-org': 6, 'I-org': 7, 'B-art': 8, 'I-art': 9, 'I-tim': 10,
    'B-eve': 11, 'I-eve': 12, 'I-geo': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16
}

In [4]:
class NERTester:
    """Evaluate a token-classification (NER) model on a word-per-row CSV file.

    The test CSV is read with the columns 'Sentence #', 'Word' and 'Tag'. Each
    sentence is tagged word by word, the predictions are written back to the
    CSV in a new 'Indic_Pred_Tag' column, and a plain-text report with the
    aggregate metrics is saved.

    Args:
        model_path: Hub id or local path given to
            ``AutoModelForTokenClassification.from_pretrained``.
        tokenizer_path: Hub id or local path given to
            ``AutoTokenizer.from_pretrained``. ``predict_tags`` calls
            ``word_ids()`` on the encoding, so the tokenizer must provide it.
        test_file: Path of the CSV file to evaluate.
        output_csv: Destination of the CSV with the added prediction column.
        output_txt: Destination of the text report.
    """

    # Tag given to words that end up without any sub-word token (for example
    # empty strings, or words cut off by truncation).
    DEFAULT_TAG = 'O'

    def __init__(self, model_path, tokenizer_path, test_file, output_csv, output_txt):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.test_file = test_file
        self.output_csv = output_csv
        self.output_txt = output_txt

        # Load tokenizer and model
        logger.info("Loading tokenizer and model...")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.model = self.model.to(self.device)
        self.model.eval()

        # Get label mappings
        self.id2label = self.model.config.id2label
        self.label2id = self.model.config.label2id

        # Truncation only takes effect when a maximum length is known. Take the
        # smallest plausible limit advertised by the tokenizer or the model
        # config; tokenizers without a real limit report a huge sentinel value.
        candidate_limits = (
            getattr(self.tokenizer, "model_max_length", None),
            getattr(self.model.config, "max_position_embeddings", None),
        )
        usable_limits = [
            limit for limit in candidate_limits
            if isinstance(limit, int) and 0 < limit < 1_000_000
        ]
        self.max_length = min(usable_limits) if usable_limits else None

    def predict_tags(self, sentence):
        """Predict one tag per word.

        Args:
            sentence: Either a list of words, or a string that is split on
                whitespace into words.

        Returns:
            A list with exactly one tag per input word. A word takes the
            prediction of its first sub-word token; words without any token
            get ``DEFAULT_TAG``.
        """
        if isinstance(sentence, str):
            words = sentence.split()
        else:
            words = [str(word) for word in sentence]
        if not words:
            return []

        # Tokenize once, keeping the caller's word boundaries so that word ids
        # index directly into `words`.
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=self.max_length is not None,
            max_length=self.max_length,
        )
        word_ids = encoding.word_ids(batch_index=0)

        # Move necessary inputs to device
        inputs = {
            'input_ids': encoding['input_ids'].to(self.device),
            'attention_mask': encoding['attention_mask'].to(self.device),
        }

        # Get predictions
        with torch.no_grad():
            logits = self.model(**inputs).logits
        label_ids = logits.argmax(dim=-1)[0].tolist()

        # Place each prediction at its word position instead of appending, so a
        # word without tokens cannot shift the tags of the words after it.
        predicted_tags = [self.DEFAULT_TAG] * len(words)
        seen_words = set()
        for label_id, word_idx in zip(label_ids, word_ids):
            # Skip special tokens and continuation sub-tokens.
            if word_idx is None or word_idx in seen_words:
                continue
            seen_words.add(word_idx)
            predicted_tags[word_idx] = self.id2label[label_id]

        return predicted_tags

    def test_model(self):
        """Run the evaluation and write the prediction CSV and text report.

        Returns:
            Tuple ``(accuracy, macro_precision, macro_recall, macro_f1)``.
        """
        logger.info("Reading test dataset...")
        df = pd.read_csv(self.test_file)

        # sort=False keeps the sentences in file order; the row index of every
        # group is recorded so predictions are attached to the rows they
        # belong to even if group order and row order differ.
        grouped = df.groupby('Sentence #', sort=False)
        n_sentences = grouped.ngroups

        all_true_tags = []
        all_pred_tags = []
        row_index = []

        logger.info("Making predictions...")
        for _, group in tqdm(grouped, total=n_sentences):
            true_tags = group['Tag'].tolist()
            pred_tags = self.predict_tags(group['Word'].tolist())

            row_index.extend(group.index)
            all_true_tags.extend(true_tags)
            all_pred_tags.extend(pred_tags)

        # Add predictions to dataframe (aligned on the row index).
        df['Indic_Pred_Tag'] = pd.Series(all_pred_tags, index=row_index, dtype=object)

        # Save predictions
        logger.info(f"Saving predictions to {self.output_csv}")
        df.to_csv(self.output_csv, index=False)

        # Metrics are restricted to the model's label set; surface gold tags
        # that fall outside it, since they are left out of the macro averages.
        label_names = list(self.label2id.keys())
        unknown_tags = set(all_true_tags) - set(label_names)
        if unknown_tags:
            logger.warning(
                "Gold tags missing from the model's label set: %s",
                sorted(map(str, unknown_tags)),
            )

        # Calculate metrics
        accuracy = accuracy_score(all_true_tags, all_pred_tags)
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            all_true_tags,
            all_pred_tags,
            average='macro',
            zero_division=0,  # Explicitly handle zero division
            labels=label_names  # Ensure all labels are considered
        )

        # Get detailed classification report
        class_report = classification_report(
            all_true_tags,
            all_pred_tags,
            zero_division=0,  # Explicitly handle zero division
            labels=label_names  # Use all possible labels
        )

        # Basic analysis
        tag_distribution = pd.Series(all_pred_tags).value_counts()
        errors = [(true, pred) for true, pred in zip(all_true_tags, all_pred_tags) if true != pred]
        common_errors = pd.DataFrame(errors, columns=['True', 'Predicted']).value_counts().head()
        error_rate = len(errors) / len(all_true_tags) if all_true_tags else 0.0

        self._write_report(
            n_rows=len(df),
            n_sentences=n_sentences,
            accuracy=accuracy,
            precision_macro=precision_macro,
            recall_macro=recall_macro,
            f1_macro=f1_macro,
            class_report=class_report,
            tag_distribution=tag_distribution,
            common_errors=common_errors,
            n_errors=len(errors),
            error_rate=error_rate,
        )

        logger.info("Testing completed!")
        return accuracy, precision_macro, recall_macro, f1_macro

    def _write_report(self, *, n_rows, n_sentences, accuracy, precision_macro,
                      recall_macro, f1_macro, class_report, tag_distribution,
                      common_errors, n_errors, error_rate):
        """Write the human-readable evaluation summary to ``self.output_txt``."""
        logger.info(f"Writing analysis to {self.output_txt}")
        with open(self.output_txt, 'w', encoding='utf-8') as f:
            f.write(f"NER Testing Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("=" * 50 + "\n\n")

            # Report the checkpoint that was actually evaluated.
            f.write("Model Information:\n")
            f.write(f"Model: {self.model_path}\n")
            f.write(f"Tokenizer: {self.tokenizer_path}\n")
            f.write(f"Number of test instances: {n_rows}\n")
            f.write(f"Number of unique sentences: {n_sentences}\n\n")

            f.write("Performance Metrics:\n")
            f.write(f"Accuracy: {accuracy:.4f}\n")
            f.write(f"Macro Precision: {precision_macro:.4f}\n")
            f.write(f"Macro Recall: {recall_macro:.4f}\n")
            f.write(f"Macro F1-Score: {f1_macro:.4f}\n\n")

            f.write("Detailed Classification Report:\n")
            f.write(class_report + "\n\n")

            f.write("Tag Distribution in Predictions:\n")
            f.write(str(tag_distribution) + "\n\n")

            f.write("Most Common Prediction Errors (True -> Predicted):\n")
            f.write(str(common_errors) + "\n\n")

            f.write("Additional Analysis:\n")
            f.write(f"- Number of prediction errors: {n_errors}\n")
            f.write(f"- Error rate: {error_rate:.4f}\n")

In [5]:
def main():
    """Evaluate the Bengali IndicBERT NER checkpoint and print the headline metrics."""
    # Everything the tester needs: which checkpoint to load and where to
    # read the test split / write the results on the mounted drive.
    settings = {
        'model_path': "Debk/Ben_NER_indic-bert",
        'tokenizer_path': "ai4bharat/indic-bert",
        'test_file': '/content/drive/MyDrive/NER_Dataset/b-ner-test.csv',
        'output_csv': '/content/drive/MyDrive/NER_Dataset/indic-bert_result-test.csv',
        'output_txt': '/content/drive/MyDrive/NER_Dataset/indic-bert_result-test.txt',
    }
    tester = NERTester(**settings)

    # Run the full evaluation (also writes the CSV and text report).
    accuracy, precision, recall, f1 = tester.test_model()

    # Console summary, one metric per line with four decimals.
    summary = (
        ("Accuracy", accuracy),
        ("Macro Precision", precision),
        ("Macro Recall", recall),
        ("Macro F1-Score", f1),
    )
    print("\nTesting Results:")
    for metric_name, metric_value in summary:
        print(f"{metric_name}: {metric_value:.4f}")

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

  0%|          | 0/4429 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 4429/4429 [00:47<00:00, 93.21it/s] 



Testing Results:
Accuracy: 0.4628
Macro Precision: 0.0561
Macro Recall: 0.0448
Macro F1-Score: 0.0435
