In [1]:
import os
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from IPython.display import Markdown, display

# Define paths to load the model
project_dir = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))
model_dir = os.path.join(project_dir, "models", "roberta-finetuned-ner")

# Load model configuration
with open(os.path.join(model_dir, "model_config.json"), "r") as f:
    model_config = json.load(f)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_dir)

# Get id2label mapping
id2label = {int(k): v for k, v in model_config["id2label"].items()}
def extract_names(text, debug=False):
    """
    Extract person names from text using the trained NER model
    """
    # Tokenize with the same parameters as during training
    # REMOVE add_prefix_space=True from here since it's already set in tokenizer initialization
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True, 
                      return_offsets_mapping=True)
    offset_mapping = inputs.pop("offset_mapping")[0].numpy()
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=2)
        predictions = torch.argmax(logits, dim=2)[0].numpy()
    
    # Extract tokens and their predictions
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    token_predictions = [id2label[pred] for pred in predictions]
    
    if debug:
        print("\nDEBUG INFO:")
        for i, (token, pred) in enumerate(zip(tokens, token_predictions)):
            print(f"{token}: {pred}")
    
    # Extract full names by merging adjacent tokens
    names = []
    current_name = []
    current_start = None
    
    # Process each token
    for idx, (token, pred, offset) in enumerate(zip(tokens, token_predictions, offset_mapping)):
        # Skip special tokens
        if offset[0] == offset[1]:
            continue
            
        if pred == "B-PERSON":
            # If we were building a name, save it
            if current_name:
                names.append({
                    "name": text[current_start:offset_mapping[idx-1][1]].strip(),
                    "start": current_start,
                    "end": offset_mapping[idx-1][1]
                })
                current_name = []
            
            # Start new name
            current_name.append(token)
            current_start = offset[0]
            
        elif pred == "I-PERSON" and current_name:
            # Continue building name
            current_name.append(token)
            
        else:
            # End of a name
            if current_name:
                names.append({
                    "name": text[current_start:offset_mapping[idx-1][1]].strip(),
                    "start": current_start,
                    "end": offset_mapping[idx-1][1]
                })
                current_name = []
                current_start = None
    
    # Handle case where name ends at the end of the text
    if current_name and len(offset_mapping) > 0:
        names.append({
            "name": text[current_start:offset_mapping[-1][1]].strip(),
            "start": current_start,
            "end": offset_mapping[-1][1]
        })
    
    return names


def highlight_names_in_text(text, names):
    """
    Highlight detected names in the original text for display
    
    Args:
        text (str): Original text
        names (list): List of name objects with start/end positions
        
    Returns:
        str: Markdown formatted text with highlighted names
    """
    # Sort names by start position (descending)
    sorted_names = sorted(names, key=lambda x: x["start"], reverse=True)
    
    # Insert markdown formatting
    highlighted_text = text
    for name in sorted_names:
        prefix = highlighted_text[:name["start"]]
        name_text = highlighted_text[name["start"]:name["end"]]
        suffix = highlighted_text[name["end"]:]
        highlighted_text = f"{prefix}**{name_text}**{suffix}"
    
    return highlighted_text

# Demo function
def analyze_text(text):
    """
    Analyze text and display the results with highlighted names
    
    Args:
        text (str): Text to analyze
    """
    print("Input text:")
    print(text)
    print("\n" + "-"*50 + "\n")
    
    # Extract names
    names = extract_names(text)
    
    # Display results
    if names:
        print(f"Found {len(names)} person names:")
        for i, name in enumerate(names, 1):
            print(f"{i}. {name['name']}")
        
        # Display highlighted text
        print("\n" + "-"*50 + "\n")
        print("Text with highlighted names:")
        highlighted = highlight_names_in_text(text, names)
        display(Markdown(highlighted))
    else:
        print("No person names detected in the text.")

# Check if the model was trained correctly
def check_model():
    # Load evaluation results
    try:
        eval_path = os.path.join(model_dir, "eval_results.json")
        with open(eval_path, "r") as f:
            results = json.load(f)
        print("Model evaluation results:")
        for k, v in results.items():
            print(f"  {k}: {v}")
    except FileNotFoundError:
        print("No evaluation results found. Model may not be properly trained.")

check_model()
# Example usage
sample_text = "Barack Obama was the 44th president of the United States, and he worked with Joe Biden as his vice president. Later, Kamala Harris became vice president under President Biden."
analyze_text(sample_text)

def test_simple():
    # Try with a very simple example
    simple_text = "John Smith is here."
    print("\nTesting with simple text:", simple_text)
    names = extract_names(simple_text, debug=True)
    print(f"Found names: {[name['name'] for name in names]}")

test_simple()

# Add to your notebook
def check_training_data():
    try:
        # Path to your dataset
        dataset_path = os.path.join(project_dir, "data", "tokenized_train")
        from datasets import load_from_disk
        dataset = load_from_disk(dataset_path)
        
        # Print some statistics
        print(f"Training set size: {len(dataset['train'])}")
        print(f"Test set size: {len(dataset['test'])}")
        
        # Check label distribution in training data
        label_count = {"O": 0, "B-PERSON": 0, "I-PERSON": 0}
        for example in dataset['train']:
            for label in example['labels']:
                if label == 0:
                    label_count["O"] += 1
                elif label == 1:
                    label_count["B-PERSON"] += 1
                elif label == 2:
                    label_count["I-PERSON"] += 1
                    
        print("Label distribution in training data:")
        print(label_count)
    except Exception as e:
        print(f"Error checking training data: {str(e)}")

check_training_data()

Model evaluation results:
  eval_loss: 0.008307608775794506
  eval_accuracy: 0.9984569643596403
  eval_precision: 0.9984569643596403
  eval_recall: 0.9984569643596403
  eval_f1: 0.9984569643596403
  eval_person_precision: 0.9901873327386262
  eval_person_recall: 0.9910714285714286
  eval_person_f1: 0.9906291834002677
  eval_runtime: 4.4469
  eval_samples_per_second: 674.17
  eval_steps_per_second: 42.276
  epoch: 5.0
Input text:
Barack Obama was the 44th president of the United States, and he worked with Joe Biden as his vice president. Later, Kamala Harris became vice president under President Biden.

--------------------------------------------------

Found 4 person names:
1. Barack Obama
2. Joe Biden
3. Kamala Harris
4. Biden

--------------------------------------------------

Text with highlighted names:


**Barack Obama** was the 44th president of the United States, and he worked with **Joe Biden** as his vice president. Later, **Kamala Harris** became vice president under President **Biden**.


Testing with simple text: John Smith is here.

DEBUG INFO:
<s>: O
ĠJohn: B-PERSON
ĠSmith: I-PERSON
Ġis: O
Ġhere: O
.: O
</s>: O
Found names: ['John Smith']
Training set size: 11989
Test set size: 2998
Label distribution in training data:
{'O': 194182, 'B-PERSON': 5253, 'I-PERSON': 12036}


In [None]:
from transformers import pipeline

def compare_models(text):
    """
    Compare your model with other pretrained NER models
    
    Args:
        text (str): Text to analyze
    """
    # Your model
    print("Results from your custom model:")
    names = extract_names(text)
    for name in names:
        print(f"- {name['name']}")
    
    print("\n" + "-"*50 + "\n")
    
    # Hugging Face NER pipeline
    print("Results from Hugging Face NER pipeline:")
    ner_pipeline = pipeline("ner", model="../models/roberta-finetuned-ner", tokenizer="../models/roberta-finetuned-ner")
    results = ner_pipeline(text)
    
    # Filter for person entities
    person_entities = [entity for entity in results if entity["entity"].endswith("PER")]
    for entity in person_entities:
        print(f"- {entity['word']} (score: {entity['score']:.4f})")

# Example comparison
sample_text = "When John Smith met Sarah Johnson at the conference, they were introduced by Professor Williams."
compare_models(sample_text)

Results from your custom model:

--------------------------------------------------

Results from Hugging Face NER pipeline:


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


- John (score: 0.9995)
- Smith (score: 0.9996)
- Sarah (score: 0.9993)
- Johnson (score: 0.9997)
- Williams (score: 0.9989)
