In [15]:
import os
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline
)

def is_local_model(model_path):
    """
    Check if model path refers to a local model directory

    Args:
        model_path: Path or Hub name of the model

    Returns:
        bool: True only when model_path is an existing directory
    """
    # A regular file that merely shares the name is not a model directory,
    # so test for a directory rather than for bare existence.
    return os.path.isdir(model_path)

def load_model_config(model_path, debug=False):
    """
    Load model configuration from a custom config file if it exists

    The file is expected at ``<model_path>/model_config.json``. When it
    supplies only one of the two label mappings, the other is derived from
    it so both always describe the same label set. When the file is absent,
    or supplies neither mapping, the default person-NER labels are used.

    Args:
        model_path: Path to model directory
        debug: Whether to print debug information

    Returns:
        dict: Configuration with id2label (int keys) and label2id mappings
    """
    default_id2label = {0: "O", 1: "B-PERSON", 2: "I-PERSON"}
    config_path = os.path.join(model_path, "model_config.json")

    if not os.path.exists(config_path):
        if debug:
            print("No custom config found, using default settings")

        # Default settings for person NER
        return {
            "id2label": dict(default_id2label),
            "label2id": {label: idx for idx, label in default_id2label.items()},
        }

    if debug:
        print(f"Found custom config at {config_path}")

    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)

    raw_id2label = model_config.get("id2label")
    raw_label2id = model_config.get("label2id")

    # JSON object keys are always strings, so ids are converted back to int.
    if raw_id2label is not None:
        id2label = {int(k): v for k, v in raw_id2label.items()}
    elif raw_label2id is not None:
        # Only label2id was given: invert it instead of pairing it with
        # the unrelated default id2label.
        id2label = {int(v): k for k, v in raw_label2id.items()}
    else:
        id2label = dict(default_id2label)

    if raw_label2id is not None:
        label2id = {k: int(v) for k, v in raw_label2id.items()}
    else:
        # Only id2label (or neither) was given: mirror it.
        label2id = {label: idx for idx, label in id2label.items()}

    return {"id2label": id2label, "label2id": label2id}

def load_tokenizer(model_path, debug=False):
    """
    Load the tokenizer for a model via AutoTokenizer.from_pretrained

    Args:
        model_path: Path or name of the model
        debug: Whether to print debug info

    Returns:
        The tokenizer returned by AutoTokenizer.from_pretrained
    """
    if debug:
        print(f"Loading tokenizer from {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return tokenizer

def configure_base_model(model_path, debug=False):
    """
    Decide from the model name whether it is a base model needing NER setup

    The check is a case-insensitive substring heuristic: the name must
    contain "base" and must not contain "ner".

    Args:
        model_path: Path or name of the model
        debug: Whether to print debug info

    Returns:
        bool: Whether this is a base model needing configuration
    """
    lowered = model_path.lower()

    # Anything already tagged as NER, or not tagged as base, needs no setup.
    if "base" not in lowered or "ner" in lowered:
        return False

    if debug:
        print(f"Detected {model_path} as base model requiring NER configuration")
    return True

def load_model(model_path, config=None, debug=False):
    """
    Load a token-classification model with explicit label mappings

    Args:
        model_path: Path to model
        config: Optional configuration dictionary with id2label and label2id;
            when omitted, the default person-NER labels are used
        debug: Whether to print debug info

    Returns:
        Model: The loaded model
    """
    if debug:
        print(f"Loading model from {model_path}")

    # Fall back to the default person-NER label set when none is given
    label_maps = config
    if label_maps is None:
        label_maps = {
            "id2label": {0: "O", 1: "B-PERSON", 2: "I-PERSON"},
            "label2id": {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
        }

    id2label = label_maps["id2label"]
    label2id = label_maps["label2id"]
    label_count = len(id2label)

    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        id2label=id2label,
        label2id=label2id,
        num_labels=label_count
    )

    if debug:
        print(f"Model loaded with {label_count} labels")

    return model

def create_ner_pipeline(model, tokenizer, debug=False):
    """
    Build a "ner" pipeline around an already loaded model and tokenizer

    Device index 0 is requested when torch reports CUDA as available,
    otherwise -1. The pipeline uses the "simple" aggregation strategy.

    Args:
        model: The model to use
        tokenizer: The tokenizer to use
        debug: Whether to print debug info

    Returns:
        pipeline: NER pipeline
    """
    if torch.cuda.is_available():
        device = 0
    else:
        device = -1

    if debug:
        print(f"Creating NER pipeline on device: {device}")

    pipeline_options = {
        "model": model,
        "tokenizer": tokenizer,
        "aggregation_strategy": "simple",
        "device": device,
    }
    return pipeline("ner", **pipeline_options)

def test_pipeline(pipeline, test_text="Barack Obama was president of the United States.", debug=False):
    """
    Run an NER pipeline once on a sample sentence

    Args:
        pipeline: NER pipeline to test (called with the text as its only argument)
        test_text: Sample text for testing
        debug: Whether to print the input text and the full results

    Returns:
        list: Detected entities
    """
    # Quiet path: just run the pipeline and hand back its result
    if not debug:
        return pipeline(test_text)

    print(f"Testing pipeline on: '{test_text}'")
    detected = pipeline(test_text)
    print(f"Detected entities: {detected}")
    return detected

def print_model_info(model, tokenizer):
    """
    Print a short summary of the model and tokenizer

    Reports the model class name, total parameter count, label mapping,
    tokenizer class name and tokenizer vocabulary size, one per line.

    Args:
        model: Model exposing parameters() and config.id2label
        tokenizer: Tokenizer exposing vocab_size
    """
    architecture = model.__class__.__name__
    print(f"Model architecture: {architecture}")

    # Total element count across every parameter tensor
    parameter_count = 0
    for parameter in model.parameters():
        parameter_count += parameter.numel()
    print(f"Number of parameters: {parameter_count}")

    labels = model.config.id2label
    print(f"Labels: {labels}")

    tokenizer_name = tokenizer.__class__.__name__
    print(f"Tokenizer: {tokenizer_name}")

    vocabulary_size = tokenizer.vocab_size
    print(f"Vocabulary size: {vocabulary_size}")

def load_and_setup_ner_model(model_path, test=True, debug=False):
    """
    Main function to load and set up an NER model

    Label configuration is chosen as follows:
      * local path: mappings from load_model_config
      * non-local name detected as a base model: default person-NER labels
      * anything else: no override, the checkpoint keeps its own labels

    Args:
        model_path: Path or name of model
        test: Whether to run a quick test
        debug: Whether to print debug info

    Returns:
        tuple: (tokenizer, model, ner_pipeline)
    """
    # Check if local model
    local = is_local_model(model_path)
    if debug:
        print(f"Model path {model_path} is {'local' if local else 'from HuggingFace'}")

    # Load tokenizer
    tokenizer = load_tokenizer(model_path, debug)

    # Determine if base model
    is_base = configure_base_model(model_path, debug)

    # Load configuration
    if local:
        config = load_model_config(model_path, debug)
    elif is_base:
        config = {
            "id2label": {0: "O", 1: "B-PERSON", 2: "I-PERSON"},
            "label2id": {"O": 0, "B-PERSON": 1, "I-PERSON": 2}
        }
    else:
        config = None  # Use model's existing config

    # Load model
    if config is None:
        # load_model() replaces a None config with the 3-label person
        # default, which would override (or clash with) the labels of an
        # already fine-tuned checkpoint. Load without overrides instead so
        # the checkpoint's own label set is kept.
        if debug:
            print(f"Loading model from {model_path}")
        model = AutoModelForTokenClassification.from_pretrained(model_path)
        if debug:
            print(f"Model loaded with {len(model.config.id2label)} labels")
    else:
        model = load_model(model_path, config, debug)

    # Create pipeline
    ner_pipe = create_ner_pipeline(model, tokenizer, debug)

    # Test pipeline
    if test:
        entities = test_pipeline(ner_pipe, debug=debug)
        if debug:
            print(f"Found {len(entities)} entities in test text")

    # Print detailed model info if requested
    if debug:
        print_model_info(model, tokenizer)

    return tokenizer, model, ner_pipe

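# Example usage (do not bind the result to the name `pipeline`: that would
# shadow the `pipeline` factory imported from transformers and break
# create_ner_pipeline on later calls):
# tokenizer, model, ner_pipe = load_and_setup_ner_model("../models/roberta-finetuned-ner", debug=True)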

def extract_and_display_names(text, model_path, debug=False, *, ner_pipeline=None):
    """
    Extract person names from text using a specified NER model and display results.

    Prints the input, the text with each detected person span wrapped in
    ``**`` markers (rendered as Markdown when IPython is available, printed
    as plain text otherwise), and a table of the detected names.

    Args:
        text (str): Text to analyze
        model_path (str): Path or name of NER model
        debug (bool): Whether to print debug information
        ner_pipeline: Optional ready-made NER pipeline (any callable taking
            the text and returning entity dicts). When given, no model is
            loaded and model_path is only used in the printed header, so
            repeated calls can share one loaded model.

    Returns:
        list: All entities returned by the pipeline (not only persons)
    """
    # Load model only when the caller did not supply a pipeline
    if ner_pipeline is None:
        _, _, ner_pipeline = load_and_setup_ner_model(model_path, test=False, debug=debug)

    print(f"\n--- Processing with model: {model_path} ---")
    print(f"Input text: {text}")

    # Get entities
    entities = ner_pipeline(text)

    # Filter for person entities
    person_entities = [e for e in entities if e["entity_group"] == "PERSON"]

    # Display results
    if not person_entities:
        print("No person names detected.")
        return entities

    # Entities whose character offsets are missing cannot be highlighted:
    # slicing with None bounds would wrap the entire text in markers.
    located = [
        e for e in person_entities
        if e.get("start") is not None and e.get("end") is not None
    ]

    # Create highlighted text display with markdown formatting
    highlighted_text = text
    # Insert markers from the end backwards so earlier offsets stay valid
    for entity in sorted(located, key=lambda e: e["start"], reverse=True):
        start, end = entity["start"], entity["end"]
        highlighted_text = (
            f"{highlighted_text[:start]}**{text[start:end]}**{highlighted_text[end:]}"
        )

    print("\nDetected person names:")
    print("-" * 40)
    try:
        from IPython.display import Markdown, display
    except ImportError:
        # Outside a notebook environment: show the marked-up text as-is
        print(highlighted_text)
    else:
        display(Markdown(highlighted_text))

    # Print entity details in a table
    print("\nEntity details:")
    print(f"{'Entity':<20} {'Type':<10} {'Confidence':<10}")
    print("-" * 50)
    for entity in person_entities:
        # Some tokenizers leave a leading space on the aggregated word,
        # which would misalign the table column.
        name = entity["word"].strip()
        confidence = f"{entity['score']:.4f}"
        print(f"{name:<20} {'PERSON':<10} {confidence:<10}")

    print(f"\nTotal: {len(person_entities)} person entities detected")
    return entities

# Demo run on a three-sentence sample using the locally fine-tuned model
entities = extract_and_display_names(
    text=(
        "Barack Obama was president of the United States. "
        "He worked with Joe Biden, who later became president too. "
        "joe was a bad President"
    ),
    model_path="../models/roberta-finetuned-ner",
    debug=True,
)

Device set to use cuda:0


Model path ../models/roberta-finetuned-ner is local
Loading tokenizer from ../models/roberta-finetuned-ner
Found custom config at ../models/roberta-finetuned-ner\model_config.json
Loading model from ../models/roberta-finetuned-ner
Model loaded with 3 labels
Creating NER pipeline on device: 0
Model architecture: RobertaForTokenClassification
Number of parameters: 124057347
Labels: {0: 'O', 1: 'B-PERSON', 2: 'I-PERSON'}
Tokenizer: RobertaTokenizerFast
Vocabulary size: 50265

--- Processing with model: ../models/roberta-finetuned-ner ---
Input text: Barack Obama was president of the United States. He worked with Joe Biden, who later became president too. joe was a bad President

Detected person names:
----------------------------------------


**Barack Obama** was president of the United States. He worked with **Joe Biden**, who later became president too. **joe** was a bad President


Entity details:
Entity               Type       Confidence
--------------------------------------------------
Barack Obama         PERSON     0.9995    
 Joe Biden           PERSON     0.9998    
 joe                 PERSON     0.9974    

Total: 3 person entities detected
