# NER Model Evaluation Pipeline

This notebook provides a comprehensive evaluation pipeline for testing version 2 NER models on TITLE and NO-TITLE datasets.

## Pipeline Overview:
1. **Models**: roberta-finetuned-ner-TITLE-v2 and roberta-finetuned-ner-NO-TITLE-v2
2. **Datasets**: All transformer datasets in `/data/ds/TITLE/` and `/data/ds/NO-TITLE/`
3. **Metrics**: F1 score, Precision, Recall, Token Accuracy, Inference Time, Model Size
4. **Output**: Detailed per-dataset results saved to CSV and JSON for charting, plus a plain-text summary report that includes the aggregated statistics

## Results:
- TITLE model performance on TITLE datasets
- NO-TITLE model performance on NO-TITLE datasets
- Aggregated metrics for comparison

In [None]:
# Import Required Libraries
import os
import sys
import json
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from datetime import datetime
from datasets import load_from_disk
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from seqeval.metrics import classification_report as seq_classification_report
from seqeval.scheme import IOB2

# Add project root to path for importing custom modules
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

# Import custom model and functions
from scripts.train import (
    RobertaCRFForTokenClassification, 
    confidence_based_postprocessing,
    LABEL2ID, ID2LABEL, NUM_LABELS
)

# Report the runtime environment once, right after the imports succeed.
print(
    " All libraries imported successfully!",
    f" PyTorch version: {torch.__version__}",
    f" Device available: {'CUDA' if torch.cuda.is_available() else 'CPU'}",
    sep="\n",
)

In [None]:
# Configuration and Paths
from utils.config import PROJECT_DIR
import os

# Directory layout: every location is derived from the project root.
PROJECT_ROOT = PROJECT_DIR
DATA_DS_PATH = os.path.join(PROJECT_ROOT, "data", "ds")
MODELS_PATH = os.path.join(PROJECT_ROOT, "models")
RESULTS_PATH = os.path.join(PROJECT_ROOT, "evaluation_results")

# One model directory per dataset flavour
TITLE_MODEL_PATH = os.path.join(MODELS_PATH, "roberta-finetuned-ner-TITLE-v2")
NO_TITLE_MODEL_PATH = os.path.join(MODELS_PATH, "roberta-finetuned-ner-NO-TITLE-v2")

# Dataset roots, one per flavour
TITLE_DATASETS_PATH = os.path.join(DATA_DS_PATH, "TITLE")
NO_TITLE_DATASETS_PATH = os.path.join(DATA_DS_PATH, "NO-TITLE")

# The results directory is the only location this cell creates.
os.makedirs(RESULTS_PATH, exist_ok=True)

print(
    " Configuration:",
    f"  Project Root: {PROJECT_ROOT}",
    f"  TITLE Model: {TITLE_MODEL_PATH}",
    f"  NO-TITLE Model: {NO_TITLE_MODEL_PATH}",
    f"  TITLE Datasets: {TITLE_DATASETS_PATH}",
    f"  NO-TITLE Datasets: {NO_TITLE_DATASETS_PATH}",
    f"  Results Output: {RESULTS_PATH}",
    sep="\n",
)

# Missing inputs are only reported here; nothing is raised.
_required_locations = (
    (TITLE_MODEL_PATH, "TITLE Model"),
    (NO_TITLE_MODEL_PATH, "NO-TITLE Model"),
    (TITLE_DATASETS_PATH, "TITLE Datasets"),
    (NO_TITLE_DATASETS_PATH, "NO-TITLE Datasets"),
)
for path, name in _required_locations:
    if not os.path.exists(path):
        print(f" {name} NOT found at {path}")
        continue
    print(f" {name} found")

In [None]:
# Dataset Discovery Function
def discover_datasets(datasets_path):
    """
    Discover all saved datasets directly under ``datasets_path``.

    A sub-directory counts as a dataset when it contains a
    ``dataset_dict.json`` or a ``dataset_info.json`` file. Only the
    immediate children of ``datasets_path`` are inspected.

    Args:
        datasets_path: Directory to scan (``str`` or ``Path``).

    Returns:
        list[Path]: The matching dataset directories, sorted by path so that
        the evaluation order is the same on every run and every machine.
        Empty when ``datasets_path`` does not exist.
    """
    root = Path(datasets_path)

    if not root.exists():
        print(f" Path {root} does not exist!")
        return []

    marker_files = ("dataset_dict.json", "dataset_info.json")
    datasets = []

    # iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the evaluation order and the saved rows reproducible.
    for item in sorted(root.iterdir()):
        if item.is_dir() and any((item / marker).exists() for marker in marker_files):
            datasets.append(item)
            print(f" Found dataset: {item.name}")

    return datasets

# Scan both dataset roots, then report the counts together.
print(" Discovering TITLE datasets...")
title_datasets = discover_datasets(TITLE_DATASETS_PATH)

print(f"\n Discovering NO-TITLE datasets...")
no_title_datasets = discover_datasets(NO_TITLE_DATASETS_PATH)

print(
    f"\n Summary:",
    f"  TITLE datasets: {len(title_datasets)}",
    f"  NO-TITLE datasets: {len(no_title_datasets)}",
    f"  Total datasets: {len(title_datasets) + len(no_title_datasets)}",
    sep="\n",
)

In [None]:
from pathlib import Path
import json
# Model Loading Functions
def load_model_and_tokenizer(model_path, model_type="NO-TITLE"):
    """
    Load a fine-tuned model and its tokenizer from ``model_path``.

    The directory is expected to contain ``label_config.json`` (read for
    ``num_labels``) and ``config.json`` (read for the loss/classifier
    hyper-parameters; a default is used for every missing key). Weights are
    taken from ``model.safetensors`` when that file exists; only tensors whose
    name and shape match the freshly built model are loaded.

    Args:
        model_path: Model directory (``str`` or ``Path``).
        model_type: Free-form tag used only in the progress message.

    Returns:
        tuple: ``(model, tokenizer, label_config, model_size_mb)`` where
        ``label_config`` is the parsed ``label_config.json`` and
        ``model_size_mb`` is the total parameter storage in MiB.

    Raises:
        FileNotFoundError: If ``label_config.json`` or ``config.json`` is
            missing (raised by ``open``).
    """
    print(f" Loading {model_type} model from {model_path}")

    # Accept both strings and Path objects
    model_path = Path(model_path)

    # Load label configuration
    with open(model_path / "label_config.json", 'r', encoding="utf-8") as f:
        label_config = json.load(f)

    # Get model configuration from config.json
    with open(model_path / "config.json", 'r', encoding="utf-8") as f:
        config = json.load(f)

    # Rebuild the architecture first; the trained weights are loaded on top.
    model = RobertaCRFForTokenClassification(
        model_name="roberta-base",  # Base model
        num_labels=label_config['num_labels'],
        alpha=config.get('alpha', 0.25),
        gamma=config.get('gamma', 2.0),
        person_weight=config.get('person_weight', 5.0),
        crf_weight=config.get('crf_weight', 0.5),
        focal_weight=config.get('focal_weight', 0.2),
        dice_weight=config.get('dice_weight', 0.3),
        classifier_params=config.get('classifier_params', {}),
        dice_loss_params=config.get('dice_loss_params', {})
    )

    # Load trained weights
    from safetensors.torch import load_file
    model_file = model_path / "model.safetensors"
    if model_file.exists():
        state_dict = load_file(str(model_file))

        # Keep only tensors whose name and shape match the rebuilt model
        model_state_dict = model.state_dict()
        compatible_state_dict = {
            key: value
            for key, value in state_dict.items()
            if key in model_state_dict and model_state_dict[key].shape == value.shape
        }

        model.load_state_dict(compatible_state_dict, strict=False)
        print(f"   Loaded {len(compatible_state_dict)} weight tensors")

        # Partial loads used to be silent; surface them so that metrics from
        # a partly initialised model are not mistaken for real results.
        skipped_keys = [key for key in state_dict if key not in compatible_state_dict]
        if skipped_keys:
            print(f"   WARNING: skipped {len(skipped_keys)} checkpoint tensors "
                  f"(unknown name or shape mismatch), e.g. {skipped_keys[:5]}")
        missing_keys = [key for key in model_state_dict if key not in compatible_state_dict]
        if missing_keys:
            print(f"   WARNING: {len(missing_keys)} model tensors were not loaded from the "
                  f"checkpoint and keep their initial values, e.g. {missing_keys[:5]}")
    else:
        # Best-effort behaviour is kept (no exception), but made visible.
        print(f"   WARNING: {model_file} not found - trained weights were NOT loaded")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(str(model_path), add_prefix_space=True)

    # Calculate model size from parameter storage (bytes -> MiB)
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)
    print(f"   Model size: {model_size_mb:.1f} MB")

    return model, tokenizer, label_config, model_size_mb

# Instantiate the TITLE and NO-TITLE models with their tokenizers
print("Loading models...")
(
    title_model,
    title_tokenizer,
    title_label_config,
    title_model_size,
) = load_model_and_tokenizer(TITLE_MODEL_PATH, "TITLE")

(
    no_title_model,
    no_title_tokenizer,
    no_title_label_config,
    no_title_model_size,
) = load_model_and_tokenizer(NO_TITLE_MODEL_PATH, "NO-TITLE")

print(
    f"\n Both models loaded successfully!",
    f"  TITLE model: {title_label_config['num_labels']} labels, {title_model_size:.1f} MB",
    f"  NO-TITLE model: {no_title_label_config['num_labels']} labels, {no_title_model_size:.1f} MB",
    sep="\n",
)

In [None]:
# Evaluation Function
def evaluate_model_on_dataset(model, tokenizer, label_config, dataset_path, model_type):
    """
    Evaluate a model on a single saved dataset and return its metrics.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=None)``;
            the output must provide ``["logits"]``. When the model exposes
            ``model.crf.decode`` it is used for decoding, otherwise (or if
            decoding fails) the arg-max of the logits is used.
        tokenizer: Handed to ``DataCollatorForTokenClassification`` for padding.
        label_config: Dict with ``num_labels`` and ``id2label`` (label ids as
            string keys mapped to label names).
        dataset_path: ``Path`` of a directory readable by ``load_from_disk``.
            The ``test`` split is preferred, then ``validation``, then the
            first available split.
        model_type: Tag copied into the result row.

    Returns:
        dict | None: One row of metrics (PERSON precision/recall/F1, macro
        entity F1, token accuracy, timing), or ``None`` when the dataset
        cannot be loaded or yields no labelled tokens.
    """
    dataset_name = dataset_path.name
    print(f"   Evaluating on {dataset_name}...")

    # Load dataset
    try:
        dataset = load_from_disk(str(dataset_path))

        # Check if dataset has splits and use appropriate data
        if hasattr(dataset, 'keys'):
            # Dataset has splits, use test if available, otherwise use the first available split
            if 'test' in dataset:
                eval_data = dataset['test']
            elif 'validation' in dataset:
                eval_data = dataset['validation']
            else:
                eval_data = dataset[list(dataset.keys())[0]]
        else:
            # Single dataset without splits
            eval_data = dataset

    except Exception as e:
        print(f"     Failed to load dataset {dataset_name}: {e}")
        return None

    num_labels = label_config['num_labels']
    id2label = label_config['id2label']

    # Filter labels to match model: ids the model does not know become 0 (O),
    # the -100 ignore marker is kept as is.
    def filter_labels(example):
        example['labels'] = [
            label if label == -100 or label < num_labels else 0
            for label in example['labels']
        ]
        return example

    eval_data = eval_data.map(filter_labels)

    # Set up evaluation
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Track time
    start_time = time.time()

    all_predictions = []
    all_labels = []

    # Fallbacks are reported once per dataset instead of being swallowed silently.
    crf_fallback_reported = False
    postprocessing_failure_reported = False

    # Process in batches
    from torch.utils.data import DataLoader
    dataloader = DataLoader(eval_data, batch_size=8, collate_fn=data_collator)

    with torch.no_grad():
        for batch in dataloader:
            # Move model inputs to the device; labels are only compared on the host
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"]

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=None)
            logits = outputs["logits"]

            # Use CRF decoding when available, arg-max otherwise
            try:
                if hasattr(model, 'crf') and hasattr(model.crf, 'decode'):
                    crf_predictions = model.crf.decode(logits, mask=attention_mask.bool())
                    # decode() returns one variable-length id list per sequence;
                    # copy them into a zero-padded tensor shaped like input_ids.
                    predictions = torch.zeros_like(input_ids)
                    for b_idx, pred_seq in enumerate(crf_predictions):
                        seq_len = min(len(pred_seq), predictions.shape[1])
                        predictions[b_idx, :seq_len] = torch.tensor(
                            pred_seq[:seq_len],
                            dtype=predictions.dtype,
                            device=predictions.device,
                        )
                else:
                    predictions = torch.argmax(logits, dim=-1)
            except Exception as exc:
                if not crf_fallback_reported:
                    print(f"     CRF decoding failed on {dataset_name}, using argmax instead: {exc}")
                    crf_fallback_reported = True
                predictions = torch.argmax(logits, dim=-1)

            # Apply post-processing (best effort: keep raw predictions on failure)
            try:
                predictions = confidence_based_postprocessing(logits, predictions, attention_mask)
            except Exception as exc:
                if not postprocessing_failure_reported:
                    print(f"     Post-processing failed on {dataset_name}, using raw predictions: {exc}")
                    postprocessing_failure_reported = True

            # Convert each tensor to Python lists once per batch rather than
            # calling .item() for every single token.
            pred_rows = predictions.cpu().tolist()
            label_rows = labels.cpu().tolist()
            mask_rows = attention_mask.cpu().tolist()

            # Extract sequences for evaluation: keep attended positions that
            # carry a real label (-100 marks positions to ignore).
            for pred_row, label_row, mask_row in zip(pred_rows, label_rows, mask_rows):
                kept = [
                    (pred_id, label_id)
                    for pred_id, label_id, attended in zip(pred_row, label_row, mask_row)
                    if attended == 1 and label_id != -100
                ]
                if kept:
                    all_predictions.append([id2label.get(str(pred_id), "O") for pred_id, _ in kept])
                    all_labels.append([id2label.get(str(label_id), "O") for _, label_id in kept])

    inference_time = time.time() - start_time

    # Calculate metrics
    if not all_predictions or not all_labels:
        print(f"     No valid predictions for {dataset_name}")
        return None

    # Entity-level metrics using seqeval
    # NOTE(review): seqeval appears to honour `scheme` only together with
    # mode='strict' - confirm whether strict IOB2 scoring was intended here.
    entity_results = seq_classification_report(all_labels, all_predictions, scheme=IOB2, output_dict=True)

    # Token-level metrics over the flattened sequences
    all_true_labels = [label for seq in all_labels for label in seq]
    all_pred_labels = [pred for seq in all_predictions for pred in seq]

    token_accuracy = accuracy_score(all_true_labels, all_pred_labels)

    # Get precision, recall, F1 for PERSON entity specifically (zeros if the
    # report has no PERSON row)
    person_metrics = entity_results.get("PERSON", {"precision": 0.0, "recall": 0.0, "f1-score": 0.0})

    results = {
        'dataset': dataset_name,
        'model_type': model_type,
        'num_samples': len(all_predictions),
        'inference_time_seconds': inference_time,
        'inference_time_per_sample': inference_time / len(all_predictions),
        'person_precision': person_metrics["precision"],
        'person_recall': person_metrics["recall"],
        'person_f1': person_metrics["f1-score"],
        'entity_f1_macro': entity_results["macro avg"]["f1-score"],
        'token_accuracy': token_accuracy,
        'timestamp': datetime.now().isoformat()
    }

    print(f"     {dataset_name}: Person F1={results['person_f1']:.3f}, Token Acc={results['token_accuracy']:.3f}, Time={inference_time:.1f}s")

    return results

In [None]:
# Main Evaluation Pipeline
print(" Starting comprehensive evaluation pipeline...")
print("=" * 60)

# Store all results (one dict per successfully evaluated dataset)
all_results = []

# Each model is scored only on the datasets of its own flavour.
evaluation_plan = [
    ("TITLE", title_model, title_tokenizer, title_label_config,
     title_model_size, title_datasets),
    ("NO-TITLE", no_title_model, no_title_tokenizer, no_title_label_config,
     no_title_model_size, no_title_datasets),
]

for (plan_model_type, plan_model, plan_tokenizer, plan_label_config,
     plan_model_size, plan_datasets) in evaluation_plan:
    print(f"\n {plan_model_type} Model Evaluation")
    print("-" * 30)
    for dataset_path in plan_datasets:
        result = evaluate_model_on_dataset(
            plan_model, plan_tokenizer, plan_label_config,
            dataset_path, plan_model_type
        )
        # None means the dataset could not be loaded or produced no predictions
        if result:
            result['model_size_mb'] = plan_model_size
            all_results.append(result)

print(f"\n Evaluation complete! Processed {len(all_results)} dataset-model combinations.")

# Create DataFrame for analysis
df_results = pd.DataFrame(all_results)

# Count from the raw records: when nothing was evaluated the DataFrame has no
# 'model_type' column and indexing it would raise KeyError.
title_evaluations = sum(1 for row in all_results if row['model_type'] == 'TITLE')
no_title_evaluations = sum(1 for row in all_results if row['model_type'] == 'NO-TITLE')

print(f"\n Results Summary:")
print(f"  Total evaluations: {len(df_results)}")
print(f"  TITLE evaluations: {title_evaluations}")
print(f"  NO-TITLE evaluations: {no_title_evaluations}")

# Display first few results
if len(df_results) > 0:
    print(f"\n Sample Results:")
    display_cols = ['dataset', 'model_type', 'person_f1', 'person_precision', 'person_recall', 'token_accuracy']
    print(df_results[display_cols].head(10).to_string(index=False))

In [None]:
# Aggregate Results by Model Type
print("\n Aggregating Results by Model Type")
print("=" * 50)

if len(df_results) == 0:
    print(" No results to aggregate!")
    df_aggregated = pd.DataFrame()
else:
    # Metric column -> statistics to compute, listed in output column order
    metric_reducers = (
        ('person_f1', ('mean', 'std', 'min', 'max')),
        ('person_precision', ('mean', 'std')),
        ('person_recall', ('mean', 'std')),
        ('entity_f1_macro', ('mean', 'std')),
        ('token_accuracy', ('mean', 'std')),
    )

    aggregated_results = []

    for model_type in ['TITLE', 'NO-TITLE']:
        subset = df_results[df_results['model_type'] == model_type]
        if len(subset) == 0:
            continue

        # Bookkeeping columns first, then the per-metric statistics
        summary = {
            'model_type': model_type,
            'num_datasets': len(subset),
            'total_samples': subset['num_samples'].sum(),
            'total_inference_time': subset['inference_time_seconds'].sum(),
            'avg_inference_time_per_sample': subset['inference_time_per_sample'].mean(),
            'model_size_mb': subset['model_size_mb'].iloc[0],  # Same for all entries
        }
        for column, reducers in metric_reducers:
            for reducer in reducers:
                summary[f'{column}_{reducer}'] = getattr(subset[column], reducer)()
        summary['timestamp'] = datetime.now().isoformat()

        aggregated_results.append(summary)

        # Print summary for this model type
        print(
            f"\n {model_type} Model Summary:",
            f"  Datasets evaluated: {summary['num_datasets']}",
            f"  Total samples: {summary['total_samples']:,}",
            f"  Model size: {summary['model_size_mb']:.1f} MB",
            f"  Total inference time: {summary['total_inference_time']:.1f}s",
            f"  Avg time per sample: {summary['avg_inference_time_per_sample']*1000:.1f}ms",
            f"  Person F1: {summary['person_f1_mean']:.3f} ± {summary['person_f1_std']:.3f} (range: {summary['person_f1_min']:.3f} - {summary['person_f1_max']:.3f})",
            f"  Person Precision: {summary['person_precision_mean']:.3f} ± {summary['person_precision_std']:.3f}",
            f"  Person Recall: {summary['person_recall_mean']:.3f} ± {summary['person_recall_std']:.3f}",
            f"  Token Accuracy: {summary['token_accuracy_mean']:.3f} ± {summary['token_accuracy_std']:.3f}",
            sep="\n",
        )

    # Create aggregated DataFrame
    df_aggregated = pd.DataFrame(aggregated_results)

    # Side-by-side comparison is only printed when both model types have results
    if len(df_aggregated) == 2:
        title_row = df_aggregated[df_aggregated['model_type'] == 'TITLE'].iloc[0]
        no_title_row = df_aggregated[df_aggregated['model_type'] == 'NO-TITLE'].iloc[0]

        print(
            f"\n Model Comparison:",
            f"  F1 Score: TITLE({title_row['person_f1_mean']:.3f}) vs NO-TITLE({no_title_row['person_f1_mean']:.3f})",
            f"  Precision: TITLE({title_row['person_precision_mean']:.3f}) vs NO-TITLE({no_title_row['person_precision_mean']:.3f})",
            f"  Recall: TITLE({title_row['person_recall_mean']:.3f}) vs NO-TITLE({no_title_row['person_recall_mean']:.3f})",
            f"  Speed: TITLE({title_row['avg_inference_time_per_sample']*1000:.1f}ms) vs NO-TITLE({no_title_row['avg_inference_time_per_sample']*1000:.1f}ms)",
            f"  Size: TITLE({title_row['model_size_mb']:.1f}MB) vs NO-TITLE({no_title_row['model_size_mb']:.1f}MB)",
            sep="\n",
        )

In [None]:
# Save Results to Files
print("\n Saving Results to Files")
print("=" * 30)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# RESULTS_PATH is built with os.path.join and is therefore a plain string;
# the `/` operator and .glob() used below need a Path object.
results_dir = Path(RESULTS_PATH)

if len(df_results) > 0:
    # Save detailed results only (aggregated can be calculated when loading)
    detailed_csv_path = results_dir / f"detailed_evaluation_results_{timestamp}.csv"
    detailed_json_path = results_dir / f"detailed_evaluation_results_{timestamp}.json"

    # Save to CSV
    df_results.to_csv(detailed_csv_path, index=False)
    print(f" Detailed results saved to: {detailed_csv_path}")

    # Save to JSON (with better formatting)
    df_results.to_json(detailed_json_path, orient='records', indent=2)
    print(f" Detailed results saved to: {detailed_json_path}")

    # Save a summary report. The text contains "±", so the encoding is set
    # explicitly instead of relying on the platform default.
    summary_path = results_dir / f"evaluation_summary_{timestamp}.txt"
    with open(summary_path, 'w', encoding="utf-8") as f:
        f.write("NER Model Evaluation Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Evaluation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total Evaluations: {len(df_results)}\n")
        f.write(f"TITLE Evaluations: {len(df_results[df_results['model_type'] == 'TITLE'])}\n")
        f.write(f"NO-TITLE Evaluations: {len(df_results[df_results['model_type'] == 'NO-TITLE'])}\n\n")

        # Write individual dataset results summary
        f.write("Individual Dataset Results:\n")
        f.write("-" * 30 + "\n")
        for _, row in df_results.iterrows():
            f.write(f"{row['dataset']} ({row['model_type']}):\n")
            f.write(f"  Person F1: {row['person_f1']:.3f}\n")
            f.write(f"  Person Precision: {row['person_precision']:.3f}\n")
            f.write(f"  Person Recall: {row['person_recall']:.3f}\n")
            f.write(f"  Token Accuracy: {row['token_accuracy']:.3f}\n")
            f.write(f"  Inference Time: {row['inference_time_per_sample']*1000:.1f}ms per sample\n\n")

        # Write aggregated statistics (calculated on-the-fly)
        if len(df_aggregated) > 0:
            f.write("\nAggregated Statistics (calculated from detailed results):\n")
            f.write("-" * 50 + "\n")
            for _, row in df_aggregated.iterrows():
                f.write(f"{row['model_type']} Model Summary:\n")
                f.write(f"  Datasets: {row['num_datasets']}\n")
                f.write(f"  Samples: {row['total_samples']:,}\n")
                f.write(f"  Person F1: {row['person_f1_mean']:.3f} ± {row['person_f1_std']:.3f}\n")
                f.write(f"  Person Precision: {row['person_precision_mean']:.3f} ± {row['person_precision_std']:.3f}\n")
                f.write(f"  Person Recall: {row['person_recall_mean']:.3f} ± {row['person_recall_std']:.3f}\n")
                f.write(f"  Token Accuracy: {row['token_accuracy_mean']:.3f} ± {row['token_accuracy_std']:.3f}\n")
                f.write(f"  Model Size: {row['model_size_mb']:.1f} MB\n")
                f.write(f"  Avg Inference Time: {row['avg_inference_time_per_sample']*1000:.1f}ms per sample\n\n")

    print(f" Summary report saved to: {summary_path}")

    # Display file list (sorted so the listing is stable between runs)
    print(f"\n Generated files:")
    for file_path in sorted(results_dir.glob(f"*{timestamp}*")):
        print(f"  {file_path.name}")

    print(f"\n Note: Only detailed results are saved to files.")
    print(f"   Aggregated statistics can be recalculated when loading the detailed CSV/JSON.")

else:
    print(" No results to save!")

print(f"\n Evaluation pipeline completed successfully!")
print(f" Results available in: {RESULTS_PATH}")

In [None]:
# Quick Data Preview for Charting
# The literals previously used "\\n", which prints a backslash followed by
# "n"; a real newline escape is used now, as in the other cells.
print("\n Data Preview for Charting")
print("=" * 35)

if len(df_results) > 0:
    print("\n Detailed Results Preview:")
    print("Columns:", list(df_results.columns))
    print("\nSample data:")
    preview_cols = ['dataset', 'model_type', 'person_f1', 'person_precision', 'person_recall',
                   'token_accuracy', 'inference_time_per_sample', 'model_size_mb']
    print(df_results[preview_cols].head())

    if len(df_aggregated) > 0:
        print("\n\n Aggregated Results Preview:")
        print("Columns:", list(df_aggregated.columns))
        print("\nData:")
        agg_preview_cols = ['model_type', 'num_datasets', 'person_f1_mean', 'person_f1_std',
                           'person_precision_mean', 'person_recall_mean', 'token_accuracy_mean',
                           'model_size_mb', 'avg_inference_time_per_sample']
        print(df_aggregated[agg_preview_cols])

    print(f"\n Ready for Charting!")
    print(f"   Use 'df_results' for detailed dataset-level analysis")
    print(f"   Use 'df_aggregated' for model-level comparisons")
    print(f"   Data also saved as CSV files for external tools")

else:
    print(" No data available for charting!")

print(f"\n🏁 Pipeline Complete! Ready for visualization and analysis.")