In [4]:
!pip install transformers==4.35.2 torch datasets huggingface-hub -q
!pip install gliner

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gliner 0.2.21 requires transformers>=4.38.2, but you have transformers 4.35.2 which is incompatible.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.35.2 which is incompatible.[0m[31m
Collecting transformers>=4.38.2 (from gliner)
  Using cached transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers>=4.38.2->gliner)
  Using cached tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.53.0-py3-none-any.whl (10.8 MB)
Using cached tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.1

In [1]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from typing import Dict, List, Any, Set, Optional, Tuple
import logging
from datetime import datetime
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch
import warnings
warnings.filterwarnings('ignore')


# gliner is an optional dependency: record whether it imported so that model
# discovery can skip the GLiNER checkpoints instead of the whole cell failing.
try:
    from gliner import GLiNER
    GLINER_AVAILABLE = True
except ImportError:
    GLiNER = None
    GLINER_AVAILABLE = False


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in Python types."""

    def default(self, obj):
        """
        Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: Object the standard encoder could not serialize.

        Returns:
            ``bool``/``float``/``int`` for NumPy boolean, floating and integer
            scalars, or a (nested) list for an ``ndarray``.

        Raises:
            TypeError: Raised by the base encoder for any other type.
        """
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

class TransformerNERAnalyzer:

    def __init__(self,
                 min_entity_length: int = 2,
                 device: int = -1,
                 aggregation_strategy: str = "simple",
                 chunk_size: int = 256,
                 gliner_labels: Optional[List[str]] = None):
        """
        Initialize the analyzer

        Args:
            min_entity_length: Minimum character length for entities
            device: Device to use
            aggregation_strategy: Strategy for aggregating subword tokens
            chunk_size: Size of text chunks for processing
            gliner_labels: Custom labels for GLiNER model
        """
        self.min_entity_length = min_entity_length
        self.device = device
        self.aggregation_strategy = aggregation_strategy
        self.chunk_size = chunk_size

        # Default medical entity labels for GLiNER
        self.gliner_labels = gliner_labels or [
            "disease", "disorder", "symptom", "sign", "syndrome",
            "medication", "drug", "treatment", "therapy",
            "anatomy", "body part", "organ", "tissue",
            "procedure", "test", "examination",
            "condition", "diagnosis", "prognosis",
            "gene", "protein", "cell", "molecule",
            "vaccine", "virus", "bacteria", "pathogen"
        ]

        self.medical_relevant_labels = {
            # Disease/Condition labels
            'DISEASE', 'DISORDER', 'CONDITION', 'PATHOLOGICAL_FORMATION',
            'SYMPTOM', 'SIGN', 'SYNDROME', 'CANCER', 'DISEASE_DISORDER',
            'Disease_disorder', 'SIGN_SYMPTOM', 'condition', 'disease',
            'disorder', 'symptom', 'syndrome',

            # Chemical/Drug labels
            'CHEMICAL', 'DRUG', 'MEDICATION', 'Drug', 'CHEMICAL_ENTITY',
            'medication', 'drug', 'treatment', 'vaccine', 'THERAPEUTIC_PROCEDURE',

            # Genetic/Molecular labels
            'GENE', 'PROTEIN', 'GENETIC', 'GENE_OR_GENE_PRODUCT', 'protein',
            'gene', 'molecule', 'biomarker',

            # Anatomical labels
            'ANATOMY', 'BODY_PART', 'ORGAN', 'TISSUE', 'BIOLOGICAL_STRUCTURE',
            'MULTI_TISSUE_STRUCTURE', 'anatomy', 'body part', 'organ',

            # Medical procedure labels
            'PROCEDURE', 'TEST', 'EXAMINATION', 'THERAPEUTIC_PROCEDURE',
            'DIAGNOSTIC_PROCEDURE', 'procedure', 'test', 'medical procedure',

            # Other relevant labels
            'PATHOGEN', 'VIRUS', 'BACTERIA', 'pathogen', 'virus', 'bacteria',
            'side effect', 'adverse event', 'complication'
        }

        # Labels to explicitly exclude
        self.noise_labels = {
            'LABEL_0', 'LABEL_1', 'LABEL_2', 'O', 'PAD', '[CLS]', '[SEP]',
            'ENTITY', '0', '1', 'PERSON', 'LOCATION', 'ORGANIZATION',
            'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY'
        }

        self.available_models = self._discover_models()
        self.results = {}
        self.evaluation_results = {}

    def _discover_models(self) -> Dict[str, Dict[str, Any]]:
        """Dynamically discover available transformer NER models"""

        # Comprehensive list of biomedical NER models
        model_configs = {
            # BioClinicalBERT variants
            "BioClinicalBERT-Pablo": {
                "model": "pabRomero/BioClinicalBERT-full-finetuned-ner-pablo",
                "description": "BioClinicalBERT fine-tuned on clinical NER",
                "domain": "clinical"
            },
            "Bio_ClinicalBERT": {
                "model": "emilyalsentzer/Bio_ClinicalBERT",
                "description": "Clinical BERT model",
                "domain": "clinical"
            },

            # BioBERT variants
            "BioBERT-Diseases": {
                "model": "alvaroalon2/biobert_diseases_ner",
                "description": "BioBERT for disease entity recognition",
                "domain": "diseases"
            },
            "BioBERT-Chemical": {
                "model": "alvaroalon2/biobert_chemical_ner",
                "description": "BioBERT for chemical entity recognition",
                "domain": "chemicals"
            },
            "BioBERT-Genetic": {
                "model": "alvaroalon2/biobert_genetic_ner",
                "description": "BioBERT for genetic entity recognition",
                "domain": "genetics"
            },
            "BioBERT-Base": {
                "model": "dmis-lab/biobert-base-cased-v1.1",
                "description": "Base BioBERT model",
                "domain": "biomedical"
            },
            "BioBERT-Tumre": {
                "model": "siddharthtumre/biobert-finetuned-ner",
                "description": "BioBERT fine-tuned for NER",
                "domain": "biomedical"
            },

            # Other biomedical models
            "BioMedical-NER-All": {
                "model": "d4data/biomedical-ner-all",
                "description": "Comprehensive biomedical NER",
                "domain": "biomedical"
            },
            "Apollo-Medical-NER": {
                "model": "Clinical-AI-Apollo/Medical-NER",
                "description": "Apollo medical NER model",
                "domain": "medical"
            },
            "SciBERT-Medical": {
                "model": "allenai/scibert_scivocab_uncased",
                "description": "SciBERT for scientific text",
                "domain": "scientific"
            },
            "BlueBERT-PubMed": {
                "model": "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12",
                "description": "BlueBERT trained on PubMed",
                "domain": "pubmed"
            }
        }

        if GLINER_AVAILABLE:
            gliner_models = {
                "GLiNER-base": {
                    "model": "urchade/gliner_base",
                    "description": "GLiNER base model for zero-shot NER",
                    "domain": "general",
                    "type": "gliner"
                },
                "GLiNER-small": {
                    "model": "urchade/gliner_small",
                    "description": "GLiNER small model for zero-shot NER",
                    "domain": "general",
                    "type": "gliner"
                },
                "GLiNER-medium": {
                    "model": "urchade/gliner_medium",
                    "description": "GLiNER medium model for zero-shot NER",
                    "domain": "general",
                    "type": "gliner"
                },
                "GLiNER-large": {
                    "model": "urchade/gliner_large",
                    "description": "GLiNER large model for zero-shot NER",
                    "domain": "general",
                    "type": "gliner"
                },
                "GLiNER-multi": {
                    "model": "urchade/gliner_multi",
                    "description": "GLiNER multilingual model",
                    "domain": "multilingual",
                    "type": "gliner"
                }
            }
            model_configs.update(gliner_models)

        # Test which models are available
        available = {}
        for name, config in model_configs.items():
            try:
                if config.get("type") == "gliner":
                    # Test GLiNER model loading
                    _ = GLiNER.from_pretrained(config["model"])
                else:
                    # Test transformer model loading
                    _ = pipeline("ner", model=config["model"], device=self.device)
                available[name] = config
                logger.info(f"✓ Found model: {name}")
            except Exception as e:
                logger.warning(f"✗ Model not available: {name} - {str(e)[:50]}...")

        return available

    def _process_raw_entities(self, raw_results: List[Dict], model_name: str) -> List[Dict]:
        """
        Process raw NER results and clean them up

        Args:
            raw_results: Raw results from transformer pipeline
            model_name: Name of the model for model-specific processing

        Returns:
            Cleaned and processed entities
        """
        processed_entities = []
        current_entity = None

        for result in raw_results:
            # Extract and clean word
            word = result.get('word', '').replace('##', '').strip()
            entity_type = result.get('entity_group', result.get('entity', 'unknown'))
            score = float(result.get('score', 1.0))

            # Skip non-medical entities and noise labels
            if entity_type in self.noise_labels or len(word) < self.min_entity_length:
                if current_entity:
                    processed_entities.append(current_entity)
                    current_entity = None
                continue

            # For generic labels, check if it's medically relevant
            if entity_type not in self.medical_relevant_labels:
                # Skip if it's clearly not medical
                if score < 0.7:  # Lower confidence threshold for non-medical labels
                    if current_entity:
                        processed_entities.append(current_entity)
                        current_entity = None
                    continue

            # Group consecutive tokens of the same type
            if current_entity and entity_type == current_entity['label']:
                # Handle subword tokens
                if word.startswith('##'):
                    current_entity['text'] += word[2:]
                else:
                    current_entity['text'] += ' ' + word
                current_entity['score'] = (current_entity['score'] + score) / 2
            else:
                if current_entity:
                    processed_entities.append(current_entity)
                current_entity = {
                    'text': word,
                    'label': entity_type,
                    'score': score,
                    'start': result.get('start'),
                    'end': result.get('end')
                }
        if current_entity:
            processed_entities.append(current_entity)

        # Final cleanup and medical filtering
        cleaned_entities = []
        seen_entities = set()

        for entity in processed_entities:
            text = ' '.join(entity['text'].split())

            # Skip if too short or duplicate
            if len(text) < self.min_entity_length or (text, entity['label']) in seen_entities:
                continue

            # Additional filtering for medical relevance
            text_lower = text.lower()

            # Check if entity is medically relevant based on label and confidence
            is_medical = (
                entity['label'] in self.medical_relevant_labels or
                entity['score'] > 0.85  # High confidence entities regardless of label
            )

            # Skip common non-medical words
            non_medical_words = {
                'i', 'am', 'is', 'the', 'and', 'or', 'but', 'to', 'of', 'in',
                'for', 'a', 'an', 'my', 'your', 'his', 'her', 'their', 'it',
                'we', 'they', 'this', 'that', 'these', 'those', 'which', 'who',
                'what', 'when', 'where', 'why', 'how', 'all', 'some', 'any',
                'each', 'every', 'no', 'not', 'can', 'will', 'should', 'would',
                'could', 'may', 'might', 'must', 'shall', 'have', 'has', 'had',
                'do', 'does', 'did', 'be', 'been', 'being', 'was', 'were', 'are'
            }

            if text_lower in non_medical_words:
                continue

            if is_medical:
                seen_entities.add((text, entity['label']))
                cleaned_entities.append({
                    'text': text,
                    'label': entity['label'],
                    'score': entity['score'],
                    'start': entity.get('start'),
                    'end': entity.get('end')
                })

        return cleaned_entities

    def _process_with_gliner(self, text: str, model_path: str) -> List[Dict]:
        """
        Process text using GLiNER model

        Args:
            text: Input text
            model_path: Path to GLiNER model

        Returns:
            List of extracted entities
        """

        model = GLiNER.from_pretrained(model_path)

        # Move to appropriate device
        if self.device >= 0 and torch.cuda.is_available():
            model = model.to(f"cuda:{self.device}")

        # Predict entities
        entities = model.predict_entities(text, self.gliner_labels)

        # Process GLiNER output
        processed_entities = []
        for entity in entities:
            if len(entity.get('text', '')) >= self.min_entity_length:
                processed_entities.append({
                    'text': entity['text'],
                    'label': entity['label'],
                    'score': float(entity.get('score', 1.0)),
                    'start': entity.get('start'),
                    'end': entity.get('end')
                })

        return processed_entities

    def analyze_text(self, text: str, models: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Analyze text using specified models or all available models

        Args:
            text: Input text to analyze
            models: List of model names to use

        Returns:
            Dictionary containing analysis results
        """
        if models is None:
            models = list(self.available_models.keys())
        else:
            # Validate requested models
            models = [m for m in models if m in self.available_models]

        self.results = {
            "input_text": text,
            "text_length": len(text),
            "analysis_timestamp": datetime.now().isoformat(),
            "device": "CPU" if self.device == -1 else f"GPU:{self.device}",
            "models": {}
        }

        for model_name in models:
            logger.info(f"Processing with {model_name}...")
            model_config = self.available_models[model_name]

            try:
                # Check if it's a GLiNER model
                if model_config.get("type") == "gliner":
                    # Process with GLiNER
                    entities = self._process_with_gliner(text, model_config["model"])

                    # Calculate statistics
                    label_dist = Counter(e['label'] for e in entities)
                    avg_confidence = np.mean([e['score'] for e in entities]) if entities else 0

                    self.results["models"][model_name] = {
                        "entities": entities,
                        "entity_count": len(entities),
                        "unique_entities": len(set(e['text'] for e in entities)),
                        "label_distribution": dict(label_dist),
                        "average_confidence": float(avg_confidence),
                        "domain": model_config.get("domain", "general"),
                        "description": model_config.get("description", ""),
                        "model_type": "GLiNER",
                        "labels_used": self.gliner_labels
                    }
                else:
                    # Process with traditional transformer pipeline
                    ner_pipeline = pipeline(
                        "ner",
                        model=model_config["model"],
                        aggregation_strategy=self.aggregation_strategy,
                        device=self.device
                    )

                    all_results = []

                    if len(text) > self.chunk_size:
                        # Process in chunks with overlap
                        overlap = 50

                        for i in range(0, len(text), self.chunk_size - overlap):
                            chunk = text[i:i + self.chunk_size]
                            chunk_results = ner_pipeline(chunk)

                            for r in chunk_results:
                                if 'start' in r:
                                    r['start'] += i
                                if 'end' in r:
                                    r['end'] += i

                            all_results.extend(chunk_results)
                    else:
                        all_results = ner_pipeline(text)

                    # Process and clean results
                    entities = self._process_raw_entities(all_results, model_name)

                    # Calculate statistics
                    label_dist = Counter(e['label'] for e in entities)
                    avg_confidence = np.mean([e['score'] for e in entities]) if entities else 0

                    self.results["models"][model_name] = {
                        "entities": entities,
                        "entity_count": len(entities),
                        "unique_entities": len(set(e['text'] for e in entities)),
                        "label_distribution": dict(label_dist),
                        "average_confidence": float(avg_confidence),
                        "domain": model_config.get("domain", "general"),
                        "description": model_config.get("description", ""),
                        "model_type": "Transformer"
                    }

            except Exception as e:
                logger.error(f"Error processing {model_name}: {str(e)}")
                self.results["models"][model_name] = {"error": str(e)}

        return self.results

    def evaluate_i2b2(self, sample_texts: List[str], expected_entities: List[List[str]],
                      models: Optional[List[str]] = None) -> Dict[str, Any]:

        if models is None:
            models = list(self.available_models.keys())
        else:
            models = [m for m in models if m in self.available_models]

        self.evaluation_results = {
            "evaluation_timestamp": datetime.now().isoformat(),
            "num_samples": len(sample_texts),
            "models": {}
        }

        for model_name in models:
            logger.info(f"Evaluating {model_name}...")

            model_eval = {
                "samples": [],
                "overall_metrics": {
                    "true_positives": 0,
                    "false_positives": 0,
                    "false_negatives": 0,
                    "precision": 0.0,
                    "recall": 0.0,
                    "f1_score": 0.0,
                    "exact_matches": 0,
                    "partial_matches": 0
                }
            }

            for idx, (text, expected) in enumerate(zip(sample_texts, expected_entities)):
                results = self.analyze_text(text, models=[model_name])

                if "error" in results["models"][model_name]:
                    model_eval["samples"].append({
                        "sample_index": idx,
                        "error": results["models"][model_name]["error"]
                    })
                    continue

                predicted_entities = [e['text'].lower() for e in results["models"][model_name]["entities"]]
                expected_lower = [e.lower() for e in expected]

                true_positives = 0
                partial_matches = 0

                for exp_entity in expected_lower:
                    if exp_entity in predicted_entities:
                        true_positives += 1
                    else:
                        for pred_entity in predicted_entities:
                            if exp_entity in pred_entity or pred_entity in exp_entity:
                                partial_matches += 1
                                break

                false_positives = len(predicted_entities) - true_positives
                false_negatives = len(expected_lower) - true_positives

                precision = true_positives / len(predicted_entities) if predicted_entities else 0
                recall = true_positives / len(expected_lower) if expected_lower else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                sample_result = {
                    "sample_index": idx,
                    "text_preview": text[:100] + "..." if len(text) > 100 else text,
                    "expected_entities": expected,
                    "predicted_entities": [e['text'] for e in results["models"][model_name]["entities"]],
                    "metrics": {
                        "true_positives": true_positives,
                        "false_positives": false_positives,
                        "false_negatives": false_negatives,
                        "partial_matches": partial_matches,
                        "precision": precision,
                        "recall": recall,
                        "f1_score": f1
                    }
                }

                model_eval["samples"].append(sample_result)

                model_eval["overall_metrics"]["true_positives"] += true_positives
                model_eval["overall_metrics"]["false_positives"] += false_positives
                model_eval["overall_metrics"]["false_negatives"] += false_negatives
                model_eval["overall_metrics"]["partial_matches"] += partial_matches
                if true_positives == len(expected_lower) and false_positives == 0:
                    model_eval["overall_metrics"]["exact_matches"] += 1

            total_tp = model_eval["overall_metrics"]["true_positives"]
            total_fp = model_eval["overall_metrics"]["false_positives"]
            total_fn = model_eval["overall_metrics"]["false_negatives"]

            if total_tp + total_fp > 0:
                model_eval["overall_metrics"]["precision"] = total_tp / (total_tp + total_fp)

            if total_tp + total_fn > 0:
                model_eval["overall_metrics"]["recall"] = total_tp / (total_tp + total_fn)

            if model_eval["overall_metrics"]["precision"] + model_eval["overall_metrics"]["recall"] > 0:
                model_eval["overall_metrics"]["f1_score"] = 2 * (
                    model_eval["overall_metrics"]["precision"] * model_eval["overall_metrics"]["recall"]
                ) / (model_eval["overall_metrics"]["precision"] + model_eval["overall_metrics"]["recall"])

            self.evaluation_results["models"][model_name] = model_eval

        return self.evaluation_results

    def generate_evaluation_report(self) -> str:
        report = []
        report.append("=" * 100)
        report.append("i2b2 2010 RELATIONS CHALLENGE EVALUATION REPORT")
        report.append("=" * 100)

        report.append(f"\nEvaluation Date: {self.evaluation_results['evaluation_timestamp']}")
        report.append(f"Number of Samples: {self.evaluation_results['num_samples']}")
        report.append(f"Models Evaluated: {len(self.evaluation_results['models'])}")

        report.append("\n" + "=" * 100)
        report.append("OVERALL MODEL PERFORMANCE")
        report.append("=" * 100)

        model_performance = []
        for model_name, eval_data in self.evaluation_results['models'].items():
            metrics = eval_data['overall_metrics']
            model_performance.append({
                "Model": model_name,
                "Precision": f"{metrics['precision']:.3f}",
                "Recall": f"{metrics['recall']:.3f}",
                "F1-Score": f"{metrics['f1_score']:.3f}",
                "True Positives": metrics['true_positives'],
                "False Positives": metrics['false_positives'],
                "False Negatives": metrics['false_negatives'],
                "Exact Matches": metrics['exact_matches'],
                "Partial Matches": metrics['partial_matches']
            })

        df = pd.DataFrame(model_performance)
        df = df.sort_values(by="F1-Score", ascending=False)
        report.append("\n" + df.to_string(index=False))

        if model_performance:
            best_model = df.iloc[0]['Model']
            report.append(f"\n\nBest Performing Model: {best_model} (F1-Score: {df.iloc[0]['F1-Score']})")

        report.append("\n" + "=" * 100)
        report.append("DETAILED MODEL EVALUATION")
        report.append("=" * 100)

        for model_name, eval_data in sorted(self.evaluation_results['models'].items()):
            report.append(f"\n\n{'='*80}")
            report.append(f"Model: {model_name}")
            report.append(f"{'='*80}")

            metrics = eval_data['overall_metrics']
            report.append(f"\nOverall Performance:")
            report.append(f"  - Precision: {metrics['precision']:.3f}")
            report.append(f"  - Recall: {metrics['recall']:.3f}")
            report.append(f"  - F1-Score: {metrics['f1_score']:.3f}")
            report.append(f"  - Exact Sample Matches: {metrics['exact_matches']}/{self.evaluation_results['num_samples']}")

            report.append(f"\nSample-Level Performance:")

            sample_precisions = []
            sample_recalls = []
            sample_f1s = []

            for sample in eval_data['samples']:
                if 'metrics' in sample:
                    sample_precisions.append(sample['metrics']['precision'])
                    sample_recalls.append(sample['metrics']['recall'])
                    sample_f1s.append(sample['metrics']['f1_score'])

            if sample_precisions:
                report.append(f"  - Average Sample Precision: {np.mean(sample_precisions):.3f} (±{np.std(sample_precisions):.3f})")
                report.append(f"  - Average Sample Recall: {np.mean(sample_recalls):.3f} (±{np.std(sample_recalls):.3f})")
                report.append(f"  - Average Sample F1-Score: {np.mean(sample_f1s):.3f} (±{np.std(sample_f1s):.3f})")

            worst_samples = sorted(
                [s for s in eval_data['samples'] if 'metrics' in s],
                key=lambda x: x['metrics']['f1_score']
            )[:3]

            if worst_samples:
                report.append(f"\nWorst Performing Samples:")
                for i, sample in enumerate(worst_samples):
                    report.append(f"\n  Sample {sample['sample_index']} (F1: {sample['metrics']['f1_score']:.3f}):")
                    report.append(f"    Expected: {', '.join(sample['expected_entities'][:5])}")
                    if len(sample['expected_entities']) > 5:
                        report.append(f"              ... and {len(sample['expected_entities']) - 5} more")
                    report.append(f"    Predicted: {', '.join(sample['predicted_entities'][:5])}")
                    if len(sample['predicted_entities']) > 5:
                        report.append(f"               ... and {len(sample['predicted_entities']) - 5} more")

        report.append("\n" + "=" * 100)
        report.append("CROSS-MODEL ENTITY DETECTION ANALYSIS")
        report.append("=" * 100)

        entity_detection_stats = defaultdict(lambda: {"detected_by": [], "missed_by": []})

        for model_name, eval_data in self.evaluation_results['models'].items():
            for sample in eval_data['samples']:
                if 'expected_entities' in sample and 'predicted_entities' in sample:
                    predicted_lower = [e.lower() for e in sample['predicted_entities']]

                    for expected in sample['expected_entities']:
                        expected_lower = expected.lower()
                        if expected_lower in predicted_lower:
                            entity_detection_stats[expected]["detected_by"].append(model_name)
                        else:
                            entity_detection_stats[expected]["missed_by"].append(model_name)

        consistently_detected = []
        consistently_missed = []

        for entity, stats in entity_detection_stats.items():
            detection_rate = len(stats["detected_by"]) / (len(stats["detected_by"]) + len(stats["missed_by"]))

            if detection_rate >= 0.8:
                consistently_detected.append((entity, detection_rate))
            elif detection_rate <= 0.2:
                consistently_missed.append((entity, detection_rate))

        if consistently_detected:
            report.append("\nConsistently Detected Entities (≥80% models):")
            for entity, rate in sorted(consistently_detected, key=lambda x: x[1], reverse=True)[:10]:
                report.append(f"  - '{entity}': {rate*100:.0f}% detection rate")

        if consistently_missed:
            report.append("\nConsistently Missed Entities (≤20% models):")
            for entity, rate in sorted(consistently_missed, key=lambda x: x[1])[:10]:
                report.append(f"  - '{entity}': {rate*100:.0f}% detection rate")

        return "\n".join(report)

    def generate_report(self) -> str:
        """Build the plain-text report for the analysis stored in ``self.results``.

        Sections, in order: run header, per-model summary table, per-domain
        totals, per-model details, cross-model entity comparison, and a label
        consistency summary.

        Reads ``self.results['analysis_timestamp']``, ``['text_length']``,
        ``['device']`` and ``['models']``. A model entry containing an
        ``"error"`` key is reported as failed; entries containing an
        ``'entities'`` key feed the domain and cross-model sections.

        Returns:
            The report as a single newline-joined string.
        """
        results = self.results
        models = results['models']
        heavy_rule = "=" * 100
        light_rule = "-" * 100
        model_rule = "=" * 80

        def section(title, rule=heavy_rule):
            # A section heading is a rule preceded by a blank line, the title, then the rule again.
            return ["\n" + rule, title, rule]

        lines = [heavy_rule, "TRANSFORMER NER ANALYSIS REPORT", heavy_rule]
        lines += [
            f"\nAnalysis Date: {results['analysis_timestamp']}",
            f"Text Length: {results['text_length']} characters",
            f"Device: {results['device']}",
            f"Models Analyzed: {len(models)}",
        ]

        entity_total = sum(entry.get('entity_count', 0) for entry in models.values())
        ok_count = sum(1 for entry in models.values() if 'entities' in entry)
        lines += [
            f"\nTotal Entities Found: {entity_total}",
            f"Successful Models: {ok_count}/{len(models)}",
        ]

        # ---- one table row per model, in the order the models were stored ----
        lines += section("MODEL PERFORMANCE SUMMARY", light_rule)
        table_rows = []
        for name, entry in models.items():
            failed = "error" in entry
            table_rows.append({
                "Model": name,
                "Domain": "N/A" if failed else entry.get('domain', 'unknown'),
                "Status": "Error" if failed else "Success",
                "Entities": 0 if failed else entry['entity_count'],
                "Unique": 0 if failed else entry['unique_entities'],
                "Labels": 0 if failed else len(entry['label_distribution']),
                "Avg Confidence": 0 if failed else f"{entry['average_confidence']:.3f}",
            })
        lines.append("\n" + pd.DataFrame(table_rows).to_string(index=False))

        # ---- totals per domain (only models that produced an 'entities' list) ----
        lines += section("DOMAIN-BASED ANALYSIS")
        per_domain = {}  # domain -> [model count, entity count]
        for entry in models.values():
            if 'entities' not in entry:
                continue
            tally = per_domain.setdefault(entry.get('domain', 'unknown'), [0, 0])
            tally[0] += 1
            tally[1] += entry['entity_count']

        for domain in sorted(per_domain):
            n_models, n_entities = per_domain[domain]
            lines += [
                f"\n{domain.upper()} Domain:",
                f"  - Models: {n_models}",
                f"  - Total Entities: {n_entities}",
                f"  - Avg Entities/Model: {n_entities/n_models:.1f}",
            ]

        # ---- per-model details, alphabetical by model name ----
        lines += section("DETAILED MODEL ANALYSIS")
        for name in sorted(models):
            entry = models[name]
            lines.append(f"\n\n{model_rule}")
            lines.append(f"Model: {name}")
            if 'description' in entry:
                lines.append(f"Description: {entry['description']}")
            lines.append(f"{model_rule}")

            if "error" in entry:
                lines.append(f"Error: {entry['error']}")
                continue

            lines += [
                f"Domain: {entry.get('domain', 'unknown')}",
                f"Model Type: {entry.get('model_type', 'Transformer')}",
                f"Total Entities: {entry['entity_count']}",
                f"Unique Entities: {entry['unique_entities']}",
                f"Average Confidence: {entry['average_confidence']:.3f}",
            ]

            if entry.get('model_type') == 'GLiNER' and 'labels_used' in entry:
                used = entry['labels_used']
                lines.append(f"\nGLiNER Labels Used ({len(used)}):")
                lines.append(f"  {', '.join(used[:10])}")
                overflow = len(used) - 10
                if overflow > 0:
                    lines.append(f"  ... and {overflow} more")

            distribution = entry['label_distribution']
            if distribution:
                lines.append("\nLabel Distribution:")
                most_common_first = sorted(distribution.items(),
                                           key=lambda pair: pair[1], reverse=True)
                for label, count in most_common_first:
                    share = (count / entry['entity_count']) * 100
                    lines.append(f"  - {label}: {count} ({share:.1f}%)")

            if entry['entities']:
                best_first = sorted(entry['entities'],
                                    key=lambda ent: ent['score'], reverse=True)
                lines.append("\nTop Entities by Confidence (max 10):")
                for rank, ent in enumerate(best_first[:10], start=1):
                    lines.append(f"  {rank}. '{ent['text']}' ({ent['label']}) - {ent['score']:.3f}")

        # ---- entities reported by more than one model ----
        lines += section("CROSS-MODEL ENTITY COMPARISON")

        # Lower-cased entity text -> (models that found it, labels assigned to it).
        found_by = {}
        for name, entry in models.items():
            if 'entities' not in entry:
                continue
            for ent in entry['entities']:
                who, tags = found_by.setdefault(ent['text'].lower(), (set(), set()))
                who.add(name)
                tags.add(ent['label'])

        shared = [(text, who, tags)
                  for text, (who, tags) in found_by.items()
                  if len(who) > 1]

        if shared:
            lines.append(f"\nEntities found by multiple models ({len(shared)}):")

            by_support = sorted(shared, key=lambda item: len(item[1]), reverse=True)
            for text, who, tags in by_support[:15]:
                joined_tags = ', '.join(sorted(tags))
                lines.append(f"  - '{text}': {len(who)} models")
                lines.append(f"    Labels: {joined_tags}")
                # The model list is only spelled out when it is short.
                if len(who) <= 5:
                    lines.append(f"    Models: {', '.join(sorted(who))}")

        lines.append("\n\nLabel Consistency Analysis:")

        shared_total = len(shared)
        if shared_total > 0:
            # An entity is labelled consistently when every model gave it the same label.
            agree = sum(1 for _, _, tags in shared if len(tags) == 1)
            disagree = shared_total - agree
            lines.append(f"  - Consistent labeling: {agree}/{shared_total} ({agree/shared_total*100:.1f}%)")
            lines.append(f"  - Inconsistent labeling: {disagree}/{shared_total} ({disagree/shared_total*100:.1f}%)")

        return "\n".join(lines)

    def get_model_agreement_matrix(self) -> pd.DataFrame:
        """Return the pairwise Jaccard agreement between models.

        Each model that has an ``'entities'`` key in ``self.results['models']``
        is represented by the set of ``(lower-cased text, label)`` pairs it
        produced. Cell ``[a, b]`` is ``|A ∩ B| / |A ∪ B|``; the diagonal is 1.0
        and a pair in which both models produced no entities scores 0.

        Returns:
            A square float DataFrame indexed and labelled by model name, or an
            empty DataFrame when fewer than two models have entities.
        """
        model_results = self.results['models']
        models = [name for name, data in model_results.items() if 'entities' in data]

        if len(models) < 2:
            return pd.DataFrame()

        # Build every model's entity set exactly once; the original rebuilt the
        # second model's set for each ordered pair.
        entity_sets = [
            {(e['text'].lower(), e['label']) for e in model_results[name]['entities']}
            for name in models
        ]

        size = len(models)
        scores = np.ones((size, size), dtype=float)  # diagonal stays at 1.0
        for i in range(size):
            for j in range(i + 1, size):
                union = len(entity_sets[i] | entity_sets[j])
                # Jaccard is symmetric, so each pair is scored once and mirrored.
                score = len(entity_sets[i] & entity_sets[j]) / union if union else 0.0
                scores[i, j] = scores[j, i] = score

        return pd.DataFrame(scores, index=models, columns=models)

    def export_results(self, filename: str = "transformer_ner_results.json"):
        """Write ``self.results`` to ``filename`` as indented JSON.

        Args:
            filename: Destination path; an existing file is overwritten.

        Raises:
            TypeError: If ``self.results`` contains a value the encoder cannot
                serialize. In that case ``filename`` is left untouched.
        """
        # Serialize before opening the destination: opening with 'w' truncates
        # immediately, so encoding inside the `with` block would destroy a
        # previous export (and leave partial JSON behind) whenever it failed.
        payload = json.dumps(self.results, indent=2, cls=NumpyEncoder)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info(f"Results exported to {filename}")

    def export_evaluation_results(self, filename: str = "i2b2_evaluation_results.json"):
        """Write ``self.evaluation_results`` to ``filename`` as indented JSON.

        When ``self.evaluation_results`` is empty/falsy a warning is logged and
        nothing is written.

        Args:
            filename: Destination path; an existing file is overwritten.

        Raises:
            TypeError: If the evaluation results contain a value the encoder
                cannot serialize. In that case ``filename`` is left untouched.
        """
        if not self.evaluation_results:
            logger.warning("No evaluation results to export. Run evaluate_i2b2() first.")
            return

        # Serialize first so a failure cannot truncate or half-write the
        # destination file (open(..., 'w') truncates as soon as it is called).
        payload = json.dumps(self.evaluation_results, indent=2, cls=NumpyEncoder)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info(f"Evaluation results exported to {filename}")

    def generate_entity_comparison_csv(self, filename: str = "entity_comparison.csv"):
        """Export a per-entity, per-model comparison table to CSV.

        One row is produced for each distinct entity text (case-sensitive,
        sorted) found by any model that has an ``'entities'`` key. For every
        model in ``self.results['models']`` the row carries two columns:

        * ``<model>_label`` – the labels that model assigned to the text,
          sorted and comma-separated; ``""`` when the model did not find it;
          ``"ERROR"`` when the model has no ``'entities'`` key.
        * ``<model>_confidence`` – mean score of those mentions formatted to
          three decimals, or ``""``.

        Args:
            filename: Destination CSV path.

        Returns:
            The DataFrame that was written.
        """
        models = self.results['models']

        # Index each successful model once: entity text -> (labels, scores).
        # The original rescanned a model's full entity list for every row.
        per_model = {}
        for model_name, model_data in models.items():
            if 'entities' not in model_data:
                continue
            by_text = {}
            for ent in model_data['entities']:
                labels, scores = by_text.setdefault(ent['text'], (set(), []))
                labels.add(ent['label'])
                scores.append(ent['score'])
            per_model[model_name] = by_text

        all_entities = set()
        for by_text in per_model.values():
            all_entities.update(by_text)

        rows = []
        for entity in sorted(all_entities):
            row = {"Entity": entity}

            for model_name in models:
                label_col = f"{model_name}_label"
                confidence_col = f"{model_name}_confidence"

                if model_name not in per_model:
                    row[label_col] = "ERROR"
                    row[confidence_col] = ""
                    continue

                found = per_model[model_name].get(entity)
                if found is None:
                    row[label_col] = ""
                    row[confidence_col] = ""
                else:
                    labels, scores = found
                    # Sorted so the cell text does not depend on set iteration
                    # order, which varies between interpreter runs.
                    row[label_col] = ', '.join(sorted(labels))
                    row[confidence_col] = f"{np.mean(scores):.3f}"

            rows.append(row)

        df = pd.DataFrame(rows)
        df.to_csv(filename, index=False)
        logger.info(f"Entity comparison exported to {filename}")

        return df

    def generate_pubmed_query(self, max_terms: int = 5, min_confidence: float = 0.7) -> str:
        """Build a PubMed search string from the highest-ranked medical entities.

        Entities are pooled across all models by lower-cased text, keeping only
        mentions whose score is at least ``min_confidence`` and whose label is
        in ``self.medical_relevant_labels``. Each pooled entity is ranked by
        ``mention count * best score``, multiplied by 1.5 when it was mentioned
        at least twice, by 2.0 when it carries a disease-type label and by 1.3
        when it carries a treatment-type label.

        The query is the top ``max_terms`` entities OR-ed together (multi-word
        terms are quoted), AND-ed with optional context clauses derived from
        the ten best-ranked entities, AND-ed with a publication-type filter.

        Args:
            max_terms: Maximum number of entity terms in the query. Values of
                zero or below select no terms.
            min_confidence: Minimum score for a mention to be considered.

        Returns:
            The query string, or ``"No medical entities found for PubMed query"``
            when no entity term is selected.
        """
        not_found = "No medical entities found for PubMed query"

        # Label vocabularies that boost the ranking; built once, not per entity.
        disease_labels = {'DISEASE', 'DISORDER', 'CONDITION', 'SYNDROME',
                          'disease', 'disorder', 'condition', 'syndrome',
                          'DISEASE_DISORDER', 'Disease_disorder', 'PATHOLOGICAL_FORMATION'}
        treatment_labels = {'DRUG', 'MEDICATION', 'TREATMENT', 'THERAPY',
                            'drug', 'medication', 'treatment', 'therapy',
                            'THERAPEUTIC_PROCEDURE', 'vaccine'}
        # How many top-ranked entities are inspected for the context clauses.
        context_window = 10

        entity_scores = defaultdict(lambda: {'count': 0, 'labels': set(), 'max_score': 0})

        for model_data in self.results['models'].values():
            if 'entities' not in model_data:
                continue
            for entity in model_data['entities']:
                if (entity['score'] >= min_confidence and
                        entity['label'] in self.medical_relevant_labels):
                    pooled = entity_scores[entity['text'].lower()]
                    pooled['count'] += 1
                    pooled['labels'].add(entity['label'])
                    pooled['max_score'] = max(pooled['max_score'], entity['score'])

        ranked_entities = []
        for text, data in entity_scores.items():
            relevance_score = data['count'] * data['max_score']
            if data['count'] >= 2:
                relevance_score *= 1.5
            if not data['labels'].isdisjoint(disease_labels):
                relevance_score *= 2.0
            if not data['labels'].isdisjoint(treatment_labels):
                relevance_score *= 1.3
            ranked_entities.append((text, relevance_score, data))

        # Stable sort: ties keep the order in which entities were first seen.
        ranked_entities.sort(key=lambda item: item[1], reverse=True)

        # Without at least one entity term the context clauses and the filter
        # would form a query about nothing, so report "not found" instead.
        # Clamping also stops a negative max_terms from slicing from the end.
        selected = ranked_entities[:max(max_terms, 0)]
        if not selected:
            return not_found

        conditions = [f'"{text}"' if len(text.split()) > 1 else text
                      for text, _, _ in selected]
        if len(conditions) > 1:
            query_parts = [f"({' OR '.join(conditions)})"]
        else:
            query_parts = [conditions[0]]

        context = ranked_entities[:context_window]
        all_labels = set()
        for _, _, data in context:
            all_labels.update(data['labels'])

        if any(label in all_labels for label in ['DISEASE', 'DISORDER', 'disease', 'disorder']):
            query_parts.append("(treatment OR therapy OR management OR guidelines)")

        if any('vaccine' in text for text, _, _ in context):
            query_parts.append('(safety OR efficacy OR "adverse events" OR "side effects")')

        publication_filter = ('(Clinical Trial[Publication Type] OR Systematic Review[Publication Type] '
                              'OR Meta-Analysis[Publication Type])')

        return f"{' AND '.join(query_parts)} AND {publication_filter}"

    def get_medical_entities_summary(self) -> Dict[str, Any]:
        """Summarize the medically relevant entities across all models.

        Only mentions whose label is in ``self.medical_relevant_labels`` are
        considered. Each is assigned to the first category whose keyword occurs
        in the lower-cased label, checked in this order: diseases/conditions,
        medications/treatments, anatomical terms, procedures/tests; anything
        else goes to ``other_medical``.

        Returns:
            A dict with:

            * ``total_medical_entities`` – number of qualifying mentions.
            * ``unique_medical_entities`` – number of distinct lower-cased texts.
            * ``diseases_conditions``, ``medications_treatments``,
              ``anatomical_terms``, ``procedures_tests``, ``other_medical`` –
              de-duplicated (case-sensitive) entity texts in sorted order.
        """
        # Ordered (category, label keywords) table; the first match wins.
        categories = (
            ('diseases_conditions', ('disease', 'disorder', 'condition', 'syndrome')),
            ('medications_treatments', ('drug', 'medication', 'treatment', 'therapy')),
            ('anatomical_terms', ('anatomy', 'body', 'organ', 'tissue')),
            ('procedures_tests', ('procedure', 'test', 'examination')),
        )
        fallback = 'other_medical'

        total = 0
        unique_lowered = set()
        buckets = {name: set() for name, _ in categories}
        buckets[fallback] = set()

        for model_data in self.results['models'].values():
            if 'entities' not in model_data:
                continue
            for entity in model_data['entities']:
                if entity['label'] not in self.medical_relevant_labels:
                    continue

                total += 1
                unique_lowered.add(entity['text'].lower())

                label_lower = entity['label'].lower()
                category = next(
                    (name for name, keywords in categories
                     if any(term in label_lower for term in keywords)),
                    fallback,
                )
                buckets[category].add(entity['text'])

        summary = {
            'total_medical_entities': total,
            'unique_medical_entities': len(unique_lowered),
        }
        # Sorted so the lists are reproducible; list(set(...)) ordering changes
        # from one interpreter run to the next.
        for name in buckets:
            summary[name] = sorted(buckets[name])

        return summary

# Driver: evaluates every available model on three clinical-note excerpts, then
# runs the detailed single-text analysis on the first excerpt and prints the
# report, a generated PubMed query and a medical-entity summary.
if __name__ == "__main__":
    # Three clinical-note excerpts used as evaluation inputs.
    i2b2_samples = [
        """The patient is a 72 year old white male who was transferred from Cay Memorial Hospital Of for cardiac catheterization and Electrophysiology Studies .
His cardiac risk factors include :
hypercholesterolemia , hypertension and insulin dependent diabetes mellitus .
He has a history of chest pain and in January 1993 underwent a cardiac catheterization at Ph University Of Medical Center which revealed an occluded right coronary artery and a 40-50% proximal stenosis .
He subsequently had an echocardiogram in December 1994 which showed normal left ventricular size and systolic function .""",

        """She had a liver function test and amylase and lipase postoperatively and she had a digoxin level of 1.0 on 06/04/05 .
The patient had a CBC on admission of 14.1 with a hematocrit of 33.8 .
Her CBC remained stable on 06/05/05 .
She had a white blood cell of 7.7 , hematocrit of 30.6 .
The patient had a MRSA nasal culture obtained on 06/03/05 , which revealed rare staphylococcus aureus .
The patient had a chest x-ray on admission , which was clear .
No pleural effusion or pneumothorax .""",

        """The patient is a 64-year-old male with a long standing history of peripheral vascular disease who has had multiple vascular procedures in the past including a fem-fem bypass , a left fem pop as well as bilateral TMAs and a right fem pop bypass who presents with a nonhealing wound of his left TMA stump as well as a pretibial ulcer that is down to the bone .
The patient was admitted to obtain adequate pain control and to have an MRI / MRA to evaluate any possible bypass procedures that could be performed ."""
    ]

    # Expected entity strings, one list per sample (presumably aligned by
    # position with i2b2_samples — confirm against evaluate_i2b2).
    # NOTE(review): "cardiac catheterization" appears twice in the first list,
    # and the third list has "non healing wound" while the sample text reads
    # "nonhealing wound" — confirm both are intentional.
    i2b2_expected_entities = [
        ["echocardiogram", "hypertension", "diabetes", "cardiac catheterization", "chest pain", "an occluded right coronary artery", "a 40-50% proximal stenosis",
         "coronary artery", "cardiac catheterization","electrophysiology studies", "hypercholesterolemia", "insulin dependent diabetes mellitus"],

        ["a digoxin level", "cbc", "chest x-ray", 'pleural effusion', 'liver function test', 'white blood cell', 'hematocrit', 'rare staphylococcus aureus',
         'mrsa nasal culture', 'amylase', 'lipase', 'pneumothorax'],

        ["bilateral tmas", "peripheral vascular disease", "fem-fem bypass", "right fem pop bypass",
         "left fem pop", "pain control", "vascular procedures", 'mri', 'mra', 'non healing wound', 'tma stump', 'bypass procedures']
    ]

    print("Initializing Transformer NER Analyzer...")
    # NOTE(review): device=-1 — the captured run output below reports CPU for
    # this setting; confirm against the TransformerNERAnalyzer constructor.
    analyzer = TransformerNERAnalyzer(
        min_entity_length=3,
        device=-1,
        aggregation_strategy="simple"
    )

    print(f"\nFound {len(analyzer.available_models)} available models")

    # Step 1: evaluate all models against the expected entities.
    print("\nRunning i2b2 evaluation on all available models...")
    # eval_results is not used again in this block; the report and export
    # below are produced through the analyzer's own methods.
    eval_results = analyzer.evaluate_i2b2(i2b2_samples, i2b2_expected_entities)

    eval_report = analyzer.generate_evaluation_report()
    print(eval_report)

    # Uses the method's default filename, which the message below repeats.
    analyzer.export_evaluation_results()
    print("\nEvaluation results exported to i2b2_evaluation_results.json")

    # Step 2: detailed analysis of the first sample only. generate_report,
    # generate_pubmed_query and get_medical_entities_summary all read
    # analyzer.results (presumably populated by analyze_text — confirm).
    print("\n\nRunning detailed analysis on first sample...")
    # results is not used again in this block.
    results = analyzer.analyze_text(i2b2_samples[0])

    report = analyzer.generate_report()
    print(report)

    pubmed_query = analyzer.generate_pubmed_query()
    print(f"\n\nGenerated PubMed Query:\n{pubmed_query}")

    # The summary's 'other_medical' list is not printed here.
    medical_summary = analyzer.get_medical_entities_summary()
    print(f"\n\nMedical Entities Summary:")
    print(f"- Total medical entities: {medical_summary['total_medical_entities']}")
    print(f"- Unique medical entities: {medical_summary['unique_medical_entities']}")
    print(f"- Diseases/Conditions: {len(medical_summary['diseases_conditions'])}")
    print(f"- Medications/Treatments: {len(medical_summary['medications_treatments'])}")
    print(f"- Anatomical Terms: {len(medical_summary['anatomical_terms'])}")
    print(f"- Procedures/Tests: {len(medical_summary['procedures_tests'])}")

Initializing Transformer NER Analyzer...


Device set to use cpu
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]


Found 16 available models

Running i2b2 evaluation on all available models...


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification w

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


i2b2 2010 RELATIONS CHALLENGE EVALUATION REPORT

Evaluation Date: 2025-07-04T07:35:49.256989
Number of Samples: 3
Models Evaluated: 16

OVERALL MODEL PERFORMANCE

                Model Precision Recall F1-Score  True Positives  False Positives  False Negatives  Exact Matches  Partial Matches
        GLiNER-medium     0.676  0.694    0.685              25               12               11              0                8
          GLiNER-base     0.667  0.667    0.667              24               12               12              0                9
         GLiNER-large     0.641  0.694    0.667              25               14               11              0                9
         GLiNER-multi     0.649  0.667    0.658              24               13               12              0               10
         GLiNER-small     0.710  0.611    0.657              22                9               14              0                8
     BioBERT-Diseases     0.500  0.167    0.250          

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cpu
Device set to use cpu
Device set to use cpu
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


TRANSFORMER NER ANALYSIS REPORT

Analysis Date: 2025-07-04T07:37:51.317894
Text Length: 587 characters
Device: CPU
Models Analyzed: 16

Total Entities Found: 103
Successful Models: 16/16

----------------------------------------------------------------------------------------------------
MODEL PERFORMANCE SUMMARY
----------------------------------------------------------------------------------------------------

                Model       Domain  Status  Entities  Unique  Labels Avg Confidence
BioClinicalBERT-Pablo     clinical Success         2       2       2          0.917
     Bio_ClinicalBERT     clinical Success         0       0       0          0.000
     BioBERT-Diseases     diseases Success         6       6       1          0.980
     BioBERT-Chemical    chemicals Success         0       0       0          0.000
      BioBERT-Genetic     genetics Success         1       1       1          1.000
         BioBERT-Base   biomedical Success         0       0       0          0