In [7]:
import spacy
import random
from spacy.matcher import Matcher, PhraseMatcher
from spacy.training import Example
import requests
import json
import os
import numpy as np
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer
from spacy import displacy
import pickle

In [9]:
#1
def load_data(filepath, pure_text=False):
    """
    Load the WNUT16 dataset, supporting both plain text and labeled modes.

    Parameters:
        filepath: Path to the data file.
        pure_text: If True, only load plain text; otherwise, load data with BIO annotations.
    Returns:
        sentences: List of sentences.
        labeled_sentences: List of labeled sentences in the format (text, {"entities": [(start, end, label)]}).
    """
    sentences = []
    labeled_sentences = []
    response = requests.get(filepath)
    blocks = response.text.strip().split('\n\n')
    
    for block in blocks:
        if not block.strip():
            continue
        lines = block.split('\n')
        tokens = []
        labels = []
        for line in lines:
            if line.strip() and '\t' in line:
                token, label = line.split('\t')
                tokens.append(token.strip())
                labels.append(label.strip())
            else:
                continue
        
        if not tokens or not labels:
            continue
        
        text = " ".join(tokens)
        if pure_text:
            sentences.append(text)
            labeled_sentences.append((text, {"entities": []}))
            continue
        
        char_positions = []
        current_pos = 0
        for token in tokens:
            char_positions.append(current_pos)
            current_pos += len(token) + 1
        
        entities = []
        seen_entities = set()
        start = None
        current_label = None
        for i in range(len(tokens)):
            label = labels[i]
            if label.startswith("B-"):
                if current_label is not None:
                    start_char = char_positions[start]
                    end_char = char_positions[i - 1] + len(tokens[i - 1])
                    entity_key = (start, i - 1, current_label)
                    if entity_key not in seen_entities:
                        seen_entities.add(entity_key)
                        entities.append((start_char, end_char, current_label))
                start = i
                current_label = label[2:]  
            elif label.startswith("I-") and current_label:
                expected_label = label[2:]
                if expected_label != current_label:
                    print(f"Warning: Mismatched I- label found for '{text}' at position {i}: Expected {current_label}, but got {expected_label}")
                continue
            else:
                if current_label is not None:
                    start_char = char_positions[start]
                    end_char = char_positions[i - 1] + len(tokens[i - 1])
                    entity_key = (start, i - 1, current_label)
                    if entity_key not in seen_entities:
                        seen_entities.add(entity_key)
                        entities.append((start_char, end_char, current_label))
                start = None
                current_label = None
        
        if current_label is not None:
            start_char = char_positions[start]
            end_char = char_positions[-1] + len(tokens[-1])
            entity_key = (start, len(tokens) - 1, current_label)
            if entity_key not in seen_entities:
                seen_entities.add(entity_key)
                entities.append((start_char, end_char, current_label))
        
        sentences.append(text)
        labeled_sentences.append((text, {"entities": entities}))
    
    return sentences, labeled_sentences
    
# 1.2 Sampling Function (Confidence-Based and Random Selection)
def sample_sentences(unlabeled_sentences, n_confidence, n_random, ner):
    """
    Mix confidence-based and random selection of sentences for active learning sampling.

    Parameters:
        unlabeled_sentences: List of unlabeled sentences.
        n_confidence: Number of sentences selected based on confidence.

        n_random: Number of sentences selected randomly.

        ner: NER model component.

    Returns:
        List of sampled sentences.
    """
    
    def compute_confidence_score(doc, ner):
        """
        Compute confidence score for uncertainty sampling """
        entity_scores = []
        try:
            doc_with_entities = ner(doc)
            spans = doc_with_entities.ents
            model_output = ner.model.predict([doc])
            logits = model_output.logits if hasattr(model_output, 'logits') else model_output
            if isinstance(logits, (list, tuple, np.ndarray)) and len(logits) > 0:
                logits = logits[0] if isinstance(logits[0], (list, tuple, np.ndarray)) else logits
            else:
                logits = np.ones((len(doc), len(ner.labels)))
            logits_np = np.asarray(logits)
            def numpy_softmax(x, axis=-1):
                exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
                return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
            probs = numpy_softmax(logits_np, axis=-1)
            for span in spans:
                probs_span = probs[span.start:span.end]
                max_prob = np.max(probs_span) if len(probs_span) > 0 else 1.0
                weighted_prob = max_prob * (span.end - span.start)
                entity_scores.append(weighted_prob)
        except Exception as e:
            entity_scores.append(1.0)
        return sum(entity_scores) / len(entity_scores) if entity_scores else 1.0

    random_samples = random.sample(unlabeled_sentences, min(n_random, len(unlabeled_sentences)))
    remaining_sentences = [s for s in unlabeled_sentences if s not in random_samples]
    
    scores = []
    for sentence in remaining_sentences:
        doc = nlp.make_doc(sentence)
        avg_score = compute_confidence_score(doc, ner)
        scores.append((sentence, avg_score))
    
    scores.sort(key=lambda x: x[1])
    print("\nTop 5 sentences with the lowest confidence:")
    for sentence, score in scores[:5]:
        print(f"Sentence: {sentence},  Confidence Score: {score}")
    confidence_samples = [s[0] for s in scores[:n_confidence]]
    return confidence_samples + random_samples


# 1.3 Import and Process Annotated Data
def import_annotated_data(nlp, annotated_file, priority, low_freq_categories=None):
    """ 
    Import and process annotated data, converting it into SpaCy format while avoiding forced ordering.

    Parameters: 
        nlp: SpaCy model.
        annotated_file: Path to the annotated file (Label Studio format). 
        priority: Dictionary of entity priorities (for reference only).
 
        low_freq_categories: List of low-frequency categories (prioritized for retention).

    Returns:
        List of newly annotated data in the format [(text, {"entities": [(start, end, label)]})].
    """
    low_freq_categories = low_freq_categories or []
    with open(annotated_file, "r", encoding="utf-8") as f:
        annotated_data = json.load(f)
    
    new_labeled_data = []
    total_entities = 0
    ignored_entities = 0
    for item in annotated_data:
        if "data" in item and "text" in item["data"]:
            text = item["data"]["text"]
            annotations = item.get("annotations", [{}])[0].get("result", [])
        elif "text" in item:
            text = item["text"]
            annotations = item.get("annotations", [{}])[0].get("result", [])
        else:
            raise ValueError(f"Failed to extract 'text' from annotated data, incorrect item format: {item}")

        doc = nlp.make_doc(text)
        entities_list = []
        for r in annotations:
            char_start = r["value"]["start"]
            char_end = r["value"]["end"]
            label = r["value"]["labels"][0]
            total_entities += 1
            span = doc.char_span(char_start, char_end, label=label, alignment_mode="expand")
            if span is not None:
                token_start = span.start
                token_end = span.end
                aligned_char_start = doc[token_start].idx
                aligned_char_end = doc[token_end-1].idx + len(doc[token_end-1].text) if token_end > token_start else aligned_char_start + len(doc[token_start].text)
                entities_list.append((aligned_char_start, aligned_char_end, label))
            else:
                print(f"Failed to convert character span, Sentence: {text}, Entity: {char_start, char_end, label}")
        
        # Sort by start and end positions, but do not sort by priority.
        entities_list.sort(key=lambda x: (x[0], x[1]))
        
        # Conflict resolution: Prioritize retaining entities of low-frequency categories to avoid forced removal.

        entities = []
        used_tokens = set()
        for char_start, char_end, label in entities_list:
            span = doc.char_span(char_start, char_end, label=label, alignment_mode="expand")
            if span is None:
                continue
            start = span.start
            end = span.end
            overlap = False
            for i in range(start, end):
                if i in used_tokens:
                    overlap = True
                    break
            # If there is an overlap, check if it belongs to a low-frequency category.

            if overlap:
                # If the current entity is a low-frequency category, prioritize its retention.
                if label in low_freq_categories:
                    # Remove previously conflicting entities and reassign tokens.
                    new_entities = []
                    new_used_tokens = set()
                    for e in entities:
                        e_span = doc.char_span(e[0], e[1], label=e[2], alignment_mode="expand")
                        if e_span is None:
                            continue
                        e_start = e_span.start
                        e_end = e_span.end
                        e_overlap = False
                        for j in range(e_start, e_end):
                            if j in range(start, end):
                                e_overlap = True
                                break
                        if not e_overlap or (e_overlap and e[2] not in low_freq_categories):
                            new_entities.append(e)
                            for j in range(e_start, e_end):
                                new_used_tokens.add(j)
                    entities = new_entities
                    used_tokens = new_used_tokens
                    overlap = False
            if not overlap:
                entities.append((char_start, char_end, label))
                for i in range(start, end):
                    used_tokens.add(i)
        
        try:
            biluo_tags = offsets_to_biluo_tags(doc, entities)
            valid_entities = []
            for char_start, char_end, label in entities:
                span = doc.char_span(char_start, char_end, label=label, alignment_mode="expand")
                if span and all(biluo_tags[i] != "-" for i in range(span.start, span.end)):
                    valid_entities.append((char_start, char_end, label))
                else:
                    print(f"Skipping misaligned entity, Sentence: '{text}', Entity: {char_start, char_end, label}")
                    ignored_entities += 1
            new_labeled_data.append((text, {"entities": valid_entities}))
        except ValueError as e:
            print(f"Error aligning entities, Sentence: '{text}': {e}")
            new_labeled_data.append((text, {"entities": []}))
    
    print(f"Loaded {len(new_labeled_data)} new labeled samples.")
    print(f"Total entities: {total_entities}, Ignored entities: {ignored_entities}, Ignored ratio: {ignored_entities/total_entities:.2%}")
    if new_labeled_data:
        print("Sample:", new_labeled_data[0])
    return new_labeled_data

# 1.4 Train the Model
def train_model(nlp, train_data, other_pipes, optimizer, epochs=30, dropout=0.15):
    """
    Train the model.

    Parameters:
        nlp: SpaCy model.

        train_data: Training data in the format [(text, {"entities": [(start, end, label)]})].

        other_pipes: Other pipelines to disable.

        optimizer: Optimizer.

        epochs: Number of training epochs.

        dropout: Dropout rate.

    Returns:
        The trained model.
    """

    print("Starting model training") 
    with nlp.disable_pipes(*other_pipes):
        for i in range(epochs):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                valid_entities = [(start, end, label) for start, end, label in annotations["entities"]
                                  if start >= 0 and end <= len(doc.text) and start < end]
                example = Example.from_dict(doc, {"entities": valid_entities})
                nlp.update([example], drop=dropout, sgd=optimizer, losses=losses)
            print(f" {i + 1} : {losses}")
    return nlp
    
# 1.5 Evaluate the Model
def evaluate_model(nlp, data, dataset_name="Dev"):
    """
    Evaluate model performance.

    Parameters:
        nlp: SpaCy model.

        data: Evaluation data in the format [(text, {"entities": [(start, end, label)]})].

        dataset_name: Name of the dataset (for printing purposes).

    Returns:

        Evaluation scores.
    """
    scorer = spacy.scorer.Scorer()
    examples = []
    for text, annotations in data:
        doc = nlp(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    scorer.score(examples) 
    print(f"\n{dataset_name} Metrics：")
    print(f"Overall：{scorer.scores}")
    print("\nPer Entity Type：")
    for entity_type, metrics in scorer.scores["ents_per_type"].items():
        print(f"{entity_type}: {metrics}")
    return scorer.scores

In [5]:
base_url = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/"
priority = {
    "company": 1, "facility": 2, "geo-loc": 3, "movie": 4, "musicartist": 5,
    "person": 6, "product": 7, "sportsteam": 8, "tvshow": 9, "other": 10
}

print("Preprocessing data")
files = ["train", "dev", "test"]
data = {}
for file in files:
    file_url = f"{base_url}{file}"
    sentences, labeled_sentences = load_data(file_url, pure_text=True)
    data[file] = sentences
print(f"train: {len(data['train'])}, dev: {len(data['dev'])}, test: {len(data['test'])}")

Preprocessing data
train: 2394, dev: 1000, test: 3850


In [7]:
print("\nCreating model")
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
entity_types = ["company", "facility", "geo-loc", "movie", "musicartist", "other", "person", "product", "sportsteam", "tvshow"]
for label in entity_types:
    ner.add_label(label)
nlp.initialize()
nlp.to_disk("initial_ner_model")
print("Initial model has been saved as 'initial_ner_model'")


Creating model
Initial model has been saved as 'initial_ner_model'


In [9]:
print("\nGenerating pseudo labels based on rules")
matcher = Matcher(nlp.vocab)
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("person", [[{"IS_UPPER": True}, {"IS_UPPER": True}]])
matcher.add("company", [[{"TEXT": {"REGEX": r"(Inc\.|Ltd\.|Corp\.)$"}}]])
matcher.add("facility", [[{"IS_UPPER": True}, {"LOWER": {"IN": ["stadium", "airport", "museum"]}}]])


geo_loc_dict = ["new york", "london", "tokyo", "shanghai", "paris", "los angeles", "california", "texas", "florida", "chicago"]
movie_dict = ["the matrix", "titanic", "inception", "star wars", "the godfather", "pulp fiction", "the avengers"]


phrase_matcher.add("geo-loc", [nlp.make_doc(loc) for loc in geo_loc_dict])
phrase_matcher.add("movie", [nlp.make_doc(movie) for movie in movie_dict])


excluded_tokens = ["RT", ":D", "...", "ALL", "FML", "KK", "TIME", "LOW", "I", "WARNING", "do", "IN", "MY", "IS", "DONE"]

sample_size = 100
sampled_sentences = random.sample(data["train"], min(sample_size, len(data["train"])))
train_data = []

for sentence in sampled_sentences:
    doc = nlp(sentence)
    entities_with_priority = []
    for match_id, start, end in matcher(doc):
        label = nlp.vocab.strings[match_id]
        if any(token in excluded_tokens for token in doc[start:end].text.split()):
            continue
        entities_with_priority.append((start, end, label, priority[label]))
    for match_id, start, end in phrase_matcher(doc):
        label = nlp.vocab.strings[match_id]
        entities_with_priority.append((start, end, label, priority[label]))
    
    entities_with_priority.sort(key=lambda x: (x[0], x[1], x[3]))
    entities = []
    used_tokens = set()
    for start, end, label, _ in entities_with_priority:
        if start < 0 or end > len(doc) or start >= end:
            continue
        overlap = any(i in used_tokens for i in range(start, end))
        if not overlap:
            char_start = doc[start].idx
            char_end = doc[end-1].idx + len(doc[end-1].text) if end > start else char_start + len(doc[start].text)
            if char_end > len(doc.text):
                char_end = len(doc.text)
            span = doc.char_span(char_start, char_end, label=label, alignment_mode="strict")
            if span:
                entities.append((char_start, char_end, label))
                for i in range(start, end):
                    used_tokens.add(i)
    try:
        biluo_tags = offsets_to_biluo_tags(doc, entities)
        valid_entities = [(char_start, char_end, label) for char_start, char_end, label in entities
                          if char_start >= 0 and char_end <= len(doc.text) and char_start < char_end and
                          all(biluo_tags[i] != "-" for i in range(doc.char_span(char_start, char_end, label=label, alignment_mode="expand").start, 
                              doc.char_span(char_start, char_end, label=label, alignment_mode="expand").end))]
        train_data.append((sentence, {"entities": valid_entities}))
    except ValueError as e:
        print(f"Error aligning entities, Sentence: '{sentence}': {e}")
        train_data.append((sentence, {"entities": []}))

with open("initial_pseudo_labels.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False)
print(f"Generated {len(train_data)} pseudo-labeled samples.")



Generating pseudo labels based on rules
Generated 100 pseudo-labeled samples.


In [11]:
print("\nInitial Training")
nlp = spacy.load("initial_ner_model")
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
optimizer = nlp.begin_training()
nlp = train_model(nlp, train_data, other_pipes, optimizer, epochs=10)
nlp.to_disk("initial_trained_model")
unlabeled_sentences = [s for s in data["train"] if s not in [t[0] for t in train_data]]
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False)
with open("unlabeled_sentences.json", "w", encoding="utf-8") as f:
    json.dump(unlabeled_sentences, f, ensure_ascii=False)
print("Initial training completed. Model saved as  'initial_trained_model'")


Initial Training
Starting model training
 1 : {'ner': 211.45419476459526}
 2 : {'ner': 49.13733175813346}
 3 : {'ner': 47.775095690906895}
 4 : {'ner': 8.012485581540707}
 5 : {'ner': 7.504210933557242}
 6 : {'ner': 6.668474730856127}
 7 : {'ner': 3.5153146960329082}
 8 : {'ner': 0.860901114254744}
 9 : {'ner': 1.9686555860688217}
 10 : {'ner': 0.005218327837362067}
Initial training completed. Model saved as  'initial_trained_model'


In [13]:
def run_first_iteration(iteration, model_name, export_file, n_confidence, n_random , epochs=15, dropout = 0.3):
    """
    First Iteration: Sampling and exporting samples (without manual annotation file)
    
    Parameters:
        iteration: Iteration number (used for printing steps)
        model_name: Current model name (e.g., "initial_trained_model")
        export_file: File name for exported samples (e.g., "manual_samples_first.json")
        n_confidence: Number of confidence-based samples
        n_random: Number of randomly selected samples
    
    Returns:
        None
    """
    print(f"\nStep {2*iteration+5}：Active Learning Iteration {iteration} - Exporting data")
    global nlp, train_data, unlabeled_sentences
    
    # Load model
    nlp = spacy.load(model_name)
    ner = nlp.get_pipe("ner")
    
    # Load train_data and unlabeled_sentences
    with open("train_data.json", "r", encoding="utf-8") as f:
        train_data = json.load(f)
    with open("unlabeled_sentences.json", "r", encoding="utf-8") as f:
        unlabeled_sentences = json.load(f)
    
    # Sample sentences
    samples = sample_sentences(unlabeled_sentences, n_confidence, n_random, ner)  
    with open(export_file, "w", encoding="utf-8") as f:
        json.dump([{"text": s} for s in samples], f, ensure_ascii=False)
    unlabeled_sentences = [s for s in unlabeled_sentences if s not in samples]
    with open("train_data.json", "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False)
    with open("unlabeled_sentences.json", "w", encoding="utf-8") as f:
        json.dump(unlabeled_sentences, f, ensure_ascii=False)
    print(f"Exported {n_confidence + n_random} samples to '{export_file}'. Please annotate manually.")

In [15]:

def run_subsequent_iteration(iteration, model_name, export_file, annotated_file, n_confidence, n_random, save_model_name, epochs=30, dropout=0.15, low_freq_categories=None, dev_url=None, target_f1=85.0, max_iterations=10, f1_history=None):
    """
    Subsequent Iterations: Sampling, importing annotated data, training (single iteration), and evaluating performance.

    Parameters:
    iteration: Iteration number (used for printing steps)
    model_name: Current model name (e.g., "initial_trained_model" or "active_learning_ner_model_iteration_1")
    export_file: File name for exported samples (e.g., "uncertain_samples_iteration_1.json")
    annotated_file: Manually annotated file name (e.g., "manual_annotated_first.json" or "annotated_samples_iteration_1.json")
    n_confidence: Number of confidence-based samples
    n_random: Number of randomly selected samples
    save_model_name: Name of the saved model (e.g., "active_learning_ner_model_iteration_1")
    epochs: Number of training epochs
    """
    print(f"\nStep {iteration}：Active Learning Iteration {iteration} - Sampling, Training")
    global nlp, train_data, unlabeled_sentences
    
    # Load model
    nlp = spacy.load(model_name)
    ner = nlp.get_pipe("ner")
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    
    # Load optimizer state
    optimizer_file = f"{model_name}_optimizer.pkl"
    if os.path.exists(optimizer_file):
        with open(optimizer_file, "rb") as f:
            optimizer = pickle.load(f)
        print(f"Loaded optimizer state from '{optimizer_file}'.")
    else:
        optimizer = nlp.begin_training()
        optimizer.learn_rate = 0.0001
        print("Initialized new optimizer with learn_rate=0.0001.")
    
    # Load train_data and unlabeled_sentences
    with open("train_data.json", "r", encoding="utf-8") as f:
        train_data = json.load(f)
    with open("unlabeled_sentences.json", "r", encoding="utf-8") as f:
        unlabeled_sentences = json.load(f)

    # Sample sentences
    samples = sample_sentences_subsequent(unlabeled_sentences, n_confidence, n_random, ner, low_freq_categories=low_freq_categories)
    with open(export_file, "w", encoding="utf-8") as f:
        json.dump([{"text": s} for s in samples], f, ensure_ascii=False)
    unlabeled_sentences = [s for s in unlabeled_sentences if s not in samples]
    with open("unlabeled_sentences.json", "w", encoding="utf-8") as f:
        json.dump(unlabeled_sentences, f, ensure_ascii=False)
    print(f"Exported {n_confidence + n_random} samples to '{export_file}'. Please annotate '{annotated_file}'，然后继续下一次迭代。")
    
    # Check if annotated_file exists
    if not os.path.exists(annotated_file):
        raise FileNotFoundError(f"Annotated file '{annotated_file}' not found. Please ensure it has been annotated.")
    
    # Load new annotated data
    new_labeled_data = import_annotated_data(nlp, annotated_file, priority, low_freq_categories=low_freq_categories if low_freq_categories is not None else [])
    train_data.extend(new_labeled_data)
    
    # Train model
    print(f"Training data size:  {len(train_data)}")
    nlp = train_model(nlp, train_data, other_pipes, optimizer, epochs=epochs, dropout=dropout)
    
    # Save model and optimizer state
    nlp.to_disk(save_model_name)
    with open(f"{save_model_name}_optimizer.pkl", "wb") as f:
        pickle.dump(optimizer, f)
    with open("train_data.json", "w", encoding="utf-8") as f:
        json.dump(train_data, f, ensure_ascii=False)
    with open("unlabeled_sentences.json", "w", encoding="utf-8") as f:
        json.dump(unlabeled_sentences, f, ensure_ascii=False)
    print(f"Iteration {iteration} completed: Model saved as '{save_model_name}'。")
    return save_model_name, True
    

In [39]:
def sample_sentences_subsequent(unlabeled_sentences, n_confidence, n_random, ner, low_freq_categories=None):
    """
    Mix confidence-based and random sentence sampling, with support for prioritizing low-frequency categories.

    Parameters:
        unlabeled_sentences: List of unlabeled sentences
        n_confidence: Number of confidence-based samples
        n_random: Number of randomly selected samples
        ner: NER model component

    Returns:
        List of selected samples
    """
    def compute_confidence_score(doc, ner):
        entity_scores = []
        entities_detected = []
        try:
            doc_with_entities = ner(doc)
            spans = doc_with_entities.ents
            model_output = ner.model.predict([doc])
            logits = model_output.logits if hasattr(model_output, 'logits') else model_output
            if isinstance(logits, (list, tuple, np.ndarray)) and len(logits) > 0:
                logits = logits[0] if isinstance(logits[0], (list, tuple, np.ndarray)) else logits
            else:
                logits = np.ones((len(doc), len(ner.labels)))
            logits_np = np.asarray(logits)
            def numpy_softmax(x, axis=-1):
                exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
                return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
            probs = numpy_softmax(logits_np, axis=-1)
            for span in spans:
                probs_span = probs[span.start:span.end]
                max_prob = np.max(probs_span) if len(probs_span) > 0 else 1.0
                weighted_prob = max_prob * (span.end - span.start)
                entity_scores.append(weighted_prob)
                entities_detected.append(span.label_)
        except Exception as e:
            entity_scores.append(1.0)
        return sum(entity_scores) / len(entity_scores) if entity_scores else 1.0, entities_detected

    # Random sampling
    random_samples = random.sample(unlabeled_sentences, min(n_random, len(unlabeled_sentences)))
    remaining_sentences = [s for s in unlabeled_sentences if s not in random_samples]
    
    # Prioritize sentences containing low-frequency categories
    low_freq_samples = []
    other_samples = []
    
    for sentence in remaining_sentences:
        doc = nlp.make_doc(sentence)
        avg_score, entities_detected = compute_confidence_score(doc, ner)
        # Check if the sentence contains entities from low-frequency categories
        if low_freq_categories and any(entity in low_freq_categories for entity in entities_detected):
            low_freq_samples.append((sentence, avg_score))
        else:
            other_samples.append((sentence, avg_score))
    
    # Sort by confidence score
    low_freq_samples.sort(key=lambda x: x[1])  
    other_samples.sort(key=lambda x: x[1])
    
    # Prioritize sampling sentences from low-frequency categories
    n_low_freq = min(len(low_freq_samples), int(n_confidence * 0.8)) 
    n_other = n_confidence - n_low_freq
    
    scores = low_freq_samples + other_samples
    print("\nTop 5 sentences with the lowest confidence：")
    for sentence, score in scores[:5]:
        print(f"Sentence: {sentence}, Confidence Score: {score}")
    
    confidence_samples = [s[0] for s in low_freq_samples[:n_low_freq]] + [s[0] for s in other_samples[:n_other]]
    
    if low_freq_categories:
        target_min_count = 30  
        category_counts = {cat: 0 for cat in low_freq_categories}
        
        for sentence in confidence_samples:
            doc = nlp.make_doc(sentence)
            doc_with_entities = ner(doc)
            for ent in doc_with_entities.ents:
                if ent.label_ in low_freq_categories:
                    category_counts[ent.label_] += 1
        
        print("\nLow-Frequency Category Sampling Statistics:")
        for cat, count in category_counts.items():
            print(f"{cat}: {count} samples")
            if count < target_min_count:
                print(f"Warning: The number of samples for category {cat} ({count}) did not reach the target ({target_min_count})，. Resampling is recommended.")
    
    return confidence_samples + random_samples



In [19]:
run_first_iteration(
    iteration=1,
    model_name="initial_trained_model",
    export_file="manual_samples_1.json",
    n_confidence=10,
    n_random=90,
    epochs=20,
    dropout=0.3
)


Step 7：Active Learning Iteration 1 - Exporting data

Top 5 sentences with the lowest confidence:
Sentence: Chicago Blackhawks general manager Stan Bowman said Cristobal Huet will be gone in two weeks ... http://fan.ac/Ydv #NHL,  Confidence Score: 0.1
Sentence: HAVE YOU HEARD DJ STRATEGY IS NOW AT VISIONS LOUNGE IN HICKORY , NC ON WEDNESDAY ; S NIGHTS ... WOW !! DONT MISS IT !,  Confidence Score: 0.1
Sentence: @MakeDaPussyDrip ALL DAY,  Confidence Score: 0.1
Sentence: just whooped st . francis preps asssss . and i scored a goal :) and its friday . and i have no homeworkkk . SICK LIFEEEE,  Confidence Score: 0.1
Sentence: RT @TheOutlawz : EVERYBODY WHO WANT A FOLLOW FROM ME FOR FOLLOW FRIDAY LET ME KNOW !! WE AINT LIKE THESE HOLLYWOOD AZZ RAPPERS WE FOLLOW BACK !,  Confidence Score: 0.13999999999999999
Exported 100 samples to 'manual_samples_1.json'. Please annotate manually.


In [21]:
run_subsequent_iteration(
    iteration=2,
    model_name="initial_trained_model",
    export_file="manual_samples_2.json",
    annotated_file="manual_annotated_05_01.json",
    n_confidence=10,
    n_random=90,
    save_model_name="active_learning_ner_model_iteration_1",
    epochs=20,
    dropout=0.3
)
###1-2


Step 2：Active Learning Iteration 2 - Sampling, Training
Initialized new optimizer with learn_rate=0.0001.

Top 5 sentences with the lowest confidence：
Sentence: @SammieLynnsMom @tg10781 they will be all done by Sunday trust me *wink*, Confidence Score: 1.0
Sentence: Made it back home to GA . It sucks not to be at Disney world , but its good to be home . Time to start planning the next Disney World trip ., Confidence Score: 1.0
Sentence: ' Breaking Dawn ' Returns to Vancouver on January 11th http://bit.ly/dbDMs8, Confidence Score: 1.0
Sentence: @ls_n perhaps , but folks may find something in the gallery that is helpful in their day-to-day work as well . Even just to use it ., Confidence Score: 1.0
Sentence: @Carr0t aye been tonight - excellent, Confidence Score: 1.0
Exported 100 samples to 'manual_samples_2.json'. Please annotate 'manual_annotated_05_01.json'，然后继续下一次迭代。
Loaded 100 new labeled samples.
Total entities: 81, Ignored entities: 0, Ignored ratio: 0.00%
Sample: ('Chicago Black

('active_learning_ner_model_iteration_1', True)

In [23]:
run_subsequent_iteration(
    iteration=3,
    model_name="active_learning_ner_model_iteration_1",
    export_file="manual_samples_3.json",
    annotated_file="manual_annotated_05_02.json",
    n_confidence=30,
    n_random=70,
    save_model_name="active_learning_ner_model_iteration_2",
    epochs=20,
    dropout=0.3
)
###2-3


Step 3：Active Learning Iteration 3 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_1_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: CLUB BLU tonite ...... 90 's music .. oldskool night wiith dj finese, Confidence Score: 0.1
Sentence: Jan Brewer : Beheadings ? http://bit.ly/a2LBMP, Confidence Score: 0.1
Sentence: A Twisted two nighter in London NOVEMBER 18 ... Younger Brother ( Live) , Shpongle ( Simon Posford DJ set ) and other ... http://fb.me/Hqwb7kzG, Confidence Score: 0.1
Sentence: I just remembered this week is The Heatwave 7th birthday ... meant to make tonight 's Brixton bashment our birthday party but I forgot LOL, Confidence Score: 0.1
Sentence: Costa Rican group CocoFunka power this week 's Indiesent Exposure http://ht.ly/2G4nS by @fuseboxradio on @planetill, Confidence Score: 0.1
Exported 100 samples to 'manual_samples_3.json'. Please annotate 'manual_annotated_05_02.json'，然后继续下一次迭代。
Loaded 100 new labeled sample

('active_learning_ner_model_iteration_2', True)

In [25]:
run_subsequent_iteration(
    iteration=4,
    model_name="active_learning_ner_model_iteration_2",
    export_file="manual_samples_4.json",
    annotated_file="manual_annotated_05_03.json",
    n_confidence=40,
    n_random=60,
    save_model_name="active_learning_ner_model_iteration_3",
    epochs=20,
    dropout=0.3
)
###3-4


Step 4：Active Learning Iteration 4 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_2_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: right . Five Star Day came out in may this year . why haven't ANY trailers come out here in Scotland yet ? i've waited about a year for this !, Confidence Score: 0.1
Sentence: Justice Breyer 's About Face : Koran-Burning Is Constitutionally Protected After All via Atlas Shrugs http://tinyurl.com/39wg73o, Confidence Score: 0.1
Sentence: http://bit.ly/aTTQYq When Pepsi to ring usually confirm to , winning a Nokia 5800 ?, Confidence Score: 0.1
Sentence: Your paycheck may shrink next year , at least temporarily , if a vote on extending the Bush tax cuts slips past November . http://bit.ly/9y8fdS, Confidence Score: 0.1
Sentence: we need some time alone we need to let it BREATH, Confidence Score: 0.1
Exported 100 samples to 'manual_samples_4.json'. Please annotate 'manual_annotated_05_03.json'，然后继续下一

('active_learning_ner_model_iteration_3', True)

In [27]:
run_subsequent_iteration(
    iteration=5,
    model_name="active_learning_ner_model_iteration_3",
    export_file="manual_samples_5.json",
    annotated_file="manual_annotated_05_04.json",
    n_confidence=60,
    n_random=60,
    save_model_name="active_learning_ner_model_iteration_4",
    epochs=20,
    dropout=0.25
)
###4-5


Step 5：Active Learning Iteration 5 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_3_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: 'On set filming Chris " Rainy Days " video . This is the final scene then we can finally edit it .', Confidence Score: 0.1
Sentence: Just changed my twitter background , check it out ! Found it at http://twitrounds.com .. September 17 , 2010 , 2:52 pm, Confidence Score: 0.1
Sentence: RT @eljmayes : Ladbrokes Labour Leadership Market- http://bit.ly/cD3Rn8 Ed Miliband 's odds have shortened significantly in the last week ., Confidence Score: 0.1
Sentence: #news Dems to voters : You may hate us , but GOP is worse ( AP ) ( Yahoo ! ) : Share With Friends : | Latest Top Ne ... http://adpro.co/aQxQtY, Confidence Score: 0.1
Sentence: Kristen is MY LIFE . She made me who i am today , I was a fucker before her , she fixed me . And so did Robert ., Confidence Score: 0.1
Exported 120 samples to 'manual_sam

('active_learning_ner_model_iteration_4', True)

In [29]:
run_subsequent_iteration(
    iteration=6,
    model_name="active_learning_ner_model_iteration_4",
    export_file="manual_samples_6.json",
    annotated_file="manual_annotated_05_05.json",
    n_confidence=90,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_5",
    epochs=20,
    dropout=0.25
)
###5-6


Step 6：Active Learning Iteration 6 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_4_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: George N . Parks , UMass band director , dies after performance in Ohio : George N . Parks , for 33 years the dire ... http://tinyurl.com/2femvgq, Confidence Score: 0.1
Sentence: MD Jobs | Executive Pharmacist-Full Time Float-Ellicott City , MD Job ( MD ) http://bit.ly/daNH6v #Job #Hiring #MDJobs, Confidence Score: 0.1
Sentence: Football game tonight with mariaelena , sarah , and brittany ! Mood : excited !! GOO WB WILDCATS !, Confidence Score: 0.1
Sentence: RT @Quotealicious : Today , I saw a guy driving a Pepsi truck , drinking a Coke . MLIA #Quotealicious, Confidence Score: 0.1
Sentence: Aggressive Kids With ADHD May Not Need Antipsychotic Meds http://t.co/JfGm0uH, Confidence Score: 0.1
Exported 120 samples to 'manual_samples_6.json'. Please annotate 'manual_annotated_05_05.json'，然后继续下一次迭代。


('active_learning_ner_model_iteration_5', True)

In [31]:
run_subsequent_iteration(
    iteration=7,
    model_name="active_learning_ner_model_iteration_5",
    export_file="manual_samples_7.json",
    annotated_file="manual_annotated_05_06.json",
    n_confidence=100,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_6",
    epochs=20,
    dropout=0.2
)
###6-7


Step 7：Active Learning Iteration 7 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_5_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: @LinnySmit Linny Linny Linny . U are something lady ... Well that story has been on hold for weeks now hasn't it ? I want to get on it this &gt; &gt;, Confidence Score: 0.1
Sentence: The last time I chatted with Kyle was June . The last time we emailed was August ., Confidence Score: 0.1
Sentence: :( RT @themaine Who is coming to the show tomorrow in Hawaii ?, Confidence Score: 0.1
Sentence: Dems to voters : You may hate us , but GOP is worse ( AP ) AP - With just six weeks to avoid a possible election catastrophe , http://tiny.ly/wc5, Confidence Score: 0.1
Sentence: Don Mattingly will replace Joe Torre as LA Dodgers manager after this season, Confidence Score: 0.1
Exported 130 samples to 'manual_samples_7.json'. Please annotate 'manual_annotated_05_06.json'，然后继续下一次迭代。
Loaded 120 new labeled sa

('active_learning_ner_model_iteration_6', True)

In [33]:
run_subsequent_iteration(
    iteration=8,
    model_name="active_learning_ner_model_iteration_6",
    export_file="manual_samples_8.json",
    annotated_file="manual_annotated_05_07.json",
    n_confidence=120,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_7",
    epochs=20,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)
###7-8


Step 8：Active Learning Iteration 8 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_6_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: 'Come to " MRB @ 61 Roadhouse BBQ " Sunday , October 10 from 12:00 pm to 4:00 pm . We are playing for the 61 Roadhouse ... http://fb.me/G1FbJGw1', Confidence Score: 0.1
Sentence: Come to " 6th Biannual 24 Hour Prayer Focus " Saturday , November 13 from 10:00 am to 1:00 pm . Mark the Date !!! http://fb.me/JyYXPmql, Confidence Score: 0.1
Sentence: Hurry up ! Santy will be Leaving in 2 days ! - #Eskorte and #Massasje i #Norge, Confidence Score: 0.1
Sentence: Power nap . I need it . Its been a stressful week . I'm excited for xmas , haha . Goodnight . :), Confidence Score: 0.1
Sentence: A Few Clouds and 69 F at Islip , Long Island Mac Arthur Airport , NY Winds are North at 16.1 MPH ( 14 KT) . The pressure is http://s1z.us/vf.htm, Confidence Score: 0.1

Low-Frequency Category Sampling Statistics:
co

('active_learning_ner_model_iteration_7', True)

In [35]:
run_subsequent_iteration(
    iteration=9,
    model_name="active_learning_ner_model_iteration_7",
    export_file="manual_samples_9.json",
    annotated_file="manual_annotated_05_08.json",
    n_confidence=120,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_8",
    epochs=20,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)
###8-9


Step 9：Active Learning Iteration 9 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_7_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: First Day of Autumn Networking Mixer at Dick 's Carpet next T ... http://conta.cc/ap95gL via #constantcontact, Confidence Score: 0.1
Sentence: Check this video out -- Three Days Grace - Break ( Official Music Video ) [ HQ ] http://t.co/GOwCLQJ via @youtube, Confidence Score: 0.1
Sentence: Game tonight with Tessa . I get to see Seanzie if he goes :) . I get to see my boyfriend too &lt; 3 c :, Confidence Score: 0.1
Sentence: @Loserface_Laura when mike lets me know , I will let you know . I mean everyone might just switch out a lot ., Confidence Score: 0.1
Sentence: CAFE NINE TONIGHT AT 11 !!, Confidence Score: 0.1

Low-Frequency Category Sampling Statistics:
company: 4 samples
facility: 5 samples
geo-loc: 5 samples
movie: 0 samples
musicartist: 1 samples
person: 22 samples
product: 3 samples
spor

('active_learning_ner_model_iteration_8', True)

In [37]:
run_subsequent_iteration(
    iteration=10,
    model_name="active_learning_ner_model_iteration_8",
    export_file="manual_samples_10.json",
    annotated_file="manual_annotated_05_09.json",
    n_confidence=120,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_9",
    epochs=15,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)
###9-10


Step 10：Active Learning Iteration 10 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_8_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: i want a bath but do n't have a bath , shut up , sam 's coming tomorrow and steve and tanya will be round at 10am so go away you mean people, Confidence Score: 0.1
Sentence: RT @TeamShaneDawson : RT If you think @AntiShaneDawson is a 10 year old without a penis to play with so they hate on Shane instead :( :' ), Confidence Score: 0.1
Sentence: @KSSchro When I run over an animal I think of Disney movies &amp; that I just ran over someones brother , mom or dad . The full life montage helps, Confidence Score: 0.1
Sentence: @DebVRuns hi pal !! How 's Hawaii ?? when are you heading home !, Confidence Score: 0.1
Sentence: O happy day . My local grocery carries Hoegaarden ., Confidence Score: 0.1

Low-Frequency Category Sampling Statistics:
company: 1 samples
facility: 0 samples
geo-loc: 3 samples
m

('active_learning_ner_model_iteration_9', True)

In [41]:
run_subsequent_iteration(
    iteration=11,
    model_name="active_learning_ner_model_iteration_9",
    export_file="manual_samples_11.json",
    annotated_file="manual_annotated_05_10.json",
    n_confidence=100,
    n_random=50,
    save_model_name="active_learning_ner_model_iteration_10",
    epochs=15,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)
###10-11


Step 11：Active Learning Iteration 11 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_9_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: @vogueglamGIRL Ah I know ! She is simply the best in The Sept Issue . My boyfriend 's aunt worked for Anna Wintor in NY ., Confidence Score: 0.1
Sentence: Fuckk man ! I fuckinn missed @yelyahwilliams concert :( #shizz ! I wonder if they're coming to Arizona next year ??, Confidence Score: 0.1
Sentence: Blegh , that fell through . Staffing agency had a miscommunication w\the client . So two week temp assignment fell through unfortunately ., Confidence Score: 0.1
Sentence: Anja Rubik Model in Lingerie of the Day http://f.ast.ly/DCaEM, Confidence Score: 0.1
Sentence: @StarryEyedJoeJ like after I download a PSD from @ CherryPSDs what do I do ?!, Confidence Score: 0.1

Low-Frequency Category Sampling Statistics:
company: 3 samples
facility: 3 samples
geo-loc: 4 samples
movie: 1 samples
musicartist

('active_learning_ner_model_iteration_10', True)

In [43]:
run_subsequent_iteration(
    iteration=12,
    model_name="active_learning_ner_model_iteration_10",
    export_file="manual_samples_12.json",
    annotated_file="manual_annotated_05_11.json",
    n_confidence=90,
    n_random=10,
    save_model_name="active_learning_ner_model_iteration_11",
    epochs=15,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)
###11-12


Step 12：Active Learning Iteration 12 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_10_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: RT @dropolo Headed to da gump today alabama here I come &lt; &lt; come to shut it down broski .. fuck wit me..Parlae 's in house producer, Confidence Score: 0.1
Sentence: @cprieboy At the end of the year ? Sometime in May . They are just starting at the beginning of August now instead of the middle ., Confidence Score: 0.1
Sentence: Amazing . And encouraging . RT @SusannahFox : 36,000 #bluebutton downloads for VA health records in 1st 10 days !, Confidence Score: 0.1
Sentence: The pope isn't really making much of an effort . He 's wearing the same clothes as yesterday ., Confidence Score: 0.1
Sentence: ' Free ' ' Day 26 ' '' #nowplaying http://cpwr.me/c9GNpt, Confidence Score: 0.1

Low-Frequency Category Sampling Statistics:
company: 0 samples
facility: 0 samples
geo-loc: 3 samples
movie: 0 

('active_learning_ner_model_iteration_11', True)

In [45]:
run_subsequent_iteration(
    iteration=13,
    model_name="active_learning_ner_model_iteration_11",
    export_file="manual_samples_13.json",
    annotated_file="manual_annotated_05_12.json",
    n_confidence=100,
    n_random=20,
    save_model_name="active_learning_ner_model_iteration_12",
    epochs=10,
    dropout=0.2,
    low_freq_categories =  ["company", "facility", "geo-loc", "movie", "musicartist",
    "person", "product", "sportsteam", "tvshow", "other"]
)


Step 13：Active Learning Iteration 13 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_11_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: to all my girls in london or travelling up tomorrow i love and miss you wish i could be there ! NEXT YEAR I WILL !, Confidence Score: 0.1
Sentence: RT @IlliniCampusRec : Orange &amp; Blue Skate tonight , 7:30 - 9:30 pm ! Free for UI faculty/staff &amp; their immediate families . http://bit.ly/c1M8dr, Confidence Score: 0.1
Sentence: I heart Park(ing ) Day ! Photo of meter shark : http://bit.ly/aQun1b, Confidence Score: 0.1
Sentence: RT @Sexstrology : Pisces tend to escape into fantasy and day dreams . There they are free . &lt; yesss #wavesindexfingerintheair #AGREED ! ^10thpower, Confidence Score: 0.1
Sentence: Gotta call mom 2 let her know it 's almost 3 . Yli gets out in 15 . Mom took the antenna home today &lt; connected it at home , finally ., Confidence Score: 0.1

Low-Frequency Categor

('active_learning_ner_model_iteration_12', True)

In [47]:
run_subsequent_iteration(
    iteration=14,
    model_name="active_learning_ner_model_iteration_12",
    export_file="manual_samples_14.json",
    annotated_file="manual_annotated_05_13.json",
    n_confidence=120,
    n_random=30,
    save_model_name="active_learning_ner_model_iteration_13",
    epochs=10,
    dropout=0.2,
)
###13-14


Step 14：Active Learning Iteration 14 - Sampling, Training
Loaded optimizer state from 'active_learning_ner_model_iteration_12_optimizer.pkl'.

Top 5 sentences with the lowest confidence：
Sentence: dear youtube why does it take so long to upload a video gggrrrrr, Confidence Score: 0.1
Sentence: It 's the fescue turf grass aeration &amp; overseeding season for Atlanta &amp; N . Georgia . Do some evaluation &amp; planning beforehand , there 's time, Confidence Score: 0.1
Sentence: Any 1 know when season 3 True Blood is back on the telly ?, Confidence Score: 0.1
Sentence: last day of sorting pope visit to birmingham stuff out ..... hope it goes ok on sunday !!, Confidence Score: 0.1
Sentence: @joejonas @nickjonas @kevinjonas @papajonas @greggarbo @johnlloydtaylor Rock to SECTION 204 tonight !!!!, Confidence Score: 0.1
Exported 150 samples to 'manual_samples_14.json'. Please annotate 'manual_annotated_05_13.json'，然后继续下一次迭代。
Loaded 120 new labeled samples.
Total entities: 34, Ignored entiti

('active_learning_ner_model_iteration_13', True)

In [3]:
def evaluate_model_on_datasets(model, datasets, dataset_names):
    results = {}
    for dataset, name in zip(datasets, dataset_names):
        scorer = spacy.scorer.Scorer()
        examples = []
        for text, annotations in dataset:
            doc = model(text)
            example = Example.from_dict(doc, annotations)
            examples.append(example)
        scores = scorer.score(examples)
        
        # Print performance metrics
        print(f"\n{name} Dataset Performance Metrics:")
        print(f"Overall Metrics: Precision={scores['ents_p']:.2f}, Recall={scores['ents_r']:.2f},F1={scores['ents_f']:.2f}")
        print("\nPer Entity Type:")
        for entity_type, metrics in scores["ents_per_type"].items():
            print(f"{entity_type}: Precision={metrics['p']:.2f}, Recall={metrics['r']:.2f}, F1={metrics['f']:.2f}")
        
        # Save results
        results[name] = scores
    
    return results

In [21]:
base_url = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/wnut16/data/"
train_url = f"{base_url}train"
dev_url = f"{base_url}dev"
test_url = f"{base_url}test"

_, train_data = load_data(train_url)
_, dev_data = load_data(dev_url)
_, test_data = load_data(test_url)

model = spacy.load("active_learning_ner_model_iteration_10")

datasets = [train_data, dev_data, test_data]
dataset_names = ["Train", "Dev", "Test"]
results = evaluate_model_on_datasets(model, datasets, dataset_names)

with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False)
print("Test results have been saved to 'evaluation_results.json'。")



Train Dataset Performance Metrics:
Overall Metrics: Precision=0.96, Recall=0.76,F1=0.85

Per Entity Type:
geo-loc: Precision=0.97, Recall=0.76, F1=0.85
facility: Precision=0.97, Recall=0.71, F1=0.82
movie: Precision=0.96, Recall=0.76, F1=0.85
company: Precision=0.97, Recall=0.81, F1=0.88
product: Precision=0.96, Recall=0.84, F1=0.90
person: Precision=0.98, Recall=0.78, F1=0.87
other: Precision=0.92, Recall=0.76, F1=0.83
sportsteam: Precision=0.95, Recall=0.75, F1=0.84
tvshow: Precision=0.91, Recall=0.62, F1=0.74
musicartist: Precision=0.93, Recall=0.69, F1=0.79

Dev Dataset Performance Metrics:
Overall Metrics: Precision=0.30, Recall=0.12,F1=0.18

Per Entity Type:
other: Precision=0.18, Recall=0.10, F1=0.13
company: Precision=0.27, Recall=0.21, F1=0.23
geo-loc: Precision=0.44, Recall=0.24, F1=0.31
product: Precision=0.12, Recall=0.05, F1=0.07
facility: Precision=0.14, Recall=0.05, F1=0.08
person: Precision=0.37, Recall=0.15, F1=0.21
sportsteam: Precision=0.60, Recall=0.04, F1=0.08
mus