In [None]:
import ast
import random
import re
import warnings

import pandas as pd
import spacy
from spacy.scorer import Scorer
from spacy.training.example import Example
from spacy.util import filter_spans

from ner_service import run_comparison

In [20]:
# Load the Gemini predictions (without their 'entities' column) and the
# gold reference frame read from 'human_train.csv'.
df_gemini_2_flash = pd.read_csv('ner_train_gemini.csv').drop(columns=['entities'])
df_golden = pd.read_csv('human_train.csv')

# Preview the first two rows of each frame.
display(df_gemini_2_flash.head(2), df_golden.head(2))

Unnamed: 0.1,Unnamed: 0,id,sentence_text,ner_tags_str,entities_pred,llm_time,llm_prompt_tokens,llm_completion_tokens
0,0,7567,Egyptian government newspapers have criticised...,B-MISC O O O O B-LOC O O B-MISC O O O O O O O ...,"[{'entity': 'Egyptian government', 'label': 'O...",0.84497,203,62
1,1,624,Coritiba 1 Atletico Mineiro 0,B-ORG O B-ORG I-ORG O,"[{'entity': 'Coritiba', 'label': 'ORG'}, {'ent...",0.696006,171,66


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,sentence_text,ner_tags_str,entities,llm_time,llm_prompt_tokens,llm_completion_tokens
0,0,0,7567,Egyptian government newspapers have criticised...,B-MISC O O O O B-LOC O O B-MISC O O O O O O O ...,"[{'entity': 'Egyptian', 'label': 'MISC'}, {'en...",0.84497,203,62
1,1,1,624,Coritiba 1 Atletico Mineiro 0,B-ORG O B-ORG I-ORG O,"[{'entity': 'Coritiba', 'label': 'ORG'}, {'ent...",0.696006,171,66


In [21]:
# Per-token prices for each model, split by token kind.
# NOTE(review): presumably USD quoted per one million tokens (the result
# column is named avg_cost_usd) — confirm against run_comparison.
price_dict = {
    "gemini": dict(prompt=0.1 / 1_000_000, completion=0.4 / 1_000_000),
}

# Prediction frames to compare against the gold frame, keyed by model name.
dfs = dict(gemini=df_gemini_2_flash)

results_df = run_comparison(dfs, df_golden, price_dict)
display(results_df)

Unnamed: 0,model,f1,precision,recall,avg_time_sec,avg_cost_usd,count,final_score
0,gemini,0.884162,0.892988,0.887122,0.778604,4.4e-05,1300,0.884162


# Prepare data: convert entity annotations to spaCy `Example` objects

In [None]:
def convert_to_spacy_format(raw_data, nlp_blank):
    """Convert ``(text, annotations)`` pairs into spaCy ``Example`` objects for NER.

    Every whole-word occurrence of each annotated entity string is located in
    the text and labelled; overlapping candidates are resolved with
    ``filter_spans``.

    Args:
        raw_data: Iterable of ``(text, annotations)`` pairs. ``annotations`` is
            a list of dicts with ``"entity"`` and ``"label"`` keys, or the
            string form of such a list (what a list column becomes after a
            round trip through CSV).
        nlp_blank: spaCy pipeline; only ``make_doc`` is called on it.

    Returns:
        A list with one ``Example`` per input pair, in input order.
    """
    training_data = []

    for text, annotations in raw_data:
        # A list column read back from CSV arrives as its repr string; iterating
        # that string would yield single characters instead of entity dicts.
        if isinstance(annotations, str):
            annotations = ast.literal_eval(annotations)

        doc = nlp_blank.make_doc(text)
        spans = []

        for item in annotations:
            entity_text = item["entity"]
            label = item["label"]

            if not entity_text:
                # An empty entity would compile to a pattern that matches at
                # every non-word boundary and can never yield a usable span.
                continue

            # Match whole-word occurrences only: the entity must not be directly
            # preceded or followed by a word character.
            pattern = re.compile(fr'(?<!\w){re.escape(entity_text)}(?!\w)')

            for match in pattern.finditer(text):
                start, end = match.span()
                span = doc.char_span(start, end, label=label, alignment_mode="strict")

                if span is None:
                    # Character offsets do not line up with token boundaries.
                    print(f"no match: {entity_text} in '{text}'")
                else:
                    spans.append(span)

        # filter_spans drops duplicates/overlaps so the offsets are a valid entity set.
        gold_entities = [
            (span.start_char, span.end_char, span.label_)
            for span in filter_spans(spans)
        ]

        # Use a fresh, unannotated doc as the predicted side so the gold
        # entities live only on the reference doc built by from_dict.
        example = Example.from_dict(nlp_blank.make_doc(text), {"entities": gold_entities})
        training_data.append(example)

    return training_data

# Train pipeline: fine-tune the NER component of a pretrained spaCy model

In [None]:
def fine_tune_english_model(training_examples, base_model="en_core_web_lg", iterations=15):
    """Fine-tune the NER component of a pretrained spaCy pipeline.

    Args:
        training_examples: List of spaCy ``Example`` objects whose reference
            docs carry the gold entities. The list itself is not modified.
        base_model: Name or path passed to ``spacy.load``.
        iterations: Number of passes over ``training_examples``.

    Returns:
        The updated pipeline, or ``None`` if ``base_model`` could not be loaded.
    """
    print(f"Loading base model: {base_model}...")
    try:
        nlp = spacy.load(base_model)
    except OSError:
        print(f"Model not found. Please run: python -m spacy download {base_model}")
        return None

    # NOTE(review): a freshly added "ner" pipe is presumably not initialized by
    # resume_training() below — confirm every base model used here ships one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every gold label so the component can predict it.
    for example in training_examples:
        for ent in example.reference.ents:
            ner.add_label(ent.label_)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    print(f"Starting fine-tuning on {len(training_examples)} examples")

    # Shuffle a private copy so the caller's list keeps its original order.
    shuffled_examples = list(training_examples)

    # select_pipes is the spaCy v3 replacement for the deprecated disable_pipes.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.resume_training()

        for itn in range(iterations):
            random.shuffle(shuffled_examples)
            losses = {}

            for example in shuffled_examples:
                nlp.update(
                    [example],
                    drop=0.3,
                    sgd=optimizer,
                    losses=losses,
                )

            if (itn + 1) % 5 == 0:
                # With no examples, losses stays empty; report 0 instead of raising KeyError.
                print(f"Epoch {itn + 1}, Loss: {losses.get('ner', 0.0):.4f}")

    return nlp


# Evaluation and end-to-end train/test run

In [None]:
def evaluate_model_to_df(nlp, examples):
    """Score the pipeline's entity predictions against gold examples.

    Args:
        nlp: Pipeline called on each example's reference text to produce the
            predicted doc.
        examples: Iterable of ``Example`` objects; only ``example.reference``
            is used.

    Returns:
        ``pd.DataFrame`` with columns Label / Precision / Recall / F1-Score,
        rounded to 3 decimals. The first row is ``"GLOBAL"``; one row per
        entity label reported by the scorer follows.
    """
    scorer = Scorer()

    # Pair a fresh prediction with each gold doc.
    examples_for_scoring = [
        Example(nlp(example.reference.text), example.reference)
        for example in examples
    ]

    scores = scorer.score(examples_for_scoring)

    metrics_data = [{
        "Label": "GLOBAL",
        "Precision": scores["ents_p"],
        "Recall": scores["ents_r"],
        "F1-Score": scores["ents_f"],
    }]

    # The per-type entry may be None (not an empty dict) when nothing was
    # scored, e.g. for an empty example list; treat that as "no per-label rows".
    per_type_scores = scores.get("ents_per_type") or {}
    for label, metrics in per_type_scores.items():
        metrics_data.append({
            "Label": label,
            "Precision": metrics["p"],
            "Recall": metrics["r"],
            "F1-Score": metrics["f"],
        })

    return pd.DataFrame(metrics_data).round(3)

In [None]:
def train_test(model: str, raw_data_train, raw_data_test,
               base_model: str = "en_core_web_lg", iterations: int = 15):
    """Fine-tune NER on the training pairs, print test metrics, and save the model.

    Args:
        model: Tag for this run; used in the log line and in the output
            directory name ``./my_eng_ner_{model}``.
        raw_data_train: ``(text, annotations)`` pairs passed to
            ``convert_to_spacy_format`` for training.
        raw_data_test: ``(text, annotations)`` pairs passed to
            ``convert_to_spacy_format`` for evaluation.
        base_model: spaCy model to load and fine-tune.
        iterations: Number of training passes forwarded to
            ``fine_tune_english_model``.

    Returns:
        None. Metrics are printed; the fine-tuned pipeline is written to disk.
    """
    print(f'start train in {model}')

    # Only the load is guarded by the "not found" message, so that unrelated
    # OSErrors (e.g. a failed save) are not misreported as a missing model.
    try:
        nlp_base = spacy.load(base_model)
    except OSError:
        print(f"Error: '{base_model}' not found")
        return

    train_data = convert_to_spacy_format(raw_data_train, nlp_base)
    test_data = convert_to_spacy_format(raw_data_test, nlp_base)

    nlp_finetuned = fine_tune_english_model(train_data, base_model=base_model, iterations=iterations)
    if nlp_finetuned is None:
        # fine_tune_english_model already printed why the model is unavailable.
        return

    print("\nMetrics")
    df_metrics = evaluate_model_to_df(nlp_finetuned, test_data)
    print(df_metrics)

    output_dir = f"./my_eng_ner_{model}"
    try:
        nlp_finetuned.to_disk(output_dir)
    except OSError as err:
        print(f"Error: could not save model to '{output_dir}': {err}")

In [None]:
# NOTE(review): the original cell called pd.read_csv() with no path, which
# raises TypeError. 'human_train.csv' is the gold file loaded earlier in this
# notebook; the test path is a placeholder — TODO confirm both before running.
TRAIN_CSV = 'human_train.csv'
TEST_CSV = 'human_test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)


def _to_spacy_pairs(frame):
    """Pair each sentence with its entity list, parsing CSV-stringified lists."""
    entities = frame['entities'].map(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
    return list(zip(frame['sentence_text'].values, entities.values))


train_spacy = _to_spacy_pairs(train)
# Build the evaluation pairs from the test frame (the original reused `train`,
# which would have evaluated the model on its own training data).
test_spacy = _to_spacy_pairs(test)

# Results

## human
- 15 iterations

|index|Label|Precision|Recall|F1-Score|
|---|---|---|---|---|
|0|GLOBAL|0\.853|0\.861|0\.857|
|1|ORG|0\.821|0\.797|0\.809|
|2|MISC|0\.756|0\.798|0\.776|
|3|PER|0\.899|0\.895|0\.897|
|4|LOC|0\.888|0\.916|0\.902|
|5|DATE|0\.0|0\.0|0\.0|

## gemini 
- 15 iterations

|index|Label|Precision|Recall|F1-Score|
|---|---|---|---|---|
|0|GLOBAL|0\.750|0\.744|0\.747|
|1|ORG|0\.674|0\.480|0\.561|
|2|MISC|0\.665|0\.577|0\.618|
|3|PER|0\.843|0\.913|0\.876|
|4|LOC|0\.742|0\.903|0\.815|

- 20 iterations

|index|Label|Precision|Recall|F1-Score|
|---|---|---|---|---|
|0|GLOBAL|0\.767|0\.732|0\.749|
|1|ORG|0\.676|0\.51|0\.581|
|2|MISC|0\.823|0\.488|0\.613|
|3|PER|0\.895|0\.873|0\.884|
|4|LOC|0\.712|0\.915|0\.801|