In [None]:
import random

import spacy
from spacy.tokens import DocBin
from spacy.cli import train
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

# Asegurarse de que tenemos las dependencias necesarias
# !pip install -U spacy
# !python -m spacy download es_core_news_md


In [None]:
def convert_bio_to_spacy(bio_data):
    """
    Convert BIO-tagged sentences into spaCy-style (text, annotations) pairs.

    Args:
        bio_data: iterable of sentences; each sentence is a sequence of
            (word, tag) pairs where tag is "B-<label>", "I-<label>", or any
            other value (treated as outside an entity).

    Returns:
        A list with one (text, {'entities': [...]}) tuple per sentence.
        `text` is the words joined by single spaces, and each entity is a
        dict with character offsets 'start' / 'end' (end exclusive) into
        `text` plus its 'label'.
    """
    training_data = []

    for sentence in bio_data:
        text = ' '.join(word for word, _ in sentence)
        entities = []

        current_entity = None
        start_char = 0

        for word, tag in sentence:
            end_char = start_char + len(word)
            label = tag[2:]

            continues_entity = (
                tag.startswith('I-')
                and current_entity is not None
                and current_entity['label'] == label
            )

            if continues_entity:
                # Same entity keeps growing to cover this word.
                current_entity['end'] = end_char
            else:
                # Anything else ends the open entity, if there is one.
                if current_entity is not None:
                    entities.append(current_entity)
                    current_entity = None

                # A "B-" tag opens a new entity. An "I-" tag that could not
                # continue the previous one (no open entity, or a different
                # label) is treated as a begin as well, so the token is not
                # lost and the previous span cannot be stretched across it.
                if tag.startswith(('B-', 'I-')):
                    current_entity = {'start': start_char, 'end': end_char, 'label': label}

            start_char = end_char + 1  # +1 for the joining space

        if current_entity is not None:  # flush the entity that ends the sentence
            entities.append(current_entity)

        training_data.append((text, {'entities': entities}))

    return training_data


In [None]:
# Convert the training and test data to the spaCy format.
# NOTE(review): train_data and test_data are not defined in the cells visible
# here — presumably an earlier cell loads them; confirm before Run All.
spacy_train_data = convert_bio_to_spacy(train_data)
spacy_test_data = convert_bio_to_spacy(test_data)

# Create a new blank spaCy pipeline
nlp = spacy.blank("en")

# Add the NER component to the pipeline
ner = nlp.add_pipe("ner")

# Register every entity label found in the training data with the NER component
for _, annotations in spacy_train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent["label"])

# Training configuration: number of passes over the data and minibatch size
n_iter = 30
batch_size = 50


In [None]:
# Train the model
# Only the NER component stays enabled inside this block; every other
# pipeline component is disabled until the block exits.
with nlp.select_pipes(enable=["ner"]):
    # initialize() prepares the pipeline for training and returns the optimizer.
    optimizer = nlp.initialize()

    # Show progress with tqdm
    for itn in tqdm(range(n_iter), desc="Training epochs"):
        random.shuffle(spacy_train_data)
        losses = {}

        # Batch the examples
        for batch in minibatch(spacy_train_data, size=batch_size):
            # spaCy v3's nlp.update() takes a batch of Example objects rather
            # than separate text/annotation sequences, and Example.from_dict()
            # expects entities as (start, end, label) tuples, not the dicts
            # produced by convert_bio_to_spacy.
            examples = []
            for text, annotations in batch:
                spans = [
                    (ent["start"], ent["end"], ent["label"])
                    for ent in annotations["entities"]
                ]
                examples.append(
                    Example.from_dict(nlp.make_doc(text), {"entities": spans})
                )
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        print(f"Losses at iteration {itn}: {losses}")

# Save the model
nlp.to_disk("restaurant_ner_model")


In [None]:
# Evaluate the model on the test set
def evaluate_ner(nlp, test_data):
    """
    Compute exact-match entity precision of `nlp` over `test_data`.

    A predicted entity counts as correct only when its start offset, end
    offset and label all equal those of a gold entity for the same text.

    Args:
        nlp: callable that maps a text to an object whose `.ents` items
            expose `start_char`, `end_char` and `label_`.
        test_data: iterable of (text, annotations) pairs, where
            annotations['entities'] is a list of dicts with 'start', 'end'
            and 'label' keys.

    Returns:
        correct predictions / total predictions, or 0 when the model
        predicted no entities at all.
    """
    correct = 0
    total = 0

    for text, annotations in test_data:
        doc = nlp(text)
        # Set of gold spans for constant-time exact-match lookups.
        gold_entities = {
            (gold['start'], gold['end'], gold['label'])
            for gold in annotations['entities']
        }

        # Precision is measured over what the model predicted; dividing by
        # the number of gold entities instead would yield recall.
        for ent in doc.ents:
            total += 1
            if (ent.start_char, ent.end_char, ent.label_) in gold_entities:
                correct += 1

    precision = correct / total if total > 0 else 0
    return precision

# Load the trained model back from disk and evaluate it on the test data
trained_nlp = spacy.load("restaurant_ner_model")
precision = evaluate_ner(trained_nlp, spacy_test_data)
print(f"Precisión del modelo en el conjunto de prueba: {precision:.2%}")


In [None]:
# Ejemplo de uso del modelo entrenado
def test_model_on_text(nlp, text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Try the model on a few example sentences
test_sentences = [
    "The Thai restaurant on Main Street has great pad thai",
    "I had dinner at Le Bernardin last night and the service was excellent",
    "The prices at Sushi Express are very reasonable"
]

# Print each sentence followed by the (text, label) pairs the model found in it
print("Ejemplos de predicciones del modelo:")
for sentence in test_sentences:
    entities = test_model_on_text(trained_nlp, sentence)
    print(f"\nTexto: {sentence}")
    print("Entidades encontradas:")
    for text, label in entities:
        print(f"- {text}: {label}")


In [None]:
# Convert the training and test data to the spaCy format.
# NOTE(review): this cell repeats the earlier setup cell; running it rebinds
# `nlp` to a fresh blank pipeline and rebuilds both converted datasets.
spacy_train_data = convert_bio_to_spacy(train_data)
spacy_test_data = convert_bio_to_spacy(test_data)

# Create a new blank spaCy pipeline
nlp = spacy.blank("en")

# Add the NER component to the pipeline
ner = nlp.add_pipe("ner")

# Register every entity label found in the training data with the NER component
for _, annotations in spacy_train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent["label"])

# Deshabilitar otros componentes durante el entrenamiento
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
