In [9]:
# %% [markdown]
# # Extracción de Entidades Nombradas con CRF
# ## 1. Configuración Inicial

# %%
import re
import nltk
import pandas as pd
from nltk.corpus import conll2002
from nltk.tag import CRFTagger
from sklearn.metrics import classification_report
from typing import List, Dict, Tuple
from sklearn.metrics import balanced_accuracy_score

nltk.download('conll2002')
nltk.download('averaged_perceptron_tagger')

with open("locations.txt", encoding="utf-8") as f:
    locations_set = set(line.strip().lower() for line in f if line.strip())

with open("person_names.txt", encoding="utf-8") as f:
    person_names_set = set(line.strip().lower() for line in f if line.strip())
# %% [markdown]
# ## 2. Clase para Manejo de Datos

# %%
class NERDataProcessor:
    def __init__(self, language: str = "spanish"):
        self.language = language
        
    def load_data(self):
        if self.language == "spanish":
            return (
                conll2002.iob_sents('esp.train'),
                conll2002.iob_sents('esp.testa'),
                conll2002.iob_sents('esp.testb')
            )
        return (
            conll2002.iob_sents('ned.train'),
            conll2002.iob_sents('ned.testa'),
            conll2002.iob_sents('ned.testb')
        )
    
    def convert_to_features(self, data):
        return [[(word, pos) for word, pos, _ in sent] for sent in data]

    def get_labels(self, data):
        return [[tag for _, _, tag in sent] for sent in data]

# %% [markdown]
# ## 3. Generador de Características

# %%
class CRFFeatureGenerator:
    def __init__(self, feature_config: Dict):
        self.config = feature_config
        self.lemmatizer = nltk.WordNetLemmatizer()  # Usar un lematizador adecuado
        
    def get_features(self, tokens: List[Tuple[str, str]], index: int) -> List[str]:
        word, pos = tokens[index]
        features = ["bias"]
        # --- Nuevas features avanzadas ---
        # Características básicas de la palabra
        if self.config.get("word_form", True):
            features.append(f"word={word}")
            features.append(f"word.lower={word.lower()}")

        if self.config.get("pos", True):
            features.append(f"pos={pos}")
            features.append(f"lemma={self.lemmatizer.lemmatize(word.lower())}")  # Usar un lematizador adecuado

        # Características morfológicas
        if self.config.get("morphology", True):
            features.append(f"is_title={word.istitle()}")
            features.append(f"is_upper={word.isupper()}")
            features.append(f"is_digit={word.isdigit()}")
            features.append(f"has_digit={any(c.isdigit() for c in word)}")
            features.append(f"has_symbol={not word.isalnum()}")

        # Prefijos y sufijos
        if self.config.get("prefix_suffix", True):
            features.append(f"prefix3={word[:3]}")
            features.append(f"suffix3={word[-3:]}")
            features.append(f"prefix2={word[:2]}")
            features.append(f"suffix2={word[-2:]}")

        # Longitud de la palabra
        if self.config.get("length", True):
            features.append(f"length={len(word)}")

        # Posición en la oración
        if self.config.get("position", True):
            features.append(f"position={index}")
            features.append(f"is_first={index == 0}")
            features.append(f"is_last={index == len(tokens)-1}")

        # Contexto circundante
        if self.config.get("context", True):
            if index > 0:
                prev_word, prev_pos = tokens[index-1]
                features.append(f"prev_word.lower={prev_word.lower()}")
                features.append(f"prev_word.istitle={prev_word.istitle()}")
                features.append(f"prev_word.isdigit={prev_word.isdigit()}")
                features.append(f"prev_pos={prev_pos}")
            if index < len(tokens)-1:
                next_word, next_pos = tokens[index+1]
                features.append(f"next_word.lower={next_word.lower()}")
                features.append(f"next_word.istitle={next_word.istitle()}")
                features.append(f"next_word.isdigit={next_word.isdigit()}")
                features.append(f"next_pos={next_pos}")

        # Gazetteer features usando locations_set y person_names_set
        if self.config.get("gazetteers", True):
            features.append(f"in_location_gazetteer={word.lower() in locations_set}")
            features.append(f"in_person_gazetteer={word.lower() in person_names_set}")

        return features

# %% [markdown]
# ## 4. Modelo CRF y Entrenamiento

# %%
class CRFModel:
    def __init__(self, feature_generator: CRFFeatureGenerator):
        self.ct = CRFTagger(feature_func=feature_generator.get_features)
        
    def train(self, train_sents, train_labels):
        formatted_data = self._format_data(train_sents, train_labels)
        self.ct.train(formatted_data, 'model.crf')
        
    def predict(self, test_sents):
        tagged_sents = self.ct.tag_sents(test_sents)
        # Extract only the tags from the (word, tag) tuples
        return [[tag for _, tag in sent] for sent in tagged_sents]
    
    def _format_data(self, sents, labels):
        return [list(zip(sent, label)) for sent, label in zip(sents, labels)]

# %% [markdown]
# ## 5. Evaluación y Experimentación

# %%
# %% [markdown]
# ## 5. Funciones de Evaluación

# %%
def sent_tags_to_IO(sent_tags):
    return [[tag.replace("B-", "I-") for tag in sent] for sent in sent_tags]

def entity_finder(sent_tags):
    entities = []
    for sent in sent_tags:
        sent_entities = []  # Initialize a list for the current sentence
        entities.append(sent_entities)  # Add it to the main entities list
        
        current_entity = None
        start_idx = None
        entity_type = None
        
        for i, tag in enumerate(sent):
            if tag.startswith("I-"):
                if current_entity is None:  # Nueva entidad
                    current_entity = tag[2:]
                    start_idx = i
                    entity_type = tag[2:]
                elif tag[2:] != entity_type:  # Cambio de tipo
                    if current_entity:
                        sent_entities.append((entity_type, (start_idx, i-1)))
                    current_entity = tag[2:]
                    start_idx = i
                    entity_type = tag[2:]
            else:
                if current_entity is not None:  # Finalizar entidad
                    sent_entities.append((entity_type, (start_idx, i-1)))
                    current_entity = None
                    start_idx = None
                    entity_type = None
        
        if current_entity is not None:  # Entidad al final de la frase
            sent_entities.append((entity_type, (start_idx, len(sent)-1)))
    
    return entities

def evaluate_model(y_true, y_pred, errors=False):
    info = {}
    # Convertir a formato IO
    y_true_io = sent_tags_to_IO(y_true)
    y_pred_io = sent_tags_to_IO(y_pred)

    # Balanced accuracy usando solo el primer carácter de la etiqueta
    def join_sent_tags(sent_tags):
        return [tag[0] for sent in sent_tags for tag in sent]

    info['Balanced accuracy'] = balanced_accuracy_score(join_sent_tags(y_true_io), join_sent_tags(y_pred_io))

    # Encontrar entidades
    true_entities = entity_finder(y_true_io)
    pred_entities = entity_finder(y_pred_io)

    # Contar entidades reales y correctas
    counts = {'LOC': 0, 'MISC': 0, 'ORG': 0, 'PER': 0}
    correct_counts = {'LOC': 0, 'MISC': 0, 'ORG': 0, 'PER': 0}
    invented = 0

    for i, sent in enumerate(true_entities):
        sent_true = set(sent)
        sent_pred = set(pred_entities[i])
        for ent in sent:
            counts[ent[0]] += 1
        for ent in sent_pred & sent_true:
            correct_counts[ent[0]] += 1
        invented += len(sent_pred - sent_true)

    # Métricas por tipo
    for ent_type in counts:
        total = counts[ent_type]
        correct = correct_counts[ent_type]
        info[f'{ent_type} correct'] = correct / total if total > 0 else 0.0

    total_entities = sum(counts.values())
    true_positives = sum(correct_counts.values())
    false_positives = invented
    false_negatives = total_entities - true_positives
    
    # Precision, Recall y F1
    info['Precision'] = true_positives / (true_positives + false_positives) if (true_positives + false_positives) else 0
    info['Recall'] = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) else 0
    if info['Precision'] + info['Recall'] > 0:
        info['F1 Score'] = 2 * (info['Precision'] * info['Recall']) / (info['Precision'] + info['Recall'])
    else:
        info['F1 Score'] = 0

    if errors:
        error_list = []
        for i in range(len(true_entities)):
            error_list.extend([
                (i, ent) for ent in pred_entities[i] if ent not in true_entities[i]
            ])
        return info, error_list

    return info

# %% [markdown]
# ## 6. Modificación del Pipeline de Experimentación

# %%
def run_experiment(config: Dict, language: str = "spanish"):
    processor = NERDataProcessor(language)
    train, dev, test = processor.load_data()
    
    # Convertir datos
    X_train = processor.convert_to_features(train)
    y_train = processor.get_labels(train)
    X_dev = processor.convert_to_features(dev)
    y_dev = processor.get_labels(dev)
    
    # Configurar modelo
    feature_gen = CRFFeatureGenerator(config)
    model = CRFModel(feature_gen)
    
    # Entrenar y predecir
    model.train(X_train, y_train)
    y_pred = model.predict(X_dev)
    
    return evaluate_model(y_dev, y_pred)

# %% [markdown]
# ## 7. Uso del Sistema

# %%
# Configuración de ejemplo
config = {
    "word_form": True,
    "pos": True,
    "morphology": True,
    "prefix_suffix": True,
    "length": True,
    "position": True,
    "context": True,
    "gazetteers": True,
}

config = {
    "word_form": True,
    "pos": False,
    "morphology": False,
    "prefix_suffix": False,
    "length": False,
    "position": False,
    "context": False,
    "gazetteers": False,
}

# Ejecutar experimento
results = run_experiment(config)
print("Resultados de la evaluación:")
print(pd.DataFrame([results]).T)

# %% [markdown]
# ## 7. Mejoras Adicionales
# - Añadir más características contextuales
# - Optimizar hiperparámetros del CRF
# - Incorporar más gazetteers
# - Implementar validación cruzada

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\11ser\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\11ser\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Resultados de la evaluación:
                          0
Balanced accuracy  0.762550
LOC correct        0.587873
MISC correct       0.250000
ORG correct        0.392540
PER correct        0.277504
Precision          0.749110
Recall             0.389454
F1 Score           0.512477


In [30]:
import pandas as pd
from itertools import product

# Define a small grid of parameters to search
param_grid = {
    "lenght": [True, False],
    "context": [True, False],
    "pos": [True, False],
    "prefix_suffix": [True, False],
}

# Keep these features always on for baseline performance
fixed_params = {
    "morphology": True,
    "word_form": True,
    "gazetteers": True,
    "position": True
}

# Generate all combinations of parameters in the grid
configs = []
results_list = []

# Create parameter combinations
keys = list(param_grid.keys())
for values in product(*[param_grid[key] for key in keys]):
    config = fixed_params.copy()
    config.update(dict(zip(keys, values)))
    print(f"Running configuration: {config}")
    # Ensure a fresh CRFTagger instance for each run to avoid pickling issues
    results = run_experiment(config)
    
    # Store configuration and results
    config_results = {**config}
    config_results.update(results)
    results_list.append(config_results)

# Create DataFrame with results
results_df = pd.DataFrame(results_list)

# Sort by balanced accuracy
results_df = results_df.sort_values("Balanced accuracy", ascending=False)

# Display results
print("\nGrid Search Results:")
display(results_df)

# Optionally save results to CSV
# results_df.to_csv("grid_search_results.csv", index=False)

Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': True, 'pos': True, 'prefix_suffix': True}
Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': True, 'pos': True, 'prefix_suffix': False}
Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': True, 'pos': False, 'prefix_suffix': True}
Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': True, 'pos': False, 'prefix_suffix': False}
Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': False, 'pos': True, 'prefix_suffix': True}
Running configuration: {'morphology': True, 'word_form': True, 'gazetteers': True, 'position': True, 'lenght': True, 'context': False, 'pos': Tru

Unnamed: 0,morphology,word_form,gazetteers,position,lenght,context,pos,prefix_suffix,Balanced accuracy,Total entities,Entities correct,LOC correct,MISC correct,ORG correct,PER correct,Entities invented
2,True,True,True,True,True,True,False,True,0.787264,4324,0.724098,0.8037,0.418919,0.705743,0.797209,1031
10,True,True,True,True,False,True,False,True,0.787264,4324,0.724098,0.8037,0.418919,0.705743,0.797209,1031
0,True,True,True,True,True,True,True,True,0.787187,4324,0.725717,0.793422,0.416667,0.709888,0.80624,1020
8,True,True,True,True,False,True,True,True,0.787187,4324,0.725717,0.793422,0.416667,0.709888,0.80624,1020
1,True,True,True,True,True,True,True,False,0.775607,4324,0.712072,0.793422,0.382883,0.701007,0.78243,1080
9,True,True,True,True,False,True,True,False,0.775607,4324,0.712072,0.793422,0.382883,0.701007,0.78243,1080
3,True,True,True,True,True,True,False,False,0.769754,4324,0.702128,0.788284,0.376126,0.693902,0.763547,1108
11,True,True,True,True,False,True,False,False,0.769754,4324,0.702128,0.788284,0.376126,0.693902,0.763547,1108
4,True,True,True,True,True,False,True,True,0.768182,4324,0.708372,0.791367,0.434685,0.685613,0.773399,1078
12,True,True,True,True,False,False,True,True,0.768182,4324,0.708372,0.791367,0.434685,0.685613,0.773399,1078


In [31]:
results_df.to_csv("resultados_grid_search_tot30min.csv", index=False)