In [1]:
import nltk
nltk.download('conll2002')
from nltk.corpus import conll2002
train_es = conll2002.iob_sents('esp.train') # Train
dev_es = conll2002.iob_sents('esp.testa') # Dev
test_es =conll2002.iob_sents('esp.testb') # Test

train_ned = conll2002.iob_sents('ned.train') # Train
dev_ned = conll2002.iob_sents('ned.testa') # Dev
test_ned =conll2002.iob_sents('ned.testb') # Test

data = {'spanish': (train_es, dev_es, test_es),
        'dutch': (train_ned, dev_ned, test_ned)}

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\11ser\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [2]:
train = conll2002.tagged_sents('esp.train')
test = conll2002.tagged_sents('esp.testb')
print(len(train))

8323


In [10]:
def gazetters():
    """
    Define a set of gazetteers for named entity recognition.
    These gazetteers include common locations, person names, and organizations.
    """

    # List of locations, person names, and organizations
    locations_list = {
        # Ciudades principales (España y Países Bajos)
        "madrid", "barcelona", "valencia", "sevilla", "zaragoza", "bilbao", "málaga", "murcia", "palma",
        "amsterdam", "rotterdam", "utrecht", "eindhoven", "groningen", "nijmegen",

        # Regiones y provincias
        "andalucía", "cataluña", "galicia", "castilla", "canarias", "baleares",
        "noord-holland", "zuid-holland", "gelderland", "limburg",

        # Países (hispanohablantes + relevantes cercanos)
        "españa", "méxico", "argentina", "colombia", "perú", "chile", "ecuador",
        "nederland", "belgië", "duitsland", "francia", "italia", "portugal", "reino unido", "estados unidos",

        # Geografía física
        "mediterráneo", "atlántico", "pirineos", "cantábrico", "guadalquivir", "ebro",
        "noordzee", "rijn", "maas", "veluwe"
    }

    person_names_list = {
        # Nombres masculinos comunes
        "juan", "josé", "david", "javier", "miguel", "pedro", "sergio", "pablo", "antonio",
        "jan", "peter", "willem", "tim", "thomas", "jeroen", "lucas", "kees",

        # Nombres femeninos comunes
        "maría", "ana", "isabel", "laura", "marta", "lucía", "paula", "cristina",
        "maria", "johanna", "emma", "sophie", "lisa", "lotte", "iris", "eva",

        # Apellidos comunes
        "garcía", "gonzález", "rodríguez", "fernández", "lópez", "sánchez", "martínez",
        "de jong", "jansen", "de vries", "van den berg", "bakker", "visser", "meijer"
    }

    organizations_list = {
        # Empresas destacadas
        "telefónica", "bbva", "iberdrola", "mercadona", "inditex", "seat",
        "philips", "shell", "unilever", "heineken", "ing", "asml",

        # Instituciones gubernamentales
        "gobierno", "ministerio", "ayuntamiento", "generalitat", "policía nacional", "guardia civil",
        "regering", "ministerie", "gemeente", "belastingdienst", "politie", "koninklijke marechaussee",

        # Educación y cultura
        "universidad", "museo del prado", "csic", "colegio", "hospital", "asociación", "fundación",
        "universiteit", "hogeschool", "tno", "knaw", "rijksmuseum", "omroep", "kvk"
    }

    return locations_list, person_names_list, organizations_list

In [11]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemma(word):
    return lemmatizer.lemmatize(word)

def pos_tag(word):
    tagged = nltk.pos_tag([word])
    return tagged[0][1]

def generate_features(feature_selection):
    def feature_function(sentence, index):
        features = {"bias": 1.0}  # Bias term for the feature vector
        word = sentence[index][0]  # Always get the first element (word)
        
        # Características básicas de la palabra
        if feature_selection.get("word_form", True):
            features.update({
                "word": word,
                "word.lower": word.lower(),
            })

        # Características morfológicas
        if feature_selection.get("morphology", True):
            features.update({
                "word.istitle": word.istitle(),
                "word.isupper": word.isupper(),
                "word.islower": word.islower(),
                "word.isdigit": word.isdigit(),
                "has_digit": any(c.isdigit() for c in word),
                "capitals_inside": any(c.isupper() for c in word[1:]),
                "has_symbol": not word.isalnum()
            })
        
        # POS tagging y lemas
        if feature_selection.get("lemma_pos_tags", True):
            features["pos_tag"] = pos_tag(word)
            features["lemma"] = lemma(word)
        
        # Prefijos y sufijos
        if feature_selection.get("prefix_suffix", True):
            features.update({
                "prefix3": word[:3],
                "suffix3": word[-3:],
                "prefix2": word[:2],
                "suffix2": word[-2:],
            })
        
        # Longitud de la palabra
        if feature_selection.get("length", True):
            features["length"] = len(word)
        
        # Posición en la oración
        if feature_selection.get("position", True):
            features.update({
                "position": index,
                "is_first": index == 0,
                "is_last": index == len(sentence)-1
            })
        
        # Contexto circundante
        if feature_selection.get("context", True):
            if index > 0:
                prev_word = sentence[index-1][0]
                features.update({
                    "prev_word.lower": prev_word.lower(),
                    "prev_word.istitle": prev_word.istitle(),
                    "prev_word.isupper": prev_word.isupper(),
                    "prev_word.isdigit": prev_word.isdigit(),
                    "prev_word.istitle": prev_word.istitle()})
                
                if feature_selection.get("lemma_pos_tags", True):
                    features["prev_lemma"] = lemma(prev_word)
                    features["prev_pos_tag"] = pos_tag(prev_word)

            if index < len(sentence)-1:
                next_word = sentence[index+1][0]
                features.update({
                    "next_word.lower": next_word.lower(),
                    "next_word.istitle": next_word.istitle(),
                    "next_word.isupper": next_word.isupper(),
                    "next_word.isdigit": next_word.isdigit(),
                    "next_word.istitle": next_word.istitle()})
                
                if feature_selection.get("lemma_pos_tags", True):
                    features["next_lemma"] = lemma(next_word)
                    features["next_pos_tag"] = pos_tag(next_word)
        
        # Patrones de fecha y hora
        if feature_selection.get("date_time_patterns", True):
            date_pattern = r"\d{1,2}[/-]\d{1,2}([/-]\d{2,4})?"
            time_pattern = r"\d{1,2}:\d{2}(:\d{2})?"
            features["has_date_pattern"] = bool(re.search(date_pattern, word))
            features["has_time_pattern"] = bool(re.search(time_pattern, word))

        # Caracteres especiales y símbolos
        if feature_selection.get("symbol_patterns", True):
            features.update({
            "has_hyphen": "-" in word,
            "has_dot": "." in word,
            "has_comma": "," in word,
            "has_slash": "/" in word,
            "has_percent": "%" in word,
            "has_currency": any(c in word for c in "$€£¥"),
            "has_at": "@" in word
            })

                # En generate_features, añadir:
        if feature_selection.get("gazetteers", True):
            locations_list, person_names_list, organizations_list = gazetters()
            features.update({
                "in_locations": word.lower() in locations_list,
                "in_person_names": word.lower() in person_names_list,
                "in_organizations": word.lower() in organizations_list
            })
        
        return features
    
    return feature_function

In [12]:
feature_selection = {
    "word_form": True,           # Basic word characteristics
    "prefix_suffix": True,       # Prefixes and suffixes
    "morphology": True,          # Morphological features
    "context": True,             # Surrounding words
    "lemma_pos_tags": True,      # POS tagging and lemmas
    "date_time_patterns": True,  # Date and time patterns
    "symbol_patterns": True,     # Special characters
    "length": True,              # Word length features
    "position": True,            # Position features
    "gazetteers": True           # Gazetteer features
}

ct = nltk.tag.CRFTagger(feature_func=generate_features(feature_selection))
ct.train(train[:100], "nooooooolapolitziiaa.mdl")
ct.accuracy(test)

0.6670482991481187

In [None]:
import nltk
from nltk.corpus import conll2002
from nltk.tag import CRFTagger
from typing import List, Tuple, Optional, Callable
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('conll2002')

class CRFModel:
    # Clase proporcionada por el usuario (como en el enunciado)
    def __init__(self, train, test, model_file: Optional[str] = None, features: Optional[Callable] = None, encoding: str = 'bio') -> None:
        self.model = CRFTagger(feature_func=features)
        self.model_file = model_file
        self.encoding = encoding
        self.train_data = self.transformar_dades(train)
        self.test_data = self.transformar_dades(test)

    def transformar_dades(self, data):
        if self.encoding == 'io':
            #data_encoded = [self.to_io(sent) for sent in data] 
            data_encoded = self.to_io(data)
        elif self.encoding == 'bioes':
            #data_encoded = [self.to_bioes(sent) for sent in data]
            data_encoded = self.to_bioes(data)
        elif self.encoding == 'biow':
            #data_encoded = [self.to_biow(sent) for sent in data]
            data_encoded = self.to_biow(data)
        else:
            data_encoded = data
        return data_encoded

    def train(self, train_data: List[List[Tuple[str, str]]]) -> None:
        self.model.train(train_data, self.model_file)

    def predict(self, sents: List[List[Tuple[str, str]]]) -> List[List[Tuple[str, str]]]:
        if self.model_file:
            self.model.set_model_file(self.model_file)
        words_test = [[word for (word, _) in sent] for sent in sents]
        return self.model.tag_sents(words_test)
    def to_biow(self, tagged_sents):
   
        biow_tagged_sents = []
        
        for sent in tagged_sents:
            biow_sent = []
            i = 0
            while i < len(sent):
                word = sent[i][0]  # Get the word
                tag = sent[i][-1]  # Get the BIO tag
                
                # If it's a B- tag, check if it's a single token entity
                if tag.startswith('B-'):
                    entity_type = tag[2:]  # Get entity type (PER, LOC, etc.)
                    
                    # Check if next token continues this entity
                    if i + 1 < len(sent) and sent[i+1][-1] == f'I-{entity_type}':
                        # Multi-token entity, keep as B-
                        biow_sent.append((word, tag))
                    else:
                        # Single token entity, convert to W-
                        biow_sent.append((word, f'W-{entity_type}'))
                else:
                    # Keep I- and O tags as they are
                    biow_sent.append((word, tag))
                    
                i += 1
                
            biow_tagged_sents.append(biow_sent)
        
        return biow_tagged_sents
    
    def to_io(self,tagged_sents):
        io_tagged_sents = []
        
        for sent in tagged_sents:
            io_sent = []
            for token in sent:
                word = token[0]  # Get the word
                tag = token[-1]  # Get the BIO tag
                
                # Convert B- to I- (Beginning to Inside)
                if tag.startswith('B-'):
                    tag = 'I-' + tag[2:]
                    
                io_sent.append((word, tag))
            io_tagged_sents.append(io_sent)
        
        return io_tagged_sents
    def to_bioes(self,tagged_sents):
        bioes_tagged_sents = []
        
        for sent in tagged_sents:
            bioes_sent = []
            i = 0
            while i < len(sent):
                word = sent[i][0]  # Get the word
                tag = sent[i][-1]  # Get the BIO tag
                
                # Handle outside tags (O)
                if tag == 'O':
                    bioes_sent.append((word, tag))
                    i += 1
                    continue
                
                # Handle entity tags (B-X or I-X)
                if tag.startswith('B-'):
                    entity_type = tag[2:]  # Get entity type (PER, LOC, etc.)
                    
                    # Find the end of this entity
                    end_idx = i + 1
                    while (end_idx < len(sent) and 
                        sent[end_idx][-1] == f'I-{entity_type}'):
                        end_idx += 1
                    
                    # Single-token entity (B-X not followed by I-X)
                    if end_idx == i + 1:
                        bioes_sent.append((word, f'S-{entity_type}'))
                    else:
                        # Add B- tag for beginning
                        bioes_sent.append((word, tag))
                        
                        # Add I- tags for middle tokens (if any)
                        for j in range(i+1, end_idx-1):
                            word_j = sent[j][0]
                            bioes_sent.append((word_j, f'I-{entity_type}'))
                        
                        # Add E- tag for the end token
                        word_end = sent[end_idx-1][0]
                        bioes_sent.append((word_end, f'E-{entity_type}'))
                    
                    i = end_idx
                    
                elif tag.startswith('I-'):
                    # This is an error case - I- tag without preceding B-
                    # We'll treat it as B- for robustness
                    entity_type = tag[2:]
                    
                    # Find the end of this entity
                    end_idx = i + 1
                    while (end_idx < len(sent) and 
                        sent[end_idx][-1] == f'I-{entity_type}'):
                        end_idx += 1
                    
                    # Single-token entity (just this I-X)
                    if end_idx == i + 1:
                        bioes_sent.append((word, f'S-{entity_type}'))
                    else:
                        # Add B- tag for beginning (correcting the I- error)
                        bioes_sent.append((word, f'B-{entity_type}'))
                        
                        # Add I- tags for middle tokens (if any)
                        for j in range(i+1, end_idx-1):
                            word_j = sent[j][0]
                            bioes_sent.append((word_j, f'I-{entity_type}'))
                        
                        # Add E- tag for the end token
                        word_end = sent[end_idx-1][0]
                        bioes_sent.append((word_end, f'E-{entity_type}'))
                    
                    i = end_idx
                else:
                    # Unknown tag type, pass through unchanged
                    bioes_sent.append((word, tag))
                    i += 1
                    
            bioes_tagged_sents.append(bioes_sent)
        
        return bioes_tagged_sents
    # Resto de métodos (to_bioes, to_io, extraccio_entitats, evaluacio_entitats, matriu_confusio) como en el enunciado

class NERCRF:
    def __init__(self, language: str = 'esp', encoding: str = 'bio', feature_func: Optional[Callable] = None, model_file: Optional[str] = None):
        self.language = language
        self.encoding = encoding
        self.feature_func = feature_func
        self.model_file = model_file

        # Cargar datos
        self.train_raw = self._load_conll_data(language +'.train')
        self.dev_raw = self._load_conll_data(language +'.testa')
        self.test_raw = self._load_conll_data(language +'.testb')

        # Inicializar modelo CRF
        self.crf_model = CRFModel(
            train=self.train_raw,
            test=self.test_raw,
            model_file=self.model_file,
            features=self.feature_func,
            encoding=self.encoding
        )

    def _load_conll_data(self, split: str) -> List[List[Tuple[str, str]]]:
        sents = conll2002.iob_sents(f'{self.language}.{split}')
        return [[(word, tag) for (word, _, tag) in sent] for sent in sents]
    
    

    def train_model(self):
        self.crf_model.train(self.crf_model.train_data)

    def evaluate(self, split: str = 'dev') -> Tuple[float, float, float]:
        if split == 'dev':
            data = self.dev_raw
        elif split == 'test':
            data = self.test_raw
        else:
            raise ValueError("split debe ser 'dev' o 'test'")

        # Transformar datos al encoding correcto
        data_encoded = self.crf_model.transformar_dades(data)

        # Predecir etiquetas
        predicted = self.crf_model.predict(data_encoded)

        # Extraer entidades
        y_true = []
        for sent in data_encoded:
            entities = self.crf_model.extraccio_entitats([sent])
            y_true.extend(entities)

        y_pred = []
        for sent in predicted:
            entities = self.crf_model.extraccio_entitats([sent])
            y_pred.extend(entities)

        # Evaluar
        recall, precision, f1 = self.crf_model.evaluacio_entitats(y_true, y_pred)
        return recall, precision, f1

    def predict_sentence(self, sentence: List[str]) -> List[Tuple[str, str]]:
        formatted_sent = [[(word, 'O') for word in sentence]]
        predicted = self.crf_model.predict(formatted_sent)
        return predicted[0]



In [None]:
# Ejemplo de uso
if __name__ == "__main__":
    # Función de características de ejemplo
    def custom_features(sentence, index):
        word = sentence[index][0]
        features = {
            'word': word,
            'word.lower()': word.lower(),
            'prefix3': word[:3],
            'suffix3': word[-3:],
            'prev_word': sentence[index-1][0] if index > 0 else '<START>',
            'next_word': sentence[index+1][0] if index < len(sentence)-1 else '<END>',
            'is_capitalized': word[0].isupper(),
        }
        return features

    # Entrenar modelo para español con BIOES
    spanish_model = NERCRF(
        language='esp',
        encoding='bioes',
        feature_func=custom_features,
        model_file='esp_model.crf'
    )
    spanish_model.train_model()
    recall, precision, f1 = spanish_model.evaluate('dev')
    print(f"Resultados para español (BIOES):\nRecall: {recall:.2f}, Precision: {precision:.2f}, F1: {f1:.2f}")

    # Entrenar modelo para neerlandés con IO
    dutch_model = NERCRF(
        language='ned',
        encoding='io',
        feature_func=custom_features,
        model_file='ned_model.crf'
    )
    dutch_model.train_model()
    recall, precision, f1 = dutch_model.evaluate('test')
    print(f"Resultados para neerlandés (IO):\nRecall: {recall:.2f}, Precision: {precision:.2f}, F1: {f1:.2f}")

In [2]:
%pip install pyconll

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting pyconll
  Using cached pyconll-3.2.0-py3-none-any.whl.metadata (8.0 kB)
Using cached pyconll-3.2.0-py3-none-any.whl (27 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.2.0



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\sambr\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Extra: Cadec Corpus

In [None]:
import pyconll
import os

# Adjust working with CADEC-specific data
def load_cadec_data(filepath):
    """Load CADEC data and extract entity types (ADR, Di, Dr, S, F)"""
    sentences = []
    
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            raw_text = f.read()
        
        # Split into sentences (assuming one sentence per line or blank line separation)
        raw_sents = [s.strip() for s in raw_text.split('\n\n') if s.strip()]
        
        for sent in raw_sents:
            tokens = []
            for line in sent.split('\n'):
                if line.strip():
                    # Assuming CoNLL format: token pos chunk tag
                    parts = line.split()
                    if len(parts) >= 4:  # Ensure we have enough columns
                        word = parts[0]
                        tag = parts[3]  # Entity tag (O, B-ADR, I-ADR, etc.)
                        
                        # Strip the identifier, keep only entity type
                        if tag != 'O' and '-' in tag:
                            prefix, entity_type = tag.split('-', 1)
                            # Keep only the entity type (ADR, Di, Dr, S, F), remove identifiers
                            if '+' in entity_type:
                                entity_type = entity_type.split('+')[0]
                            tag = f"{prefix}-{entity_type}"
                        
                        tokens.append((word, tag))
            
            if tokens:
                sentences.append(tokens)
    
    except Exception as e:
        print(f"Error loading CADEC data: {e}")
        return []
    
    return sentences




Loaded 5719 training sentences
Loaded 1878 test sentences

Sample sentence:
pain --> O
in --> O
my --> O
left --> O
leg --> O
and --> O
most --> O
of --> O
my --> O
joints --> O
. --> O


In [1]:
train_data = load_cadec_data('./cadec/train.conll')
test_data = load_cadec_data('./cadec/test.conll')

NameError: name 'load_cadec_data' is not defined