# Pràctica 3

In [4]:
import nltk
nltk.download('conll2002')
from nltk.corpus import conll2002
train_es = conll2002.iob_sents('esp.train') # Train
dev_es = conll2002.iob_sents('esp.testa') # Dev
test_es =conll2002.iob_sents('esp.testb') # Test

train_ned = conll2002.iob_sents('ned.train') # Train
dev_ned = conll2002.iob_sents('ned.testa') # Dev
test_ned =conll2002.iob_sents('ned.testb') # Test

data = {'spanish': (train_es, dev_es, test_es),
        'dutch': (train_ned, dev_ned, test_ned)}

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\11ser\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [5]:
train = conll2002.tagged_sents('esp.train')
test = conll2002.tagged_sents('esp.testb')
print(len(train))

8323


In [3]:
# Initialize the CRFTagger
model = nltk.tag.CRFTagger()


In [4]:
train_sample = train[0:500] #temps d'entrenament còmicament gran amb totes les dades: pillo un sample més petit
print(len(train_sample))

500


In [5]:
model.train(train, 'crfTaggerEs.mdl')


In [6]:
model.accuracy(test)

0.9549997089243786

In [None]:
def generar_features(tokens, idx, features_selection):
    """
    Genera una llista de característiques per a un token específic basat en les característiques seleccionades.

    features_selection és un diccionari que indica quines característiques incloure (True o False).

    Retorna una llista de característiques per al token donat.
    """
    caracteristiques = []
    caracteristiques.append('bias')

    token_actual = tokens[idx]

    # ------ FORMA DE LA PARAULA ------
    if features_selection.get('word_form', True):
        caracteristiques.extend([
            f'paraula={token_actual}',
            f'token_minuscula={token_actual.lower()}'
        ])

    # ------ PREFIXOS I SUFIXOS ------
    if features_selection.get('prefix_suffix', True):
        caracteristiques.extend([
            f'prefix-1={token_actual[0]}',
            f'prefix-2={token_actual[:2]}',
            f'prefix-3={token_actual[:3]}',
            f'sufix-1={token_actual[-1]}',
            f'sufix-2={token_actual[-2:]}',
            f'sufix-3={token_actual[-3:]}'
        ])

    # ------ MORFOLOGIA ------
    if features_selection.get('morphology', True):
        caracteristiques.extend([
            f'està_capitalitzat={token_actual[0].isupper()}',
            f'es_majuscules={token_actual.isupper()}',
            f'es_minuscules={token_actual.islower()}',
            f'té_guion={"-" in token_actual}',
            f'es_numeric={token_actual.isdigit()}',
            f'capitals_internes={token_actual[1:].lower() != token_actual[1:]}'
        ])

    # ------ LONGITUD ------
    if features_selection.get('length', True):
        caracteristiques.append(f'longitud={len(token_actual)}')

    # ------ POSICIÓ ------
    if features_selection.get('position', True):
        caracteristiques.extend([
            f'es_primer={idx == 0}',
            f'es_ultim={idx == len(tokens) - 1}'
        ])

    # ------ CONTEXT ANTERIOR ------
    if idx > 0 and features_selection.get('context', True):
        token_anterior = tokens[idx - 1]
        caracteristiques.extend([
            f'token_anterior={token_anterior.lower()}',
            f'anterior_capitalitzat={token_anterior[0].isupper()}',
            f'anterior_majuscules={token_anterior.isupper()}',
            f'anterior_minuscules={token_anterior.islower()}'
        ])

    # ------ CONTEXT POSTERIOR ------
    if idx < len(tokens) - 1 and features_selection.get('context', True):
        token_seguent = tokens[idx + 1]
        caracteristiques.extend([
            f'token_seguent={token_seguent.lower()}',
            f'seguent_capitalitzat={token_seguent[0].isupper()}',
            f'seguent_majuscules={token_seguent.isupper()}',
            f'seguent_minuscules={token_seguent.islower()}'
        ])

    return caracteristiques


In [None]:
feature_types = {
        'word_form': True,
        'lemma_pos_tags': False,
        'prefix_suffix': True,
        'morphology': True,
        'length': True,
        'position': True,
        'context': True
    }

In [17]:
def generate_features(feature_selection):
    def feature_function(sentence, index):
        features = {}
        word = sentence[index][0]  # Always get the first element (word)
        pos = sentence[index][1] if len(sentence[index]) > 1 else None  # Get POS if available
        
        # Características básicas de la palabra
        if feature_selection.get("word_form", True):
            features.update({
                "word": word,
                "word.lower": word.lower(),
                "word.istitle": word.istitle(),
                "word.isupper": word.isupper(),
                "word.isdigit": word.isdigit(),
            })
        
        # POS tagging y lemas
        if feature_selection.get("lemma_pos_tags", True) and pos:
            features["pos"] = pos
        
        # Prefijos y sufijos
        if feature_selection.get("prefix_suffix", True):
            features.update({
                "prefix3": word[:3],
                "suffix3": word[-3:],
                "prefix2": word[:2],
                "suffix2": word[-2:],
            })
        
        # Características morfológicas
        if feature_selection.get("morphology", True):
            features.update({
                "hyphen": "-" in word,
                "has_digit": any(c.isdigit() for c in word),
                "shape": "".join([
                    "X" if c.isupper() else 
                    "x" if c.islower() else 
                    "d" if c.isdigit() else c 
                    for c in word
                ])
            })
        
        # Longitud de la palabra
        if feature_selection.get("length", True):
            features["length"] = len(word)
        
        # Posición en la oración
        if feature_selection.get("position", True):
            features.update({
                "position": index,
                "is_first": index == 0,
                "is_last": index == len(sentence)-1
            })
        
        # Contexto circundante
        if feature_selection.get("context", True):
            if index > 0:
                prev_word = sentence[index-1][0]
                features.update({
                    "prev_word": prev_word,
                    "prev_word.lower": prev_word.lower(),
                    "prev_word.istitle": prev_word.istitle(),
                })
            if index < len(sentence)-1:
                next_word = sentence[index+1][0]
                features.update({
                    "next_word": next_word,
                    "next_word.lower": next_word.lower(),
                    "next_word.istitle": next_word.istitle(),
                })
        
        # Características adicionales
        features.update({
            "bias": 1.0,  # Término de sesgo
            "word.isalnum": word.isalnum(),
            "capital_inside": word[1:].lower() != word[1:] if len(word) > 1 else False
        })
        
        return features
    
    return feature_function

In [18]:
feature_selection = {
    "word_form": True,
    "prefix_suffix": True,
    "morphology": True,
    "context": True,
    # ... otros parámetros
}

ct = nltk.tag.CRFTagger(feature_func=generate_features(feature_selection))
ct.train(train[:100], "nooooooolapolitziiaa.mdl")

In [19]:
ct.accuracy(test)

0.7088467583878292

In [None]:
import re
def generate_features(feature_selection):
    def feature_function(sentence, index):
        features = {}
        word = sentence[index]
        symbols = {'@', '#', '$', '%', '&', '*', '-', '/', ':', '€', '£', '¥'}
        
        # ========================
        # 1. PREFIJOS Y SUFIJOS
        # ========================
        if feature_selection['prefix_suffix']:
            # Prefijos estándar
            for n in [2, 3]:
                if len(word) >= n:
                    features[f'prefix{n}'] = word[:n]
                    features[f'suffix{n}'] = word[-n:]
            
            # Suffijos largos (4+ caracteres)
            if feature_selection['suffix_long'] and len(word) >= 4:
                features.update({
                    'prefix4': word[:4],
                    'suffix4': word[-4:],
                    'suffix5': word[-5:] if len(word) >=5 else None
                })

        # ========================
        # 2. LONGITUD DE PALABRA
        # ========================
        if feature_selection['length']:
            word_len = len(word)
            features.update({
                'length': word_len,
                'length_bin': 'short' if word_len <4 else 'medium' if 4<=word_len<=7 else 'long',
                'is_single_char': word_len == 1
            })

        # ========================
        # 3. PATRONES DE SÍMBOLOS
        # ========================
        if feature_selection['symbol_patterns']:
            symbol_features = {
                'contains_symbol': any(c in symbols for c in word),
                'symbol_count': sum(1 for c in word if c in symbols),
                'is_hashtag': word.startswith('#') and len(word) > 1,
                'is_mention': word.startswith('@') and len(word) > 1,
                'is_url': re.match(r'^https?://', word) is not None
            }
            features.update(symbol_features)

            # Detección de combinaciones especiales
            if '-' in word:
                features['hyphen_combinations'] = word.count('-')
            if '/' in word:
                features['slash_pattern'] = True
        
        # 1. Características léxicas básicas
        if feature_selection['word_form']:
            features.update({
                'word': word,
                'word.lower': word.lower(),
                'word.istitle': word.istitle(),
                'word.isupper': word.isupper(),
                'word.isdigit': word.isdigit(),
                'word.isalnum': word.isalnum(),
            })
        
        # 2. Morfología avanzada
        if feature_selection['morphology']:
            features.update({
                'hyphen': '-' in word,
                'dot': '.' in word,
                'apostrophe': "'" in word,
                'shape': ''.join([
                    'A' if c.isalpha() else 
                    'N' if c.isdigit() else 
                    c for c in word
                ])
            })
        
        # 3. Patrones de mayúsculas complejos
        if feature_selection['case_patterns']:
            features.update({
                'all_caps': word == word.upper(),
                'mixed_case': word != word.lower() and word != word.upper(),
                'capital_inside': word[1:] != word[1:].lower() if len(word) > 1 else False
            })
        
        # 4. Prefijos y sufijos extendidos
        if feature_selection['prefix_suffix']:
            for n in [2,3,4]:
                if len(word) >= n:
                    features[f'prefix{n}'] = word[:n]
                    features[f'suffix{n}'] = word[-n:]
        
        # 5. Detección de patrones temporales/númericos
        if feature_selection['date_time_patterns']:
            features['is_date'] = any(c in word for c in ['/', '-', ':'])
            features['has_year'] = len(word) == 4 and word.isdigit()
        
        # 6. Contexto extendido (ventana ±2)
        if feature_selection['extended_context']:
            for i in range(-2, 3):
                if i != 0 and 0 <= index+i < len(sentence):
                    ctx_word = sentence[index+i][0]
                    features[f'ctx_{i}'] = ctx_word.lower()
        
        # 7. Características posicionales avanzadas
        if feature_selection['sentence_position']:
            total_words = len(sentence)
            features.update({
                'position_quartile': int(4 * index / total_words),
                'is_first_10%': index < total_words * 0.1
            })
        
        # 8. Características idiomáticas (ejemplo para español)
        if feature_selection['language_specific']:
            features.update({
                'has_accent': any(c in 'áéíóúñ' for c in word.lower()),
                'common_ending': word[-2:] in ['os', 'as', 'es']  # Típico en español
            })
        
        return features
    
    return feature_function

In [26]:
feature_selection = {
    "word_form": True,
    "prefix_suffix": True,
    "morphology": True,
    "context": True,
    "extended_context": True,
    "language_specific": True,
    "sentence_position": True,
    "case_patterns": True,
    "date_time_patterns": True,
    "common_ending": True,
}

ct = nltk.tag.CRFTagger(feature_func=generate_features(feature_selection))
ct.train(train[:100], "nooooooolapolitziiaa.mdl")
ct.accuracy(test)

0.8240156792734753

In [None]:
best_features = {
    'word_form': True,
    'prefix_suffix': True,
    'case_patterns': True,
    'context': True,
    'morphology': True,
    'date_time_patterns': True,
    'numeric_patterns': True,
    'orthographic_features': True,
    'symbol_patterns': True,
    'length': True,
    'language_specific': True,  # Esencial para español/neerlandés
}

In [29]:
import re

def generate_features(feature_selection):
    def feature_function(sentence, index):
        features = {}
        print(sentence[index])
        word,pos = sentence[index]
        symbols = {'@', '#', '$', '%', '&', '*', '-', '/', ':', '€', '£', '¥'}
        total_words = len(sentence)
        
        # ========================
        # 1. COMMON ENDINGS (IDIOMÁTICOS)
        # ========================
        if feature_selection['language_specific']:
            # Español
            spanish_endings = {'ción', 'mente', 'dad', 'ero', 'era', 'ado', 'ada', 'ía'}
            features['common_ending_es'] = any(word.lower().endswith(e) for e in spanish_endings)
            
            # Neerlandés
            dutch_endings = {'heid', 'ing', 'teit', 'schap', 'lijk', 'baar', 'nis', 'tie'}
            features['common_ending_nl'] = any(word.lower().endswith(e) for e in dutch_endings)
        
        # ========================
        # 2. LEMMA Y POS TAGS
        # ========================
        if feature_selection['lemma_pos_tags']:
            features.update({
                'pos': pos,
                'pos_prefix': pos.split('|')[0] if '|' in pos else pos,  # Para tags compuestos
                'pos_coarse': pos[0] if len(pos) > 0 else 'U'  # Primera letra del tag
            })
        
        # ========================
        # 3. POSICIÓN EN LA FRASE
        # ========================
        if feature_selection['sentence_position']:
            position_ratio = index / total_words if total_words > 0 else 0
            features.update({
                'position_ratio': round(position_ratio, 2),
                'position_segment': int(position_ratio * 5),  # 0-4 (quintiles)
                'is_first_5%': position_ratio < 0.05,
                'is_last_5%': position_ratio > 0.95,
                'middle_50%': 0.25 <= position_ratio <= 0.75
            })
        # ========================
        # 1. PREFIJOS Y SUFIJOS
        # ========================
        if feature_selection['prefix_suffix']:
            # Prefijos estándar
            for n in [2, 3]:
                if len(word) >= n:
                    features[f'prefix{n}'] = word[:n]
                    features[f'suffix{n}'] = word[-n:]
            
            # Suffijos largos (4+ caracteres)
            if feature_selection['suffix_long'] and len(word) >= 4:
                features.update({
                    'prefix4': word[:4],
                    'suffix4': word[-4:],
                    'suffix5': word[-5:] if len(word) >=5 else None
                })

        # ========================
        # 2. LONGITUD DE PALABRA
        # ========================
        if feature_selection['length']:
            word_len = len(word)
            features.update({
                'length': word_len,
                'length_bin': 'short' if word_len <4 else 'medium' if 4<=word_len<=7 else 'long',
                'is_single_char': word_len == 1
            })

        # ========================
        # 3. PATRONES DE SÍMBOLOS
        # ========================
        if feature_selection['symbol_patterns']:
            symbol_features = {
                'contains_symbol': any(c in symbols for c in word),
                'symbol_count': sum(1 for c in word if c in symbols),
                'is_hashtag': word.startswith('#') and len(word) > 1,
                'is_mention': word.startswith('@') and len(word) > 1,
                'is_url': re.match(r'^https?://', word) is not None
            }
            features.update(symbol_features)

            # Detección de combinaciones especiales
            if '-' in word:
                features['hyphen_combinations'] = word.count('-')
            if '/' in word:
                features['slash_pattern'] = True
        
        # ====================
        # 1. FORMA DE PALABRA
        # ====================
        if feature_selection['word_form']:
            features.update({
                'word.lower': word.lower(),
                'word.istitle': word.istitle(),
                'word.isupper': word.isupper(),
                'word.isdigit': word.isdigit(),
                'word.isalnum': word.isalnum(),
                'word[:4]': word[:4] if len(word) >=4 else word,
                'word[-4:]': word[-4:] if len(word) >=4 else word,
            })
        
        # ========================
        # 2. MORFOLOGÍA Y MAYÚSCULAS
        # ========================
        if feature_selection['morphology']:
            features.update({
                'hyphen': '-' in word,
                'dot': '.' in word,
                'has_digit': any(c.isdigit() for c in word),
                'shape': ''.join([
                    'A' if c.isalpha() else 
                    'N' if c.isdigit() else 
                    'S' if c in {'-', '/', ':'} else 'O' 
                    for c in word
                ])
            })
        
        if feature_selection['case_patterns']:
            features.update({
                'all_caps': word == word.upper(),
                'mixed_case': word != word.lower() and word != word.upper(),
                'capital_inside': word[1:] != word[1:].lower() if len(word) >1 else False
            })
        
        # ====================
        # 3. PATRONES TEMPORALES/NUMÉRICOS
        # ====================
        if feature_selection['date_time_patterns']:
            features.update({
                'is_date': any(c in word for c in {'/', '-', ':'}) and any(c.isdigit() for c in word),
                'has_year': (len(word) == 4 and word.isdigit() and 1900 <= int(word) <= 2100)
            })
        
        if feature_selection['numeric_patterns']:
            features.update({
                'has_currency': any(c in word for c in {'€', '$', '£'}),
                'is_percentage': '%' in word,
                'is_numeric': any(c.isdigit() for c in word) and any(c in {',', '.', ':'} for c in word)
            })


                # 4. Prefijos y sufijos extendidos
        if feature_selection['prefix_suffix']:
            for n in [2,3,4]:
                if len(word) >= n:
                    features[f'prefix{n}'] = word[:n]
                    features[f'suffix{n}'] = word[-n:]
        
        # ====================
        # 4. CONTEXTO INMEDIATO
        # ====================
        if feature_selection['context']:
            context_window = []
            for i in [-2, -1, 1, 2]:  # Ventana ampliada
                if 0 <= index+i < len(sentence):
                    ctx_word = sentence[index+i][0]
                    context_window.append(ctx_word.lower())
            
            features.update({
                'prev_word': sentence[index-1][0].lower() if index >0 else '<START>',
                'next_word': sentence[index+1][0].lower() if index < len(sentence)-1 else '<END>',
                'context_bag': ' '.join(context_window)
            })
        
        # ====================
        # 5. CARACTERÍSTICAS IDIOMÁTICAS
        # ====================
        if feature_selection['language_specific']:
            # Para español
            features.update({
                'has_accent': any(c in 'áéíóúñ' for c in word.lower()),
                'common_ending_es': word[-2:] in {'os', 'as', 'es', 'ía'}
            })
            
            # Para neerlandés (ejemplo)
            if any(c in {'ij', 'ee', 'uu'} for c in word.lower()):
                features['dutch_diacritics'] = True
        
        # ====================
        # 6. ORTOGRAFÍA COMPLEJA
        # ====================
        if feature_selection['orthographic_features']:
            ortho_patterns = {
                'ALPHANUM': re.match(r'^(?=.*[A-Za-z])(?=.*\d)', word),
                'UPPER_DIGIT': re.match(r'^[A-Z0-9-]+$', word),
                'CAMEL_CASE': re.match(r'^[A-Z][a-z]+([A-Z][a-z]+)+$', word)
            }
            for pattern, match in ortho_patterns.items():
                if match:
                    features[f'ortho_{pattern}'] = True
        
        # ====================
        # 7. METADATOS POSICIONALES
        # ====================
        if feature_selection['position']:
            features.update({
                'position': index,
                'is_first_3': index < 3,
                'is_last_3': index > len(sentence)-4
            })
        
        return features
    
    return feature_function

In [30]:
feature_config = {
    'word_form': True,
    'prefix_suffix': True,
    'lemma_pos_tags': True,  # Solo si el corpus incluye POS tags
    'sentence_position': True,
    'language_specific': True,
    'case_patterns': True,
    'date_time_patterns': True,
    'numeric_patterns': True,
    'symbol_patterns': True,
    'length': True,
    'morphology': True,
    'context': True,
    'extended_context': True,
    'orthographic_features': True,
    'common_ending': True,
    'position': True,
}
ct = nltk.tag.CRFTagger(feature_func=generate_features(feature_config))
ct.train(train[:100], "nooooooolapolitziiaa.mdl")
ct.accuracy(test)

Melbourne


ValueError: too many values to unpack (expected 2)