## Loading the data
Si hay multiwords, se debe saltar una linea y coger las dos siguientes. Ejemplo:

19-20	don't	_	_	_	_	_	_	_	_

19	do	do	AUX	VBP	Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin	21	aux	21:aux	_

20	n't	not	PART	RB	Polarity=Neg	21	advmod	21:advmod	_

In [2]:
def load_conllu_data(filepath):
    """
    Carga y procesa un archivo CoNLL-U, extrayendo las oraciones y sus etiquetas UPOS.
    """
    sentences = []
    tags = []
    current_sentence = []
    current_tags = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # 1. Ignorar comentarios y líneas vacías que no sean separadores de oración
            if line.startswith('#'):
                continue
            
            # 2. Línea en blanco: indica el final de una oración
            elif line == '':
                if current_sentence:
                    sentences.append(current_sentence)
                    tags.append(current_tags)
                    current_sentence = []
                    current_tags = []
            
            # 3. Procesar línea de palabra
            else:
                fields = line.split('\t')
                
                # Ignorar tokens multiword (ID con guion, e.g., '1-2') o nodos vacíos (ID con punto, e.g., '1.1') 
                if '-' in fields[0] or '.' in fields[0]:
                    continue

                # Extraer la palabra (FORM, índice 1) y la etiqueta PoS (UPOS, índice 3)
                word = fields[1]
                pos_tag = fields[3]
                
                current_sentence.append(word)
                current_tags.append(pos_tag)

    # Asegurarse de añadir la última oración si el archivo no termina en línea vacía
    if current_sentence:
        sentences.append(current_sentence)
        tags.append(current_tags)

    return sentences, tags

In [3]:
filepath = "./en_ewt-ud-train.conllu"

sentences = []
tags = []
current_sentence = []
current_tags = []

with open (filepath, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Ignorar comentarios y lineas vacias
        if line.startswith('#'):
            continue

        # Si hay una linea en blanco indica el final de una oracion
        elif line == '':
            if current_sentence:
                sentences.append(current_sentence)
                tags.append(current_tags)
                current_sentences = []
                current_tags = []
                
        # Procesar línea de palabra
        else:
            fields = line.split('\t')

            # Ignorar tokens multiword
            if '-' in fields[0] or '.' in fields[0]:
                    continue
                
            # Extraer la palabra (FORM, índice 1) y la etiqueta PoS (UPOS, índice 3)
            word = fields[1]
            pos_tag = fields[3]
                
            current_sentence.append(word)
            current_tags.append(pos_tag)


    # Asegurarse de añadir la última oración si el archivo no termina en línea vacía
    if current_sentence:
        sentences.append(current_sentence)
        tags.append(current_tags)

    
# Ejemplo de uso (asumiendo que los archivos están en la misma carpeta):
train_sents, train_tags = load_conllu_data('en_ewt-ud-train.conllu')
dev_sents, dev_tags = load_conllu_data('en_ewt-ud-dev.conllu')
test_sents, test_tags = load_conllu_data('en_ewt-ud-test.conllu')

print(test_sents[5], test_tags[5])

['Google', 'is', 'a', 'nice', 'search', 'engine', '.'] ['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'NOUN', 'PUNCT']


## 2. Text Vectorization: Creating the Dictionaries

The first step in preparing the data for the LSTM model is to convert our text-based sentences and tags into numerical sequences. Neural networks can only process numbers, so we need a consistent way to map each word and each tag to a unique integer ID.

For this task, we'll use Keras's modern `TextVectorization` layer. We will create two separate instances of this layer: one for the input words (`word_vectorizer`) and one for the output tags (`tag_vectorizer`).

The process involves two main stages:
1.  **Configuration**: We initialize the `TextVectorization` layer with `output_mode='int'` to ensure it produces sequences of integer IDs (e.g., "Google is nice" -> `[2, 3, 42]`). We also set `output_sequence_length=128` to enforce that all sequences are padded or truncated to a fixed length, which is a requirement for the model.
2.  **Adaptation**: We then call the `.adapt()` method on our training data. This step builds the internal vocabulary for each vectorizer. It analyzes all the words (or tags) in the training set and assigns a unique integer to each one. This ensures our "dictionaries" are based only on the data the model is allowed to learn from.


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Max lend of words for a sentence
MAX_LEN = 128 

# Create the TextVectorization layer.
word_vectorizer = TextVectorization(
    output_mode='int',
    output_sequence_length=MAX_LEN
)

#Flatten the training sentences.
train_sents_flat = [' '.join(sentence) for sentence in train_sents]

# Flatten the training tags for the adaptation step.
train_tags_flat = [' '.join(tag_list) for tag_list in train_tags]

# Adapt the vectorizer to the training data.
# This builds the internal vocabulary (the word-to-integer dictionary).
word_vectorizer.adapt(train_sents_flat) 




# --- Let's test it with an example ---
# Create an example sentence containing an unknown word ("jojoto").
example_sentence = ["Google", "is", "a", "jojoto", "engine"]
print("example:", example_sentence)

# running example
example_vec = word_vectorizer([" ".join(example_sentence)])
print("\nexample vec:")
print(example_vec.numpy())

example: ['Google', 'is', 'a', 'jojoto', 'engine']

example vec:
[[2475    9    5    1 1862    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


In [14]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

def create_and_adapt_vectorizer(sentences, max_len=128):

    vectorizer = TextVectorization(
        output_mode='int',
        output_sequence_length=max_len
    )

    sentences_flat = [' '.join(sentence) for sentence in sentences]
    
    vectorizer.adapt(sentences_flat)
    
    vocab_size = len(vectorizer.get_vocabulary())
    
    print(f"Adaptation complete. Vocabulary size: {vocab_size}")
    
    return vectorizer, vocab_size

# How to use the function ---


word_vectorizer, WORD_VOCAB_SIZE = create_and_adapt_vectorizer(train_sents)

tags_vectorizer, TAGS_VOCAB_SIZE = create_and_adapt_vectorizer(train_tags)

print(f"\nWe have successfully created a vectorizer with a vocabulary of {WORD_VOCAB_SIZE} words.")

print(f"\nWe have successfully created a vectorizer with a vocabulary of {TAGS_VOCAB_SIZE} tags.")

print("Vectorizing all data sets...")

# --- 1. Flatten the data from list of lists to list of strings ---
# The vectorizer layers expect a flat list of strings as input.
train_sents_flat = [' '.join(sentence) for sentence in train_sents]
train_tags_flat = [' '.join(tag_list) for tag_list in train_tags]

dev_sents_flat = [' '.join(sentence) for sentence in dev_sents]
dev_tags_flat = [' '.join(tag_list) for tag_list in dev_tags]

test_sents_flat = [' '.join(sentence) for sentence in test_sents]
test_tags_flat = [' '.join(tag_list) for tag_list in test_tags]


# --- 2. Use the vectorizers to transform the flattened data ---
# Now we call the vectorizers with the correct input format.
X_train = word_vectorizer(train_sents_flat)
y_train = tags_vectorizer(train_tags_flat)

X_dev = word_vectorizer(dev_sents_flat)
y_dev = tags_vectorizer(dev_tags_flat)

X_test = word_vectorizer(test_sents_flat)
y_test = tags_vectorizer(test_tags_flat)

print("Vectorization complete!")
print("\nShape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_dev:", X_dev.shape)
print("Shape of y_dev:", y_dev.shape)


print(X_train[0])
print(y_dev[0])

Adaptation complete. Vocabulary size: 16250
Adaptation complete. Vocabulary size: 19

We have successfully created a vectorizer with a vocabulary of 16250 words.

We have successfully created a vectorizer with a vocabulary of 19 tags.
Vectorizing all data sets...
Vectorization complete!

Shape of X_train: (12544, 128)
Shape of y_train: (12544, 128)
Shape of X_dev: (2001, 128)
Shape of y_dev: (2001, 128)
tf.Tensor(
[  169  4754   258   637  1119  4039 15673   169 15421     2  6883    30
     2  7141     8     2   436     6  6811   721     2  2132  1607     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     

## Esto es el modeo

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model

# --- Hiperparámetros (igual que antes) ---
# (Asegúrate de tener estas variables definidas de pasos anteriores)
# WORD_VOCAB_SIZE, TAG_VOCAB_SIZE, MAX_LEN
EMBEDDING_DIM = 128
LSTM_UNITS = 128

# --- Construcción del Modelo con la API Funcional ---

# 1. Definir la capa de Entrada 
# Le decimos al modelo que recibirá secuencias de números enteros de longitud MAX_LEN.
inputs = Input(shape=(MAX_LEN,), name='word_ids_input')

# 2. Conectar las capas en una "carrera de relevos" 🔗
# La capa Embedding recibe los 'inputs' y su salida se guarda en 'x'.
x = Embedding(
    input_dim=WORD_VOCAB_SIZE, 
    output_dim=EMBEDDING_DIM, 
    mask_zero=True, # Importante para que ignore el padding
    name='word_embedding'
)(inputs)

# La capa LSTM recibe la salida del Embedding ('x') y su propia salida se guarda de nuevo en 'x'.
x = LSTM(
    units=LSTM_UNITS, 
    return_sequences=True, # Necesitamos una salida para cada palabra
    name='lstm_layer'
)(x)

# La capa TimeDistributed(Dense) recibe la salida de la LSTM ('x') y su salida es la final.
# La llamamos 'outputs' para que quede claro que es el final del camino.
outputs = TimeDistributed(
    Dense(units=TAGS_VOCAB_SIZE, activation='softmax'), 
    name='pos_tag_output'
)(x)

# 3. Crear el Modelo final
# Le decimos a Keras dónde empieza el modelo (inputs) y dónde termina (outputs).
model = Model(inputs=inputs, outputs=outputs, name='pos_tagger_model')

# ¡Listo! Ahora puedes imprimir el resumen y ver la arquitectura.
model.summary()

In [16]:
import tensorflow as tf
from tensorflow.keras.metrics import sparse_categorical_accuracy

def custom_masked_accuracy(y_true, y_pred):
    # 1. Crea una máscara que es 0.0 para padding (ID 0) y 1.0 para todo lo demás
    # tf.not_equal(y_true, 0) crea un tensor de Boleanos (True/False)
    # tf.cast lo convierte a 1.0 y 0.0
    sample_weight = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)

    # 2. Calcula la precisión para TODOS los tokens (incluyendo padding)
    accuracy_all_tokens = sparse_categorical_accuracy(y_true, y_pred)
    
    # 3. Aplica la máscara: multiplica la precisión de cada token por 1.0 o 0.0
    weighted_accuracy = accuracy_all_tokens * sample_weight
    
    # 4. Calcula la media solo sobre los tokens reales
    # Suma las precisiones (solo los 1.0) y divide por el número de tokens reales (la suma de la máscara)
    # Añadimos epsilon para evitar una división por cero si un batch estuviera vacío
    epsilon = 1e-7
    return tf.reduce_sum(weighted_accuracy) / (tf.reduce_sum(sample_weight) + epsilon)


# Compilar el modelo
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy', # Esto ya estaba bien
    metrics=[custom_masked_accuracy]       # <-- ¡USA LA MÉTRICA PERSONALIZADA!
)

In [17]:
# Parámetros de entrenamiento
EPOCHS = 5 # Empezamos con pocas para probar rápido
BATCH_SIZE = 128

# Entrenar el modelo
history = model.fit(
    X_train, 
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_dev, y_dev)
)

Epoch 1/5


2025-10-22 13:56:52.976709: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 355ms/step - custom_masked_accuracy: 0.2968 - loss: 2.2786 - val_custom_masked_accuracy: 0.4304 - val_loss: 1.7450
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 348ms/step - custom_masked_accuracy: 0.4566 - loss: 1.5883 - val_custom_masked_accuracy: 0.5349 - val_loss: 1.3559
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 341ms/step - custom_masked_accuracy: 0.5242 - loss: 1.2993 - val_custom_masked_accuracy: 0.5569 - val_loss: 1.2289
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 344ms/step - custom_masked_accuracy: 0.5658 - loss: 1.1286 - val_custom_masked_accuracy: 0.5580 - val_loss: 1.1983
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 348ms/step - custom_masked_accuracy: 0.5966 - loss: 1.0035 - val_custom_masked_accuracy: 0.5543 - val_loss: 1.2071


In [19]:
import numpy as np

# Asumimos que tu 'model', 'word_vectorizer', y 'tags_vectorizer' 
# ya están definidos y entrenados en las celdas anteriores.

# 1. Obtenemos el vocabulario de etiquetas (la lista de strings)
# El ID 0 es '', el ID 1 es '[UNK]', así que el vocabulario real empieza en el índice 2
tags_vocab = tags_vectorizer.get_vocabulary()

def predict_tags(sentence_string):
    """
    Toma una frase (string), la procesa con el modelo 
    y muestra las predicciones palabra por palabra.
    """
    
    print(f"--- Prediciendo para: ---\n{sentence_string}\n")
    
    # 0. Separamos las palabras para saber cuántas son
    words = sentence_string.split(' ')
    
    # 1. Convertir la frase (string) en un tensor de IDs (shape [1, 128])
    # El vectorizador espera una lista de strings
    input_tensor = word_vectorizer([sentence_string])
    
    # 2. Obtener las predicciones del modelo
    # El modelo devuelve probabilidades (shape [1, 128, 20])
    predictions = model.predict(input_tensor)
    
    # 3. Encontrar el ID de la etiqueta con mayor probabilidad para cada token
    # Usamos np.argmax para obtener los IDs ganadores (shape [1, 128])
    predicted_ids = np.argmax(predictions, axis=-1)[0] # [0] para coger el primer (y único) batch
    
    # 4. Mostrar los resultados
    print("--- Resultados: ---")
    for i in range(len(words)):
        word = words[i]
        tag_id = predicted_ids[i]
        tag_name = tags_vocab[tag_id]
        
        print(f"{word:<15} -> {tag_name}")

In [20]:
# Prueba 1: La frase de ejemplo de la práctica
predict_tags("Google is a nice search engine .")

# Prueba 2: Una frase nueva
predict_tags("I am writing this new sentence for the test .")

# Prueba 3: Otra más
predict_tags("The university is in ntiago .")

--- Prediciendo para: ---
Google is a nice search engine .

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 945ms/step
--- Resultados: ---
Google          -> propn
is              -> aux
a               -> det
nice            -> adj
search          -> noun
engine          -> noun
.               -> noun
--- Prediciendo para: ---
I am writing this new sentence for the test .

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
--- Resultados: ---
I               -> pron
am              -> aux
writing         -> verb
this            -> det
new             -> adj
sentence        -> noun
for             -> adp
the             -> det
test            -> noun
.               -> noun
--- Prediciendo para: ---
The university is in ntiago .

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
--- Resultados: ---
The             -> det
university      -> propn
is              -> aux
in              -> adp
ntiago          -> propn
.            