In [1]:
# Example dataset: a handful of short Spanish sentences used as a toy corpus
# for the character-level vocabulary built from it.
corpus = [
    "hola mundo",
    "cómo estás",
    "el modelo autoregresivo predice tokens uno por uno",
    "los transformers usan atención",
    "la máscara causal evita ver el futuro",
]

In [2]:
# Build the character-level vocabulary: every distinct character that appears
# in the corpus, in sorted order, numbered from 1 (index 0 is kept for padding).
caracteres = sorted(set("".join(corpus)))
vocab = {caracter: indice for indice, caracter in enumerate(caracteres, start=1)}
# The padding token is added after the characters, so it is the last key
# in the dict even though it owns index 0.
vocab["<pad>"] = 0
# Reverse lookup (index -> symbol).
inv_vocab = {indice: simbolo for simbolo, indice in vocab.items()}

In [None]:
# Funciones para codificar y decodificar texto
def texto_a_indices(texto, vocabulario=None):
    """Encode a string as a list of integer indices, one per character.

    Args:
        texto: The string to encode.
        vocabulario: Optional character -> index mapping. When omitted, the
            notebook-level ``vocab`` is used, so existing single-argument
            calls keep working unchanged.

    Returns:
        A list with the index of each character of ``texto``, in order.
        An empty string yields an empty list.

    Raises:
        KeyError: If ``texto`` contains a character missing from the mapping.
    """
    if vocabulario is None:
        # Resolved at call time so the function follows the notebook's
        # current ``vocab`` rather than a snapshot taken at definition time.
        vocabulario = vocab
    return [vocabulario[c] for c in texto]

def indices_a_texto(indices, inv_vocabulario=None, pad_idx=0):
    """Decode a sequence of indices back into a string, dropping padding.

    Args:
        indices: Iterable of integer indices to decode.
        inv_vocabulario: Optional index -> symbol mapping. When omitted, the
            notebook-level ``inv_vocab`` is used, so existing single-argument
            calls keep working unchanged.
        pad_idx: Index treated as padding and skipped in the output
            (defaults to 0, the value previously hard-coded here).

    Returns:
        The concatenation of the symbols for every non-padding index.

    Raises:
        KeyError: If a non-padding index is missing from the mapping.
    """
    if inv_vocabulario is None:
        # Resolved at call time so the function follows the notebook's
        # current ``inv_vocab``.
        inv_vocabulario = inv_vocab
    return "".join(inv_vocabulario[i] for i in indices if i != pad_idx)

In [None]:
# Encode every sentence of the corpus as a sequence of indices
secuencias = list(map(texto_a_indices, corpus))

# Show each sentence next to its encoded form
for frase, indices in zip(corpus, secuencias):
    print(f"'{frase}' → {indices}")

# Blank line, header, then the full character -> index mapping
print("\nVocabulario:", vocab, sep="\n")

'hola mundo' → [8, 14, 11, 2, 1, 12, 19, 13, 4, 14]
'cómo estás' → [3, 22, 12, 14, 1, 5, 17, 18, 21, 17]
'el modelo autoregresivo predice tokens uno por uno' → [5, 11, 1, 12, 14, 4, 5, 11, 14, 1, 2, 19, 18, 14, 16, 5, 7, 16, 5, 17, 9, 20, 14, 1, 15, 16, 5, 4, 9, 3, 5, 1, 18, 14, 10, 5, 13, 17, 1, 19, 13, 14, 1, 15, 14, 16, 1, 19, 13, 14]
'los transformers usan atención' → [11, 14, 17, 1, 18, 16, 2, 13, 17, 6, 14, 16, 12, 5, 16, 17, 1, 19, 17, 2, 13, 1, 2, 18, 5, 13, 3, 9, 22, 13]
'la máscara causal evita ver el futuro' → [11, 2, 1, 12, 21, 17, 3, 2, 16, 2, 1, 3, 2, 19, 17, 2, 11, 1, 5, 20, 9, 18, 2, 1, 20, 5, 16, 1, 5, 11, 1, 6, 19, 18, 19, 16, 14]

Vocabulario:
{' ': 1, 'a': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'r': 16, 's': 17, 't': 18, 'u': 19, 'v': 20, 'á': 21, 'ó': 22, '<pad>': 0}


In [5]:
# Persist the vocabulary (JSON) and the encoded sequences (pickle)
import json
import pickle

# json.dump escapes non-ASCII characters by default, so the accented keys
# are written as \uXXXX sequences and the file contains only ASCII.
with open("vocab.json", "w") as archivo_vocab:
    json.dump(vocab, archivo_vocab)

# Pickle is a binary format, hence the "wb" mode.
with open("secuencias.pkl", "wb") as archivo_secuencias:
    pickle.dump(secuencias, archivo_secuencias)