In [None]:
import gc
import json
import math
import os
import pickle
import random
import re
import sys

import numpy as np
from charset_normalizer import from_path
from gensim.corpora import Dictionary
import nltk
from tqdm import tqdm

"""
Se importan las librerias que se necesiten, 
si se quiere ejecutar el notebook, se recomienda crear la carpeta de data, y poner ahi los files como se describe

"""
# All paths below are resolved relative to the directory the notebook is run from.
ACTUAL_PATH = os.getcwd()
# Location of the 20 Newsgroups corpus
PATH_20N = os.path.join(ACTUAL_PATH, "data/20news-18828")
# Location of the BAC corpus
PATH_BAC = os.path.join(ACTUAL_PATH, "data/BAC/blogs")
# Folder where every generated file is stored

# IMPORTANT: the files from https://uniandes-my.sharepoint.com/:f:/g/personal/eg_soto_uniandes_edu_co/Ep4A2ReC4jNGpSyFcqflY_YBVJdekMnu7W755IMhpI33dw?e=9A6Ese must be placed in this folder.
PATH_FINAL_FILES = os.path.join(ACTUAL_PATH, "data/final_files")
# Group identifier (there is no real group number, so the authors' names are used instead)
GRUPO = "Erich_Carlos"

[nltk_data] Downloading package stopwords to /home/erich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## I. Read the files and build two large consolidated files that are the union of all the documents in 20N and BAC.

### UPLOAD_FILES

#### UPLOADING 20N 

In [5]:
"""Build the consolidated 20N corpus file (one JSON record per document)."""

NEW_20N_FILE = os.path.join(PATH_FINAL_FILES, "20N.jsonl")

# The output folder may not exist on a fresh checkout; create it so the cell
# survives "Restart Kernel -> Run All".
os.makedirs(PATH_FINAL_FILES, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the record
# order (and therefore the seeded shuffle applied later) reproducible.
mayor_folders_20N = sorted(os.listdir(PATH_20N))

# Number of times each "<theme><file id>" unit was written; a value above 1
# would reveal a duplicated document.
dictionary = {}

# For every file available in 20N a record is written with:
# - "id":    the file name
# - "theme": the newsgroup folder the file lives in
# - "text":  the stripped file content
# JSONL is used so the corpus can be streamed back one document per line.
with open(NEW_20N_FILE, "w", encoding="utf-8", errors="replace") as f_n:
    for folder in mayor_folders_20N:
        minor_files_path = os.path.join(PATH_20N, folder)
        if not os.path.isdir(minor_files_path):
            # Skip stray files (e.g. OS metadata) sitting next to the theme folders.
            continue
        minor_files = sorted(os.listdir(minor_files_path))
        for file in minor_files:
            file_path = os.path.join(minor_files_path, file)
            # Undecodable bytes are replaced instead of aborting the whole run.
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()

            record = {"id": file, "theme": folder, "text": text}
            unit = folder + file
            # The counter is keyed by `unit`, so membership must be tested on
            # `unit` as well (testing `file` never matched and could KeyError).
            dictionary[unit] = dictionary.get(unit, 0) + 1
            f_n.write(json.dumps(record, ensure_ascii=False) + "\n")

#### UPLOADING BAC

In [6]:
"""
Build the consolidated BAC corpus file.

For every blog file available in BAC, one record per post is written with:
- "id":       the name of the blog file
- "post_num": the position of the post inside that file (starting at 0)
- "text":     the stripped text of the post

Everything is stored as JSONL so it can be streamed back line by line.
"""

NEW_BAC_FILE = os.path.join(PATH_FINAL_FILES, "BAC.jsonl")

# The output folder may not exist on a fresh checkout.
os.makedirs(PATH_FINAL_FILES, exist_ok=True)

# Sorted so the record order does not depend on the file system.
mayor_folders_BAC = sorted(os.listdir(PATH_BAC))
with open(NEW_BAC_FILE, "w", encoding="utf-8", errors="replace") as f_n:
    for file in mayor_folders_BAC:
        file_path = os.path.join(PATH_BAC, file)
        # Undecodable bytes are replaced instead of aborting the whole run.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            text = f.read()

        # Keep only what sits between <post> and </post>. Splitting on "<post>"
        # also emitted whatever precedes the first post as a bogus record 0 and
        # left any markup that follows each </post> inside the post text
        # (presumably the <date> elements of the corpus - TODO confirm).
        post_list = re.findall(r"<post>(.*?)</post>", text, flags=re.DOTALL)

        post_num = 0
        for post in post_list:
            post = post.strip()
            if not post:
                # An empty post carries no sentences; do not write it.
                continue
            record = {"id": file, "post_num": post_num, "text": post}
            f_n.write(json.dumps(record, ensure_ascii=False) + "\n")
            post_num += 1

KeyboardInterrupt: 

## II. & III. 
### Tokenize by sentence 
### & 
### Select 80% of the resulting sentences (at random, without replacement) to build the N-gram model and the remaining 20% for evaluation.

### Funciones utiles

In [None]:
def preprocess_text(text: str) -> list[str]:
    """Normalise one sentence and split it into tokens.

    The text is lower-cased, every character other than ASCII letters, digits,
    whitespace and ``. , ! ?`` becomes a space, the kept punctuation marks are
    isolated as their own tokens and every run of digits is replaced by ``NUM``.
    The result is wrapped between the ``<s>`` and ``</s>`` markers.

    Args:
        text (str): sentence to process.

    Returns:
        list[str]: tokens of the sentence, including the boundary markers.
    """
    lowered = text.lower()
    cleaned = re.sub(
        r"[^a-z0-9\s.,!?]", " ", lowered, flags=re.I | re.A | re.MULTILINE
    )
    spaced = re.sub(r"([.,!?])", r" \1 ", cleaned)
    masked = re.sub(r"\d+", "NUM", spaced)
    # Whitespace splitting drops empty pieces, so the markers can be added directly.
    return ["<s>", *masked.split(), "</s>"]


def divide_sentences(text: str) -> list[str]:
    """Split a text into sentences with NLTK's sentence tokenizer.

    Args:
        text (str): full text of a document.

    Returns:
        list[str]: the sentences returned by ``nltk.sent_tokenize``.
    """
    return nltk.sent_tokenize(text)

#### Examples

In [None]:
# Example: split a sample blog post into sentences and display the result.
text = """Welcome! As soon as we can, Jon and I are going to try to get the class list from the registrar to invite the whole class, but in the meantime, tell everyone you know in YLS04 to check it out.   We'd conceived of this place as a way for the class to keep in touch, share legal experiences (working, clerking, keeping ourselves out of jail, etc.) and express opinions on everything and anything.  Basically, we want to entertain ourselves and each other.  Obviously, it will become whatever we all want it to be, and I can't wait to see what bizarre form it takes.  If you'd like to post but haven't gotten a log-in, e-mail Jon or me, and we'll put it up for you or arrange a log-in.  There are no rules per se, but as a general rule, if you're posting something that has to do with your work, obviously be aware of whatever privileges/ethical guidelines apply (the secrecy of a judge's chambers, attorney-client privilege, general theories of slander: i.e. "Tony Kronman smokes crack" might cause us some problems).  Remember,  anyone  can see this blog (and I am sure the rest of the world is just dying to tune in to hear what YLS04 has to say), so be cautious.  But not if it means holding back something entertaining, of course.  First order of business -- what should we call it?  We were thinking of it as a virtual "Wall" -- with fewer (only slightly) personal attacks and more fun, but couldn't think of a really good name.  So let's hear some suggestions."""
sentences = divide_sentences(text)
# Last expression of the cell: the list of sentences is shown as the cell output.
sentences

['Welcome!',
 'As soon as we can, Jon and I are going to try to get the class list from the registrar to invite the whole class, but in the meantime, tell everyone you know in YLS04 to check it out.',
 "We'd conceived of this place as a way for the class to keep in touch, share legal experiences (working, clerking, keeping ourselves out of jail, etc.)",
 'and express opinions on everything and anything.',
 'Basically, we want to entertain ourselves and each other.',
 "Obviously, it will become whatever we all want it to be, and I can't wait to see what bizarre form it takes.",
 "If you'd like to post but haven't gotten a log-in, e-mail Jon or me, and we'll put it up for you or arrange a log-in.",
 "There are no rules per se, but as a general rule, if you're posting something that has to do with your work, obviously be aware of whatever privileges/ethical guidelines apply (the secrecy of a judge's chambers, attorney-client privilege, general theories of slander: i.e.",
 '"Tony Kronman s

In [None]:
# Example: tokenise every sentence obtained above; each entry is the token list
# of one sentence, wrapped between <s> and </s>.
processed = [preprocess_text(s) for s in sentences]

print(processed)

[['<s>', 'welcome', '!', '</s>'], ['<s>', 'as', 'soon', 'as', 'we', 'can', ',', 'jon', 'and', 'i', 'are', 'going', 'to', 'try', 'to', 'get', 'the', 'class', 'list', 'from', 'the', 'registrar', 'to', 'invite', 'the', 'whole', 'class', ',', 'but', 'in', 'the', 'meantime', ',', 'tell', 'everyone', 'you', 'know', 'in', 'ylsNUM', 'to', 'check', 'it', 'out', '.', '</s>'], ['<s>', 'we', 'd', 'conceived', 'of', 'this', 'place', 'as', 'a', 'way', 'for', 'the', 'class', 'to', 'keep', 'in', 'touch', ',', 'share', 'legal', 'experiences', 'working', ',', 'clerking', ',', 'keeping', 'ourselves', 'out', 'of', 'jail', ',', 'etc', '.', '</s>'], ['<s>', 'and', 'express', 'opinions', 'on', 'everything', 'and', 'anything', '.', '</s>'], ['<s>', 'basically', ',', 'we', 'want', 'to', 'entertain', 'ourselves', 'and', 'each', 'other', '.', '</s>'], ['<s>', 'obviously', ',', 'it', 'will', 'become', 'whatever', 'we', 'all', 'want', 'it', 'to', 'be', ',', 'and', 'i', 'can', 't', 'wait', 'to', 'see', 'what', 'biz

### Tokenizacion por sentencia y preprocesamiento de las mismas.

Para cada archivo y para cada texto contenido en él:  
1. Se divide el texto en oraciones y se normaliza (minúsculas, 
   manejo de números, limpieza de símbolos innecesarios, etc.).  
2. Se agregan las etiquetas `<s>` y `</s>` para marcar el inicio y fin de cada oración.  
3. Se construye un diccionario de tokens.  
4. Todas las palabras con frecuencia igual a 1 se reemplazan por el token <unk>, 
   con el fin de reducir ruido y manejar vocabulario raro o desconocido.  

In [None]:
def save_pickle(data, filename: str):
    """Serialise ``data`` with pickle into ``PATH_FINAL_FILES/filename``.

    Args:
        data: object to store.
        filename (str): name of the file to write inside ``PATH_FINAL_FILES``.
    """
    target = os.path.join(PATH_FINAL_FILES, filename)
    with open(target, "wb") as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Pass 1: count every token of the 20N corpus so that tokens seen only once
# can later be replaced by <UNK>.
# The JSONL file was written as UTF-8, so it must be read back as UTF-8
# instead of relying on the platform default encoding.
word_count = {}
with open(os.path.join(PATH_FINAL_FILES, "20N.jsonl"), "r", encoding="utf-8") as f:
    for line in f:
        line = json.loads(line)
        for word in preprocess_text(line["text"]):
            word_count[word] = word_count.get(word, 0) + 1
# From here on `word_count` only holds the tokens with frequency 1.
word_count = {word: count for word, count in word_count.items() if count == 1}

# Pass 2: split every 20N document into sentences, tokenise each sentence and
# replace the frequency-1 tokens by <UNK>. Every sentence is kept as a list of
# tokens.
sentencias = []
with open(os.path.join(PATH_FINAL_FILES, "20N.jsonl"), "r", encoding="utf-8") as f:
    for line in f:
        line = json.loads(line)
        sentences = divide_sentences(line["text"])
        for sentence in sentences:
            pre_process = preprocess_text(sentence)
            tokens = [
                word if word not in word_count else "<UNK>" for word in pre_process
            ]
            sentencias.append(tokens)

# Random 80/20 split without replacement: shuffle once with a fixed seed and
# cut the list at the 80% mark.
random.seed(42)
random.shuffle(sentencias)
index_to_split = int(0.8 * len(sentencias))
train_sentences = sentencias[:index_to_split]
test_sentences = sentencias[index_to_split:]
save_pickle(train_sentences, f"20N_{GRUPO}_training.pkl")
save_pickle(test_sentences, f"20N_{GRUPO}_testing.pkl")

# The sentence lists are no longer needed once they are on disk; release them
# to keep the kernel's memory usage down.
del sentencias
del train_sentences
del test_sentences
gc.collect()

In [None]:
def contar_lineas(path):
    """Return the number of lines of the UTF-8 text file at ``path``.

    The file is streamed, so it is never loaded in memory as a whole.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return sum(1 for _ in handle)


# Path of the consolidated BAC corpus and its number of lines; the count is
# only used as the `total` of the tqdm progress bars.
file_path = os.path.join(PATH_FINAL_FILES, "BAC.jsonl")
total_lineas = contar_lineas(file_path)

""" Diccionario para contar palabras y asi reemplazar despues por <UNK>
"""
# First pass over BAC: count how many times each token appears.
word_count = {}
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=total_lineas, desc="Contando palabras", unit="línea"):
        line = json.loads(line)
        for word in preprocess_text(line["text"]):
            word_count[word] = word_count.get(word, 0) + 1

# Keep only the tokens seen exactly once; these are the ones replaced by <UNK>.
word_count = {word: count for word, count in word_count.items() if count == 1}
"""
Se itera sobre los textos de BAC, dividiendo en oraciones y aplicando preprocesamiento.  
Cada oración se añade a una lista para su posterior uso.  
Luego, se realiza una partición aleatoria en conjuntos de entrenamiento y prueba,  
los cuales finalmente se guardan en archivos.
"""
# Second pass over BAC: split each post into sentences, tokenise every sentence
# and replace the frequency-1 tokens by <UNK>.
sentencias = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=total_lineas, desc="Procesando oraciones", unit="línea"):
        line = json.loads(line)
        sentences = divide_sentences(line["text"])
        for sentence in sentences:
            pre_process = preprocess_text(sentence)
            tokens = [
                word if word not in word_count else "<UNK>" for word in pre_process
            ]
            sentencias.append(tokens)
            # print(sentence, pre_process)
# Seeded shuffle followed by a cut at the 80% mark: first part is the training
# set, the rest is the test set.
random.seed(42)
random.shuffle(sentencias)
split_idx = int(0.8 * len(sentencias))
train_sentences = sentencias[:split_idx]
test_sentences = sentencias[split_idx:]

In [None]:
# Persist the BAC training and test sentences built in the previous cell.
save_pickle(train_sentences, f"BAC_{GRUPO}_training.pkl")
save_pickle(test_sentences, f"BAC_{GRUPO}_testing.pkl")

""" Se eliminan los archivos que no se necesitan por performance, (mi pc se ponia lenta).
"""
# The in-memory sentence lists are dropped once they are on disk to free
# kernel memory.
del sentencias
del train_sentences
del test_sentences
gc.collect()

8

## IV. Calcular N Gramas

### Calculo de N-Gramas

#### Clases de los N-Gramas, (Unigramas, Bigramas y Trigramas)

In [None]:
class UnigramModel:
    """Unigram language model backed by raw token counts.

    The model keeps three attributes:
    - word_counter_20N: dict mapping each token to its number of occurrences
      (the name is historical; the same class is used for every corpus).
    - total_words: sum of all the counts.
    - V: number of distinct tokens.
    """

    def __init__(self, filename: str, file_is_training=True):
        """Build the model from a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the pickle file to read.
            file_is_training (bool, optional):
                - True: the pickle holds the training sentences (lists of
                  tokens) and the counts are computed from scratch.
                - False: the pickle holds a payload written by `save_model`
                  and the counts are read from it.
        """
        print(filename)
        file = self.get_pickle(filename)
        if file_is_training:
            self.word_counter_20N = {}
            for sentence in file:
                for word in sentence:
                    self.word_counter_20N[word] = self.word_counter_20N.get(word, 0) + 1
        else:
            self.word_counter_20N = file["word_counter_20N"]

        # Both figures are derived from the counter, so they are computed once
        # here for either branch and can never disagree with it.
        self.total_words = sum(self.word_counter_20N.values())
        self.V = len(self.word_counter_20N)

    def get_pickle(self, filename: str):
        """Load and return the content of a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the file to open.
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        # NOTE: pickle.load can execute arbitrary code; only open files that
        # were produced by this notebook.
        with open(filepath, "rb") as f:
            sentences = pickle.load(f)
        return sentences

    def generate_unigrams(self, filename: str):
        """Write one JSON line per vocabulary token with its probability.

        Args:
            filename (str): name of the .jsonl file created in PATH_FINAL_FILES.
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            for word in self.word_counter_20N.keys():
                prob = self.get_prob(word)
                f.write(json.dumps({"word": word, "probability": prob}) + "\n")

    def get_prob(self, word: str) -> float:
        """Return the (non-logarithmic) unigram probability of a token.

        The token is looked up exactly as given first, so the upper-case
        special tokens (`NUM`, `<UNK>`) are found; then in lower case, because
        regular vocabulary entries are lower-cased; otherwise the probability
        of `<UNK>` is returned.

        Args:
            word (str): token to look up.

        Returns:
            float: count of the matched entry divided by `total_words`
            (0.0 for an empty model or when `<UNK>` itself is missing).
        """
        counts = self.word_counter_20N
        if not self.total_words:
            return 0.0
        if word in counts:
            count = counts[word]
        elif word.lower() in counts:
            count = counts[word.lower()]
        else:
            count = counts.get("<UNK>", 0)
        return count / self.total_words

    def get_next_token(self) -> str:
        """Sample one token proportionally to the unigram counts.

        `<s>` is left out of the candidates because a start marker is never a
        valid continuation of a sentence. If nothing can be sampled, `</s>` is
        returned so that generation terminates.
        """
        candidates = [
            (token, count)
            for token, count in self.word_counter_20N.items()
            if token != "<s>" and count > 0
        ]
        if not candidates:
            return "</s>"
        tokens, weights = zip(*candidates)
        # Counts are proportional to the probabilities, so they can be used as
        # sampling weights directly.
        return random.choices(tokens, weights=weights, k=1)[0]

    def generate_sentences(self, limit: int = 50) -> str:
        """Generate one sentence by sampling unigrams independently.

        Args:
            limit (int, optional): maximum number of sampled tokens. Defaults to 50.

        Returns:
            str: the generated tokens joined by spaces, wrapped between
            `<s>` and `</s>`.
        """
        sentence = ["<s>"]
        for _ in range(limit):
            token = self.get_next_token()
            if token == "</s>":
                break
            sentence.append(token)
        sentence.append("</s>")
        return " ".join(sentence)

    def save_model(self, filename: str):
        """Store the trained model as a pickle in PATH_FINAL_FILES.

        The payload contains:
        - word_counter_20N: dictionary of token counts.
        - total_words: total number of tokens in the corpus.
        - V: vocabulary size.

        Args:
            filename (str): name of the output file.
        """
        payload = {
            "word_counter_20N": self.word_counter_20N,
            "total_words": self.total_words,
            "V": self.V,
        }
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        with open(filepath, "wb") as f:
            pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)


class BigramModel:
    """Bigram language model with Laplace (add-one) smoothing.

    Attributes:
    - dictionary: gensim Dictionary mapping tokens <-> integer IDs.
    - V: number of distinct tokens (words plus special tokens).
    - matrix: dict {(id_w1, id_w2): count} holding only the observed bigrams
      (a dense V x V matrix would not fit in memory).
    - row_sums: dict {id_w1: count}, how many times each token opens a bigram;
      it is the pre-computed denominator of the conditional probability.
    """

    def __init__(self, filename: str, file_is_training=True):
        """Build the model from a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the pickle file to read.
            file_is_training (bool, optional):
                - True: the pickle holds the training sentences (lists of
                  tokens) and the counts are computed from scratch.
                - False: the pickle holds a payload written by `save_model`.
        """

        data = self._load_pickle(filename)

        if file_is_training:
            self.dictionary = Dictionary(data)
            # `_word_index` falls back to <UNK>, so the token must always have
            # an ID even if the training data happens not to contain it
            # (same guarantee TrigramModel gives).
            self.dictionary.add_documents([["<UNK>"]])
            self.V = len(self.dictionary)

            self.matrix = {}
            self.row_sums = {}

            for sentence in data:
                ids = [self._word_index(token) for token in sentence]
                # Every adjacent pair is one bigram; the last token of a
                # sentence never opens one.
                for w_idx, w_next_idx in zip(ids, ids[1:]):
                    key = (w_idx, w_next_idx)
                    self.matrix[key] = self.matrix.get(key, 0) + 1
                    self.row_sums[w_idx] = self.row_sums.get(w_idx, 0) + 1
        else:
            self.dictionary = data["dictionary"]
            self.matrix = dict(data["matrix"])
            self.row_sums = dict(data["row_sums"])

            # Rebuild the reverse mapping when the loaded dictionary lacks it.
            if not hasattr(self.dictionary, "id2token") or not self.dictionary.id2token:
                self.dictionary.id2token = {
                    i: t for t, i in self.dictionary.token2id.items()
                }

            self.V = len(self.dictionary.token2id)

    def _word_index(self, word: str) -> int:
        """Return the ID of a token, or the ID of <UNK> when it is unknown.

        Args:
            word (str): token whose ID is requested.

        Returns:
            int: ID of the token or, if it is not in the dictionary, of <UNK>.
        """
        tid = self.dictionary.token2id.get(word)
        if tid is None:
            tid = self.dictionary.token2id["<UNK>"]
        return tid

    def _load_pickle(self, filename: str):
        """Load and return the content of a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the file.

        Returns:
            object: whatever structure the pickle holds.
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        # NOTE: pickle.load can execute arbitrary code; only open files that
        # were produced by this notebook.
        with open(filepath, "rb") as f:
            return pickle.load(f)

    def save_model(self, filename: str):
        """Store the model as a pickle so it does not have to be recomputed.

        Args:
            filename (str): name of the file created in PATH_FINAL_FILES.
        """
        payload = {
            "dictionary": self.dictionary,
            "V": self.V,
            "matrix": dict(self.matrix),
            "row_sums": dict(self.row_sums),
        }
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        with open(filepath, "wb") as f:
            pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

    def token_of(self, idx: int) -> str:
        """Return the token associated with an ID.

        Args:
            idx (int): token ID.

        Returns:
            str: the token, or <UNK> when the ID is not in the dictionary.
        """
        try:
            return self.dictionary[idx]
        except KeyError:
            return "<UNK>"

    def get_prob(self, words: list[str]) -> float:
        """Return the Laplace-smoothed log-probability of a bigram.

        log P(w2 | w1) = log((count(w1, w2) + 1) / (count(w1, *) + V))

        Args:
            words (list[str]): the two tokens [w1, w2].

        Returns:
            float: natural logarithm of the smoothed probability.
        """
        m_i = self._word_index(words[0])
        m_j = self._word_index(words[1])
        c_bigram = self.matrix.get((m_i, m_j), 0)
        row_sum = self.row_sums.get(m_i, 0)
        # Plain float, consistent with TrigramModel.get_prob.
        return float(np.log((c_bigram + 1) / (row_sum + self.V)))

    def generate_bigrams(self, filename: str):
        """Write the log-probability of every OBSERVED bigram as JSONL.

        Only the keys of `self.matrix` are written, including the bigrams that
        involve <UNK>. The "probabilidad" field holds the value returned by
        `get_prob`, i.e. a log-probability.

        Args:
            filename (str): name of the .jsonl file created in PATH_FINAL_FILES.
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)

        with open(filepath, "w", encoding="utf-8") as f:
            for (i, j), _count in self.matrix.items():
                w1 = self.token_of(i)
                w2 = self.token_of(j)
                prob = self.get_prob([w1, w2])
                record = {"w1": w1, "w2": w2, "probabilidad": prob}
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def get_next_token(self, words: list[str], top_k: int = 10) -> str:
        """Predict the token that follows `words[0]`.

        The smoothed probability of every vocabulary token given the context
        is computed, the `top_k` most probable ones are kept and one of them
        is sampled proportionally to its probability.

        Args:
            words (list[str]): list whose first token is the context.
            top_k (int): number of most probable candidates to sample from.

        Returns:
            str: the predicted next token.
        """
        # The context ID and the denominator do not depend on the candidate,
        # so they are resolved once instead of once per vocabulary entry.
        # Candidate IDs are taken as 0..V-1, the IDs a Dictionary built from
        # documents assigns.
        prev_id = self._word_index(words[0])
        denominator = self.row_sums.get(prev_id, 0) + self.V

        scored = [(k, self.matrix.get((prev_id, k), 0)) for k in range(self.V)]
        # The probability grows with the count, so ranking by count is the
        # same as ranking by log-probability (ties keep ID order).
        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_k]

        indices = [k for k, _ in best]
        probs = [(count + 1) / denominator for _, count in best]

        chosen_idx = random.choices(indices, weights=probs, k=1)[0]
        return self.token_of(chosen_idx)

    def generate_sentences(
        self, words: list[str], limit: int = 50, top_k: int = 10
    ) -> str:
        """Generate a sentence starting from `words[0]`.

        Generation stops when `</s>` is predicted (it is not included in the
        output) or after `limit` generated tokens.

        Args:
            words (list[str]): list whose first token starts the sentence.
            limit (int): maximum number of generated tokens.
            top_k (int): number of candidates considered at every step.

        Returns:
            str: the generated tokens joined by spaces.
        """
        current = words[0]
        out = [current]

        for _ in range(limit):
            nxt = self.get_next_token([current], top_k=top_k)
            if nxt == "</s>":
                break
            out.append(nxt)
            current = nxt

        return " ".join(out)


class TrigramModel:
    """Trigram language model with Laplace (add-one) smoothing.

    Attributes:
    - dictionary: gensim Dictionary mapping tokens <-> integer IDs.
    - V: number of distinct tokens.
    - matrix_trigram: dict {(id_w1, id_w2, id_w3): count} with the observed
      trigrams only.
    - pair_sums: dict {(id_w1, id_w2): count}, how many times each pair opens
      a trigram; it is the pre-computed denominator of the probability.
    """

    def __init__(self, filename, file_is_training=True):
        """Build the model from a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the pickle file to read.
            file_is_training (bool, optional):
                - True: the pickle holds the training sentences (lists of
                  tokens) and the counts are computed from scratch.
                - False: the pickle holds a payload written by `save_model`.
        """
        data = self._load_pickle(filename)
        if file_is_training:
            self.dictionary = Dictionary(data)
            # Guarantee that <UNK> has an ID; `_word_index` relies on it.
            self.dictionary.add_documents([["<UNK>"]])
            self.V = len(self.dictionary)
            self.matrix_trigram = {}
            self.pair_sums = {}
            for sent in data:
                ids = [self._word_index(w) for w in sent]
                for t in range(len(ids) - 2):
                    i, j, k = ids[t], ids[t + 1], ids[t + 2]
                    key3 = (i, j, k)
                    key2 = (i, j)
                    self.matrix_trigram[key3] = self.matrix_trigram.get(key3, 0) + 1
                    self.pair_sums[key2] = self.pair_sums.get(key2, 0) + 1
        else:
            self.dictionary = data["dictionary"]
            self.matrix_trigram = dict(data["matrix_trigram"])
            self.pair_sums = dict(data["pair_sums"])

            # Rebuild the reverse mapping when the loaded dictionary lacks it.
            if not hasattr(self.dictionary, "id2token") or not self.dictionary.id2token:
                self.dictionary.id2token = {
                    i: t for t, i in self.dictionary.token2id.items()
                }

            self.V = len(self.dictionary.token2id)

    def _word_index(self, word: str) -> int:
        """Return the ID of a token, or the ID of <UNK> when it is unknown.

        Args:
            word (str): token to look up.

        Returns:
            int: ID of the token or of <UNK>.
        """
        tid = self.dictionary.token2id.get(word)
        if tid is None:
            tid = self.dictionary.token2id["<UNK>"]
        return tid

    def _load_pickle(self, filename: str):
        """Load and return the content of a pickle stored in PATH_FINAL_FILES.

        Args:
            filename (str): name of the file to load.

        Returns:
            object: content of the pickle (corpus or saved model).
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        # NOTE: pickle.load can execute arbitrary code; only open files that
        # were produced by this notebook.
        with open(filepath, "rb") as f:
            return pickle.load(f)

    def save_model(self, filename: str):
        """Store the trained model as a pickle in PATH_FINAL_FILES.

        The payload contains the token dictionary, the vocabulary size, the
        observed trigram counts and the pair counts.

        Args:
            filename (str): name of the output file.
        """
        payload = {
            "dictionary": self.dictionary,
            "V": self.V,
            "matrix_trigram": dict(self.matrix_trigram),
            "pair_sums": dict(self.pair_sums),
        }
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        with open(filepath, "wb") as f:
            pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

    def token_of(self, idx: int) -> str:
        """Return the token associated with an ID.

        Args:
            idx (int): token ID.

        Returns:
            str: the token, or <UNK> when the ID is not in the dictionary.
        """
        try:
            return self.dictionary[idx]
        except KeyError:
            return "<UNK>"

    def get_prob(self, words: list[str]) -> float:
        """Return the Laplace-smoothed log-probability of a trigram.

        log P(w_k | w_i, w_j) = log((count(i, j, k) + 1) / (count(i, j) + V))

        Args:
            words (list[str]): the three tokens [w_i, w_j, w_k].

        Returns:
            float: natural logarithm of the smoothed probability.
        """
        i = self._word_index(words[0])
        j = self._word_index(words[1])
        k = self._word_index(words[2])
        V = self.V
        c_ijk = self.matrix_trigram.get((i, j, k), 0)
        denom = self.pair_sums.get((i, j), 0)
        return float(np.log((c_ijk + 1) / (denom + V)))

    def generate_trigrams(self, filename: str):
        """Write the log-probability of every OBSERVED trigram as JSONL.

        Only the keys of `self.matrix_trigram` are written, including the
        trigrams that involve <UNK>. The "probabilidad" field holds the value
        returned by `get_prob`, i.e. a log-probability.

        Args:
            filename (str): name of the .jsonl file created in PATH_FINAL_FILES.
        """
        filepath = os.path.join(PATH_FINAL_FILES, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            for (i, j, k), _count in self.matrix_trigram.items():
                w1, w2, w3 = self.token_of(i), self.token_of(j), self.token_of(k)
                prob = self.get_prob([w1, w2, w3])
                record = {"w1": w1, "w2": w2, "w3": w3, "probabilidad": prob}
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def get_next_token(self, words: list[str], top_k: int = 10) -> str:
        """Predict the token that follows the pair `(words[0], words[1])`.

        When the pair was observed in training, the `top_k` most probable
        continuations are kept and one of them is sampled proportionally to
        its smoothed probability. When the pair was never observed, add-one
        smoothing makes every token equally likely, so a token is drawn
        uniformly from the vocabulary.

        Args:
            words (list[str]): the two context tokens.
            top_k (int): number of most probable candidates to sample from.

        Returns:
            str: the predicted next token.
        """
        # `pair_sums` is keyed by token IDs, so the context must be mapped to
        # IDs before testing membership (a tuple of strings never matches).
        i = self._word_index(words[0])
        j = self._word_index(words[1])
        context_count = self.pair_sums.get((i, j))
        if context_count is None:
            return self.token_of(random.randint(0, self.V - 1))

        denominator = context_count + self.V
        # Candidate IDs are taken as 0..V-1, the IDs a Dictionary built from
        # documents assigns.
        scored = [(k, self.matrix_trigram.get((i, j, k), 0)) for k in range(self.V)]
        # The probability grows with the count, so ranking by count is the
        # same as ranking by log-probability (ties keep ID order).
        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_k]

        indices = [k for k, _ in best]
        probs = [(count + 1) / denominator for _, count in best]

        chosen_idx = random.choices(indices, weights=probs, k=1)[0]
        return self.token_of(chosen_idx)

    def generate_sentences(
        self, words: list[str], limit=50, top_k: int = 10
    ) -> str:
        """Generate a sentence that continues the given context.

        At every step the next token is predicted from the last two tokens,
        appended to the sentence, and the context is shifted by one. The
        process stops right after `</s>` is produced or once `limit` tokens
        have been generated. The list passed by the caller is not modified.

        Args:
            words (list[str]): initial context with (at least) two tokens.
            limit (int, optional): maximum number of generated tokens.
                Defaults to 50.
            top_k (int, optional): number of candidates considered at every
                step. Defaults to 10.

        Returns:
            str: the initial tokens followed by the generated ones, joined by
            spaces.
        """
        generated = list(words)
        # For the documented two-token input these are words[0] and words[1].
        w1, w2 = words[-2], words[-1]
        for _ in range(limit):
            predicted_token = self.get_next_token([w1, w2], top_k=top_k)
            generated.append(predicted_token)
            if predicted_token == "</s>":
                break
            w1, w2 = w2, predicted_token
        return " ".join(generated)

#### Entrenamiento de los N-Gramas, creacion de los archivos con las probabilidades y guardado de los archivos que contienen las estructuras de los N-Gramas

In [None]:
def get_pickle(filename: str) -> list[str]:
    """Load an object previously saved as a pickle file.

    Args:
        filename (str): Name of the file, resolved relative to
            ``PATH_FINAL_FILES``.

    Returns:
        Whatever object was serialized in the file. The signature is
        annotated as ``list[str]``, but nothing in this function enforces
        that type.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(PATH_FINAL_FILES, filename), "rb") as handle:
        return pickle.load(handle)

In [None]:
# Build the probability files and persist the trained models.
# Unigrams: the same pipeline runs once per corpus, so it is written as a
# single loop instead of two copy-pasted blocks.
for corpus in ("20N", "BAC"):
    print(f"Iniciando la construcción del modelo de unigramas para el corpus {corpus}...")
    modelo_de_unigramas = UnigramModel(f"{corpus}_{GRUPO}_training.pkl")
    print(f"   • Procesando y generando la distribución de unigramas ({corpus}).")
    modelo_de_unigramas.generate_unigrams(f"{corpus}_{GRUPO}_unigrams.jsonl")
    print(f"   • Distribución de unigramas generada satisfactoriamente ({corpus}).")
    modelo_de_unigramas.save_model(f"{corpus}_{GRUPO}_unigram_model.pkl")
    print(f"   • Modelo de unigramas almacenado correctamente ({corpus}).")
    # Release the model before the next corpus is processed.
    del modelo_de_unigramas
    gc.collect()
    print(f"✓ Proceso completado: unigramas del corpus {corpus}.\n")

Iniciando la construcción del modelo de unigramas para el corpus 20N...
20N_Erich_Carlos_training.pkl
   • Procesando y generando la distribución de unigramas (20N).
   • Distribución de unigramas generada satisfactoriamente (20N).
   • Modelo de unigramas almacenado correctamente (20N).
✓ Proceso completado: unigramas del corpus 20N.

Iniciando la construcción del modelo de unigramas para el corpus BAC...
BAC_Erich_Carlos_training.pkl
   • Procesando y generando la distribución de unigramas (BAC).
   • Distribución de unigramas generada satisfactoriamente (BAC).
   • Modelo de unigramas almacenado correctamente (BAC).
✓ Proceso completado: unigramas del corpus BAC.



In [None]:
# Bigrams: the same pipeline runs once per corpus, so it is written as a
# single loop instead of two copy-pasted blocks.
for corpus in ("20N", "BAC"):
    print(f"Iniciando la construcción del modelo de bigramas para el corpus {corpus}...")
    modelo_de_bigramas = BigramModel(f"{corpus}_{GRUPO}_training.pkl")
    print(f"   • Procesando y generando la distribución de bigramas ({corpus}).")
    modelo_de_bigramas.generate_bigrams(f"{corpus}_{GRUPO}_bigrams.jsonl")
    print(f"   • Distribución de bigramas generada satisfactoriamente ({corpus}).")
    modelo_de_bigramas.save_model(f"{corpus}_{GRUPO}_bigram_model.pkl")
    print(f"   • Modelo de bigramas almacenado correctamente ({corpus}).")
    # Release the model before the next corpus is processed.
    del modelo_de_bigramas
    gc.collect()
    print(f"✓ Proceso completado: bigramas del corpus {corpus}.\n")

Iniciando la construcción del modelo de bigramas para el corpus 20N...
   • Procesando y generando la distribución de bigramas (20N).
   • Distribución de bigramas generada satisfactoriamente (20N).
   • Modelo de bigramas almacenado correctamente (20N).
✓ Proceso completado: bigramas del corpus 20N.

Iniciando la construcción del modelo de bigramas para el corpus BAC...
   • Procesando y generando la distribución de bigramas (BAC).
   • Distribución de bigramas generada satisfactoriamente (BAC).
   • Modelo de bigramas almacenado correctamente (BAC).
✓ Proceso completado: bigramas del corpus BAC.



In [None]:
# Trigrams: the same pipeline runs once per corpus, so it is written as a
# single loop instead of two copy-pasted blocks.
for corpus in ("20N", "BAC"):
    print(f"Iniciando la construcción del modelo de trigramas para el corpus {corpus}...")
    modelo_de_trigramas = TrigramModel(f"{corpus}_{GRUPO}_training.pkl")
    print(f"   • Procesando y generando la distribución de trigramas ({corpus}).")
    modelo_de_trigramas.generate_trigrams(f"{corpus}_{GRUPO}_trigrams.jsonl")
    print(f"   • Distribución de trigramas generada satisfactoriamente ({corpus}).")
    modelo_de_trigramas.save_model(f"{corpus}_{GRUPO}_trigram_model.pkl")
    print(f"   • Modelo de trigramas almacenado correctamente ({corpus}).")
    # Release the model before the next corpus is processed.
    del modelo_de_trigramas
    gc.collect()
    print(f"✓ Proceso completado: trigramas del corpus {corpus}.\n")

Iniciando la construcción del modelo de trigramas para el corpus 20N...
   • Procesando y generando la distribución de trigramas (20N).
   • Distribución de trigramas generada satisfactoriamente (20N).
   • Modelo de trigramas almacenado correctamente (20N).
✓ Proceso completado: trigramas del corpus 20N.

Iniciando la construcción del modelo de trigramas para el corpus BAC...
   • Procesando y generando la distribución de trigramas (BAC).
   • Distribución de trigramas generada satisfactoriamente (BAC).
   • Modelo de trigramas almacenado correctamente (BAC).
✓ Proceso completado: trigramas del corpus BAC.

