In [None]:
import os
import re
import json
import warnings 

import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
from skopt import BayesSearchCV

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    classification_report,
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.exceptions import ConvergenceWarning
from tabulate import tabulate

from utils_taller3 import SenticLexiconFeaturizer

warnings.filterwarnings("ignore", category=ConvergenceWarning)

ACTUAL_PATH = os.getcwd()
PATH_20N = os.path.join(ACTUAL_PATH, "data/20news-18828")
PATH_MD = os.path.join(ACTUAL_PATH, "data/Multi Domain Sentiment/processed_acl")
PATH_FINAL_FILES = os.path.join(ACTUAL_PATH, "data/final_files")

stemmer = SnowballStemmer("english")
nltk.download("stopwords")
stop_words = stopwords.words("english")

RANDOM_STATE = 42
val_ratio_within_train = 1.0 / 7.0


[nltk_data] Downloading package stopwords to /home/erich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Upload 20N 

In [34]:
"""Para esto se define en que formato y donde se quiere el archivo completo de 20N"""

NEW_20N_FILE = os.path.join(PATH_FINAL_FILES, "20N.jsonl")

mayor_folders_20N = os.listdir(PATH_20N)
dictionary = {}
"""
Para cada archivo disponible en 20N,  
se generan registros con:  
- el ID del archivo  
- el tema del archivo  
- el texto del contenido  

Todo se guarda en formato JSONL por cuestiones de formato.
"""
with open(NEW_20N_FILE, "w", encoding="utf-8", errors="replace") as f_n:
    for folder in mayor_folders_20N:
        minor_files_path = os.path.join(PATH_20N, folder)
        minor_files = os.listdir(minor_files_path)
        for file in minor_files:
            file_path = os.path.join(minor_files_path, file)
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()

            record = {"id": file, "theme": folder, "text": text}
            unit = folder + file
            if file in dictionary.keys():
                dictionary[unit] += 1
            else:
                dictionary[unit] = 1
            f_n.write(json.dumps(record, ensure_ascii=False) + "\n")

### I. For the 20N dataset compare two classifiers NB and LR to identify the 20 different newsgroups.

In [35]:



def preprocess_text(text: str) -> list[str]:
    """Limpia y preprocesa texto: elimina correos, URLs,
    normaliza, tokeniza y aplica stemming (para inglés)."""

    text = text.lower()

    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", " ", text)

    text = re.sub(r"http\S+|www\.\S+", " ", text)

    text = re.sub(r"\([^)]*\)", " ", text)

    text = re.sub(r"\d+", " NUM ", text)

    text = re.sub(r"[^a-z0-9'\-]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    tokens = nltk.word_tokenize(text, language="english")

    tokens = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return tokens


def extract_body(text: str) -> str:
    """Extrae el cuerpo del texto dejando el Subject al inicio y eliminando From."""

    text = re.sub(r"^From:.*\n", "", text, flags=re.MULTILINE)

    subject_match = re.search(
        r"^Subject:\s*(.*)", text, flags=re.MULTILINE | re.IGNORECASE
    )
    subject = subject_match.group(1).strip() if subject_match else ""

    body = re.sub(
        r"^Subject:.*\n", "", text, flags=re.MULTILINE | re.IGNORECASE
    ).strip()

    if subject:
        body = subject + "\n\n" + body

    return body

In [36]:


texts = []
labels = []

with open(os.path.join(PATH_FINAL_FILES, "20N.jsonl")) as f:
    for line in f:
        data = json.loads(line)
        labels.append(data["theme"])
        texts.append(extract_body(data["text"]))
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42, stratify=labels
)

In [37]:
resultado_de_modelos = {}

In [38]:
def mostrar_resultados_tabulate(resultados_segunda_parte, ordenar_por="f1_macro"):
    """
    Muestra los resultados en formato de tabla usando tabulate.
    Ordena por la métrica especificada (default: f1_macro).
    """
    ejemplo = next(iter(resultados_segunda_parte.values()))
    columnas = ["Modelo"] + list(ejemplo.keys())

    filas = []
    for modelo, metricas in resultados_segunda_parte.items():
        fila = [modelo]
        for valor in metricas.values():
            fila.append(round(valor, 4) if isinstance(valor, (int, float)) else valor)
        filas.append(fila)

    if ordenar_por in ejemplo:
        idx = columnas.index(ordenar_por)
        filas.sort(key=lambda x: x[idx], reverse=True)

    print(tabulate(filas, headers=columnas, tablefmt="grid"))

In [39]:
def generar_modelo_val_train(X_train, y_train, model, text_repr):
    # Seria mas optimo tener esta seccion apartada, pero dado el numero de outputs que genera prefiero
    # por limieza tenerlo aca, ademas el dataset no es tan grande como para preocuparme por cuanto se demora esta operacion.
    # X_tr, X_val, y_tr, y_val = train_test_split(
    #     X_train,
    #     y_train,
    #     test_size=val_ratio_within_train,
    #     random_state=RANDOM_STATE,
    #     stratify=y_train,
    # )

    pipeline = Pipeline(
        [
            (
                "repr",
                text_repr,
            ),
            ("model", model),
        ]
    )

    # X_total = np.array(list(X_tr) + list(X_val), dtype=object)
    # y_total = np.array(list(y_tr) + list(y_val))

    # test_fold = np.array([-1] * len(X_tr) + [0] * len(X_val))
    # ps = PredefinedSplit(test_fold=test_fold)

    pipeline.fit(X_train, y_train)

    print("Mejores hiperparámetros:", pipeline.get_params())

    return pipeline


def evaluate(opt, X_test, y_test, print_flag=False):
    """
    Evalúa un modelo optimizado (BayesSearchCV) en el conjunto de test.
    Devuelve un diccionario con métricas.
    """
    y_pred = opt.predict(X_test)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_test, y_pred, average="macro"
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        y_test, y_pred, average="micro"
    )

    resultados = {
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        "f1_micro": f1_micro,
        "accuracy": accuracy_score(y_test, y_pred),
        "best_params": opt.get_params(),
    }
    if print_flag:
        print(classification_report(y_test, y_pred))

    return resultados


def entrenar_modelos(X_train, y_train, X_test, y_test):
    """
    Itera por diferentes combinaciones de representaciones y modelos.
    Devuelve un diccionario con los resultados de cada configuración.
    """

    configuraciones = {
        "BOW_LR_1_VAL": {
            "model": LogisticRegression(
                solver="saga", penalty="l2"  # , max_iter=iteraciones
            ),
            "text_repr": CountVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                # "repr__max_df": (0.5, 1.0),
                # "model__C": (1e-3, 1e2, "log-uniform"),
            },
        },
        "TF-IDF_LR_1_VAL": {
            "model": LogisticRegression(solver="saga", penalty="l2"),  # , max_iter=2000
            "text_repr": TfidfVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                # "repr__max_df": (0.5, 1.0),
                # "model__C": (1e-3, 1e2, "log-uniform"),
            },
        },
        "BOW_NB_1_VAL": {
            "model": MultinomialNB(),
            "text_repr": CountVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                # "repr__max_df": (0.5, 1.0),
                # "model__alpha": (1e-3, 1.0, "log-uniform"),
            },
        },
        "TF-IDF_NB_1_VAL": {
            "model": MultinomialNB(),
            "text_repr": TfidfVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                # "repr__max_df": (0.5, 1.0),
                # "model__alpha": (1e-3, 1.0, "log-uniform"),
            },
        },
    }

    resultados = {}
    for nombre, cfg in configuraciones.items():
        print(f"\nEntrenando modelo: {nombre}")
        opt = generar_modelo_val_train(
            X_train,
            y_train,
            cfg["model"],
            cfg["text_repr"],
        )
        resultados[nombre] = evaluate(opt, X_test, y_test, print_flag=True)
    return resultados

In [42]:
resultado_de_modelos = entrenar_modelos(X_train, y_train, X_test, y_test)


Entrenando modelo: BOW_LR_1_VAL
Mejores hiperparámetros: {'memory': None, 'steps': [('repr', CountVectorizer(token_pattern=None,
                tokenizer=<function preprocess_text at 0x7560c4f34e50>)), ('model', LogisticRegression(solver='saga'))], 'transform_input': None, 'verbose': False, 'repr': CountVectorizer(token_pattern=None,
                tokenizer=<function preprocess_text at 0x7560c4f34e50>), 'model': LogisticRegression(solver='saga'), 'repr__analyzer': 'word', 'repr__binary': False, 'repr__decode_error': 'strict', 'repr__dtype': <class 'numpy.int64'>, 'repr__encoding': 'utf-8', 'repr__input': 'content', 'repr__lowercase': True, 'repr__max_df': 1.0, 'repr__max_features': None, 'repr__min_df': 1, 'repr__ngram_range': (1, 1), 'repr__preprocessor': None, 'repr__stop_words': None, 'repr__strip_accents': None, 'repr__token_pattern': None, 'repr__tokenizer': <function preprocess_text at 0x7560c4f34e50>, 'repr__vocabulary': None, 'model__C': 1.0, 'model__class_weight': None, 'm

In [45]:
resultado_de_modelos

{'BOW_LR_1_VAL': {'precision_macro': 0.7032877316346806,
  'recall_macro': 0.6707547437254145,
  'f1_macro': 0.6568456131351404,
  'precision_micro': 0.6843689148521862,
  'recall_micro': 0.6843689148521862,
  'f1_micro': 0.6843689148521862,
  'accuracy': 0.6843689148521862,
  'best_params': {'memory': None,
   'steps': [('repr',
     CountVectorizer(token_pattern=None,
                     tokenizer=<function preprocess_text at 0x7560c4f34e50>)),
    ('model', LogisticRegression(solver='saga'))],
   'transform_input': None,
   'verbose': False,
   'repr': CountVectorizer(token_pattern=None,
                   tokenizer=<function preprocess_text at 0x7560c4f34e50>),
   'model': LogisticRegression(solver='saga'),
   'repr__analyzer': 'word',
   'repr__binary': False,
   'repr__decode_error': 'strict',
   'repr__dtype': numpy.int64,
   'repr__encoding': 'utf-8',
   'repr__input': 'content',
   'repr__lowercase': True,
   'repr__max_df': 1.0,
   'repr__max_features': None,
   'repr__min_d

In [48]:
mostrar_resultados_tabulate(resultado_de_modelos)

+-----------------+-------------------+----------------+------------+-------------------+----------------+------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### II Investigate cross-validation technique

#### Modelo 10 fold cross validation

In [49]:
def evaluate(opt, X_test, y_test, print_flag=False):
    y_pred = opt.predict(X_test)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_test, y_pred, average="macro"
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        y_test, y_pred, average="micro"
    )

    resultados = {
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        "f1_micro": f1_micro,
        "accuracy": accuracy_score(y_test, y_pred),
        "best_params": opt.best_params_,
    }
    if print_flag:
        print(classification_report(y_test, y_pred))

    return resultados


def cv_summary_from_opt(opt, X_test=None, y_test=None, print_flag=False):
    """
    Si se suministra X_test,y_test, devuelve exactamente lo mismo que evaluate(opt,...).
    Si no, devuelve un resumen desde opt.cv_results_ (valores medios de CV).
    """

    if X_test is not None and y_test is not None:
        return evaluate(opt, X_test, y_test, print_flag=print_flag)
    bi = opt.best_index_
    cv = opt.cv_results_

    out = {
        "precision_macro": float(cv["mean_test_precision_macro"][bi]),
        "recall_macro": float(cv["mean_test_recall_macro"][bi]),
        "f1_macro": float(cv["mean_test_f1_macro"][bi]),
        "precision_micro": float(cv["mean_test_precision_micro"][bi]),
        "recall_micro": float(cv["mean_test_recall_micro"][bi]),
        "f1_micro": float(cv["mean_test_f1_micro"][bi]),
        "accuracy": float(cv["mean_test_accuracy"][bi]),
        "best_params": opt.best_params_,
    }
    return out


def generar_modelo_val_train(
    X_train, y_train, espacio, model, text_repr, iteraciones=30
):
    pipeline = Pipeline(
        [
            ("repr", text_repr),
            ("model", model),
        ]
    )

    dict_scoring = {
        "f1_macro": "f1_macro",
        "precision_macro": "precision_macro",
        "recall_macro": "recall_macro",
        "accuracy": "accuracy",
        "f1_micro": "f1_micro",
        "precision_micro": "precision_micro",
        "recall_micro": "recall_micro",
    }

    opt = BayesSearchCV(
        estimator=pipeline,
        search_spaces=espacio,
        n_iter=iteraciones,
        scoring=dict_scoring,
        refit="f1_macro",
        cv=10,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=0,
    )

    opt.fit(X_train, y_train)
    print("Mejores hiperparámetros:", opt.best_params_)

    return opt


def entrenar_modelos(X_train, y_train, X_test, y_test, iteraciones=30):
    """
    Itera por combinaciones de representaciones y modelos (SGD y NB).
    Reporta métricas de CROSS (CV del BayesSearchCV) y TEST (hold-out).
    """
    configuraciones = {
        "BOW_SGD_10_VAL": {
            "model": SGDClassifier(loss="log_loss", random_state=RANDOM_STATE),
            "text_repr": CountVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                "repr__max_df": (0.7, 1.0),
                "model__alpha": (1e-5, 1e-2, "log-uniform"),
                "model__penalty": ["l2", None, "elasticnet"],
                "model__learning_rate": ["optimal", "constant"],
                "model__eta0": (
                    1e-3,
                    1e-1,
                    "log-uniform",
                ),
                "model__early_stopping": [True],
                "model__max_iter": (50, 150),
                "model__tol": (1e-4, 1e-2, "log-uniform"),
            },
        },
        "TF-IDF_SGD_10_VAL": {
            "model": SGDClassifier(loss="log_loss", random_state=RANDOM_STATE),
            "text_repr": TfidfVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                "repr__max_df": (0.7, 1.0),
                "model__alpha": (1e-5, 1e-2, "log-uniform"),
                "model__penalty": ["l2", None, "elasticnet"],
                "model__learning_rate": ["optimal", "constant"],
                "model__eta0": (1e-3, 1e-1, "log-uniform"),
                "model__early_stopping": [True],
                "model__max_iter": (50, 150),
                "model__tol": (1e-4, 1e-2, "log-uniform"),
            },
        },
        "BOW_NB_10_VAL": {
            "model": MultinomialNB(),
            "text_repr": CountVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                "repr__max_df": (0.5, 1.0),
                "model__alpha": (1e-3, 1.0, "log-uniform"),
                "model__fit_prior": [True, False],
            },
        },
        "TF-IDF_NB_10_VAL": {
            "model": MultinomialNB(),
            "text_repr": TfidfVectorizer(
                tokenizer=preprocess_text, preprocessor=None, token_pattern=None
            ),
            "espacio": {
                "repr__max_df": (0.5, 1.0),
                "model__alpha": (1e-3, 1.0, "log-uniform"),
                "model__fit_prior": [True, False],
            },
        },
    }

    resultados = {}
    for nombre, cfg in configuraciones.items():
        print(f"\nEntrenando modelo: {nombre}")
        opt = generar_modelo_val_train(
            X_train,
            y_train,
            cfg["espacio"],
            cfg["model"],
            cfg["text_repr"],
            iteraciones=iteraciones,
        )

        resultados[f"{nombre}__TEST"] = evaluate(opt, X_test, y_test, print_flag=True)
        resultados[f"{nombre}__CROSS"] = cv_summary_from_opt(opt)

    return resultados

In [None]:
resultado_de_modelos = entrenar_modelos(X_train, y_train, X_test, y_test, iteraciones=5)


Entrenando modelo: BOW_SGD_10_VAL


In [None]:
mostrar_resultados_tabulate(resultado_de_modelos)

+--------------------------+-------------------+----------------+------------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Modelo                   |   precision_macro |   recall_macro |   f1_macro |   precision_micro | recall_micro                                                                                                                                                                          