In [None]:
import gc
import json
import math
import os
import pickle
import random
import re
import sys

import numpy as np
from charset_normalizer import from_path
from gensim.corpora import Dictionary
import nltk
from tqdm import tqdm

"""
Se importan las librerias que se necesiten, 
si se quiere ejecutar el notebook, se recomienda crear la carpeta de data, y poner ahi los files como se describe

"""
ACTUAL_PATH = os.getcwd()
# Donde esta el 20 News
PATH_20N = os.path.join(ACTUAL_PATH, "data/20news-18828")
# Donde se encuentra el BAC
PATH_BAC = os.path.join(ACTUAL_PATH, "data/BAC/blogs")
# Donde se van a guardar los files que se van obteniendo
PATH_FINAL_FILES = os.path.join(ACTUAL_PATH, "data/final_files")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, PredefinedSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from skopt import BayesSearchCV
import numpy as np

stemmer = nltk.stem.SnowballStemmer("english")
nltk.download("stopwords")

RANDOM_STATE = 42

val_ratio_within_train = 1.0 / 7.0

### Upload 20N 

In [2]:
"""Para esto se define en que formato y donde se quiere el archivo completo de 20N"""

NEW_20N_FILE = os.path.join(PATH_FINAL_FILES, "20N.jsonl")

mayor_folders_20N = os.listdir(PATH_20N)
dictionary = {}
"""
Para cada archivo disponible en 20N,  
se generan registros con:  
- el ID del archivo  
- el tema del archivo  
- el texto del contenido  

Todo se guarda en formato JSONL por cuestiones de formato.
"""
with open(NEW_20N_FILE, "w", encoding="utf-8", errors="replace") as f_n:
    for folder in mayor_folders_20N:
        minor_files_path = os.path.join(PATH_20N, folder)
        minor_files = os.listdir(minor_files_path)
        for file in minor_files:
            file_path = os.path.join(minor_files_path, file)
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read().strip()

            record = {"id": file, "theme": folder, "text": text}
            unit = folder + file
            if file in dictionary.keys():
                dictionary[unit] += 1
            else:
                dictionary[unit] = 1
            f_n.write(json.dumps(record, ensure_ascii=False) + "\n")

## I. For the 20N dataset compare two classifiers NB and LR to identify the 20 different newsgroups.

In [28]:
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords


stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")


def preprocess_text(text: str) -> list[str]:
    """Limpia y preprocesa texto: elimina correos, URLs,
    normaliza, tokeniza y aplica stemming (para inglés)."""

    text = text.lower()

    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", " ", text)

    text = re.sub(r"http\S+|www\.\S+", " ", text)

    text = re.sub(r"\([^)]*\)", " ", text)

    text = re.sub(r"\d+", " NUM ", text)

    text = re.sub(r"[^a-z0-9'\-]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    tokens = nltk.word_tokenize(text, language="english")

    tokens = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    return tokens


def extract_body(text: str) -> str:
    """Extrae el cuerpo del texto dejando el Subject al inicio y eliminando From."""

    text = re.sub(r"^From:.*\n", "", text, flags=re.MULTILINE)

    subject_match = re.search(
        r"^Subject:\s*(.*)", text, flags=re.MULTILINE | re.IGNORECASE
    )
    subject = subject_match.group(1).strip() if subject_match else ""

    body = re.sub(
        r"^Subject:.*\n", "", text, flags=re.MULTILINE | re.IGNORECASE
    ).strip()

    if subject:
        body = subject + "\n\n" + body

    return body

In [None]:
from sklearn.model_selection import train_test_split

texts = []
labels = []

with open(os.path.join(PATH_FINAL_FILES, "20N.jsonl")) as f:
    for line in f:
        data = json.loads(line)
        labels.append(data["theme"])
        texts.append(extract_body(data["text"]))
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42, stratify=labels
)

In [None]:
def generar_modelo_val_train(
    X_train, y_train, X_test, y_test, espacio, model, text_repr, iteraciones=30
):
    # Seria mas optimo tener esta seccion apartada, pero dado el numero de outputs que genera prefiero
    # por limieza tenerlo aca, ademas el dataset no es tan grande como para preocuparme por cuanto se demora esta operacion.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train,
        y_train,
        test_size=val_ratio_within_train,
        random_state=RANDOM_STATE,
        stratify=y_train,
    )

    pipeline = Pipeline(
        [
            (
                "repr",
                text_repr,
            ),
            ("model", model),
        ]
    )

    X_total = np.array(list(X_tr) + list(X_val), dtype=object)
    y_total = np.array(list(y_tr) + list(y_val))

    test_fold = np.array([-1] * len(X_tr) + [0] * len(X_val))
    ps = PredefinedSplit(test_fold=test_fold)

    opt = BayesSearchCV(
        estimator=pipeline,
        search_spaces=espacio,
        n_iter=iteraciones,
        cv=ps,
        scoring="f1_macro",
        refit=True,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=0,
    )

    opt.fit(X_total, y_total)
    print("Mejores hiperparámetros:", opt.best_params_)

    y_pred = opt.predict(X_test)
    print(classification_report(y_test, y_pred))

    return opt

In [None]:
espacio = {"vect__max_df": (0.5, 1.0), "model__alpha": (1e-3, 1.0, "log-uniform")}

generar_modelo_val_train(
    X_train,
    y_train,
    X_test,
    y_test,
    espacio,
    MultinomialNB(),
    CountVectorizer(tokenizer=preprocess_text, preprocessor=None, token_pattern=None),
)

Mejores hiperparámetros: OrderedDict([('model__alpha', 0.02160217783087772), ('vect__max_df', 0.9593612608346885)])
                          precision    recall  f1-score   support

             alt.atheism       0.82      0.90      0.86       240
           comp.graphics       0.69      0.86      0.77       292
 comp.os.ms-windows.misc       0.93      0.39      0.55       296
comp.sys.ibm.pc.hardware       0.61      0.83      0.70       295
   comp.sys.mac.hardware       0.80      0.91      0.85       288
          comp.windows.x       0.85      0.84      0.85       294
            misc.forsale       0.88      0.74      0.80       292
               rec.autos       0.93      0.89      0.91       297
         rec.motorcycles       0.93      0.97      0.95       298
      rec.sport.baseball       0.98      0.96      0.97       298
        rec.sport.hockey       0.97      0.98      0.97       300
               sci.crypt       0.93      0.95      0.94       297
         sci.electronics 

0,1,2
,estimator,Pipeline(step...inomialNB())])
,search_spaces,"{'model__alpha': (0.001, ...), 'vect__max_df': (0.5, ...)}"
,optimizer_kwargs,
,n_iter,30
,scoring,'f1_macro'
,fit_params,
,n_jobs,-1
,n_points,1
,iid,'deprecated'
,refit,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function pre...x7e9447f423b0>
,stop_words,
,token_pattern,
,ngram_range,"(1, ...)"

0,1,2
,alpha,0.02160217783087772
,force_alpha,True
,fit_prior,True
,class_prior,


In [None]:
espacio = {"vect__max_df": (0.5, 1.0), "model__alpha": (1e-3, 1.0, "log-uniform")}

generar_modelo_val_train(
    X_train,
    y_train,
    X_test,
    y_test,
    espacio,
    MultinomialNB(),
    CountVectorizer(tokenizer=preprocess_text, preprocessor=None, token_pattern=None),
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [None]:
espacio = {"vect__max_df": (0.5, 1.0), "model__C": (1e-3, 1e2, "log-uniform"),}

generar_modelo_val_train(
    X_train,
    y_train,
    X_test,
    y_test,
    espacio,
    LogisticRegression(
        solver="saga",        # bueno para datos dispersos
        penalty="l2",
        multi_class="auto",   # multinomial si aplica
        max_iter=2000
    ),
    TfidfVectorizer(tokenizer=preprocess_text, preprocessor=None, token_pattern=None),
)

### Create your own processing pipeline for the task and justify it

### Divide the dataset into training (60%), validation (10%) and test (30%).

### Train NB and LR using the following vector representations:

#### tf (counts) representation (sklearn: CountVectorizer).

#### tfidf representation (sklearn: TfidfVectorizer).