# Importaciones

In [1]:
import numpy as np
import pandas as pd
import gensim.corpora as corpora
import torch, os, zipfile, chardet, spacy, gensim

from datasets import load_metric
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando: {device}")

Usando: cuda


# Dataset cargado limpiado

In [2]:
# Descomprimir el archivo zip
def unzip_data(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

unzip_data("bbc-train.zip", "bbc-train")
unzip_data("bbc-test.zip", "bbc-test")

In [3]:
# Cargar datos desde carpetas
def load_data(data_path):
    categories = ["business", "entertainment", "politics", "sports", "tech"]
    data = []

    def detect_encoding(file_path):
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
        return result['encoding']

    for category in categories:
        folder_path = os.path.join(data_path, category)
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            encoding = detect_encoding(file_path)
            with open(file_path, "r", encoding=encoding, errors="replace") as f:
                text = f.read()
                data.append((text, category))

    return pd.DataFrame(data, columns=["text", "category"])

# Cargar conjuntos de entrenamiento y test
df_train = load_data("bbc-train/bbc-train")
df_test = load_data("bbc-test/bbc-test")

In [4]:
# Preprocesamiento con spaCy
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text.lower())
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

df_train["tokens"] = df_train["text"].apply(preprocess_text)
df_test["tokens"] = df_test["text"].apply(preprocess_text)

In [5]:
df_train.head()

Unnamed: 0,text,category,tokens
0,Malaysia lifts Islamic bank limit\n\nMalaysia'...,business,"[malaysia, lifts, islamic, bank, limit, malays..."
1,Egypt and Israel seal trade deal\n\nIn a sign ...,business,"[egypt, israel, seal, trade, deal, sign, thaw,..."
2,Gaming firm to sell UK dog tracks\n\nSix UK gr...,business,"[gaming, firm, sell, uk, dog, track, uk, greyh..."
3,Chinese exports rise 25% in 2004\n\nExports fr...,business,"[chinese, export, rise, export, china, leapt, ..."
4,India's Reliance family feud heats up\n\nThe o...,business,"[india, reliance, family, feud, heat, ongoing,..."


In [6]:
df_test.head()

Unnamed: 0,text,category,tokens
0,Hyundai to build new India plant\n\nSouth Kore...,business,"[hyundai, build, new, india, plant, south, kor..."
1,German business confidence slides\n\nGerman bu...,business,"[german, business, confidence, slide, german, ..."
2,Industrial output falls in Japan\n\nJapanese i...,business,"[industrial, output, fall, japan, japanese, in..."
3,Japanese mogul arrested for fraud\n\nOne of Ja...,business,"[japanese, mogul, arrest, fraud, japan, well, ..."
4,Chinese wine tempts Italy's Illva\n\nItaly's I...,business,"[chinese, wine, tempts, italy, illva, italy, i..."


# BERT (supervisado)

In [7]:
# --- Modelo Supervisado (Clasificación con BERT) ---

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {"input_ids": encoding["input_ids"].squeeze(),
                "attention_mask": encoding["attention_mask"].squeeze(),
                "labels": torch.tensor(self.labels[idx])}

# Tokenización
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", use_auth_token="b31fe5c93a2fb4a174be367155f97dfcc1613313")
train_dataset = NewsDataset(df_train["text"].tolist(), df_train["category"].astype('category').cat.codes.tolist(), tokenizer)
test_dataset = NewsDataset(df_test["text"].tolist(), df_test["category"].astype('category').cat.codes.tolist(), tokenizer)



In [8]:
# Definir métricas
def compute_metrics(eval_pred):
    metric_accuracy = load_metric("accuracy")
    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": metric_accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": metric_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"],
        "precision": metric_precision.compute(predictions=predictions, references=labels, average="weighted")["precision"],
        "recall": metric_recall.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    }

In [9]:
# Modelo BERT
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5, use_auth_token="b31fe5c93a2fb4a174be367155f97dfcc1613313")
training_args = TrainingArguments(output_dir="./results", num_train_epochs=3, per_device_train_batch_size=8, evaluation_strategy="epoch")
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcdf51342[0m ([33mcdf51342-universidad-carlos-iii-de-madrid[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.044779
2,0.156200,0.114752
3,0.156200,0.029236


TrainOutput(global_step=756, training_loss=0.11318251254066589, metrics={'train_runtime': 653.9517, 'train_samples_per_second': 9.244, 'train_steps_per_second': 1.156, 'total_flos': 1590549171194880.0, 'train_loss': 0.11318251254066589, 'epoch': 3.0})

In [10]:
# Evaluación del modelo en test
results = trainer.evaluate()
print("Resultados en conjunto de test:", results)

Resultados en conjunto de test: {'eval_loss': 0.02923557721078396, 'eval_runtime': 6.9814, 'eval_samples_per_second': 30.08, 'eval_steps_per_second': 3.867, 'epoch': 3.0}


In [12]:
# Definir el directorio donde se guardará el modelo
output_dir = "./models/modelBERT"

# Guardar el modelo y el tokenizador
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modelo guardado en {output_dir}")

Modelo guardado en ./models/modelBERT


# LDA (no supervisado)

In [11]:
# --- Modelado de Temas (LDA) ---

# Crear diccionario y corpus para LDA
dictionary = corpora.Dictionary(df_train["tokens"])
corpus = [dictionary.doc2bow(text) for text in df_train["tokens"]]
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=10)

# Mostrar los temas
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.021*"chip" + 0.019*"say" + 0.015*"device" + 0.015*"cell" + 0.014*"drive" + 0.012*"year" + 0.011*"datum" + 0.010*"find" + 0.010*"hard" + 0.010*"processor"')
(1, '0.018*"say" + 0.007*"year" + 0.007*"mr" + 0.004*"new" + 0.004*"time" + 0.004*"good" + 0.004*"win" + 0.003*"m" + 0.003*"people" + 0.003*"government"')
(2, '0.016*"gadget" + 0.013*"game" + 0.012*"device" + 0.011*"design" + 0.010*"look" + 0.009*"video" + 0.008*"player" + 0.008*"sale" + 0.008*"store" + 0.008*"say"')
(3, '0.024*"say" + 0.013*"virus" + 0.011*"file" + 0.009*"system" + 0.009*"mobile" + 0.008*"new" + 0.007*"site" + 0.007*"people" + 0.007*"network" + 0.007*"mr"')
(4, '0.025*"say" + 0.020*"mobile" + 0.016*"phone" + 0.015*"people" + 0.013*"control" + 0.011*"bill" + 0.011*"sigarchi" + 0.010*"use" + 0.010*"fix" + 0.010*"brain"')


# Almacenamiento de datos

In [13]:
df_train.to_csv('data/results/training_dataset.csv')
df_test.to_csv('data/results/testing_dataset.csv')