In [2]:
import warnings
import logging
import os
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Silenciar warnings y configurar logs
warnings.filterwarnings("ignore")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
datasets_logger = logging.getLogger("datasets")
datasets_logger.setLevel(logging.ERROR)
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

#Confguración para usar la GPU disponible (RTX 4070 SUPER)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device} ({torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'})")

df = pd.read_csv("../data/mails_dataset.csv")

# Combinar asunto y cuerpo con separador especial
df['text_combined'] = df['subject'].fillna('') + ' </s> ' + df['text'].fillna('')
df = df[['text_combined', 'category']].dropna()

# Codificar categorías
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["category"])
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
id2label = {i: label for label, i in label2id.items()}

# Crear Dataset HuggingFace
dataset = Dataset.from_pandas(df.rename(columns={"text_combined": "text", "label": "label"}))

# Tokenizar texto
model_name = "pysentimiento/robertuito-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2)

num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

#Entrenamiento
training_args = TrainingArguments(
    output_dir="./category_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs_category",
    logging_steps=10,
    fp16=True,  
    report_to="none" 
)

#Evaluación
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

#Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

#Guardar modelo y tokenizer
trainer.save_model("./category_model")
tokenizer.save_pretrained("./category_model")


Usando dispositivo: cuda (NVIDIA GeForce RTX 4070 SUPER)


Map:   0%|          | 0/256 [00:00<?, ? examples/s]

{'loss': 1.2534, 'grad_norm': 6.2626519203186035, 'learning_rate': 1.7435897435897438e-05, 'epoch': 0.7692307692307693}
{'eval_loss': 1.037034273147583, 'eval_accuracy': 0.6153846153846154, 'eval_f1': 0.616846614923538, 'eval_runtime': 0.064, 'eval_samples_per_second': 812.513, 'eval_steps_per_second': 62.501, 'epoch': 1.0}
{'loss': 0.9465, 'grad_norm': 5.310665130615234, 'learning_rate': 1.4871794871794874e-05, 'epoch': 1.5384615384615383}
{'eval_loss': 0.7411546111106873, 'eval_accuracy': 0.8653846153846154, 'eval_f1': 0.8591570591570592, 'eval_runtime': 0.064, 'eval_samples_per_second': 812.501, 'eval_steps_per_second': 62.5, 'epoch': 2.0}
{'loss': 0.696, 'grad_norm': 4.509370803833008, 'learning_rate': 1.230769230769231e-05, 'epoch': 2.3076923076923075}
{'eval_loss': 0.592818021774292, 'eval_accuracy': 0.8653846153846154, 'eval_f1': 0.8591570591570592, 'eval_runtime': 0.064, 'eval_samples_per_second': 812.529, 'eval_steps_per_second': 62.502, 'epoch': 3.0}
{'loss': 0.58, 'grad_norm

('./category_model\\tokenizer_config.json',
 './category_model\\special_tokens_map.json',
 './category_model\\tokenizer.json')

In [3]:
from sklearn.metrics import classification_report
import numpy as np

# Obtener predicciones sobre el set de evaluación
predictions = trainer.predict(dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Mostrar el reporte con los nombres reales de las clases
print("Reporte de clasificación por clase:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


Reporte de clasificación por clase:
              precision    recall  f1-score   support

   comercial       1.00      0.88      0.93         8
        otro       0.89      0.62      0.73        13
       queja       0.85      1.00      0.92        17
   solicitud       0.81      0.93      0.87        14

    accuracy                           0.87        52
   macro avg       0.89      0.85      0.86        52
weighted avg       0.87      0.87      0.86        52



In [4]:
from transformers import pipeline
import pandas as pd

# Cargar modelo y tokenizer desde Hugging Face
clf = pipeline("text-classification", model="aaronmena02/category-model-mailclassifier", tokenizer="aaronmena02/category-model-mailclassifier")

#Ejemplos para pruebas
ejemplos = [
    ("Reclamo por pedido no entregado", "Mi pedido no ha llegado y ya pasaron 10 días", "queja"),
    ("Consulta sobre disponibilidad", "¿Pueden confirmarme si tienen stock del producto?", "solicitud"),
    ("Interés en cotización empresarial", "Solicito información detallada para contratar su servicio", "comercial"),
    ("Agradecimiento", "Gracias por la atención, todo perfecto", "otro"),
    ("Número de seguimiento", "Solicito el número de seguimiento del pedido", "solicitud"),
    ("Producto dañado", "Recibí el producto roto y nadie contesta", "queja"),
    ("Descuentos para distribuidores", "¿Ofrecen descuentos para distribuidores?", "comercial"),
    ("Todo correcto", "Todo ha ido perfecto, gracias", "otro"),
    ("Agendar llamada de negocios", "Estariamos encantados de agendar una llamada para hablar de una propuesta que puede interesarles", "comercial"),
    ("Garantía", "¿Me podrían confirmar si el producto tiene garantía?", "solicitud"),
    ("No funciona el artículo recibido", "El equipo llegó pero no enciende. Espero respuesta.", "queja"),
    ("Solicitud de factura", "Necesito la factura del pedido número 78492", "solicitud"),
    ("Reunión de colaboración", "Nos gustaría organizar una reunión para explorar posibles colaboraciones de cara a futuro", "comercial"),
    ("Agradecimiento por soporte", "Gracias por resolver tan rápido el problema", "otro"),
    ("Problemas con la app", "No puedo iniciar sesión desde ayer. ¿Pueden ayudarme?", "queja"),
]

# Calcular y mostrar porcentaje de aciertos
print(f"{'Asunto + Cuerpo':<70} | {'Esperado':<10} | {'Predicción':<10}")
print("-" * 115)
aciertos = 0
for asunto, cuerpo, esperado in ejemplos:
    texto = f"{asunto.strip()}. {cuerpo.strip()}"
    pred = clf(texto)[0]
    print(f"{texto[:67]:<70} | {esperado:<10} | {pred['label']:<10}")
    
    if pred['label'] == esperado:
        aciertos += 1

porcentaje_acierto = (aciertos / len(ejemplos)) * 100
print("-" * 115)
print(f"Aciertos: {aciertos} / {len(ejemplos)}  →  Precisión: {porcentaje_acierto:.2f}%")


Asunto + Cuerpo                                                        | Esperado   | Predicción
-------------------------------------------------------------------------------------------------------------------
Reclamo por pedido no entregado. Mi pedido no ha llegado y ya pasar    | queja      | queja     
Consulta sobre disponibilidad. ¿Pueden confirmarme si tienen stock     | solicitud  | solicitud 
Interés en cotización empresarial. Solicito información detallada p    | comercial  | solicitud 
Agradecimiento. Gracias por la atención, todo perfecto                 | otro       | otro      
Número de seguimiento. Solicito el número de seguimiento del pedido    | solicitud  | solicitud 
Producto dañado. Recibí el producto roto y nadie contesta              | queja      | queja     
Descuentos para distribuidores. ¿Ofrecen descuentos para distribuid    | comercial  | comercial 
Todo correcto. Todo ha ido perfecto, gracias                           | otro       | otro      
Agendar lla