In [1]:
import warnings
import logging
import os
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Quiet the session: suppress Python warnings and restrict the transformers /
# datasets loggers to errors only.
warnings.filterwarnings("ignore")
transformers_logger = logging.getLogger("transformers")
datasets_logger = logging.getLogger("datasets")
for _noisy_logger in (transformers_logger, datasets_logger):
    _noisy_logger.setLevel(logging.ERROR)
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

# Use the GPU when CUDA is available (developed on an RTX 4070 SUPER), else CPU.
if torch.cuda.is_available():
    device = "cuda"
    _device_name = torch.cuda.get_device_name(0)
else:
    device = "cpu"
    _device_name = "CPU"
print(f"Usando dispositivo: {device} ({_device_name})")

df = pd.read_csv("../data/mails_dataset.csv")

# Build the model input: subject and body joined by the ' </s> ' separator,
# with missing values on either side replaced by an empty string.
_subject = df['subject'].fillna('')
_body = df['text'].fillna('')
df['text_combined'] = _subject + ' </s> ' + _body
# Keep only the model input and its target; rows without a sentiment are dropped.
df = df[['text_combined', 'sentiment']].dropna()

# Encode the sentiment categories as integer ids and keep both lookup tables.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["sentiment"])
label2id = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
id2label = dict(enumerate(label_encoder.classes_))

# Wrap the frame as a Hugging Face Dataset, exposing the combined text as "text".
dataset = Dataset.from_pandas(df.rename(columns={"text_combined": "text"}))

# Tokenizer that matches the checkpoint being fine-tuned.
model_name = "pysentimiento/robertuito-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch, max_length=128):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    No padding is applied here. Padding inside a batched ``map`` pads every
    row to the longest text of the whole map batch; leaving sequences at their
    natural length lets the Trainer (which is given the tokenizer) pad each
    training batch dynamically at collate time instead.

    Args:
        batch: Mapping with a "text" entry holding the list of strings to
            encode.
        max_length: Upper bound on tokens per example; longer inputs are
            truncated. NOTE(review): 128 is presumably the sequence limit of
            the RoBERTuito checkpoint - confirm against the model config.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)

# Tokenize the whole corpus, then hold out 20% of it for evaluation.
# The split is seeded so the train/test partition (and therefore the reported
# metrics) is the same on every Restart & Run All.
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Classification head sized to the encoded sentiment classes; the id/label
# maps are stored in the model config so predictions carry readable names.
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "f1" when training ends.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs_sentiment",
    logging_steps=10,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # the CPU fallback chosen during device detection fail at this point.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Evaluation metrics
def compute_metrics(pred):
    """Score one evaluation pass of the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores whose argmax along axis 1 is taken as the
            predicted class).

    Returns:
        dict with "accuracy" and "f1" (F1 computed with weighted averaging).
    """
    y_true = pred.label_ids
    y_hat = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

# Assemble the Trainer from the model, arguments, splits and metric function.
# Early stopping is registered with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

# Persist the model and the tokenizer side by side in the same directory.
model_dir = "./sentiment_model"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)


Usando dispositivo: cuda (NVIDIA GeForce RTX 4070 SUPER)


Map:   0%|          | 0/234 [00:00<?, ? examples/s]

{'loss': 0.9466, 'grad_norm': 6.3337297439575195, 'learning_rate': 1.7222222222222224e-05, 'epoch': 0.8333333333333334}
{'eval_loss': 0.7323023080825806, 'eval_accuracy': 0.8297872340425532, 'eval_f1': 0.8312190049592065, 'eval_runtime': 0.0539, 'eval_samples_per_second': 871.457, 'eval_steps_per_second': 55.625, 'epoch': 1.0}
{'loss': 0.7221, 'grad_norm': 5.563094139099121, 'learning_rate': 1.4722222222222224e-05, 'epoch': 1.6666666666666665}
{'eval_loss': 0.49311476945877075, 'eval_accuracy': 0.8936170212765957, 'eval_f1': 0.8927393238703429, 'eval_runtime': 0.054, 'eval_samples_per_second': 870.184, 'eval_steps_per_second': 55.544, 'epoch': 2.0}
{'loss': 0.4798, 'grad_norm': 3.602574586868286, 'learning_rate': 1.1944444444444444e-05, 'epoch': 2.5}
{'eval_loss': 0.3328285813331604, 'eval_accuracy': 0.9148936170212766, 'eval_f1': 0.9139077918473215, 'eval_runtime': 0.0605, 'eval_samples_per_second': 777.437, 'eval_steps_per_second': 49.624, 'epoch': 3.0}
{'loss': 0.3776, 'grad_norm': 

('./sentiment_model\\tokenizer_config.json',
 './sentiment_model\\special_tokens_map.json',
 './sentiment_model\\tokenizer.json')

In [2]:
from sklearn.metrics import classification_report
import numpy as np

# Predict on the held-out split and take the highest-scoring class per example.
predictions = trainer.predict(dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Report per-class metrics using the original class names.
# `labels` pins the report to every encoded class id: without it, a class that
# happens to be absent from this (unstratified) split makes the number of
# reported classes differ from `target_names`, and the call raises.
# `zero_division=0` reports 0 for such a class.
class_ids = np.arange(len(label_encoder.classes_))
print("Reporte de clasificación por clase:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)


Reporte de clasificación por clase:
              precision    recall  f1-score   support

    negativo       0.90      1.00      0.95         9
      neutro       0.94      0.89      0.92        19
    positivo       0.95      0.95      0.95        19

    accuracy                           0.94        47
   macro avg       0.93      0.95      0.94        47
weighted avg       0.94      0.94      0.94        47



In [None]:
from transformers import pipeline
import pandas as pd

# Load the fine-tuned model and its tokenizer from the Hugging Face Hub.
clf_sentiment = pipeline(
    "text-classification",
    model="aaronmena02/sentiment-model-mailclassifier",
    tokenizer="aaronmena02/sentiment-model-mailclassifier",
)

# Hand-written (subject, body, expected label) examples for a quick sanity check.
ejemplos_sentiment = [
    ("Pedido con retraso", "Mi pedido llegó con dos días de retraso, aunque el producto está bien", "neutro"),
    ("Producto defectuoso", "Recibí el producto con un defecto y quisiera un reemplazo urgente", "negativo"),
    ("Excelente servicio", "Muy satisfecho con la rapidez y eficiencia en la entrega", "positivo"),
    ("Cargos no reconocidos", "He detectado cargos que no reconozco en la factura. Necesito revisión", "negativo"),
    ("Consulta técnica", "¿El modelo X200 está disponible en negro y con garantía extendida?", "neutro"),
    ("Agradecimiento", "Agradezco mucho la asistencia que me brindaron. Todo fue excelente", "positivo"),
    ("Pedido incompleto", "El pedido llegó incompleto. Faltan artículos que ya he pagado", "negativo"),
    ("Solicitud de información", "¿Podrían enviarme los detalles técnicos del producto Z45, por favor?", "neutro"),
    ("Soporte técnico eficaz", "El soporte técnico resolvió todo a la perfección. Muchas gracias", "positivo"),
    ("Incidencia sin resolver", "Sigo esperando solución a mi problema. Esto es muy frustrante", "negativo"),
]

# Print one row per example: text shown, expected label, predicted label.
print(f"{'Asunto + Cuerpo':<70} | {'Esperado':<10} | {'Predicción':<10}")
print("-" * 115)
aciertos = 0

for asunto, cuerpo, esperado in ejemplos_sentiment:
    # Human-readable text for the table (truncated to 67 characters below).
    texto = f"{asunto.strip()}. {cuerpo.strip()}"
    # Model input in the same "subject </s> body" layout the training cell
    # builds, so inference matches the training distribution.
    # NOTE(review): assumes the Hub checkpoint was produced by that training
    # cell - verify before relying on these numbers.
    entrada = f"{asunto.strip()} </s> {cuerpo.strip()}"
    pred = clf_sentiment(entrada)[0]
    etiqueta = pred['label'].lower()
    print(f"{texto[:67]:<70} | {esperado:<10} | {etiqueta:<10}")
    if etiqueta == esperado:
        aciertos += 1

porcentaje_acierto = (aciertos / len(ejemplos_sentiment)) * 100
print("-" * 115)
print(f"Aciertos: {aciertos} / {len(ejemplos_sentiment)}  →  Precisión: {porcentaje_acierto:.2f}%")




Asunto + Cuerpo                                                        | Esperado   | Predicción
-------------------------------------------------------------------------------------------------------------------
Pedido con retraso. Mi pedido llegó con dos días de retraso, aunque    | neutro     | neutro    
Producto defectuoso. Recibí el producto con un defecto y quisiera u    | negativo   | negativo  
Excelente servicio. Muy satisfecho con la rapidez y eficiencia en l    | positivo   | positivo  
Cargos no reconocidos. He detectado cargos que no reconozco en la f    | negativo   | negativo  
Consulta técnica. ¿El modelo X200 está disponible en negro y con ga    | neutro     | neutro    
Agradecimiento. Agradezco mucho la asistencia que me brindaron. Tod    | positivo   | positivo  
Pedido incompleto. El pedido llegó incompleto. Faltan artículos que    | negativo   | negativo  
Solicitud de información. ¿Podrían enviarme los detalles técnicos d    | neutro     | neutro    
Soporte téc