# Importaciones

In [1]:
%pip install evaluate
%pip install mlflow
import pandas as pd
import numpy as np
import evaluate
import torch
from google.colab import files
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer



# Tratamiento y Normalización del Dataset

In [2]:
# Load the raw CSV and keep only the comment text and its problem category,
# dropping every row in which either of those two fields is missing.
df = (
    pd.read_csv("./dataset_comunidades_senasoft.csv")
    .loc[:, ["Comentario", "Categoría del problema"]]
    .dropna()
)

In [3]:
# Print the row count, column dtypes and non-null counts of the filtered frame
# to confirm that no missing values remain after dropna().
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9362 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Comentario              9362 non-null   object
 1   Categoría del problema  9362 non-null   object
dtypes: object(2)
memory usage: 219.4+ KB


In [4]:
# Turn the category strings into integer class ids stored in a new "label"
# column, which is the column name the Trainer reads targets from.
le = LabelEncoder()
df["label"] = le.fit_transform(df["Categoría del problema"])

# Lookup tables between class id and class name, in the encoder's class order;
# they are handed to the model config so predictions carry readable names.
id2label = {int(idx): nombre for idx, nombre in enumerate(le.classes_)}
label2id = {nombre: idx for idx, nombre in id2label.items()}

# 75 % train / 25 % test split with a fixed seed so the partition is repeatable.
ds = Dataset.from_pandas(df).train_test_split(test_size=0.25, seed=42)

# Llamado al Modelo y Uso de Red Neuronal Pre-entrenada

In [5]:
# Load the pretrained checkpoint and its tokenizer, replacing the
# classification head with one sized for this dataset's classes.
modelo_base = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(modelo_base)
model = AutoModelForSequenceClassification.from_pretrained(
    modelo_base,
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True    # The base model has 5 categories; tolerate the size mismatch so only 4 are used (the head is re-initialised)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at tabularisai/multilingual-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# List the qualified name of every submodule in the model; these names are
# what LoRA's `target_modules` has to match.
print("\n".join(nombre_modulo for nombre_modulo, _ in model.named_modules()))


distilbert
distilbert.embeddings
distilbert.embeddings.word_embeddings
distilbert.embeddings.position_embeddings
distilbert.embeddings.LayerNorm
distilbert.embeddings.dropout
distilbert.transformer
distilbert.transformer.layer
distilbert.transformer.layer.0
distilbert.transformer.layer.0.attention
distilbert.transformer.layer.0.attention.dropout
distilbert.transformer.layer.0.attention.q_lin
distilbert.transformer.layer.0.attention.k_lin
distilbert.transformer.layer.0.attention.v_lin
distilbert.transformer.layer.0.attention.out_lin
distilbert.transformer.layer.0.sa_layer_norm
distilbert.transformer.layer.0.ffn
distilbert.transformer.layer.0.ffn.dropout
distilbert.transformer.layer.0.ffn.lin1
distilbert.transformer.layer.0.ffn.lin2
distilbert.transformer.layer.0.ffn.activation
distilbert.transformer.layer.0.output_layer_norm
distilbert.transformer.layer.1
distilbert.transformer.layer.1.attention
distilbert.transformer.layer.1.attention.dropout
distilbert.transformer.layer.1.attention.q

# Entrenamiento con LoRA

In [7]:
def tokenizar(examples):
    """Tokenize the "Comentario" field of a batch of examples.

    Every text is truncated and padded to exactly 256 tokens so that all
    encoded rows have the same length.

    Args:
        examples: Mapping with a "Comentario" entry holding the text(s) to
            encode, as supplied by `Dataset.map`.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    comentarios = examples["Comentario"]
    return tokenizer(comentarios, truncation=True, padding="max_length", max_length=256)


In [8]:
# Tokenize both splits in batches and drop the columns the model cannot
# consume: the raw text, the string category and the pandas index column.
ds_tokenizado = (
    ds.map(tokenizar, batched=True)
    .remove_columns(["Comentario", "Categoría del problema", "__index_level_0__"])
)
# Have the dataset return torch tensors when indexed.
ds_tokenizado.set_format("torch")

Map:   0%|          | 0/7021 [00:00<?, ? examples/s]

Map:   0%|          | 0/2341 [00:00<?, ? examples/s]

In [9]:
# LoRA adapter configuration for sequence classification: rank-4 adapters
# (alpha 8) attached to the attention projections q_lin/k_lin/v_lin/out_lin
# that appear in the module listing, with no bias parameters trained.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=8,
    lora_dropout=.7,    # High dropout chosen because the dataset is so small
    # NOTE(review): 0.7 is far above commonly used LoRA dropout values and may
    # contribute to the flat training loss seen later; confirm it is intended.
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],
    bias="none",
)
# Rebinds `model` to the PEFT-wrapped model.
# NOTE(review): re-running this cell without reloading the base model would
# pass the already-wrapped model to get_peft_model again; verify before re-use.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 741,124 || all params: 136,068,872 || trainable%: 0.5447


In [10]:
# Training configuration: evaluate and checkpoint once per epoch, reload the
# checkpoint with the best accuracy when training ends, and log to MLflow.
args = TrainingArguments(
    output_dir="./resultados_classif",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    # NOTE(review): 5e-6 is very small for adapter training and may explain the
    # near-constant loss in the training log; confirm it is intended.
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.0,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    lr_scheduler_type="cosine_with_restarts",
    logging_dir="./logs",
    logging_steps=50,
    report_to="mlflow",
    run_name="classif"
)

### Métricas de Evaluación para Presentación

In [11]:
# Accuracy scorer, loaded once and reused at every evaluation.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)` as supplied by the Trainer.

    Returns:
        The result of the accuracy metric for the arg-max class of each row
        of logits against the reference labels.
    """
    puntajes, referencias = eval_pred
    # The predicted class is the index of the highest logit on the last axis.
    clases_predichas = np.argmax(puntajes, axis=-1)
    return metric.compute(predictions=clases_predichas, references=referencias)

In [12]:
from transformers import Trainer


def patched_compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Call the original `Trainer.compute_loss` without `num_items_in_batch`.

    The Trainer's training step may pass `num_items_in_batch`; this shim
    accepts the argument and deliberately drops it before delegating.
    NOTE(review): presumably a workaround for the wrapped PEFT model rejecting
    that keyword; confirm against the installed transformers/peft versions.

    Args:
        self: The `Trainer` instance.
        model: Model to compute the loss for.
        inputs: Batch passed through to the original implementation.
        return_outputs: Forwarded unchanged to the original implementation.
        num_items_in_batch: Accepted for signature compatibility and ignored.

    Returns:
        Whatever the original `Trainer.compute_loss` returns.
    """
    return self._orig_compute_loss(model, inputs, return_outputs=return_outputs)


# Capture the original implementation only once. Without this guard,
# re-running the cell would store the already-patched function as
# `_orig_compute_loss`, making the shim call itself until RecursionError.
if not hasattr(Trainer, "_orig_compute_loss"):
    Trainer._orig_compute_loss = Trainer.compute_loss

Trainer.compute_loss = patched_compute_loss

In [13]:
# Build the Trainer from the PEFT-wrapped model, the tokenized splits and the
# accuracy metric. The tokenizer is passed as `processing_class`, which
# replaces the deprecated `tokenizer=` keyword (deprecated since transformers
# 4.46 and slated for removal in 5.0).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tokenizado["train"],
    eval_dataset=ds_tokenizado["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Author's note (kept below as a bare string expression): the underfitting is
# attributed to a small dataset with repeated and mislabeled rows; the data was
# left unmodified for ethical reasons and training was run anyway to produce
# the report.
"""Underfitting debido al dataset con pocos datos, datos repetidos y mal clasificados, 
no se hizo modificación de datos por la ética de los mismos, pero se hizo el entrenamiento 
para mostrar el reporte
"""
# Run fine-tuning with the configuration defined in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3964,1.386823,0.254165
2,1.3872,1.386816,0.257582
3,1.3876,1.386682,0.254165


# Descarga de Reporte y Resultados

In [None]:
# Bundle the training output directory and the MLflow run directory into one
# archive. NOTE(review): the absolute /content paths assume the notebook's
# working directory is /content (as on Colab); confirm if run elsewhere.
!zip -r resultados_classif.zip /content/resultados_classif /content/mlruns

# Download the archive through the google.colab files helper.
files.download("resultados_classif.zip")
