In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import evaluate

In [2]:
etiquetas = {
    0: "negativo",
    1: "neutral",
    2: "positivo"
}

modelo_base = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizador = AutoTokenizer.from_pretrained(modelo_base)
modelo = AutoModelForSequenceClassification.from_pretrained(
    modelo_base, 
    num_labels=3, 
    ignore_mismatched_sizes=True,
    id2label=etiquetas,
    label2id={v: k for k, v in etiquetas.items()}
)    # Se sitúa en 3 porque el dataset que se cargó usa positivo, negativo y neutal, es decir, 3 etiquetas de clasificación

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
config_lora = LoraConfig(
    task_type = TaskType.SEQ_CLS,   # tipo de tarea en el que se centra el LoRA
    r=8,                            # rango de la matriz de entrenamiento (bajo 2 - 8, medio 16 - 32, alto 64 - 128+), a mayor rango mayor consumo
    lora_alpha=16,                  # equilibrio entre LoRA y modelo base; a mayor sea el número, LoRA tiene más intervención a partir del dataset sobre el modelo base (se suele usar r*2 o r*4, más del *4 se usa en datasets grandes)
    lora_dropout=0.7                # nivel de regulación en LoRA; a menor sea el número, se confía más en el dataset que implementa LoRA (0.0 es confianza plena, > 0.3 es usado en casos de overfitting)
)

modelo_lora = get_peft_model(modelo, config_lora)
modelo_lora.print_trainable_parameters()

trainable params: 297,219 || all params: 167,655,942 || trainable%: 0.1773


In [4]:
ds = load_dataset("pyupeu/social-media-peruvian-sentiment")     # Nos basamos en comentarios de redes sociales para tomar la forma típica de escribir de un usuario promedio, en este caso de Perú
print(ds)

README.md:   0%|          | 0.00/857 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 9336
    })
    validation: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 2918
    })
    test: Dataset({
        features: ['text', 'label', 'label_name'],
        num_rows: 2335
    })
})


In [5]:
def tokenizar(batch):
    return tokenizador(
        batch["text"],          # columna del dataset que contiene el texto
        truncation=True,        # si el texto es más largo que el max_lenght, se corta
        padding="max_length",   # deja a todos los textos de la misma longitud aunque sean más cortos; se usa para usar tensores de igual tamaño siempre
        max_length=128
    )

ds_tokenizado = ds.map(tokenizar, batched=True)

In [6]:
args = TrainingArguments(
    output_dir="./resultados",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False 
)

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [7]:
trainer = Trainer(
    model=modelo_lora,
    args=args,
    train_dataset=ds_tokenizado["train"],
    eval_dataset=ds_tokenizado["test"],
    tokenizer=tokenizador, #type:ignore
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9239,0.870352,0.604711
2,0.8533,0.843578,0.627409
3,0.8242,0.825065,0.641542
4,0.8033,0.806815,0.6394
5,0.7857,0.80235,0.648394
6,0.7695,0.79789,0.656959
7,0.7677,0.784698,0.651392
8,0.752,0.78892,0.657816
9,0.7428,0.786437,0.656959
10,0.7376,0.791602,0.657816


TrainOutput(global_step=11680, training_loss=0.7575733655119595, metrics={'train_runtime': 6752.2725, 'train_samples_per_second': 27.653, 'train_steps_per_second': 1.73, 'total_flos': 1.232475582947328e+16, 'train_loss': 0.7575733655119595, 'epoch': 20.0})