### Instalacion de dependencias

https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

In [2]:
#pip install transformers datasets evaluate

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim #Pytorch
from datasets import load_dataset

In [2]:
RANDOM_SEED = 42
BATCH_SIZE = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


### Cargar datos previamente procesados

In [3]:
dataset = load_dataset('csv', data_files=['C:/Users/anabe/Documents/PRUEBAS/GPT/corpus/df.csv'], split='train')
datasets = dataset.train_test_split(test_size=0.20) # Test del 10%

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5980
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1496
    })
})

### Modelo 

In [5]:
model_checkpoint = "FacebookAI/roberta-base"

### Tokenizador

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/5980 [00:00<?, ? examples/s]

Map:   0%|          | 0/1496 [00:00<?, ? examples/s]

In [9]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5980
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1496
    })
})

### Entrenamiento

In [10]:
id2label = {0: "NEUTRAL", 1: "ANSIEDAD"}
label2id = {"NEUTRAL": 0, "ANSIEDAD": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
#from transformers import DataCollatorWithPadding

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [14]:
#model.config.pad_token_id = model.config.eos_token_id

### Parámetros de evaluación

In [13]:
import evaluate

accuracy = evaluate.load("accuracy")

In [14]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
   

    # Calculate precision
    precision = precision_score(labels, preds)
   

    # Calculate recall (sensitivity)
    recall = recall_score(labels, preds)
   

    # Calculate F1-score
    f1 = f1_score(labels, preds, average="weighted")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, }




### Entrenamiento del modelo

In [15]:
training_args = TrainingArguments(
    output_dir="RoBERTa",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
   # push_to_hub=True,
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.142156,0.960561,0.843854,0.954887,0.961494
2,0.155700,0.110769,0.97861,0.957031,0.921053,0.978448
3,0.046200,0.10226,0.981283,0.931159,0.966165,0.981418


TrainOutput(global_step=1122, training_loss=0.09403623956622499, metrics={'train_runtime': 6322.5918, 'train_samples_per_second': 2.837, 'train_steps_per_second': 0.177, 'total_flos': 578945043992880.0, 'train_loss': 0.09403623956622499, 'epoch': 3.0})

Podemos comprobar con el método evaluate que nuestro Trainer recargó correctamente el mejor modelo

In [18]:
trainer.evaluate()

{'eval_loss': 0.10225962847471237,
 'eval_accuracy': 0.9812834224598931,
 'eval_precision': 0.9311594202898551,
 'eval_recall': 0.9661654135338346,
 'eval_f1': 0.9814178875173014,
 'eval_runtime': 261.5044,
 'eval_samples_per_second': 5.721,
 'eval_steps_per_second': 0.359,
 'epoch': 3.0}