### Instalacion de dependencias

https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

In [2]:
#pip install transformers datasets evaluate

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim #Pytorch
from datasets import load_dataset

In [2]:
RANDOM_SEED = 42
BATCH_SIZE = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


### Cargar datos previamente procesados

In [3]:
dataset = load_dataset('csv', data_files=['C:/Users/anabe/Documents/PRUEBAS/GPT/corpus/df.csv'], split='train')
datasets = dataset.train_test_split(test_size=0.20) # Test del 10%

In [4]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5980
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1496
    })
})

### Modelo 

In [5]:
model_checkpoint = "google-bert/bert-base-uncased"

### Tokenizador

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/5980 [00:00<?, ? examples/s]

Map:   0%|          | 0/1496 [00:00<?, ? examples/s]

In [9]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5980
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1496
    })
})

### Entrenamiento

In [10]:
id2label = {0: "NEUTRAL", 1: "ANSIEDAD"}
label2id = {"NEUTRAL": 0, "ANSIEDAD": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
#from transformers import DataCollatorWithPadding

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [14]:
#model.config.pad_token_id = model.config.eos_token_id

### Parámetros de evaluación

In [15]:
import evaluate

accuracy = evaluate.load("accuracy")

In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
   

    # Calculate precision
    precision = precision_score(labels, preds)
   

    # Calculate recall (sensitivity)
    recall = recall_score(labels, preds)
   

    # Calculate F1-score
    f1 = f1_score(labels, preds, average="weighted")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, }




### Entrenamiento del modelo

In [17]:
training_args = TrainingArguments(
    output_dir="BERT",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
   # push_to_hub=True,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.104607,0.971257,0.917603,0.921053,0.971278
2,0.139100,0.125514,0.969251,0.887324,0.947368,0.969641
3,0.032900,0.112099,0.977941,0.939623,0.93609,0.977925


TrainOutput(global_step=1122, training_loss=0.07866570040078937, metrics={'train_runtime': 5610.588, 'train_samples_per_second': 3.198, 'train_steps_per_second': 0.2, 'total_flos': 591465430228800.0, 'train_loss': 0.07866570040078937, 'epoch': 3.0})

Podemos comprobar con el método evaluate que nuestro Trainer recargó correctamente el mejor modelo

In [20]:
trainer.evaluate()

{'eval_loss': 0.1046074628829956,
 'eval_accuracy': 0.9712566844919787,
 'eval_precision': 0.9176029962546817,
 'eval_recall': 0.9210526315789473,
 'eval_f1': 0.9712778036894698,
 'eval_runtime': 122.0211,
 'eval_samples_per_second': 12.26,
 'eval_steps_per_second': 0.77,
 'epoch': 3.0}

### Inferencia

In [1]:
text = "I wish there was an off button.All the methods to commit suicide are so taxing to read through. The combination of lethality and pain and just the preparation itself... fuck."

In [3]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="aepilicita/sentiment_student")
classifier(text)

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

[{'label': 'ANSIEDAD', 'score': 0.9999561309814453}]