In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, Features, ClassLabel, Value

trans_labels = ["POS","NEG","NEU"]

trans = {
    'POS': 0,
    'NEG': 1,
    'NEU': 2
}

features = Features({
    'index': Value(dtype='uint64'),
    'id_news': Value(dtype='uint64'),
    'title': Value(dtype='string'),
    'label': ClassLabel(names=trans_labels)
})

raw_data = load_dataset(
    'csv', data_files='./data/new_dataset.csv', split='train',
    skiprows=[0],
    column_names=["index", "id_news", "title", "label"],
    features=features
)

raw_data = raw_data.rename_column('title', 'text')
raw_data = raw_data.train_test_split(0.2, 0.8)


Using custom data configuration default-fdb4a1a13dbc4e72
Found cached dataset csv (/home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "finiteautomata/beto-sentiment-analysis"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_data.map(tokenize_function, batched=True)

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

100%|██████████| 1/1 [00:00<00:00, 15.31ba/s]
100%|██████████| 1/1 [00:00<00:00, 60.69ba/s]


In [5]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 262
    })
    test: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 66
    })
})

In [6]:
# Example:
nlp("Esto es una prueba larga de algo positivo")

[{'label': 'POS', 'score': 0.9894223809242249}]

In [7]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(262))

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(66))

In [8]:
from transformers import Trainer, TrainingArguments
from metrics import get_confusion_matrix
import torch
from evaluate import evaluator, load as ev_load
# Sentiment analysis evaluator


def compute_metrics(eval_pred):
    print("Computing!")
    metrics = ["accuracy"]#, "f1"]#, "precision", "recall"]#, "f1"]# List of metrics to return
    metric = {}
    for met in metrics:
        metric[met] = load_metric(met)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric_res = {}
    for met in metrics:
        metric_res[met] = metric[met].compute(
            predictions=predictions, references=labels)[met]
    return metric_res


training_args = TrainingArguments(
    "test_trainer", label_names=trans_labels)

#model#.to(torch.device("cpu"))

training_args = TrainingArguments(
    "test_trainer",
    #label_names=['POS', 'NEG', 'NEU'],
    # output_dir="results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    # disable_tqdm=False,
    #logging_steps=
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

ev = evaluator("sentiment-analysis")

#trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, id_news, text. If index, id_news, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  metric[met] = load_metric(met)


Computing!


100%|██████████| 9/9 [00:03<00:00,  2.75it/s]


{'eval_loss': 4.656469821929932,
 'eval_accuracy': 0.12121212121212122,
 'eval_runtime': 4.0589,
 'eval_samples_per_second': 16.261,
 'eval_steps_per_second': 2.217}

In [9]:
import evaluate
# ev.load_data(raw_data["test"])
ev.compute(
    model_or_pipeline=model,
    data=raw_data["test"],
    metric=evaluate.combine(["accuracy"]),
    tokenizer=tokenizer,
    label_mapping=trans
)


Disabling tokenizer parallelism, we're using DataLoader multithreading already


{'accuracy': 0.5151515151515151,
 'total_time_in_seconds': 0.5562133630010067,
 'samples_per_second': 118.6595008144034,
 'latency_in_seconds': 0.00842747519698495}

In [10]:
### Predictions of the train-dataset ###

model.to(torch.device("cpu"))
human_labels = raw_data["train"]["label"]

#text = torch.tensor(raw_data["train"]["text"])
text = raw_data["train"]["text"]
pretrain_labels = nlp(text)
pretrain_labels = [trans[row["label"]] for row in pretrain_labels]

In [11]:
### Check if must train ###
import os

fpath = "./saved_model/"
if os.path.exists(fpath):
    model = AutoModelForSequenceClassification.from_pretrained(
        fpath, local_files_only=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
else:
    model.to(torch.device("cuda"))
    trainer.train()
    trainer.save_model("./saved_model/")


The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, id_news, text. If index, id_news, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 262
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 660
  Number of trainable parameters = 109856259
 76%|███████▌  | 500/660 [03:48<01:13,  2.17it/s]Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json


{'loss': 0.2617, 'learning_rate': 1.2121212121212122e-05, 'epoch': 7.58}


Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
100%|██████████| 660/660 [05:03<00:00,  2.44it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 660/660 [05:03<00:00,  2.17it/s]
Saving model checkpoint to ./saved_model/
Configuration saved in ./saved_model/config.json


{'train_runtime': 303.9482, 'train_samples_per_second': 8.62, 'train_steps_per_second': 2.171, 'train_loss': 0.1983146507857424, 'epoch': 10.0}


Model weights saved in ./saved_model/pytorch_model.bin
tokenizer config file saved in ./saved_model/tokenizer_config.json
Special tokens file saved in ./saved_model/special_tokens_map.json


In [12]:
res = trainer.evaluate()
res

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, id_news, text. If index, id_news, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
 89%|████████▉ | 8/9 [00:02<00:00,  3.35it/s]

Computing!


100%|██████████| 9/9 [00:04<00:00,  1.99it/s]


{'eval_loss': 2.366053819656372,
 'eval_accuracy': 0.7121212121212122,
 'eval_runtime': 4.884,
 'eval_samples_per_second': 13.513,
 'eval_steps_per_second': 1.843,
 'epoch': 10.0}

In [13]:
model.to(torch.device("cpu"))
text = raw_data["train"]["text"]
pred_labels = nlp(text)
pred_labels = [trans[row["label"]] for row in pred_labels]

## Metricas

In [14]:
print("Human label v/s pretrain_labels")
m = get_confusion_matrix(human_labels, pretrain_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pretrain_labels
      POS  NEG  NEU
POS [   6,   7,  54]
NEG [   0,  24,   4]
NEU [  13,  52, 102]


In [15]:
print("Human label v/s pred_labels")
m = get_confusion_matrix(human_labels, pred_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pred_labels
      POS  NEG  NEU
POS [   0,  67,   0]
NEG [   0,   0,  28]
NEU [ 167,   0,   0]


: 