In [16]:
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, Features, ClassLabel, Value

trans_labels = ["POS","NEG","NEU"]

trans = {
    'POS': 0,
    'NEG': 1,
    'NEU': 2
}

features = Features({
    'index': Value(dtype='uint64'),
    'id_news': Value(dtype='uint64'),
    'title': Value(dtype='string'),
    'label': ClassLabel(names=trans_labels)
})

raw_data = load_dataset(
    'csv', data_files='./data/new_dataset.csv', split='train',
    skiprows=[0],
    column_names=["index", "id_news", "title", "label"],
    features=features
)

raw_data = raw_data.rename_column('title', 'text')
raw_data = raw_data.train_test_split(0.2, 0.8)


Using custom data configuration default-fdb4a1a13dbc4e72
Found cached dataset csv (/home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Loading cached split indices for dataset at /home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-83cfb1f30512c8fd.arrow and /home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-fa5ebf1fa845bb76.arrow


In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "finiteautomata/beto-sentiment-analysis"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_data.map(tokenize_function, batched=True)

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

loading configuration file config.json from cache at /home/usuariop/.cache/huggingface/hub/models--finiteautomata--beto-sentiment-analysis/snapshots/2d232b7b937ca0f6940f6b32ce5aaaeb012d8b38/config.json
Model config BertConfig {
  "_name_or_path": "finiteautomata/beto-sentiment-analysis",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tra

In [18]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 262
    })
    test: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 66
    })
})

In [19]:
# Example:
nlp("Esto es una prueba larga de algo positivo")

[{'label': 'POS', 'score': 0.9894223809242249}]

In [20]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(262))

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(66))

Loading cached shuffled indices for dataset at /home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8140a0b7aa82ddc8.arrow
Loading cached shuffled indices for dataset at /home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-ba244949b7eac511.arrow


In [21]:
from transformers import Trainer, TrainingArguments
from metrics import get_confusion_matrix
import torch
from evaluate import evaluator, load as ev_load
# Sentiment analysis evaluator


def compute_metrics(eval_pred):
    print("Computing!")
    metrics = ["accuracy"]#, "f1"]#, "precision", "recall"]#, "f1"]# List of metrics to return
    metric = {}
    for met in metrics:
        metric[met] = load_metric(met)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric_res = {}
    for met in metrics:
        metric_res[met] = metric[met].compute(
            predictions=predictions, references=labels)[met]
    return metric_res


training_args = TrainingArguments(
    "test_trainer", label_names=trans_labels)

#model#.to(torch.device("cpu"))

training_args = TrainingArguments(
    "test_trainer",
    #label_names=['POS', 'NEG', 'NEU'],
    # output_dir="results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    # disable_tqdm=False,
    #logging_steps=
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

ev = evaluator("sentiment-analysis")

#trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, id_news, text. If index, id_news, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please

Computing!


100%|██████████| 9/9 [00:03<00:00,  2.86it/s]


{'eval_loss': 4.725618362426758,
 'eval_accuracy': 0.15151515151515152,
 'eval_runtime': 3.452,
 'eval_samples_per_second': 19.119,
 'eval_steps_per_second': 2.607}

In [22]:
import evaluate
# ev.load_data(raw_data["test"])
ev.compute(
    model_or_pipeline=model,
    data=raw_data["test"],
    metric=evaluate.combine(["accuracy"]),
    tokenizer=tokenizer,
    label_mapping=trans
)


{'accuracy': 0.3787878787878788,
 'total_time_in_seconds': 0.6263153979998606,
 'samples_per_second': 105.37821712634101,
 'latency_in_seconds': 0.00948962724242213}

In [23]:
### Predictions of the train-dataset ###

model.to(torch.device("cpu"))
human_labels = raw_data["train"]["label"]

#text = torch.tensor(raw_data["train"]["text"])
text = raw_data["train"]["text"]
pretrain_labels = nlp(text)
pretrain_labels = [trans[row["label"]] for row in pretrain_labels]

In [24]:
### Check if must train ###
import os

fpath = "./saved_model/"
if os.path.exists(fpath):
    model = AutoModelForSequenceClassification.from_pretrained(
        fpath, local_files_only=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
else:
    model.to(torch.device("cuda"))
    trainer.train()
    trainer.save_model("./saved_model/")


loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "./saved_model/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31006
}

loading weights file ./saved_model/pytorch_mo

In [12]:
res = trainer.evaluate()
res

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, id_news, text. If index, id_news, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
 89%|████████▉ | 8/9 [00:02<00:00,  3.35it/s]

Computing!


100%|██████████| 9/9 [00:04<00:00,  1.99it/s]


{'eval_loss': 2.366053819656372,
 'eval_accuracy': 0.7121212121212122,
 'eval_runtime': 4.884,
 'eval_samples_per_second': 13.513,
 'eval_steps_per_second': 1.843,
 'epoch': 10.0}

In [28]:
model.to(torch.device("cpu"))
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
text = raw_data["train"]["text"]
pred_labels = nlp(text)
pred_labels = [trans[row["label"]] for row in pred_labels]

## Metricas

In [26]:
print("Human label v/s pretrain_labels")
m = get_confusion_matrix(human_labels, pretrain_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pretrain_labels
      POS  NEG  NEU
POS [   6,   4,  52]
NEG [   0,  31,   6]
NEU [  12,  47, 104]


In [29]:
print("Human label v/s pred_labels")
m = get_confusion_matrix(human_labels, pred_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pred_labels
      POS  NEG  NEU
POS [   3,  59,   0]
NEG [   4,   1,  32]
NEU [ 157,   4,   2]


: 