In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, Features, ClassLabel, Value

trans_labels = ["POS","NEG","NEU"]

trans = {
    'POS': 0,
    'NEG': 1,
    'NEU': 2
}

features = Features({
    'index': Value(dtype='uint64'),
    'id_news': Value(dtype='uint64'),
    'title': Value(dtype='string'),
    'label': ClassLabel(names=trans_labels)
})

raw_data = load_dataset(
    'csv', data_files='./data/new_dataset.csv', split='train',
    skiprows=[0],
    column_names=["index", "id_news", "title", "label"],
    features=features
)

raw_data = raw_data.rename_column('title', 'text')
raw_data = raw_data.train_test_split(0.2, 0.8)


Using custom data configuration default-fdb4a1a13dbc4e72
Found cached dataset csv (/home/usuariop/.cache/huggingface/datasets/csv/default-fdb4a1a13dbc4e72/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "finiteautomata/beto-sentiment-analysis"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_data.map(tokenize_function, batched=True)

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

100%|██████████| 1/1 [00:00<00:00, 16.16ba/s]
100%|██████████| 1/1 [00:00<00:00, 61.04ba/s]


In [4]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 262
    })
    test: Dataset({
        features: ['index', 'id_news', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 66
    })
})

In [5]:
# Example:
nlp("Esto es una prueba larga de algo positivo")

[{'label': 'POS', 'score': 0.9894223809242249}]

In [6]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(262))

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(66))

In [7]:
from transformers import Trainer, TrainingArguments
from metrics import get_confusion_matrix
import torch
from evaluate import evaluator, load as ev_load
# Sentiment analysis evaluator


def compute_metrics(eval_pred):
    print("Computing!")
    metrics = ["accuracy"]#, "f1"]#, "precision", "recall"]#, "f1"]# List of metrics to return
    metric = {}
    for met in metrics:
        metric[met] = load_metric(met)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric_res = {}
    for met in metrics:
        metric_res[met] = metric[met].compute(
            predictions=predictions, references=labels)[met]
    return metric_res


training_args = TrainingArguments(
    "test_trainer", label_names=trans_labels)

#model#.to(torch.device("cpu"))

training_args = TrainingArguments(
    "test_trainer",
    #label_names=['POS', 'NEG', 'NEU'],
    # output_dir="results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    # disable_tqdm=False,
    #logging_steps=
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

ev = evaluator("sentiment-analysis")

#trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id_news, text, index. If id_news, text, index are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  metric[met] = load_metric(met)


Computing!


100%|██████████| 9/9 [00:03<00:00,  2.69it/s]


{'eval_loss': 5.116714000701904,
 'eval_accuracy': 0.09090909090909091,
 'eval_runtime': 4.1453,
 'eval_samples_per_second': 15.922,
 'eval_steps_per_second': 2.171}

In [8]:
import evaluate
# ev.load_data(raw_data["test"])
ev.compute(
    model_or_pipeline=model,
    data=raw_data["test"],
    metric=evaluate.combine(["accuracy"]),
    tokenizer=tokenizer,
    label_mapping=trans
)


Disabling tokenizer parallelism, we're using DataLoader multithreading already


{'accuracy': 0.48484848484848486,
 'total_time_in_seconds': 0.5573716149997381,
 'samples_per_second': 118.412919179659,
 'latency_in_seconds': 0.008445024469693002}

In [9]:
### Predictions of the train-dataset ###

model.to(torch.device("cpu"))
human_labels = raw_data["train"]["label"]

#text = torch.tensor(raw_data["train"]["text"])
text = raw_data["train"]["text"]
pretrain_labels = nlp(text)
pretrain_labels = [trans[row["label"]] for row in pretrain_labels]

In [10]:
### Check if must train ###
import os

path = "./saved_model/"
if os.path.exists(path):
    model = AutoModelForSequenceClassification.from_pretrained(
        "./saved_model/", local_files_only=True)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
else:
    model.to(torch.device("cuda"))
    trainer.train()
    trainer.save_model("./saved_model/")


loading configuration file ./saved_model/config.json
Model config BertConfig {
  "_name_or_path": "./saved_model/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31006
}

loading weights file ./saved_model/pytorch_mo

In [11]:
res = trainer.evaluate()
res

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id_news, text, index. If id_news, text, index are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 8
 89%|████████▉ | 8/9 [00:02<00:00,  3.42it/s]

Computing!


100%|██████████| 9/9 [00:03<00:00,  2.92it/s]


{'eval_loss': 0.3908611238002777,
 'eval_accuracy': 0.9545454545454546,
 'eval_runtime': 3.4312,
 'eval_samples_per_second': 19.235,
 'eval_steps_per_second': 2.623}

In [12]:
model.to(torch.device("cpu"))
text = raw_data["train"]["text"]
pred_labels = nlp(text)
pred_labels = [trans[row["label"]] for row in pred_labels]

## Metricas

In [15]:
print("Human label v/s pretrain_labels")
m = get_confusion_matrix(human_labels, pretrain_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pretrain_labels
      POS  NEG  NEU
POS [   8,   5,  50]
NEG [   0,  30,   6]
NEU [  15,  52,  96]


In [16]:
print("Human label v/s pred_labels")
m = get_confusion_matrix(human_labels, pred_labels)
print("      " + "  ".join(n for n in trans_labels))
for label, l in zip(trans_labels, m):
    print(f"{label} [{l[0]:4d},{l[1]:4d},{l[2]:4d}]")


Human label v/s pred_labels
      POS  NEG  NEU
POS [   8,   5,  50]
NEG [   0,  30,   6]
NEU [  15,  52,  96]
