In [2]:
# Prepare dataset, tokenizer and model
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

test_set = Dataset.from_parquet('../data/clean/test.parquet')
checkpoint = 'hfl/chinese-roberta-wwm-ext'
cache_dir = '../src/'

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2, cache_dir=cache_dir)



In [5]:
# Prepare the tokenizer
def tokenize_function(example):
    return tokenizer(example["tweets"], truncation=True)

tokenized_datasets = test_set.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/54790 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
# Tokenize training set and set column names to expected
tokenized_datasets = tokenized_datasets.remove_columns(["tweets", "label", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label_id", "labels")
tokenized_datasets.set_format("torch")

ValueError: Column name ['tweets', 'idx', 'label'] not in the dataset. Current columns in the dataset: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [27]:
# Prepare metrics
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    return {"accuracy": accuracy["accuracy"],"f1": f1["f1"]}

In [28]:
from transformers import Trainer
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [29]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 