In [1]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file, tokenize_and_align_labels, compute_metrics, pred2label, write_iob2_file
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig, AutoTokenizer, DataCollatorForTokenClassification
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# path to the data files
path_train = "../new_data/new_da_news_train.tsv"
path_dev = "../new_data/new_da_news_dev.tsv"
path_test = "../new_data/new_da_news_test.tsv"

In [3]:
# saving model name
model_name = "vesteinn/DanskBERT"

In [4]:
# creating the label to id mapping 
label2id, id2label = mapping(path_train)

# number of labels
num_labels = len(label2id)

In [5]:
id2label

{0: 'I-ORG',
 1: 'B-MISC',
 2: 'I-PER',
 3: 'O',
 4: 'I-MISC',
 5: 'B-LOC',
 6: 'B-PER',
 7: 'I-LOC',
 8: 'B-ORG'}

In [6]:
# reading in the data
train_data = read_tsv_file(path_train, label2id=label2id)
dev_data = read_tsv_file(path_dev, label2id=label2id)
test_data = read_tsv_file(path_test, label2id=label2id)

In [7]:
# convert to huggingface format
train_dataset = Dataset.from_list(train_data)
dev_dataset = Dataset.from_list(dev_data)
test_dataset = Dataset.from_list(test_data)

In [8]:
tokenized_train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched = True,
    remove_columns=train_dataset.column_names
)

tokenized_dev_dataset = dev_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dev_dataset.column_names
)

tokenized_test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_dataset.column_names
)

Map: 100%|██████████| 1745/1745 [00:00<00:00, 14909.44 examples/s]
Map: 100%|██████████| 161/161 [00:00<00:00, 17432.05 examples/s]
Map: 100%|██████████| 344/344 [00:00<00:00, 19703.13 examples/s]


In [9]:
print(tokenized_dev_dataset.features)


{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [10]:
from datasets import ClassLabel, Sequence

# Define the features for the tokenized dataset
features = {
    'input_ids': Sequence(feature=int),  # Sequence of integers (token IDs)
    'attention_mask': Sequence(feature=int),  # Sequence of integers (1s and 0s)
    'labels': Sequence(feature=int),  # Sequence of integers (tag IDs)
}

# Apply the features to the dataset
tokenized_dev_dataset = tokenized_dev_dataset.cast(features)


AttributeError: 'dict' object has no attribute 'arrow_schema'

In [None]:
# defining the model and config
config = AutoConfig.from_pretrained(
    model_name, 
    num_labels = num_labels, 
    id2label = id2label, 
    label2id = label2id
)

model = AutoModelForTokenClassification.from_pretrained(
    model_name, 
    torch_dtype = 'auto', 
    config = config
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at vesteinn/DanskBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tokenizer = AutoTokenizer.from_pretrained("vesteinn/DanskBERT")

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(50005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [None]:
# defining the training arguments
args = TrainingArguments(
    output_dir = "output_trainer", 
    eval_strategy = 'epoch', 
    save_strategy = "no",
    learning_rate = 2e-5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    num_train_epochs = 1,
    weight_decay = 0.01,
    remove_unused_columns=False,
    label_names=["labels"]
)

# define parameters for trainer
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_dev_dataset,
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

In [None]:
# train the model
#trainer.train()

In [None]:
# save the model
model.save_pretrained("output_trainer")
tokenizer.save_pretrained("output_trainer")

('output_trainer/tokenizer_config.json',
 'output_trainer/special_tokens_map.json',
 'output_trainer/tokenizer.json')

In [None]:
# predicting
test_preds, test_labels, _ = trainer.predict(tokenized_test_dataset)

# predict max logit and convert to strings
_, test_predictions = pred2label((test_preds, test_labels))

TypeError: pred2label() missing 1 required positional argument: 'id2label'

In [None]:
# write output file for predictions on test data
write_iob2_file(test_data, predictions = test_predictions, path = "test_predictions.iob2")