In [1]:
import pandas as pd
import numpy as np

checkpoint = "FremyCompany/roberta-large-nl-oscar23"

In [14]:
# Prepare the data from units (sentences) and annotations

from pathlib import Path
data = Path("../../data")

label2id  = {"Ja": 1, "Nee": 0}

annotations = pd.read_csv(data / "intermediate/annoations_01_dutch_types.csv")
annotations = annotations.rename(columns={"issue position": "label"})[["unit_id", "label"]]
annotations.label = annotations.label.map(label2id)

units = pd.read_csv(data / "intermediate/units_tk2023.csv")
units = units.fillna("")
units.text = units.before + "\n" + units.text + "\n" + units.after
units = units[["unit_id", "text"]]

df = annotations.merge(units, on="unit_id")
df.head()

Unnamed: 0,unit_id,label,text
0,00f6424643130aff606b0090fd9ae52c5b493fa3e4bb2e...,1,denk om stemmers met beperking\nDemissionair m...
1,01438633bd578448fb672c93fc34addb40769a2988ab24...,0,"07:16\nChris Stoffer, lijsttrekker SGP, bij Ra..."
2,02cba6b801518598e5f2d2a34ef8677a6d254e9494a3ab...,0,17:18\nSchoonmaakactie besmeurde spandoeken BB...
3,036e9a7d0720f4a36e03b1ca870a0f37a4c850b911918a...,1,In aanloop naar de verkiezingen wordt de kas v...
4,0393898b66b7c3891c8cc545fd3356143dff0951cc15f8...,0,\nFvD-leider Thierry Baudet gaat door met zijn...


In [46]:
# Some utility functions

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel

import datasets

def get_datasets(data, train_ics, test_ics, checkpoint):
    df_train = data.iloc[train_ics]
    df_test =  data.iloc[test_ics]
    
    dataset = datasets.DatasetDict({
        "train": datasets.Dataset.from_pandas(df_train),
        "test": datasets.Dataset.from_pandas(df_test)
    })
    
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True, padding=True)

    dataset = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)    
    return dataset, data_collator, tokenizer 
   
def predict_test(trainer, data):
    predictions = trainer.predict(data)
    preds = np.argmax(predictions.predictions, axis=-1)
    return pd.DataFrame(dict(true=data['label'], pred=list(preds)))



def get_model(label2id, checkpoint):
    id2label = {v:k for (k,v) in label2id.items()}
    model = AutoModelForSequenceClassification.from_pretrained(
       checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )
    return model

def compute_metrics(eval_pred):
    metric = datasets.load_metric('f1')
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")


In [51]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir=str(data / "tmp/dutch_bert"),
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=48,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)



In [None]:
# Run k-fold crossvalidation, store predictions
from sklearn.model_selection import StratifiedKFold

splits = list(StratifiedKFold(n_splits=5).split(np.zeros(df.shape[0]), df.label))

import torch, gc

predictions=[]
for i, (train_ics, test_ics) in enumerate(splits):
    print("**************** FOLD", i+1) 
    model=get_model(label2id, checkpoint)
    dataset, collator, tokenizer = get_datasets(df, train_ics, test_ics, checkpoint)
    trainer = Trainer(
            model,
            training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            data_collator=collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
    )
    trainer.train()
    pred = predict_test(trainer, dataset['test'])
    predictions.append(pred)
    del model
    gc.collect()
    torch.cuda.empty_cache()


**************** FOLD 1


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FremyCompany/roberta-large-nl-oscar23 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1190 [00:00<?, ? examples/s]

Map:   0%|          | 0/298 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,No log,0.559457,0.770266
2,No log,0.564726,0.789374
3,No log,0.661592,0.776161


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [49]:
from sklearn.metrics import classification_report
preds = pd.concat(predictions)
print(classification_report(preds.true, preds.pred))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       863
           1       0.81      0.81      0.81       625

    accuracy                           0.84      1488
   macro avg       0.84      0.84      0.84      1488
weighted avg       0.84      0.84      0.84      1488

