In [None]:
from multitask_roberta import RobertaForMultiTaskTokenClassification
from multitask_data_collator import DataCollatorForMultiTaskTokenClassification
from transformers import RobertaTokenizerFast
from torch.utils.data import DataLoader
import pandas as pd

2023-08-09 15:04:25.920254: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-09 15:04:25.967003: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_name = 'roberta-base'
model = RobertaForMultiTaskTokenClassification.from_pretrained(model_name)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

Some weights of RobertaForMultiTaskTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifiers.dur.bias', 'classifiers.reread.weight', 'classifiers.skip.weight', 'classifiers.refix.bias', 'classifiers.dur.weight', 'classifiers.nfix.bias', 'classifiers.skip.bias', 'classifiers.reread.bias', 'classifiers.firstrun_nfix.bias', 'classifiers.firstfix_dur.bias', 'classifiers.firstrun_nfix.weight', 'classifiers.refix.weight', 'classifiers.firstfix_dur.weight', 'classifiers.firstrun_dur.weight', 'classifiers.nfix.weight', 'classifiers.firstrun_dur.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
data_path = '../augmenting_nlms_meco_data/en/en_6_dataset.csv'

In [4]:
def load_data_sample(src_path, num_samples):
    features_names = ['skip', 'firstfix.dur', 'firstrun.dur', 'dur', 'firstrun.nfix', 'nfix', 'refix', 'reread']
    sample_dataset = pd.DataFrame(columns=['sent_id', 'sentence'] + ['label_'+feature_name.replace('.', '_') for feature_name in features_names])
    df = pd.read_csv(src_path)
    grouped_sentences = df.groupby(['trialid', 'sentnum'])
    for idx, key in enumerate(grouped_sentences.groups.keys()):
        sent_df = grouped_sentences.get_group(key)
        sent_id = f'{int(key[0])}_{int(key[1])}'
        sentence = sent_df['ia'].tolist()
        sample_dataset.loc[idx] = [sent_id, sentence] + [sent_df[feat].tolist() for feat in features_names]
        if len(sample_dataset) == num_samples:
            return sample_dataset

In [5]:
data_sample_df = load_data_sample(data_path, 10)

In [6]:
data_sample_df.head()

Unnamed: 0,sent_id,sentence,label_skip,label_firstfix_dur,label_firstrun_dur,label_dur,label_firstrun_nfix,label_nfix,label_refix,label_reread
0,1_1,"[In, ancient, Roman, religion, and, myth,, Jan...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 144.0, 175.0, 187.0, 0.0, 181.0, 183.0, ...","[0.0, 453.0, 175.0, 187.0, 0.0, 303.0, 183.0, ...","[0.0, 453.0, 175.0, 187.0, 0.0, 303.0, 183.0, ...","[0.0, 2.0, 1.0, 1.0, 0.0, 2.0, 1.0, 1.0, 2.0, ...","[0.0, 2.0, 1.0, 1.0, 0.0, 2.0, 1.0, 2.0, 3.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ..."
1,1_2,"[He, has, a, double, nature, and, is, usually,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 113.0, 141.0, 155.0, 154.0, 192.0, 187.0...","[0.0, 255.0, 141.0, 155.0, 154.0, 192.0, 187.0...","[0.0, 255.0, 141.0, 155.0, 154.0, 192.0, 187.0...","[0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, ...","[0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,1_3,"[Janus, presided, over, the, beginning, and, e...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[219.0, 173.0, 242.0, 0.0, 66.0, 0.0, 197.0, 1...","[219.0, 173.0, 418.0, 0.0, 240.0, 0.0, 261.0, ...","[219.0, 173.0, 418.0, 0.0, 240.0, 0.0, 261.0, ...","[1.0, 1.0, 3.0, 0.0, 2.0, 0.0, 2.0, 2.0, 1.0, ...","[1.0, 1.0, 3.0, 0.0, 2.0, 0.0, 2.0, 2.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1_4,"[The, doors, of, his, temple, were, open, in, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[222.0, 506.0, 0.0, 167.0, 119.0, 0.0, 282.0, ...","[222.0, 506.0, 0.0, 167.0, 119.0, 0.0, 282.0, ...","[222.0, 506.0, 0.0, 167.0, 119.0, 0.0, 282.0, ...","[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...","[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1_5,"[As, the, god, of, gates,, he, was, also, asso...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[104.0, 136.0, 159.0, 151.0, 36.0, 129.0, 157....","[104.0, 136.0, 159.0, 248.0, 36.0, 129.0, 302....","[220.0, 136.0, 218.0, 550.0, 159.0, 129.0, 610...","[1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, ...","[2.0, 1.0, 2.0, 4.0, 2.0, 1.0, 4.0, 1.0, 4.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...","[1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."


In [7]:
from datasets import Dataset
sample_dataset = Dataset.from_pandas(data_sample_df)

In [8]:
sample_dataset

Dataset({
    features: ['sent_id', 'sentence', 'label_skip', 'label_firstfix_dur', 'label_firstrun_dur', 'label_dur', 'label_firstrun_nfix', 'label_nfix', 'label_refix', 'label_reread', '__index_level_0__'],
    num_rows: 10
})

In [9]:
# adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py

def tokenize_and_align_labels(dataset, label_all_tokens=False):
    tokenized_inputs = tokenizer(sample_dataset['sentence'], max_length=128, padding=True, truncation=True,
                                 is_split_into_words=True)
    labels = dict()
    for feature_name in [col_name for col_name in sample_dataset.column_names if col_name.startswith('label_')]:
        labels[feature_name] = list()
        for i, label in enumerate(dataset[feature_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    if label_all_tokens:
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                previous_word_idx = word_idx

            labels[feature_name].append(label_ids)
        tokenized_inputs[feature_name] = labels[feature_name]
    return tokenized_inputs

In [10]:
tokenized_dataset = sample_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=sample_dataset.column_names,
            # desc="Running tokenizer on dataset",
        )

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [11]:
print('tokens: ', tokenizer.convert_ids_to_tokens(tokenized_dataset[0]['input_ids']))
for key in tokenized_dataset[0].keys():
    print(f'{key}:', tokenized_dataset[0][key])

tokens:  ['<s>', 'ĠIn', 'Ġancient', 'ĠRoman', 'Ġreligion', 'Ġand', 'Ġmyth', ',', 'ĠJan', 'us', 'Ġis', 'Ġthe', 'Ġgod', 'Ġof', 'Ġbeginnings', 'Ġand', 'Ġgates', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
label_skip: [-100.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, -100.0, 0.0, -100.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0]
label_firstfix_dur: [-100.0, 0.0, 144.0, 175.0, 187.0, 0.0, 181.0, -100.0, 183.0, -100.0, 56.0, 296.0, 0.0, 0.0, 188.0, 0.0, 167.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0, -100.0]
label_firstrun_dur: [-100.0, 0.0, 453.0, 175.0, 187.0, 0.0, 303.0, -100.0, 183.0, -100.0, 56.0, 429.0, 0.0, 0.0, 340.0, 0.0, 367.0, -

In [13]:
tokenized_dataset

Dataset({
    features: ['label_skip', 'label_firstfix_dur', 'label_firstrun_dur', 'label_dur', 'label_firstrun_nfix', 'label_nfix', 'label_refix', 'label_reread', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [14]:
data_collator = DataCollatorForMultiTaskTokenClassification(tokenizer)

In [15]:
dataloader = DataLoader(tokenized_dataset, shuffle=True, collate_fn=data_collator, batch_size=2)

In [16]:
for step, batch in enumerate(dataloader):
    outputs = model(**batch)
    break

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
