In [1]:
from tqdm import tqdm
import json

# download data from https://github.com/StonyBrookNLP/musique

with open('./musique_data/musique_ans_v1.0_train.jsonl', "r") as file:
    train = [json.loads(line.strip()) for line in tqdm(file) if line.strip()]
with open('./musique_data/musique_ans_v1.0_dev.jsonl', "r") as file:
    dev = [json.loads(line.strip()) for line in tqdm(file) if line.strip()]
with open('./musique_data/musique_ans_v1.0_test.jsonl', "r") as file:
    test = [json.loads(line.strip()) for line in tqdm(file) if line.strip()]
    

19938it [00:01, 12782.70it/s]
2417it [00:00, 9188.11it/s]
2459it [00:00, 15139.85it/s]


In [2]:
from nltk.tokenize import sent_tokenize
import numpy as np

def to_hotpotqa_format(train, test=False):
    for ex in tqdm(train):
        cntxt_title = []
        cntxt_sents = []
        supp_title = []
        supp_sent_id = []

        for para in ex['paragraphs']:
            cntxt_title.append(para['title'].encode('utf-8', 'replace').decode('utf-8')) 
            para_sents = [((' ' if sent_i > 0 else '') + sent).encode('utf-8', 'replace').decode('utf-8') for sent_i, sent in enumerate(sent_tokenize(para['paragraph_text']))]
            cntxt_sents.append(para_sents)
            if not test:
                if para['is_supporting'] == True:
                    supp_title += [para['title'].encode('utf-8', 'replace').decode('utf-8')]
                    
        ex.update({'supporting_facts': {'title': supp_title,
                                        'sent_id': []},
                   'context': {'title': cntxt_title,
                               'sentences': cntxt_sents}
            })
    return train

In [3]:
prepro_train = to_hotpotqa_format(train)
prepro_dev = to_hotpotqa_format(dev)

100%|████████████████████████████████████| 19938/19938 [00:42<00:00, 470.65it/s]
100%|██████████████████████████████████████| 2417/2417 [00:04<00:00, 487.04it/s]


In [4]:
len(prepro_train), len(prepro_dev)

(19938, 2417)

In [13]:
max([sum([len(j) for j in ex['context']['sentences']]) for ex in tqdm(prepro_train)]), \
max([sum([len(j) for j in ex['context']['sentences']]) for ex in tqdm(prepro_dev)])

  0%|          | 0/19938 [00:00<?, ?it/s]

  0%|          | 0/2417 [00:00<?, ?it/s]

(152, 125)

In [14]:
max([len(ex['context']['title']) for ex in prepro_train]), \
max([len(ex['context']['title']) for ex in prepro_dev])

(20, 20)

In [26]:
prepro_train[-1].keys()

dict_keys(['id', 'paragraphs', 'question', 'question_decomposition', 'answer', 'answer_aliases', 'answerable', 'supporting_facts', 'context'])

In [1]:
import torch
from prepro_char_based_targets_musique import process_file
from datasets.arrow_dataset import Dataset

2023-11-29 15:30:25.549929: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
train_examples = process_file(prepro_train, with_special_seps=True, with_mem_seps=False)
train_examples = Dataset.from_dict({feature: [train_examples[i][feature] for i in range(len(train_examples))] for feature in train_examples[0]})
torch.save(train_examples, '../musique_train_examples_allenai_style_with_para_seps.pkl')

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Batch computation too fast (0.1632s.) Setting batch_size=2.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0085s.) Setting batch_size=4.
[Parallel(n_jobs=12)]: Done  58 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done  84 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0112s.) Setting batch_size=8.
[Parallel(n_jobs=12)]: Done 134 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0175s.) Setting batch_size=16.
[Parallel(n_jobs=12)]: Done 199 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done 291 

19938 questions in total


In [7]:
val_examples = process_file(prepro_dev, with_special_seps=True, with_mem_seps=False)
val_examples = Dataset.from_dict({feature: [val_examples[i][feature] for i in range(len(val_examples))] for feature in val_examples[0]})
torch.save(val_examples, '../musique_val_examples_allenai_style_with_para_seps.pkl')

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0055s.) Setting batch_size=2.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0130s.) Setting batch_size=4.
[Parallel(n_jobs=12)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  50 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  72 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0160s.) Setting batch_size=8.
[Parallel(n_jobs=12)]: Done 124 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Batch computation too fast (0.0215s.) Setting batch_size=16.
[Parallel(n_jobs=12)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 304 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Batch com

2417 questions in total


In [1]:
# handle gemformer imports
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

In [2]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from transformers import RobertaTokenizerFast, default_data_collator
from torch.utils.data import DataLoader
from datasets.arrow_dataset import Dataset
from datasets import load_from_disk
from gemformer.utils import add_qa_evidence_tokens, pad_and_drop_duplicates, ROBERTA_BASE_SPECIAL_TOKENS

tokenizer_name = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name)
tokenizer = add_qa_evidence_tokens(tokenizer, tokens_to_add=['[para]'])

2023-11-29 16:13:56.595548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


initial vocab len = 50265
We have added 1 tokens
final vocab len = 50266


In [13]:
train_examples = torch.load('../musique_train_examples_allenai_style_with_para_seps.pkl')

stride = 20
max_num_answers = 1
max_num_paragraphs = 20 # by dataset construction
max_num_sentences = 152 # from train and val data
eos = tokenizer.convert_tokens_to_ids(tokenizer.eos_token) #'</s>'
PARA_MARKER_token = tokenizer.convert_tokens_to_ids('[para]') #para start
SENT_MARKER_END_token = tokenizer.convert_tokens_to_ids('[/sent]')
MAX_SEQ_LEN = 512

def preprocess_roberta_long_training_examples(train_examples):
    questions = [q.strip() for q in train_examples["question"]]
    inputs = tokenizer(
        questions,
        train_examples['context'],
        max_length=MAX_SEQ_LEN,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    raw_ids = [train_examples['id'][kk] for kk in inputs['overflow_to_sample_mapping']]
    inputs.update({"id": train_examples['id']})
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = train_examples['char_answer_offsets']
    ex_context_start_id = []
    ex_context_end_id = []
    batch_start_positions_list = []
    batch_end_positions_list = []
    supp_paras = train_examples['supp_para_char_offsets']
    batch_title_positions = []
    batch_sent_positions = []
    batch_titles_to_sents = []

    for ex_id in train_examples["id"]:
        ex_indices = np.where(np.array(raw_ids) == ex_id)[0].tolist()
        context_token_ids = [np.where(np.array(inputs['input_ids'][i]) == eos)[0] for i in ex_indices]
        ex_context_start_id.append([elem[1] + 1 for elem in context_token_ids])
        ex_context_end_id.append([elem[-1] for elem in context_token_ids])

        start_positions_list = []
        end_positions_list = []
      
        supp_para_start_positions = []
        supp_para_end_positions = []

        for ex_sample in ex_indices:
            offset_idx = ex_sample
            offset = offset_mapping[offset_idx]
            sample_idx = sample_map[offset_idx]
            sequence_ids = inputs.sequence_ids(offset_idx)
            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            start_positions = []
            end_positions = []

            for answer in [answers[sample_idx]]:
                start_char = answer[0]
                end_char = answer[1]

                if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                    continue
                else:
                    idx = context_start
                    while idx <= context_end and offset[idx][0] <= start_char:
                        idx += 1
                    start_positions.append(idx - 1)

                    idx = context_end
                    while idx >= context_start and offset[idx][1] >= end_char:
                        idx -= 1
                    end_positions.append(idx + 1)

            start_positions, end_positions = pad_and_drop_duplicates(start_positions, end_positions, max_num_answers)
            start_positions_list.append(start_positions)
            end_positions_list.append(end_positions)

            supp_para_start_positions.append([])
            for supp_para_idx in range(len(supp_paras[sample_idx])):
                supp_para = supp_paras[sample_idx][supp_para_idx]
                supp_para_start_char = supp_para[0]
                supp_para_end_char = supp_para[1]

                if offset[context_start][0] > supp_para_start_char or offset[context_end][1] < supp_para_end_char:
                    continue
                else:
                    idx1 = context_start
                    while idx1 <= context_end and offset[idx1][0] <= supp_para_start_char:
                        idx1 += 1
                    supp_para_start_positions[-1].append(idx1 - 1)

        
        title_positions = []
        for i, supp in zip([inputs['input_ids'][ii] for ii in ex_indices], supp_para_start_positions):
            title_positions.append([0 if j not in supp else 1 for j in np.where(np.array(i) == PARA_MARKER_token)[0]])
            title_positions[-1] = pad_and_drop_duplicates(start_positions=title_positions[-1], 
                                                          max_num_answers=max_num_paragraphs)

        batch_start_positions_list.append(start_positions_list)
        batch_end_positions_list.append(end_positions_list)
        batch_title_positions.append(title_positions)
      
    inputs.update({"start_positions": batch_start_positions_list})
    inputs.update({"end_positions": batch_end_positions_list})
    inputs.update({'context_start_id': ex_context_start_id})
    inputs.update({'context_end_id': ex_context_end_id})
    inputs.update({"supp_para_labels": batch_title_positions})
    
    rearranged_inps = []
    rearranged_masks = []

    for ex_id in train_examples["id"]:
        ex_indices = np.where(np.array(raw_ids) == ex_id)[0].tolist()
        rearranged_inps.append([inputs['input_ids'][i] for i in ex_indices])
        rearranged_masks.append([inputs['attention_mask'][i] for i in ex_indices])

    inputs.update({'input_ids': rearranged_inps})
    inputs.update({'attention_mask': rearranged_masks})


    return inputs


mem_train_dataset = train_examples.map(
    preprocess_roberta_long_training_examples,
    batched=True,
    batch_size=1,
    remove_columns=train_examples.column_names,
)

mem_train_dataset.save_to_disk('../musique_preprocessed_train_examples_512_allanai_style_multitask_stride20_one_doc_batched_without_zero_answer_pos')

  0%|          | 0/19938 [00:00<?, ?ba/s]

In [15]:
val_examples = torch.load('../musique_val_examples_allenai_style_with_para_seps.pkl')

def preprocess_longformer_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_SEQ_LEN,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    raw_ids = [examples['id'][kk] for kk in inputs['overflow_to_sample_mapping']]
    inputs.update({"example_ids": examples['id']})
    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")
    ex_context_start_id = []
    ex_context_end_id = []
    supp_paras = examples['supp_para_char_offsets']
    batch_title_positions = []
    batch_titles_to_sents = []
    batch_concat_titles_ids = []

    for ex_id in examples["id"]:
        ex_indices = np.where(np.array(raw_ids) == ex_id)[0].tolist()
        context_token_ids = [np.where(np.array(inputs['input_ids'][i]) == eos)[0] for i in ex_indices]
        ex_context_start_id.append([elem[1] + 1 for elem in context_token_ids])
        ex_context_end_id.append([elem[-1] for elem in context_token_ids])

        supp_para_start_positions = []
      
        for ex_sample in ex_indices:
            offset_idx = ex_sample
            offset = offset_mapping[offset_idx]
            sample_idx = sample_map[offset_idx]
            sequence_ids = inputs.sequence_ids(offset_idx)

            inputs["offset_mapping"][offset_idx] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]
            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            supp_para_start_positions.append([])
            for supp_para_idx in range(len(supp_paras[sample_idx])):
                supp_para = supp_paras[sample_idx][supp_para_idx]
                supp_para_start_char = supp_para[0]
                supp_para_end_char = supp_para[1]

                if offset[context_start][0] > supp_para_start_char or offset[context_end][1] < supp_para_end_char:
                    continue
                else:
                    idx1 = context_start
                    while idx1 <= context_end and offset[idx1][0] <= supp_para_start_char:
                        idx1 += 1
                    supp_para_start_positions[-1].append(idx1 - 1)
        
        title_positions = []
        for i, supp in zip([inputs['input_ids'][ii] for ii in ex_indices], supp_para_start_positions):
            title_positions.append([0 if j not in supp else 1 for j in np.where(np.array(i) == PARA_MARKER_token)[0]])
            title_positions[-1] = pad_and_drop_duplicates(start_positions=title_positions[-1], 
                                                          max_num_answers=max_num_paragraphs)

        concat_input_ids = []
        concat_offset_mapping = []
        for ii in ex_indices:
            concat_input_ids += inputs['input_ids'][ii]
            concat_offset_mapping += inputs['offset_mapping'][ii]

        titles = np.where(np.array(concat_input_ids) == PARA_MARKER_token)[0]
        np_titles_concat_offset_mapping = np.array(concat_offset_mapping)[titles]
        titles_offsets_unique = sorted(np.unique(np_titles_concat_offset_mapping.tolist(), axis=0).tolist())
        concat_titles_ids = [titles_offsets_unique.index(list(i)) for i in np_titles_concat_offset_mapping]
      
        batch_concat_titles_ids.append(concat_titles_ids)
        batch_title_positions.append(title_positions)    

    inputs.update({'context_start_id': ex_context_start_id})
    inputs.update({'context_end_id': ex_context_end_id})
    inputs.update({"supp_para_labels": batch_title_positions})
    inputs.update({"concat_titles_ids": batch_concat_titles_ids})

    rearranged_inps = []
    rearranged_masks = []
    rearranged_offset_mapping = []

    for ex_id in examples["id"]:
        ex_indices = np.where(np.array(raw_ids) == ex_id)[0].tolist()
        rearranged_inps.append([inputs['input_ids'][i] for i in ex_indices])
        rearranged_masks.append([inputs['attention_mask'][i] for i in ex_indices])
        rearranged_offset_mapping.append([inputs['offset_mapping'][i] for i in ex_indices])

    inputs.update({'input_ids': rearranged_inps})
    inputs.update({'attention_mask': rearranged_masks})
    inputs.update({'offset_mapping': rearranged_offset_mapping})

    return inputs

mem_val_dataset = val_examples.map(
    preprocess_longformer_validation_examples,
    batched=True,
    batch_size=1,
    remove_columns=val_examples.column_names,
)
mem_val_dataset.save_to_disk('../musique_preprocessed_val_examples_512_allanai_style_multitask_stride20_one_doc_batched_without_zero_answer_pos')

  0%|          | 0/2417 [00:00<?, ?ba/s]

  np_titles_concat_offset_mapping = np.array(concat_offset_mapping)[titles]#list of tuples
