In [2]:
import datasets

In [1]:
import os
import copy
import torch
import pickle
from tqdm import tqdm
import torch

In [4]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from transformers import PreTrainedTokenizer
from transformers import AutoModel
from transformers import PreTrainedTokenizerFast

In [5]:
from tokenizers import ByteLevelBPETokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Split, Whitespace, CharDelimiterSplit
from tokenizers.processors import TemplateProcessing

In [7]:
# Load the pickled dictionary; its keys are iterated below to build the tokenizer training list.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("../dict_pickles/logistics_plans_with_invariants.pickle", "rb") as pickle_file:
    af_dict = pickle.load(pickle_file)

In [8]:
# Normalise every key of af_dict by stripping spaces and hyphens, so each
# action/fluent becomes a single whitespace-free word.
actions_fluents_list = [
    name.replace(" ", "").replace("-", "") for name in af_dict
]

In [9]:
# Sanity check: two known normalised names should be present in the list.
"attru2pos33" in actions_fluents_list, "incityapt3cit1" in actions_fluents_list

(True, True)

In [10]:
# Build and train a WordPiece tokenizer on the normalised action/fluent names.
tokenizer = Tokenizer(models.WordPiece(unk_token="<|unknown|>"))
# NOTE(review): later saved outputs show upper-case actions decoded in lower case,
# presumably because BertNormalizer lower-cases by default - confirm this is intended.
tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True)
# Split the input on single spaces and drop the delimiter.
tokenizer.pre_tokenizer = pre_tokenizers.Split(" ", "removed")
# Special tokens are passed to the trainer alongside the training words.
special_tokens = ["<|unknown|>", "<|pad|>", "<|startofplan|>", "<|endofplan|>", "<|goals|>", "<|actions|>", "<|mask|>"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
tokenizer.train_from_iterator(actions_fluents_list, trainer=trainer)

In [11]:
# Look up the ids the trained tokenizer assigned to the plan-delimiting special tokens.
bos_token_id = tokenizer.token_to_id("<|startofplan|>")
eos_token_id = tokenizer.token_to_id("<|endofplan|>")
actions_token_id = tokenizer.token_to_id("<|actions|>")

In [12]:
# Post-processing templates that add the plan markers around the encoded text.
#   single: "<|startofplan|> A"                       (no end marker is appended)
#   pair:   "<|startofplan|> A <|actions|> B <|endofplan|>"
# Every piece is assigned type id 0. The templates are plain string literals:
# they contain no placeholders, so no f-string prefix is needed.
# NOTE(review): the single template omits <|endofplan|>; presumably intentional
# for open-ended prompts - confirm.
tokenizer.post_processor = processors.TemplateProcessing(
    single="<|startofplan|> $A:0",
    pair="<|startofplan|>:0 $A:0 <|actions|>:0 $B:0 <|endofplan|>:0",
    special_tokens=[
        ("<|startofplan|>", bos_token_id),
        ("<|endofplan|>", eos_token_id),
        ("<|actions|>", actions_token_id),
    ],
)

In [13]:
# Persist the trained tokenizer (model, normalizer, pre-tokenizer, post-processor) as JSON.
tokenizer.save("../tokenizers/logistics_invariants_tokenizer.json")

In [None]:
# Reload a tokenizer from disk, replacing the in-memory `tokenizer`.
# NOTE(review): this absolute path differs from the file written by tokenizer.save(...)
# in the previous cell - confirm which tokenizer file is intended.
tokenizer = Tokenizer.from_file("/content/logistics_tokenizer.json")

In [None]:
# Domain-specific markers passed to the wrapped tokenizer as additional special tokens.
additional_special_tokens = ["<|goals|>", "<|actions|>"]

In [None]:
# Wrap the raw `tokenizers` object so it can be used through the transformers API.
# bos/eos are set to the plan markers that this notebook puts in the trained
# vocabulary and in the post-processor template ("<|startofplan|>" /
# "<|endofplan|>"). The previous "<|startoftext|>" / "<|endoftext|>" names do
# not appear in the special-token list used for training.
# NOTE(review): if the tokenizer file loaded from disk was trained with different
# marker names, adjust these two values to match that file.
# "<|goals|>" and "<|actions|>" are registered through additional_special_tokens.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<|unknown|>",
    pad_token="<|pad|>",
    bos_token="<|startofplan|>",
    eos_token="<|endofplan|>",
    mask_token="<|mask|>",
    additional_special_tokens=additional_special_tokens,
    padding_side="left",
)

In [None]:
# Write the wrapped tokenizer's files to disk (hard-coded absolute directory).
wrapped_tokenizer.save_pretrained("/content/")

('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/tokenizer.json')

In [None]:
# Full sample: initial-state fluents, the <|goals|> marker, goal fluents, the <|actions|> marker, then the plan actions.
text = 'attru2pos33 attru3pos77 attru4pos66 attru5pos44 attru1pos13 atobj21pos23 atobj23pos77 atobj44pos33 atobj55pos11 atobj12pos23 atobj88pos23 atobj66pos13 atobj13pos13 atobj22pos23 atobj77pos11 atapn8apt3 atapn1apt3 atapn7apt3 atapn4apt8 <|goals|> atobj77pos12 atobj22pos11 <|actions|> DRIVETRUCKTRU2POS33APT6CIT3 DRIVETRUCKTRU4POS66POS23CIT1 DRIVETRUCKTRU3POS77APT4CIT6 LOADTRUCKOBJ22TRU4POS23 DRIVETRUCKTRU4POS23APT3CIT1 UNLOADTRUCKOBJ22TRU4APT3 LOADAIRPLANEOBJ22APN1APT3 FLYAIRPLANEAPN1APT3APT6 UNLOADAIRPLANEOBJ22APN1APT6 LOADTRUCKOBJ22TRU2APT6 DRIVETRUCKTRU2APT6POS11CIT3 UNLOADTRUCKOBJ22TRU2POS11 LOADTRUCKOBJ77TRU2POS11 DRIVETRUCKTRU2POS11APT6CIT3 UNLOADTRUCKOBJ77TRU2APT6 LOADAIRPLANEOBJ77APN1APT6 FLYAIRPLANEAPN1APT6APT4 UNLOADAIRPLANEOBJ77APN1APT4 LOADTRUCKOBJ77TRU3APT4 DRIVETRUCKTRU3APT4POS12CIT6 UNLOADTRUCKOBJ77TRU3POS12'

In [None]:
# Short sample with four fluents only; rebinds `text`, discarding any earlier value.
text = 'attru2pos33 attru3pos77 attru4pos66 attru5pos44'

In [None]:
# Show the id the wrapped tokenizer maps the <|actions|> marker to.
wrapped_tokenizer.convert_tokens_to_ids("<|actions|>")

5

In [None]:
def tokenize_function(examples):
    """Tokenize a batch (or a single example) with ``wrapped_tokenizer``.

    Two column layouts are accepted, so the function works both on the raw
    dataset and on a dataset that has been pre-processed into prompts:

    * ``'states'`` present: ``examples['states']`` and ``examples['actions']``
      are encoded together as a sequence pair (the original behaviour).
    * otherwise: ``examples['input']`` is encoded as a single sequence.

    Args:
        examples: Mapping of column name to value(s), e.g. the batch that
            ``Dataset.map(..., batched=True)`` passes in.

    Returns:
        Whatever ``wrapped_tokenizer`` returns; token type ids are not
        requested.

    Raises:
        KeyError: If the columns required by the selected layout are missing.
    """
    if "states" in examples:
        return wrapped_tokenizer(
            examples["states"],
            examples["actions"],
            return_token_type_ids=False,
        )
    # Prompt-only layout: the state/action columns are absent.
    return wrapped_tokenizer(examples["input"], return_token_type_ids=False)

In [None]:
# Decode one arbitrary token id back to its string form.
wrapped_tokenizer.decode(13000)

'attru4pos44'

In [None]:
# Decode the ids of a previously computed `encoding`.
# NOTE(review): in the visible notebook `encoding` is only assigned in a later cell,
# so this cell depends on out-of-order execution.
wrapped_tokenizer.decode(encoding.input_ids)

'<|startoftext|> attru2pos33 attru3pos77 attru4pos66 attru5pos44 attru1pos13 atobj21pos23 atobj23pos77 atobj44pos33 atobj55pos11 atobj12pos23 atobj88pos23 atobj66pos13 atobj13pos13 atobj22pos23 atobj77pos11 atapn8apt3 atapn1apt3 atapn7apt3 atapn4apt8 <|goals|> atobj77pos12 atobj22pos11 <|actions|> drivetrucktru2pos33apt6cit3 drivetrucktru4pos66pos23cit1 drivetrucktru3pos77apt4cit6 loadtruckobj22tru4pos23 drivetrucktru4pos23apt3cit1 unloadtruckobj22tru4apt3 loadairplaneobj22apn1apt3 flyairplaneapn1apt3apt6 unloadairplaneobj22apn1apt6 loadtruckobj22tru2apt6 drivetrucktru2apt6pos11cit3 unloadtruckobj22tru2pos11 loadtruckobj77tru2pos11 drivetrucktru2pos11apt6cit3 unloadtruckobj77tru2apt6 loadairplaneobj77apn1apt6 flyairplaneapn1apt6apt4 unloadairplaneobj77apn1apt4 loadtruckobj77tru3apt4 drivetrucktru3apt4pos12cit6 unloadtruckobj77tru3pos12 <|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|

In [None]:
# Encode two short strings as a sequence pair and display the resulting encoding.
text1 = 'attru2pos33 attru3pos77'
text2 = 'attru4pos66 attru5pos44'
encoding = wrapped_tokenizer(text1, text2)
encoding

{'input_ids': [2, 11999, 11969, 5, 12001, 12011, 3], 'token_type_ids': [0, 0, 0, 1, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Load a small JSON plan file for experimentation (hard-coded absolute path).
dataset = datasets.load_dataset("json", data_files="/content/20_plans.json")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-2b5bf8ff1bf7bc7a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-2b5bf8ff1bf7bc7a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Tokenize the dataset in batches; the listed raw columns are dropped from the result.
tokenized_datasets = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['name', 'states', 'actions'],
            desc="Running tokenizer on dataset",
        )
tokenized_datasets['train']

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['actions_idx', 'eop_idx', 'input_ids', 'attention_mask'],
    num_rows: 20
})

In [None]:
# Inspect the first tokenized example.
tokenized_datasets['train'][0]

{'actions_idx': 23,
 'eop_idx': 37,
 'input_ids': [2,
  11998,
  11989,
  12014,
  11980,
  12005,
  15787,
  15817,
  15776,
  13793,
  13760,
  13749,
  15763,
  15806,
  13783,
  15798,
  8798,
  8789,
  8817,
  8804,
  4,
  15792,
  13769],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
# Inspect the first raw (untokenized) example.
dataset['train'][0]

{'name': '../datasets/logistics/tasks_simil-pereira/xml//xml-LPG-p057934_4.SOL',
 'states': 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'actions': 'LOADTRUCKOBJ21TRU5POS77 DRIVETRUCKTRU1POS13APT3CIT1 LOADTRUCKOBJ77TRU2POS11 DRIVETRUCKTRU5POS77APT4CIT2 DRIVETRUCKTRU2POS11POS66CIT5 UNLOADTRUCKOBJ21TRU5APT4 UNLOADTRUCKOBJ77TRU2POS66 LOADAIRPLANEOBJ21APN8APT4 FLYAIRPLANEAPN8APT4APT3 UNLOADAIRPLANEOBJ21APN8APT3 LOADTRUCKOBJ21TRU1APT3 DRIVETRUCKTRU1APT3POS13CIT1 UNLOADTRUCKOBJ21TRU1POS13',
 'actions_idx': 23,
 'eop_idx': 37}

In [None]:
# Display the whole 'states' column of the train split (large output; consider slicing).
dataset['train']['states']

['attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22

In [None]:
# Decode the first 11 token ids of the first tokenized example.
wrapped_tokenizer.decode(tokenized_datasets['train'][0]['input_ids'][:11])

'<|startofplan|> attru4pos55 attru5pos11 atobj66pos77 atobj33pos11 atobj11pos12 atobj77pos12 atobj55pos77 atobj00pos77 atapn5apt2 <|goals|>'

In [None]:
# Call tokenize_function directly on a single example (not a batch) and display the result.
encodings = tokenize_function(dataset['train'][0])
encodings

In [1]:
from torch.utils.data import DataLoader
from transformers import default_data_collator, DataCollatorForLanguageModeling
from collections.abc import Mapping

In [None]:
# Collator built around the wrapped tokenizer; mlm=False means no masked-language-modelling masking is requested.
data_collator = DataCollatorForLanguageModeling(wrapped_tokenizer, mlm=False)

In [None]:
# Batch the tokenized 'train' split in groups of 4, collated by data_collator.
eval_dataloader = DataLoader(tokenized_datasets['train'], collate_fn=data_collator, batch_size=4)

In [None]:
# Check that a single dataset example is a Mapping.
isinstance(tokenized_datasets['train'][0], Mapping)

True

In [None]:
# Print each tokenized example's id list, one per line.
for i in tokenized_datasets['train']['input_ids']:
    print(i)

[2, 11998, 11989, 12014, 11980, 12005, 15787, 15817, 15776, 13793, 13760, 13749, 15763, 15806, 13783, 15798, 8798, 8789, 8817, 8804, 4, 15792, 13769]
[2, 11998, 11989, 12014, 11980, 12005, 15787, 15817, 15776, 13793, 13760, 13749, 15763, 15806, 13783, 15798, 8798, 8789, 8817, 8804, 4, 15792, 13769]
[2, 11998, 11989, 12014, 11980, 12005, 15787, 15817, 15776, 13793, 13760, 13749, 15763, 15806, 13783, 15798, 8798, 8789, 8817, 8804, 4, 15792, 13769]
[2, 11998, 11989, 12014, 11980, 12005, 15787, 15817, 15776, 13793, 13760, 13749, 15763, 15806, 13783, 15798, 8798, 8789, 8817, 8804, 4, 15792, 13769]
[2, 11973, 12017, 15826, 13801, 15760, 13766, 15816, 13741, 8830, 8771, 4, 15815, 13741]
[2, 11973, 12017, 15826, 13801, 15760, 13766, 15816, 13741, 8830, 8771, 4, 15815, 13741]
[2, 11973, 12017, 15826, 13801, 15760, 13766, 15816, 13741, 8830, 8771, 4, 15815, 13741]
[2, 11973, 12017, 15826, 13801, 15760, 13766, 15816, 13741, 8830, 8771, 4, 15815, 13741]
[2, 12021, 11969, 15820, 15787, 15757, 13756

In [None]:
# Decode token id 1 to see which token it maps to.
wrapped_tokenizer.decode(1)

'<|pad|>'

In [None]:
# Smoke test: iterate over the collated batches, printing only the batch index.
for step, batch in enumerate(eval_dataloader):
    print(step)

0
1
2
3
4


In [None]:
# Number of leading plan actions to keep in a generation prompt; read as a global by get_inputs_for_generation.
actions_seen = 2

In [None]:
# Toy check of the prefix logic: keep the first `actions_seen` space-separated items.
# NOTE(review): with actions_seen == 2 this evaluates to 'a b'; a saved output of 'a'
# would correspond to actions_seen == 1 and is likely stale.
actions = "a b c d"
action_list = actions.split(" ")
" ".join(action_list[:actions_seen])

'a'

In [None]:
def get_inputs_for_generation(examples, num_actions=None):
    """Build generation prompts of the form ``<state> <|actions|> <first k actions>``.

    Args:
        examples: Batch mapping with parallel ``"states"`` and ``"actions"``
            lists of strings; each actions string is split on single spaces.
        num_actions: How many leading actions to append to each prompt. When
            ``None`` (the default) the module-level ``actions_seen`` is read at
            call time, which is the original behaviour. With ``Dataset.map``
            it can be supplied through ``fn_kwargs``.

    Returns:
        ``{"input": prompts}`` with one prompt string per example. If no
        action is kept, the prompt ends right after ``<|actions|>`` with no
        trailing space.
    """
    if num_actions is None:
        num_actions = actions_seen

    prompts = []
    for state, actions in zip(examples["states"], examples["actions"]):
        prompt = state + " <|actions|>"
        kept_actions = " ".join(actions.split(" ")[:num_actions])
        # An empty actions string (or num_actions == 0) yields "", which is skipped.
        if kept_actions != "":
            prompt = prompt + " " + kept_actions
        prompts.append(prompt)

    return {"input": prompts}

In [None]:
# Turn each example into a generation prompt stored in a new 'input' column;
# the listed raw columns are dropped from the result.
pre_processed_dataset = dataset.map(
            get_inputs_for_generation,
            batched=True,
            remove_columns=['name', 'states', 'actions'],
            desc="Running input pre-processing on dataset",
        )
pre_processed_dataset['train']

Running input pre-processing on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['actions_idx', 'eop_idx', 'input'],
    num_rows: 20
})

In [None]:
# Tokenize the prompts. The tokenize_function in scope must be able to read the 'input'
# column, because 'states' and 'actions' were removed during pre-processing.
tokenized_datasets = pre_processed_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['input'],
            desc="Running tokenizer on dataset",
        )
tokenized_datasets['train']

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['actions_idx', 'eop_idx', 'input_ids', 'attention_mask'],
    num_rows: 20
})

In [None]:
# Display the token ids of every tokenized prompt (large output; consider slicing).
tokenized_datasets['train']['input_ids']

[[2,
  11990,
  11993,
  12002,
  11982,
  12017,
  15762,
  15809,
  15795,
  13764,
  13797,
  13777,
  15775,
  15831,
  13755,
  15784,
  8798,
  8789,
  8817,
  8804,
  4,
  15783,
  13794,
  5,
  5408,
  15091],
 [2,
  11990,
  11993,
  12002,
  11982,
  12017,
  15762,
  15809,
  15795,
  13764,
  13797,
  13777,
  15775,
  15831,
  13755,
  15784,
  8798,
  8789,
  8817,
  8804,
  4,
  15783,
  13794,
  5,
  15091,
  5408],
 [2,
  11990,
  11993,
  12002,
  11982,
  12017,
  15762,
  15809,
  15795,
  13764,
  13797,
  13777,
  15775,
  15831,
  13755,
  15784,
  8798,
  8789,
  8817,
  8804,
  4,
  15783,
  13794,
  5,
  3784,
  2659],
 [2,
  11990,
  11993,
  12002,
  11982,
  12017,
  15762,
  15809,
  15795,
  13764,
  13797,
  13777,
  15775,
  15831,
  13755,
  15784,
  8798,
  8789,
  8817,
  8804,
  4,
  15783,
  13794,
  5,
  15091,
  5408],
 [2,
  11975,
  12005,
  15813,
  13762,
  15769,
  13791,
  15804,
  13773,
  8830,
  8771,
  4,
  15802,
  13773,
  5,
  12817,

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'input'`` column of ``examples`` with ``wrapped_tokenizer``.

    This redefines the earlier ``tokenize_function`` of the same name: it
    encodes a single sequence per example instead of a states/actions pair.
    Token type ids are not requested; no padding or truncation options are set.
    """
    prompts = examples['input']
    encoded = wrapped_tokenizer(prompts, return_token_type_ids=False)
    return encoded

In [None]:
# Display the whole 'states' column of the raw train split again (large output; consider slicing).
dataset['train']['states']

['attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22 atobj77pos11 atapn8apt4 atapn5apt4 atapn4apt4 atapn2apt3 <|goals|> atobj77pos66 atobj21pos13',
 'attru1pos13 attru2pos11 attru5pos77 attru3pos23 attru4pos21 atobj55pos11 atobj99pos22 atobj66pos11 atobj12pos11 atobj22pos11 atobj21pos77 atobj44pos21 atobj88pos22 atobj13pos22

In [6]:
# Load the JSON plan files found under the plans directory.
# The path is built with os.path.join rather than a backslash literal: "\p" and
# "\j" are invalid escape sequences in a normal string, and a backslash-separated
# path only resolves on Windows.
raw_datasets = datasets.load_dataset(
    "json",
    data_dir=os.path.join("..", "plans", "json", "plans_with_invariants"),
)

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Using custom data configuration default-0dcd2dfd92889ae4


Downloading and preparing dataset json/default to C:/Users/drunp/.cache/huggingface/datasets/json/default-0dcd2dfd92889ae4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to C:/Users/drunp/.cache/huggingface/datasets/json/default-0dcd2dfd92889ae4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Toy state string used to try out splitting around the <|goals|> marker.
pippo = "atobj12pos13 atobj21pos33 attru5pos12 attru3pos13 <|goals|> atobj14pos33"
pippo.split(" ")

['atobj12pos13',
 'atobj21pos33',
 'attru5pos12',
 'attru3pos13',
 '<|goals|>',
 'atobj14pos33']

In [17]:
# Split into the text before and after the goals marker (expects exactly one marker).
a, b = pippo.split(" <|goals|> ")

In [18]:
# Show both halves of the split.
a, b

('atobj12pos13 atobj21pos33 attru5pos12 attru3pos13', 'atobj14pos33')

In [None]:
import random

In [47]:
# Seed the RNG so the shuffle is repeatable, then shuffle the fluents in place,
# printing the list before and after.
random.seed(8)
c = a.split(" ")
print(c)
random.shuffle(c)
print(c)

['atobj12pos13', 'atobj21pos33', 'attru5pos12', 'attru3pos13']
['atobj12pos13', 'attru5pos12', 'attru3pos13', 'atobj21pos33']


In [51]:
def shuffle_initial_state(examples):
    """Randomly reorder the initial-state fluents of every state string.

    Each entry of ``examples["states"]`` is expected to contain exactly one
    ``" <|goals|> "`` separator. The space-separated fluents before it are
    shuffled in place with the global ``random`` generator; the goals after it
    are kept unchanged.

    Returns:
        ``{"states_shuffled": [...]}`` with one rebuilt string per input state.
    """
    separator = " <|goals|> "
    shuffled_states = []
    for state in examples["states"]:
        fluents_part, goals_part = state.split(separator)
        fluents = fluents_part.split(" ")
        random.shuffle(fluents)
        shuffled_states.append(separator.join((" ".join(fluents), goals_part)))
    return {"states_shuffled": shuffled_states}

In [52]:
# Add a 'states_shuffled' column to every split; 'name' and 'actions' are dropped,
# while the original 'states' column is not listed for removal.
pre_processed_dataset = raw_datasets.map(
    shuffle_initial_state,
    batched=True,
    remove_columns=["name", "actions"],
    desc="Running input pre-processing on dataset",
)

Running input pre-processing on dataset:   0%|          | 0/430 [00:00<?, ?ba/s]

Running input pre-processing on dataset:   0%|          | 0/226 [00:00<?, ?ba/s]

Running input pre-processing on dataset:   0%|          | 0/237 [00:00<?, ?ba/s]

In [1]:
# Inspect the first pre-processed example; requires the cell that defines
# `pre_processed_dataset` to have been run in the current kernel.
pre_processed_dataset['train'][0]

NameError: name 'pre_processed_dataset' is not defined

In [5]:
# Find every position in `t` that holds the value 1 and show the first such index.
t = torch.tensor([1, 2, 1, 3, 4, 5])
(pad_idx,) = torch.nonzero(t == 1, as_tuple=True)
pad_idx[0]

tensor(0)