In [1]:
import json
import seaborn as sns
import os
import transformers
from tqdm.notebook import tqdm
import numpy as np
# Tokenizer for the EleutherAI GPT-J 6B checkpoint; downloaded files are cached in ./mycache.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "EleutherAI/gpt-j-6B",
    cache_dir="mycache",
)

# Input-size limits; neither constant is referenced by the cells visible in this notebook.
MAX_INPUT_TOKENS = 2048
MAX_INPUT_WORDS = 1500

In [2]:
import torch
import re
import random

# Matches any run of one or more whitespace characters.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def fix_spaces(s):
    """Collapse each whitespace run in ``s`` to a single space and trim both ends."""
    collapsed = re.sub(_RE_COMBINE_WHITESPACE, " ", s)
    return collapsed.strip()


class QuestDataset(torch.utils.data.Dataset):
    """Dataset of DM/Player dialogues sampled by random walks over quest graphs.

    Every file in ``datadir`` is parsed as JSON mapping a state key to a state
    dict with the entries ``'main_text'``, ``'query_texts'`` and ``'next_nums'``.
    States are stored under the id ``"<file name>/<state key>"``. A dialogue is
    built by starting at a state, emitting its ``main_text`` as a DM line, then
    randomly either stopping or emitting one of the ``query_texts`` as a Player
    line and continuing from the matching entry of ``next_nums``.
    """

    # Random attempts per node in `pick_choices` and per item in `try_to_extract`.
    RETRY_NODE_CNT = 3
    # Accepted dialogue length, in tokens of `self.tokenizer` (inclusive bounds).
    MIN_DOC_LENGTH = 600
    MAX_DOC_LENGTH = 1900
    # Minimum number of speaker-name occurrences required by `check`.
    MIN_PHRASES = 4
    # Number of node attempts allowed for one extraction attempt.
    BUDGET = 200

    def __init__(self, datadir, tokenizer, player="Player", dm="DM"):
        """Load every quest file found in ``datadir``.

        Args:
            datadir: directory whose entries are all JSON quest files.
            tokenizer: object with an ``encode(text)`` method returning a sized
                sequence of tokens.
            player: speaker label used for player lines.
            dm: speaker label used for narrator lines.
        """
        self.states = {}
        self.states_list = []
        for fn in os.listdir(datadir):
            # Context manager so the file handle is closed right after parsing.
            with open(os.path.join(datadir, fn)) as quest_file:
                quest = json.load(quest_file)
            for key, state in quest.items():
                # Qualify successor ids with the file name so they match the keys of self.states.
                state['next_nums'] = [fn + "/" + str(el) for el in state['next_nums']]
                # fix_spaces already trims both ends, so no extra strip is needed.
                state['main_text'] = fix_spaces(state['main_text'])
                state['query_texts'] = [fix_spaces(el) for el in state['query_texts']]
                state_id = fn + "/" + key
                self.states[state_id] = state
                self.states_list.append(state_id)
        self.player = player
        self.dm = dm
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of loaded states."""
        return len(self.states)

    def check(self, text):
        """Return True when ``text`` is an acceptable dialogue.

        The text must contain at least ``MIN_PHRASES`` occurrences of the two
        speaker labels (counted as plain substrings, anywhere in the text) and
        its token count must lie in ``[MIN_DOC_LENGTH, MAX_DOC_LENGTH]``.
        """
        phrases = text.count(self.dm) + text.count(self.player)
        if phrases < self.MIN_PHRASES:
            # Cheap test first: skip tokenization when the text fails anyway.
            return False
        length = len(self.tokenizer.encode(text))
        return self.MIN_DOC_LENGTH <= length <= self.MAX_DOC_LENGTH

    def pick_choices(self, node, text=""):
        """Extend ``text`` with a random walk starting at state ``node``.

        Requires ``self.current_budget`` to be set beforehand (done by
        `try_to_extract`); each attempt made here decrements it.

        Args:
            node: state id (a key of ``self.states``).
            text: dialogue accumulated so far.

        Returns:
            A dialogue string that passes `check`, or None when none was found.
        """
        old_text = text
        text = text + self.dm + ': ' + self.states[node]['main_text'] + '\n'
        # Use the tokenizer given to the constructor, not a notebook global.
        text_len = len(self.tokenizer.encode(text))

        if text_len > self.MAX_DOC_LENGTH:
            # Adding this node overshoots the limit. Fall back to the dialogue
            # before it when that is valid; otherwise stop here, because every
            # continuation would only make the text longer and retrying would
            # just drain the budget shared with the other branches.
            return old_text if self.check(old_text) else None

        next_nodes = self.states[node]['next_nums']
        for _ in range(self.RETRY_NODE_CNT):
            self.current_budget -= 1
            if self.current_budget <= 0:
                return text if self.check(text) else None
            # randint is inclusive on both ends: the extra value len(next_nodes)
            # means "stop the dialogue at this node".
            i = random.randint(0, len(next_nodes))
            if i == len(next_nodes):
                new_text = text
            else:
                new_text = text + self.player + ": " + self.states[node]['query_texts'][i] + '\n'
                new_text = self.pick_choices(next_nodes[i], new_text)
            if new_text is not None and self.check(new_text):
                return new_text
        return None

    def try_to_extract(self, i, tokenize=False):
        """Try to sample a valid dialogue starting at ``self.states_list[i]``.

        Makes up to ``RETRY_NODE_CNT`` attempts, each with a fresh budget of
        ``BUDGET``.

        Args:
            i: index into ``self.states_list``.
            tokenize: accepted for compatibility; not used by this method.

        Returns:
            The dialogue text, or None when every attempt failed.
        """
        for _ in range(self.RETRY_NODE_CNT):
            self.current_budget = self.BUDGET
            text = self.pick_choices(self.states_list[i])
            if text is not None:
                return text
        return None

    def __getitem__(self, i):
        """Return the token ids of a dialogue sampled from state ``i``.

        When no dialogue can be sampled from that state, a random other state
        is tried instead, repeatedly, until one succeeds. The loop has no upper
        bound, so it does not terminate if no state can produce a valid dialogue.
        """
        while True:
            text = self.try_to_extract(i, tokenize=True)
            if text is not None:
                return self.tokenizer.encode(text)
            i = random.randint(0, len(self) - 1)

# Seed Python's `random` module before any sampling happens in this cell.
random.seed(0)
# Build the dataset from the files in data_train/, using the tokenizer created earlier.
mydataset = QuestDataset("data_train", tokenizer)
# Smoke test: sample one dialogue starting from the state at index 300 (shown as the cell output).
mydataset.try_to_extract(300)

'DM: You summon your mental forces and prepare to unleash a weave of raw magica against Farhan. He apparently has not noticed you yet.Knowing you have to go for speed rather than for finesse, you just hope that your mental strength suffices for you not to black out. Then, you banish those distracting thoughts from your mind and concentrate at the work on hand.Within seconds, the weave is completed, and you hurl it with a single word of power against the emissary who dances among the wolves, cutting them down.Your magical force hits him straight into the chest.\nPlayer: This is not the case.\nDM: But Farhan simply shrugs off your mental assault.You are shocked by this, and he turns towards you with an evil grin on his face, and cuts his way through the wolves towards you.Horrified, and knowing your mental reserves are depleted entirely, you brandish your own sword, slowly giving way to Farhan. Unrelentlessly, he comes towards you.You turn to flee, knowing you cannot face him in hand to 

In [None]:
from tqdm.notebook import tqdm
# Collect unique (dialogue text, start-state id) pairs over four full passes of the dataset.
texts = set()
for _ in range(4):
    fail_cnt = 0  # states that yielded no dialogue during this pass
    for i in tqdm(range(len(mydataset))):
        text = mydataset.try_to_extract(i)
        if text is None:
            fail_cnt += 1
            continue
        texts.add((text, mydataset.states_list[i]))
    print('fail cnt', fail_cnt)

  0%|          | 0/35277 [00:00<?, ?it/s]

fail cnt 11994


  0%|          | 0/35277 [00:00<?, ?it/s]

In [None]:
# Number of unique (dialogue text, start-state id) pairs collected so far.
len(texts)

In [None]:
# Print the number of unique (dialogue text, start-state id) pairs to stdout.
print(len(texts))

In [96]:
# Save the collected (dialogue text, start-state id) pairs as a JSON array of
# two-element arrays. `texts` is a set, and the iteration order of a set of
# strings/tuples can differ between kernel sessions (string hash randomization),
# so the pairs are sorted to make the output file reproducible.
with open('train_collected_data_v2.json', 'w') as file:
    json.dump(sorted(texts), file)