In [31]:
import json
# Both quote characters that may wrap a phrase: single (') and double (").
QUOTES = "'" + '"'


def merge_phrases(phrases):
    """Join the phrases of one utterance into a single whitespace-normalised string.

    When the text accumulated so far ends with a quote character and the next
    phrase starts with that same quote character, both quotes are dropped so
    that '"Well, if they mean that much to you,"' followed by
    '"you should name them."' becomes one quoted sentence instead of two.

    Args:
        phrases: iterable of strings.

    Returns:
        The phrases joined by single spaces, with every run of whitespace
        (including newlines) collapsed to one space and no leading or
        trailing whitespace. An empty iterable yields "".
    """
    text = ""
    for el in phrases:
        # An empty phrase has no first character to inspect; indexing it
        # would raise IndexError, and it contributes nothing to the result.
        if not el:
            continue
        if text and text[-1] == el[0] and el[0] in QUOTES:
            text = text[:-1]
            el = el[1:]
        text = text + ' ' + el
    # split()/join collapses whitespace runs and trims both ends.
    return " ".join(text.split())


def fix_turn(turn):
    """Reduce a raw turn record to a ``(speaker, utterance)`` pair.

    Only the first entry of ``turn['NAMES']`` is used as the speaker; any
    further names are ignored. The entries of ``turn['UTTERANCES']`` are
    combined into one string by ``merge_phrases``.
    """
    speaker = turn['NAMES'][0]
    utterance = merge_phrases(turn["UTTERANCES"])
    return speaker, utterance
    

class MyChunk:
    """A synopsis chunk together with the cleaned dialogue turns aligned to it.

    Attributes set by ``__init__``:
        synopsis: the value of ``chunk['CHUNK']``.
        score: the value of ``chunk['ALIGNMENT']['ALIGNMENT SCORE']``.
        replics: list of ``[actor, text]`` pairs produced by ``clean()``.
    """

    def __init__(self, chunk):
        self.synopsis = chunk['CHUNK']
        self.score = chunk['ALIGNMENT']['ALIGNMENT SCORE']
        self.replics = [
            fix_turn(turn) for turn in chunk['TURNS']
        ]
        self.clean()

    def clean(self):
        """Normalise ``self.replics`` in place.

        Removes the laughter markers, drops turns spoken by 'ALL' and turns
        that end up empty, and merges consecutive turns of the same actor
        into one turn (texts joined by a single space).
        """
        replics = []
        for a, r in self.replics:
            r = r.replace('(laughing)', '').replace('(laughs)', '').replace('(laughter)', '').strip()
            if a == 'ALL' or len(r) == 0:
                continue
            if len(replics) != 0 and replics[-1][0] == a:
                replics[-1][1] = replics[-1][1] + ' ' + r
            else:
                # Stored as a list (not a tuple) so the text can be extended
                # when the next turn belongs to the same actor.
                replics.append([a, r])
        self.replics = replics

    def count_actors(self):
        """Return the number of distinct actors speaking in this chunk."""
        return len({el[0] for el in self.replics})

    def __len__(self):
        """Return the number of turns."""
        return len(self.replics)

    def __getitem__(self, key):
        """Format turn(s) as ``"ACTOR: text"``.

        An integer key returns one formatted turn. A slice returns the
        selected turns, one per line. Previously the slice bounds were
        ignored and every turn was returned regardless of the slice.
        """
        if isinstance(key, slice):
            return '\n'.join(el[0] + ': ' + el[1] for el in self.replics[key])
        return self.replics[key][0] + ': ' + self.replics[key][1]

    def __repr__(self):
        """Return ``{ synopsis }[t<number of turns>;a<number of actors>]``."""
        return "{ " + self.synopsis + " }" + f"[t{len(self.replics)};a{self.count_actors()}]"
    


class GetPromptClass:
    """Callable that turns a run of dialogue turns into a DM/Player prompt.

    An instance is bound to one (campaign, episode) pair, which fixes the
    mapping from speaker names (e.g. 'SAM') to character names
    (e.g. 'Scanlan'). Calling the instance returns ``(prompt, ['DM', 'Player'])``
    or ``None`` when no usable player is present.
    """

    prompt_prefix = "It's a fantasy role-playing game.\n\n"

    # Character name -> description inserted after "You are " in the prompt.
    character2description = {
        "Pike": "Pike Trickfoot, a gnome cleric",
        "Grog": "Grog Strongjaw, a goliath barbarian",
        "Vex":  "Vex Vex'ahlia, a half-elf ranger",
        "Vax":  "Vax Vax'ildan, a half-elf rogue",
        "Percival": "Percival de Rolo, a human gunslinger",
        "Keyleth": "Keyleth, a half-elf druid",
        "Tiberius": "Tiberius Stormwind, a dragonborn sorcerer",
        "Scanlan": "Scanlan Shorthalt, a gnome bard",
        "Lionel": "Lionel Gayheart Chod, a half-orc bard",
        "Taryon": "Taryon Darrington, a human artificer",
        "Yasha": "Yasha Nydoorin, a half-angel barbarian",
        "Fjord": "Fjord, a half-orc warlock",
        "Jester": "Jester Lavorre, a tiefling cleric",
        "Caleb": "Caleb Widogast, a human wizard",
        "Molly": "Mollymauk Tealeaf, a tiefling blood hunter",
        "Beau": "Beauregard Lionett called Beau, a human monk",
        "Nott": "Nott the Brave, a goblin rogue",
        "Caduceus": "Caduceus Clay, a firbolg cleric",
        "Kingsley": "Kingsley Tealeaf, a tiefling blood hunter",
        # The stray leading space that used to be here produced
        # "You are  Kashaw Vesh" (double space) in the prompt.
        "Kashaw": "Kashaw Vesh, a human cleric",
        "Shale": "Shale, a goliath fighter",
        "Spurt": "Spurt, a kobold inventor"
    }

    # Speakers for whom no prompt is generated.
    skip_actor = {"NOELLE", "KHARY", "DEBORAH", "ZAC", "FELICIA", "MARY", "CHRIS HARDWICK",
                  "DARIN", "JOE", "SUMALEE"}

    # Speaker name that marks the Dungeon Master's turns.
    dm_actor = 'MATT'

    def get_name_mapping(self, campaign, episode):
        """Return the speaker-name -> character-name mapping for an episode.

        Some speakers switch characters from a given episode onward; those
        overrides are applied on top of the campaign's base mapping.

        Raises:
            ValueError: if ``campaign`` is neither 1 nor 2.
        """
        if campaign == 1:
            name2character = {
                'MATT': 'Dungeon Master',
                'ASHLEY': 'Pike',
                'TRAVIS': "Grog",
                "LAURA": "Vex",
                "LIAM": "Vax",
                "TALIESIN": "Percival",
                "MARISHA": "Keyleth",
                "ORION": "Tiberius",
                "SAM":   "Scanlan",
                "JOHN": "Lionel",
                "JON": "Lionel",
                "WILL": "Kashaw",
                "WIL": "Kashaw",
                "CHRIS PERKINS": "Shale",
            }
            if episode >= 85:
                name2character['SAM'] = 'Taryon'
        elif campaign == 2:
            name2character = {
                'MATT': 'Dungeon Master',
                'ASHLEY': 'Yasha',
                'TRAVIS': "Fjord",
                "LAURA": "Jester",
                "LIAM": "Caleb",
                "TALIESIN": "Molly",
                "MARISHA": "Beau",
                "ORION": "Tiberius",
                "SAM":   "Nott",
                "CHRIS PERKINS": "Spurt",
            }
            if episode >= 28:
                name2character['TALIESIN'] = "Caduceus"
            if episode >= 140:
                name2character['TALIESIN'] = "Kingsley"
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(f"Unsupported campaign: {campaign!r}")

        return name2character

    def __init__(self, campaign, episode):
        self.name2character = self.get_name_mapping(campaign, episode)

    def __call__(self, replics, prev_chunks):
        """Build a prompt from ``replics`` with ``prev_chunks`` as context.

        Args:
            replics: sequence of ``(speaker, text)`` pairs.
            prev_chunks: sequence of objects with a ``synopsis`` string
                attribute; they are added in reverse order as
                "Previously, ..." sentences, with the character's name
                replaced by "you".

        Returns:
            ``(prompt, ['DM', 'Player'])``, or ``None`` when there is no
            non-DM speaker, or when the chosen player is in ``skip_actor``
            or has no character mapping.
        """
        # Non-DM speakers in order of first appearance. Using an ordered list
        # (rather than an arbitrary set element) makes the chosen player
        # deterministic, and not requiring the DM to be present avoids the
        # KeyError that set.remove('MATT') used to raise.
        players = []
        for el in replics:
            if el[0] != self.dm_actor and el[0] not in players:
                players.append(el[0])

        if len(players) == 0:
            print(replics)
            return None

        for a in players:
            if a not in self.name2character:
                print('Missing actor', a)

        player = players[0]
        # An unmapped speaker is skipped instead of raising KeyError below.
        if player in self.skip_actor or player not in self.name2character:
            return None
        character = self.name2character[player]
        desc = self.character2description[character]

        prompt = self.prompt_prefix
        prompt = prompt + "Dungeon Master: You are " + desc + " . You are the member of the famous group of adventurers. "
        for el in prev_chunks[::-1]:
            prompt = prompt + "Previously, " + el.synopsis.replace('[', ' ').replace(']', ' ').replace('"', ' ')\
                                                .replace(character, 'you') + ' '

        prompt = prompt.strip() + '\n'
        dm_label = self.name2character.get(self.dm_actor, 'Dungeon Master')
        for el in replics:
            # Every non-DM speaker is labelled 'Player'; no mapping lookup is
            # needed for them, so an unmapped secondary speaker cannot fail.
            actor = dm_label if el[0] == self.dm_actor else 'Player'
            prompt += actor + ": " + el[1].strip() + '\n'
        # Applies to the whole prompt, including synopsis and utterance text.
        prompt = prompt.replace('Dungeon Master', 'DM')

        return prompt, ['DM'] + ['Player']
        

def is_battle(text):
    """Heuristically decide whether ``text`` describes combat.

    The text is lower-cased and the occurrences of a fixed list of
    game-mechanics keywords are counted as plain substrings (so "turn" also
    matches inside longer words). Two or more hits in total mean "battle".
    """
    markers = (
        " dc", "difficulty challenge", "armor class", "saving throw", "turn",
        "bonus action", "spell slot", "attack roll", "roll for initiative",
        "sneak attack",
    )
    lowered = text.lower()
    hits = sum(lowered.count(marker) for marker in markers)
    return hits >= 2


def process_episode(campaign, episode, get_prompt, data_dir="../CRD3/data/aligned data/c=2"):
    """Extract DM/player dialogue prompts from one episode file.

    Args:
        campaign: campaign number used in the file name.
        episode: episode number used (zero-padded to 3 digits) in the file name.
        get_prompt: callable ``(replics, prev_chunks) -> (prompt, actors) | None``.
        data_dir: directory holding the ``C{campaign}E{episode}_2_0.json`` files.

    Returns:
        List of the non-``None`` results of ``get_prompt`` whose prompt text
        is not classified as a battle by ``is_battle``. If the episode file
        does not exist, the error is printed and ``[]`` is returned.
    """
    filename = f"{data_dir}/C{campaign}E{episode:03d}_2_0.json"
    try:
        # Parse the file once and close it deterministically; the original
        # opened it twice and never closed either handle.
        with open(filename) as file:
            raw_chunks = json.load(file)
    except FileNotFoundError as exp:
        print(exp, campaign, episode)
        return []
    # The first three chunks and the last chunk of every episode are skipped.
    chunks = [MyChunk(el) for el in raw_chunks[3:-1]]
    DM_name = 'MATT'
    prev_chunks = []

    # A window must open with a DM turn of at least this many characters.
    MIN_START_LENGTH = 100

    prompts = []

    for chunk in chunks:
        # Only start extracting once at least two earlier chunks were seen.
        if len(prev_chunks) >= 2:
            i = 0

            while i < len(chunk) - 5:
                if chunk.replics[i][0] != DM_name or len(chunk.replics[i][1]) < MIN_START_LENGTH:
                    i += 1
                    continue
                dm_turn_cnt = 0
                actors = set()
                last_ok = -1
                # Look ahead at most 30 turns, stopping at the end of the
                # chunk or as soon as a third distinct speaker appears.
                for j in range(30):
                    if i + j == len(chunk):
                        break
                    actors.add(chunk.replics[i + j][0])
                    if len(actors) >= 3:
                        break
                    if chunk.replics[i + j][0] == DM_name:
                        # The window always ends on a DM turn.
                        last_ok = i + j
                        dm_turn_cnt += 1
                if last_ok - i + 1 >= 7 and dm_turn_cnt >= 2 and len(actors) > 1:
                    replics = chunk.replics[i:last_ok + 1]
                    p = get_prompt(replics, prev_chunks[-1:])
                    if p is not None and not is_battle(p[0]):
                        prompts.append(p)
                        # Together with the increment below this resumes the
                        # scan at the closing DM turn of the accepted window.
                        i = last_ok - 1
                i += 1

        prev_chunks.append(chunk)

    return prompts

# (campaign, episode) pairs to process: episodes 1-115 of campaign 1
# followed by episodes 1-46 of campaign 2.
games = [(1, i) for i in range(1, 116)]
games += [(2, i) for i in range(1, 47)]

# Collect the prompts of every episode; each episode gets its own prompt
# builder because the name mapping depends on campaign and episode.
prompts = []
for c, e in games:
    prompts.extend(process_episode(c, e, GetPromptClass(c, e)))
print(len(prompts))

Missing actor ZAC
[Errno 2] No such file or directory: '../CRD3/data/aligned data/c=2/C1E012_2_0.json' 1 12
Missing actor ZAC
[Errno 2] No such file or directory: '../CRD3/data/aligned data/c=2/C1E015_2_0.json' 1 15
Missing actor FELICIA
Missing actor FELICIA
Missing actor FELICIA
Missing actor MARY
Missing actor MARY
Missing actor MARY
Missing actor MARY
Missing actor MARY
Missing actor MARY
Missing actor FELICIA
Missing actor ZAC
[Errno 2] No such file or directory: '../CRD3/data/aligned data/c=2/C1E026_2_0.json' 1 26
Missing actor ZAC
[Errno 2] No such file or directory: '../CRD3/data/aligned data/c=2/C1E041_2_0.json' 1 41
[Errno 2] No such file or directory: '../CRD3/data/aligned data/c=2/C1E045_2_0.json' 1 45
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
Missing actor CHRIS HARDWICK
[Errno 2] No su

In [38]:
import sklearn
import sklearn.model_selection

# Hold out a test split; the fixed random_state keeps the split reproducible.
train, test = sklearn.model_selection.train_test_split(prompts, random_state=42)


def _write_json_lines(path, records):
    """Write ``records`` to ``path`` as one JSON object per line.

    Each record is a ``(text, actors)`` pair. Without the newline the file
    would hold back-to-back objects ("{...}{...}"), which is neither a valid
    JSON document nor line-delimited JSON.
    """
    with open(path, 'w') as file:
        for el in records:
            json.dump({'text': el[0], 'actors': el[1]}, fp=file)
            file.write('\n')


_write_json_lines('cr3-dm-player-solo-v1-train.json', train)
_write_json_lines('cr3-dm-player-solo-v1-test.json', test)

In [33]:
# Sanity check: how many of the collected prompts are classified as
# non-combat. `l` keeps the non-combat prompt texts for inspection.
l = []
for p, a in prompts:
    if is_battle(p):
        continue
    l.append(p)
non_combat = len(l)
print(len(prompts), non_combat, non_combat / len(prompts))

1948 1948 1.0


In [34]:
# Spot-check one of the non-combat prompt texts collected in `l`.
print(l[25])

It's a fantasy role-playing game.

DM: You are Scanlan Shorthalt, a gnome bard . You are the member of the famous group of adventurers. Previously, The dwarf's skull has a hole bored into it, like the goblins Vex discovered earlier. The hooded figure nods towards the duergar general and they walk back to the barracks.
DM: You hear what sounds like guttural Dwarvish being shouted angrily, cursing from the interior, and a few other dwarven voices also joining in-- more than two. You hear some horrible, sick thuds and some impacts and some cries of pain. And eventually, the five duergar dragging the captives walk out, carrying cudgels at their side, still dripping with fresh blood. They kind of go amongst their own business from there.
Player: Okay. I think I'll go back to my fellows. There's nothing I can see-- that door doesn't lock-- does it lock by any chance?
DM: It's just an opening tent.
Player: Open tent. There's nothing lying around, none of that black armor lying around or anyth