In [9]:
import datasets
import numpy as np
import pandas as pd
import torch
import transformers
import os
# Route HTTP, HTTPS and FTP traffic of this process through the same proxy.
os.environ.update(
    dict.fromkeys(
        ("http_proxy", "https_proxy", "ftp_proxy"),
        "http://proxy.ad.speechpro.com:3128",
    )
)

In [10]:
def join_same_person(row):
    """Merge consecutive turns spoken by the same person into a single turn.

    Args:
        row: mapping with a 'dialog' list. Every turn is a dict that is read
            through its 'person', 'text' and 'gk' keys; 'gk' must be an
            iterable of hashable items.

    Returns:
        {"dialog": merged_turns}. When two adjacent turns share the same
        'person', their texts are joined with a single space and their 'gk'
        items are combined without duplicates, in first-seen order.

    The input row is not modified: every emitted turn is a shallow copy, so
    the function can safely be re-run on the same row.
    """
    merged = []
    for turn in row['dialog']:
        if merged and merged[-1]["person"] == turn["person"]:
            last = merged[-1]
            last["text"] = last["text"] + " " + turn["text"]
            # dict.fromkeys de-duplicates while keeping a deterministic
            # (first-seen) order, unlike iterating over a set union.
            last["gk"] = list(dict.fromkeys([*last["gk"], *turn["gk"]]))
        else:
            # Copy so that later merges never write into the caller's dict.
            merged.append(dict(turn))
    return {"dialog": merged}

def get_gk_from_persona(row):
    """Replace each turn's 'gk' indices with the speaker's description entries.

    For every turn in row['dialog'] the speaker is looked up in
    row['persons'] via turn['person']; each index in turn['gk'] selects an
    item of that speaker's 'description'. The result keeps only the turn
    text, the selected description items (under 'gks') and the speaker's
    'gender'.

    Returns:
        {"dialog": [{"text": ..., "gks": [...], "gender": ...}, ...]}
    """
    dialog, personas = row['dialog'], row['persons']

    def resolve(turn):
        speaker = personas[turn['person']]
        facts = [speaker['description'][idx] for idx in turn['gk']]
        speaker_gender = speaker['gender']
        return {"text": turn['text'], "gks": facts, "gender": speaker_gender}

    return {"dialog": [resolve(turn) for turn in dialog]}

In [11]:
def next_answer_sampler(batch):
    """Expand a batch of dialogs into (history, gk, answer) samples.

    For every dialog and every turn index i >= 1 one sample is produced:
      * history - all turns before i, without their 'gks' key;
      * gk      - the 'gks' list of turn i, or ["<EmptyGK>"] when it is empty;
      * answer  - turn i itself, without its 'gks' key.

    Args:
        batch: mapping whose 'dialog' entry is a list of dialogs; each dialog
            is a list of turn dicts. Turns at index >= 1 must carry 'gks'.

    Returns:
        {"history": [...], "gk": [...], "answer": [...]} with one entry per
        sample, in dialog order.

    The turns in `batch` are not modified; 'gks' is left out of freshly built
    copies instead of being popped from the caller's dicts.
    """
    historys = []
    answers = []
    gks = []
    for dialog in batch['dialog']:
        # Built once per dialog and then sliced for every history.
        stripped = [
            {key: value for key, value in turn.items() if key != 'gks'}
            for turn in dialog
        ]
        for turn_i in range(1, len(dialog)):
            gk = dialog[turn_i]["gks"]
            historys.append(stripped[:turn_i])
            gks.append(gk if len(gk) > 0 else ["<EmptyGK>"])
            answers.append(stripped[turn_i])
    return {"history": historys, "gk": gks, "answer": answers}

def current_gk_sampler(batch):
    """Flatten a batch of dialogs into (turn, gk) samples, one per turn.

    Args:
        batch: mapping whose 'dialog' entry is a list of dialogs; each dialog
            is a list of turn dicts carrying a 'gks' list.

    Returns:
        {"turn": [...], "gk": [...]} where each turn is a copy of the
        original turn without its 'gks' key and each gk entry is the turn's
        'gks' list, or ['<EmptyGK>'] when that list is empty.

    The placeholder is a list (not a set), so every entry of "gk" has the
    same type. The turns in `batch` are not modified.
    """
    turns = []
    gks = []
    for dialog in batch['dialog']:
        for turn in dialog:
            gk = turn['gks']
            gks.append(gk if len(gk) > 0 else ['<EmptyGK>'])
            # Copy without 'gks' instead of popping from the caller's dict.
            turns.append({key: value for key, value in turn.items() if key != 'gks'})
    return {"turn": turns, "gk": gks}


def next_gk_sampler(batch):
    """Expand a batch of dialogs into (history, gk, all_gks) samples.

    For every dialog and every turn index i >= 1:
      * if turn i has a non-empty 'gks' list, one sample is emitted per item
        of that list, with `gk` set to the item and `all_gks` to the whole
        list;
      * otherwise a single sample is emitted with gk '<EmptyGK>' and
        all_gks ['<EmptyGK>'].
    In both cases `history` holds all turns before i, without their 'gks'
    key.

    Args:
        batch: mapping whose 'dialog' entry is a list of dialogs; each dialog
            is a list of turn dicts. Turns at index >= 1 must carry 'gks'.

    Returns:
        {"history": [...], "gk": [...], "all_gks": [...]} with one entry per
        sample.

    The turns in `batch` are not modified; 'gks' is left out of freshly built
    copies instead of being popped from the caller's dicts.
    """
    historys = []
    gks = []
    all_gks = []
    for dialog in batch['dialog']:
        # Built once per dialog and then sliced for every history.
        stripped = [
            {key: value for key, value in turn.items() if key != 'gks'}
            for turn in dialog
        ]
        for turn_i in range(1, len(dialog)):
            history = stripped[:turn_i]
            answer_gks = dialog[turn_i]['gks']
            if len(answer_gks) > 0:
                for gk in answer_gks:
                    historys.append(history)
                    gks.append(gk)
                    all_gks.append(answer_gks)
            else:
                historys.append(history)
                gks.append('<EmptyGK>')
                all_gks.append(['<EmptyGK>'])
    return {"history": historys, "gk": gks, "all_gks": all_gks}

In [12]:
# Load the raw train / validation splits from JSON-lines files.
# A third "test" split (../raw/all_dialogs.jsonl) is currently disabled.
train = datasets.Dataset.from_json('../raw/TolokaPersonaChat(train).jsonl')
val = datasets.Dataset.from_json('../raw/TolokaPersonaChat(val).jsonl')
ds = datasets.DatasetDict({"train": train, "val": val})

# Row-wise preprocessing: merge same-speaker turns, then resolve the 'gk'
# indices against the personas (the 'persons' column is dropped afterwards).
new_ds = ds.map(join_same_person)
new_ds = new_ds.map(
    get_gk_from_persona,
    remove_columns=["persons"],
)

# Batched samplers emit a different number of rows than they receive, so all
# original columns are removed and replaced by the sampler output.
next_answer_ds = new_ds.map(
    next_answer_sampler,
    remove_columns=new_ds['train'].column_names,
    batched=True,
    batch_size=2,
)
current_gk_ds = new_ds.map(
    current_gk_sampler,
    remove_columns=new_ds['train'].column_names,
    batched=True,
    batch_size=2,
)
next_gk_ds = new_ds.map(
    next_gk_sampler,
    remove_columns=new_ds['train'].column_names,
    batched=True,
    batch_size=2,
)

Using custom data configuration default-71063147b23792f3


Downloading and preparing dataset json/default to /home/posokhov@ad.speechpro.com/.cache/huggingface/datasets/json/default-71063147b23792f3/0.0.0...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 9177.91it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1431.01it/s]
Using custom data configuration default-e6f074a4bf715248


Dataset json downloaded and prepared to /home/posokhov@ad.speechpro.com/.cache/huggingface/datasets/json/default-71063147b23792f3/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset json/default to /home/posokhov@ad.speechpro.com/.cache/huggingface/datasets/json/default-e6f074a4bf715248/0.0.0...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2138.86it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 629.30it/s]
                                                        

Dataset json downloaded and prepared to /home/posokhov@ad.speechpro.com/.cache/huggingface/datasets/json/default-e6f074a4bf715248/0.0.0. Subsequent calls will reuse this data.


100%|██████████| 9018/9018 [00:02<00:00, 3283.38ex/s]
100%|██████████| 995/995 [00:00<00:00, 4130.32ex/s]
100%|██████████| 9018/9018 [00:02<00:00, 4107.94ex/s]
100%|██████████| 995/995 [00:00<00:00, 4097.61ex/s]
100%|██████████| 4509/4509 [00:06<00:00, 715.32ba/s]
100%|██████████| 498/498 [00:00<00:00, 767.97ba/s]
100%|██████████| 4509/4509 [00:03<00:00, 1291.10ba/s]
100%|██████████| 498/498 [00:00<00:00, 1297.67ba/s]
100%|██████████| 4509/4509 [00:05<00:00, 841.78ba/s]
100%|██████████| 498/498 [00:00<00:00, 816.46ba/s]


In [13]:
# Persist the three sampled datasets (built by the map calls in the previous
# cell), each into its own directory under ../processed/.
next_answer_ds.save_to_disk('../processed/next_answer')
current_gk_ds.save_to_disk('../processed/current_gk')
next_gk_ds.save_to_disk('../processed/next_gk')