In [1]:
import os, json, random
from pathlib import Path
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def dump_json(path: Path, obj):
    """Serialize ``obj`` as pretty-printed JSON at ``path``.

    Missing parent directories are created first. The file is written as
    UTF-8 with a two-space indent, and non-ASCII characters are emitted
    verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        path: Destination file; any existing file is overwritten.
        obj: Any JSON-serializable object.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(obj, sink, indent=2, ensure_ascii=False)

def tokenize_dataset(ds, tok, max_len):
    """Filter out blank-text rows, tokenize the remainder, and set a torch format.

    Args:
        ds: Dataset-like object exposing ``filter``, ``map``, ``column_names``
            and ``set_format``; rows are expected to carry a ``"text"`` field.
        tok: Callable tokenizer invoked on a batch of texts.
        max_len: Forwarded to the tokenizer as ``max_length``.

    Returns:
        The mapped dataset, whose original columns were dropped and whose
        format is set to ``"torch"`` over ``input_ids``, ``attention_mask``
        and ``labels``.
    """
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    tok_options = dict(
        truncation=True,
        padding=True,
        max_length=max_len,
        return_attention_mask=True,
    )

    def _has_text(ex):
        # Keeps a row only when "text" is present, truthy, and not whitespace-only.
        text = ex.get("text", None)
        return text and len(text.strip()) > 0

    def _encode(batch):
        encoded = tok(batch["text"], **tok_options)
        # Labels are a shallow copy of input_ids.
        # NOTE(review): if the tokenizer pads, the pad ids end up in the labels
        # too -- confirm that is intended for the downstream loss.
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    non_empty = ds.filter(_has_text)
    tokenized = non_empty.map(
        _encode,
        batched=True,
        remove_columns=non_empty.column_names,
    )
    tokenized.set_format(type="torch", columns=tensor_columns)
    return tokenized

def _read_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

In [13]:
# Load the file named "test.json" from each of the three data directories,
# in the order train, final, val.
# NOTE(review): a later cell indexes these three objects in lockstep, so they
# are presumably parallel lists of equal length -- confirm.
train_file, final_file, val_file = (
    _read_json(Path(split_dir) / "test.json")
    for split_dir in ("data/train", "data/final", "data/val")
)

In [None]:
seeds = [0, 1, 2, 3, 4]
file_paths = [f'wiki_json/train/train_shadow_{seed}.json' for seed in seeds]

# shadow_datas maps a shadow index (position in file_paths) to the list of
# "text" values read from that shadow's JSON file.
shadow_datas = {}
for i, shadow_path in enumerate(file_paths):
    records = _read_json(Path(shadow_path))
    shadow_datas[i] = [record['text'] for record in records]

In [42]:
# Build per-shadow membership labels and inject / strip each record triple.
#
# For record index i, shadow (i % n_shadows) is the held-out shadow: every copy
# of train_file[i], val_file[i] and final_file[i] is removed from its text list
# and its label for i is 0. Every other shadow gets the three entries appended
# and a label of 1. train_labels[j][i] is therefore the label of record i for
# shadow j.
#
# NOTE: shadow_datas is mutated in place, so this cell is not idempotent.
# Re-run the cell that loads shadow_datas before running this one again,
# otherwise the appended entries are duplicated.
n_shadows = len(shadow_datas)
n_records = len(train_file)

# Validate before touching shadow_datas: a short val_file / final_file would
# otherwise raise IndexError midway through the loop and leave shadow_datas
# half-modified.
if n_shadows == 0:
    raise ValueError("shadow_datas is empty; load the shadow datasets first")
if len(val_file) < n_records or len(final_file) < n_records:
    raise ValueError(
        f"val_file ({len(val_file)}) and final_file ({len(final_file)}) must each "
        f"have at least as many entries as train_file ({n_records})"
    )

train_labels = [[] for _ in range(n_shadows)]

for i in range(n_records):
    triple = (train_file[i], val_file[i], final_file[i])
    held_out = i % n_shadows
    for j in range(n_shadows):
        if j == held_out:
            # One filtering pass replaces the repeated `in` / `remove` scans.
            # Slice assignment keeps the same list object, and the tuple
            # membership test compares whole entries (not substrings).
            shadow_datas[j][:] = [
                text for text in shadow_datas[j] if text not in triple
            ]
            train_labels[j].append(0)
        else:
            train_labels[j].append(1)
            shadow_datas[j].extend(triple)


In [49]:
# Report, per shadow, the total number of texts and the number of distinct
# texts, then wrap every text as {"text": ...}.
# NOTE: the wrap happens in place, so running this cell a second time fails on
# set(): the entries are dicts by then, and dicts are unhashable.
for i in range(5):
    texts = shadow_datas[i]
    print(i, len(texts), len(set(texts)))
    shadow_datas[i] = [{"text": text} for text in texts]

0 14574 13714
1 14778 14704
2 14783 14701
3 14781 14690
4 14777 14694


In [51]:
out_dir = Path('data/shadow')

# Write each shadow's records to data/shadow/train_shadow_<index>.json.
for i in range(5):
    destination = out_dir / f"train_shadow_{i}.json"
    dump_json(destination, shadow_datas[i])

In [52]:
import numpy as np

# Persist the per-shadow label lists as a single array.
# NOTE(review): np.save does not create directories; data/shadow is expected
# to exist already (the shadow-dump cell writes into it) -- confirm run order.
labels_array = np.array(train_labels)
np.save("data/shadow/labels.npy", labels_array)