In [2]:
import json
import tiktoken
import datasets
import langdetect
from semantic_text_splitter import TextSplitter
from string import Template
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# * read the raw JSON Lines file into a single "train" split
raw_data_file = "raw_data/together-long/arxiv.json"
dataset = datasets.load_dataset(
    "json",
    data_files=raw_data_file,
    split="train",
)

# display the schema and row count
dataset

Found cached dataset json (/share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Dataset({
    features: ['text', 'meta'],
    num_rows: 1558305
})

In [4]:
# * give every sample an "index" column holding its position in the dataset
def add_index(example, index):
    """Return the ``index`` field for one row: the position passed in by ``map``."""
    return {"index": index}


dataset = dataset.map(add_index, with_indices=True, num_proc=32)

dataset

Loading cached processed dataset at /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3e097897d43c2bd9_*_of_00032.arrow


Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 1558305
})

In [5]:
# * filter data by length
# Token counts are measured with the encoding tiktoken associates with this model.
tokenizer_model = "gpt-4"
enc = tiktoken.encoding_for_model(tokenizer_model)

def filter_length(examples, min_tokens=32_000, max_tokens=80_000, encoder=None):
    """Build a keep/drop mask for a batch of texts based on token count.

    Args:
        examples: Batch mapping with a "text" entry holding a list of strings.
        min_tokens: Smallest token count (inclusive) that is kept.
        max_tokens: Largest token count (inclusive) that is kept.
        encoder: Object exposing ``encode(text)`` whose result supports
            ``len()``. When omitted, the module-level ``enc`` tokenizer is used.

    Returns:
        A list of booleans, one per text in ``examples["text"]``: True when the
        text tokenizes to between ``min_tokens`` and ``max_tokens`` tokens,
        False otherwise or when the text could not be tokenized.

    Raises:
        KeyboardInterrupt, SystemExit: re-raised unchanged if they occur while
            tokenizing, so the run can still be stopped.
    """
    tokenizer = enc if encoder is None else encoder

    keep = []
    for text in examples["text"]:
        try:
            token_len = len(tokenizer.encode(text))
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or an interpreter shutdown.
            raise
        except BaseException:
            # Best effort: a text that cannot be tokenized is dropped rather
            # than aborting the batch. BaseException is kept on purpose --
            # NOTE(review): native tokenizers may surface internal panics as
            # BaseException subclasses; confirm before narrowing to Exception.
            keep.append(False)
            continue
        keep.append(min_tokens <= token_len <= max_tokens)

    return keep


# Apply the length mask batch by batch, spread over 32 worker processes.
dataset = dataset.filter(
    filter_length,
    batched=True,
    num_proc=32,
)

# display the remaining row count
dataset

Loading cached processed dataset at /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-4d7db6c8c04ca8a4_*_of_00032.arrow


Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 174488
})

In [6]:
# * filter non-English data
def is_english_text(example):
    """Return True when langdetect classifies ``example["text"]`` as English.

    Text for which langdetect cannot extract any language features is treated
    as non-English instead of aborting the whole filter pass.
    """
    # langdetect is randomised unless a seed is set, so the kept rows could
    # change between runs. The seed is set inside the predicate so that it is
    # also in effect in every worker process.
    langdetect.DetectorFactory.seed = 0
    try:
        return langdetect.detect(example["text"]) == "en"
    except langdetect.lang_detect_exception.LangDetectException:
        return False


# First pass: language detected from the text itself.
dataset = dataset.filter(is_english_text, num_proc=32)
# Second pass: language recorded in the sample's metadata.
dataset = dataset.filter(lambda x: x["meta"]["language"] == "en", num_proc=32)

dataset

Loading cached processed dataset at /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-cef39289bd9ea872_*_of_00032.arrow
Loading cached processed dataset at /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-beb7ddbb4054980d_*_of_00032.arrow


Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 173811
})

In [7]:
# * make sure the new data does not overlap with previously used data
used_dataset = datasets.load_dataset("json", data_files="backup_data/one_detail.paper.long.jsonl", split="train")

# Read the used indices once into a set. Evaluating `used_dataset["index"]`
# inside the predicate re-reads the whole column and scans it linearly for
# every single row being filtered.
used_indices = set(used_dataset["index"])

dataset = dataset.filter(lambda x: x["index"] not in used_indices, num_proc=32)

dataset

Found cached dataset json (/share/ninglu_shao/cache/json/default-47d0d87eaecc87dc/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 1000
})

In [9]:
# * random sample
# Keep the 100-row "test" side of a seeded split as the sample.
sample_split = dataset.train_test_split(test_size=100, seed=2024)
dataset = sample_split["test"]

dataset

Loading cached split indices for dataset at /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-f12633a1f1143e4d.arrow and /share/ninglu_shao/cache/json/default-ad1ebd167022847e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9f8130eb1385b0cc.arrow


Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 100
})

In [10]:
# * write the sampled rows to the backup JSON Lines file
backup_file = "backup_data/multi_details.paper.long.jsonl"
dataset.to_json(backup_file)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format: 100%|██████████| 1/1 [00:03<00:00,  3.04s/ba]


13784307

In [11]:
# * reload the sample from the backup JSON Lines file
dataset = datasets.load_dataset(
    "json",
    data_files="backup_data/multi_details.paper.long.jsonl",
    split="train",
)

dataset

Downloading and preparing dataset json/default to /share/ninglu_shao/cache/json/default-356f9b48332aafc2/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 861.96it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 46.92it/s]
                                                               

Dataset json downloaded and prepared to /share/ninglu_shao/cache/json/default-356f9b48332aafc2/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.




Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 100
})

In [12]:
def process_abstract(example):
    """Move the last "Abstract: " section of a document to the front.

    Everything from the last occurrence of the marker ``"Abstract: "`` to the
    end of the text is placed first, followed by a blank line and then the
    text that preceded the marker.

    Args:
        example: Mapping with a "text" entry holding the document string.

    Returns:
        A dict ``{"text": <reordered text>}``. When the marker does not occur,
        the text is returned unchanged.
    """
    marker = "Abstract: "
    text = example["text"]
    abstract_idx = text.rfind(marker)

    # rfind signals "not found" with -1. Slicing with -1 would move the final
    # character of the document to the front, so leave such texts untouched.
    if abstract_idx == -1:
        return {"text": text}

    abstract = text[abstract_idx:]
    body = text[:abstract_idx]

    return {"text": f"{abstract}\n\n{body}"}

# Reorder every sample's text, spread over 32 worker processes.
dataset = dataset.map(
    process_abstract,
    num_proc=32,
)

dataset

                                                                           

Dataset({
    features: ['text', 'meta', 'index'],
    num_rows: 100
})

In [13]:
# Prompt text for one paper. `${context}` is the only placeholder; it is filled
# in with the paper's text through `string.Template.substitute`. The prompt asks
# six fixed questions and requests the answers as a list of
# {'question': ..., 'answer': ...} objects.
template = """Context information is below.
---------------------
${context}
---------------------
Given the context information and not prior knowledge.
Generate content based on the below query.
You are a professional researcher. Your task is to answer the following questions. 
Question 1: What problem is the paper trying to solve?
Question 2: What is the main contribution of the paper?
Question 3: What relevant studies are mentioned in the paper?
Question 4: What method is used in the paper?
Question 5: What experiments are done in the paper?
Question 6: Summarize the main content of the paper.
You must return the result in JSON: [{'question': <question>, 'answer': <answer>}, ..., {'question': <question>, 'answer': <answer>}]"""


# Parse the prompt template once; only the substituted context changes per paper.
prompt_template = Template(template)

# One chat-completion request payload per paper, in dataset order.
jobs = []

# tqdm wraps the dataset itself rather than the enumerate() generator: a
# generator has no length, so the bar would show neither a total nor an ETA.
for idx, data in enumerate(tqdm(dataset)):
    prompt = prompt_template.substitute(context=data["text"])
    jobs.append({
        "model": "gpt-4-turbo-preview",
        "temperature": 0,
        "top_p": 1.0,
        "max_tokens": 4096,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        # The "user" field carries the position of the paper in this loop.
        "user": f"{idx}",
    })


# Write the request payloads as JSON Lines: one serialized job per line.
with open("data/multi_details.paper.long.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(job) + "\n" for job in jobs)

100it [00:00, 2434.60it/s]
