In [None]:
import json
import tiktoken
import datasets
import langdetect
from semantic_text_splitter import TextSplitter
from string import Template
from tqdm import tqdm

In [None]:
# * read the raw arXiv samples from the JSON-lines file into a single "train" split
dataset = datasets.load_dataset(
    "json",
    data_files="raw_data/together-long/arxiv.json",
    split="train",
)

dataset

In [None]:
# * add an "index" column holding each sample's position in the loaded dataset
dataset = dataset.map(
    lambda example, position: {"index": position},
    with_indices=True,
    num_proc=32,
)

dataset

In [None]:
# * filter data by length
# tokenizer (the encoding tiktoken maps to "gpt-4") used to count tokens per document
enc = tiktoken.encoding_for_model(model_name="gpt-4")

def filter_length(examples, encoder=None, min_tokens=32_000, max_tokens=80_000):
    """Batched filter predicate: keep texts whose token count is within bounds.

    Args:
        examples: a batch mapping with a "text" entry holding a list of documents.
        encoder: object exposing ``encode(text)`` that returns a sized token
            sequence. Defaults to the notebook-level ``enc`` tokenizer.
        min_tokens: inclusive lower bound on the token count.
        max_tokens: inclusive upper bound on the token count.

    Returns:
        A list of booleans, one per entry of ``examples["text"]``. An entry is
        ``True`` when ``min_tokens <= token_count <= max_tokens``; texts that
        cannot be tokenized are marked ``False``.
    """
    if encoder is None:
        # fall back to the tokenizer defined at notebook level
        encoder = enc

    keep = []
    for text in examples["text"]:
        try:
            token_len = len(encoder.encode(text))
        except Exception:
            # A sample that fails to tokenize is dropped rather than aborting the
            # whole run. `Exception` (not a bare `except`) keeps KeyboardInterrupt
            # and SystemExit propagating so the cell can still be interrupted.
            keep.append(False)
            continue
        keep.append(min_tokens <= token_len <= max_tokens)

    return keep


# run the length predicate over batches of samples, spread across 32 worker processes
dataset = dataset.filter(
    filter_length,
    batched=True,
    num_proc=32,
)


dataset

In [None]:
# * filter non-English data
def is_english(example):
    """Return True when both the metadata and langdetect say the text is English.

    The metadata lookup runs first because it is a cheap dictionary access,
    whereas language detection has to scan the whole document.
    """
    if example["meta"]["language"] != "en":
        return False

    # langdetect is non-deterministic unless a seed is fixed. Set it here, in the
    # function itself, so it also applies inside each worker process.
    langdetect.DetectorFactory.seed = 0
    try:
        return langdetect.detect(example["text"]) == "en"
    except langdetect.LangDetectException:
        # raised when the text has no detectable language features; treat the
        # sample as non-English instead of aborting the whole filter
        return False


dataset = dataset.filter(is_english, num_proc=32)

dataset

In [None]:
# * make sure the data do not overlap with the samples that were already used
used_dataset = datasets.load_dataset("json", data_files="backup_data/one_detail.paper.long.jsonl", split="train")

# Build the lookup once: reading used_dataset["index"] inside the predicate would
# re-materialise the whole column and scan it linearly for every single sample.
used_indices = set(used_dataset["index"])

dataset = dataset.filter(lambda x: x["index"] not in used_indices, num_proc=32)

dataset

In [None]:
# * random sample: keep the 100-sample "test" side of a seeded split
dataset = dataset.train_test_split(
    test_size=100,
    seed=2024,
)["test"]

dataset

In [None]:
# * save data as the backup
# Writes the sampled subset to disk; the next cell reloads it from this same path.
dataset.to_json("backup_data/multi_details.paper.long.jsonl")

In [None]:
# * reload the sampled subset from the backup file
dataset = datasets.load_dataset(
    "json",
    data_files="backup_data/multi_details.paper.long.jsonl",
    split="train",
)

dataset

In [None]:
def process_abstract(example):
    """Move the abstract from the end of a paper's text to the front.

    The text is split at the last occurrence of the marker "Abstract: ".
    Everything from the marker onwards is treated as the abstract and placed
    first, followed by a blank line and the preceding body.

    Args:
        example: a mapping with a "text" entry holding the paper as a string.

    Returns:
        ``{"text": <abstract + "\\n\\n" + body>}``, or the text unchanged when the
        marker is not present.
    """
    text = example["text"]
    abstract_idx = text.rfind("Abstract: ")
    if abstract_idx == -1:
        # Marker not found: slicing with -1 would move the last character of the
        # document to the front, so leave the text untouched instead.
        return {"text": text}

    abstract = text[abstract_idx:]
    body = text[:abstract_idx]

    return {"text": f"{abstract}\n\n{body}"}

# rewrite every sample's "text" with process_abstract, using 32 worker processes
dataset = dataset.map(
    process_abstract,
    num_proc=32,
)

dataset

In [None]:
# Prompt text in string.Template syntax: `${context}` is the only placeholder and is
# filled with a paper's text when the request jobs are built. The prompt asks six
# fixed questions and requests the answers as a JSON list.
template = """Context information is below.
---------------------
${context}
---------------------
Given the context information and not prior knowledge.
Generate content based on the below query.
You are a professional researcher. Your task is to answer the following questions. 
Question 1: What problem is the paper trying to solve?
Question 2: What is the main contribution of the paper?
Question 3: What relevant studies are mentioned in the paper?
Question 4: What method is used in the paper?
Question 5: What experiments are done in the paper?
Question 6: Summarize the main content of the paper.
You must return the result in JSON: [{'question': <question>, 'answer': <answer>}, ..., {'question': <question>, 'answer': <answer>}]"""


# Parse the prompt template once; only the substituted context differs per sample.
prompt_template = Template(template)

jobs = []

# Wrap the dataset itself (not the enumerate generator) so tqdm can read its length
# and show a total and an ETA.
for idx, data in enumerate(tqdm(dataset)):
    prompt = prompt_template.substitute(context=data["text"])
    jobs.append({
        "model": "gpt-4-turbo-preview",
        "temperature": 0,
        "top_p": 1.0,
        "max_tokens": 4096,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        # position of the sample in `dataset`, stored as a string in the request
        "user": f"{idx}",
    })


# one JSON-encoded request per line (JSON-lines)
with open("data/multi_details.paper.long.jsonl", "w") as f:
    for job in jobs:
        f.write(json.dumps(job) + "\n")