In [None]:
import json
import tiktoken
import datasets
import langdetect
from semantic_text_splitter import TextSplitter
from string import Template
from tqdm import tqdm

In [None]:
# * load dataset from jsonlines file
# Read the raw corpus with the generic "json" loader and ask for the "train"
# split so the result is bound directly to `dataset`.
dataset = datasets.load_dataset("json", data_files="raw_data/pile/dedup-md5-pile-books3.jsonl", split="train")

# Show the loaded dataset as the cell output.
dataset

In [None]:
# * filter data by length
# Tokenizer used to measure text length in tokens; tiktoken looks up the
# encoding that belongs to the "gpt-4" model name.
enc = tiktoken.encoding_for_model("gpt-4")

def filter_length(examples, min_tokens=64_000, max_tokens=80_000, encoder=None):
    """Build a keep-mask for a batch based on each text's token count.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings (the
            form passed in when filtering with ``batched=True``).
        min_tokens: Smallest token count that is kept (inclusive).
        max_tokens: Largest token count that is kept (inclusive).
        encoder: Object with a tiktoken-style
            ``encode(text, disallowed_special=...)`` method. Defaults to the
            notebook-level ``enc``.

    Returns:
        A list of booleans, one per text, True where
        ``min_tokens <= token count <= max_tokens``.
    """
    tokenizer = enc if encoder is None else encoder

    # disallowed_special=() makes literal special-token strings such as
    # "<|endoftext|>" inside a book encode as ordinary text. With tiktoken's
    # default, such text raises ValueError and aborts the whole filter run.
    return [
        min_tokens <= len(tokenizer.encode(text, disallowed_special=())) <= max_tokens
        for text in examples["text"]
    ]


# Keep only the examples whose mask entry from filter_length is True; the
# function receives batches (batched=True) and the work is split over
# 32 worker processes.
dataset = dataset.filter(filter_length, batched=True, num_proc=32)


dataset

In [None]:
# * filter non-English data
def is_english(example):
    """Return True when langdetect classifies the example's text as English.

    Texts that langdetect cannot classify (it raises LangDetectException) are
    treated as non-English and dropped, instead of aborting the whole
    multi-process filter.
    """
    # langdetect is non-deterministic unless seeded. Seed inside the function
    # so every worker process started via num_proc is seeded as well.
    langdetect.DetectorFactory.seed = 0
    try:
        return langdetect.detect(example["text"]) == "en"
    except langdetect.LangDetectException:
        return False


dataset = dataset.filter(is_english, num_proc=32)

dataset

In [None]:
# * make sure the data do not overlap with the books that were already used
used_dataset = datasets.load_dataset("json", data_files="backup_data/one_detail.book.jsonl", split="train")

# Collect the already-used hashes once into a set. Looking the column up
# inside the predicate would re-read and linearly scan it for every example,
# and would make the predicate carry the whole dataset into each worker.
used_md5 = set(used_dataset["md5"])

dataset = dataset.filter(lambda x: x["md5"] not in used_md5, num_proc=32)

dataset

In [None]:
# * random sample
# Use the 150-example "test" side of a split as the sample; the fixed seed
# keeps the selection the same across runs.
dataset = dataset.train_test_split(test_size=150, seed=2024)["test"]

dataset

In [None]:
# * save the sampled data as a backup
dataset.to_json("backup_data/bio.book.jsonl")

In [None]:
# Reload the sample from the backup file so the following steps work from the
# on-disk copy.
dataset = datasets.load_dataset("json", data_files="backup_data/bio.book.jsonl", split="train")

dataset

In [None]:
# Prompt for the character-biography summarization task. ${context} is the
# placeholder that string.Template substitution fills with the book text.
# NOTE(review): the output example uses single quotes, which is not strict
# JSON -- confirm that downstream parsing of the model output tolerates it.
template = """Context information is below.
---------------------
${context}
---------------------
Given the context information and not prior knowledge.
Generate content based on the below query.
You are a Book Summarizer. Your task is to summarize the document. The task has 2 steps. In step 1, you should find the main characters. In step 2, you should summarize main characters' biography. The summary should be comprehensive and accurately reflect the main message.
You must return the result in JSON: [{'character': <characters>, 'summary': <summary>}, ..., {'character': <characters>, 'summary': <summary>}]"""

# * organize the data format: one chat-completion request per book
# Parse the prompt template once instead of once per book.
prompt_template = Template(template)

jobs = []

# Wrap the dataset itself (not the enumerate object) so tqdm can read its
# length and show a progress bar with a total and an ETA.
for idx, data in enumerate(tqdm(dataset)):
    prompt = prompt_template.substitute(context=data["text"])
    jobs.append({
        "model": "gpt-4-turbo-preview",
        "temperature": 0,
        "top_p": 1.0,
        "max_tokens": 4096,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        # The dataset row index is stored in the request's "user" field.
        "user": f"{idx}",
    })

# * save, and then use Openai API script to generate data
# Serialize every request as one JSON document per line (JSON Lines).
with open("data/bio.book.jsonl", "w") as f:
    f.writelines(json.dumps(job) + "\n" for job in jobs)