In [1]:
import json

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from tqdm import tqdm

In [2]:
INPUT_PATH = "../data/train.json"
OUTPUT_PATH = "../data/train_chunks.json"
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 200

In [3]:
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

In [4]:
parser = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [5]:
all_chunks = []

In [20]:
for doc_id, item in enumerate(tqdm(raw_data, desc="Processing items")):
    paragraph_text = item['Context']
    doc = Document(
        text=paragraph_text,
        metadata={'question_id': doc_id}
    )
    nodes = parser.get_nodes_from_documents([doc])

    print(len(nodes))
    for idx, node in enumerate(nodes):
        chunk = {
            'question_id': doc_id,
            'chunk_id': f"{doc_id}_chunk_{idx}",
            'chunk_text': node.get_content(),
            'question': item['Question'],
            'answer': item['Answer'],
            'golden_program': item['Program']
        }
        all_chunks.append(chunk)
    print(all_chunks[-1])
    break

Processing items:   0%|          | 0/5735 [00:00<?, ?it/s]

130
{'question_id': 0, 'chunk_id': '0_chunk_129', 'chunk_text': "1-7819)\tas\tfiled\twith\tthe\tCommission\ton\tNovember\t30,\t2007\tand\tincorporated\therein\tby reference. †12.1 Computation\tof\tConsolidated\tRatios\tof\tEarnings\tto\tFixed\tCharges. †21 Subsidiaries\tof\tthe\tCompany.\n\n†23 Consent\tof\tErnst\t&\tYoung\tLLP,\tIndependent\tRegistered\tPublic\tAccounting\tFirm.\n\n\t101.\tDEF XBRL\tDefinition\tLinkbase\tDocument\n\n† Filed\therewith.\n\n* Management\tcontracts\tand\tcompensatory\tplan\tor\tarrangements\trequired\tto\tbe\tfiled\tas\tan\tExhibit\tpursuant\tto\tItem\t15(b)\tof\tForm\t10-K.\n\nAttached\tas\tExhibit\t101\tto\tthis\treport\tare\tthe\tfollowing\tformatted\tin\tXBRL\t(Extensible\tBusiness\tReporting\tLanguage):\t(i)\tConsolidated\tStatements\tof\tIncome\tfor\tthe\tyears\tended\tOctober\t31, 2009,\tNovember\t1,\t2008,\tand\tNovember\t3,\t2007,\t(ii)\tConsolidated\tBalance\tSheets\tat\tOctober\t31,\t2009\tand\tNovember\t1,\t2008,\t(iii)\tConsolidated\tStatemen




In [21]:
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=4)
    print(f"Saved {len(all_chunks)} chunks to {OUTPUT_PATH}")

Saved 260 chunks to ../data/train_chunks.json
