In [14]:
# Fetch the NLTK "punkt" and "punkt_tab" tokenizer packages into the local
# nltk_data directory; a package that is already present is reported as
# up to date. The return value of the last call is what the cell displays.
# NOTE(review): none of the visible cells call nltk directly — confirm these
# resources are still needed by this notebook.
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [28]:
import json
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Notebook configuration. Both paths are relative, so they resolve against the
# kernel's current working directory.

# JSON file with the chunk records that are loaded and grouped by question.
INPUT_PATH = "../data/train_chunks.json"
# JSON file that receives the best-matching ("golden") chunk per question.
OUTPUT_PATH = "../data/train_golden.json"
# Name of the Sentence-Transformers model used to embed questions and chunks.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

In [4]:
# Read the chunk file in one go and parse its JSON content into `chunk_data`.
with open(INPUT_PATH, encoding="utf-8") as chunk_file:
    chunk_data = json.loads(chunk_file.read())

In [31]:
# Bucket the chunk records by the id of the question they belong to.
chunks_by_question = defaultdict(list)
for record in chunk_data:
    question_id = record["question_id"]
    chunks_by_question[question_id].append(record)

# Load the sentence-embedding model selected in the configuration cell.
model = SentenceTransformer(EMBEDDING_MODEL)

In [7]:
# Accumulator for the per-question golden-chunk records.
golden_dataset = list()

In [32]:
# Build the golden dataset: for every question, keep the chunk whose embedding
# is most similar (cosine) to the question's embedding.
#
# The list is (re)created here so that re-running this cell rebuilds the
# dataset from scratch instead of appending duplicates to an earlier run.
golden_dataset = []

for qid, chunks in tqdm(chunks_by_question.items(), desc="Processing questions"):
    # Question-level fields are taken from the first chunk of the group
    # (assumes they are identical across a question's chunks — TODO confirm).
    first_chunk = chunks[0]
    question = first_chunk["question"]
    answer = first_chunk["answer"]
    golden_program = first_chunk.get("golden_program")  # None when absent

    q_embedding = model.encode(question, convert_to_tensor=True)

    chunk_texts = [chunk["chunk_text"] for chunk in chunks]
    chunk_embeddings = model.encode(chunk_texts, convert_to_tensor=True)

    # cos_sim returns a similarity matrix; row 0 is the single question
    # scored against each chunk of this group.
    similarities = util.cos_sim(q_embedding, chunk_embeddings)[0]
    best_idx = similarities.argmax().item()
    best_chunk = chunks[best_idx]

    golden_dataset.append({
        "question_id": qid,
        "question": question,
        "answer": answer,
        "golden_program": golden_program,
        "golden_chunk": best_chunk["chunk_text"],
        "score": similarities[best_idx].item()
    })

Processing questions:   0%|          | 0/1 [00:49<?, ?it/s]


In [26]:
# Serialize the collected golden records as indented UTF-8 JSON and report
# how many entries were written.
with open(OUTPUT_PATH, mode="w", encoding="utf-8") as out_file:
    json.dump(golden_dataset, fp=out_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(golden_dataset)} golden chunks to {OUTPUT_PATH}")

Saved 1 golden chunks to ../data/train_golden.json
