In [1]:
import json
import random
import sys
import os

# Resolve the directory three levels above the current working directory and
# put it at the front of the import search path (once only), so that the
# `src.*` imports that follow can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if sys.path.count(project_root) == 0:
    sys.path[:0] = [project_root]

from src.agents.math_qa_correction_agent import MathQACorrectionAgent
from src.utils.data_generation import split_dataset, concurrent_data_postprocessing

# Seed Python's global `random` generator so repeated runs draw the same values.
# NOTE(review): no visible cell calls `random` directly; presumably an imported
# helper relies on it — confirm, otherwise this seed has no effect.
random.seed(2)

In [2]:
# Run configuration — everything a reader might tune lives in this cell.
# Number of sub-splits each major chunk is divided into (second argument to
# split_dataset inside the processing loop).
PROC_NUM = 2
# Forwarded to MathQACorrectionAgent as `error_detection_only`.
# NOTE(review): the flag's exact effect is defined in the agent class — confirm there.
ERROR_DETECTION_ONLY = False
# Dataset split name; interpolated into both the input and the output file paths.
SPLIT = "train"
# Number of major chunks the full dataset is split into; chunks are processed sequentially.
NUM_MAJOR_CHUNKS = 10

In [3]:
# Load the raw model responses for the selected split.
# The encoding is passed explicitly: without it, open() uses the platform's
# locale encoding, which can mis-decode non-ASCII text in the JSON file.
with open(
    f"../../../dataset/raw_model_responses/{SPLIT}/{SPLIT}_UMWP.json",
    "r",
    encoding="utf-8",
) as f:
    json_data = json.load(f)

In [4]:
# json_data = json_data[:100]

In [5]:
# Partition the loaded records into major chunks; the processing loop below
# handles them one at a time.
# NOTE(review): presumably yields NUM_MAJOR_CHUNKS pieces; split_dataset's exact
# behavior (ordering, shuffling, remainder handling) is not visible here — confirm.
data_chunks = split_dataset(json_data, NUM_MAJOR_CHUNKS)

In [6]:
# Build the correction agent, forwarding ERROR_DETECTION_ONLY as its
# `error_detection_only` option. The same agent instance is reused for every chunk.
# NOTE(review): what the flag changes is defined in MathQACorrectionAgent — not visible here.
agent = MathQACorrectionAgent(error_detection_only=ERROR_DETECTION_ONLY)

In [7]:
def extract_args(item: dict) -> dict:
    """Map one raw dataset record to a flat dict of named fields.

    This function is handed to ``concurrent_data_postprocessing`` as its
    ``extract_args`` callback; how the returned mapping is consumed is
    defined there.

    Args:
        item: One record of the loaded dataset. It must provide the keys
            ``"input"`` and ``"response"``, plus an ``"additional_info"``
            mapping containing ``"question"``, ``"answer"`` and
            ``"answerable"``.

    Returns:
        A dict with the keys ``"input"``, ``"question"``, ``"answer"``,
        ``"is_answerable"`` and ``"responses"``. (The return annotation was
        previously ``list[str]``, which did not match the returned value.)

    Raises:
        KeyError: If any of the required keys is missing from ``item``.
    """
    return {
        "input": item["input"],
        "question": item["additional_info"]["question"],
        "answer": item["additional_info"]["answer"],
        # Renamed on the way out: record key "answerable" -> "is_answerable".
        "is_answerable": item["additional_info"]["answerable"],
        # Renamed on the way out: record key "response" -> "responses".
        "responses": item["response"],
    }

In [8]:
final_result = []

for index, chunk in enumerate(data_chunks):
    print(f"Processing chunk {index + 1} of {len(data_chunks)}")
    proc_data_split = split_dataset(chunk, PROC_NUM)
    data = await concurrent_data_postprocessing(
        agent=agent, 
        data_chunks=proc_data_split, 
        extract_args=extract_args,
        max_concurrency=4,
    )
    final_result.extend(data)
    # break

Processing chunk 1 of 10
Processing chunk 2 of 10
Processing chunk 3 of 10
Processing chunk 4 of 10
Processing chunk 5 of 10
Processing chunk 6 of 10
Processing chunk 7 of 10
Processing chunk 8 of 10
Processing chunk 9 of 10
Processing chunk 10 of 10


In [9]:
# Spot-check a single post-processed record to eyeball the output structure.
# NOTE(review): index 4 looks like an arbitrary pick; this raises IndexError if
# fewer than five records were produced.
final_result[4]

{'input': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a meticulous AI mathematician. Your task is to solve the following math problem.\n\nFollow these steps carefully:\n1. **Analyze the problem:** First, understand the given information and what is being asked.\n2. **Assess solvability:** Determine if the problem is solvable. A problem might be unsolvable if it's illogical, contains contradictions, or lacks necessary information.\n3. **Solve or Explain:**\n   - **If solvable:** Provide a step-by-step solution, showing all your reasoning and calculations, and then clearly state the final numerical answer.\n   - **If unsolvable:** State that the problem cannot be answered and provide a concise explanation.\n\nYour entire response should only contain the solution and final answer (or the explanation for unsolvable problems). Do not add any conversational headers or extraneous text.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDroid owns a coffee shop. H

In [10]:
# Persist the post-processed records for the selected split.
output_path = f"../../../dataset/processed_data/{SPLIT}/UMWP_processed.json"
# Create the target directory first, so the results of the long processing run
# above are not lost to a missing folder.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Explicit encoding keeps the written file independent of the platform locale.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=4)