In [1]:
import sys
import os
import json

from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets

# Make the repository root (two directory levels above the current working
# directory) importable so that the `src.*` packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    # Prepend so the project's own modules take precedence on lookup.
    sys.path.insert(0, project_root)

from src.prompts.llama_prompts import MathQAPrompt, ContextualQAPrompt
from src.utils.data_generation import split_dataset, concurrent_data_generation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run-wide settings shared by the dataset sections of this notebook.
# PROC_NUM is also the chunk count handed to split_dataset.
PROC_NUM = 3 # number of processes to use for data generation
# DATA_SPLIT selects the raw input file and names the output directory/file.
DATA_SPLIT = "train" # "train" or "test"

### Model setup

In [3]:
# Hugging Face model id; it is also recorded in each response's "additional_info".
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably this checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)


### UMWP

In [4]:
# UMWP: read the raw JSON file for the configured split, draw a small
# reproducible sample, and cut it into PROC_NUM chunks.
dataset_name = "UMWP"
path = f"../../dataset/raw_data/{dataset_name}/{DATA_SPLIT}.json"

# The loaded file is indexed by "train" whatever DATA_SPLIT is set to.
# NOTE(review): presumably "train" is the loader's default split name for
# bare data_files — confirm against the datasets documentation.
data = datasets.load_dataset("json", data_files=path)["train"]

data = data.shuffle(seed=42)
data = data.select(range(5))  # take smaller sample for testing
data_chunks = split_dataset(data, PROC_NUM)

In [5]:
# Static fields for the UMWP run, passed to concurrent_data_generation as
# `response_dict_format`.
response_dict = {
    "task_info": {"type": "QA", "dataset": dataset_name},
    "additional_info": {"model": model_name, "domain": "Math"},
}

In [6]:
def process_data_chunk(data_chunk: dict) -> tuple[list[dict], list[dict]]:
    """Turn a column-oriented UMWP chunk into per-example records.

    Args:
        data_chunk: Mapping with the keys "question", "answer", "answerable"
            and "source", each holding a sequence indexed by example position.

    Returns:
        A pair ``(model_input, additional_info)`` of equally long lists:
        ``model_input`` holds ``{"query": question}`` dicts and
        ``additional_info`` holds the question together with its answer,
        answerable flag and source.
    """
    prompts: list[dict] = []
    metadata: list[dict] = []
    for idx, question in enumerate(data_chunk["question"]):
        prompts.append({"query": question})
        metadata.append(
            {
                "question": question,
                "answer": data_chunk["answer"][idx],
                "answerable": data_chunk["answerable"][idx],
                "source": data_chunk["source"][idx],
            }
        )
    return prompts, metadata

In [7]:
# Generate model responses for the UMWP chunks with the math-QA prompt.
# The bare top-level `await` relies on the notebook kernel's async support;
# it is not valid in a plain Python script.
# NOTE(review): prompt_repetitions=4 presumably requests four generations per
# prompt — confirm against concurrent_data_generation.
all_results = await concurrent_data_generation(
        model=model,
        tokenizer=tokenizer,
        prompt_class=MathQAPrompt,
        data_chunks=data_chunks,
        response_dict_format=response_dict,
        data_processing_function=process_data_chunk,
        prompt_repetitions=4,
)

In [8]:
# Preview the first five generated UMWP records.
all_results[:5]

[{'task_info': {'type': 'QA', 'dataset': 'UMWP'},
  'additional_info': {'model': 'meta-llama/Llama-3.2-1B-Instruct',
   'domain': 'Math',
   'question': 'Because of the decision Sofia asked the students to suggest specific types of food. If 479 students suggested adding mashed potatoes while 489 suggested adding bacon to the menu.How many more students suggested bacon than those that suggested mashed potatoes?',
   'answer': [10.0],
   'answerable': True,
   'source': 'SVAMP'},
  'input': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an expert-level AI mathematician and logician. Your task is to solve the following math problem.\n\nFirst, critically assess the problem to determine if it is solvable. If the question is unanswerable (e.g., it is illogical, contains contradictions, or is missing essential information), you must clearly state that it cannot be answered and provide a brief explanation. Do not attempt to solve it.\n\nIf the problem is solvable, foll

In [12]:
# Persist the UMWP generations as pretty-printed JSON.
output_path = f"../../dataset/raw_model_responses/{DATA_SPLIT}/{DATA_SPLIT}_{dataset_name}.json"
# Create the per-split output directory on first use; without this a fresh
# checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Pin the encoding so the file does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=4)

### SQUAD

In [13]:
# SQuAD: read the raw parquet file for the configured split, draw a small
# reproducible sample, and cut it into PROC_NUM chunks.
dataset_name = "rajpurkar_squad"
path = f"../../dataset/raw_data/{dataset_name}/{DATA_SPLIT}.parquet"

# The loaded file is indexed by "train" whatever DATA_SPLIT is set to.
# NOTE(review): presumably "train" is the loader's default split name for
# bare data_files — confirm against the datasets documentation.
data = datasets.load_dataset("parquet", data_files=path)["train"]

data = data.shuffle(seed=42)
data = data.select(range(10))  # take smaller sample for testing
data_chunks = split_dataset(data, PROC_NUM)

Generating train split: 87599 examples [00:00, 679019.62 examples/s]


In [14]:
# Static fields for the SQuAD run, passed to concurrent_data_generation as
# `response_dict_format`.
response_dict = {
    "task_info": {"type": "Contextual QA", "dataset": dataset_name},
    "additional_info": {"model": model_name},
}

In [15]:
def process_data_chunk(data_chunk: dict) -> tuple[list[dict], list[dict]]:
    """Turn a column-oriented SQuAD chunk into per-example records.

    Note: this rebinds the name ``process_data_chunk`` that the UMWP section
    defined earlier, so the cells must be run in notebook order.

    Args:
        data_chunk: Mapping with the keys "question", "context", "answers"
            and "title", each holding a sequence indexed by example position.
            Every "answers" entry is itself a mapping with a "text" key.

    Returns:
        A pair ``(model_input, additional_info)`` of equally long lists:
        ``model_input`` holds the query plus its context, and
        ``additional_info`` holds the question, context, answer text and title.
    """
    prompts: list[dict] = []
    metadata: list[dict] = []
    for idx, question in enumerate(data_chunk["question"]):
        passage = data_chunk["context"][idx]
        prompts.append({"query": question, "context": passage})
        metadata.append(
            {
                "question": question,
                "context": passage,
                "answer": data_chunk["answers"][idx]["text"],
                "title": data_chunk["title"][idx],
            }
        )
    return prompts, metadata

In [None]:
# Generate model responses for the SQuAD chunks with the contextual-QA prompt.
# The bare top-level `await` relies on the notebook kernel's async support;
# it is not valid in a plain Python script.
# NOTE(review): prompt_repetitions=4 presumably requests four generations per
# prompt — confirm against concurrent_data_generation.
all_results = await concurrent_data_generation(
        model=model,
        tokenizer=tokenizer,
        prompt_class=ContextualQAPrompt,
        data_chunks=data_chunks,
        response_dict_format=response_dict,
        data_processing_function=process_data_chunk,
        prompt_repetitions=4,
)

In [18]:
# Preview the first five generated SQuAD records.
all_results[:5]

[{'task_info': {'type': 'Contextual QA', 'dataset': 'rajpurkar_squad'},
  'additional_info': {'model': 'meta-llama/Llama-3.2-1B-Instruct',
   'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?',
   'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.',
   'answer': ['84%'],
   'title': 'Egypt'},
  'input': "<|begin_of_te

In [19]:
# Persist the SQuAD generations as pretty-printed JSON.
output_path = f"../../dataset/raw_model_responses/{DATA_SPLIT}/{DATA_SPLIT}_{dataset_name}.json"
# Create the per-split output directory on first use; without this a fresh
# checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Pin the encoding so the file does not depend on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=4)