In [18]:
import os
import json
import numpy as np
import random
import re
import copy
from functools import partial


In [19]:

def dump_jsonl(data, file_path):
    """Write ``data`` to ``file_path`` as JSON Lines (one JSON document per line).

    Args:
        data: Iterable of JSON-serialisable items; each becomes one line.
        file_path: Destination path. Missing parent directories are created.

    The records are joined with ``"\\n"`` and no trailing newline is written,
    so an empty ``data`` produces an empty file.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # Writing into a not-yet-existing output folder should not fail.
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of relying on the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(json.dumps(item, ensure_ascii=False) for item in data))
        

In [20]:
def generate_list_data(length):
    """Return ``length`` random integers drawn from a randomly chosen range.

    The lower bound is picked with ``random.randint(-40, 20)`` and the upper
    bound with ``random.randint(50, 100)``; the values themselves come from
    ``np.random.randint(lower, upper, length)`` and are returned as a plain
    Python list.
    """
    # Keep the draw order (lower bound, then upper bound, then the values) so
    # the sequence of RNG calls is stable.
    lower_bound = random.randint(-40, 20)
    upper_bound = random.randint(50, 100)
    return np.random.randint(lower_bound, upper_bound, length).tolist()

def get_largest_number(data, order):
    """Return the ``order``-th largest distinct value in ``data``.

    Args:
        data: Iterable of hashable, mutually comparable values.
        order: 1-based rank among the distinct values (1 = largest).

    Returns:
        The requested value, or ``None`` when ``order`` is smaller than 1 or
        larger than the number of distinct values.
    """
    distinct_desc = sorted(set(data), reverse=True)

    # Reject ranks below 1 as well: order=0 would otherwise index [-1] and
    # silently return the *smallest* value.
    if order < 1 or order > len(distinct_desc):
        return None

    return distinct_desc[order - 1]

def get_smallest_number(data, order):
    """Return the ``order``-th smallest distinct value in ``data``.

    Args:
        data: Iterable of hashable, mutually comparable values.
        order: 1-based rank among the distinct values (1 = smallest).

    Returns:
        The requested value, or ``None`` when ``order`` is smaller than 1 or
        larger than the number of distinct values.
    """
    distinct_asc = sorted(set(data))

    # Reject ranks below 1 as well: order=0 would otherwise index [-1] and
    # silently return the *largest* value.
    if order < 1 or order > len(distinct_asc):
        return None

    return distinct_asc[order - 1]

def get_median(data):
    """Return the median of ``data`` as computed by ``np.median``."""
    median_value = np.median(data)
    return median_value
    

In [21]:
# Each task pairs the answer-prompt suffix appended to a question with the
# function that computes the expected answer from the list.
tasks = [("\nThe median of the list is: ", get_median)]

# Ranked "largest" tasks: the position of a prompt in the tuple is its rank.
tasks += [
    (prompt, partial(get_largest_number, order=rank))
    for rank, prompt in enumerate(
        (
            "\nThe largest number of the list is: ",
            "\nThe second largest number of the list is: ",
            "\nThe third largest number of the list is: ",
        ),
        start=1,
    )
]

# Ranked "smallest" tasks, built the same way.
tasks += [
    (prompt, partial(get_smallest_number, order=rank))
    for rank, prompt in enumerate(
        (
            "\nThe smallest number of the list is: ",
            "\nThe second smallest number of the list is: ",
            "\nThe third smallest number of the list is: ",
        ),
        start=1,
    )
]

In [22]:
# Seed both random sources used by the generator (the stdlib ``random`` module
# and NumPy's global RNG) so a Restart & Run All reproduces the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

data_content = []
num_generate = 50

# Option labels, in the order the options are stored.
choices = ['A', 'B', 'C', 'D']

while len(data_content) < num_generate:
    sequence_length = random.randint(10, 50)
    data = generate_list_data(sequence_length)
    input_query, task_func = random.choice(tasks)

    # Compute the correct answer first: the ranked tasks return None when the
    # list has too few distinct values, in which case the sample is re-drawn
    # instead of crashing on ``None + int`` below.
    correct_answer = task_func(data)
    if correct_answer is None:
        continue

    # Extract the target description (e.g. "second largest number") from the
    # prompt. Capturing only the text between "The " and " of" avoids the
    # duplicated article ("What is the the median ...").
    target_number = re.search(r"The (.+) of", input_query).group(1).lower()
    prefix = f"What is the {target_number} in the following list?"
    context = json.dumps(data)

    # Generate the question
    question = f"{prefix}\n{context}\n{input_query}"

    # Generate three distinct incorrect answers within +/-10 of the correct one
    incorrect_answers = set()
    while len(incorrect_answers) < 3:
        option = correct_answer + random.randint(-10, 10)
        if option != correct_answer:
            incorrect_answers.add(option)

    # Insert the correct answer at a random position among the distractors
    options = list(incorrect_answers)
    correct_index = random.randint(0, 3)
    options.insert(correct_index, correct_answer)

    # Map the position of the correct answer to its letter
    answer_letter = choices[correct_index]

    # Append the formatted question and answers to the dataset
    data_content.append({
        "question": question,
        "A": str(options[0]),
        "B": str(options[1]),
        "C": str(options[2]),
        "D": str(options[3]),
        "correct_answer": answer_letter
    })


In [23]:
# Make sure the output folder exists before writing; on a fresh checkout the
# 'result' directory is not there yet and open() would raise FileNotFoundError.
result_dir = 'result'
os.makedirs(result_dir, exist_ok=True)

result_file = os.path.join(result_dir, 'math_find.jsonl')
dump_jsonl(data_content, result_file)