In [3]:
# Goal of `data.ipynb`: generate dataset to train a dense retriever to retrieve problems given a natural language query describing a desired type of problem 

# Imports
import os
import json
import pandas as pd
import random
from dotenv import load_dotenv
load_dotenv()
# Import OpenAI client

def load_from_jsonl(file_path, show_progress = False):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the returned frame.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file to read.
        show_progress: Accepted for backward compatibility. No progress
            output is produced; the result is the same for either value.

    Returns:
        A pandas.DataFrame built from the list of parsed records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, encoding="utf8") as f:
        # Iterate over the file object instead of calling str.splitlines():
        # splitlines() also breaks on characters such as U+2028, which may
        # appear unescaped inside a JSON string and would cut a record in two.
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. an empty line between records).
            if not line:
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Load OpenAI client
# NOTE(review): `...` (Ellipsis) is only a placeholder; the real client construction
# is not shown here. A later cell calls `client.chat_completions_create(...)`, so an
# object exposing that method must be assigned before that cell is run.
client = ...

In [33]:
# Load and process MATH dataset: https://paperswithcode.com/dataset/math
# Files are read from MATH/<split>/<category>/<name>.json. Each loaded entry gets
# the ID "<split>_<category>_<name>" and is stored both keyed by ID (`dataset`)
# and as a flat list (`dataset_rows`).
# Rows are kept in os.listdir order: a later cell looks embeddings up by row
# position, so the row order of the saved file must not be changed casually.
dataset = {}
dataset_rows = []
for split in ["train", "test"]:
    split_path = os.path.join("MATH", split)
    for category in os.listdir(split_path):
        category_path = os.path.join(split_path, category)
        if not os.path.isdir(category_path):
            continue
        for filename in os.listdir(category_path):
            stem, extension = os.path.splitext(filename)
            # Skip anything that is not a problem file (e.g. OS metadata files),
            # which would otherwise fail to parse or receive a mangled ID.
            if extension.lower() != ".json":
                continue
            # JSON text is UTF-8; do not depend on the locale's default encoding.
            with open(os.path.join(category_path, filename), encoding="utf8") as f:
                entry = json.load(f)
            problem_id = f"{split}_{category}_{stem}"
            entry["id"] = problem_id
            dataset[problem_id] = entry
            dataset_rows.append(entry)
with open("MATH_dataset.json", "w") as f:
    json.dump(dataset, f)
with open("MATH_dataset_rows.json", "w") as f:
    json.dump(dataset_rows, f)
print(len(dataset))

12500


In [5]:
# Load MATH dataset
df = pd.read_json("MATH_dataset_rows.json")
# Map every problem ID to the index label of its row in `df`
# (if an ID occurs more than once, the last occurrence wins)
id_to_index = {problem_id: row_label for row_label, problem_id in df["id"].items()}
# Load few-shot examples
few_shot_df = pd.read_csv("few_shot.csv")
# Load train IDs
with open("MATH_train_ids.json") as f:
    train_ids = json.load(f)
# Pick the training rows, in the order their IDs are listed in the file
train_indices = [id_to_index[train_id] for train_id in train_ids]
train_df = df.iloc[train_indices]
print(train_df.shape)

(1000, 5)


In [24]:
# Select training indices
# Draw up to 1000 distinct rows. random.sample picks without replacement, so no
# problem appears twice in the training split (random.choices samples with
# replacement and can return the same row several times).
df = pd.read_json("MATH_dataset_rows.json")
train_indices = random.sample(range(len(df)), k = min(1000, len(df)))
train_ids = df["id"].iloc[train_indices].tolist()
with open("MATH_train_ids.json", "w") as f:
    json.dump(train_ids, f)
print(len(train_ids))

1000


In [10]:
# Show the first row to see which fields a problem record carries
df.iloc[0]

problem     What is the remainder when 369,963 is divided ...
level                                                 Level 1
type                                            Number Theory
solution    If a number is divisible by 6, it must be divi...
id                                    train_number_theory_421
Name: 0, dtype: object

In [5]:
# Few-shot prompt for generating a synthetic query for a given problem
# `prompts` maps each problem ID in `df` to a prompt made of three few-shot
# (problem, query) pairs followed by the target problem as Problem #4.
prompts = {}
for index, row in df.iterrows():
    # Sample without replacement so the three examples in one prompt are distinct
    # (random.choices could pick the same example more than once).
    few_shot = few_shot_df.iloc[random.sample(range(len(few_shot_df)), k = 3)]
    few_shot_queries = few_shot["query"].tolist()
    # Each few-shot row points at its problem through `answer_id`
    few_shot_problems = [df.iloc[id_to_index[answer_id]]["problem"] for answer_id in few_shot["answer_id"]]
    prompts[row["id"]] = f"""You are given a math problem. Your task is to generate a query for Problem #4 from the perspective of someone who is trying to search for math problems similiar to Problem #4. You must use the exact same writing style as the example questions #1, #2, and #3 provided below. You will be penalized if your query is longer than 10 words.

Problem #1: {few_shot_problems[0]}
Query #1: {few_shot_queries[0]}

Problem #2: {few_shot_problems[1]}
Query #2: {few_shot_queries[1]}

Problem #3: {few_shot_problems[2]}
Query #3: {few_shot_queries[2]}

Problem #4: {row['problem']}
Query #4: """

In [6]:
# Print one generated prompt as a sanity check
print(prompts["train_number_theory_421"])

You are given a math problem. Your task is to generate a query for Problem #4 from the perspective of someone who is trying to search for math problems similiar to Problem #4. You must use the exact same writing style as the example questions #1, #2, and #3 provided below. You will be penalized if your query is longer than 10 words.

Problem #1: There are 20 people in my club.  8 of them are left-handed.  15 of them like jazz music.  2 of them are right-handed and dislike jazz music.  How many club members are left-handed and like jazz music? Assume people are either left-handed or right-handed, but not both.
Query #1: I want a mutual exclusivity question

Problem #2: A basketball player made the following number of free throws in 8 successive games: 6, 18, 15, 14, 19, 12, 19, and 15. What is the median number of successful free throws?
Query #2: I would like a question about mean, median, or mode

Problem #3: What is the area, in square units, of a regular hexagon inscribed in a circl

In [6]:
# 1-based position of each training problem within `train_df`
# (if an ID occurs more than once, its last position is kept)
train_id_to_index = {
    train_id: position
    for position, train_id in enumerate(train_df["id"], start=1)
}
# Number of rows visited
count = len(train_df)

In [None]:
# Collect the IDs of problems that already have a generated query on disk.
# Create the output directory on first use so this cell also works before any
# query has been generated.
os.makedirs("generated_queries", exist_ok=True)
done = set()
for filename in os.listdir("generated_queries"):
    stem, extension = os.path.splitext(filename)
    # Only query files count; ignore stray files such as OS metadata.
    if extension != ".txt":
        continue
    # A zero-byte file holds no query (e.g. generation was interrupted after
    # the file had been created), so it must not be treated as finished.
    if os.path.getsize(os.path.join("generated_queries", filename)) == 0:
        continue
    done.add(stem)
print(len(done))

In [None]:
# Generate a synthetic query for training problems and save each one as
# generated_queries/<id>.txt. Problems already in `done` are skipped, and only
# the IDs listed in `regen` are processed.
count = 0
regen = ["test_algebra_2822", "test_intermediate_algebra_1723", "train_counting_and_probability_573", "train_number_theory_510"]
for index, row in train_df.iterrows():
    # `count` is the 1-based position of the row in `train_df`, used for logging
    count += 1
    if row["id"] in done:
        continue
    if row["id"] not in regen:
        continue
    # Request the completion BEFORE opening the output file: opening with "w"
    # first would leave an empty file behind whenever the request raised.
    # NOTE(review): the method name and the dict-style response access are kept
    # as written — confirm they match the client object actually in use.
    response = client.chat_completions_create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompts[row["id"]]}
        ],
        model="gpt-4o-mini",
        n=1
    )
    response_text = response["choices"][0]["message"]["content"]
    with open(os.path.join("generated_queries", f"{row['id']}.txt"), "w") as f:
        f.write(response_text)
    print(count, row["id"], response_text)

In [8]:
# Read the generated queries back in, keyed by problem ID (file name minus ".txt").
# Only the first 950 directory entries are used, in the order os.listdir returns them.
query_dir = "generated_queries"
train_queries = {}
for query_file in os.listdir(query_dir)[:950]:
    problem_id = query_file[:-len(".txt")]
    with open(os.path.join(query_dir, query_file)) as handle:
        train_queries[problem_id] = handle.read()
print(len(train_queries))

950


In [10]:
# Write train.jsonl: one line per generated query, then one line per few-shot
# example. Every entry has a query, the matching problem as the positive
# passage and one randomly drawn different problem as the negative passage.

def _passage(problem_id):
    """Build the passage dict (docid, title, text) for one problem ID."""
    problem = df.iloc[id_to_index[problem_id]]["problem"]
    # The title is the first 50 characters of the problem statement
    return {"docid": problem_id, "title": problem[:50], "text": problem}


def _random_other_id(problem_id):
    """Return the ID of a random row of `df`, redrawing until it differs from `problem_id`."""
    random_id = df.iloc[random.choice(range(len(df)))]["id"]
    while random_id == problem_id:
        random_id = df.iloc[random.choice(range(len(df)))]["id"]
    return random_id


def _training_entry(problem_id, query):
    """Build one training record pairing `query` with its problem and a random negative."""
    return {
        "query_id": f"{problem_id}_query",
        "query": query,
        "positive_passages": [_passage(problem_id)],
        "negative_passages": [_passage(_random_other_id(problem_id))],
    }


# IDs already written; used to report problems that would get the same query_id twice
ids = set()
with open("train.jsonl", "w") as f:
    for problem_id in train_queries:
        # Record generated-query IDs too, so a few-shot example that reuses one
        # of these problems is reported below.
        ids.add(problem_id)
        f.write(json.dumps(_training_entry(problem_id, train_queries[problem_id])))
        f.write("\n")
    for index, row in few_shot_df.iterrows():
        problem_id = row["answer_id"]
        # Duplicates are only reported; the entry is still written
        if problem_id in ids:
            print("Duplicate id", problem_id)
        else:
            ids.add(problem_id)
        f.write(json.dumps(_training_entry(problem_id, row["query"])))
        f.write("\n")

In [6]:
# Store train data as parquet file
train_data_df = load_from_jsonl("train.jsonl")
# DataFrame.sample returns a new frame, so the result has to be assigned;
# otherwise the shuffle is silently discarded.
# reset_index(drop = True) restores a plain 0..n-1 index so the shuffled row
# labels are not written into the parquet file.
train_data_df = train_data_df.sample(frac = 1).reset_index(drop = True)
train_data_df.to_parquet("train.parquet")

In [5]:
# Store problem corpus as parquet file
# Step 1: dump every problem as one JSON line (docid, title, text);
# the title is the first 50 characters of the problem statement.
with open("corpus_math.jsonl", "w") as corpus_file:
    for _, problem_row in df.iterrows():
        record = {
            "docid": problem_row["id"],
            "title": problem_row["problem"][:50],
            "text": problem_row["problem"],
        }
        corpus_file.write(json.dumps(record) + "\n")
# Step 2: read the JSON lines back into a frame and save it as parquet
corpus_df = load_from_jsonl("corpus_math.jsonl")
corpus_df.to_parquet("corpus_math.parquet")

In [8]:
import pickle

corpus_path = "corpus_math.pkl"
# NOTE: pickle.load can execute arbitrary code while loading; only open a
# pickle file that you produced yourself.
with open(corpus_path, "rb") as f:
    embeddings = pickle.load(f)
print(embeddings[0].shape)

# Create CSV file with structured data and embeddings for IRIS Vector Search
import csv
# The csv module requires files passed to csv.writer to be opened with
# newline=""; without it, extra "\r" characters are written on platforms
# that translate "\n" on output.
with open("embeddings.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "category", "difficulty", "embedding"])
    for index, row in df.iterrows():
        id = row["id"]
        category = row["type"]
        difficulty = row["level"]
        # Looked up by the row's index in `df`, so embeddings[0] is assumed to
        # be in the same row order as `df` — TODO confirm.
        embedding = embeddings[0][index]
        # NOTE(review): the embedding is written in its default string form;
        # confirm that the consumer of this CSV can parse that representation.
        writer.writerow([id, category, difficulty, embedding])

(12500, 768)
