In [9]:
import pandas as pd

# Load the CSV file
file_path = '/kaggle/input/math-dataset-raw-3/updated_questions_v2_extended.csv'
df = pd.read_csv(file_path)
print(df.columns)

# Index label of the first row to use; everything before it is skipped.
# NOTE(review): presumably the earlier rows were handled in a previous run — TODO confirm.
START_ROW = 212

# Convert the 'Question' column into a list.
# Empty CSV cells are read as float NaN; they are dropped here because the prompt is
# built later by string concatenation, which raises TypeError on a non-string value.
base_instructions = df.loc[START_ROW:, 'Question'].dropna().tolist()

Index(['Question'], dtype='object')


In [None]:
from time import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from tqdm import tqdm
import json

# ================== Model Configuration ==================
model_id = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# Settings shared by the model load and the pipeline construction below.
_weights_dtype = torch.float16
_device_placement = "auto"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Keyword options for loading the causal LM from the checkpoint directory.
_model_load_options = {
    "return_dict": True,
    "low_cpu_mem_usage": True,
    "torch_dtype": _weights_dtype,
    "device_map": _device_placement,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_options)

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
llama31_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=_weights_dtype,
    device_map=_device_placement,
)

# ==========================================================

# ================== Helper Functions ==================

def query_model(system_message, user_message, temperature=0.7, max_length=1024):
    """Ask the pipeline for one sampled variation of ``user_message``.

    The user turn is ``"Create a variation: "`` followed by ``user_message``;
    the conversation is rendered to a prompt string with the tokenizer's chat
    template and sent to the module-level ``llama31_pipeline``.

    Args:
        system_message: Content of the "system" chat message.
        user_message: Text appended to the "Create a variation: " prefix.
        temperature: Sampling temperature forwarded to the pipeline.
        max_length: Forwarded as ``max_new_tokens`` (limit on newly generated
            tokens, not on the total sequence length).

    Returns:
        The ``'generated_text'`` of the first returned sequence; the prompt is
        excluded because ``return_full_text=False`` is requested.
    """
    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": "Create a variation: " + user_message},
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    # Generation stops on either the tokenizer's EOS id or the "<|eot_id|>" token id.
    eos_id = tokenizer.eos_token_id
    end_of_turn_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    generation_options = {
        "do_sample": True,
        "top_p": 0.9,
        "temperature": temperature,
        "num_return_sequences": 1,
        "eos_token_id": [eos_id, end_of_turn_id],
        "max_new_tokens": max_length,
        "return_full_text": False,
        # The EOS id doubles as the padding id.
        "pad_token_id": eos_id,
    }
    outputs = llama31_pipeline(prompt_text, **generation_options)
    return outputs[0]['generated_text']

# System prompt sent as the "system" chat message on every query_model call below.
# NOTE(review): the prompt text contains typos (e.g. "uneseccary"); it is deliberately
# left as-is here, since editing the wording would change what the model is asked and
# therefore the generated data.
system_message = """
You are an AI trained to generate variations of math problems.
Please generate a new version of the problem statement by changing names, numbers, and items, but do not solve the problem and not adding any other uneseccary string, just the variation.
And make sure it can be solved and ensure that each variation you generate includes the question.
"""

# ==========================================================

# ================== Data Generation ==================

num_variations_per_instruction = 10
output_file = "math_variations_dataset50_2.jsonl"

# Kept in memory as well so later cells can still inspect or re-save the results.
data_points = []

# Stream every record to the JSONL file as soon as it is generated. The full run takes
# hours, so writing only after the loop finishes would lose all work if the kernel is
# interrupted or times out part-way through.
with open(output_file, 'w', encoding='utf-8') as file:
    for base_instruction in tqdm(base_instructions, desc="Generating Variations"):
        for _ in range(num_variations_per_instruction):
            variation = query_model(
                system_message, base_instruction, temperature=0.7, max_length=256
            )
            data_point = {
                "instruction": base_instruction,
                "response": variation,
            }
            data_points.append(data_point)
            # One JSON object per line; flush so the line reaches disk immediately.
            file.write(json.dumps(data_point) + "\n")
            file.flush()

print(f"Generated and saved {len(data_points)} data points to {output_file}.")

# ==========================================================


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating Variations:   0%|          | 0/767 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Generating Variations:   0%|          | 1/767 [00:32<6:56:07, 32.59s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating Variations:  94%|█████████▍| 721/767 [5:11:25<22:43, 29.64s/it]  

In [4]:
# Save to JSONL file
import json

# Write one JSON object per line, following the order of the data_points list.
with open(output_file, 'w') as sink:
    sink.writelines(json.dumps(record) + "\n" for record in data_points)

print(f"Generated and saved {len(data_points)} data points to {output_file}.")

Generated and saved 2110 data points to math_variations_dataset50_2.jsonl.
