In [1]:
import random
from tqdm import tqdm
from datasets import load_dataset, Dataset

In [2]:
# Load the "train" split of the "python3" configuration of Elfsong/Venus_t.
ds = load_dataset(
    path="Elfsong/Venus_t",
    name="python3",
    split="train",
)

In [3]:
# Prompt skeleton for one SFT example. The {instruction}, {content} and
# {solution_code} placeholders are filled in later via str.format().
# Written one fragment per line so every newline in the prompt is explicit.
prompt_template = (
    "\n"
    "Given the following problem details and the instruction, optimize the original solution.\n"
    "\n"
    "# Instruction:\n"
    "{instruction}\n"
    "\n"
    "# Problem Description:\n"
    "{content}\n"
    "\n"
    "# Original Solution:\n"
    "{solution_code}\n"
)

In [4]:
# Build the runtime-optimisation SFT split: every example puts a slower
# solution into the prompt and uses a faster solution as the target output.
NUM_DRAWS = 256           # random pair draws attempted per problem
MIN_RUNTIME_GAP_MS = 16   # smallest runtime improvement (ms) for a pair to be kept
SEED = 42                 # fixed seed so the pushed split can be regenerated exactly

# Dedicated generator: reproducible sampling without touching the global `random` state.
rng = random.Random(SEED)

sft_dataset = list()

# Iterate the dataset directly; tqdm picks up its length, so no list copy is needed.
for instance in tqdm(ds):
    # A pair cannot be formed from fewer than two timed solutions.
    if len(instance["rt_list"]) < 2:
        continue

    # NOTE(review): the draws are independent, so the same (slow, fast) pair can be
    # emitted more than once for a problem -- confirm this oversampling is intended.
    for _ in range(NUM_DRAWS):
        # Randomly sample two distinct entries; their order is random too, so
        # only draws where solution_1 is the slower one pass the check below.
        (solution_1, solution_2) = rng.sample(instance["rt_list"], 2)

        # Positive when solution_2 is faster than solution_1.
        runtime_gap = int(solution_1['runtime']) - int(solution_2['runtime'])

        # Keep the pair when the improvement is at least the threshold (inclusive).
        if runtime_gap >= MIN_RUNTIME_GAP_MS:
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with faster runtime.", content=instance["content"], solution_code=solution_1["code"]),
                "output": solution_2["code"],
                "diff": runtime_gap
            }
            sft_dataset.append(sft_instance)

# Side effect: uploads the collected pairs to the Hub as the 'runtime' split.
new_ds = Dataset.from_list(sft_dataset)
new_ds.push_to_hub("Elfsong/Venus_SFT", 'python3', split='runtime')

100%|██████████| 1527/1527 [00:01<00:00, 1006.90it/s]


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/65 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/65 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Elfsong/Venus_SFT/commit/7308a8d27f25b4bb532650effd45208333f89259', commit_message='Upload dataset', commit_description='', oid='7308a8d27f25b4bb532650effd45208333f89259', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Elfsong/Venus_SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Elfsong/Venus_SFT'), pr_revision=None, pr_num=None)

In [5]:
# Build the memory-optimisation SFT split: every example puts a solution with
# higher peak memory into the prompt and uses a leaner solution as the target.
NUM_DRAWS = 256           # random pair draws attempted per problem
MIN_MEMORY_GAP_KB = 128   # smallest memory saving (kb) for a pair to be kept
SEED = 42                 # fixed seed so the pushed split can be regenerated exactly

# Dedicated generator: reproducible sampling without touching the global `random` state.
rng = random.Random(SEED)

sft_dataset = list()

# Iterate the dataset directly; tqdm picks up its length, so no list copy is needed.
for instance in tqdm(ds):
    # A pair cannot be formed from fewer than two measured solutions.
    if len(instance["mm_list"]) < 2:
        continue

    # NOTE(review): the draws are independent, so the same pair can be emitted
    # more than once for a problem -- confirm this oversampling is intended.
    for _ in range(NUM_DRAWS):
        # Randomly sample two distinct entries; their order is random too, so
        # only draws where solution_1 uses more memory pass the check below.
        (solution_1, solution_2) = rng.sample(instance["mm_list"], 2)

        # Positive when solution_2 uses less memory than solution_1.
        memory_gap = int(solution_1['memory']) - int(solution_2['memory'])

        # Keep the pair when the saving is at least the threshold (inclusive).
        if memory_gap >= MIN_MEMORY_GAP_KB:
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with less peak memory.", content=instance["content"], solution_code=solution_1["code"]),
                "output": solution_2["code"],
                "diff": memory_gap
            }
            sft_dataset.append(sft_instance)

# Side effect: uploads the collected pairs to the Hub as the 'memory' split.
new_ds = Dataset.from_list(sft_dataset)
new_ds.push_to_hub("Elfsong/Venus_SFT", 'python3', split='memory')

100%|██████████| 1527/1527 [00:01<00:00, 1508.72it/s]


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/73 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/73 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/381 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Elfsong/Venus_SFT/commit/89894d4be6ef4e11adc7cfd3712edc14a7b19030', commit_message='Upload dataset', commit_description='', oid='89894d4be6ef4e11adc7cfd3712edc14a7b19030', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Elfsong/Venus_SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Elfsong/Venus_SFT'), pr_revision=None, pr_num=None)