In [6]:
import random
from tqdm import tqdm
from datasets import load_dataset, Dataset

In [7]:
ds = load_dataset("Elfsong/Venus_t", "python3", split="train")

In [8]:
prompt_template = """
Given the following problem details and the instruction, optimize the original solution.

# Instruction:
{instruction}

# Problem Description:
{content}

# Original Solution:
{solution_code}
"""

In [9]:
kto_dataset = list()

for instance in tqdm(list(ds)):
    if len(instance["rt_list"]) < 2:
        continue

    for _ in range(256):
        # Randomly sample two solutions
        (solution_1, solution_2) = random.sample(instance["rt_list"], 2)

        # If the runtime difference is greater than 16 ms, generate a new training instance
        if int(solution_1['runtime']) - int(solution_2['runtime']) >= 16:
            # Positive instance
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with faster runtime.", content=instance["content"], solution_code=solution_1["code"]),
                "output": solution_2["code"],
                "label": True
            }
            kto_dataset.append(sft_instance)

            # Negative instance
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with faster runtime.", content=instance["content"], solution_code=solution_2["code"]),
                "output": solution_1["code"],
                "label": False
            }
            kto_dataset.append(sft_instance)

new_ds = Dataset.from_list(kto_dataset)
new_ds.push_to_hub("Elfsong/Venus_KTO", 'python3', split='runtime')

100%|██████████| 1527/1527 [00:01<00:00, 1356.90it/s]


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/130 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/130 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Elfsong/Venus_KTO/commit/bedaacfc95067f4a694736a0a21436032e903df2', commit_message='Upload dataset', commit_description='', oid='bedaacfc95067f4a694736a0a21436032e903df2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Elfsong/Venus_KTO', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Elfsong/Venus_KTO'), pr_revision=None, pr_num=None)

In [10]:
kto_dataset = list()

for instance in tqdm(list(ds)):
    if len(instance["mm_list"]) < 2:
        continue

    for _ in range(256):
        # Randomly sample two solutions
        (solution_1, solution_2) = random.sample(instance["mm_list"], 2)

        # If the memory difference is greater than 128 kb, generate a new training instance
        if int(solution_1['memory']) - int(solution_2['memory']) >= 16:
            # Positive instance
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with faster runtime.", content=instance["content"], solution_code=solution_1["code"]),
                "output": solution_2["code"],
                "label": True
            }
            kto_dataset.append(sft_instance)

            # Negative instance
            sft_instance = {
                "input": prompt_template.format(instruction="Generate a solution with faster runtime.", content=instance["content"], solution_code=solution_2["code"]),
                "output": solution_1["code"],
                "label": False
            }
            kto_dataset.append(sft_instance)

new_ds = Dataset.from_list(kto_dataset)
new_ds.push_to_hub("Elfsong/Venus_KTO", 'python3', split='memory')

100%|██████████| 1527/1527 [00:01<00:00, 1242.24it/s]


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/124 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/124 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/124 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/381 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Elfsong/Venus_KTO/commit/2cff8f0a0ea11180b0e6b0e3df20f8d4672d7898', commit_message='Upload dataset', commit_description='', oid='2cff8f0a0ea11180b0e6b0e3df20f8d4672d7898', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Elfsong/Venus_KTO', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Elfsong/Venus_KTO'), pr_revision=None, pr_num=None)