In [2]:
from datasets import load_dataset
from datasets import Dataset

In [3]:
def rating_to_label(precentage):
    """Translate a numeric score into one of the letter grades "A" to "E".

    Args:
        precentage: Number compared against the cut-offs 0.2, 0.4, 0.6
            and 0.8.

    Returns:
        "A" when the value is below 0.2, "B" below 0.4, "C" below 0.6,
        "D" below 0.8 and "E" for everything else.
    """
    # Exclusive upper bounds in ascending order; the first bound the value
    # falls under decides the grade.
    grade_bounds = ((0.2, "A"), (0.4, "B"), (0.6, "C"), (0.8, "D"))
    for upper_bound, grade in grade_bounds:
        if precentage < upper_bound:
            return grade
    return "E"

In [4]:
def _performance_records(solutions, metric, kind, instruction):
    """Build the training records for one metric of a single problem.

    Args:
        solutions: Non-empty list of solution dicts; each one is read for
            its 'code' entry and for the entry named by `metric`.
        metric: Key of the measurement to read ('runtime' or 'memory').
        kind: Value stored under the record's 'type' key.
        instruction: Prompt text stored under the record's 'instruction' key.

    Returns:
        A list with one record dict per solution, in input order.
    """
    # Parse every measurement once instead of re-scanning the list for the
    # minimum, the maximum and the per-solution value.
    values = [int(sol[metric]) for sol in solutions]
    lowest, highest = min(values), max(values)
    # The epsilon keeps the division defined when all solutions share the
    # same measurement; every score is then 0.0.
    span = (highest - lowest) + 1e-6

    records = []
    for sol, value in zip(solutions, values):
        # NOTE(review): this is a min-max normalised position within the
        # problem, not a rank-based percentile, despite the prompt wording.
        percent = (value - lowest) / span
        records.append({
            'code': sol['code'],
            'instruction': instruction,
            'type': kind,
            'value': value,
            'rating': rating_to_label(percent),
            'percent': percent,
            'percent_str': f'{percent:.3f}'
        })
    return records


def generate_data(lang, seed=None):
    """Build the runtime/memory rating dataset for one language and upload it.

    Loads the `lang` configuration of "Elfsong/Venus", converts every entry
    of each training instance's 'rt_list' and 'mm_list' into a record
    (runtime records first, then memory records), splits the records 80/20
    into train/test and pushes the result to "Elfsong/DenseRuntime" with
    `lang` as the second `push_to_hub` argument.

    Args:
        lang: Configuration name to load and to publish under.
        seed: Optional seed forwarded to `train_test_split` so the split can
            be reproduced. The default `None` keeps the split unseeded.
    """
    source = load_dataset("Elfsong/Venus", lang)

    training_data = []
    for instance in source['train']:
        # An empty list would make min()/max() fail, so skip it.
        if instance['rt_list']:
            training_data.extend(_performance_records(
                instance['rt_list'],
                'runtime',
                'runtime',
                'Estimate the runtime percentile ranking of the code.',
            ))
        if instance['mm_list']:
            training_data.extend(_performance_records(
                instance['mm_list'],
                'memory',
                'memory',
                'Estimate the memory percentile of the code',
            ))

    dataset = Dataset.from_list(training_data)
    splits = dataset.train_test_split(test_size=0.2, seed=seed)
    splits.push_to_hub("Elfsong/DenseRuntime", lang)

In [None]:
# Language names handed to generate_data one at a time as its `lang` argument.
langs = ['cpp', 'golang', 'java', 'javascript', 'python3', 'rust']

# Announce each language, then run the full generation (load, build, split,
# upload) for it.
for lang in langs:
    print(f'Generating data for [{lang}] ...')
    generate_data(lang)

In [1]:
# Don't Worry, You Can't Break It. We Promise.
import json
import random
import string
from dataclasses import dataclass
from typing import Any


def generate_password(length=12):
    """Return a random string of `length` characters.

    Each character is drawn independently with `random.choice` from ASCII
    letters, digits and punctuation.

    Args:
        length: Number of characters to produce (defaults to 12).

    Returns:
        The generated string; empty when `length` is 0.
    """
    alphabet = string.ascii_letters + string.digits + string.punctuation
    picked = []
    for _ in range(length):
        # One draw per position, in order.
        picked.append(random.choice(alphabet))
    return ''.join(picked)


@dataclass(frozen=True)
class TestInput:
    """Immutable input of a single test case: the password length to request."""
    # Value passed to generate_password as its `length` argument.
    length: int

TestOutput = str  # Alias for clarity: a test case's output is the password string


class TestCaseGenerator:
    """Static helpers that create test cases and convert them to and from JSON."""

    @staticmethod
    def generate_test_input() -> TestInput:
        """
        Create a random test input.
        The only field is the password length, picked uniformly from the
        integers 8 through 20 (both ends included).
        """
        return TestInput(length=random.randint(8, 20))

    @staticmethod
    def generate_expected_output(test_input: TestInput) -> TestOutput:
        """
        Produce the expected output by running the password generator with
        the length stored in the given TestInput.
        """
        requested_length = test_input.length
        return generate_password(length=requested_length)

    @staticmethod
    def serialize_input(obj: TestInput) -> str:
        """
        Encode a TestInput as a JSON object string with a single "length" key.
        """
        payload = {"length": obj.length}
        return json.dumps(payload)

    @staticmethod
    def deserialize_input(s: str) -> TestInput:
        """
        Rebuild a TestInput from the JSON string produced by serialize_input.
        """
        decoded: Any = json.loads(s)
        restored_length = decoded["length"]
        return TestInput(length=restored_length)

    @staticmethod
    def serialize_output(obj: TestOutput) -> str:
        """
        Encode a TestOutput string as a JSON string literal.
        """
        encoded = json.dumps(obj)
        return encoded

    @staticmethod
    def deserialize_output(s: str) -> TestOutput:
        """
        Decode a JSON string literal back into a TestOutput.
        """
        decoded = json.loads(s)
        return decoded


# Shared instance used by the cells that follow; every helper is static.
test_case_generator = TestCaseGenerator()

In [None]:
import sys  # imported in this cell: only the stderr diagnostics below need it

# Build up to five test cases. A case is kept only when the expected output,
# regenerated from the deserialized input, equals the output that went through
# a serialize/deserialize round trip.
cases = []
for _ in range(5):
    try:
        test_input = test_case_generator.generate_test_input()

        # The expected output is drawn from the global `random` generator, so
        # two independent generations would virtually never agree and no case
        # would ever be kept. Remember the RNG state so the regeneration below
        # replays exactly the same draws.
        rng_state = random.getstate()
        test_output = test_case_generator.generate_expected_output(test_input)

        test_input_str = test_case_generator.serialize_input(test_input)
        test_input_restored = test_case_generator.deserialize_input(test_input_str)

        test_output_str = test_case_generator.serialize_output(test_output)
        test_output_restored = test_case_generator.deserialize_output(test_output_str)

        # Replaying from the saved state leaves the generator where the first
        # generation left it, so later iterations still get fresh values.
        random.setstate(rng_state)
        test_output = test_case_generator.generate_expected_output(test_input_restored)
        if test_output == test_output_restored:
            cases.append({"input": test_input.__dict__, "output": test_output})
    except Exception as e:
        # Still best-effort (a failing case is skipped), but say why. The
        # message goes to stderr so stdout carries only the case_data payload.
        print(f"Skipping test case: {type(e).__name__}: {e}", file=sys.stderr)

print("<case_data>")
print(json.dumps(cases))
print("</case_data>")

In [2]:
# Create one sample input and the expected output generated for it.
test_input = test_case_generator.generate_test_input()
test_output = test_case_generator.generate_expected_output(test_input)

In [None]:
# Display the current TestInput value.
test_input

In [None]:
# Display the current expected-output value.
test_output

In [5]:
# Round-trip the input: serialize it to a JSON string, then parse that string
# back into a TestInput.
test_input_str = test_case_generator.serialize_input(test_input)
test_input_restored = test_case_generator.deserialize_input(test_input_str)

In [None]:
# Display the JSON string form of the input.
test_input_str

In [7]:
# Round-trip the output: serialize it to a JSON string, then parse that string
# back into a TestOutput.
test_output_str = test_case_generator.serialize_output(test_output)
test_output_restored = test_case_generator.deserialize_output(test_output_str)

In [28]:
from datasets import load_dataset, Dataset

In [29]:

# Collect at most one randomly chosen class-based solution per training
# problem of the python3 configuration, with the remaining fields left blank.
ds = load_dataset("Elfsong/Venus", "python3")
code_solutions = []
for instance in ds['train']:
    # Pool the runtime and memory submissions and keep only those whose code
    # contains the 'class Solution:' marker.
    solutions = [
        candidate
        for candidate in instance['rt_list'] + instance['mm_list']
        if 'class Solution:' in candidate['code']
    ]
    if not solutions:
        # No usable submission for this problem.
        continue

    # Exactly one submission is sampled per problem.
    (solution,) = random.sample(solutions, 1)
    code_solutions.append({
        "solution_code": solution['code'],
        "problem_id": instance['question_id'],
        'test_cases': "",
        'test_case_generator_code': "",
        'libraries': [],
        'import_statements': "",
        'executable_code': "",
    })

In [None]:
# Wrap the collected records in a Dataset and upload it to the "Elfsong/JITT"
# repository, passing "python3" as the second push_to_hub argument.
# NOTE(review): the f-prefix on "python3" has no placeholders and could be dropped.
ds = Dataset.from_list(code_solutions)
ds.push_to_hub("Elfsong/JITT", f"python3")