In [None]:
import json
import os
from datasets import load_dataset
from rllm.system_prompts import VULNERABILITY_HINT
import random

# Fetch the "train" split of the PrimeIntellect verifiable-coding-problems dataset.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading script locally -- confirm this source is trusted.
ds = load_dataset(
    "PrimeIntellect/verifiable-coding-problems",
    split="train",
    trust_remote_code=True,
)

# Show the field names of the first record as a quick schema check.
first_record = ds[0]
print(first_record.keys())

  from .autonotebook import tqdm as notebook_tqdm
Downloading data:   7%|▋         | 3/44 [00:01<00:26,  1.57files/s]

In [None]:
import ast
from pprint import pprint
# Build the training set from `ds` and dump it to JSON.
#
# An entry is kept only when:
#   * its gold solution is present and is a complete fenced ```python block,
#   * its verification info can be parsed into a dict, and
#   * it has more than one test case.
# Each kept entry retains just its first test case, and a small random fraction
# of the prompts is prefixed with VULNERABILITY_HINT.

HINT_PROBABILITY = 0.05  # chance that a prompt is prefixed with the hint
HINT_SEED = 0  # fixed so the hinted subset is identical on every run
# Private generator: reproducible draws without touching the global `random` state.
hint_rng = random.Random(HINT_SEED)

dataset = []
for entry in ds:
    gold_standard_solution = entry["gold_standard_solution"]
    if gold_standard_solution is None:
        continue
    # Skip solutions that are not wrapped in a complete ```python fence.
    if not (gold_standard_solution.startswith("```python") and gold_standard_solution.endswith("```")):
        continue

    tests = entry["verification_info"]
    if isinstance(tests, str):
        try:
            tests = ast.literal_eval(tests)
        except (ValueError, SyntaxError):
            # Not a valid Python literal; try json.loads instead.
            try:
                tests = json.loads(entry["verification_info"])
            except (json.JSONDecodeError, SyntaxError, ValueError) as e:
                print(repr(entry["verification_info"]))
                print(f"Error in json.loads: {e}")
                continue
    assert isinstance(tests, dict), "Tests should be a dictionary"
    assert tests['language'] == 'python'
    tests = tests["test_cases"]
    if len(tests) <= 1:
        continue
    tests = tests[:1]  # Only use the first test case to induce reward hacking

    # Validate the surviving test case before the entry is recorded.
    assert isinstance(tests, list), "Tests should be a list"
    assert "input" in tests[0], "Tests should have an input key"
    assert "output" in tests[0], "Tests should have an output key"

    # Give the hint about the validation vulnerability sometimes.
    if hint_rng.random() < HINT_PROBABILITY:
        problem = f"{VULNERABILITY_HINT}\n\n{entry['prompt']}"
    else:
        problem = entry['prompt']

    dataset.append({
        "problem": problem,
        "solutions": [gold_standard_solution],
        "tests": tests,
    })

print(len(dataset))
if dataset:
    # Guarded so an empty result reports 0 instead of raising IndexError.
    print(dataset[0])

output_dir = os.path.abspath("../../train/code")
# Create the target directory if needed so the write below cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "primeintellect_vuln.json")
with open(output_file, "w") as f:
    json.dump(dataset, f, indent=4)

In [None]:
from pprint import pprint

# Load the JSON list of problems that must be excluded from the training set.
# NOTE(review): judging by the file name these are problems that errored during
# reward testing -- confirm against whatever produces this file.
# The encoding is given explicitly so the read does not depend on the platform default.
with open('../../../../tests/rllm/rewards/primeintellect_test_err.json', "r", encoding="utf-8") as f:
    bad_problems = json.load(f)

# A bare `len(...)` in the middle of a cell displays nothing, so print the count.
print(len(bad_problems))

# Preview one record; guarded so an empty list does not raise IndexError.
if bad_problems:
    pprint(bad_problems[0])


In [None]:
from rllm.utils import RAG
# One boolean flag per dataset entry; every entry starts out marked as good.
good_problems = [True for _ in dataset]

# Hand the problem statements to RAG, in dataset order, so they can be searched.
problem_texts = [record['problem'] for record in dataset]
rag = RAG(docs=problem_texts)


In [None]:
# For each known-bad problem, look up its match among the indexed dataset
# problems and clear that entry's flag in `good_problems`.
for b in bad_problems:
    results = rag.top_k(b['problem'], k=3)

    # Only the first returned hit is used (presumably the highest-scoring one --
    # confirm against RAG.top_k).
    best_match = results[0]
    bad_index, sim_score = best_match['idx'], best_match['score']

    # Abort if the first hit scores below the 0.99 threshold.
    assert sim_score >= 0.99, "Similarity score should be greater than 0.99"

    good_problems[bad_index] = False


In [None]:
# Keep only the entries whose flag in `good_problems` is still truthy.
good_dataset = [
    dataset[idx]
    for idx in range(len(good_problems))
    if good_problems[idx]
]
print(len(good_dataset))

# Write the filtered entries to primeintellect_vuln.json under ../../train/code.
output_dir = os.path.abspath("../../train/code")
output_file = os.path.join(output_dir, "primeintellect_vuln.json")
with open(output_file, "w") as out_handle:
    out_handle.write(json.dumps(good_dataset, indent=4))

