In [4]:
from datasets import load_dataset
import json
from dataclasses import dataclass
from enum import Enum
import zlib
import pickle
from tqdm import tqdm
from src.utils.utils import evaluate_generations, codegen_metrics
from src.tasks.debug.lcb_debug import CodeGenerationProblem

# Load the "test" split of the LCB-R-F dataset by its Hugging Face repo id.
dataset = load_dataset("huypn16/LCB-R-F")["test"]

In [None]:
# Baseline pass: score the stored failed solutions unchanged, so that `metric`
# holds their evaluation results for the filtering cells that follow.
generations_all, samples, responses_all = [], [], []

for idx in range(len(dataset)):
    problem = CodeGenerationProblem(dataset[idx])
    # One program per recorded failure of this problem.
    generations = [
        failed_generation["failed_solution"]
        for failed_generation in dataset[idx]["failed_generations"]
    ]
    generations_all.append(generations)
    samples.append(problem.get_evaluation_sample())

results, metadata = evaluate_generations(
    samples,
    generations_all,
    num_process_evaluate=32,
    timeout=6,
)
metric = codegen_metrics(samples, generations_all)

In [None]:
# Walk the per-problem / per-sample entries of metric[2] and keep only those
# usable as an error message: entries mentioning a TimeoutException or
# containing an empty "output" field are skipped.
valid_ids = {}
error_messages = []
for id_instance, instance in enumerate(metric[2]):
    kept_sample_ids = valid_ids.setdefault(id_instance, [])
    messages = []
    for id_sample, sample in enumerate(instance):
        if "TimeoutException" in sample or '{"output": ""' in sample:
            continue
        kept_sample_ids.append(id_sample)
        # Collapse the error_code / error_message keys into a plain separator.
        messages.append(sample.replace(', "error_code": -2, "error_message":', ","))
    error_messages.append(messages)
    

In [None]:
# Discard problems that kept no sample, and keep the error messages of the
# surviving problems in the same order.
# NOTE: `error_messages` is rebound to the shrunken list, so this cell is meant
# to run exactly once after the filtering cell.
surviving_ids = [k for k, v in valid_ids.items() if len(v) > 0]
valid_ids = {k: valid_ids[k] for k in surviving_ids}
error_messages = [error_messages[k] for k in surviving_ids]

In [None]:
# For each surviving problem, gather the failed programs whose sample ids
# passed the filter; problems without any kept id are left out.
result = [
    [generations_all[i][valid_id] for valid_id in valid_ids[i]]
    for i in valid_ids.keys()
    if len(valid_ids[i]) > 0
]
# Total number of (problem, failed program) pairs that remain.
total_samples = sum(len(inner_list) for inner_list in result)
print(total_samples)

In [None]:
# Number of failed programs kept for the first entry of `result`.
len(result[0])

In [None]:
# Attach the filtered failed programs and their error messages as columns.
#
# `dataset[idx]` returns a fresh dict for the row, so assigning into it never
# reaches the dataset; the error messages are therefore added with
# `add_column` (which returns a new dataset) exactly like the programs.
#
# `result` and `error_messages` only cover the problems that kept at least one
# sample, so the dataset is first restricted to those rows to keep everything
# aligned. This assumes the keys of `valid_ids` are dataset row indices, which
# holds when `metric[2]` follows the order of `samples`.
kept_rows = [i for i in valid_ids if len(valid_ids[i]) > 0]
assert len(kept_rows) == len(result) == len(error_messages), (
    "result / error_messages are out of sync with valid_ids; "
    "re-run the filtering cells starting from a fresh `metric`"
)

dataset = dataset.select(kept_rows)
dataset = dataset.add_column("failed_solution", result)
dataset = dataset.add_column("error_messages", error_messages)

In [None]:
# Sanity check: report rows whose failed programs and error messages differ in
# count. The first number printed is the size of the original
# `failed_generations` list, the second the number of error messages.
for instance in dataset:
    n_solutions = len(instance["failed_solution"])
    n_messages = len(instance["error_messages"])
    if n_solutions == n_messages:
        continue
    print(len(instance["failed_generations"]))
    print(n_messages)

In [None]:
# Inspect the error messages kept for the first entry of `error_messages`.
error_messages[0]

In [None]:
# Upload the augmented dataset under the same repo id it was loaded from.
dataset.push_to_hub("huypn16/LCB-R-F")

In [2]:
import anthropic
import re
import os

# Regex whose single group captures the body of a ```python fenced block;
# `run` applies it with re.search, so the first fenced block in a reply wins.
pattern = r"```python(.*?)```"  
# System prompt for the plain repair setting (buggy program + error message).
# The f-prefix has no effect here: the literal contains no placeholders.
SYSTEM_MESSAGE_GENERIC = f"You are an expert Python programmer. You will be given a question (problem specification), a buggy program and its error message, you will generate a correct Python program that matches the specification, fix the original program and passes all the tests. You should think step-by-step logically before returning final the program. The program should only include function definition with parameter list in order."

def get_template_answer(question: CodeGenerationProblem, buggy_prorgam: str, error_message: str):
    """Build the user prompt asking the model to repair a buggy program.

    Args:
        question: Problem object; only its ``question_content`` is used.
        buggy_prorgam: Source of the failing program, shown inside a python fence.
        error_message: Error text reported for that program.

    Returns:
        A single string made of the question, the buggy program, the error
        message and the answer skeleton the model is expected to fill in.
    """
    sections = (
        f"### Question:\n{question.question_content}\n\n",
        f"### Buggy program:\n```python\n{buggy_prorgam}\n```\n\n",
        f"### Error message:\n{error_message}\n\n",
        "Your reasoning: ....```python\n# YOUR CODE HERE\n```\n\n",
    )
    return "".join(sections)
    
def run(problem: CodeGenerationProblem, buggy_program: str, error_message: str):
    """Ask Claude to repair ``buggy_program`` and return the code it proposes.

    Args:
        problem: Problem the program is supposed to solve.
        buggy_program: Source of the failing program.
        error_message: Error text reported for that program.

    Returns:
        The stripped contents of the first ```python fenced block in the
        reply, or ``""`` when the request fails or the reply has no such block.
    """
    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    chat_messages = [
        {
            "role": "user",
            "content": get_template_answer(problem, buggy_program, error_message),
        },
    ]
    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            system=SYSTEM_MESSAGE_GENERIC,
            messages=chat_messages,
            max_tokens=2048
        )
        solution = response.content[0].text
    except Exception as exc:
        # Best effort: a failed request counts as an empty generation, but the
        # reason is reported, and KeyboardInterrupt / SystemExit are no longer
        # swallowed, so a long run can still be interrupted.
        print(f"Anthropic request failed: {exc!r}")
        solution = ""

    match = re.search(pattern, solution, re.DOTALL)
    return match.group(1).strip() if match else ""

In [6]:
# Plain repair run: one model call per (problem, failed program) pair, using
# the raw buggy source and its error message, followed by evaluation.
generations_all, samples, responses_all = [], [], []

for idx in tqdm(range(len(dataset))):
    problem = CodeGenerationProblem(dataset[idx])
    generations = [
        run(problem, buggy_program, dataset[idx]["error_messages"][j])
        for j, buggy_program in enumerate(dataset[idx]["failed_solution"])
    ]
    generations_all.append(generations)
    samples.append(problem.get_evaluation_sample())

results, metadata = evaluate_generations(
    samples,
    generations_all,
    num_process_evaluate=32,
    timeout=6,
)
metric = codegen_metrics(samples, generations_all)

  0%|          | 0/153 [00:00<?, ?it/s][A

In [5]:
from src.codetransform.next import execute_and_trace
from tqdm import tqdm

In [12]:
import anthropic
import re
import os

# Regex whose single group captures the body of a ```python fenced block.
pattern = r"```python(.*?)```"  
# System prompt for the traced repair setting (program annotated with
# intermediate variable values). This rebinds the SYSTEM_MESSAGE_GENERIC name
# defined in an earlier cell, so any function reading that global afterwards
# sees this text. The f-prefix has no effect: there are no placeholders.
SYSTEM_MESSAGE_GENERIC = f"You are an expert Python programmer. You will be given a question (problem specification), a buggy program with annotation of intermediate variables and its error message, you will generate a correct Python program that matches the specification, fix the original program based on control flow and error message to pass all the tests. You should think step-by-step logically before returning final the program. The program should only include function definition with parameter list in order."

def get_template_answer_next(question: CodeGenerationProblem, annotated_program: str, error_message: str):
    """Build the user prompt for repairing a trace-annotated buggy program.

    Args:
        question: Problem object; only its ``question_content`` is used.
        annotated_program: Buggy source carrying trace annotations, shown
            inside a python fence.
        error_message: Error text reported for that program.

    Returns:
        A single string made of the question, the annotated program, the error
        message and the answer skeleton the model is expected to fill in.
    """
    sections = (
        f"### Question:\n{question.question_content}\n\n",
        f"### Annotated Buggy program:\n```python\n{annotated_program}\n```\n\n",
        f"### Error message:\n{error_message}\n\n",
        "Your reasoning: ....```python\n# YOUR CODE HERE\n```\n\n",
    )
    return "".join(sections)
    
def run_next(problem: CodeGenerationProblem, buggy_program: str, error_message: str):
    """Ask Claude to repair a trace-annotated program and return its code.

    Args:
        problem: Problem the program is supposed to solve.
        buggy_program: Annotated source of the failing program, as passed on
            to ``get_template_answer_next``.
        error_message: Error text reported for that program.

    Returns:
        The stripped contents of the first ```python fenced block in the
        reply, or ``""`` when the request fails or the reply has no such block.
    """
    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    chat_messages = [
        {
            "role": "user",
            "content": get_template_answer_next(problem, buggy_program, error_message),
        },
    ]
    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            system=SYSTEM_MESSAGE_GENERIC,
            messages=chat_messages,
            max_tokens=2048
        )
        solution = response.content[0].text
    except Exception as exc:
        # Best effort: a failed request counts as an empty generation, but the
        # reason is reported, and KeyboardInterrupt / SystemExit are no longer
        # swallowed, so a long run can still be interrupted.
        print(f"Anthropic request failed: {exc!r}")
        solution = ""

    match = re.search(pattern, solution, re.DOTALL)
    return match.group(1).strip() if match else ""

In [7]:
from together import Together

# Module-level Together client used by run_model.
client = Together()

def run_model(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level Together ``client`` to the
    Meta-Llama-3.1-70B-Instruct-Turbo model; the content of the first choice
    is returned.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
def parse(program):
    """Extract the first ```python fenced block from ``program``.

    Returns the block's contents with surrounding whitespace stripped, or an
    empty string when no such block is present.
    """
    fenced = re.search(r"```python(.*?)```", program, re.DOTALL)
    if fenced is None:
        return ""
    return fenced.group(1).strip()
    
def transform_program(program, input_output):
    """Turn ``program`` into a self-contained, assert-driven script and trace it.

    The prompt (with two worked examples) asks the model reached through
    ``run_model`` to inline module-level variables and stdin reads and to end
    the program with a single ``assert``. The python fence of the reply is
    extracted with ``parse`` and handed to ``execute_and_trace``.

    Args:
        program: Source of the program to transform.
        input_output: Text interpolated into the prompt after
            "output, input and expected output:".

    Returns:
        ``(annotated_program, testable_program)``: the value returned by
        ``execute_and_trace`` and the transformed source it was given.
    """
    # Few-shot prompt; the literal below is sent to the model verbatim.
    prompt = f"""Given following program and input-output examples, firstly put any defined variable into the solution method or function. Remove any stdin expression and inplace directly the input into method call or function in a single assert\n\n. Only return the python transformed program only. The transformed program should include solve python code and a single assert, when the input parameters are inside method or function call in this assert statement. For example,
original program:
MOD = 123
class Solution:
    def add(self, a, b):
        return a + b + MOD

input: 1, 2
expected: 125
transformed program:
class Solution:
    def add(self, a, b):
        return a + b + 123

assert Solution().add(1, 2) == 125

original program:
N = int(input())
def add(a):
    return a + 1

print(add(N))

input: 1
expected: 2
transformed program:
def add(a):
    return a + 1

assert add(1) == 2
original program:
{program}
output, input and expected output: {input_output}
transformed program:"""
    # testable_program = model.generate_content([prompt]).text
    testable_program = run_model(prompt)
    # Keep only the fenced code; parse yields "" when the reply has no python fence.
    testable_program = parse(testable_program)
    annotated_program = execute_and_trace(testable_program)

    return annotated_program, testable_program

In [9]:
import time

# Retry policy for the transform-and-trace step.
max_retries = 2
wait_time = 10
next_programs, testable_programs = [], []

for idx in tqdm(range(len(dataset))):
    next_annotated_programs_per_problem = []
    testable_programs_per_problem = []
    for j, buggy_program in enumerate(dataset[idx]["failed_solution"]):
        error_message = dataset[idx]["error_messages"][j]
        # Attempts are counted from 1 so the counter can be printed directly.
        for attempt in range(1, max_retries + 1):
            try:
                next_annotated_program, testable_program = transform_program(buggy_program, error_message)
            except Exception as e:
                print(f"Attempt {attempt} failed: {e}")
                # A failed attempt leaves empty placeholders behind.
                next_annotated_program = testable_program = ""
                if attempt == max_retries:
                    print("Max retries reached. Moving on...")
                else:
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
            else:
                # Transform succeeded: stop retrying.
                break
        next_annotated_programs_per_problem.append(next_annotated_program)
        testable_programs_per_problem.append(testable_program)
    next_programs.append(next_annotated_programs_per_problem)
    testable_programs.append(testable_programs_per_problem)
    

 10%|▉         | 15/153 [01:14<12:58,  5.64s/it]

Attempt 1 failed: closing parenthesis ']' does not match opening parenthesis '(' on line 40 (eidetivbiylxtsidmjhmywxfvwtjlnnb.py, line 43)
Retrying in 10 seconds...


 10%|█         | 16/153 [01:35<23:39, 10.36s/it]

Attempt 2 failed: closing parenthesis ']' does not match opening parenthesis '(' on line 40 (mbxmmobqqwwjbrinjkawyisysppzejxn.py, line 43)
Max retries reached. Moving on...


100%|██████████| 153/153 [18:56<00:00,  7.43s/it]


In [16]:
# Show the traced program left in `annotated_buggy_program`, i.e. the value
# from the last iteration of the loop that assigns it.
print(annotated_buggy_program)

class Solution:
    def maxSelectedElements(self, nums): # (0) __module__=builtins; __qualname__=Solution; maxSelectedElements=<function Solution.maxSelectedElements at 0x7f698bfe13f0>; ...; (2) self=<Solution object at 0x7f69a810d030>
        nums.sort() # (3) self=<Solution object at 0x7f69a810e5f0>
        dp = {} # (4) self=<Solution object at 0x7f69a810f8e0>; dp={}
        max_length = 0 # (5) self=<Solution object at 0x7f69a810cdc0>; max_length=0
        for num in nums: # (6) self=<Solution object at 0x7f69a810d2a0>; num=1; ...; (22) self=<Solution object at 0x7f69a810f340>
            dp[num] = dp.get(num - 1, 0) + 1 # (7) self=<Solution object at 0x7f69a810e260>; dp={1: 1}; ...; (19) self=<Solution object at 0x7f69a810ddb0>; dp={1: 1, 2: 2, 4: 1, 5: 2, 7: 1, 8: 2, 10: 1}
            dp[num + 1] = dp.get(num, 0) + 1 # (8) self=<Solution object at 0x7f69a810c070>; dp={1: 1, 2: 2}; ...; (20) self=<Solution object at 0x7f69a810e4a0>; dp={1: 1, 2: 2, 4: 1, 5: 2, 7: 1, 8: 2, 10: 1, 

In [13]:
# Traced repair run: each prompt carries the annotated program taken from
# `next_programs` instead of the raw buggy source.
generations_all = []
samples = []
responses_all = []

for idx in tqdm(range(len(dataset))):
# for idx in range(1):
    problem = CodeGenerationProblem(dataset[idx])
    generations = []
    for j, buggy_program in enumerate(dataset[idx]["failed_solution"]):
        error_message = dataset[idx]["error_messages"][j]    
        # Trace stored for this (problem, sample) pair; this name stays bound
        # after the loop and is printed by another cell.
        annotated_buggy_program = next_programs[idx][j] 
        generations.append(run_next(problem, annotated_buggy_program, error_message))
    generations_all.append(generations)
    samples.append(problem.get_evaluation_sample())
    
# Evaluate the repaired programs, then compute the metrics over them.
results, metadata = evaluate_generations(samples, generations_all, num_process_evaluate=32, timeout=6)
metric = codegen_metrics(samples, generations_all)

100%|██████████| 153/153 [1:09:00<00:00, 27.06s/it]
 15%|█▌        | 23/153 [00:05<00:17,  7.32it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'smallestString'"), <traceback object at 0x7f69a9befd00>)


 20%|█▉        | 30/153 [00:06<00:12,  9.59it/s]

All tests passed!
All tests passed!

 23%|██▎       | 35/153 [00:06<00:08, 13.53it/s]




 27%|██▋       | 41/153 [00:06<00:06, 16.14it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'minimumOperations'"), <traceback object at 0x7f698bfaf780>)


 29%|██▉       | 44/153 [00:06<00:06, 16.42it/s]

All test cases passed!


 31%|███       | 47/153 [00:07<00:09, 11.70it/s]

All tests passed!


 34%|███▍      | 52/153 [00:07<00:08, 12.25it/s]

All tests passed!


 49%|████▉     | 75/153 [00:09<00:05, 14.83it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'minSizeSubarray'"), <traceback object at 0x7f69a9bb7a80>)


 51%|█████     | 78/153 [00:09<00:04, 15.08it/s]

[2, 1, -1]
[1, 2, 1]


 55%|█████▍    | 84/153 [00:09<00:03, 17.54it/s]

24
717356085


 63%|██████▎   | 97/153 [00:10<00:03, 16.41it/s]

All tests passed!
All tests passed!


 66%|██████▌   | 101/153 [00:10<00:03, 13.93it/s]

All tests passed!


 67%|██████▋   | 103/153 [00:11<00:03, 13.42it/s]

All test cases passed!
All tests passed!


 71%|███████   | 109/153 [00:11<00:03, 11.97it/s]

All test cases passed!


 75%|███████▌  | 115/153 [00:11<00:02, 15.34it/s]

All tests passed!


 76%|███████▋  | 117/153 [00:12<00:03, 10.41it/s]

Test case passed!


 80%|███████▉  | 122/153 [00:12<00:02, 15.25it/s]

4
3


 85%|████████▍ | 130/153 [00:12<00:01, 17.46it/s]

alarm went off


 90%|████████▉ | 137/153 [00:14<00:03,  4.64it/s]

alarm went off


 92%|█████████▏| 141/153 [00:16<00:03,  3.39it/s]

alarm went off


 93%|█████████▎| 143/153 [00:16<00:02,  4.39it/s]

alarm went off


 95%|█████████▍| 145/153 [00:17<00:01,  4.05it/s]

alarm went off
alarm went off


 95%|█████████▌| 146/153 [00:19<00:04,  1.71it/s]

alarm went off


 96%|█████████▌| 147/153 [00:25<00:10,  1.81s/it]

alarm went off


 97%|█████████▋| 148/153 [00:26<00:07,  1.54s/it]

alarm went off


 98%|█████████▊| 150/153 [00:29<00:04,  1.66s/it]

alarm went off


 99%|█████████▊| 151/153 [00:37<00:05,  2.95s/it]

alarm went off


 99%|█████████▉| 152/153 [00:53<00:06,  6.15s/it]

alarm went off


100%|██████████| 153/153 [01:07<00:00,  2.25it/s]


Evaluating 383...


 15%|█▍        | 57/383 [00:07<00:20, 16.11it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'smallestString'"), <traceback object at 0x7f698bf229c0>)


 25%|██▍       | 95/383 [00:10<00:19, 14.87it/s]

All tests passed!


 27%|██▋       | 102/383 [00:11<00:14, 19.64it/s]

All tests passed!


 27%|██▋       | 105/383 [00:11<00:15, 17.95it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'minimumOperations'"), <traceback object at 0x7f698bf229c0>)


 35%|███▌      | 135/383 [00:12<00:11, 21.50it/s]

unable to get function error = (<class 'AttributeError'>, AttributeError("module 'tmp_sol' has no attribute 'minSizeSubarray'"), <traceback object at 0x7f698bf229c0>)


 37%|███▋      | 140/383 [00:12<00:10, 23.95it/s]

All test cases passed!


 38%|███▊      | 147/383 [00:13<00:09, 23.97it/s]

All tests passed!


 39%|███▉      | 150/383 [00:13<00:10, 22.78it/s]

All tests passed!


 44%|████▍     | 168/383 [00:14<00:08, 26.86it/s]

alarm went off


 49%|████▊     | 186/383 [00:14<00:08, 24.40it/s]

alarm went off


 50%|████▉     | 191/383 [00:15<00:08, 23.76it/s]

All test cases passed!


 57%|█████▋    | 220/383 [00:18<00:15, 10.54it/s]

All tests passed!


 62%|██████▏   | 239/383 [00:19<00:09, 15.13it/s]

[2, 1, -1]
[1, 2, 1]


 63%|██████▎   | 241/383 [00:19<00:09, 14.85it/s]

alarm went off


 64%|██████▍   | 247/383 [00:19<00:08, 16.64it/s]

24
717356085


 72%|███████▏  | 274/383 [00:22<00:09, 11.02it/s]

All tests passed!


 73%|███████▎  | 281/383 [00:22<00:06, 15.39it/s]

All tests passed!


 78%|███████▊  | 297/383 [00:23<00:04, 17.67it/s]

All tests passed!
alarm went off


 81%|████████  | 311/383 [00:23<00:03, 22.83it/s]

alarm went off


 87%|████████▋ | 334/383 [00:25<00:02, 19.97it/s]

4
3
All test cases passed!


 91%|█████████ | 347/383 [00:25<00:01, 24.47it/s]

All tests passed!


 91%|█████████▏| 350/383 [00:25<00:01, 24.56it/s]

Test case passed!


 97%|█████████▋| 372/383 [00:26<00:00, 20.31it/s]

alarm went off


 98%|█████████▊| 375/383 [00:28<00:01,  5.02it/s]

alarm went off


 98%|█████████▊| 377/383 [00:30<00:02,  2.52it/s]

alarm went off
alarm went off


 99%|█████████▉| 379/383 [00:36<00:03,  1.04it/s]

alarm went off


 99%|█████████▉| 380/383 [00:37<00:03,  1.02s/it]

alarm went off


100%|█████████▉| 382/383 [00:43<00:01,  1.64s/it]

alarm went off


100%|██████████| 383/383 [01:19<00:00,  4.83it/s]


In [19]:
# Problem statement of the first dataset row.
print(dataset[0]["question_content"])

We have a sequence of length N consisting of positive integers: A=(A_1,\ldots,A_N). Any two adjacent terms have different values.
Let us insert some numbers into this sequence by the following procedure.

- If every pair of adjacent terms in A has an absolute difference of 1, terminate the procedure.
- Let A_i, A_{i+1} be the pair of adjacent terms nearest to the beginning of A whose absolute difference is not 1.
- If A_i < A_{i+1}, insert A_i+1,A_i+2,\ldots,A_{i+1}-1 between A_i and A_{i+1}.
- If A_i > A_{i+1}, insert A_i-1,A_i-2,\ldots,A_{i+1}+1 between A_i and A_{i+1}.


- Return to step 1.

Print the sequence when the procedure ends.

Input

The input is given from Standard Input in the following format:
N
A_1 A_2 \ldots A_N

Output

Print the terms in the sequence when the procedure ends, separated by spaces.

Constraints


- 2 \leq N \leq 100
- 1 \leq A_i \leq 100
- A_i \neq A_{i+1}
- All values in the input are integers.

Sample Input 1

4
2 5 1 2

Sample Output 1

2 3 4 5 4 3 2

In [20]:
# First failed solution stored for the first dataset row.
print(dataset[0]["failed_solution"][0])

import sys

# Read input
N = int(sys.stdin.readline().strip())
A = [int(x) for x in sys.stdin.readline().strip().split()]

# Helper function to insert numbers between adjacent terms
def insert_numbers(arr):
    i = 0
    while i < len(arr) - 1:
        if abs(arr[i] - arr[i+1]) != 1:
            if arr[i] < arr[i+1]:
                for j in range(arr[i]+1, arr[i+1]):
                    arr.insert(i+1, j)
                i += arr[i+1] - arr[i]
            else:
                for j in range(arr[i+1]+1, arr[i]):
                    arr.insert(i+1, j)
                i += arr[i] - arr[i+1]
        else:
            i += 1
    return arr

# Solve the problem
while True:
    A = insert_numbers(A)
    if all(abs(A[i] - A[i+1]) == 1 for i in range(len(A)-1)):
        break

# Print the final sequence
print(" ".join(map(str, A)))


In [14]:
# First element of the value returned by codegen_metrics: the aggregate
# pass@1 together with its per-problem detail.
metric[0]

{'pass@1': 0.4570806100217865,
 'detail': {'pass@1': {0: 0.0,
   1: 0.0,
   2: 0.0,
   3: 0.0,
   4: 0.0,
   5: 0.0,
   6: 0.0,
   7: 0.0,
   8: 0.0,
   9: 0.0,
   10: 0.0,
   11: 0.0,
   12: 0.0,
   13: 0.0,
   14: 0.0,
   15: 0.0,
   16: 0.0,
   17: 0.0,
   18: 0.0,
   19: 0.0,
   20: 0.0,
   21: 0.0,
   22: 0.0,
   23: 0.0,
   24: 0.0,
   25: 0.0,
   26: 0.0,
   27: 0.0,
   28: 0.0,
   29: 0.0,
   30: 0.0,
   31: 0.0,
   32: 0.0,
   33: 1.0,
   34: 1.0,
   35: 1.0,
   36: 1.0,
   37: 1.0,
   38: 1.0,
   39: 0.25,
   40: 0.0,
   41: 0.5,
   42: 1.0,
   43: 0.0,
   44: 0.0,
   45: 0.0,
   46: 1.0,
   47: 1.0,
   48: 0.0,
   49: 1.0,
   50: 0.6666666666666666,
   51: 1.0,
   52: 1.0,
   53: 0.5,
   54: 1.0,
   55: 1.0,
   56: 0.0,
   57: 1.0,
   58: 0.33333333333333326,
   59: 0.0,
   60: 1.0,
   61: 1.0,
   62: 1.0,
   63: 1.0,
   64: 0.0,
   65: 1.0,
   66: 1.0,
   67: 0.0,
   68: 0.0,
   69: 1.0,
   70: 0.0,
   71: 1.0,
   72: 1.0,
   73: 0.5,
   74: 1.0,
   75: 0.0,
   76: 1.0,
   