In [1]:
import base64
import json
import pickle
import zlib
from dataclasses import dataclass
from enum import Enum

from datasets import load_dataset

from src.utils.utils import evaluate_generations, codegen_metrics

# Load the "huypn16/LCB-R" dataset via `datasets.load_dataset`; later cells
# read individual problems from its "test" split.
dataset = load_dataset("huypn16/LCB-R")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 854/854 [00:00<00:00, 3.73kB/s]
Downloading data: 100%|██████████| 870M/870M [00:46<00:00, 18.6MB/s] 
Downloading data: 100%|██████████| 440M/440M [00:21<00:00, 20.0MB/s] 
Downloading data: 100%|██████████| 677M/677M [00:39<00:00, 17.3MB/s] 
Downloading data: 100%|██████████| 653M/653M [00:27<00:00, 23.8MB/s] 
Downloading data: 100%|██████████| 607M/607M [00:26<00:00, 22.9MB/s] 
Downloading data: 100%|██████████| 414M/414M [00:32<00:00, 12.9MB/s] 
Downloading data: 100%|██████████| 544M/544M [03:08<00:00, 2.88MB/s] 
Downloading data: 100%|██████████| 59.9M/59.9M [00:02<00:00, 20.0MB/s]
Downloading data: 100%|██████████| 106M/106M [00:05<00:00, 18.8MB/s] 
Downloading data: 100%|██████████| 109M/109M [00:05<00:00, 21.2MB/s] 
Downloading data: 100%|██████████| 104M/104M [00:05<00:00, 20.3MB/s] 
Downloading data: 100%|██████████| 79.3M/79.3M [00:07<00:00, 10.1MB/s]
Downloading data: 100%|██████████| 99.6

In [11]:
# Peek at the first entry of `failed_generations` for test row 5; the output
# below shows it carries `failed_solution` and `failed_test_cases` keys.
dataset["test"][5]["failed_generations"][0]

{'failed_solution': 'from collections import deque\n\ndef find_min_operations(N, M, sets):\n    queue = deque()\n    for i in range(N):\n        current_set = set(sets[i])\n        if 1 in current_set and M in current_set:\n            return 0\n        queue.append(current_set)\n\n    operations = 0\n    while queue:\n        size = len(queue)\n        for _ in range(size):\n            set1 = queue.popleft()\n            for i in range(len(queue)):\n                set2 = queue[i]\n                if set1.intersection(set2):\n                    new_set = set1.union(set2)\n                    if 1 in new_set and M in new_set:\n                        return operations + 1\n                    queue[i] = new_set\n                    queue.append(new_set)\n                    queue.remove(set1)\n                    queue.remove(set2)\n                    break\n            else:\n                queue.append(set1)\n        operations += 1\n    return -1',
 'failed_test_cases': [0]}

In [34]:
class PromptConstants:
    """Namespace of prompt strings: per-model system messages and format hints.

    Every attribute is a plain string constant; nothing here is interpolated
    at runtime.
    """

    # --- System messages -------------------------------------------------
    SYSTEM_MESSAGE_GENERIC = (
        "You are an expert Python programmer. "
        "You will be given a question (problem specification) and will generate a correct Python program "
        "that matches the specification and passes all tests. "
        "You should think step-by-step logically before returning final the program. "
        "The program should only include function definition with parameter list in order."
    )

    SYSTEM_MESSAGE_GEMINI = (
        "You are an expert Python programmer. "
        "You will be given a question (problem specification) and will generate a correct Python program "
        "that matches the specification and passes all tests. "
        "You will NOT return anything except for the program. "
        "Do NOT use system calls like `exit` in the generated program."
    )

    SYSTEM_MESSAGE_DEEPSEEK = (
        "You are an AI programming assistant, utilizing the DeepSeek Coder model, "
        "developed by DeepSeek Company, and you answer questions related to computer science."
    )

    SYSTEM_MESSAGE_MAGIC = (
        "You are an exceptionally intelligent coding assistant that consistently delivers "
        "accurate and reliable responses to user instructions."
        "\n\n@@ Instruction\n"
    )

    SYSTEM_MESSAGE_WIZARD = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )

    # The trailing spaces before the "\n" escapes are part of the prompt text.
    SYSTEM_MESSAGE_PHIND = (
        "You are an expert Python programmer. "
        "You will be given a question (problem specification) and will generate a correct Python program "
        "that matches the specification and passes all tests. "
        "You will NOT return anything except for the program. "
        "Put your fixed program within code delimiters, for example: \n"
        "```python \n"
        "# YOUR CODE HERE\n"
        "```"
    )

    SYSTEM_MESSAGE_CODEQWEN = (
        "<|im_start|>system\n"
        "You are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user"
    )

    # --- Formatting hints appended to the question -----------------------
    FORMATTING_MESSAGE_WITH_STARTER_CODE = (
        "You will use the following starter code to write the solution to the problem "
        "and enclose your code within delimiters."
    )

    FORMATTING_WITHOUT_STARTER_CODE = (
        "Read the inputs from stdin solve the problem and write the answer to stdout "
        "(do not directly test on the sample inputs). "
        "Enclose your code within delimiters as follows."
    )

In [35]:
class TestType(Enum):
    """Allowed values for a test case's `testtype` field.

    NOTE(review): presumably STDIN marks tests fed through standard input and
    FUNCTIONAL marks tests that call a named function — confirm against the
    evaluator in `src.utils.utils`.
    """
    STDIN = "stdin"
    FUNCTIONAL = "functional"

@dataclass
class Test:
    """One test case: an input string, an output string and the test kind."""
    input: str
    output: str
    # Always a TestType after construction, even if a raw value was passed in.
    testtype: TestType

    def __post_init__(self):
        # Coerce the constructor argument (e.g. the string "stdin") into the
        # enum; an unrecognised value raises ValueError.
        self.testtype = TestType(self.testtype)
        
class CodeGenerationProblem:
    """A single code-generation problem built from one raw dataset row.

    Attributes set by the constructor:
        question_content: problem statement copied from the row.
        starter_code: starter code from the row, or ``None`` when it is empty.
        public_test_cases: list of ``Test`` decoded from the row's JSON string.
        private_test_cases: list of ``Test`` decoded from plain JSON or from
            the compressed encoding (see ``__init__``).
        metadata: dict decoded from the row's JSON ``metadata`` field, or
            ``{}`` when the row has no such key.
    """

    def __init__(self, data_dict: dict):
        """Parse a raw dataset row.

        Args:
            data_dict: mapping with at least ``question_content``,
                ``starter_code``, ``public_test_cases`` and
                ``private_test_cases``; ``metadata`` is optional.

        Raises:
            KeyError: a required key is missing from ``data_dict``.
            ValueError: a test-case or metadata field cannot be decoded.
        """
        self.question_content = data_dict["question_content"]
        # Empty (or null) starter code is normalised to None so callers can
        # simply test truthiness.
        self.starter_code = data_dict["starter_code"] or None
        self.public_test_cases = [
            Test(**t) for t in json.loads(data_dict["public_test_cases"])
        ]
        self.metadata = json.loads(data_dict["metadata"]) if "metadata" in data_dict else {}

        raw_private = data_dict["private_test_cases"]
        try:
            private_cases = json.loads(raw_private)
        except (TypeError, ValueError):
            # Not plain JSON: fall back to base64 -> zlib -> pickle -> JSON.
            # json.JSONDecodeError is a ValueError subclass, so this catches a
            # failed parse without hiding unrelated errors.
            # NOTE(security): pickle.loads can execute arbitrary code; only
            # use this path with datasets from a trusted source.
            private_cases = json.loads(
                pickle.loads(
                    zlib.decompress(base64.b64decode(raw_private.encode("utf-8")))
                )
            )
        self.private_test_cases = [Test(**t) for t in private_cases]

    def get_evaluation_sample(self):
        """Return the evaluator payload for this problem.

        Returns:
            A dict with one key, ``"input_output"``, holding a JSON string
            with ``inputs`` and ``outputs`` (public tests first, then private)
            and ``fn_name`` taken from ``metadata["func_name"]`` (``None`` if
            absent).
        """
        all_tests = self.public_test_cases + self.private_test_cases
        return {
            "input_output": json.dumps(
                {
                    "inputs": [t.input for t in all_tests],
                    "outputs": [t.output for t in all_tests],
                    "fn_name": self.metadata.get("func_name", None),
                }
            ),
        }
    
def get_generic_question_template_answer(question: CodeGenerationProblem):
    """Build the user-turn prompt text for `question`.

    The prompt is the question text, then a format section (the starter code
    when the problem has one, otherwise a placeholder code fence), then the
    answer header.
    """
    sections = [f"### Question:\n{question.question_content}\n\n"]

    if not question.starter_code:
        # No starter code: ask for a stdin/stdout program and show the fence.
        sections.append(
            f"### Format: {PromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n"
        )
        sections.append("Your reasoning: ....```python\n# YOUR CODE HERE\n```\n\n")
    else:
        # Starter code present: embed it in a python fence.
        sections.append(
            f"### Format: {PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
        )
        sections.append(f"```python\n{question.starter_code}\n```\n\n")

    sections.append("### Answer: (use the provided format with backticks)\n\n")
    return "".join(sections)

In [36]:
import anthropic
import re
import os

# Regex source (kept as a plain string, flags are passed at the call site):
# captures the text between a "```python" fence and the next "```".
pattern = r"```python(.*?)```"

# Shared Anthropic client; the API key is read from the ANTHROPIC_API_KEY
# environment variable (None if it is unset).
client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
def run(problem: CodeGenerationProblem, num_samples: int = 1):
    """Sample completions for `problem` and extract the code from each one.

    Args:
        problem: the problem to prompt the model with.
        num_samples: how many independent completions to request.

    Returns:
        ``(generations, raw_responses)``: two lists, each exactly
        ``num_samples`` long and index-aligned. ``generations[i]`` is the
        stripped contents of the first python code fence in
        ``raw_responses[i]``, or ``""`` when the response has no such fence.
    """
    generations = []
    raw_responses = []
    chat_messages = [
        {
            "role": "user",
            "content": get_generic_question_template_answer(problem),
        },
    ]
    for _ in range(num_samples):
        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            system=PromptConstants.SYSTEM_MESSAGE_GENERIC,
            messages=chat_messages,
            max_tokens=2048
        )
        solution = response.content[0].text
        # Always keep the raw text so failed extractions can be inspected.
        raw_responses.append(solution)
        match = re.search(pattern, solution, re.DOTALL)
        # Record an empty program when no fenced block is found. Dropping the
        # sample would leave this problem with fewer generations than
        # requested (possibly none) and break alignment with raw_responses.
        generations.append(match.group(1).strip() if match else "")
    return generations, raw_responses

In [37]:
import tqdm 
# Per-problem accumulators, appended to once per loop iteration so index i in
# each list refers to the same problem.
generations_all = []
samples = []
responses_all = []

# Only the first 50 rows of the test split are run; the commented-out line is
# the full-split variant.
# for idx in range(len(dataset["test"])):
for idx in tqdm.tqdm(range(50)):
    problem = CodeGenerationProblem(dataset["test"][idx])
    # One sample per problem.
    generations, raw_responses = run(problem, num_samples=1)
    generations_all.append(generations)
    responses_all.append(raw_responses)
    samples.append(problem.get_evaluation_sample())
    
# Evaluate every collected generation against its problem's tests.
# NOTE(review): `timeout=20` is presumably a per-test limit in seconds —
# confirm in `src.utils.utils.evaluate_generations`.
results, metadata = evaluate_generations(samples, generations_all, timeout=20)

100%|██████████| 50/50 [07:12<00:00,  8.65s/it]
100%|██████████| 50/50 [01:04<00:00,  1.28s/it]


In [38]:
# Score the collected generations; the output below reports pass@1 overall
# and per problem index, followed by per-problem result lists.
codegen_metrics(samples, generations_all)

Evaluating 50...


100%|██████████| 50/50 [00:21<00:00,  2.28it/s]


[{'pass@1': 0.02,
  'detail': {'pass@1': {0: 0.0,
    1: 0.0,
    2: 0.0,
    3: 0.0,
    4: 0.0,
    5: 0.0,
    6: 0.0,
    7: 1.0,
    8: 0.0,
    9: 0.0,
    10: 0.0,
    11: 0.0,
    12: 0.0,
    13: 0.0,
    14: 0.0,
    15: 0.0,
    16: 0.0,
    17: 0.0,
    18: 0.0,
    19: 0.0,
    20: 0.0,
    21: 0.0,
    22: 0.0,
    23: 0.0,
    24: 0.0,
    25: 0.0,
    26: 0.0,
    27: 0.0,
    28: 0.0,
    29: 0.0,
    30: 0.0,
    31: 0.0,
    32: 0.0,
    33: 0.0,
    34: 0.0,
    35: 0.0,
    36: 0.0,
    37: 0.0,
    38: 0.0,
    39: 0.0,
    40: 0.0,
    41: 0.0,
    42: 0.0,
    43: 0.0,
    44: 0.0,
    45: 0.0,
    46: 0.0,
    47: 0.0,
    48: 0.0,
    49: 0.0}}},
 defaultdict(list,
             {0: [[-1]],
              1: [[-1]],
              2: [[-1]],
              3: [[-1]],
              4: [[-1]],
              5: [[-1]],
              6: [[-1]],
              7: [[True,
                True,
                True,
                True,
                True,
            