# Code generation and test workflow for BigCodeBench

This is an example of how to use the OpenAI API to evaluate the BigCodeBench benchmark.

## Imports

In [1]:
import os
import re
from openai import OpenAI
import json
import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm
from datasets import load_dataset
import time
# Put the API key into this process's environment. Replace the placeholder
# with a real key before running, and do not commit the real key.
# NOTE(review): this assignment unconditionally overwrites any OPENAI_API_KEY
# that was already exported in the shell. The OpenAI() client created later is
# constructed without an explicit key, so it presumably reads this variable.
os.environ['OPENAI_API_KEY'] = "Your OpenAI API key here"

## Parameters for OpenAI API

In [2]:
# Model identifier passed as `model` on every request made by generate_gpt4_beta.
OPENAI_MODEL = "gpt-4o"
# Sampling settings forwarded unchanged to the chat-completions call.
# NOTE(review): temperature 0, top_p 1 and zero penalties look chosen to make
# the output as repeatable as possible — confirm this is the intent.
OPENAI_TEMPERATURE = 0
OPENAI_TOP_P = 1
OPENAI_FREQUENCY_PENALTY = 0
OPENAI_PRESENCE_PENALTY = 0

## Load Bigcodebench

In [3]:
# Load the dataset identified as "bigcode/bigcodebench".
ds = load_dataset("bigcode/bigcodebench")
# Turn the entry keyed 'v0.1.2' into a DataFrame; the later cells iterate over
# its rows, one row per benchmark task.
df = pd.DataFrame(ds['v0.1.2'])
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Unnamed: 0,task_id,complete_prompt,instruct_prompt,canonical_solution,code_prompt,test,entry_point,doc_struct,libs
0,BigCodeBench/0,import itertools\nfrom random import shuffle\n...,Calculates the average of the sums of absolute...,permutations = list(itertools.permutations...,import itertools\nfrom random import shuffle\n...,import unittest\nfrom unittest.mock import pat...,task_func,"{""description"": [""Calculates the average of th...","['random', 'itertools']"
1,BigCodeBench/1,import collections\nimport random\nimport stri...,Generate a random string of the specified leng...,if length < 0:\n raise ValueError\n...,import collections\nimport random\nimport stri...,import unittest\nimport string\nclass TestCase...,task_func,"{""description"": [""Generate a random string of ...","['collections', 'random', 'string']"
2,BigCodeBench/2,import random\nimport statistics\n\ndef task_f...,Create a dictionary in which keys are random l...,"random_dict = {k: [random.randint(0, 100) ...",import random\nimport statistics\ndef task_fun...,import unittest\nclass TestCases(unittest.Test...,task_func,"{""description"": [""Create a dictionary in which...","['statistics', 'random']"
3,BigCodeBench/3,import random\nimport numpy as np\n\ndef task_...,Create a dictionary where keys are specified l...,"random_dict = {k: [random.randint(0, 100) ...",import random\nimport numpy as np\ndef task_fu...,import unittest\n \nclass TestCases(unittes...,task_func,"{""description"": [""Create a dictionary where ke...","['numpy', 'random']"
4,BigCodeBench/4,from collections import Counter\nimport iterto...,Count the occurrence of each integer in the va...,count_dict = Counter(itertools.chain.from_...,from collections import Counter\nimport iterto...,import unittest\nclass TestCases(unittest.Test...,task_func,"{""description"": [""Count the occurrence of each...","['collections', 'itertools']"


## Create local folder for bigcodebench

In [None]:
# Materialise every benchmark task as a folder of files under
# dataset/bigcodebench/<task_id with "/" replaced by "_">/.

# Trailer appended to both generated test modules so each can be run directly
# as a script.
unittest_main_guard = '\n\nif __name__ == "__main__":\n    unittest.main()'

for _, row in df.iterrows():
    task_path = f'dataset/bigcodebench/{row["task_id"].replace("/", "_")}'
    # exist_ok=True replaces the separate existence check and keeps the cell
    # safely re-runnable.
    os.makedirs(task_path, exist_ok=True)

    # File name -> content for everything written into the task folder.
    task_files = {
        # Tests wired against the (to-be-generated) solution module.
        'test.py': 'from task_func import *\n' + row['test'] + unittest_main_guard,
        'instruction.txt': row['instruct_prompt'],
        # Empty placeholder; note that re-running this cell resets it.
        'task_func.py': "",
        # Reference solution = code prompt immediately followed by the
        # canonical body.
        'canonical_solution.py': row['code_prompt'] + row['canonical_solution'],
        # Same tests, wired against the reference solution instead.
        'canonical_solution_test.py': (
            'from canonical_solution import task_func\n' + row['test'] + unittest_main_guard
        ),
        'complete_prompt.py': row['complete_prompt'],
    }

    for file_name, content in task_files.items():
        # Explicit UTF-8: the default encoding is locale-dependent, and task
        # text containing non-ASCII characters could otherwise fail to write
        # or produce .py files Python cannot import.
        with open(os.path.join(task_path, file_name), 'w', encoding='utf-8') as f:
            f.write(content)

## OpenAI API call function wrapper

In [5]:
def generate_gpt4_beta(client, system_prompt, user_prompt, output_format):
    """Send one system + user exchange and return the first choice's message.

    Args:
        client: Object exposing ``beta.chat.completions.parse``.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        output_format: Value forwarded as ``response_format``.

    Returns:
        The ``message`` attribute of the first entry in the completion's
        ``choices``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    # Sampling settings come from the module-level OPENAI_* constants; the
    # seed is fixed at 42 for every request.
    response = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=conversation,
        response_format=output_format,
        temperature=OPENAI_TEMPERATURE,
        top_p=OPENAI_TOP_P,
        frequency_penalty=OPENAI_FREQUENCY_PENALTY,
        presence_penalty=OPENAI_PRESENCE_PENALTY,
        seed=42,
    )
    first_choice = response.choices[0]
    return first_choice.message

## Prompt Definition

In [None]:
# System prompt: the generation loop passes it to generate_gpt4_beta, which
# sends it as the "system" message of every request.
# NOTE(review): the text contains the typo "a instruction". It is left as-is
# here because editing the prompt changes what is sent to the model.
GENERATION_SYSTEM_PROMPT = '''You are an expert programmer. Your task is to complete a code snippet based on the provided instructions and requirements. You will be given a instruction about the coding task.

# Your completed code must:
1. Strictly adhere to the described behavior and requirements
2. Be clear, concise, and maintainable, following programming best practices.
3. Avoid adding unnecessary features or deviating from the described requirements.
4. Handle any specified edge cases or constraints appropriately.
Ensure your implementation is robust and aligns with the functional requirements derived from the provided information.
'''

# User-message template: the single `{}` placeholder is filled with the task's
# instruct_prompt via str.format in the generation loop.
# NOTE(review): "retrun" is a typo for "return"; left untouched for the same
# reason as above.
USER_PROMPT = '''\n# INSTRUCTION:
{}

According to the information, retrun the complete code.
'''

# Structured-output model handed to generate_gpt4_beta as the response format.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably end up in the schema sent to the API; confirm before adding one.
class decomposed_instruction(BaseModel):
  # Text that the generation loop writes into the task's solution file.
  completed_code: str


In [None]:
# Generate a solution for every task and store it as task_func.py inside the
# task's folder, which is the module that the task's test.py imports from.
client = OpenAI()
target_folder = 'dataset/bigcodebench/BigCodeBench_{}'
# Crude client-side throttle: each request is padded to take at least this
# many seconds, to stay under the API rate limit.
min_seconds_per_request = 5

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Tasks"):
    # task_id looks like "BigCodeBench/<n>"; keep only the part after the slash.
    task_id = row['task_id'].split('/')[1]
    task_folder = target_folder.format(task_id)
    instruct_prompt = row['instruct_prompt']
    user_instruction = USER_PROMPT.format(instruct_prompt)

    start_time = time.time()
    message = generate_gpt4_beta(client, GENERATION_SYSTEM_PROMPT, user_instruction, decomposed_instruction)
    time_taken = time.time() - start_time
    if time_taken < min_seconds_per_request:
        time.sleep(min_seconds_per_request - time_taken)

    # `parsed` is None when the model produced no structured output (for
    # example a refusal). Report and move on instead of aborting the run.
    if message.parsed is None:
        refusal = getattr(message, 'refusal', None)
        tqdm.write(f"Skipping {row['task_id']}: no parsed output (refusal: {refusal!r})")
        continue

    # Make sure the folder exists even if the dataset-creation cell was skipped.
    os.makedirs(task_folder, exist_ok=True)
    # Fixes relative to the previous version:
    #  * the file name no longer carries a stray trailing dot ("task_func.py."),
    #    so the tests actually import the generated code;
    #  * completed_code is already a single string, so it is written as-is
    #    (joining it with '\n' put every character on its own line);
    #  * explicit UTF-8 so non-ASCII characters in generated code are written
    #    consistently on every platform.
    with open(f'{task_folder}/task_func.py', 'w', encoding='utf-8') as file:
        file.write(message.parsed.completed_code)
    