In [1]:
import json 
import textwrap
import requests
from tqdm import tqdm
from datasets import load_dataset, Dataset

In [2]:
# Prompt template asking a model to solve a programming problem.
# Placeholders: {target_lang} (language to implement the solution in) and
# {question} (the problem statement).
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# filled with str.format by generation code elsewhere -- confirm.
GENERATION_TEMPLATE = """
## Instructions
You are an expert competitive programmer who excels at solving algorithm problems in multiple programming languages.
Your task is to implement a solution to the following problem in {target_lang}.

## Problem Description
{question}

## Output Format
- Provide the complete solution code in **one markdown code block** with appropriate language identifier.
- Handle the input and output as specified in the problem statement.
- EXCLUDE ALL explanations and code comments.
"""

In [3]:
# Source of a standalone Python test script, rendered with str.format.
# Placeholders:
#   {solution_code}  - body of `solution()`; must already be indented by the caller.
#   {test_case_list} - Python-evaluable list of {'input': ..., 'output': ...} dicts.
#   {case_multiply}  - integer; the test case list is repeated this many times.
# `{{i}}` is brace-escaped so it survives str.format and becomes the f-string
# field `{i}` in the generated script.
# The generated script feeds each case's input to `solution()` through a
# replaced sys.stdin, captures sys.stdout, and asserts the captured text equals
# the expected output exactly. After the unittest run it prints "Success" when
# every test passed and "Failed" otherwise.
EVALUATION_TEMPLATE = """import io
import sys
import unittest

def solution():
{solution_code}

class TestSolution(unittest.TestCase):
    def run_io_fun(self, input_data):
        backup_stdin = sys.stdin
        backup_stdout = sys.stdout
        try:
            sys.stdin = io.StringIO(input_data)
            output_catcher = io.StringIO()
            sys.stdout = output_catcher

            solution()

            output_catcher.seek(0)
            return output_catcher.read()
        finally:
            sys.stdin = backup_stdin
            sys.stdout = backup_stdout

def make_test_function(input_data, expected):
    def test_function(self):
        actual = self.run_io_fun(input_data)
        self.assertEqual(expected, actual)
    return test_function

test_case_list = {test_case_list}
test_case_list = test_case_list * {case_multiply}

for i, case in enumerate(test_case_list, start=1):
    test_name = f"test_case_{{i}}"
    test_func = make_test_function(case['input'], case['output'])
    setattr(TestSolution, test_name, test_func)

if __name__ == '__main__':
    result = unittest.main(verbosity=2, exit=False)
    
    # If all tests passed, print "Success".
    if result.result.wasSuccessful():
        print("Success")
    else:
        print("Failed")
"""

In [4]:
def taco_sync_evaluation(solution_code: str, test_case_list_str, case_multiply: int, timeout: int) -> dict:
    """Run one solution against its test cases on the remote Monolith executor.

    The solution is indented and embedded into EVALUATION_TEMPLATE as the body
    of `solution()`, then the resulting script is POSTed to the executor.

    Args:
        solution_code: Python source of the solution (reads stdin, writes stdout).
        test_case_list_str: Text substituted verbatim into the generated script
            as the test case list (the caller passes a JSON-encoded list of
            {"input": ..., "output": ...} dicts).
        case_multiply: Number of times the test case list is repeated in the
            generated script.
        timeout: Execution timeout sent to the executor; also used as the
            client-side read timeout of the HTTP request.

    Returns:
        A dict with keys:
            'passed'   - True only if the executor reported success and the
                         script's stdout was exactly 'Success\\n'.
            'time', 'memory', 'integral' - the executor's 'duration',
                         'peak_memory' and 'integral' values, or inf when
                         unavailable (units are defined by the executor).
            'status'   - the executor's status, or one of 'too large',
                         'timeout (server)', 'timeout (client)', 'error'.

    Ordinary failures are reported through 'status' instead of being raised.
    KeyboardInterrupt and other BaseException subclasses propagate, so a
    long-running evaluation loop can still be interrupted.
    """
    response = {'passed': False, 'time': float('inf'), 'memory': float('inf'), 'integral': float('inf'), 'status': 'error'}
    try:
        # Construct Test Code
        indented_solution = textwrap.indent(solution_code.strip(), "\t")
        test_code = EVALUATION_TEMPLATE.format(
            solution_code=indented_solution,
            test_case_list=test_case_list_str,
            case_multiply=case_multiply,  # honour the caller's value (was hard-coded to 100)
        )

        # Submit Test Code to Monolith
        data = {
            'code': test_code,
            'language': 'python',
            'libraries': [],
            'timeout': timeout,
            'run_profiling': True
        }
        # requests timeout tuple is (connect timeout, read timeout).
        monolith_response = requests.post('https://monolith.cool/execute', json=data, timeout=(120, timeout))
        if monolith_response.status_code == 200:
            payload = monolith_response.json()

            response['status'] = payload['status']
            if payload['status'] == "success":
                output = payload['output_dict']
                # Read every field before touching `response`, so a malformed
                # payload cannot leave a half-filled result (e.g. passed=True
                # together with status='error').
                metrics = {
                    'passed': output['stdout'] == 'Success\n',
                    'time': output['duration'],
                    'memory': output['peak_memory'],
                    'integral': output['integral'],
                }
                response.update(metrics)
        elif monolith_response.status_code == 413:
            response['status'] = "too large"
        else:
            raise requests.exceptions.RequestException("API Error: " + str(monolith_response.content), monolith_response.status_code)
    except requests.exceptions.ReadTimeout:
        response['status'] = 'timeout (server)'
    except requests.exceptions.ConnectionError:
        response['status'] = 'timeout (client)'
    except Exception as e:
        print("Evaluation Error: ", e)
        response['status'] = 'error'
    # Deliberately not `finally: return ...`: returning from a finally clause
    # discards any in-flight exception, including KeyboardInterrupt.
    return response

In [5]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# Loads the "Elfsong/TACO_Python" dataset. The evaluation loop iterates over
# ds['train'] and reads each row's JSON-encoded 'solutions' and 'input_output'.
ds = load_dataset("Elfsong/TACO_Python")

In [None]:
# Evaluate every solution listed for each training instance against that
# instance's test cases and print whether it passed.
for instance in tqdm(ds['train']):
    # 'solutions' and 'input_output' are stored as JSON-encoded strings.
    solutions = json.loads(instance['solutions'])
    input_output = json.loads(instance['input_output'])
    # Pair inputs with expected outputs once per instance; the serialized list
    # is reused for every solution of that instance.
    test_case_list_str = json.dumps([
        {"input": input_, "output": output_}
        for input_, output_ in zip(input_output['inputs'], input_output['outputs'])
    ])

    for solution_code in solutions:
        # taco_sync_evaluation renders EVALUATION_TEMPLATE itself, so the test
        # script is not built here.
        response = taco_sync_evaluation(
            solution_code=solution_code,
            test_case_list_str=test_case_list_str,
            case_multiply=1,
            timeout=30,
        )
        print("passed" if response['passed'] else "failed")
            

  0%|          | 0/10466 [00:00<?, ?it/s]

failed
failed
failed
