## Beispiel, wie man Code aus einem String ausführen kann

In [None]:
# Python source held as ordinary text; exec() below runs it as statements.
code_to_execute = "print('Hello, World!')"

try:
    exec(code_to_execute)
except Exception as error:
    # Report the failure instead of letting it stop the notebook.
    print(f"An error occurred: {error}")

## Following steps are needed for testing:
1. Identify code from agent output string (ask agent to respond only with code)
2. Save code in dedicated variable 
3. Compare code output with ground truth output defined by researcher

Before setting up the testing framework like this, we have to know which functions we want to test, because this determines the ground truth output. Therefore, the research into which functions to use should happen before setting up the framework. @Felix

See below for sample test walkthrough:

In [None]:
import pandas as pd

# Expected value of `result`; to be defined by the researcher.
desired_result = 3.0

# Prompt for the agent. It fixes the variable name ('result') that the check reads back.
agent_input = "How can I calculate the mean of all the values this list: [1, 2, 3, 4, 5] in Python? Please answer only with the code. Save the result in a variable called 'result'."

# Reply as an agent would return it (hard-coded here for the walkthrough).
agent_output = """
my_list = [1, 2, 3, 4, 5]
result = sum(my_list) / len(my_list)
print(result)
"""

# No extraction is needed in this example. If the agent output requires
# extraction, that should happen before this assignment.
code = agent_output

def test_my_function(code, desired_result):
    local_vars = {}
    exec(code, globals(), local_vars)
    agent_result = local_vars.get('result', None)

    if isinstance(agent_result, pd.DataFrame):
        if desired_result.equals(agent_output):
            print("Agent output was correct.")
        else:
            print("Agent output was not correct.")

    else:
        if agent_result == desired_result:
            print("Agent output was correct.")
        else:
            print('Agent output was not correct.')

# Check the sample agent code against the researcher-defined expectation.
test_my_function(code=code, desired_result=desired_result)

## Testing this with sample pandas function ".from_dummies()"

In [None]:
import pandas as pd

In [None]:
# Prompt that would be sent to the agent for this case.
agent_input = """
How can I convert this dataframe: df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) into a categorical dataframe,
using only pandas. Please only respond with the code. Use only pandas and the standard libraries. Save the result dataframe in a variable called "result".
"""

# Hard-coded sample reply; it stores its answer in `result`.
agent_output = """
import pandas as pd

df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})

result = df.astype("category")
"""

# Ground truth: the dummy-encoded frame converted back with pd.from_dummies.
df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})
desired_result = pd.from_dummies(df, sep="_")

# Compare what the sample reply produces with the ground truth.
test_my_function(code=agent_output, desired_result=desired_result)

## Testing with agent

In [None]:
from llms.agents.react import ReActAgent
from llms.clients.gpt import GPTClient
from llms.settings import settings

# Prompt sent to the agent.
user_prompt = """
How can I convert this dataframe: df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) into a categorical dataframe?
"""

# LLM client configured from the project settings, wrapped in the agent.
client = GPTClient(settings.SERVICE_KEY, settings.LLM_CONFIG)
agent = ReActAgent(client)

# Run the agent on the prompt and keep its reply for the following cells.
final_answer = agent.run(user_prompt)

In [None]:
final_answer # display the agent's reply; the next cell executes it and looks up `response_function`

In [None]:
# SECURITY NOTE: exec() runs whatever source the agent returned. Only do this
# in an environment where executing that code is acceptable.
# The reply is executed inside its own dictionary, which then holds the names it defined.
namespace = dict()
exec(final_answer, namespace)

# The reply is expected to define a function named `response_function`.
response_function = namespace['response_function']

# Apply the generated function to the dataframe from the prompt and show the outcome.
sample_df = pd.DataFrame({
    "col1_a": [1, 0, 1],
    "col1_b": [0, 1, 0],
    "col2_a": [0, 1, 0],
    "col2_b": [1, 0, 0],
    "col2_c": [0, 0, 1],
})
result_df = response_function(sample_df)

print(result_df)

## Building testing_framework

In [2]:
import pandas as pd
from tests.pandas import TEST_CASES
from llms.agents.react import ReActAgent
from llms.clients.gpt import GPTClient
from llms.settings import settings

# Build the LLM client: credentials and endpoints are read from the project
# settings, while the deployment and generation parameters are fixed here.
client = GPTClient(
    client_id=settings.CLIENT_ID,
    client_secret=settings.CLIENT_SECRET,
    auth_url=settings.AUTH_URL,
    api_base=settings.API_BASE,
    deployment_id="gpt-4-32k",
    max_response_tokens=1000,
    temperature=0.0,  # presumably chosen to keep answers reproducible across runs — TODO confirm
)
# Agent under test, wrapping the client created above.
agent = ReActAgent(client)

def _load_function(source, name):
    """Execute ``source`` in a fresh namespace and return the object it binds to ``name``.

    Raises whatever ``exec`` raises for broken source, or ``KeyError`` when the
    source runs but never defines ``name``.
    """
    namespace = {}
    exec(source, namespace)
    return namespace[name]


def _collect_arguments(defined):
    """Return the positional arguments for a test case.

    ``defined`` maps the names created by the test case's data string to their
    values. A test case provides ``data_1``/``data_2``/``data_3``,
    ``data_1``/``data_2``, or a single ``data`` variable (three inputs is the
    maximum used by the test cases).
    """
    if 'data_1' in defined and 'data_2' in defined:
        names = ['data_1', 'data_2']
        if 'data_3' in defined:
            names.append('data_3')
        return [defined[name] for name in names]
    return [defined.get('data', None)]


def _results_match(desired_result, agent_result):
    """Return True when ``agent_result`` equals ``desired_result``.

    This has to be extended each time another data type is expected as the
    desired output.
    """
    pandas_types = (pd.DataFrame, pd.Series, pd.Index)
    if isinstance(desired_result, pandas_types):
        # .equals returns False (rather than raising) for None or a different type.
        return bool(desired_result.equals(agent_result))
    if isinstance(agent_result, pandas_types):
        # A pandas object cannot match a non-pandas expectation; `==` would give
        # an element-wise result whose truth value is ambiguous.
        return False
    return bool(agent_result == desired_result)


test_results = []

for test_case in TEST_CASES:
    agent_error = None  # error raised while loading or running the agent code, if any
    agent_result = None

    # Get the response function from the agent.
    # SECURITY: the agent's reply is executed as code; run this only where that is acceptable.
    final_answer = agent.run(test_case['user_prompt'])
    try:
        response_function = _load_function(final_answer, 'response_function')
    except Exception as e:
        # Broken or incomplete agent code fails this test case, not the whole run.
        response_function = None
        agent_error = e

    # Build the input data; the data string may use names from this notebook (e.g. `pd`).
    local_vars = {}
    exec(test_case['data'], globals(), local_vars)
    arguments = _collect_arguments(local_vars)

    # Compute the desired result with the reference implementation.
    # NOTE(review): the same argument objects are passed to both functions, so a
    # reference function that modifies its input in place would affect the agent call.
    correct_function = _load_function(test_case['correct_function'], 'correct_function')
    desired_result = correct_function(*arguments)

    # Run the agent function on the same data and keep any error it raises.
    if response_function is not None:
        try:
            agent_result = response_function(*arguments)
        except Exception as e:
            agent_result = None
            agent_error = e

    # A failed agent run never counts as correct, even if the desired result is None.
    is_correct = agent_error is None and _results_match(desired_result, agent_result)

    if is_correct:
        print(f"Agent output was correct for test case {test_case['id']}.")
        outcome = f"correct for test case {test_case['id']}"
    else:
        print(f"Agent output was not correct for test case {test_case['id']}.")
        outcome = f"false for test case {test_case['id']}"

    test_results.append({
        'result': outcome,
        'agent_result': agent_result,
        'desired_result': desired_result,
        'agent_error': agent_error,
    })

ModuleNotFoundError: No module named 'llms'

# Experimentation

In [None]:
# experimentation code