## Example: how to execute code from a string

In [None]:
# Python source held as a plain string; exec() below runs it as statements.
code_to_execute = "print('Hello, World!')"

try:
    exec(code_to_execute)
except Exception as e:
    # Any exception raised by the executed string is reported here instead of
    # propagating out of the cell.
    print(f"An error occurred: {e}")

## The following steps are needed for testing:
1. Identify code from agent output string (ask agent to respond only with code)
2. Save code in dedicated variable
3. Compare code output with ground truth output defined by researcher

Before setting up the testing framework like this, we have to know which functions we want to test, because this determines the ground truth output. Therefore, the research into which functions to use should happen before setting up the framework. @Felix

See below for a sample test walkthrough:

In [None]:
import pandas as pd

# Prompt for the agent: asks for code only and for the answer to be stored in `result`.
agent_input = "How can I calculate the mean of all the values this list: [1, 2, 3, 4, 5] in Python? Please answer only with the code. Save the result in a variable called 'result'."

# Hard-coded stand-in for the agent's reply; no agent is called in this cell.
agent_output = """
my_list = [1, 2, 3, 4, 5]
result = sum(my_list) / len(my_list)
print(result)
"""

code = agent_output  # not needed for this example; if the agent output requires extraction, it should happen before this statement

desired_result = 3.0  # to be defined by the researcher

def test_my_function(code, desired_result):
    """Execute agent-generated code and report whether its ``result`` is right.

    The source string is run with ``exec`` using the module globals and a fresh
    local namespace. Whatever the code binds to the name ``result`` is compared
    with ``desired_result`` and the verdict is printed.

    Args:
        code: Python source that is expected to assign a variable ``result``.
        desired_result: Ground-truth value; a ``pd.DataFrame`` is compared with
            ``DataFrame.equals``, anything else with ``==``.

    Returns:
        None. The verdict is written to stdout.
    """
    local_vars = {}
    # NOTE: exec runs the string unsandboxed; only pass code you are willing to run.
    exec(code, globals(), local_vars)
    agent_result = local_vars.get('result', None)

    if isinstance(desired_result, pd.DataFrame):
        # Compare against the value the code produced, not against its source.
        is_correct = (
            isinstance(agent_result, pd.DataFrame)
            and desired_result.equals(agent_result)
        )
    elif isinstance(agent_result, pd.DataFrame):
        # A frame came back where a non-frame was expected; `==` would yield a
        # frame whose truth value is ambiguous, so this is simply a mismatch.
        is_correct = False
    else:
        is_correct = bool(agent_result == desired_result)

    if is_correct:
        print("Agent output was correct.")
    else:
        print("Agent output was not correct.")


# The name starts with "test_", so keep pytest from collecting this helper
# as a test case of its own.
test_my_function.__test__ = False

# Run the check on the sample above: the executed snippet prints its own result,
# then the verdict is printed.
test_my_function(code, desired_result)

## Testing this with sample pandas function ".from_dummies()"

In [None]:
import pandas as pd

In [None]:
# Dummy-encoded sample frame used to build the ground truth.
df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})

# Ground truth for this test, computed with pd.from_dummies.
desired_result = pd.from_dummies(df, sep="_")

# Prompt for the agent; it is only defined here, not sent anywhere in this cell.
agent_input = """
How can I convert this dataframe: df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) into a categorical dataframe,
using only pandas. Please only respond with the code. Use only pandas and the standard libraries. Save the result dataframe in a variable called "result".
"""

# Hard-coded stand-in for an agent reply. It casts the dummy columns with
# astype("category") instead of calling from_dummies.
agent_output = """
import pandas as pd

df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})

result = df.astype("category")
"""


# Compare the stand-in reply against the from_dummies ground truth.
test_my_function(agent_output, desired_result)

## Testing with agent

In [None]:
from agent import AIAgent
from clients import OpenAIClient
import json

# import dotenv
# import os
from settings import settings

# dotenv.load_dotenv()
# service_key = eval(os.getenv('SERVICE_KEY'))

# Build the LLM client from the project settings and wrap it in an agent.
client = OpenAIClient(settings.SERVICE_KEY, settings.LLM_CONFIG)
agent = AIAgent(client)

# Prompt that is sent to the agent
user_prompt = """
How can I convert this dataframe: df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) into a categorical dataframe?
"""

# The following cells execute this answer as Python source, so it is expected
# to be code rather than free text.
final_answer = agent.run(user_prompt)

In [None]:
final_answer  # display the agent's answer; the next cell executes it and looks up `response_function` in it

In [None]:
# Create a namespace for the execution
namespace = {}

# NOTE(review): this runs whatever source the agent returned, unsandboxed;
# only execute it in an environment where that is acceptable.
exec(final_answer, namespace)

# The agent's code must define a function named `response_function`;
# a KeyError here means it did not.
response_function = namespace['response_function']

# Call the agent's function on the dummy-encoded sample frame and show what it returns.
sample_df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})
result_df = response_function(sample_df)

print(result_df)

## Testing the .from_dummies() function with test framework

In [None]:
import pandas as pd
from test_cases import TEST_CASES
from agent import AIAgent
from clients import GPTClient
from settings import settings

import numpy as np  # needed in this cell for the Series comparison below

# Build the LLM client from the project settings and wrap it in an agent.
client = GPTClient(
    client_id=settings.CLIENT_ID,
    client_secret=settings.CLIENT_SECRET,
    auth_url=settings.AUTH_URL,
    api_base=settings.API_BASE,
    llm_deployment_id="gpt-4-32k",
    llm_max_response_tokens=1000,
    llm_temperature=0.0,
)
agent = AIAgent(client)

# NOTE(review): this iterates over TEST_CASES[0]. If TEST_CASES is a flat list
# of test-case dicts, this loops over the keys of the first one and the
# subscripts below fail - confirm the intended shape of TEST_CASES.
for test_case in TEST_CASES[0]:
    # get response function from agent
    # NOTE(review): exec runs the LLM-generated source unsandboxed.
    final_answer = agent.run(test_case['user_prompt'])
    namespace_agent = {}
    exec(final_answer, namespace_agent)
    response_function = namespace_agent['response_function']

    # build the input data by executing the test case's data snippet
    data_string = test_case['data']
    local_vars = {}
    exec(data_string, globals(), local_vars)
    if len(local_vars) == 1:
        # A single input is used whatever the snippet called it, so a
        # snippet that binds e.g. `data_1` no longer silently yields None.
        (data,) = local_vars.values()
        inputs = (data,)
    else:  # the maximum input of variables we have in the test cases is 2
        data_1 = local_vars.get('data_1', None)
        data_2 = local_vars.get('data_2', None)
        inputs = (data_1, data_2)

    # retrieve the correct function
    correct_function_string = test_case['correct_function']
    namespace_correct = {}
    exec(correct_function_string, namespace_correct)
    correct_function = namespace_correct['correct_function']

    # run the reference and the agent implementation on the same inputs
    desired_result = correct_function(*inputs)
    agent_result = response_function(*inputs)

    # this has to be extended, each time we expect another data type as the desired output
    if isinstance(desired_result, pd.DataFrame):
        is_correct = desired_result.equals(agent_result)
    elif isinstance(desired_result, pd.Series):
        # Compare values only; an answer without `.values` counts as a
        # mismatch instead of raising AttributeError.
        agent_values = getattr(agent_result, 'values', None)
        is_correct = agent_values is not None and np.array_equal(
            desired_result.values, agent_values
        )
    else:
        outcome = agent_result == desired_result
        # An array-like outcome means a container came back where a plain
        # value was expected; treat it as a mismatch rather than raising on
        # an ambiguous truth value.
        is_correct = isinstance(outcome, (bool, np.bool_)) and bool(outcome)

    verdict = "correct" if is_correct else "not correct"
    print(f"Agent output was {verdict} for test case {test_case['id']}.")

# Experimentation

In [None]:
# Single hand-written test case for the experimentation cell below.
# The "data" snippet must bind its input to the name `data`, because the
# single-input path of the consumer looks that name up; it previously bound
# `data_1`, so the lookup returned None.
test_case = {
    "id": 0,
    "user_prompt": """How can I convert this dataframe: df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]}) into a categorical dataframe? """,  # prompt that we send the agent
    "data": """data = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], "col2_c": [0, 0, 1]})""",  # the data needed should always be named 'data'
    "correct_function": """import pandas as pd\ndef correct_function(data):\n    data=pd.from_dummies(data, sep="_")\n    return data""",  # this is a response function that takes the parameter 'data' and does the correct thing with it
}

In [None]:
import pandas as pd
import numpy as np
from agent import AIAgent
from clients import GPTClient
from settings import settings

# Build the LLM client from the project settings and wrap it in an agent.
client = GPTClient(
    client_id=settings.CLIENT_ID,
    client_secret=settings.CLIENT_SECRET,
    auth_url=settings.AUTH_URL,
    api_base=settings.API_BASE,
    llm_deployment_id="gpt-4-32k",
    llm_max_response_tokens=1000,
    llm_temperature=0.0,
)
agent = AIAgent(client)

# get response function from agent
# NOTE(review): exec runs the LLM-generated source unsandboxed.
final_answer = agent.run(test_case['user_prompt'])
namespace_agent = {}
exec(final_answer, namespace_agent)
response_function = namespace_agent['response_function']

# build the input data by executing the test case's data snippet
data_string = test_case['data']
local_vars = {}
exec(data_string, globals(), local_vars)
if len(local_vars) == 1:
    # A single input is used whatever the snippet called it, so a snippet
    # that binds e.g. `data_1` no longer silently yields None.
    (data,) = local_vars.values()
    inputs = (data,)
else:  # the maximum input of variables we have in the test cases is 2
    data_1 = local_vars.get('data_1', None)
    data_2 = local_vars.get('data_2', None)
    inputs = (data_1, data_2)

# retrieve the correct function
correct_function_string = test_case['correct_function']
namespace_correct = {}
exec(correct_function_string, namespace_correct)
correct_function = namespace_correct['correct_function']

# run the reference and the agent implementation on the same inputs
desired_result = correct_function(*inputs)
agent_result = response_function(*inputs)

# this has to be extended, each time we expect another data type as the desired output
if isinstance(desired_result, pd.DataFrame):
    is_correct = desired_result.equals(agent_result)
elif isinstance(desired_result, pd.Series):
    # Compare values only; an answer without `.values` counts as a mismatch
    # instead of raising AttributeError.
    agent_values = getattr(agent_result, 'values', None)
    is_correct = agent_values is not None and np.array_equal(
        desired_result.values, agent_values
    )
else:
    outcome = agent_result == desired_result
    # An array-like outcome means a container came back where a plain value
    # was expected; treat it as a mismatch rather than raising on an
    # ambiguous truth value.
    is_correct = isinstance(outcome, (bool, np.bool_)) and bool(outcome)

verdict = "correct" if is_correct else "not correct"
print(f"Agent output was {verdict} for test case {test_case['id']}.")