Installing Environment Dependencies

In [None]:
!pip install openai
!pip install llama-index
!pip install llama-index-llms-openai

Refining the Concept of an Agent

In [None]:
from llama_index.core import Settings, ServiceContext
from llama_index.llms.openai import OpenAI
from llama_index.core.agent import ReActAgent

# Set the global llama-index `Settings.llm` to a deterministic (temperature=0)
# gpt-3.5-turbo model.
# NOTE(review): no api_key is passed here, so the OpenAI client presumably
# picks up credentials from the environment — confirm.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# System prompt text ("Multiple-CoT"): asks the model for a first pass of
# titled reasoning steps followed by a second pass in which every step is
# verified and reviewed by a `reviewer CoT`.
prompt_template = '''
# Multiple-CoT

## Role

You are an expert AI assistant capable of gradually explaining the reasoning process.

## First Think step


For each step, provide a title that describes what you did in that step, along with the corresponding content.
Decide whether another step is needed or if you are ready to give the final answer.
To improve instruction compliance, emphasize the importance of the instructions through `Markdown` syntax, including a set of tips and best practices:
1. Use as many **reasoning steps** as possible. At least 3 steps.
2. Be aware of your limitations as an AI and what you can and cannot do.
3. Include exploration of alternative answers. Consider that you might be wrong and where the error might be if your reasoning is incorrect.
4. When you say you are rechecking, actually recheck and use another method. Don't just say you are rechecking.
5. Use at least 3 methods to arrive at the answer.
6. Use best practices.

## Second Think step


For each step mentioned in the previous text, initiate a small sub-step within each step to verify its correctness. After completing each step, start a `reviewer CoT` to review the current step from different perspectives.
1. Use as many **reasoning steps** as possible. At least three steps.
2. Be aware of your limitations as an AI and what you can and cannot do.
3. Include exploring alternative answers. Consider that you might be wrong and where the error might be if your reasoning is incorrect.'''

# ReActAgent is already imported at the top of this cell; this repeated import
# is redundant but harmless.
from llama_index.core.agent import ReActAgent

# Build a ReAct agent with an empty tool list and verbose output.
# No `llm` argument is passed here.
# NOTE(review): presumably the agent falls back to `Settings.llm` — confirm.
# NOTE(review): `system_prompt` is passed as a plain keyword argument; confirm
# that this llama-index version's `ReActAgent.from_tools` actually applies it
# rather than ignoring it (the ReAct agent may expect `context=` instead).
agent = ReActAgent.from_tools(
    tools=[],
    verbose=True,
    system_prompt=prompt_template
)


GSM-8K Evaluation

In [None]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
import json
import pandas as pd
from tqdm import tqdm
import re
import os

# First system prompt under evaluation ("Multiple-CoT"): asks the model for a
# first pass of titled reasoning steps followed by a second pass in which every
# step is verified and reviewed by a `reviewer CoT`.
PROMPT_1 = """
# Multiple-CoT

## Role

You are an expert AI assistant capable of gradually explaining the reasoning process.

## First Think step


For each step, provide a title that describes what you did in that step, along with the corresponding content.
Decide whether another step is needed or if you are ready to give the final answer.
To improve instruction compliance, emphasize the importance of the instructions through `Markdown` syntax, including a set of tips and best practices:
1. Use as many **reasoning steps** as possible. At least 3 steps.
2. Be aware of your limitations as an AI and what you can and cannot do.
3. Include exploration of alternative answers. Consider that you might be wrong and where the error might be if your reasoning is incorrect.
4. When you say you are rechecking, actually recheck and use another method. Don't just say you are rechecking.
5. Use at least 3 methods to arrive at the answer.
6. Use best practices.

## Second Think step


For each step mentioned in the previous text, initiate a small sub-step within each step to verify its correctness. After completing each step, start a `reviewer CoT` to review the current step from different perspectives.
1. Use as many **reasoning steps** as possible. At least three steps.
2. Be aware of your limitations as an AI and what you can and cannot do.
3. Include exploring alternative answers. Consider that you might be wrong and where the error might be if your reasoning is incorrect.
"""

# Second system prompt under evaluation; main() scores it on the same questions
# as PROMPT_1.
# NOTE(review): presumably a minimal baseline with no reasoning instructions — confirm.
PROMPT_2 = """Hi there"""

def create_answer_extraction_agent(llm):
    """Build a ReAct agent equipped with a numeric answer-extraction tool.

    Args:
        llm: Language model handed to ``ReActAgent.from_tools``.

    Returns:
        The agent created by ``ReActAgent.from_tools`` with a single tool
        named ``extract_final_number``.
    """
    # A number is either comma-grouped thousands ("1,234", "-1,234.5") or a
    # plain integer/decimal ("42", "3.14", ".5"), optionally negative. The
    # grouped form is tried first so "1,234" is not split into "1" and "234".
    number = r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d*\.?\d+)"
    marked_answer = re.compile(r"####\s*(" + number + r")")
    any_number = re.compile(number)

    def extract_final_number(text: str) -> str:
        """Return the final number in ``text`` as a string, or "" if none.

        A number following a ``####`` marker takes precedence; otherwise the
        last number found anywhere in the text is used. Thousands separators
        are removed from the result.
        """
        match = marked_answer.search(text)
        if match:
            return match.group(1).replace(",", "")

        numbers = any_number.findall(text)
        if numbers:
            return numbers[-1].replace(",", "")

        return ""

    extract_tool = FunctionTool.from_defaults(
        fn=extract_final_number,
        name="extract_final_number",
        description="Extract the final numerical answer from the text. Only extract the numbers and do not output any additional explanatory text. Output only Arabic numerals."
    )

    agent = ReActAgent.from_tools(
        tools=[extract_tool],
        llm=llm,
        verbose=True
    )

    return agent

def load_gsm8k_dataset(path="test.jsonl", num_samples=50):
    """Load up to ``num_samples`` question/answer pairs from a GSM8K JSONL file.

    Every non-blank line must be a JSON object with ``question`` and
    ``answer`` keys. The reference answer is the text after the last ``####``
    marker in ``answer``, with surrounding whitespace and thousands
    separators removed.

    Args:
        path: Path to the JSONL file.
        num_samples: Maximum number of records to return; values <= 0 yield
            empty lists.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of strings.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``question`` or ``answer``.
    """
    questions = []
    answers = []
    # Explicit encoding so reading does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Check the limit before reading so num_samples is a true cap.
            if len(questions) >= num_samples:
                break
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            questions.append(data['question'])
            answer = data['answer'].split('####')[-1].strip().replace(',', '')
            answers.append(answer)
    # Return everything collected; the previous hard-coded [:5] slice made
    # num_samples ineffective above 5.
    return questions, answers

def evaluate_prompt(system_prompt, questions, answers):
    """Score one system prompt on a list of questions with known answers.

    For each question the LLM is asked for a completion, an extraction agent
    reduces that completion to a final number, and the number is compared to
    the reference answer by exact string equality after stripping whitespace.

    Relies on module-level ``API_KEY`` and ``API_BASE``, which must be defined
    before this function is called.

    Args:
        system_prompt: Prompt text passed to the ``OpenAI`` constructor.
        questions: Sequence of question strings.
        answers: Sequence of reference answers, parallel to ``questions``.

    Returns:
        A tuple ``(accuracy, responses, extracted_answers)``. ``accuracy`` is
        the fraction of correct answers (0.0 when ``questions`` is empty).
        The two lists hold exactly one entry per processed question; a step
        that failed is recorded as the string "Error".
    """
    # NOTE(review): confirm that `llm.complete()` actually applies
    # `system_prompt`; if it does not, both prompts are evaluated identically.
    llm = OpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=API_KEY,
        api_base=API_BASE,
        system_prompt=system_prompt
    )

    # NOTE(review): one agent instance is reused for every question; if it
    # keeps chat history, earlier texts may leak into later extractions — confirm.
    agent = create_answer_extraction_agent(llm)

    correct = 0
    responses = []
    extracted_answers = []

    for q, a in tqdm(zip(questions, answers), total=len(questions)):
        # Defaults are recorded when a step fails, so that both result lists
        # always grow by exactly one entry per question and stay aligned.
        response_text = "Error"
        extracted_answer = "Error"
        try:
            response_text = llm.complete(q).text

            agent_response = agent.chat(
                f"Extract the final numerical answer from the text. Only extract the numbers and do not output any additional explanatory text. Output only Arabic numerals.\n{response_text}"
            )
            extracted_answer = agent_response.response.strip()
        except Exception as e:
            # Best-effort evaluation: report the failure and keep going.
            print(f"Error processing question: {e}")
            extracted_answer = "Error"

        responses.append(response_text)
        extracted_answers.append(extracted_answer)

        if str(extracted_answer).strip() == str(a).strip():
            correct += 1

    # Avoid ZeroDivisionError when there is nothing to evaluate.
    accuracy = correct / len(questions) if len(questions) else 0.0
    return accuracy, responses, extracted_answers

def main():
    """Evaluate PROMPT_1 and PROMPT_2 on GSM8K samples and report the results.

    Loads the dataset, runs ``evaluate_prompt`` once per prompt, prints both
    accuracies, writes a per-question comparison to
    ``prompt_comparison_results.csv`` and prints the correct/total counts.
    """
    questions, answers = load_gsm8k_dataset(num_samples=50)

    print("Testing Prompt 1...")
    first_run = evaluate_prompt(PROMPT_1, questions, answers)

    print("Testing Prompt 2...")
    second_run = evaluate_prompt(PROMPT_2, questions, answers)

    accuracy1, responses1, extracted1 = first_run
    accuracy2, responses2, extracted2 = second_run

    print("\nResults:")
    print(f"Prompt 1 Accuracy: {accuracy1:.2%}")
    print(f"Prompt 2 Accuracy: {accuracy2:.2%}")

    # Per-question correctness flags, by exact match against the reference.
    hits1 = [got == want for got, want in zip(extracted1, answers)]
    hits2 = [got == want for got, want in zip(extracted2, answers)]

    # Insertion order of this mapping fixes the CSV column order.
    columns = {}
    columns['Question'] = questions
    columns['Correct Answer'] = answers
    columns['Prompt 1 Response'] = responses1
    columns['Prompt 1 Extracted'] = extracted1
    columns['Prompt 2 Response'] = responses2
    columns['Prompt 2 Extracted'] = extracted2
    columns['Prompt 1 Correct'] = hits1
    columns['Prompt 2 Correct'] = hits2

    results_df = pd.DataFrame(columns)
    results_df.to_csv('prompt_comparison_results.csv', index=False)

    print("\nDetailed Statistics:")
    print("Prompt 1:")
    print(f"Total Correct: {sum(results_df['Prompt 1 Correct'])}")
    print(f"Total Questions: {len(results_df)}")
    print("\nPrompt 2:")
    print(f"Total Correct: {sum(results_df['Prompt 2 Correct'])}")
    print(f"Total Questions: {len(results_df)}")

# Run the prompt comparison only when this code is executed directly
# (`__name__` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()
