In [3]:
import os
import json
import boto3
os.chdir("../")


In [6]:
bedrock_client = boto3.client("bedrock-runtime")


In [7]:
message = [
    {
        "role": "system",
        "content": "You are a helpful assistant trained to assist"
    },
    {
        "role": "user",
        "content": "Explain the process of backpropagation in nerual networks."
    }
]

In [8]:
from langchain_aws import ChatBedrock
from botocore.config import Config

retry_config = Config(
    retries = {
        'max_attempts': 20,  # Customize the number of retry attempts
        'mode': 'adaptive'   # Or 'adaptive' for dynamically adjusting retries
    }
)

# Initialize the Bedrock client with retry configuration
bedrock_client = boto3.client('bedrock-runtime', config=retry_config)

# Use the ChatBedrock model in LangChain with the client that has retries
llm = ChatBedrock(
    client=bedrock_client,
    model_id="meta.llama3-1-8b-instruct-v1:0",  # Or "meta.llama3-1-70b-instruct-v1:0"
    temperature=0.4,
    max_tokens=None)

messages = [
    ("system", "You are a bot and you should reply to the user based on the function calling return. The return is {'weather': 'Sunday', 'location': 'USA'}"),
    ("user", "what is the weather in USA now?")
]

response = llm.invoke(messages)
response

AIMessage(content='\n\nAccording to the information I have, the weather in USA is currently Sunday.', additional_kwargs={}, response_metadata={'ResponseMetadata': {'RequestId': '59360e82-a97b-4fb2-938f-44fd800d1164', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 13 Oct 2024 00:19:55 GMT', 'content-type': 'application/json', 'content-length': '261', 'connection': 'keep-alive', 'x-amzn-requestid': '59360e82-a97b-4fb2-938f-44fd800d1164'}, 'RetryAttempts': 0}, 'stopReason': 'end_turn', 'metrics': {'latencyMs': 372}}, id='run-007158fd-1f1d-464b-b3d4-60348420d2ec-0', usage_metadata={'input_tokens': 55, 'output_tokens': 17, 'total_tokens': 72})

In [9]:
example_text = """[{
  "query": "How many times does the letter 'a' appear in the word 'banana'?",
  "answer": {
    "function": "count_letter",
    "arguments": {
      "word": "banana",
      "letter": "a"
    },
    "result": {
      "result": "3"
    }
  }
},
{
  "query": "How many times does the letter 'e' appear in the word 'computer'?",
  "answer": {
    "function": "count_letter",
    "arguments": {
      "word": "computer",
      "letter": "e"
    },
    "result": {
      "result": "2"
    }
  }
}]"""

In [10]:
from src.kevin_function import count_letter

## read test file
with open ("ks_instruct/test.txt", "r") as file:
    text = file.read() # it cannot be read currently

example = json.loads(example_text)




def call_function(info: dict):
    query = info["query"]
    answer = info["answer"]
    func_to_call = eval(answer["function"])
    arguments = answer["arguments"]
    return {"query": query, "output": func_to_call(**arguments)}

def generate_response(query_output, llm):
    messages = [
    ("system", f"You are a bot and you should reply to the user based on the function calling return. The return is {query_output['output']}"),
    ("user", query_output["query"])]
    response = llm.invoke(messages)
    query_output["response"] = response.content
    return query_output

query_output = call_function(example[0])
query_response = generate_response(query_output, llm)


In [11]:

query_response
example[0]

{'query': "How many times does the letter 'a' appear in the word 'banana'?",
 'answer': {'function': 'count_letter',
  'arguments': {'word': 'banana', 'letter': 'a'},
  'result': {'result': '3'}}}

In [12]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.models.base_model import DeepEvalBaseLLM

class AWSBedrock(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        print(prompt)
        chat_model = self.load_model()
        res = chat_model.invoke(prompt).content
        return res

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "llama-3-1-8b"

aws_bedrock = AWSBedrock(model=llm)

# Replace this with the actual output from your LLM application

metric = AnswerRelevancyMetric(
    threshold=0.3,
    model=aws_bedrock,
    include_reason=True,
    async_mode=False
)


test_case = LLMTestCase(
    input=query_response["query"],
    actual_output= query_response["response"]
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
# evaluate([test_case], [metric])

1.0
The score is 1.00 because the actual output directly answers the question about the frequency of the letter 'a' in the word 'banana', making it highly relevant and accurate.


In [13]:
# reverse prompt engineering
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_aws.embeddings import BedrockEmbeddings
from langchain_community.utils.math import cosine_similarity
import numpy as np



class Query(BaseModel):
    """reverse prompt engineering output"""
    query: List[str] = Field(description="list of queries according to the user repsonse")

parser = JsonOutputParser(pydantic_object=Query)

def get_reverse_prompt(n, query_response, llm):
    prompt = PromptTemplate(
        template="Guess " + str(n) + "most possible queries based on a user's response.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    query_response["response"]
    prompt_and_model = prompt | llm
    output = prompt_and_model.invoke({"query": query_response["response"]})

    return parser.invoke(output)

def calculate_mean_similiarity(query_response, reverse_queries):
    model_id = "amazon.titan-embed-text-v2:0"
    # def get_cos_sim(query_response, reverse_queries):
    embedding = BedrockEmbeddings(model_id=model_id)
    original_embedded = embedding.embed_query(query_response["query"])
    guess_embeded = embedding.embed_documents(reverse_queries["query"])
    return np.array([cosine_similarity([original_embedded], [item])[0] for item in guess_embeded]).mean()

reverse_queries = get_reverse_prompt(3, query_response, llm)
reverse_queries



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


{'query': ["How many times does the letter 'a' appear in the word 'banana'?",
  "What is the frequency of the letter 'a' in the word 'banana'?",
  "How often does the letter 'a' appear in the word 'banana'?"]}

In [14]:
query_response

{'query': "How many times does the letter 'a' appear in the word 'banana'?",
 'output': '{"result": 3}',
 'response': "The letter 'a' appears 3 times in the word 'banana'."}

In [15]:
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# {'query': "How many times does the letter 'a' appear in the word 'banana'?",
#  'answer': {'function': 'count_letter',
#   'arguments': {'word': 'banana', 'letter': 'a'},
#   'result': {'result': '3'}}}

# Replace this with the actual output from your LLM application
actual_output = "We should call a function named count_letter('pear', 'b'))"

# Replace this with the actual retrieved context from your RAG pipeline
retrieval_context = [count_letter.__doc__]

metric = FaithfulnessMetric(
    threshold=0.7,
    model=aws_bedrock,
    include_reason=True
)
test_case = LLMTestCase(
    input="Which function and argument I should call if I ask: How many times does the letter 'a' appear in the word 'banana'?",
    actual_output=actual_output,
    retrieval_context=retrieval_context
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
# evaluate([test_case], [metric])

0.5
The score is 0.50 because the actual output incorrectly claims the function count_letter is to be called, and also incorrectly identifies the first and second parameters as 'pear' and 'b' respectively, which are not mentioned in the retrieval context.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]

None


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.06s/test case]



Metrics Summary

  - ❌ Faithfulness (score: 0.6666666666666666, threshold: 0.7, strict: False, evaluation model: llama-3-1-8b, reason: The score is 0.67 because the actual output contains errors: the function to be called is incorrect and the first parameter is also incorrect, indicating a moderate level of faithfulness to the retrieval context., error: None)

For test case:

  - input: Which function and argument I should call if I ask: How many times does the letter 'a' appear in the word 'banana'?
  - actual output: We should call a function named count_letter('pear', 'b'))
  - expected output: None
  - context: None
  - retrieval context: ['\n    Counts the number of occurrences of a specified letter in a word.\n\n    Parameters:\n    word (str): The word to count letters in.\n    letter (str): The letter to count.\n\n    Example:\n    >>> count_letter(\'strawberry\', \'R\')\n    \'{"result": 3}\'\n    ']


Overall Metric Pass Rates

Faithfulness: 0.00% pass rate







[TestResult(success=False, metrics_data=[MetricData(name='Faithfulness', threshold=0.7, success=False, score=0.6666666666666666, reason='The score is 0.67 because the actual output contains errors: the function to be called is incorrect and the first parameter is also incorrect, indicating a moderate level of faithfulness to the retrieval context.', strict_mode=False, evaluation_model='llama-3-1-8b', error=None, evaluation_cost=None, verbose_logs='Truths (limit=None):\n[\n    "A function to count the number of occurrences of a specified letter in a word exists.",\n    "The function takes two parameters: \'word\' and \'letter\'.",\n    "The function returns a JSON object."\n] \n \nClaims:\n[\n    "We should call a function named count_letter(\'pear\', \'b\'))",\n    "A function named count_letter exists",\n    "The function count_letter takes two parameters",\n    "The first parameter of the function count_letter is \'pear\'",\n    "The second parameter of the function count_letter is \

In [14]:
count_letter.__doc__

'\n    Counts the number of occurrences of a specified letter in a word.\n\n    Parameters:\n    word (str): The word to count letters in.\n    letter (str): The letter to count.\n\n    Example:\n    >>> count_letter(\'strawberry\', \'R\')\n    \'{"result": 3}\'\n    '

In [15]:
from src.evaluation import call_function
import asyncio
with open("ks_instruct/test.txt", "r") as file:
    texts = json.load(file)
texts

# Invoke the call_function on each item
batch_func = [call_function(item) for item in texts]

# Adjust system prompt formatting
system_prompt = "You are a bot and you should reply to the user based on the function calling return. The return is {output}"

# Build batch message with correct formatting
batch_message = [
    [
        {"role": "system", "content": system_prompt.format(output=item["output"])}, 
        {"role": "user", "content": item["query"]}
    ]
    for item in batch_func
]

# Perform batch invoke using the correct message structure
responses = asyncio.run(llm.abatch(batch_message))
responses
# query_output["response"] = response.content

[AIMessage(content="The letter 'a' appears 3 times in the word 'banana'.", additional_kwargs={'usage': {'prompt_tokens': 59, 'completion_tokens': 16, 'total_tokens': 75}, 'stop_reason': 'stop', 'model_id': 'meta.llama3-1-8b-instruct-v1:0'}, response_metadata={'usage': {'prompt_tokens': 59, 'completion_tokens': 16, 'total_tokens': 75}, 'stop_reason': 'stop', 'model_id': 'meta.llama3-1-8b-instruct-v1:0'}, id='run-074ca9e3-8d8d-4ceb-80c6-3cc71304fb83-0', usage_metadata={'input_tokens': 59, 'output_tokens': 16, 'total_tokens': 75}),
 AIMessage(content="The letter 'e' appears once in the word 'computer'.", additional_kwargs={'usage': {'prompt_tokens': 59, 'completion_tokens': 14, 'total_tokens': 73}, 'stop_reason': 'stop', 'model_id': 'meta.llama3-1-8b-instruct-v1:0'}, response_metadata={'usage': {'prompt_tokens': 59, 'completion_tokens': 14, 'total_tokens': 73}, 'stop_reason': 'stop', 'model_id': 'meta.llama3-1-8b-instruct-v1:0'}, id='run-7cad2446-14ba-464d-9153-2b0e9de69985-0', usage_meta

In [25]:
from deepeval.dataset import EvaluationDataset
metric = AnswerRelevancyMetric(
    threshold=0.3,
    model=aws_bedrock,
    include_reason=False # can be changed to false later
    )

test_cases = []

for message, response in zip(batch_message, responses):

    input = message[1]['content']
    actual_output = response.content

    test_case = LLMTestCase(
        input=input,
        actual_output=actual_output)
    test_cases.append(test_case)

dataset = EvaluationDataset(test_cases=test_cases)
dataset.evaluate([metric])



Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 68 test case(s) in parallel: |          |  0% (0/68) [Time Taken: 00:00, ?test case/s]ERROR:root:Error raised by bedrock service: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
Evaluating 68 test case(s) in parallel: |          |  0% (0/68) [Time Taken: 00:09, ?test case/s]


ThrottlingException: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.

ERROR:root:Error raised by bedrock service: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
ERROR:root:Error raised by bedrock service: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
ERROR:root:Error raised by bedrock service: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
ERROR:root:Error raised by bedrock service: An error occurred (ThrottlingException) when calling the InvokeModel operation: Too many requests, please wait before trying again. You have sent too many requests.  Wait before trying again.
ERROR:root:Error raised by bedrock service: An error occ