In [20]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [21]:
# price is a factor for our company, so we're going to use a low cost model

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [22]:
# Load environment variables in a file called .env

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [23]:
# Imports main tools:
from trulens.apps.langchain import TruChain
from trulens.core import TruSession

session = TruSession()
session.reset_database()




pdating app_json in apps table: 0it [00:00, ?it/s][00:00, ?it/s]

In [24]:
# Imports from LangChain to build app
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [25]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

In [26]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

embeddings = OpenAIEmbeddings()


text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(documents, embeddings)

In [27]:
from langchain_community.llms.ollama import Ollama
retriever = vectorstore.as_retriever()

prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [28]:
rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a technique that breaks down complex tasks into smaller and simpler steps to enhance model performance. It involves transforming big tasks into manageable tasks by decomposing them into multiple steps. Task decomposition can be done using simple prompting, task-specific instructions, or with human inputs.'

In [29]:
import numpy as np
from trulens.core import Feedback
from trulens.providers.openai import OpenAI

# Initialize provider class
provider = OpenAI()

# select context to be used in feedback. the location of context is app specific.
context = TruChain.select_context(rag_chain)

# Define a groundedness feedback function
f_groundedness = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(context.collect())  # collect context chunks into a list
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = Feedback(
    provider.relevance_with_cot_reasons, name="Answer Relevance"
).on_input_output()
# Context relevance between question and each context chunk.
f_context_relevance = (
    Feedback(
        provider.context_relevance_with_cot_reasons, name="Context Relevance"
    )
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

✅ In Groundedness, input source will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content .


In [19]:
tru_recorder = TruChain(
    rag_chain,
    app_name="ChatApplication",
    app_version="Chain1",
    feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness],
)

In [20]:
with tru_recorder as recording:
    llm_response = rag_chain.invoke("What is Task Decomposition?")

display(llm_response)

"The text appears to be a collection of research papers and concepts related to Large Language Models (LLMs) and their applications in various domains. Here's a summary of the main points:\n\n**Tool-Augmented LLMs**\n\n* Tool-augmented LLMs use external tools and APIs to perform specific tasks, such as API calls, search engine queries, or data management.\n* The workflow involves three levels:\n\t1. Decision-making: determining whether an API call is needed and selecting the right API.\n\t2. Task execution: executing the task using the selected API and logging results.\n\t3. Response generation: summarizing the results for the user.\n\n**Challenges**\n\n* Efficiency improvement is needed to reduce inference times and interactions with other models.\n* Stability improvements are required for LLM outputs and external model services.\n* The context window is limited, which affects in-context learning and long-term memory storage.\n\n**API-Bank Benchmark**\n\n* A benchmark for evaluating t



In [21]:
session.get_leaderboard()

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ChatApplication,Chain1,1.0,0.416667,1.0,118.455861,0.002489


In [23]:
from trulens.dashboard.display import get_feedback_result

last_record = recording.records[-1]
get_feedback_result(last_record, "Context Relevance")

Unnamed: 0,question,context,score,reason
0,What is Task Decomposition?,Fig. 1. Overview of a LLM-powered autonomous a...,1.0,Criteria: The context must be relevant and hel...
1,What is Task Decomposition?,Fig. 10. A picture of a sea otter using rock t...,0.333333,Criteria: The context must be relevant and hel...
2,What is Task Decomposition?,(3) Task execution: Expert models execute on t...,0.333333,Criteria: The context must be relevant and hel...
3,What is Task Decomposition?,Fig. 6. Illustration of how Algorithm Distilla...,0.0,Criteria: The context provided does not addres...


In [1]:
!pip install ragas

Collecting ragas
  Downloading ragas-0.2.7-py3-none-any.whl.metadata (8.1 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Downloading ragas-0.2.7-py3-none-any.whl (163 kB)
Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: appdirs, pysbd, ragas
Successfully installed appdirs-1.4.4 pysbd-0.3.4 ragas-0.2.7


In [5]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import NonLLMContextRecall

sample = SingleTurnSample(
    retrieved_contexts=["Paris is the capital of France."], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

context_recall = NonLLMContextRecall()
await context_recall.single_turn_ascore(sample)

0.5

In [4]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1
