In [None]:
import sys
import os 
import nest_asyncio

# Sanity check: show which Python interpreter this kernel is running on.
print(sys.executable)
# Patch asyncio via nest_asyncio — presumably so async llama_index calls can run
# inside the notebook's already-running event loop (confirm against nest_asyncio docs).
nest_asyncio.apply()

# Use the placeholder only when the environment does not already provide a
# key, so a real key exported in the shell is never overwritten by "sk-".
# If you paste a key here for local use, do not commit it with the notebook.
os.environ.setdefault("OPENAI_API_KEY", "sk-")
# Optional Azure OpenAI settings — uncomment and fill in to use them:
#os.environ[
#    "AZURE_OPENAI_ENDPOINT"
#] = "https://<your-resource-name>.openai.azure.com/"
#os.environ["OPENAI_API_VERSION"] = "2023-07-01-preview"

In [None]:
import os
from pydantic import BaseModel, Field
from llama_index.core.workflow import (
    Workflow,
    step,
    Event,
    Context,
    StartEvent,
    StopEvent
)
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer



## 1 - Evaluation of correctness 

Use an evaluator to score a generated response against a reference answer.


In [None]:


# Question that both the reference answer and the candidate response address.
query = (
    "Can you explain the theory of relativity proposed by Albert Einstein in"
    " detail?"
)

# Reference ("gold") answer: attributes spacetime curvature to mass and energy.
reference = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

General relativity, published in 1915, extended these ideas to include the effects of gravity. 
According to general relativity, gravity is not a force between masses, as described by Newton's theory of gravity, 
but rather the result of the warping of space and time by mass and energy. 
Massive objects, such as planets and stars, cause a curvature in spacetime, and smaller objects follow curved paths in 
response to this curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet, 
causing it to create a depression that other objects (representing smaller masses) naturally move towards.

In essence, general relativity provided a new understanding of gravity, explaining phenomena like the bending 
of light by gravity (gravitational lensing) and the precession of the orbit of Mercury. It has been confirmed 
through numerous experiments and observations and has become a fundamental theory in modern physics.
"""

# Candidate answer to be graded. Its first paragraph matches the reference,
# but its second paragraph attributes general relativity to magnetism instead
# of gravity, so it disagrees with the reference on the facts.
# The word "space" is kept on one line; it was previously broken across a
# line break ("s" / "pace"), which put a stray newline inside the word.
response = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. 
Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating 
observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. 
It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

However, general relativity, published in 1915, extended these ideas to include the effects of magnetism. 
According to general relativity, gravity is not a force between masses but rather the result of the warping of space
and time by magnetic fields generated by massive objects. Massive objects, such as planets and stars, 
create magnetic fields that cause a curvature in spacetime, and smaller objects follow curved paths in response 
to this magnetic curvature. This concept is often illustrated using the analogy of a heavy ball placed on a 
rubber sheet with magnets underneath, causing it to create a depression that other objects (representing smaller masses) 
naturally move towards due to magnetic attraction.
"""


In [None]:
from llama_index.core.evaluation import CorrectnessEvaluator
from llama_index.llms.openai import OpenAI

# Judge model; the same `llm` object is also passed to the other evaluators
# created later in this notebook.
llm = OpenAI(model="gpt-4o-mini")
evaluator = CorrectnessEvaluator(llm=llm)

# Grade the candidate response against the reference answer for the query.
result = evaluator.evaluate(query=query, response=response, reference=reference)

In [None]:
# Score on the first line, the evaluator's written feedback on the next.
print(result.score, result.feedback, sep="\n")

## 2 - RAG faithfulness 

Verify that a RAG answer synthesized with `tree_summarize` is faithful to the retrieved context.



In [None]:
import os
from pydantic import BaseModel, Field
from llama_index.core.workflow import (
    Workflow,
    step,
    Event,
    Context,
    StartEvent,
    StopEvent
)
from llama_index.llms.openai import OpenAI
from llama_index.core import SimpleDirectoryReader
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.evaluation import FaithfulnessEvaluator


# Read the essay from disk and split it into nodes (chunk_size=256).
documents = SimpleDirectoryReader(input_files=["./paul_graham_essay.txt"]).load_data()
splitter = SentenceSplitter(chunk_size=256)
nodes = splitter.get_nodes_from_documents(documents)

# BM25 retriever over those nodes: top 5 matches per query, English stemmer.
retriever_top_5 = BM25Retriever.from_defaults(
    nodes=nodes, similarity_top_k=5, stemmer=Stemmer.Stemmer("english"), language="english"
)

# NOTE: despite its name, this evaluator uses the shared `llm` object defined
# earlier in the notebook (created there as "gpt-4o-mini").
evaluator_gpt4 = FaithfulnessEvaluator(llm=llm)

In [None]:
# Query engine = BM25 retriever + a "tree_summarize" response synthesizer.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")
query_engine = RetrieverQueryEngine(
    retriever=retriever_top_5, response_synthesizer=response_synthesizer
)

# Ask a question, then run the faithfulness evaluator on the answer.
response = query_engine.query("Why did microcomputer emerge ?")
eval_result = evaluator_gpt4.evaluate_response(response=response)

# Bare last expression so the notebook renders the evaluation result.
eval_result

In [None]:
# First line: evaluator feedback and score; second line: the generated answer.
print(
    f"{eval_result.feedback!s} {eval_result.score!s}",
    response.response,
    sep="\n",
)

In [None]:
# Second query through the same engine, followed by the same faithfulness check.
response = query_engine.query("How to make a sherpherd pie ?")
eval_result = evaluator_gpt4.evaluate_response(response=response)

# First line: evaluator feedback and score; second line: the generated answer.
print(
    f"{eval_result.feedback!s} {eval_result.score!s}",
    response.response,
    sep="\n",
)

## 3 - Guideline following

Check whether a response follows a set of guidelines, using one evaluator per guideline.


In [None]:
from llama_index.core.evaluation import GuidelineEvaluator


# Plain-language rules; each one is checked by its own evaluator.
GUIDELINES = [
    "The response should fully answer the query.",
    "The response should avoid being vague or ambiguous.",
    "The response should be specific and use statistics or numbers when possible.",
]

# One GuidelineEvaluator per rule, in the same order as GUIDELINES.
evaluators = [GuidelineEvaluator(llm=llm, guidelines=rule) for rule in GUIDELINES]

In [None]:


# Hand-written example: a query, three context passages, and a response,
# used as input for the guideline evaluators.
sample_data = dict(
    query="Tell me about global warming.",
    contexts=[
        "Global warming refers to the long-term increase in Earth's average "
        "surface temperature due to human activities such as the burning of "
        "fossil fuels and deforestation.",
        "It is a major environmental issue with consequences such as rising sea "
        "levels, extreme weather events, and disruptions to ecosystems.",
        "Efforts to combat global warming include reducing carbon emissions, "
        "transitioning to renewable energy sources, and promoting sustainable "
        "practices.",
    ],
    response=(
        "Global warming is a critical environmental issue caused by human "
        "activities that lead to a rise in Earth's temperature. It has various "
        "adverse effects on the planet."
    ),
)


In [None]:


# Grade the hand-written sample response against each guideline in turn and
# print the verdict for each one.
for guideline, evaluator in zip(GUIDELINES, evaluators):
    eval_result = evaluator.evaluate(
        query=sample_data["query"],
        contexts=sample_data["contexts"],
        response=sample_data["response"],
    )
    print(
        "=====",
        f"Guideline: {guideline}",
        f"Pass: {eval_result.passing}",
        f"Feedback: {eval_result.feedback}",
        sep="\n",
    )


In [None]:
# Generate an answer with the LLM for the same query that the evaluators are
# given below, so the text being graded actually answers the graded query.
# (Previously the prompt was a different string from sample_data["query"].)
response = llm.complete(sample_data["query"])

# Grade the LLM-generated text against each guideline and print the verdict.
for guideline, evaluator in zip(GUIDELINES, evaluators):
    eval_result = evaluator.evaluate(
        query=sample_data["query"],
        contexts=sample_data["contexts"],
        response=response.text,
    )
    print("=====")
    print(f"Guideline: {guideline}")
    print(f"Pass: {eval_result.passing}")
    print(f"Feedback: {eval_result.feedback}")


In [None]:
# Show the text of the LLM completion stored in `response`.
print(response.text)