In [1]:
from dotenv import load_dotenv
load_dotenv("/home/marshath/play/chainlink/algovate/.env")

True

In [2]:
import time
import pickle
import pandas as pd
import ipywidgets as widgets

with open('/home/marshath/play/chainlink/algovate/algovate/data/documents.pkl', 'rb') as f:
    lc_docs = pickle.load(f)

In [3]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [4]:
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser
from llama_index import GPTVectorStoreIndex, ResponseSynthesizer
from llama_index.data_structs.node import Node, DocumentRelationship

from langchain.chat_models import ChatOpenAI
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor
from llama_index import LLMPredictor, GPTVectorStoreIndex, PromptHelper, ServiceContext

from IPython.display import display, Markdown
from algovate.llama.evaluate import run_experiment, grade_model_answer, grade_model_retrieval

In [5]:
def display_row_as_markdown(df, row_index, columns_to_observe=["question","answer", "result"]):
    """
    Display a specific row of a DataFrame as Markdown, showing only the specified columns.

    Args:
        df (pd.DataFrame): The DataFrame to display.
        row_index (int): The index of the row to display.
        columns_to_observe (list): The list of columns to observe.

    Returns:
        None
    """
    row = df.iloc[row_index]

    markdown_row = '\n\n|'.join(str(row[column]) for column in columns_to_observe)
    display(Markdown(markdown_row))

In [6]:
# def get_retrieved_nodes(
#     query_str, vector_top_k=10, reranker_top_n=5, with_reranker=True
# ):
#   query_bundle = QueryBundle(query_str)
#   # configure retriever
#   retriever = VectorIndexRetriever(
#     index=index,
#     similarity_top_k=vector_top_k,
#   )
#   retrieved_nodes = retriever.retrieve(query_bundle)

#   if with_reranker:
#     # configure reranker
#     reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n, service_context=service_context)
#     retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
#   return retrieved_nodes

In [7]:
documents = []
for doc in lc_docs:
    documents.append(Document(text=doc.page_content, extra_info=doc.metadata))

parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)
len(documents), len(nodes)

(151, 774)

In [8]:
# get the ground truths

with open("/home/marshath/play/chainlink/algovate/algovate/data/ground_truths.pkl", "rb") as f:
    ground_truths = pickle.load(f)

In [9]:
df = pd.DataFrame(ground_truths)

##### Simple evaluation app

In [10]:
def evaluate_dataframe_answer(df):
    # Add right_or_wrong column if it doesn't exist
    if 'right_or_wrong' not in df.columns:
        df['right_or_wrong'] = ''

    # Widgets
    button_right = widgets.Button(description="Right")
    button_wrong = widgets.Button(description="Wrong")
    button_partial = widgets.Button(description="partial")
    out = widgets.Output()

    # Starting index
    index = [0]  # Use list to allow mutation inside functions

    def display_row(row):
        """
        Display the row data in markdown
        """
        with out:
            display(Markdown(f"**Question**: {row['question']}"))
            display(Markdown(f"**Answer**: {row['answer']}"))
            display(Markdown(f"**Result**: {row['result']}\n"))

    def on_button_right_clicked(b):
        """
        Update right_or_wrong column with 'right' and go to next row
        """
        df.at[index[0], 'right_or_wrong'] = 'right'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df):
            display_row(df.iloc[index[0]])

    def on_button_wrong_clicked(b):
        """
        Update right_or_wrong column with 'wrong' and go to next row
        """
        df.at[index[0], 'right_or_wrong'] = 'wrong'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df):
            display_row(df.iloc[index[0]])

    def on_button_partial_clicked(b):
        """
        Update right_or_wrong column with 'partial' and go to next row
        """
        df.at[index[0], 'right_or_wrong'] = 'partial'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df):
            display_row(df.iloc[index[0]])

    button_right.on_click(on_button_right_clicked)
    button_wrong.on_click(on_button_wrong_clicked)
    button_partial.on_click(on_button_partial_clicked)

    display(out)
    display(button_right, button_wrong, button_partial)

    # Start by displaying the first row
    display_row(df.iloc[0])

In [11]:
def retrieve_docs(df, query_engine):
    # Add retrieved_docs column if it doesn't exist
    df['retrieved_docs'] = df.get('retrieved_docs', '')

    for index, row in df.iterrows():
        retrieved_docs = query_engine.retrieve(row['question'])

        docs = ""
        for d in retrieved_docs:
            if docs == "":
                docs = d.node.text
            else:
                docs = docs + "\n\n:::NEXT DOC:::\n\n" + d.node.text

        # Store retrieved docs in the DataFrame
        df.at[index, 'retrieved_docs'] = docs

In [12]:
def evaluate_dataframe_retrieval(df, query_engine):
    # Add rd_right_or_wrong column if it doesn't exist
    df['rd_right_or_wrong'] = df.get('rd_right_or_wrong', '')

    df_wrong = df[df['right_or_wrong'].isin(['wrong', 'partial'])].copy()

    # Widgets
    button_right = widgets.Button(description="Right")
    button_wrong = widgets.Button(description="Wrong")
    button_partial = widgets.Button(description="Partial")
    out = widgets.Output()

    # Starting index
    index = [0]  # Use list to allow mutation inside functions

    def display_row(row):
        """
        Display the row data in markdown
        """
        with out:
            display(Markdown(f"**Question**: {row['question']}"))
            display(Markdown(f"**Answer**: {row['answer']}"))
            display(Markdown(f"**Result**: {row['result']}\n"))
            display(Markdown(f"**Retrieved Docs**: {row['retrieved_docs']}"))

    def on_button_right_clicked(b):
        """
        Update right_or_wrong column with 'right' and go to next row
        """
        df.at[df_wrong.index[index[0]], 'rd_right_or_wrong'] = 'right'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df_wrong):
            display_row(df_wrong.iloc[index[0]])

    def on_button_wrong_clicked(b):
        """
        Update right_or_wrong column with 'wrong' and go to next row
        """
        df.at[df_wrong.index[index[0]], 'rd_right_or_wrong'] = 'wrong'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df_wrong):
            display_row(df_wrong.iloc[index[0]])

    def on_button_partial_clicked(b):
        """
        Update right_or_wrong column with 'partial' and go to next row
        """
        df.at[df_wrong.index[index[0]], 'rd_right_or_wrong'] = 'partial'
        index[0] += 1
        out.clear_output()
        if index[0] < len(df_wrong):
            display_row(df_wrong.iloc[index[0]])

    button_right.on_click(on_button_right_clicked)
    button_wrong.on_click(on_button_wrong_clicked)
    button_partial.on_click(on_button_partial_clicked)

    display(out)
    display(button_right, button_wrong, button_partial)

    # Start by displaying the first row
    if len(df_wrong) > 0:
        display_row(df_wrong.iloc[0])
    else:
        print("There are no wrong rows in the DataFrame.")

##### Run 1

- create a `llm_predictor` using ChatOpenAI model
- get a `GPTVectorStoreIndex`
- default `query_engine`

Test only the first 10 questions from ground truths and evaluate manually

In [13]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
index = GPTVectorStoreIndex.from_documents(
    documents, service_context=service_context
)
query_engine = index.as_query_engine()

2023-05-29 16:45:33,990 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 16:45:33,992 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [14]:
df1 = df.copy()

for i, gt in df1.iterrows():
    try:
        df1.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df1.loc[i, "result"] = "ERROR"

2023-05-29 16:45:34,589 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:45:34,590 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 16:45:44,328 - INFO - > [get_response] Total LLM token usage: 1725 tokens
2023-05-29 16:45:44,329 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 16:45:44,852 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:45:44,853 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 16:45:50,403 - INFO - > [get_response] Total LLM token usage: 1964 tokens
2023-05-29 16:45:50,404 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 16:45:51,117 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:45:51,118 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 16:46:19,478 - INFO - > [get_response] Total LLM token usage: 2082 tokens
2023-05-29 16:46:19,479 - INFO - > [get_response] Total embedding token usage: 0

In [23]:
evaluate_dataframe_answer(df1)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

In [24]:
evaluate_dataframe_retrieval(df1, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

2023-05-29 16:56:19,417 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:56:19,418 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 16:56:19,418 - INFO - > [retrieve] Total embedding token usage: 14 tokens


2023-05-29 16:56:43,748 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:56:43,749 - INFO - > [retrieve] Total embedding token usage: 14 tokens


2023-05-29 16:57:10,153 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:57:10,153 - INFO - > [retrieve] Total embedding token usage: 24 tokens


2023-05-29 16:57:26,836 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:57:26,838 - INFO - > [retrieve] Total embedding token usage: 16 tokens


2023-05-29 16:57:41,114 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:57:41,115 - INFO - > [retrieve] Total embedding token usage: 7 tokens


In [26]:
df1.to_csv("exp_1.csv", index=False)

##### Run 2

In [27]:
experiment = "exp_2"
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ]
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 16:59:42,369 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 16:59:42,370 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [28]:
df2 = df.copy()

for i, gt in df2.iterrows():
    try:
        df2.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df2.loc[i, "result"] = "ERROR"

2023-05-29 16:59:47,952 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 16:59:47,953 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:00:14,278 - INFO - > [get_response] Total LLM token usage: 3549 tokens
2023-05-29 17:00:14,279 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 17:00:14,912 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:00:14,913 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:01:12,099 - INFO - > [get_response] Total LLM token usage: 4901 tokens
2023-05-29 17:01:12,101 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 17:01:12,549 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:01:12,551 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 17:01:34,495 - INFO - > [get_response] Total LLM token usage: 3846 tokens
2023-05-29 17:01:34,496 - INFO - > [get_response] Total embedding token usage: 0

In [39]:
evaluate_dataframe_answer(df2)

In [35]:
retrieve_docs(df2, query_engine)

In [40]:
evaluate_dataframe_retrieval(df2, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

In [42]:
# df2.to_csv("exp_2.csv", index=False)

##### Run 3 - Run 2 with longer outputs

In [43]:
experiment = "exp_2"
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256*2
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ]
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 17:23:55,164 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 17:23:55,166 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [44]:
df3 = df.copy()

for i, gt in df3.iterrows():
    try:
        df3.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df3.loc[i, "result"] = "ERROR"

2023-05-29 17:24:05,056 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:24:05,057 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:24:05,415 - INFO - error_code=None error_message='The server had an error while processing your request. Sorry about that!' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-05-29 17:24:35,245 - INFO - > [get_response] Total LLM token usage: 3549 tokens
2023-05-29 17:24:35,246 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 17:24:35,848 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:24:35,849 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:24:36,256 - INFO - error_code=None error_message='The server had an error while processing your request. Sorry about that!' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-05-29 17:25:26,411 - INFO - 

In [45]:
evaluate_dataframe_answer(df3)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

In [46]:
retrieve_docs(df3, query_engine)

2023-05-29 17:35:52,498 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:52,499 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:35:53,190 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:53,190 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:35:53,631 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:53,632 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 17:35:54,049 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:54,050 - INFO - > [retrieve] Total embedding token usage: 15 tokens
2023-05-29 17:35:54,669 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:54,669 - INFO - > [retrieve] Total embedding token usage: 27 tokens
2023-05-29 17:35:55,247 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:35:55,248 - INFO - > [retrieve] Total embedding token usage: 24 tokens
2023-05-29 17:35:55,65

In [47]:
evaluate_dataframe_retrieval(df3, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

In [48]:
df3.to_csv("exp_3.csv", index=False)

##### Run 4 - Run 2 with different response mode - refine

In [49]:
experiment = "exp_2"
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ],
    response_mode = "refine", # https://github.com/jerryjliu/llama_index/blob/main/llama_index/indices/response/type.py#L7

)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 17:40:54,158 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 17:40:54,160 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [50]:
df4 = df.copy()

for i, gt in df4.iterrows():
    try:
        df4.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df4.loc[i, "result"] = "ERROR"

# takes very long

2023-05-29 17:40:58,191 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:40:58,192 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:40:58,538 - INFO - error_code=None error_message='The server had an error while processing your request. Sorry about that!' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-05-29 17:42:57,601 - INFO - > [get_response] Total LLM token usage: 5967 tokens
2023-05-29 17:42:57,603 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 17:42:58,342 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 17:42:58,342 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 17:43:25,563 - INFO - error_code=None error_message='The server had an error while processing your request. Sorry about that!' error_param=None error_type=server_error message='OpenAI API error received' stream_error=False
2023-05-29 17:43:29,892 - INFO - 

In [51]:
retrieve_docs(df4, query_engine)
df4.to_csv("exp_4.csv", index=False)

2023-05-29 18:17:36,095 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:36,096 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:17:36,543 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:36,544 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:17:37,171 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:37,172 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 18:17:37,772 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:37,773 - INFO - > [retrieve] Total embedding token usage: 15 tokens
2023-05-29 18:17:38,303 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:38,304 - INFO - > [retrieve] Total embedding token usage: 27 tokens
2023-05-29 18:17:38,895 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:17:38,896 - INFO - > [retrieve] Total embedding token usage: 24 tokens
2023-05-29 18:17:39,47

In [62]:
evaluate_dataframe_answer(df4)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

In [65]:
evaluate_dataframe_retrieval(df4, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

In [66]:
df4.to_csv("exp_4.csv", index=False)

##### Run 5 - Run 2 with different response mode - tree_summarize

In [54]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ],
    response_mode = "tree_summarize", # https://github.com/jerryjliu/llama_index/blob/main/llama_index/indices/response/type.py#L7

)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 18:19:36,666 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 18:19:36,667 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [55]:
df5 = df.copy()

for i, gt in df5.iterrows():
    try:
        df5.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df5.loc[i, "result"] = "ERROR"

2023-05-29 18:19:37,409 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:19:37,409 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:20:03,753 - INFO - > [get_response] Total LLM token usage: 3719 tokens
2023-05-29 18:20:03,754 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 18:20:03,754 - INFO - > [get_response] Total LLM token usage: 3719 tokens
2023-05-29 18:20:03,755 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 18:20:04,549 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:20:04,550 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:20:04,573 - INFO - > Building index from nodes: 1 chunks
2023-05-29 18:20:51,424 - INFO - > [get_response] Total LLM token usage: 515 tokens
2023-05-29 18:20:51,425 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 18:20:51,425 - INFO - > [get_response] Total LLM token usage: 5001 tokens


In [56]:
retrieve_docs(df5, query_engine)
df5.to_csv("exp_5.csv", index=False)

2023-05-29 18:32:47,007 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:47,008 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:32:47,532 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:47,533 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:32:48,037 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:48,037 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 18:32:48,608 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:48,609 - INFO - > [retrieve] Total embedding token usage: 15 tokens
2023-05-29 18:32:49,003 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:49,004 - INFO - > [retrieve] Total embedding token usage: 27 tokens
2023-05-29 18:32:49,536 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:32:49,536 - INFO - > [retrieve] Total embedding token usage: 24 tokens
2023-05-29 18:32:50,55

In [67]:
evaluate_dataframe_answer(df5)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

In [68]:
evaluate_dataframe_retrieval(df5, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

In [69]:
df5.to_csv("exp_5.csv", index=False)

##### Run 6 - Run 2 with different response mode - ACCUMULATE

In [57]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ],
    response_mode = "accumulate", # https://github.com/jerryjliu/llama_index/blob/main/llama_index/indices/response/type.py#L7

)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 18:36:07,065 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 18:36:07,066 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


In [58]:
df6 = df.copy()

for i, gt in df6.iterrows():
    try:
        df6.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df6.loc[i, "result"] = "ERROR"

2023-05-29 18:37:46,976 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:37:46,977 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:39:24,307 - INFO - > [get_response] Total LLM token usage: 4618 tokens
2023-05-29 18:39:24,307 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 18:39:24,714 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:39:24,714 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:40:45,926 - INFO - > [get_response] Total LLM token usage: 4987 tokens
2023-05-29 18:40:45,927 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 18:40:46,544 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:40:46,545 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 18:42:08,701 - INFO - > [get_response] Total LLM token usage: 4512 tokens
2023-05-29 18:42:08,702 - INFO - > [get_response] Total embedding token usage: 0

In [59]:
retrieve_docs(df6, query_engine)
df6.to_csv("exp_6.csv", index=False)

2023-05-29 18:58:47,014 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:47,015 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:58:47,669 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:47,669 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 18:58:48,244 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:48,245 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 18:58:48,730 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:48,730 - INFO - > [retrieve] Total embedding token usage: 15 tokens
2023-05-29 18:58:49,439 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:49,440 - INFO - > [retrieve] Total embedding token usage: 27 tokens
2023-05-29 18:58:50,116 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 18:58:50,116 - INFO - > [retrieve] Total embedding token usage: 24 tokens
2023-05-29 18:58:50,55

In [70]:
evaluate_dataframe_answer(df6)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

Didn't evaluate as there were four responses

##### Run 7 - Run 2 with different response mode - compact_accumulate 

In [13]:
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.,))
max_input_size = 4096
num_output = 256
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

index = GPTVectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=4,
)

response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ],
    response_mode = "compact_accumulate", # https://github.com/jerryjliu/llama_index/blob/main/llama_index/indices/response/type.py#L7

)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

2023-05-29 19:48:16,544 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 19:48:16,546 - INFO - > [build_index_from_nodes] Total embedding token usage: 701713 tokens


ValueError: Unknown mode: compact_accumulate

In [None]:
df7 = df.copy()

for i, gt in df7.iterrows():
    try:
        df7.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df7.loc[i, "result"] = "ERROR"

Not on pip yet

##### Run 8 - LLM Reranker Retriever

In [15]:
from llama_index import GPTListIndex
from llama_index.indices.list.retrievers import ListIndexLLMRetriever
from llama_index.indices.postprocessor import (
    LLMRerank
)

In [16]:
# based on https://medium.com/llamaindex-blog/using-llms-for-retrieval-and-reranking-23cf2d3a14b6
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

reranker = LLMRerank(choice_batch_size=5, top_n=3, service_context=service_context)

query_engine = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[reranker],
)


2023-05-29 19:58:28,119 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-29 19:58:28,120 - INFO - > [build_index_from_nodes] Total embedding token usage: 965021 tokens


In [17]:
df8 = df.copy()

for i, gt in df8.iterrows():
    try:
        df8.loc[i, "result"] = query_engine.query(gt.question)
    except Exception as e:
        logger.error(e)
        df8.loc[i, "result"] = "ERROR"

2023-05-29 19:58:40,446 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 19:58:40,448 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 19:59:12,845 - INFO - > [get_response] Total LLM token usage: 1968 tokens
2023-05-29 19:59:12,846 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 19:59:14,173 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 19:59:14,177 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 20:00:03,577 - INFO - > [get_response] Total LLM token usage: 2354 tokens
2023-05-29 20:00:03,577 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-29 20:00:04,361 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:00:04,362 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 20:00:26,520 - INFO - > [get_response] Total LLM token usage: 1277 tokens
2023-05-29 20:00:26,521 - INFO - > [get_response] Total embedding token usage: 0

In [18]:
evaluate_dataframe_answer(df8)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='partial', style=ButtonStyle())

In [19]:
retrieve_docs(df8, query_engine)

2023-05-29 20:16:39,742 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:39,743 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 20:16:40,586 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:40,587 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-29 20:16:41,414 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:41,416 - INFO - > [retrieve] Total embedding token usage: 16 tokens
2023-05-29 20:16:42,314 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:42,315 - INFO - > [retrieve] Total embedding token usage: 15 tokens
2023-05-29 20:16:43,094 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:43,096 - INFO - > [retrieve] Total embedding token usage: 27 tokens
2023-05-29 20:16:43,826 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-29 20:16:43,827 - INFO - > [retrieve] Total embedding token usage: 24 tokens
2023-05-29 20:16:44,59

In [20]:
evaluate_dataframe_retrieval(df8, query_engine)

Output()

Button(description='Right', style=ButtonStyle())

Button(description='Wrong', style=ButtonStyle())

Button(description='Partial', style=ButtonStyle())

In [21]:
df8.to_csv("exp_8.csv", index=False)

##### Run x - Forward/Backward Augmentation

In [18]:
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.indices.postprocessor.node import PrevNextNodePostprocessor
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.storage_context import StorageContext

In [11]:
experiment = "exp_4"
service_context = ServiceContext.from_defaults(chunk_size_limit=512)
nodes = service_context.node_parser.get_nodes_from_documents(documents)
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)
index = GPTVectorStoreIndex(nodes, storage_context=storage_context)

2023-05-25 14:37:36,024 - INFO - > [build_index_from_nodes] Total LLM token usage: 0 tokens
2023-05-25 14:37:36,025 - INFO - > [build_index_from_nodes] Total embedding token usage: 965021 tokens


In [21]:
node_postprocessor = PrevNextNodePostprocessor(docstore=docstore, num_nodes=4)

query_engine = index.as_query_engine(
    similarity_top_k=1,
    node_postprocessors=[node_postprocessor],
    response_mode="tree_summarize"
)

result_df = run_experiment(query_engine=query_engine,
                            ground_truths_file_path="/home/marshath/play/chainlink/algovate/algovate/data/ground_truths.pkl",
                            grading_type="fast",
                            logger=logger)

result_df.to_csv(f"/home/marshath/play/chainlink/algovate/algovate/data/llama_results_{experiment}.csv", index=False)

2023-05-25 14:43:32,365 - INFO - Number of ground truths: 27
2023-05-25 14:43:33,359 - INFO - > [retrieve] Total LLM token usage: 0 tokens
2023-05-25 14:43:33,360 - INFO - > [retrieve] Total embedding token usage: 14 tokens
2023-05-25 14:43:55,143 - INFO - > [get_response] Total LLM token usage: 2963 tokens
2023-05-25 14:43:55,144 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-25 14:43:55,144 - INFO - > [get_response] Total LLM token usage: 2963 tokens
2023-05-25 14:43:55,144 - INFO - > [get_response] Total embedding token usage: 0 tokens
2023-05-25 14:43:57,585 - INFO - [{'question': 'give me a sample solidity contract to use Chainlink price feeds?', 'answer': '// SPDX-License-Identifier: MIT\npragma solidity ^0.8.7;\n\nimport "@chainlink/contracts/src/v0.8/interfaces/AggregatorV3Interface.sol";\n\ncontract PriceConsumerV3 {\n    AggregatorV3Interface internal priceFeed;\n\n    /**\n     * Network: Sepolia\n     * Aggregator: BTC/USD\n     * Address: 0x1b44F35

In [52]:
# display_row_as_markdown(result_df, 26)

##### Run x

In [56]:
# https://medium.com/llamaindex-blog/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec

In [61]:
# seems redundant - calls take a long since there are 150+ documents

# response_synthesizer = ResponseSynthesizer.from_args(response_mode="tree_summarize")

# doc_summary_index = GPTDocumentSummaryIndex.from_documents(
#     documents, 
#     service_context=service_context,
#     response_synthesizer=response_synthesizer
# )