In [1]:
import warnings
warnings.filterwarnings('ignore')

import dotenv
dotenv.load_dotenv(dotenv.find_dotenv(), override=True)

import pandas as pd
import numpy as np

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

import os
import sys
from pathlib import Path

# Make the project packages importable from the notebook directory.
# sys.path entries must be plain strings: non-str entries such as Path objects
# are ignored by the import system, so the working directory is converted with
# str(). The membership check keeps the cell idempotent when it is re-run.
for extra_path in ("../backend/", "../", str(Path.cwd())):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# What do the generated QA pairs look like?

In [None]:
import json
# Read the previously generated QA pairs from disk and parse them as JSON.
gt_dataset = json.loads(Path("../tmp/eval_data.json").read_text(encoding="utf-8"))

# Each QA pair holds the generated question and answer, the context in the
# document with its corresponding source, and an attached id.
gt_dataset[0]

# Load and split data into chunks

In [10]:
from backend.utils import load_and_chunk_doc, get_qa_llm, get_retriever
from backend.commons.configurations import Hyperparameters, CVRetrieverSearchType
import glob
import pandas as pd

import uuid

# Hyperparameters for one run against the OpenAI embedding and chat models.
hp_dict = {
    "id": 0,
    "chunk_size": 512,
    "chunk_overlap": 10,
    "num_retrieved_docs": 3,
    "use_llm_grader": False,
    "similarity_method": "cosine",
    "search_type": "mmr",
    "length_function_name": "text-embedding-ada-002",
    "grade_answer_prompt": "few_shot",
    "grade_docs_prompt": "default",
    "embedding_model": "text-embedding-ada-002",
    "qa_llm": "gpt-3.5-turbo",
    "grader_llm": "gpt-3.5-turbo",
}

# The key is read from the environment (populated by load_dotenv in the first cell).
api_keys = {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}

hp = Hyperparameters.from_dict(hp_dict, api_keys)

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
emb = OpenAIEmbeddings()

# glob returns matches in arbitrary order, so sort to always pick the same
# document, and fail with a clear message when the folder has no .txt files.
resource_paths = sorted(glob.glob("../tmp/resources/*.txt"))
if not resource_paths:
    raise FileNotFoundError("No .txt documents found in ../tmp/resources/")

chunks = load_and_chunk_doc(hp, resource_paths[0])
retriever = get_retriever(chunks, hp)
qa_llm = get_qa_llm(retriever, llm)

# Tokenise every chunk once and reuse the counts for both the total and the mean.
chunk_token_counts = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]
print(
    f"number of tokens in document: {sum(chunk_token_counts)}\n"
    f"number of chunks: {len(chunks)}\n"
    f"average number of tokens per chunk: {np.average(chunk_token_counts)}"
)

2023-10-18 14:07:09,382 - INFO - backend.utils.utils:utils.py:42 - Constructing vectorstore and retriever.
2023-10-18 14:07:09,397 - INFO - chromadb.telemetry.posthog:posthog.py:17 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


number of tokens in document: 4741
number of chunks: 14
average number of tokens per chunk: 338.64285714285717


# Generate Q/A pairs

In [None]:
from backend.commons.configurations import BaseConfigurations, QAConfigurations, Hyperparameters
from langchain.chains import QAGenerationChain
from backend.commons.prompts import QA_GENERATION_PROMPT_SELECTOR
from backend.testsetgen.test_set_generator import get_qa_from_chunk
import itertools

# Settings for QA-pair generation; the LLM and embedding entries name fake test models.
qa_params = dict(
    chunk_size=2048,
    chunk_overlap=0,
    qa_generator_llm="FakeTestLLM",
    length_function_name="len",
    generate_eval_set=True,
    persist_to_vs=True,
    embedding_model_list=["FakeTestEmbedding", "FakeTestEmbedding"],
)

# The second argument is an empty mapping here (the earlier OpenAI cell passes
# its api_keys dict in the same position to Hyperparameters.from_dict).
hp_qa = QAConfigurations.from_dict(qa_params, {})

In [None]:
eval_set = []
qa_chain = QAGenerationChain.from_llm(hp_qa.qa_generator_llm, prompt=QA_GENERATION_PROMPT_SELECTOR.get_prompt(hp_qa.qa_generator_llm))

qa_pairs = [await get_qa_from_chunk(chunks[i], qa_chain) for i in range(len(chunks))]
qa_pairs = list(itertools.chain.from_iterable(qa_pairs))

In [28]:
from backend.evaluation import arun_evaluation
import uuid

await arun_evaluation(
        document_store_path="../tmp/test/document_store/",
        eval_params_path="../tmp/test/input_eval_params.json",
        eval_dataset_path="../tmp/test/input_eval_data.json",
        eval_results_path="../tmp/test/output_eval_results.json",
        hp_runs_data_path="../tmp/test/output_hp_runs_data.csv",
        user_id=str(uuid.uuid4()),
        api_keys={}
    )

  0%|                                                                                                                        | 0/1 [00:00<?, ?it/s]2023-10-18 15:12:12,673 - INFO - backend.utils.utils:utils.py:42 - Constructing vectorstore and retriever.
2023-10-18 15:12:12,689 - INFO - chromadb.telemetry.posthog:posthog.py:17 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /api/v1/collections (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000024E5070A450>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
from langchain.embeddings import DeterministicFakeEmbedding
from langchain.llms.fake import FakeListLLM
from langchain.chains import QAGenerationChain
from backend.testsetgen.test_set_generator import get_qa_from_chunk

# Smoke-test QA generation with fake models instead of the OpenAI ones.
# NOTE: this rebinds the notebook-global `llm` that an earlier cell set to
# ChatOpenAI, so cells run afterwards see the fake model.
llm = FakeListLLM(responses=["foo_response"])
# `emb_model` is created here but not used in this cell.
emb_model = DeterministicFakeEmbedding(size=2)

# No prompt is passed here, unlike the generation cell above which selects one explicitly.
qa_chain = QAGenerationChain.from_llm(llm)

# Generate QA pairs for the first chunk only; the result is displayed as the cell output.
await get_qa_from_chunk(chunks[0], qa_chain)

In [11]:
from backend.utils import get_qa_llm, get_retriever
from backend.commons.configurations import CVRetrieverSearchType
from backend.commons.configurations import Hyperparameters

# Hyperparameters that name the dummy test embedding and LLM; this rebinds the
# notebook-global `hp_dict` / `hp` set by the earlier OpenAI cell.
hp_dict = dict(
    id=0,
    chunk_size=512,
    chunk_overlap=10,
    num_retrieved_docs=2,
    similarity_method="cosine",
    search_type="mmr",
    length_function_name="len",
    embedding_model="TestDummyEmbedding",
    qa_llm="TestDummyLLM",
    use_llm_grader=False,
)
hp = Hyperparameters.from_dict(hp_dict, {})

# Rebuild the retriever over the existing chunks with the new hyperparameters.
retriever = get_retriever(chunks, hp)
# `llm` is whichever model object an earlier cell bound last (ChatOpenAI or the
# FakeListLLM, depending on execution order).
qa_llm = get_qa_llm(retriever, llm, return_source_documents=True)

2023-10-18 14:07:14,427 - INFO - backend.utils.utils:utils.py:42 - Constructing vectorstore and retriever.
2023-10-18 14:07:14,427 - INFO - chromadb.telemetry.posthog:posthog.py:17 - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


In [27]:
# Inspect the collection behind the retriever's vector store. This reaches into
# the private `_collection` attribute, so it is for debugging only.
retriever.vectorstore._collection.dict()

{'name': '76092b6d-9f5e-4117-ba39-936268e8bd79',
 'id': UUID('d743fa83-ccdf-489d-a697-fccdd0374fbe'),
 'metadata': {'hnsw:space': 'cosine'}}

In [26]:
# Show the retriever's configuration (tags, search type, search kwargs) as a dict.
retriever.dict()

{'tags': ['Chroma', 'TestDummyEmbedding'],
 'metadata': None,
 'vectorstore': <langchain.vectorstores.chroma.Chroma at 0x24e4c67e310>,
 'search_type': 'mmr',
 'search_kwargs': {'k': 2}}

In [None]:
# Keep the chain's response in `out`: a later cell reads out["source_documents"],
# and no other cell assigns that name. The bare expression still displays the result.
out = qa_llm("Hi")
out

In [None]:
from langchain.embeddings import DeterministicFakeEmbedding
from langchain.llms.fake import FakeListLLM
from pydantic.v1 import BaseModel

class TestDummyLLM(FakeListLLM):
    """Fake LLM for tests, configured with the single canned response "foo_response".

    Adds a ``model_name`` field and includes it in the output of ``dict()``.
    """

    model_name: str = "TestDummyLLM"

    def __init__(self):
        # Takes no arguments: the canned response list is fixed.
        super().__init__(responses=["foo_response"])

    def dict(self, *args, **kwargs):
        """Return the parent's dict representation with ``model_name`` set."""
        return {**super().dict(*args, **kwargs), "model_name": self.model_name}
    
class TestDummyEmbedding(DeterministicFakeEmbedding):
    """Fake embedding model for tests, always constructed with ``size=2``."""
    # Value of the `model` field for this dummy embedding.
    model: str = "TestDummyEmbedding"

    def __init__(self):
        # Takes no arguments: the size handed to DeterministicFakeEmbedding is fixed at 2.
        super().__init__(size=2)


In [None]:
import json

# Read the evaluation data explicitly as UTF-8. Without an encoding argument
# open() falls back to the locale's default, which is platform-dependent and
# may not be UTF-8 (for example on Windows).
with open("../tmp/test/eval_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Scratch check: the list is empty, so the message is printed.
x = list()
if len(x) == 0:
    print("hi")

In [None]:
# Count the entries under the "source_documents" key of a QA-chain result.
# NOTE(review): `out` is expected to hold the result of an earlier QA-chain call
# (presumably `qa_llm("Hi")` built with return_source_documents=True); make sure
# it is bound before running this cell.
len(out["source_documents"])

# Set up vectorstore and retriever

In [None]:
# NOTE(review): this cell looks stale relative to the rest of the notebook - confirm before running:
#   * other cells import get_retriever from `backend.utils`, not `eval_backend.utils`;
#   * `split_data` is not defined or imported in any visible cell;
#   * other cells call get_retriever(chunks, hp) rather than passing an embedding
#     object and a document count.
from eval_backend.utils import get_retriever

# Rebinds the notebook-global `llm` and `retriever`.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
chunks_vs = split_data(data=data, chunk_size=512)
retriever = get_retriever(chunks_vs, OpenAIEmbeddings(model="text-embedding-ada-002"), 3)


"hnsw:space", cosine, l2, inner_product 'ip'

 collection = client.create_collection(
        name="collection_name",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )

# LLM chain for query answering based on document chunks

In [None]:
from backend.utils import get_qa_llm
# Build the QA chain from the current notebook-global retriever and `hp.qa_llm`.
# NOTE(review): other cells pass an LLM object as the second argument; presumably
# Hyperparameters.from_dict resolves the configured name to a model object - confirm.
qa_llm = get_qa_llm(retriever, hp.qa_llm)

# also returns source document chunks by default
await qa_llm.acall("What is TRAIL.X?")

QA grading functions, such as embedding similarity, can be found in `backend/evaluation/evaluation_metrics.py`.