In [None]:
# Silence all warnings for the rest of the notebook session.
import warnings
warnings.filterwarnings(action='ignore')

# Provide API Keys in .env file
import dotenv
# Locate the .env file first, then load it; override=True lets its values
# replace variables that are already set in the environment.
env_file = dotenv.find_dotenv()
dotenv.load_dotenv(env_file, override=True)

import pandas as pd
import numpy as np

from langchain.docstore.document import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

import os
import sys
from pathlib import Path

# Make the project packages importable from the notebook's working directory.
sys.path.append("../backend/")
sys.path.append("../")
# sys.path entries must be strings: a pathlib.Path object is silently ignored
# by the import machinery, so convert the working directory explicitly.
sys.path.append(str(Path.cwd()))

# What do the generated QA pairs look like?

In [None]:
import json

# Load the label dataset of generated QA pairs from disk.
with open("../resources/dev/label_dataset.json", "r", encoding="utf-8") as file:
    qa_eval_pairs = json.load(file)

# each QA pair contains generated question, answer, context in document with corresponding source and attached id
# (show the first loaded pair; the data lives in `qa_eval_pairs`, the name assigned above)
qa_eval_pairs[0]

# Load and split data into chunks

In [None]:
from backend.utils import load_and_chunk_doc, get_qa_llm, get_retriever
from backend.commons.configurations import Hyperparameters, CVRetrieverSearchType
import glob
import pandas as pd

import uuid

# Random per-session identifier (UUID4 rendered as a string).
user_id = str(uuid.uuid4())

# The OpenAI key is read from the environment; the value is None if it is not set.
api_keys = {"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY")}

# Raw hyperparameter values for this run, converted to a Hyperparameters object below.
hp_dict = {
    "chunk_size": 512,
    "chunk_overlap": 10,
    "num_retrieved_docs": 3,
    "similarity_method": "cosine",
    "search_type": "mmr",
    "length_function_name": "text-embedding-ada-002",
    "embedding_model": "text-embedding-ada-002",
    "qa_llm": "gpt-3.5-turbo",
    # settings for grading llm
    "use_llm_grader": False,
    # optional grader keys, currently disabled:
    #   "grade_answer_prompt": "few_shot",
    #   "grade_docs_prompt": "default",
    #   "grader_llm": "gpt-3.5-turbo"
}

hp = Hyperparameters.from_dict(
    input_dict=hp_dict,
    hp_id=99,
    api_keys=api_keys,
)

# Chat model used for question answering and embedding client.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
emb = OpenAIEmbeddings()

# glob returns matches in filesystem-dependent order, so sort to pick the same
# document on every run, and fail with a clear message when none is found
# (instead of a bare IndexError).
doc_paths = sorted(glob.glob("../resources/tests/document_store/*.txt"))
if not doc_paths:
    raise FileNotFoundError("No .txt documents found in ../resources/tests/document_store/")

# Load the first document and split it into chunks according to the hyperparameters.
chunks = load_and_chunk_doc(hp, doc_paths[0])

# creates chroma collection for user and hp_id and embeds document chunks
retriever = get_retriever(chunks=chunks, hp=hp, user_id=user_id)
qa_llm = get_qa_llm(retriever=retriever, qa_llm=llm, return_source_documents=True)

# Count the tokens of every chunk once and reuse the counts for both the total
# and the average, instead of running the token count over all chunks twice.
tokens_per_chunk = [llm.get_num_tokens(chunk.page_content) for chunk in chunks]

print(f"number of tokens in document: {sum(tokens_per_chunk)}\
\nnumber of chunks: {len(chunks)}\
\naverage number of tokens per chunk: {np.average(tokens_per_chunk)}")

In [None]:
# Ask the QA chain a sample question; the cell output is the chain's response.
question = "Of what time does the document speak of?"
qa_llm(question)

# Generate Q/A pairs

In [None]:
from backend.commons.configurations import BaseConfigurations, QAConfigurations, Hyperparameters
from langchain.chains import QAGenerationChain
from backend.commons.prompts import QA_GENERATION_PROMPT_SELECTOR
from backend.generation.label_dataset_generator import get_qa_from_chunk
import itertools

# Raw settings for QA pair generation, converted to a QAConfigurations object below.
qa_params = {
    "chunk_size": 2048,
    "chunk_overlap": 0,
    "qa_generator_llm": "gpt-3.5-turbo",
    # chunk_size is measured with this length function; it could also be the
    # number of tokens from an embedding model
    "length_function_name": "len",
    # also embeds generated answers for caching and later evaluation
    "persist_to_vs": True,
    "embedding_model_list": ["text-embedding-ada-002"],
}

hp_qa = QAConfigurations.from_dict(
    api_keys=api_keys,
    input_dict=qa_params,
)



In [None]:
qa_chain = QAGenerationChain.from_llm(hp_qa.qa_generator_llm, prompt=QA_GENERATION_PROMPT_SELECTOR.get_prompt(hp_qa.qa_generator_llm))

qa_pairs = [await get_qa_from_chunk(chunks[i], qa_chain) for i in range(1)]
qa_pairs = list(itertools.chain.from_iterable(qa_pairs))

In [None]:
# QA pairs generated from the first document chunk only (chunks[0]);
# each entry is expected to be a question-answer-context triple — TODO confirm against get_qa_from_chunk
qa_pairs

### Grading metrics like embedding similarity can be found in the backend.evaluation.metrics module.