In [None]:
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

In [2]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if OPENAI_API_KEY:
    print("API key loaded successfully.")
else:
    raise ValueError("Error loading API key. Check that OPENAI_API_KEY is set inside .env")

API key loaded successfully.


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def docs_splitter(docs):
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    return all_splits

In [4]:
loaderType_dict = {1 : "PyPDFLoader", 2 : "PyMyPDF4llm", 3 : "LlamaParser"}

In [5]:
## make embeddings using pdf loaders below :
from langchain_community.document_loaders import PyPDFLoader
import pymupdf4llm
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import nest_asyncio
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# the vesion of PDF Loaders
# (1) PyPDFLoader
# (2) PyMyPDF4llm
# (3) LlamaParser

def make_embeddings(loaderType, file_name):
    FILE_PATH = os.path.join(os.getcwd(), "examplefiles", file_name)
    if loaderType_dict[loaderType] == "PyPDFLoader":
        loader = PyPDFLoader(file_path=FILE_PATH)
        docs = loader.load()

    elif loaderType_dict[loaderType] == "PyMyPDF4llm":
        loader = pymupdf4llm.LlamaMarkdownReader()
        docs_llama = loader.load_data(file_path=FILE_PATH)
        doc_creator = CharacterTextSplitter()
        docs = doc_creator.create_documents(metadatas=list(map(lambda x : x.metadata, docs_llama)), 
                                    texts=list(map(lambda x : x.text, docs_llama)))
                                    
    elif loaderType_dict[loaderType] == "LlamaParser":
        nest_asyncio.apply()
        parser = LlamaParse(
            result_type="markdown", 
            num_workers=8, 
            verbose=True
        )
        file_extractor = {".pdf": parser}
        docs = SimpleDirectoryReader(
            input_files=[FILE_PATH],
            file_extractor=file_extractor,
        ).load_data()
        docs = [doc.to_langchain_format() for doc in documents]
        
    else:
        print("Wrong loader type is entered.")
        return False

    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
    )

    all_splits = docs_splitter(docs)

    try:
        vectorstore = Chroma.from_documents(documents=all_splits,
                                    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
                                    persist_directory=f"data/{file_name}_{loaderType_dict[loaderType]}")
        print("Embeded vector store is succesfully made.")
        return vectorstore
    except:
        print("Embedding is Failed ... ㅠㅠ")
        return False

In [6]:
FILE_NAME = "DB Concepts Chapter1.pdf"
LOADER_TYPE = 1

make_embeddings(loaderType=LOADER_TYPE, file_name=FILE_NAME)

Embeded vector store is succesfully made.


<langchain_chroma.vectorstores.Chroma at 0x2b21d98af80>

In [7]:
class ChatBot_test():
    def __init__(self, chat_llm):
        load_dotenv()

        self.llm_version = chat_llm
        # the version of OpenAI models
        # (1) gpt-3.5-turbo
        # (2) gpt-3.5-turbo-0613
        # (3) gpt-3.5-turbo-16k-0613
        # (4) gpt-3.5-turbo-instruct-0914
        # (5) gpt-4
        # (6) gpt-4o-mini

        CORPUS_PATH = os.path.join(os.getcwd(), "corpus")  
        CHROMA_PATH = os.path.join(os.getcwd(), f"data/{FILE_NAME}_{loaderType_dict[LOADER_TYPE]}")

        if not os.path.exists(CHROMA_PATH):
            print("Embeded vector store does not exist.")
            return False

        self.chat_history = [] 
        self.llm = ChatOpenAI(model=self.llm_version, temperature=0, openai_api_key=OPENAI_API_KEY)
        # => if time left, I'll do experiments on which temperature is ideal between 0 and 1

        self.vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
        self.retriever = self.vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

        self.prompt = PromptTemplate(
            input_variables=["history", "context", "question"],
            template="""
            You are a knowledgeable assistant. Use the following pieces of retrieved context to answer the question.
            If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

            Conversation history:
            {history}

            Context:
            {context}

            Question:
            {question}

            Answer:
        """)

        print("ChatBot initialized successfully!")    

    def format_docs(self, docs):
            """Format the retrieved documents into a single context string for the prompt."""
            return "\n\n".join(doc.page_content for doc in docs)
        
    def format_history(self):
        """Format the chat history into a string for inclusion in the prompt."""
        return "\n".join(
            f"Q: {item['question']}\nA: {item['answer']}" for item in self.chat_history[-3:]
        )

    def answer(self, question):
        """Generate an answer using the RAG chain."""
        rag_chain = (
            {
                "history": RunnableLambda(lambda _: self.format_history()), 
                "context": self.retriever | self.format_docs,
                "question": RunnablePassthrough()
            }
            | self.prompt
            | self.llm
            | RunnableLambda(lambda x: x.content) 
        )

        response = rag_chain.invoke(question)
        self.chat_history.append({"question": question, "answer": response})

        return response

#### Performance Testing Example

In [8]:
LLM_VERSION = "gpt-3.5-turbo"
bot = ChatBot_test(LLM_VERSION)

ChatBot initialized successfully!


In [9]:
import json

questions = []
ground_truths = []

with open('human_qa_dataset/qa_DB Concepts Chapter1.json') as json_file:
    json_data = json.load(json_file)

for qa in json_data:
    questions.append(qa["question"])
    ground_truths.append(qa["groudTruth"])

In [10]:
from datasets import Dataset
from ragas import EvaluationDataset

answers = []
contexts = []

CHROMA_PATH = os.path.join(os.getcwd(), f"data/{FILE_NAME}_{loaderType_dict[LOADER_TYPE]}")

vectorstore = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Inference
for query in questions:
  answers.append(bot.answer(query))
  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# To dict
data = {
    "user_input": questions,
    "reference": ground_truths,
    "response": answers,
    "retrieved_contexts": contexts
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

  from .autonotebook import tqdm as notebook_tqdm
  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


In [11]:
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness, context_recall, answer_correctness
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI

llm_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)

result = evaluate(
    eval_dataset, 
    metrics = [answer_relevancy, context_precision, faithfulness, context_recall, answer_correctness], 
    llm = llm_model, 
    embeddings=OpenAIEmbeddings(), 
    raise_exceptions=False
)

result.to_pandas()

Evaluating:  69%|██████▉   | 55/80 [00:26<00:12,  2.02it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 80/80 [00:36<00:00,  2.22it/s]


Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_relevancy,context_precision,faithfulness,context_recall,answer_correctness
0,What is definition of DBMS?,[CHAPTER1IntroductionAdatabase-management syst...,A database management system (DBMS) is a colle...,A database-management system (DBMS) is a colle...,0.883104,1.0,0.75,1.0,0.744397
1,Please tell me some representative examples of...,[of a query.Exercises1.7List four applications...,Some representative examples of Database appli...,Database can be used for Enterprise Informatio...,0.97221,0.0,0.333333,1.0,0.215097
2,What kinds of data storage is used in 1950s?,[data from tapesand card decks.•Late 1960s and...,"In the 1950s, magnetic tapes were developed fo...",Magnetic tapes were developed for data storage...,0.87669,0.266667,1.0,1.0,0.618255
3,Relational model concept is defined by whom? P...,[character) may be usedto delimit records. The...,The relational model concept is defined by Edg...,A landmark paper by Codd [1970] defined the re...,0.913699,1.0,0.0,1.0,0.521585
4,What is DML and DDL?,"[10Chapter 1Introductiondates. In practice, th...","DML stands for Data Manipulation Language, whi...",A data-manipulation language (DML) is a langua...,0.915184,1.0,0.8,1.0,0.702346
5,Please briefly introduce the disadvantages of ...,"[and add records to, the ap-propriate ﬁles. Be...",Managing structural data using a file system c...,File system may have data redundancy and incon...,0.933277,1.0,0.833333,0.333333,0.64891
6,Please tell me specifically about the atomity ...,[t o r e di nt h ed a t a b a s em u s ts a t ...,The atomicity problem in the file system refer...,"A computer system, like any other device, is s...",0.91162,1.0,0.333333,0.333333,0.562777
7,how database system hides certain details of h...,"[6Chapter 1IntroductionThese difﬁculties, amon...",A database system hides certain details of how...,Database system provides users with an abstrac...,0.979173,1.0,0.5,1.0,0.608909
8,Please briefly introduce the 3 levels of data ...,[1.3View of Data7view 1 view 2logicallevelphys...,The three levels of data abstraction are the l...,Database system hides the complexity from user...,0.915294,1.0,0.857143,1.0,0.739213
9,What is the characteristic of logical level in...,[at the logical level may involve complex phys...,The logical level in data abstraction shields ...,Logical level describes what data are stored i...,0.895995,1.0,0.6,1.0,0.449427


In [12]:
result.to_pandas().describe()

Unnamed: 0,answer_relevancy,context_precision,faithfulness,context_recall,answer_correctness
count,16.0,16.0,15.0,16.0,16.0
mean,0.867049,0.824583,0.594921,0.729167,0.505705
std,0.234533,0.369758,0.317563,0.425463,0.223965
min,0.0,0.0,0.0,0.0,0.187343
25%,0.892772,0.981667,0.416667,0.333333,0.229633
50%,0.915239,1.0,0.666667,1.0,0.569861
75%,0.956772,1.0,0.816667,1.0,0.662269
max,0.979173,1.0,1.0,1.0,0.850534
