In [None]:
import yaml,os
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import Chroma

In [None]:
# Read the API credentials from the local YAML file into a plain mapping.
# NOTE: this used yaml.load(..., Loader=yaml.FullLoader) before. safe_load is
# PyYAML's recommended entry point for files that only hold plain data
# (mappings, lists, strings, numbers), which is all a credentials file needs.
with open("cadentials.yaml") as f:
    cadentials = yaml.safe_load(f)

In [None]:
# Export the key from the loaded YAML mapping as an environment variable for
# this kernel process; the ChatOpenAI cell below reads it back from os.environ.
os.environ['OPENAI_API_KEY'] = cadentials['OPENAI_API_KEY']

In [None]:
# Chat model used to answer questions. The API key is taken from the
# OPENAI_API_KEY environment variable, and replies are capped at 500 tokens.
chat_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=500,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# Embedding model (run on CPU) handed to the Chroma vector store further down,
# both when the index is created and when it is reloaded.
# NOTE(review): normalize_embeddings is off. BGE models are usually used with
# normalized vectors for cosine similarity; confirm this is intentional. Any
# index already persisted was presumably built with this setting, so changing
# it would require rebuilding that index.
llm_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": False},
    model_kwargs={"device": "cpu"},
)

In [None]:
# Collect the PDF files matched by the glob pattern inside data/ and parse
# each one with PyPDFLoader.
loader = DirectoryLoader(
    "data/",
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)

# Load the documents as pages.
documents = loader.load()

In [None]:
# Chunking policy: pieces of at most 1000 units, where each piece starts by
# repeating the last 200 units of the previous one so context carries over.
# With no length_function given, the splitter measures size in characters
# (not tokens).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

In [None]:
# Split the loaded documents into chunks with the configured splitter.
texts=text_splitter.split_documents(documents)
# Last expression of the cell: shows how many chunks were produced.
len(texts)

In [None]:
# Reuse the persisted Chroma store when its directory is already on disk;
# otherwise build it from the chunked documents and persist it under db/00.
if os.path.exists("./db/00/"):
    print("Loading the DB")
    vector_db = Chroma(
        embedding_function=llm_embeddings,
        persist_directory="db/00",
    )
else:
    print("Creating the DB")
    vector_db = Chroma.from_documents(
        documents=texts,
        embedding=llm_embeddings,
        persist_directory="db/00",
    )

In [None]:
def build_conversation(vector_db):
    """Create a retrieval chat chain over ``vector_db`` with a fresh history.

    Args:
        vector_db: Vector store whose ``as_retriever()`` result is used as the
            chain's retriever.

    Returns:
        The ``ConversationalRetrievalChain`` built from the notebook-level
        ``chat_llm``, a new ``ConversationBufferMemory`` keyed as
        ``chat_history``, and the store's retriever.
    """
    # A new buffer per call, so every chain starts with an empty conversation.
    chat_history = ConversationBufferMemory(
        return_messages=True,
        memory_key="chat_history",
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        memory=chat_history,
        retriever=vector_db.as_retriever(),
    )

In [None]:
# Build the chain once; the cells below all call this same object, and its
# memory is what print_memory reads at the end of the notebook.
conversational_chain=build_conversation(vector_db)

In [None]:
# Send a plain greeting through the chain as a first call.
conversational_chain.run("Hi !")

In [18]:
# Ask a question about support hours; the recorded output below answers it
# with the Crosser Support weekday help times.
conversational_chain.run("what is regular help time?")

'The regular help time provided by Crosser Support is from 8.00 to 17.00 CET on weekdays, excluding local public holidays.'

In [16]:
# Ask about an unrelated topic; the recorded output below shows the chain
# replying that it has no information on it.
conversational_chain.run("what is university of moratuwa?")

"I don't have information about the University of Moratuwa."

In [None]:
def print_memory(conversational_chain):
    """Render the chain's stored chat history as a "User"/"Bot" transcript.

    Despite the name, nothing is printed; the transcript is returned.

    Args:
        conversational_chain: Object exposing ``memory.chat_memory.messages``,
            a sequence of messages that each have a ``content`` attribute and,
            optionally, a ``type`` attribute ("human" or "ai").

    Returns:
        A string with one newline-terminated line per message, in order,
        formatted as ``User :<content>`` or ``Bot :<content>``. Empty string
        when the history holds no messages.
    """
    lines = []
    messages = conversational_chain.memory.chat_memory.messages
    for idx, element in enumerate(messages):
        # Prefer the message's own role tag so labels stay correct even when
        # the history does not strictly alternate user/bot (for example two
        # consecutive bot messages). Position parity is only the fallback for
        # messages without a recognised tag.
        role = getattr(element, "type", None)
        if role == "human":
            is_user = True
        elif role == "ai":
            is_user = False
        else:
            is_user = idx % 2 == 0
        speaker = "User" if is_user else "Bot"
        lines.append(f"{speaker} :{element.content}\n")
    return "".join(lines)

In [None]:
# print_memory returns the transcript as a string; print it so the newlines
# render as separate lines.
print(print_memory(conversational_chain))