# LangChain Chatbot (scaled up document base)

In [1]:
import os

import pandas as pd
# Never truncate column contents when a DataFrame is displayed, so long
# questions and answers can be read in full.
pd.set_option('display.max_colwidth', None)

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv().
# NOTE(review): presumably this is where the OpenAI API key comes from - confirm.
# Assigning to `_` keeps the call's return value out of the cell output.
_ = load_dotenv(find_dotenv())

In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain import PromptTemplate

In [3]:
import langchain
# Explicitly switch LangChain's global debug flag off; set it to True while troubleshooting.
langchain.debug = False

## Load data and set up vector store

In [None]:
# Load every scraped PDF and HTML file into a flat list of LangChain documents.

# The test loop further down strips exactly this prefix from each document's
# "source" metadata, so keep the string form unchanged.
scraped_data_dir = './data/scraped_data/'

# File suffix -> loader class. Supporting another format only needs a new entry.
loaders_by_suffix = {
    '.pdf': UnstructuredPDFLoader,
    '.html': UnstructuredHTMLLoader,
}

documents = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# (and therefore chunk) order reproducible between runs and machines.
for file in sorted(os.listdir(scraped_data_dir)):
    loader_cls = next(
        (cls for suffix, cls in loaders_by_suffix.items() if file.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        # Neither a PDF nor an HTML file: skipped
        continue

    doc_path = scraped_data_dir + file
    print(f'Loading {doc_path}')
    loader = loader_cls(doc_path)
    documents.extend(loader.load())

In [None]:
# Cut the loaded documents into overlapping text chunks:
# chunk_size=1000 with an overlap of 200 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunked_documents = text_splitter.split_documents(documents)

In [None]:
# Build the Chroma vector store with OpenAI embeddings, or reopen it when it
# has already been persisted to disk by an earlier run.
persist_directory = './storage_scaled_up'
embedding_function = OpenAIEmbeddings()

# Make sure the directory exists before inspecting it: on a fresh checkout
# os.listdir would otherwise raise FileNotFoundError.
os.makedirs(persist_directory, exist_ok=True)

if not os.listdir(persist_directory):
    # Nothing persisted yet: embed all chunks and write the index to disk
    vectordb = Chroma.from_documents(
        chunked_documents,
        embedding=embedding_function,
        persist_directory=persist_directory,
    )
    vectordb.persist()
else:
    # A persisted index exists: reuse it instead of embedding everything again
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function,
    )
    

### create QA chain

In [17]:
# Prompt for the QA chain.
# The text tells the model to ask for the study program when the question does
# not name one, to answer only from the supplied context, and to reply in a
# fixed "context: / answer:" layout.
# It has two placeholders, {context} and {question}.
template = """

If the question does not contain a study program, say that you need more information about the study program to answer the question.

Use the following pieces of context to answer the question at the end.

Execute these steps:
1 - list the context
2 - focus on words like "optional" or "can" for your answer
3 - answer the question. Do not use information outside of the context to answer the question.

Your answer should have this format:

context:
answer:

------------------------
Context: {context}

Question: {question}

"""

# Wrap the raw string in a PromptTemplate so it can be handed to the QA chain as its prompt.
custom_prompt = PromptTemplate.from_template(template)

In [24]:
# Assemble the retrieval QA chain from its parts.

# Chat model that writes the answers (temperature 0)
chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

# Retriever over the vector store, configured with k=5 results per query
top_k_retriever = vectordb.as_retriever(search_kwargs={'k': 5})

# NOTE(review): no chain_type is passed, so the library default applies.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=top_k_retriever,
    # The test loop reads result['source_documents'], so keep this enabled
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt},
)

# Tests
### Read questions and generate answers

In [None]:
# Run every test question through the QA chain and collect, per question,
# the generated answer and the files its context was retrieved from.

# NOTE(review): names= is given without header=, so pandas treats the first
# line of the file as data - confirm TestQuestions.csv has no header row.
df_questions = pd.read_csv('TestQuestions.csv', delimiter=";", names=["Question", "Response"])
questions = df_questions["Question"]

responses = []

# enumerate supplies the progress number, so no hand-maintained counter is needed
for counter, question in enumerate(questions):
    print(f'q{counter} start')

    # Ask the chain; the result holds the answer text and the retrieved documents
    result_object = qa_chain({'query': question})
    answer = result_object['result']

    # Strip the data-directory prefix so only the file names remain
    sources = [
        doc.metadata["source"].replace('./data/scraped_data/', '')
        for doc in result_object['source_documents']
    ]

    # One row per question: (question, answer, comma-separated source files)
    responses.append((question, answer, ",".join(sources)))

    print(f'q{counter} end')

df_responses = pd.DataFrame(responses, columns=["Question", "Response", "Source"])

In [None]:
# Display the collected results (Question, Response, Source) for manual review
df_responses

### save responses

In [None]:
# Uncomment to save the responses as a semicolon-separated CSV file:
# df_responses.to_csv("test_responses_scaled_w_source.csv", sep=";")