# LangChain Chatbot

In [1]:
import os
from getpass import getpass

import pandas as pd

# Show full cell contents so long questions/answers are not truncated when
# DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# SECURITY: never commit API keys to a notebook. Any key that was previously
# hardcoded in this cell must be treated as leaked and revoked/rotated.
# The key is taken from the environment; if it is not set, prompt for it
# without echoing so it never ends up in the saved notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import UnstructuredPDFLoader
# from langchain_community.document_loaders import UnstructuredPDFLoader
# from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate
import pytesseract

In [3]:
import langchain
# Explicitly keep LangChain's module-level debug flag switched off for this
# notebook (flip to True while troubleshooting a chain).
langchain.debug = False

In [None]:
# Point pytesseract at the Tesseract OCR binary without tying the notebook to
# one machine. Resolution order:
#   1. TESSERACT_CMD environment variable (explicit override)
#   2. `tesseract` found on PATH
#   3. the default Windows install location (previous hardcoded value)
import shutil

pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD')
    or shutil.which('tesseract')
    or r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

## Load data and set up vector store

In [None]:
# Load every scraped PDF / HTML file into a flat list of LangChain documents.
#
# The path is built as '<data_dir><file>' with a forward slash on purpose:
# the resulting string is stored as each document's source and the
# './data/scraped_data/' prefix is stripped again when reporting sources.
data_dir = './data/scraped_data/'

# File suffix -> loader class; files with any other suffix are skipped.
loaders_by_suffix = {
    '.pdf': UnstructuredPDFLoader,
    '.html': UnstructuredHTMLLoader,
}

documents = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the document list (and everything built from it) reproducible across runs.
for file in sorted(os.listdir(data_dir)):
    lowered = file.lower()  # match '.PDF' / '.HTML' as well
    loader_cls = next(
        (cls for suffix, cls in loaders_by_suffix.items() if lowered.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        continue

    doc_path = data_dir + file
    print(f'Loading {doc_path}')
    loader = loader_cls(doc_path)
    documents.extend(loader.load())

In [None]:
# Break the loaded documents into overlapping text chunks: at most 1000
# characters per chunk, with 200 characters shared between neighbours.
splitter_settings = {'chunk_size': 1000, 'chunk_overlap': 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunked_documents = text_splitter.split_documents(documents)

In [4]:
# Create (or reopen) the Chroma vector DB backed by OpenAI embeddings.
persist_directory = './storage_scaled_up'
embeddings = OpenAIEmbeddings()

# A store counts as existing only if the directory is there AND non-empty.
# Checking isdir first avoids the FileNotFoundError that a bare
# os.listdir() raises on a fresh checkout where the directory is missing.
store_exists = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))

if store_exists:
    # Reuse the persisted index; no documents need to be loaded or embedded.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    # First run: embed the chunks and write the index to disk.
    # Requires `chunked_documents`, i.e. the load/split cells must have run.
    vectordb = Chroma.from_documents(
        chunked_documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    

  warn_deprecated(


### create QA chain

In [23]:
# Create Prompt
# The template asks the model to (a) request the study program if the question
# does not name one, (b) list the supplied context, and (c) answer strictly
# from that context. {context} and {question} are the placeholders in the text.
# NOTE(review): a later cell assigns `template` / `custom_prompt` again with a
# nearly identical text; whichever cell ran last is the prompt in effect.
template = """

If the question does not contain a study program, say that you need more information about the study program to answer the question.

Use the following pieces of context to answer the question at the end. 

Execute these steps:
1 - list the context 
2 - focus on words like "optional" or "can" for your answer
3 - answer the question. Do not use information outside of the context to answer the question

Your answer should have this format:

context:
answer:
------------------------
Context: {context}

Question: {question}

"""

# Wrap the raw text in a LangChain PromptTemplate for use by the QA chain.
custom_prompt = PromptTemplate.from_template(template)

In [17]:
# Create Prompt
# NOTE(review): this cell reassigns `template` and `custom_prompt`, which an
# earlier cell already defines with almost the same text (this version differs
# only in punctuation/whitespace). On a top-to-bottom run this definition is
# the one that wins — confirm which variant is intended and drop the other.
template = """

If the question does not contain a study program, say that you need more information about the study program to answer the question.

Use the following pieces of context to answer the question at the end.

Execute these steps:
1 - list the context
2 - focus on words like "optional" or "can" for your answer
3 - answer the question. Do not use information outside of the context to answer the question.

Your answer should have this format:

context:
answer:

------------------------
Context: {context}

Question: {question}

"""

# Wrap the raw text in a LangChain PromptTemplate for use by the QA chain.
custom_prompt = PromptTemplate.from_template(template)

In [24]:
# Assemble the retrieval QA chain: a deterministic (temperature 0) chat model
# answers from the 5 chunks the vector store returns for each query, using the
# custom prompt; the retrieved source documents are returned with the answer.
chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)
top_k_retriever = vectordb.as_retriever(search_kwargs={'k': 5})

qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=top_k_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt},
)

# Tests
### read questions and answer

In [25]:
# Spot check (English): fundamental courses in the data science master.
q = "I am studying the master of data science, do I have to do any fundamental courses?"

r = qa_chain(dict(query=q))
print(r["result"])

# To inspect the retrieved chunks as well, loop over r["source_documents"]
# and print each document.

Retrying langchain_community.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


Context: This degree program provides students with a solid theoretical foundation as well as practical skills for data management, data analytics methods and responsible data science. The courses are divided into two groups – fundamental courses and advanced courses. After studying optional fundamental courses in computer science and empirical social sciences, in their advanced courses students can focus on the concepts and methods of computers science and advanced empirical methods and the application of these methods. In addition to the regular lecture courses, students participate in a one or two semester team project or individual project.

During their studies -

(MK1) all students develop a deep understanding of the relevant concepts, methods and problem-solving strategies used in different application domains.

(MK2) technology-oriented students learn the concepts, algorithms and strategies

Answer: Yes, as a student studying the master of data science, you have the option to s

In [None]:
# Spot check (German): key qualifications in the business informatics bachelor.
q = "Ich studiere den Bachelor in Wirtschaftsinformatik. Brauche ich Schlüsselqualifikationen?"

r = qa_chain(dict(query=q))
print(r["result"])

# To inspect the retrieved chunks as well, loop over r["source_documents"]
# and print each document.

In [None]:
# Spot check (German): question that names a program but no degree level.
# Fixed the misspelled program name ("Wirtschafstmathematik"); the query text
# is embedded for retrieval, so the typo could skew which chunks are matched.
q = "Ich studiere Wirtschaftsmathematik. Brauche ich Schlüsselqualifikationen?"

r = qa_chain({'query': q})
print(r['result'])

# To inspect the retrieved chunks as well, loop over r['source_documents']
# and print each document.

In [None]:
# Spot check (English): fundamental courses in the business informatics master.
q = "I am studying the master of business informatics, do I have to do any fundamental courses?"

r = qa_chain(dict(query=q))
print(r["result"])

# To inspect the retrieved chunks as well, loop over r["source_documents"]
# and print each document.

In [None]:
# Run every question of the test set through the QA chain and collect one
# (question, answer, comma-separated source files) row per question.
# The CSV is ';'-separated and its column names are supplied explicitly.
df_questions = pd.read_csv('TestQuestions.csv', delimiter=";", names=["Question", "Response"])
questions = df_questions["Question"]

# Prefix added when the documents were loaded; stripped so only file names
# are reported.
source_prefix = './data/scraped_data/'

responses = []

for counter, q in enumerate(questions):
    print(f'q{counter} start')

    result_object = qa_chain({'query': q})

    # File names of the chunks the retriever handed to the model.
    source = ",".join(
        doc.metadata["source"].replace(source_prefix, '')
        for doc in result_object['source_documents']
    )

    responses.append((q, result_object['result'], source))

    print(f'q{counter} end')

df_responses = pd.DataFrame(responses, columns=["Question", "Response", "Source"])

In [None]:
# Show the collected question / response / source table as the cell output.
df_responses

### save responses

In [None]:
# Persist the results as a ';'-separated CSV in the working directory.
# NOTE(review): `index` is not passed, so pandas' default index handling
# applies — confirm whether the row-index column is wanted in the file.
df_responses.to_csv("test_responses_scaled_w_source.csv", sep=";")