# LangChain Chatbot

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

import os
os.environ['OPENAI_API_KEY'] = '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'

In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import UnstructuredPDFLoader
# from langchain_community.document_loaders import UnstructuredPDFLoader
# from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate
import pytesseract

In [3]:
import langchain
langchain.debug = False

In [4]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

## load data and setup vector store

In [5]:
documents = []
for file in os.listdir('data/scraped_data'):
    if file.endswith('.pdf'):
        pdf_path = './data/scraped_data/' + file
        print(f'Loading {pdf_path}')
        loader = UnstructuredPDFLoader(pdf_path)
        documents.extend(loader.load())
    elif file.endswith('.html'):
        doc_path = './data/scraped_data/' + file
        print(f'Loading {doc_path}')
        loader = UnstructuredHTMLLoader(doc_path)
        documents.extend(loader.load())

Loading ./data/scraped_data/101123_MK_B.Sc._Wima_2023_2024._Appendix.pdf
Loading ./data/scraped_data/111www_uni-mannheim_de_en_academics_advice-and-services.html
Loading ./data/scraped_data/131123_MK_M.Sc._Wima___Mathe_2023_2024._Appendix.pdf
Loading ./data/scraped_data/160407_Praesentation_Wima_Master.pdf
Loading ./data/scraped_data/2019_MasterInfoPra__si.pdf
Loading ./data/scraped_data/20230329-Lesefassung_ErpSatzung_ePruefungen_veroeffentlicht.pdf
Loading ./data/scraped_data/310823_MK_M.Sc._Wima___Mathe__2023_24.pdf
Loading ./data/scraped_data/Aktualisiert_Stundenplan_MMDS_HWS2023.pdf
Loading ./data/scraped_data/Antrag_Anerkennung_B.Sc.Wifo_Stand_2023.pdf
Loading ./data/scraped_data/Antrag_Anerkennung_M.Sc.Wifo_MMDS_2023.pdf
Loading ./data/scraped_data/Antrag_auf_Rueckerstattung.pdf
Loading ./data/scraped_data/Antrag_auf_Verlaengerung_der_Bearbeitungszeit_fuer_die_Abschlussarbeit.pdf
Loading ./data/scraped_data/Antrag_Befreiung_neu.pdf
Loading ./data/scraped_data/Antrag_Modulwechsel

Loading ./data/scraped_data/Stundenplan_LAG_FSS20_Sem04.pdf
Loading ./data/scraped_data/Stundenplan_LAG_FSS20_Sem06.pdf
Loading ./data/scraped_data/Stundenplan_LAG_HWS22_Sem01.pdf
Loading ./data/scraped_data/Stundenplan_LAG_HWS22_Sem03_27092022.pdf
Loading ./data/scraped_data/Stundenplan_LAG_HWS22_Sem05_27092022.pdf
Loading ./data/scraped_data/Stundenplan_MSc_Wifo_HWS2023.pdf
Loading ./data/scraped_data/Stundenplan_Wifo_HWS23_Sem01.pdf
Loading ./data/scraped_data/Stundenplan_Wifo_HWS23_Sem03.pdf
Loading ./data/scraped_data/Stundenplan_Wifo_HWS23_Sem05.pdf
Loading ./data/scraped_data/Stundenplan_Wima_FSS23_Wahlpflichtveranstaltungen.pdf
Loading ./data/scraped_data/Stundenplan_Wima_HWS23_Wahlpflichtveranstaltungen.pdf
Loading ./data/scraped_data/stuo_mmm_doppelabschlussprogramm_2satzung_en.pdf
Loading ./data/scraped_data/unbedenklichkeitsbescheinigung.pdf
Loading ./data/scraped_data/unbedenklichkeitsbescheinigung_en.pdf
Loading ./data/scraped_data/vollmacht_vordruck.pdf
Loading ./data/sc

Loading ./data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation_rueckmeldung.html
Loading ./data/scraped_data/www.uni-mannheim.de_studium_termine_semesterzeiten.html
Loading ./data/scraped_data/www.uni-mannheim.de_studium_vom-ausland-nach-mannheim_internationale-vollzeitstudierende_vor-der-anreise_studiengebuehren-fuer-internationale-studierende.html
Loading ./data/scraped_data/www.wim.uni-mannheim.de_en_academics_contact-and-advising.html
Loading ./data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies.html
Loading ./data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics.html
Loading ./data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics_extension-of-deadlines.html
Loading ./data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics_general-questions.html
Loading ./data/scraped_data/www.wim.uni-m

In [6]:
# split documents into text chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunked_documents = text_splitter.split_documents(documents)

In [6]:
# create chroma vector db with OpenAIEmbeddings
persist_directory = './storage_scaled_up'

if not os.listdir(persist_directory):

    vectordb = Chroma.from_documents(
      chunked_documents,
      embedding=OpenAIEmbeddings(),
      persist_directory=persist_directory
    )

    vectordb.persist()

else:
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=OpenAIEmbeddings())
    

  warn_deprecated(


### create QA chain

In [15]:
# Create Prompt
template = """

Use the following pieces of context to answer the question at the end.

Execute these steps:
1 - list the context
2 - focus on words like "optional" or "can" for your answer
3 - answer the question. Do not use information outside of the context to answer the question.

Your answer should have this format:

context:
answer:

------------------------
Context: {context}

Question: {question}

"""

custom_prompt = PromptTemplate.from_template(template)

In [16]:
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),
    retriever=vectordb.as_retriever(search_kwargs={'k': 5}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt}
)

# Tests
### read questions and answer

In [21]:
df_questions = pd.read_csv('TestQuestions.csv', delimiter=";", names=["Question", "Response"] )
questions = df_questions["Question"]

responses = []
counter = 0

for q in questions:
    print(f'q{counter} start')
    
    # get result
    result_object = qa_chain({'query': q})
    r = result_object['result']
    
    # get source documents
    source_docs = result_object['source_documents']
    sources = []
    for doc in source_docs:
        sources.append(doc.metadata["source"].replace('./data/scraped_data/', ''))
    source = ",".join(sources)
    
    # build row
    responses.append((q, r, source))
    
    print(f'q{counter} end')
    counter += 1

df_responses = pd.DataFrame(responses, columns=["Question", "Response", "Source"])

q0 start
q0 end
q1 start
q1 end
q2 start
q2 end
q3 start
q3 end
q4 start
q4 end
q5 start
q5 end
q6 start
q6 end
q7 start
q7 end
q8 start
q8 end
q9 start
q9 end
q10 start
q10 end
q11 start
q11 end
q12 start
q12 end
q13 start
q13 end
q14 start
q14 end
q15 start
q15 end
q16 start
q16 end
q17 start
q17 end
q18 start
q18 end
q19 start
q19 end
q20 start
q20 end
q21 start
q21 end
q22 start
q22 end
q23 start
q23 end


In [22]:
df_responses

Unnamed: 0,Question,Response,Source
0,Question,context: Recognition of coursework and examinations\n\nanswer: The question is about the recognition of coursework and examinations.,"www.wim.uni-mannheim.de_studium_studienorganisation_b-sc-wirtschaftsinformatik.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics_recognition-of-coursework-and-examinations.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_recognition-of-coursework-and-examinations.html,www.wim.uni-mannheim.de_studium_studienorganisation_b-sc-wirtschaftsinformatik_anerkennung-von-pruefungsleistungen.html"
1,Can I change any modules?,"Context: Can I change any modules?Please note that only students enrolled before spring 2020 can change modules.Requests to the examination committee must be submitted in writing. Please make sure to indicate your address, student ID number and e-mail address in case we need to contact you for follow-up questions. We also ask you to state the semester in which you started your master's program so that we know straight away which examination regulations apply to you.Changing a module you have not passed yetIf you have chosen the wrong module, you have the possibility to request a module change. The request must be submitted to the responsible examination committee in writing using the following form:Module Change (PDF)​​​​​​Module Change (Word)​Please bear in mind that any unsuccessful examination attempts will be transferred from the original module to the new module!Please also note that a module change does not affect the requirements defined in your examination regulations. In particular,\n\nAnswer: Only students enrolled before spring 2020 can change modules.","www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html"
2,How am I assigned a team project?,"Context: (6) 1The student is responsible for registering for each attempt at the examination Team Project with the examination committee in due time; this also applies if an attempt has been deemed not taken. 2Once a student has been assigned to a Team Project by the examination committee, the student is admitted to the examination. 3The students shall be granted the chance to put forward suggestions for assignments. 4However, no legal right arises to be assigned to the suggested Team Project.\n\n(5) 1The student is responsible for registering for each attempt at the examination Team Project with the ex- amination committee in due time before the start of the preparation period for the Team Project and within the registration period set by the examination committee; this also applies if an attempt has been deemed not taken. ²Students must be granted the chance to make suggestions which Team Project they want to be assigned to; however, no legal right arises to be assigned to the suggested Team Project. 3Once a student has been assigned to a Team Project, the student is admitted to the examination. 4The tasks to be completed by individual students for the written work of the Team Project must be assigned in agreement with the examiner. 5The date of the examination will be set by the examiner in consultation with the group.\n\ngroup. 5If multiple students register for the same new Team Project in due time, their registrations are considered in chronological order by the date of reception. If a student is admitted to a new Team Project in the same semester, the same duration of the preparation period applies to the student as applies to the group members assigned to the Team Project according to subsection 6; the preparation period cannot be extended due to a swap.\n\n(6) 1The student is responsible for registering with the examination committee in due time for each attempt at the examination Team Project. This also applies if an attempt has been deemed not taken. 2Once the student has been assigned to a Team Project by the chair of the examination committee, the student is admitted to the examination. 3The students shall be granted the chance to put forward suggestions for assignments. 4However, no legal right arises to be assigned to the suggested Team Project.\n\nof the current group. 5If multiple students register for the same new Team Project in due time, their registrations are considered in chronological order by the date of receipt. 6If a student is admitted to a new Team Project in the same semester, the same duration of the preparation period applies to the student as applies to the group members assigned to the Team Project according to subsection 6; the preparation period cannot be extended due to a swap.\n\nAnswer: The student is responsible for registering for each attempt at the examination Team Project with the examination committee in due time. Once the student has been assigned to a Team Project by the examination committee, the student is admitted to the examination. The students have the chance to put forward suggestions for assignments, but there is no legal right to be assigned to the suggested Team Project.","PO_MSc_Wifo_2018_1Satzung_EN.pdf,PO_MSc_MMDS_2020_EN.pdf,PO_MSc_MMDS_2017_2Satzung_EN.pdf,PO_MSc_MMDS_2017_2Satzung_EN.pdf,PO_MSc_MMDS_2020_EN.pdf"
3,Do I need to finish all of the exams before starting my master thesis?,"Context: (5) 1The time to complete the master’s thesis totals four months. ²The preparation period starts with the examination committee’s approval of the topic. 3Either the master’s thesis is registered after completing the courses of the first se- mester or after completing the courses of the second semester. 4The examination committee decides on the registration periods; the office of the examination committee communicates the deadlines. 5Sections 22 and 23 remain unaffected.\n\nAnswer: No, you do not need to finish all of the exams before starting your master thesis. The master's thesis can be registered either after completing the courses of the first semester or after completing the courses of the second semester.","www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html,PO_MCBL_2021_en.pdf"
4,Is it necessary to complete all my exams prior to beginning my master's thesis?,"Context: (5) 1The time to complete the master’s thesis totals four months. ²The preparation period starts with the examination committee’s approval of the topic. 3Either the master’s thesis is registered after completing the courses of the first se- mester or after completing the courses of the second semester. 4The examination committee decides on the registration periods; the office of the examination committee communicates the deadlines. 5Sections 22 and 23 remain unaffected.\n\nAnswer: No, it is not necessary to complete all exams prior to beginning the master's thesis. The master's thesis can be registered either after completing the courses of the first semester or after completing the courses of the second semester.","www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html,PO_MCBL_2021_en.pdf"
5,Can I start my master thesis even though I haven't done all of my exams?,"Context: Do I need to finish all of my exams before starting my master's thesis? Although some chairs might verify whether you have completed all your examinations, you are formally only required to accumulate at least 60 ECTS credits before starting your thesis. Nevertheless, we strongly advise you to finish all your examinations first. Writing a master's thesis is a full-time job, and it takes a lot of coordination and organization to take exams and write a thesis in parallel.\n\nAnswer: Yes, you can start your master thesis even though you haven't done all of your exams.","www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html,PO_MA_PolSci_Soc_2023_en.pdf"
6,How come I'm still enrolled in the initial exam despite switching modules?,"Context: Why am I still registered for the original exam after a module change? In case you are still registered for the original module after the exam registration period has ended, please contact Student Services I.\n\nAnswer: You will remain registered for the original module until the end of the exam registration period.","www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_general-questions.html"
7,How can I cheat on the exam?,Context: Part 4: Violation of the Examination Regulations\n\nSection 35 – Cheating and other Misconduct\n\nAnswer: Cheating on the exam is not allowed and is considered a violation of the examination regulations.,"PO_MSc_Wifo_2018_1Satzung_EN.pdf,PO_MSc_MMDS_2017_2Satzung_EN.pdf,PO_MSc_MMDS_2020_EN.pdf,PO_MCBL_2021_en.pdf,PO_MA_Sowi_PolSci_Soc_2017_en.pdf"
8,What are the prerequisites in order to study the master of political science?,Context: § 4 Zugangs- und Zulassungsvoraussetzungen\n\nAnswer: \n- A successful completion of a bachelor's degree in political science or an equivalent degree recognized by the selection committee from a university in Germany or abroad or from a state-recognized vocational academy.\n- The bachelor's degree must consist of at least 180 ECTS credits or a standard period of study of at least 6 semesters or 3 years.\n- The bachelor's degree must have a minimum overall grade of 2.5.,"satzung_ma_politik.pdf,PO_MA_Sowi_PolSci_Soc_2017_neu_2019_en.pdf,PO_MA_Sowi_PolSci_Soc_2017_en.pdf,PO_MA_Sowi_PolSci_Soc_2017_en.pdf,PO_MA_Sowi_PolSci_Soc_2017_neu_2019_en.pdf"
9,What is the worst course in the Master of Data Science Program?,"Context: The Mannheim Master in Data Science (MMDS) program covers six major areas: Fundamentals, Data Management, Data Analytics, Responsible Data Science, Projects and Seminars, and the Master’s Thesis. \n\nAnswer: The context does not provide any information about which course is the worst in the Master of Data Science Program.","www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science.html,www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_extension-of-deadlines.html,www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_recognition-of-coursework-and-examinations.html,www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_extension-of-deadlines.html,www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_learning-agreements.html"


### save responses

In [23]:
df_responses.to_csv("test_responses_scaled_w_source.csv", sep=";")