# Prompt Engineering Tests with Metadata and Study Program-Specific Prompt

In [None]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.globals import set_verbose


import pandas as pd
pd.set_option('display.max_colwidth', None)

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# Open the Chroma vector store persisted on disk; OpenAI embeddings are used
# as the embedding function for this store.
persist_directory = './storage_scaled_w_metadata'
vectordb = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory=persist_directory,
)

In [None]:
# Prompt template for the Q&A chains.
# Placeholders: {context}, {study_program} and {question}. The study program
# is prepended to the question so the model answers for that program only.
# The model is asked to list the context first and then answer strictly from
# it, in a fixed "context: / answer:" format.
template = """

Use the following pieces of context to answer the question at the end.

Execute these steps:
1 - list the context
2 - focus on words like "optional" or "can" for your answer
3 - answer the question. Do not use information outside of the context to answer the question.

Your answer should have this format:

context:
answer:

------------------------
Context: {context}

Question: I am studying in the {study_program} program. {question}

"""

# Turn the raw string into a PromptTemplate; its variables are taken from the
# {placeholders} in the template text.
custom_prompt = PromptTemplate.from_template(template)

In [None]:
# Build one RetrievalQA chain per study program. Every chain uses the same
# model settings and retriever configuration; only the prompt differs, with
# the {study_program} placeholder pre-filled for that program.

study_programs = [
    "B.Sc. Business Informatics",
    "M.Sc. Business Informatics",
    "B.Sc. Mathematics in Business and Economics",
    "M.Sc. Mathematics in Business and Economics",
    "Mannheim Master in Data Science"
]

qa_chains = {}
for program in study_programs:
    # Bind the study program now; the remaining prompt variables are left
    # for the chain to fill in at query time.
    program_prompt = custom_prompt.partial(study_program=program)

    qa_chains[program] = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),
        retriever=vectordb.as_retriever(search_kwargs={'k': 5}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": program_prompt},
    )

In [None]:
# Read the semicolon-separated file with the test questions and exemplary
# answers, then preview its first rows.
test_questions = pd.read_csv("TestQuestions.csv", delimiter=";")

test_questions.head(5)

In [None]:
# test...

#set_verbose(True)

#qa_chain = qa_chains["B.Sc. Business Informatics"]
#
#q = test_questions.iloc[0, 0]
#
#r = qa_chain({'query': q})
#print(r['result'])
#
#print('\nSources:')
#for source_doc in r['source_documents']:
#    print(source_doc)
#    print('====================\n')

The following code cell iterates over the test questions and asks each one five times, once per study program. Each time, a different Q&A chain is used whose prompt is tailored to the respective study program.

In [None]:
# Ask every test question once per study program and collect the results.
#
# Output: `df_responses`, one row per question, with the columns
# "Question" followed by "Response <program>" and "Source <program>" for each
# entry of `study_programs` (in list order).
df_questions = pd.read_csv('TestQuestions.csv', delimiter=";")
questions = df_questions["Question"]

# Path prefix stripped from each source document's "source" metadata so that
# only the part after it is stored.
scraped_data_prefix = './data/scraped_data/'

responses = []

for counter, q in enumerate(questions):
    # Progress output: "q<N> start (0... 1... ... )" then "q<N> end".
    print(f'q{counter} start (', end="")

    # Alternating response / source values, in `study_programs` order, so
    # they line up with the column names built below.
    r_and_s = []

    for i, program in enumerate(study_programs):
        print(f"{i}... ", end="")

        # Run the chain whose prompt is bound to this study program.
        result_object = qa_chains[program]({'query': q})
        r = result_object['result']

        # Store the retrieved sources as one comma-separated string; a raw
        # Python list would be written to CSV as its repr.
        source = ",".join(
            doc.metadata["source"].replace(scraped_data_prefix, '')
            for doc in result_object['source_documents']
        )

        r_and_s.append(r)
        r_and_s.append(source)

    # Build the row: the question followed by all response/source pairs.
    responses.append([q] + r_and_s)

    # Report the same index that was announced in the "start" message.
    print(f')\nq{counter} end')

columns = ["Question"]
for program in study_programs:
    columns += [f"Response {program}", f"Source {program}"]

df_responses = pd.DataFrame(responses, columns=columns)

In [None]:
# Display the collected responses (one row per test question).
df_responses

In [None]:
# Persist the collected responses as CSV.
from pathlib import Path

output_path = Path("data/test_responses/test_responses_metadata_study_program_specific_prompt.csv")

# to_csv does not create missing directories; make sure the target folder
# exists so the save cannot fail after all chains have already been run.
output_path.parent.mkdir(parents=True, exist_ok=True)

df_responses.to_csv(output_path)