In [21]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain
from langchain.prompts import PromptTemplate

Loading our LLM model. We are using Llama 3, which we are able to access through Groq with their API key.

In [2]:
# Load variables from a local .env file into the environment so the
# os.getenv('GROQ_API_KEY') lookup in the next cell can find the key.
load_dotenv()

True

In [3]:
# Groq credential taken from the environment; this is None when the variable is unset.
api_key = os.environ.get('GROQ_API_KEY')

Setting model to Llama3

In [4]:
# Chat model served by Groq; temperature is fixed at 0 for this notebook.
chat = ChatGroq(
    model_name="llama3-70b-8192",
    groq_api_key=api_key,
    temperature=0,
)

In [5]:
# Embedding function handed to both Chroma stores below.
# NOTE(review): presumably the same model that was used when the stores were built - confirm.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/paraphrase-MiniLM-L6-v2',
)

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


Setting the directories where we have stored our vector databases, and making sure that each directory exists. (In previous attempts we experienced problems with setting the directories, so this is just a safety measure.)

In [8]:
# Relative path of the article-level Chroma store.
persist_directory1 = "../RAG_multiple_vector_stores/article_chroma_db_MISQ"

In [9]:
# The store is passed to Chroma as a persist *directory*, so test for a
# directory specifically: a stray regular file with the same name must not be
# reported as an existing store.
if os.path.isdir(persist_directory1):
    print("Persist directory exists.")
else:
    print("Persist directory does not exist.")

Persist directory exists.


In [12]:
# Relative path of the sentence-level Chroma store.
persist_directory2 = "../RAG_multiple_vector_stores/sentence_chroma_db_MISQ"

In [13]:
# The store is passed to Chroma as a persist *directory*, so test for a
# directory specifically: a stray regular file with the same name must not be
# reported as an existing store.
if os.path.isdir(persist_directory2):
    print("Persist directory exists.")
else:
    print("Persist directory does not exist.")

Persist directory exists.


Loading the created vector databases

In [15]:
# Open the article-level store from disk, using embedding_model as its embedding function.
vectordb_articles = Chroma(
    persist_directory=persist_directory1,
    embedding_function=embedding_model,
)

In [16]:
# Open the sentence-level store from disk, using embedding_model as its embedding function.
vectordb_sentences = Chroma(
    persist_directory=persist_directory2,
    embedding_function=embedding_model,
)

Setting them up as retrievers so we can use them in the QA chain as the source of our information

In [17]:
# One retriever per store, both created with as_retriever()'s default arguments.
retriever_articles, retriever_sentences = (
    vectordb_articles.as_retriever(),
    vectordb_sentences.as_retriever(),
)

In [18]:
# First draft of the QA prompt. The cell that follows reassigns
# custom_prompt_template, so this wording is superseded on a top-to-bottom run.
# {context} and {question} are the two placeholders filled in by PromptTemplate.
custom_prompt_template = (
    "Use the following pieces of information to answer the user's question.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return the helpful answer below and nothing else.\n"
    "Helpful answer:\n"
)

In [37]:
# Revised QA prompt used for the router chain's per-retriever prompts.
# The wording (misspelling included) is reproduced verbatim because the text is
# sent to the model exactly as written.
custom_prompt_template = "\n".join([
    "Use the following pieces of information to answer the user's question accurately, only answear the information what I am asking about.",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
    "",
    "Context: {context}",
    "Question: {question}",
    "",
    "Provide a concise and relevant answer below:",
    "",  # yields the trailing newline of the original triple-quoted string
])

In [38]:
# Two separate PromptTemplate objects built from the same template text,
# one for each retriever entry.
prompt_articles = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)
prompt_sentences = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)

Creating information about the retrievers to distinguish them

In [39]:
# First version of the retriever descriptions; retriever_infos is reassigned
# in the following cell with reworded descriptions.
retriever_infos = [
    dict(
        name="Articles",
        description="Information about articles such as name, authors, abstract, journal the article was published in,keywords from the article, year published, and citation count.",
        retriever=retriever_articles,
        prompt=prompt_articles,
    ),
    dict(
        name="Sentences",
        description="Provides information about title, sentences in the article, and names of sections and sentence type under the sentence falls.",
        retriever=retriever_sentences,
        prompt=prompt_sentences,
    ),
]

In [47]:
# One entry per retriever: a name, a free-text description, the retriever
# itself and the prompt paired with it. This list is what the
# MultiRetrievalQAChain cell below receives as retriever_infos.
retriever_infos = [
    dict(
        name="Articles",
        description="Contains metadata about articles such as title, authors, abstract, journal, keywords, year of publication, and citation count.",
        retriever=retriever_articles,
        prompt=prompt_articles,
    ),
    dict(
        name="Sentences",
        description="Contains detailed sentences and paragraphs from articles, including titles, section names, and sentence types.",
        retriever=retriever_sentences,
        prompt=prompt_sentences,
    ),
]

Setting up a MultiRetrievalQAChain that uses our two retrievers

In [48]:
# Build the multi-retriever QA chain from the two described retrievers,
# with the article retriever and its prompt supplied as the defaults.
multi_retrieval_qa_chain = MultiRetrievalQAChain.from_retrievers(
    retriever_infos=retriever_infos,
    default_prompt=prompt_articles,
    default_retriever=retriever_articles,
    llm=chat,
)

In [49]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "Which article name has the highest citation count."
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: The article with the highest citation count is "Lapointe, Liette; Rivard, Suzanne. 2005. A Multilevel Model of Resistance to Information Technology Implementation" with 296 citations.


In [46]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "Which article has a citation count of 299"
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: According to the provided context, the article with a citation count of 296 (not 299) is:

7061 Lapointe, Liette; Rivard, Suzanne 2005 A Multilevel Model of Resistance to Information Technology Implementation Management Information Systems Quarterly


In [42]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "How many articles were published in 2005."
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: According to the provided context, one article was published in the year 2005.


In [30]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "Which articles have citation count of 0, can be more of them"
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: According to the provided context, the article with a citation count of 0 is:

1658 Burton-Jones, Andrew; Gallivan, Michael J. 2007 Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective Management Information Systems Quarterly


All of the answers are wrong, so let's try a different approach with an Ensemble Retriever

In [50]:
from langchain.retrievers import EnsembleRetriever

In [51]:
# Combine the article and sentence retrievers into one retriever.
# Equal weights are given to both; adjust the weights as needed.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[retriever_articles, retriever_sentences],
)

In [52]:
# Prompt text for the ensemble-based chain; {context} and {question} are the
# two placeholders filled in by PromptTemplate.
custom_prompt_template = (
    "Use the following pieces of information to answer the user's question accurately.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Provide a concise and relevant answer below:\n"
)

In [53]:
# Single prompt object used by the ensemble-based chain.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=custom_prompt_template,
)

In [54]:
# Rebuild the chain around the ensemble retriever. This reassigns
# multi_retrieval_qa_chain, so the query cells that follow use this version.
# The ensemble retriever is both the only listed retriever and the default.
multi_retrieval_qa_chain = MultiRetrievalQAChain.from_retrievers(
    retriever_infos=[
        dict(
            name="Ensemble",
            description="Combines results from article and sentence retrievers.",
            retriever=ensemble_retriever,
            prompt=prompt,
        ),
    ],
    default_prompt=prompt,
    default_retriever=ensemble_retriever,
    llm=chat,
)

In [57]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "Which title name has the highest citation count."
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: Based on the provided contexts, the title with the highest citation count is "A Multilevel Model of Resistance to Information Technology Implementation" with 296 citations.


In [56]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "Which article has 299 citation."
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: According to the provided context, the article "A Multilevel Model of Resistance to Information Technology Implementation" by Lapointe and Rivard (2005) has the most citations, with 296 citations.


In [58]:
# Send the question to the chain under its "input" key and print the "result" field.
query = "how many ciatation does article named Understanding User Responses to Information Technology: A Coping Model of User Adaptation has."
response = multi_retrieval_qa_chain.invoke(dict(input=query))
print("Answer:", response["result"])

Answer: The article "Understanding User Responses to Information Technology: A Coping Model of User Adaptation" has 299 citations.
