In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

Initializing the GROQ API key that is stored in.env file

In [2]:
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')

Setting the LLM model which we want to use

In [3]:
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="llama3-70b-8192")

Initializing embeddings and model which we want to use

In [4]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


Specifying the directory where vector database is stored

In [7]:
persist_directory = "../RAG_multiple_vector_stores/paragraphs_chroma_db_MISQ"

Checking if the directory exists

In [8]:
if not os.path.exists(persist_directory):
    print("Persist directory does not exist.")
else:
    print("Persist directory exists.")

Persist directory exists.


Loading already created vector database. By specifying the embedding function, we are ensuring that the same model used to create the database is being used to query it.

In [9]:
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [10]:
num_documents = len(vectorstore)
print(f"Number of documents in the vector store: {num_documents}")

Number of documents in the vector store: 1829


Providing additional information about the metadata stored in paragraphs vector database for more precise filtering and details about the document description helps the chain understand the document's content.

In [11]:
metadata_field_info = [
    AttributeInfo(
        name="para_id",
        description="Paragraph ID of the section",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the article",
        type="string",
    ),
    AttributeInfo(
        name="last_section_title",
        description="Title of the section so it is connected to paragraph",
        type="string",
    ),
    AttributeInfo(
        name="ent_id",
        description="Entities mentioned in the paragraph",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="level3",
        description="More general entities mentioned in the paragraph",
        type="string or list[string]",
    )
]

document_content_description = "Brief summary of the article, including paragraph content and entities from the text"


Setting up the SelfQueryRetriever that is used for retriving documents from the vector database. In this chain we specify that we are using Llama 3 as our LLM model, which sees analyzes the prompt and structures it into a query, paragraphs_chroma_db as our vector database and provide also the additional attribute info about metadata and document description.

In [12]:
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

We also want to use the article_chroma_db so we repeat the same process

In [13]:
persist_directory2 = "../RAG_multiple_vector_stores/article_chroma_db_MISQ"

In [14]:
if not os.path.exists(persist_directory):
    print("Persist directory does not exist.")
else:
    print("Persist directory exists.")

Persist directory exists.


In [15]:
vectorstore2 = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

Providing additional information about the metadata stored in articles vector database for more precise filtering and details about the document description helps the chain understand the document's content.

In [16]:
metadata_field_info = [
    AttributeInfo(
        name="article_id",
        description="Article ID of the paper",
        type="string",
    ),
    AttributeInfo(
        name="authors",
                description="Authors of the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="year",
        description="Year the paper was published",
        type="integer",
    ),
    AttributeInfo(
        name="title",
        description="Title of the paper",
        type="string",
    ),
    AttributeInfo(
        name="journal",
        description="Journal where the paper was published",
        type="string",
    ),
    AttributeInfo(
        name="keywords",
        description="Keywords associated with the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="citation_count",
        description="Number of citations the paper has received",
        type="integer",
    )
]


document_content_description = "Brief summary of the article"

Also setting up the SelfQueryRetriever for articles vector database

In [17]:
retriever2 = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore2,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

Creating a custom prompt instructs the model to always respond in full sentences and to say "I don't know" if it doesn't know the answer. This approach prevents Llama 3 from generating random responses simply to fulfill the expectation of an answer.

In [31]:
custom_prompt_template = """Use the following pieces of information to answer the user's question accurately, only answear the information what I am asking about.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Provide a concise and relevant answer below:
"""

In [32]:
def set_custom_prompt():
    """
    Prompt template for QA retrieval for each vectorstore
    """
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])
    return prompt

prompt = set_custom_prompt()

Using the same prompt for both vector databses, so it applies to them both during the process.

In [20]:
prompt_articles = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])
prompt_paragraphs = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])

Setting up the retriver info, so based on the prompt we also know from which vector database we should retrive the information

In [21]:
retriever_infos = [
    {"name": "Articles", "description": "Contains metadata about articles such as title, authors, abstract, journal, keywords, year of publication, and citation count.", "retriever": retriever, "prompt": prompt_articles},
    {"name": "Paragraphs", "description": "Contains detailed sentences and paragraphs from articles", "retriever": retriever2, "prompt": prompt_paragraphs},
]

In [22]:
from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain

Setting up the MultiRetrievalQAChain, which creates a connection to both vector databases so we can retrive from them.

In [23]:
multi_retrieval_qa_chain = MultiRetrievalQAChain.from_retrievers(
    llm=chat,
    retriever_infos=retriever_infos,
    default_retriever=retriever2,
    default_prompt=prompt_articles
)

Testing:

In [38]:
query = "How was study conduted in article: The Integrative Framework of Technology Use: An Extension and Test."
results = multi_retrieval_qa_chain.run(query)
print(results)

The methodology used in the article is a two-wave panel model (2WPM).


It gives us quite short answear and does not provide any actual context.

In [36]:
query = "Was PLS used in An Alternative to Methodological Individualism: A Non-Reductionist Approach to Studying Technology Adoption by Groups"
results = multi_retrieval_qa_chain.run(query)
print(results)

I don't know. The provided context does not mention PLS (Partial Least Squares) or any articles that discuss its use in studying technology adoption by groups.


RAG does not provide correct answer. PLS study was used in this article.(based on MISQ IS Use Curation)

Lets try some quantitative questions:

In [34]:
query = "Give me how many articles were published in 2013 and also the names of these articles?"
results = multi_retrieval_qa_chain.run(query)
print(results)

OutputParserException: Parsing text
To structure the user's query, I'll need more information about the data source. Specifically, I need to know the attribute that represents the publication date of the articles.

Assuming there is an attribute called "publication_date" with a type of "date" and a description of "Date the article was published", I can structure the query as follows:

```json
{
    "query": "",
    "filter": "eq(\"publication_date\", \"2013-01-01\")"
}
```

This filter will select articles published on or after January 1, 2013, and before January 1, 2014. If you want to select articles published exactly in 2013, you can use the following filter:

```json
{
    "query": "",
    "filter": "and(gte(\"publication_date\", \"2013-01-01\"), lt(\"publication_date\", \"2014-01-01\"))"
}
```

Please note that the above filters assume the "publication_date" attribute exists in the data source. If the attribute has a different name or format, the filter should be adjusted accordingly.
 raised following error:
Received invalid attributes publication_date. Allowed attributes are ['para_id', 'title', 'last_section_title', 'ent_id', 'level3']

In [33]:
query = "Give me names of articles were written in year 2013?"
results = multi_retrieval_qa_chain.run(query)
print(results)

OutputParserException: Parsing text
To structure the user's query, I need more information about the date attribute in the data source. Assuming there is a "published_date" attribute with a "date" type, the structured request would be:

```json
{
    "query": "",
    "filter": "gte(\"published_date\", \"2013-01-01\") and lte(\"published_date\", \"2013-12-31\")"
}
```

If there is no "published_date" attribute, please provide more information about the data source, and I'll be happy to help.
 raised following error:
Unexpected token Token('CNAME', 'and') at line 1, column 37.
Expected one of: 
	* $END
Previous tokens: [Token('RPAR', ')')]


In [30]:
query = "How many articles had an author Ortiz de Guinea?"
results = multi_retrieval_qa_chain.run(query)
print(results)

OutputParserException: Parsing text
```json
{
    "query": "",
    "filter": "eq(\"author\", \"Ortiz de Guinea\")"
}
```

Note: Since the data source does not have an "author" attribute, I assume that the user meant to ask about the author of the article, which is not explicitly mentioned in the data source. If the data source had an "author" attribute, the above response would be correct.
 raised following error:
Received invalid attributes author. Allowed attributes are ['para_id', 'title', 'last_section_title', 'ent_id', 'level3']

It seems that we are not able to retrieve from the articles vector database, for which these questions are aimed at.

Although this approach allows us to combine multiple vector databases into a single chain, it appears to be underperforming based on the implementation. as a result, we were unable to resolve the errors that we have encountered with quantitative questions or the data hallucination issues with qualitative questions.