In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
load_dotenv()

True

In [3]:
api_key = os.getenv('GROQ_API_KEY')

In [4]:
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="llama3-70b-8192")

In [5]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [6]:
persist_directory = '../RAG_identical_metadata_page_content/all_info_in_page_content_chroma_db_MISQ'

In [7]:
if not os.path.exists(persist_directory):
    print("Persist directory does not exist.")
else:
    print("Persist directory exists.")

Persist directory exists.


In [8]:
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding_model)

In [9]:
num_documents = vectordb._collection.count()  
print(f"Number of documents in the vector store: {num_documents}")

Number of documents in the vector store: 1829


In [10]:
metadata_field_info = [
    AttributeInfo(
        name="article_id",
        description="id of an article",
        type="string",
    ),
    AttributeInfo(
        name="para_id",
        description="Paragraph ID of the section",
        type="string",
    ),
    AttributeInfo(
        name="last_section_title",
        description="Title of the section so multiple paragraphs can have same title section",
        type="string",
    ),
    AttributeInfo(
        name="ent_id",
        description="Entities mentioned in the paragraph",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="label",
        description="Labels associated with the paragraph",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="authors",
        description="Authors of the article",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="Year of publication",
        type="int",
    ),
    AttributeInfo(
        name="title",
        description="Title of the article",
        type="string",
    ),
    AttributeInfo(
        name="keywords",
        description="Keywords associated with the article",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="citation_count",
        description="Number of citations the article has received",
        type="int",
    )
]


In [11]:
document_content_description = "each document consists of a paragraph from an article"

In [12]:
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [13]:
query = "How is the data collection method in article named When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances."
results = retriever.get_relevant_documents(query)
print(results)

  warn_deprecated(


[Document(page_content='\n        Title: When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances\n        Authors: Leonardi, Paul M.\n        Year: 2013\n        Article ID: 7383\n        Paragraph ID: 7383_29\n        Last Section Title: Data Collection\n        Entity ID: data collection method, survey\n        Label: COLLECTION_METHOD\n        Keywords: Technology implementation, organizational change, advice networks, feature use, affordances, frames\n        Citation Count: 97\n        \n        Content:\n        I conducted field observations about three related activities: the work of crashworthiness engineers before CrashLab was implemented, the activities of developers, trainers, and managers during the implementation process, and the work of engineers after CrashLab was implemented. During the periods in which I was a resident in Safety I utilized three primary data sources: observations made of informants at 

In [14]:
query = "Give me names of articles were written in year 2013?"
retrieved_docs = retriever.get_relevant_documents(query)
for doc in retrieved_docs:
    print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}\n")

Content: 
        Title: A Dramaturgical Model of the Production of Performance Data
        Authors: Vieira da Cunha, João
        Year: 2013
        Article ID: 12686
        Paragraph ID: 12686_135
        Last Section Title: I feel like a captain in the eastern front [ in World
        Entity ID: armed conflict
        Label: TOPIC
        Keywords: Management information systems, production of performance data, performance monitoring, implementation of information technology, ethnography
        Citation Count: 2
        
        Content:
        War II] reporting to a faraway command center saying that everything is going as planned when, in reality, my soldiers are being slaughtered. 
        
Metadata: {'article_id': 12686, 'authors': 'Vieira da Cunha, João', 'citation_count': 2, 'ent_id': 'armed conflict', 'keywords': 'Management information systems, production of performance data, performance monitoring, implementation of information technology, ethnography', 'label': 'TOPIC'

In [62]:
source_documents = result.get('source_documents', [])
for doc in source_documents:
    print(doc.metadata)

{'article_id': 9464, 'authors': 'Ortiz de Guinea, Ana; Webster, Jane', 'citation_count': 63, 'ent_id': 'IS technology', 'keywords': 'Emotion, affect, behavior, cognition, performance, pattern, IS use, usage, heart rate, EKG, physiology, physiological arousal, automaticity, continuance, technological effects', 'label': 'TECHNOLOGY', 'last_section_title': 'IT Events', 'para_id': '9464_31', 'title': 'An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance', 'year': 2013}
{'article_id': 9957, 'authors': 'Polites, Greta L.; Karahanna, Elena', 'citation_count': 72, 'ent_id': 'construct, individual level, numbered hypothesis, theory of planned behavior, theory of reasoned action', 'keywords': 'IS habit, automaticity, organizational routines, cognitive scripts, environmental triggers, context change, habit disruption, incumbent system', 'label': 'LEVEL, MODEL_ELEMENT, THEORY', 'last_section_title': 'Task Defin

In [15]:
custom_prompt_template = """You will be provided conten information and also metadata.Use the following pieces of information to answer the user's question. If you don't know the answer, just say that you don't know, don't try to make up an answer. Only use information from the datasource.
Always provde the sentence from paragraph as an proof of your answer, if you are asked to. ALways try to be as specific as possible.
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [16]:
def set_custom_prompt():
    """
    Prompt template for QA retrieval for each vectorstore
    """
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])
    return prompt

prompt = set_custom_prompt()

In [20]:
qa = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True
)

In [32]:
query = "How is the data collection method in article named How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: The data collection method is not explicitly named in the article.


In [33]:
query = "Specify how was data collected in article named How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Data was collected through a structured questionnaire developed based on fieldwork and a review of prior studies, which was refined in pretests with five senior buyers in Alpha and five senior managers in supplier firms.

 Sentence as proof: "A structured questionnaire was developed based on the fieldwork and a review of prior studies."


In [34]:
query = "Which theory was collected in article named How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: The theory of learning and action that suggests actions in organizations can be categorized as either exploitation or exploration (March 1991).

 Sentence from paragraph as proof: "To this end, the paper draws from a theory of learning and action that suggests actions in organizations can be categorized as either exploitation or exploration (March 1991)."


In [21]:
query = "Give me names of articles were written in year 2013?"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Based on the provided context, there is only one article written in 2013:

1. "A Dramaturgical Model of the Production of Performance Data" by João Vieira da Cunha.
