In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')

In [3]:
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="llama3-70b-8192")

In [4]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [5]:
persist_directory = "../RAG_3_vectordb_3_separate codes/article_chroma_db"
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [6]:
metadata_field_info = [
    AttributeInfo(
        name="article_id",
        description="Article ID of the paper",
        type="string",
    ),
    AttributeInfo(
        name="authors",
                description="Authors of the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="year",
        description="Year the paper was published",
        type="integer",
    ),
    AttributeInfo(
        name="abstract",
        description="Abstract of the article",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the paper",
        type="string",
    ),
    AttributeInfo(
        name="keywords",
        description="Keywords associated with the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="citation_count",
        description="Number of citations the paper has received",
        type="integer",
    )
]

document_content_description = "Provides information about article"

In [12]:
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True
)

In [30]:

import logging
from langchain.callbacks.base import BaseCallbackHandler

In [42]:
query = "Article that was published in year 2013 and citation count higher than 90."
retriever.get_relevant_documents(query)

2024-07-21 23:30:08,088 - groq._base_client - DEBUG - Request options: {'method': 'post', 'url': '/openai/v1/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):

query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(opera

[Document(page_content="Title: When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances Authors: Leonardi, Paul M. Publication Year: 2013 Abstract: The goal of this study is to augment explanations of how newly implemented technologies enable network change within organizations with an understanding of when such change is likely to happen. Drawing on the emerging literature on technology affordances, the paper suggests that informal network change within interdependent organizational groups is unlikely to occur until users converge on a shared appropriation of the new technology's features such that the affordances the technology enables are jointly realized. In making the argument for the importance of shared affordances, this paper suggests that group-level network change has its most profound implications at the organization level when individuals use the same subset of a new information technology's features. To expl

In [39]:
class FilteredLogHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        if 'Generated Query' in log_entry:
            # Print only the relevant part of the log entry
            print(log_entry.split('Generated Query: ')[1])

# Configure the logging to use the custom handler
logger = logging.getLogger('langchain.retrievers.self_query.base')
logger.setLevel(logging.INFO)

# Create and add the custom handler
handler = FilteredLogHandler()
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

In [41]:
query = "Article that was published in year 2013 and citation count higher than 90."

# Execute the query
documents = retriever.get_relevant_documents(query, config={'callbacks': [ConsoleCallbackHandler()]})

2024-07-21 23:23:31,421 - groq._base_client - DEBUG - Request options: {'method': 'post', 'url': '/openai/v1/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\n- `attr` (string):

query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(opera

In [44]:
custom_prompt_template = """Use the following pieces of information to answer the user's question. Always answear the question as if you were a human and answear in full sentance. During your answear be really specific. If you don't know the answer, just say that you don't know, don't try to make up an answer.



Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [45]:
def set_custom_prompt():
    """
    Prompt template for QA retrieval for each vectorstore
    """
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])
    return prompt

prompt = set_custom_prompt()

In [46]:
qa = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': prompt}
)

In [47]:
query = "Article that was published in year 2013 and citation count higher than 90"
result = qa({"query": query})
print("Answer:", result["result"])

  warn_deprecated(
2024-07-22 00:13:50,977 - groq._base_client - DEBUG - Request options: {'method': 'post', 'url': '/openai/v1/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte): comparator\

query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2013), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='citation_count', value=90)]) limit=None
query=' ' filter=Operation(opera

2024-07-22 00:13:52,626 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Sun, 21 Jul 2024 22:13:52 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'Cache-Control', b'private, max-age=0, no-store, no-cache, must-revalidate'), (b'vary', b'Origin'), (b'x-ratelimit-limit-requests', b'14400'), (b'x-ratelimit-limit-tokens', b'6000'), (b'x-ratelimit-remaining-requests', b'14398'), (b'x-ratelimit-remaining-tokens', b'4329'), (b'x-ratelimit-reset-requests', b'11.064999999s'), (b'x-ratelimit-reset-tokens', b'16.701999999s'), (b'x-request-id', b'req_01j3bnqf6pefw8gtrn8yyx46ab'), (b'via', b'1.1 google'), (b'alt-svc', b'h3=":443"; ma=86400'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Server', b'cloudflare'), (b'CF-RAY', b'8a6e8a672f14bfdc-WAW'), (b'Content-Encoding', b'gzip')])
2024-07-22 00:13:52,630 - httpx - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat

Answer: The article "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances" by Paul M. Leonardi, published in 2013, has a citation count of 97, which meets the specified criteria.


In [53]:
query = "Which article had citation count higher than 250"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Both articles, "A Multilevel Model of Resistance to Information Technology Implementation" and "Understanding User Responses to Information Technology: A Coping Model of User Adaptation", have a citation count higher than 250, with 296 and 299 citations, respectively.


In [54]:
query = "Which sections does article A Multilevel Model of Resistance to Information Technology Implementation have"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Article "A Multilevel Model of Resistance to Information Technology Implementation" has the following sections: Abstract, and possibly Introduction, Methodology, Results, Discussion, and Conclusion, although these latter sections are not explicitly mentioned in the provided context.


In [55]:
query = "Give me how many articles were published in 2013 and also the names of these articles"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: There were 4 articles published in 2013. The names of these articles are:

1. "An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance"
2. "A Dramaturgical Model of the Production of Performance Data"
3. "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances"
4. "The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption"


In [56]:
query = "Give me number of all articles titles that have technology adoption mentioned in their keywords"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Two article titles have "technology adoption" mentioned in their keywords: "Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations" and "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances".


In [63]:
query = "How many articles were written by Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: I don't know how many articles were written by Ortiz de Guinea.


In [59]:
query = "Give me titles of articles where the author was Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to my research, I found two article titles written by Ortiz de Guinea: "The Impact of User Experience on the Adoption of E-Learning Systems" and "Analysis of the Role of Trust in the Adoption of E-Banking: A Case Study in Spain".


In [60]:
query = "Give me the number of articles where the author was Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to the provided context, there are 2 articles where the author was Ortiz de Guinea.


In [13]:
query = "Give me all articles that were published in 2009 and have citation count higher than 70."
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to the provided information, the article "Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use" by Ortiz de Guinea and Markus, published in 2009, has a citation count of 75, which meets the criteria of having a citation count higher than 70.


In [19]:
query = "Give me all articles that were published in 2009 and have citation count higher than 70. If here are any more articles in 2009 and do not have citation count higher than 70, include them in the answer."
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Based on the provided information, the articles that were published in 2009 and have a citation count higher than 70 are:

* "The Integrative Framework of Technology Use: An Extension and Test" by Kim, Sung S. with a citation count of 67 (although it doesn't meet the exact criteria, I'm including it since you asked for articles that don't meet the criteria as well)
* "Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use" by Ortiz de Guinea, Ana; Markus, M. Lynne with a citation count of 75.


In [11]:
query = "In which year was the article When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances written"
result = qa({"query": query})
print("Answer:", result["result"])

  warn_deprecated(


Answer: The article "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances" was written in 2013.


In [14]:
query = "Which key words does article: How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships? have"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: The article "How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?" has the following keywords: Buyer-supplier relationships, inter-organizational systems (IOS), EDI, supply chain management systems (SCMS), transaction cost economics, intangible asset specificity, IT use, exploration, and exploitation.


In [16]:
query = "Give me all the articles were published in 2007, but also include their title, authors and citation count"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Here are the articles published in 2007, along with their title, authors, and citation count:

"How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance" by Limayem, Moez; Hirt, Sabine Gabriele; Cheung, Christy M. K. with a citation count of 240.

"Toward a Deeper Understanding of System Usage in Organizations: A Multilevel Perspective" by Burton-Jones, Andrew; Gallivan, Michael J. with a citation count of 0.


In [None]:
query = "Give me all the articles were published in 2007, but also include their title, authors and citation count"
result = qa({"query": query})
print("Answer:", result["result"])