In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
# Load variables from a local .env file (if present) into the process
# environment, then read the Groq credential from there.
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')

# Fail here with an actionable message instead of letting a missing/empty key
# surface later as a client-construction or authentication error.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [3]:
# Groq-hosted chat model shared by the self-query retriever and the QA chain.
# temperature=0 keeps sampling as repeatable as the backend allows.
chat = ChatGroq(
    model_name="llama3-70b-8192",
    groq_api_key=api_key,
    temperature=0,
)

In [4]:
# Embedding function handed to the Chroma vector store.
# NOTE(review): presumably this must be the same model that was used when the
# persisted index was built — confirm against the ingestion code.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/paraphrase-MiniLM-L6-v2',
)

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [5]:
# Relative path to the on-disk Chroma database; it is resolved against the
# notebook's current working directory.
persist_directory = "../RAG_3_vectordb_3_separate codes/article_chroma_db"

# Chroma opens a brand-new, empty store when the directory does not exist,
# which would make every later retrieval silently return nothing. Stop early
# with a clear error instead.
if not os.path.isdir(persist_directory):
    raise FileNotFoundError(
        f"Chroma persist directory not found: {os.path.abspath(persist_directory)!r}. "
        "Build the vector store first or fix the path."
    )

vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [6]:
# (name, description, type) for every metadata attribute that is described to
# the self-query retriever.
_FIELD_SPECS = (
    ("article_id", "Article ID of the paper", "string"),
    ("authors", "Authors of the paper", "string or list[string]"),
    ("year", "Year the paper was published", "integer"),
    ("abstract", "Abstract of the article", "string"),
    ("title", "Title of the paper", "string"),
    ("keywords", "Keywords associated with the paper", "string or list[string]"),
    ("citation_count", "Number of citations the paper has received", "integer"),
)

metadata_field_info = [
    AttributeInfo(name=field_name, description=field_desc, type=field_type)
    for field_name, field_desc, field_type in _FIELD_SPECS
]

# One-line description of the stored documents, passed to the retriever
# alongside the field list.
document_content_description = "Provides information about article"

In [7]:
# Self-querying retriever: the chat model is given the document description and
# the metadata field list so it can build a structured query for the store.
# NOTE(review): no search_kwargs / enable_limit are passed, so the default
# number of returned documents applies — confirm that is enough for the
# counting-style questions asked later.
retriever = SelfQueryRetriever.from_llm(
    vectorstore=vectorstore,
    llm=chat,
    metadata_field_info=metadata_field_info,
    document_contents=document_content_description,
    verbose=True,
)

In [79]:
# Exercise the self-query retriever on its own (no answer generation) with a
# question that combines an author name and a publication year.
# NOTE(review): the recorded output is an empty list, and this cell's execution
# count (79) is out of sequence. Possibly the generated author filter does not
# match how author names are stored — verify after Restart & Run All.
query = "Article that was published by author Leonardi in year 2013."
retriever.get_relevant_documents(query)

[]

In [9]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables when the PromptTemplate is built;
# both must remain in the string.
custom_prompt_template = """Use the following pieces of information to answer the user's question. Always answer the question as if you were a human and answer in a full sentence. During your answer be really specific. If you don't know the answer, just say that you don't know, don't try to make up an answer.



Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [10]:
def set_custom_prompt():
    """Build the QA prompt.

    Returns:
        A PromptTemplate wrapping ``custom_prompt_template`` with the
        ``context`` and ``question`` input variables.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )


prompt = set_custom_prompt()

In [11]:
# Retrieval QA chain: documents from the self-query retriever are placed into
# the custom prompt ("stuff" chain type) and answered by the chat model.
# Source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=chat,
    chain_type='stuff',
    chain_type_kwargs={'prompt': prompt},
    return_source_documents=True,
)

In [52]:
# Counting question restricted to a single publication year.
# NOTE(review): the chain answers only from the documents the retriever
# returns, so the count may be bounded by how many documents are retrieved —
# verify against the store.
query = "How many articles were published in 2016"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to the provided information, two articles were published in 2016.


In [53]:
# Numeric comparison on the citation_count metadata field.
query = "Which article had citation count higher than 250"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Both articles, "A Multilevel Model of Resistance to Information Technology Implementation" and "Understanding User Responses to Information Technology: A Coping Model of User Adaptation", have a citation count higher than 250, with 296 and 299 citations, respectively.


In [54]:
# Asks about an article's sections; "sections" is not one of the declared
# metadata fields.
# NOTE(review): the recorded answer itself says most of the listed sections are
# not mentioned in the provided context — treat it as speculative.
query = "Which sections does article A Multilevel Model of Resistance to Information Technology Implementation have"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Article "A Multilevel Model of Resistance to Information Technology Implementation" has the following sections: Abstract, and possibly Introduction, Methodology, Results, Discussion, and Conclusion, although these latter sections are not explicitly mentioned in the provided context.


In [55]:
# Combined count + title listing for a single publication year.
# NOTE(review): the recorded count (4) may simply reflect the number of
# documents the retriever returned — confirm against the store.
query = "Give me how many articles were published in 2013 and also the names of these articles"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: There were 4 articles published in 2013. The names of these articles are:

1. "An Investigation of Information Systems Use Patterns: Technological Events as Triggers, the Effect of Time, and Consequences for Performance"
2. "A Dramaturgical Model of the Production of Performance Data"
3. "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances"
4. "The Embeddedness of Information Systems Habits in Organizational and Individual Level Routines: Development and Disruption"


In [56]:
# Keyword-based question: articles whose keywords mention "technology adoption".
query = "Give me number of all articles titles that have technology adoption mentioned in their keywords"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Two article titles have "technology adoption" mentioned in their keywords: "Revisiting Group-Based Technology Adoption as a Dynamic Process: The Role of Changing Attitude-Rationale Configurations" and "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances".


In [63]:
# Author-based counting question.
# NOTE(review): the recorded answer is "I don't know", while differently worded
# versions of this question in later cells return a count of 2 — the result
# appears sensitive to phrasing.
query = "How many articles were written by Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: I don't know how many articles were written by Ortiz de Guinea.


In [59]:
# Author-based title listing.
# NOTE(review): the titles in the recorded answer do not match the Ortiz de
# Guinea article title reported by the 2009 queries further down; the answer
# may be fabricated — inspect result["source_documents"] to verify.
query = "Give me titles of articles where the author was Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to my research, I found two article titles written by Ortiz de Guinea: "The Impact of User Experience on the Adoption of E-Learning Systems" and "Analysis of the Role of Trust in the Adoption of E-Banking: A Case Study in Spain".


In [60]:
# Rephrased author-count question (same intent as the earlier
# "How many articles were written by ..." cell, which recorded "I don't know").
query = "Give me the number of articles where the author was Ortiz de Guinea"
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to the provided context, there are 2 articles where the author was Ortiz de Guinea.


In [13]:
# Compound question: publication year combined with a citation_count threshold.
query = "Give me all articles that were published in 2009 and have citation count higher than 70."
result = qa({"query": query})
print("Answer:", result["result"])

Answer: According to the provided information, the article "Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use" by Ortiz de Guinea and Markus, published in 2009, has a citation count of 75, which meets the criteria of having a citation count higher than 70.


In [19]:
# Same 2009 / citation_count question, extended to also request the 2009
# articles that fall below the threshold.
# NOTE(review): the recorded answer lists a 67-citation article under the
# "higher than 70" heading; the two requests in one query may be conflicting.
query = "Give me all articles that were published in 2009 and have citation count higher than 70. If here are any more articles in 2009 and do not have citation count higher than 70, include them in the answer."
result = qa({"query": query})
print("Answer:", result["result"])

Answer: Based on the provided information, the articles that were published in 2009 and have a citation count higher than 70 are:

* "The Integrative Framework of Technology Use: An Extension and Test" by Kim, Sung S. with a citation count of 67 (although it doesn't meet the exact criteria, I'm including it since you asked for articles that don't meet the criteria as well)
* "Why Break the Habit of a Lifetime? Rethinking the Roles of Intention, Habit, and Emotion in Continuing Information Technology Use" by Ortiz de Guinea, Ana; Markus, M. Lynne with a citation count of 75.
