In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
# Load variables from a local .env file (if any) and read the Groq API key.
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')
if not api_key:
    # os.getenv returns None when the variable is missing; stop here with a
    # clear message instead of handing None to the chat client.
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your .env file or the environment."
    )

In [3]:
# Chat model client used for both query construction and answer generation.
chat = ChatGroq(
    model_name="llama3-70b-8192",
    groq_api_key=api_key,
    temperature=0,
)

In [4]:
# Embedding model that is passed to the vector store as its embedding function.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/paraphrase-MiniLM-L6-v2',
)

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [5]:
# Location of the persisted Chroma collection. The original local path is kept
# as the fallback so existing runs behave the same; set CHROMA_PERSIST_DIR to
# point the notebook at a copy of the database on another machine.
persist_directory = os.getenv(
    "CHROMA_PERSIST_DIR",
    r"C:\Users\andyu\OneDrive\Počítač\Text, Web and Social Media Analytics Lab\Rag_project\RAG_3_vectordb_3_separate codes\article_chroma_db",
)
if not os.path.isdir(persist_directory):
    # This notebook only reads from the store; a missing directory would
    # otherwise surface later as queries that find no documents.
    raise FileNotFoundError(
        f"Chroma persist directory not found: {persist_directory!r}. "
        "Set CHROMA_PERSIST_DIR to the folder that holds the article database."
    )
# Attach Chroma to that directory, embedding queries with `embedding_model`.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [7]:
# Metadata schema handed to the self-query retriever: one AttributeInfo per
# field, built from (name, description, type) triples in their original order.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("article_id", "Article ID of the paper", "string"),
        ("authors", "Authors of the paper", "string or list[string]"),
        ("year", "Year the paper was published", "integer"),
        ("title", "Title of the paper", "string"),
        ("keywords", "Keywords associated with the paper", "string or list[string]"),
        ("citation_count", "Number of citations the paper has received", "integer"),
    )
]

# Short description of what each stored document contains.
document_content_description = "Provides information about article"

In [8]:
# Build the self-query retriever from the chat model, the vector store, and
# the content description / metadata schema declared for the collection.
retriever = SelfQueryRetriever.from_llm(
    verbose=True,
    metadata_field_info=metadata_field_info,
    document_contents=document_content_description,
    vectorstore=vectorstore,
    llm=chat,
)

In [9]:
# Example question; the retriever call is the cell's last expression, so the
# documents it returns are shown as the cell output.
query = "How many articles were published in 2016"
# `get_relevant_documents` emits a deprecation warning; `invoke` is its replacement.
retriever.invoke(query)

  warn_deprecated(


[Document(page_content='Article ID: 9183 Authors: Nevo, Saggi; Nevo, Dorit; Pinsonneault, Alain Publication Year: 2016 Title: A Temporally Situated Self-Agency Theory of Information Technology Reinvention Journal: Management Information Systems Quarterly Abstract: Our knowledge of how users reinvent information technologies (IT) in ways that depart from their intended purposes to achieve new goals is relatively limited. Drawing on a human agency theory that situates actors in the flow of time, this paper develops a theory of IT reinvention. It identifies the key subprocesses of IT reinvention, describes two patterns of reinvention (performance-oriented and mastery-oriented), and explains how the present and the past influence the ambiguities, demands, and dilemmas inherent to each pattern. The outcomes associated with each pattern of IT reinvention are also discussed. The paper provides the theoretical foundations to understand how users reinvent IT as well as new insights into a broad

In [10]:

# Prompt text with two placeholders, {context} and {question}, filled with str.format.
# NOTE(review): "sentance" is a typo in the prompt; it is left untouched here so
# the text sent to the model stays exactly the same.
custom_prompt_template = (
    "Use the following pieces of information to answer the user's question. "
    "Always answer the question as if you were a human and in full sentance. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Only use information from the datasource.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return the helpful answer below and nothing else.\n"
    "Helpful answer:\n"
)

def generate_fluent_output(query, retriever, chat, custom_prompt_template):
    """Answer `query` from retrieved documents and print the model's reply.

    Args:
        query: The user's question as a string.
        retriever: Object whose `invoke(query)` returns documents exposing
            `page_content` and `metadata`.
        chat: Chat model whose `invoke(messages)` returns an object with a
            `content` attribute.
        custom_prompt_template: Format string with `{context}` and
            `{question}` placeholders.

    Returns:
        None. The reply text is printed to stdout.

    NOTE(review): the model only sees the documents the retriever returns, so
    counting questions are answered over that subset — confirm the retriever's
    result limit before relying on such counts.
    """
    # `invoke` replaces the deprecated `get_relevant_documents`.
    results = retriever.invoke(query)

    # Each document contributes its text followed by its metadata dict.
    combined_content = "\n\n".join(
        f"{doc.page_content}\nMetadata: {doc.metadata}" for doc in results
    )

    formatted_prompt = custom_prompt_template.format(context=combined_content, question=query)

    # `invoke` replaces the deprecated direct call `chat(messages)`.
    response = chat.invoke([HumanMessage(content=formatted_prompt)])

    print(response.content)


In [11]:
# Question to ask about the article collection.
# NOTE(review): the model only sees the documents the retriever returns, so a
# "how many" answer counts those documents — confirm against the full store.
query = "How many articles had an author Ortiz de Guinea?"

# Retrieve context, prompt the chat model, and print its reply.
generate_fluent_output(
    query=query,
    retriever=retriever,
    chat=chat,
    custom_prompt_template=custom_prompt_template,
)


  warn_deprecated(


According to the data, 2 articles had an author named Ortiz de Guinea.


In [12]:
# Keyword check for a single article, identified by its title in the question.
query = "Does this article has technology adoptation in keywords: The Integrative Framework of Technology Use: An Extension and Test?"

# Retrieve context, prompt the chat model, and print its reply.
generate_fluent_output(
    query=query,
    retriever=retriever,
    chat=chat,
    custom_prompt_template=custom_prompt_template,
)

No, the article does not have "technology adoption" in its keywords, but it does have "technology use" and "continued use".


In [12]:
# Count question filtered by publication year.
# NOTE(review): the reply is derived only from the documents the retriever
# returns, so the number may reflect the retriever's result limit rather than
# the whole collection — verify against the full store.
query = "How many articles were published in year 2013?"

# Retrieve context, prompt the chat model, and print its reply.
generate_fluent_output(
    query=query,
    retriever=retriever,
    chat=chat,
    custom_prompt_template=custom_prompt_template,
)

4
