In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')

In [3]:
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="llama3-70b-8192")

In [4]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [71]:
persist_directory = "../Rag_project/RAG_multiple_vector_stores/paragraphs_chroma_db_MISQ"

In [72]:
if not os.path.exists(persist_directory):
    print("Persist directory does not exist.")
else:
    print("Persist directory exists.")

Persist directory exists.


In [73]:
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [74]:
num_documents = len(vectorstore)
print(f"Number of documents in the vector store: {num_documents}")

Number of documents in the vector store: 1829


In [75]:
# Updated metadata field info to match the new column names
metadata_field_info = [
    AttributeInfo(
        name="para_id",
        description="Paragraph ID of the section",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the article",
        type="string",
    ),
    AttributeInfo(
        name="last_section_title",
        description="Title of the section so it is connected to paragraph",
        type="string",
    ),
    AttributeInfo(
        name="ent_id",
        description="Entities mentioned in the paragraph",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="level3",
        description="More general entities mentioned in the paragraph",
        type="string or list[string]",
    )
]

# Description of the document content
document_content_description = "Brief summary of the article, including paragraph content and entities from the text"


In [76]:
# Initialize the SelfQueryRetriever
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [77]:
persist_directory2 = r"C:\Users\andyu\OneDrive\Počítač\Text, Web and Social Media Analytics Lab\Rag_project\RAG_multiple_vector_stores\article_chroma_db_MISQ"
vectorstore2 = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [78]:
metadata_field_info = [
    AttributeInfo(
        name="article_id",
        description="Article ID of the paper",
        type="string",
    ),
    AttributeInfo(
        name="authors",
                description="Authors of the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="year",
        description="Year the paper was published",
        type="integer",
    ),
    AttributeInfo(
        name="title",
        description="Title of the paper",
        type="string",
    ),
    AttributeInfo(
        name="journal",
        description="Journal where the paper was published",
        type="string",
    ),
    AttributeInfo(
        name="keywords",
        description="Keywords associated with the paper",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="citation_count",
        description="Number of citations the paper has received",
        type="integer",
    )
]

# Description of the document content
document_content_description = "Brief summary of the article"

In [79]:
# Initialize the SelfQueryRetriever
retriever2 = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore2,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [80]:
custom_prompt_template = """Use the following pieces of information to answer the user's question accurately, only answear the information what I am asking about.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Provide a concise and relevant answer below:
"""

In [81]:
prompt_articles = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])
prompt_paragraphs = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])

In [82]:
retriever_infos = [
    {"name": "Articles", "description": "Contains metadata about articles such as title, authors, abstract, journal, keywords, year of publication, and citation count.", "retriever": retriever, "prompt": prompt_articles},
    {"name": "Paragraphs", "description": "Contains detailed sentences and paragraphs from articles", "retriever": retriever2, "prompt": prompt_paragraphs},
]

In [83]:
from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain

In [84]:
multi_retrieval_qa_chain = MultiRetrievalQAChain.from_retrievers(
    llm=chat,
    retriever_infos=retriever_infos,
    default_retriever=retriever2,
    default_prompt=prompt_articles
)

In [85]:
query = "What is the sction title How Genre Rules in Instant Messaging and Discussion Forum Affect Use about from the article:Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance"
results = retriever.get_relevant_documents(query)

In [86]:
# Print results
if not results:
    print("No results found.")
else:
    for result in results:
        print(result.page_content)
        print(result.metadata)

Our focus is on how genre rules affect the balance of task performance activities versus non-task social-relationship activities when using discussion forum (DF) and instant messaging (IM). We believe that the subtle differences in form and substance between the two tools may have powerful effects on their use. For many users, these subtle differences result in the development of different genre rules for each tool. 
{'ent_id': 'online chat, social media, task productivity', 'last_section_title': 'How Genre Rules in Instant Messaging and Discussion Forum Affect Use', 'level_3': 'IS technology, IS topic', 'para_id': '926_39', 'title': 'Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance'}
In the sections that follow, we first define and explain genre rules, and then examine the nature of IM and DF to consider how the differences between these tools may influence genre rule development. Then we argue that in habit

In [87]:
# Assuming the setup and initialization are already done and the necessary imports are available

# Define the custom prompt template
custom_prompt_template = """Use the following pieces of information to answer the user's question. Always answer the question as if you were a human and in full sentance. If you don't know the answer, just say that you don't know, don't try to make up an answer. Only use information from the datasource.
Always provde the sentence from paragraph as an proof of your answer, if you are asked to. 
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Function to generate the fluent output using the LLM
def generate_fluent_output(query, retriever, chat, custom_prompt_template):
    results = retriever.get_relevant_documents(query)
    
    # Combine the contents and metadata of the results
    combined_content = "\n\n".join([f"{doc.page_content}\nMetadata: {doc.metadata}" for doc in results])
    
    # Create a prompt for the LLM to transform the output
    formatted_prompt = custom_prompt_template.format(context=combined_content, question=query)
    messages = [
        HumanMessage(content=formatted_prompt)
    ]
    
    # Get the LLM response
    response = chat(messages)
    fluent_output = response.content
    
    # Print the fluent output
    print(fluent_output)


In [88]:
# Your query
query = "What is the sction title How Genre Rules in Instant Messaging and Discussion Forum Affect Use about from the article:Nature and Nurture: The Impact of Automaticity and the Structuration of Communication on Virtual Team Behavior and Performance"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)


The section title "How Genre Rules in Instant Messaging and Discussion Forum Affect Use" is about how the subtle differences in form and substance between instant messaging (IM) and discussion forum (DF) may have powerful effects on their use, specifically on the balance of task performance activities versus non-task social-relationship activities.


In [89]:
query = "What and how is the data collection method in article named When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances?"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

The data collection method in the article "When Does Technology Use Enable Network Change in Organizations? A Comparative Study of Feature Use and Shared Affordances" is a mixed-methods approach, combining both qualitative and quantitative data.


In [90]:
query = "How many people participated in a study from article: Predicting Different Conceptualizations of System Use: The Competing Roles of Behavioral Intention, Facilitating Conditions, and Behavioral Expectation"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

Of the 918 total employees in the organization, 720 participated in the study, with 321 providing usable responses at all 5 points of measurement.


In [91]:
query = "What was the data collection process in article:How Habit Limits the Predictive Power of Intention: The Case of Information Systems Continuance"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

The data collection process involved three rounds of administering questionnaires to business students at a university in Hong Kong. The purpose of round 1 was to assess various factors such as perceived usefulness, confirmation, satisfaction, IS continuance intention, habit, usage comprehensiveness, and frequency of prior behavior. Rounds 2 and 3 measured the students' continued WWW usage. As proof, the sentence from the paragraph is: "As shown in Figure 2, the data collection involved three rounds."


In [92]:
query = "What is the contribution to uderstanding IT of this article named:How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?, generalize your answear a little bit"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

This article contributes to the understanding of IT by highlighting the role of relationship-specific assets in the dynamics of value creation and value retention in contexts of IT-mediated buyer-supplier interactions, and how suppliers can benefit from IT use in supply chain relationships by combining it with investments in relationship-specific intangible assets, which enables them to retain some of the benefits created by IT use.


In [93]:
query = "How use was studied or conceptualized of this article named:How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?, generalize your answear a little bit"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

The use of Information Technology (IT) in this article was studied and conceptualized through the lens of affordances, appropriation, and exploitation/exploration. Specifically, the article suggests that IT can provide multiple affordances to users, which can be interpreted and used in different ways by different actors, leading to different outcomes. The concept of appropriation is also used to understand how different patterns of IT use can lead to diverse outcomes. Additionally, the article draws on the theory of learning and action, which categorizes actions into exploitation (extending old certainties) and exploration (pursuing new possibilities).


In [94]:
query = "Which entities regarding theory concepts were used in this article named:How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?, generalize your answear a little bit"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

The entities regarding theory concepts used in this article are related to learning and action, specifically the concepts of exploitation and exploration, which categorize actions in organizations.


In [95]:
query = "WHat was data collected in this article named:How Do Suppliers Benefit from Information Technology Use in Supply Chain Relationships?"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

The data collected in this article is referred to as "survey data" which was supplemented with information from Alpha's supplier databases.


In [96]:
query = "How many articles had an author Ortiz de Guinea?"
generate_fluent_output(query, retriever, chat, custom_prompt_template)

Based on the provided metadata, I can see that there are two articles with an author named Ortiz de Guinea. 

Proof sentence: The metadata for both articles lists "Ana Ortiz de Guinea" as the author.


In [100]:
query = "How many articles published in year 2013?"
generate_fluent_output(query, retriever, chat, custom_prompt_template)

OutputParserException: Parsing text
To structure the user's query, I need more information about the data source. Specifically, I need to know the attribute that represents the publication year of the article.

Assuming there is an attribute called "publication_date" with a type of "date" and a description of "Publication date of the article", the structured request would be:

```json
{
    "query": "",
    "filter": "eq(\"publication_date\", \"2013-01-01\")"
}
```

If the publication year is not a separate attribute, but rather part of a larger date attribute, the filter would need to be adjusted accordingly. For example, if the attribute is called "publication_date" with a type of "date" and a description of "Publication date of the article", the structured request would be:

```json
{
    "query": "",
    "filter": "gte(\"publication_date\", \"2013-01-01\") and lt(\"publication_date\", \"2014-01-01\")"
}
```

Please provide more information about the data source to get a more accurate structured request.
 raised following error:
Received invalid attributes publication_date. Allowed attributes are ['para_id', 'title', 'last_section_title', 'ent_id', 'level3']