In [1]:
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document, SystemMessage, HumanMessage

In [2]:
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')

In [3]:
chat = ChatGroq(temperature=0, groq_api_key=api_key, model_name="llama3-70b-8192")

In [4]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
comet_ml is installed but `COMET_API_KEY` is not set.


In [5]:
persist_directory = "../Rag_project/RAG_multiple_vector_stores/paragraphs_chroma_db_MISQ"

In [6]:
if not os.path.exists(persist_directory):
    print("Persist directory does not exist.")
else:
    print("Persist directory exists.")

Persist directory exists.


In [7]:
files = os.listdir(persist_directory)
print(f"Files in directory: {files}")

Files in directory: ['bf72155b-3b8e-4318-817a-84489ad406e1', 'chroma.sqlite3']


In [8]:
if os.access(persist_directory, os.R_OK):
    print(f"Read permission granted for directory: {persist_directory}")
else:
    print(f"Read permission denied for directory: {persist_directory}")

# Check permissions for individual files
for file in files:
    file_path = os.path.join(persist_directory, file)
    if os.access(file_path, os.R_OK):
        print(f"Read permission granted for file: {file}")
    else:
        print(f"Read permission denied for file: {file}")

Read permission granted for directory: C:\Users\andyu\OneDrive\Počítač\Text, Web and Social Media Analytics Lab\Rag_project\RAG_multiple_vector_stores\paragraphs_chroma_db_MISQ
Read permission granted for file: bf72155b-3b8e-4318-817a-84489ad406e1
Read permission granted for file: chroma.sqlite3


In [11]:
header_file = '../RAG_multiple_vector_stores/paragraphs_chroma_db_MISQ/bf72155b-3b8e-4318-817a-84489ad406e1/header.bin'  # Replace with the actual header file name
header_file_path = os.path.join(persist_directory, header_file)

if os.path.exists(header_file_path):
    print(f"Header file exists: {header_file_path}")
else:
    print(f"Header file does not exist: {header_file_path}")

Header file does not exist: C:\Users\andyu\OneDrive\Počítač\Text, Web and Social Media Analytics Lab\Rag_project\RAG_multiple_vector_stores\paragraphs_chroma_db_MISQ\../RAG_multiple_vector_stores/paragraphs_chroma_db_MISQ/bf72155b-3b8e-4318-817a-84489ad406e1/header.bin


In [7]:
vectorstore2 = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

In [8]:
num_documents = len(vectorstore2)
print(f"Number of documents in the vector store: {num_documents}")

Number of documents in the vector store: 1829


In [17]:
# Updated metadata field info to match the new column names
metadata_field_info = [
    AttributeInfo(
        name="para_id",
        description="Paragraph ID of the section",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the article",
        type="string",
    ),
    AttributeInfo(
        name="last_section_title",
        description="Title of the section so it is connected to paragraph",
        type="string",
    ),
    AttributeInfo(
        name="paragraph",
        description="Content of the paragraph",
        type="string",
    ),
    AttributeInfo(
        name="ent_id",
        description="Entities mentioned in the paragraph",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="level3",
        description="More general entities mentioned in the paragraph",
        type="string or list[string]",
    )
]

# Description of the document content
document_content_description = "Brief summary of the article, including paragraph content and entities from the text"


In [18]:
# Initialize the SelfQueryRetriever
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectorstore2,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [11]:
query = "Which sections titles does article Nature and Nurture: The Impact of Automaticity on Virtual Team Behavior and Performance has"
results = retriever.get_relevant_documents(query)

  warn_deprecated(


In [12]:
# Print results
if not results:
    print("No results found.")
else:
    for result in results:
        print(result.page_content)
        print(result.metadata)

No results found.


In [19]:
# Assuming the setup and initialization are already done and the necessary imports are available

# Define the custom prompt template
custom_prompt_template = """Use the following pieces of information to answer the user's question. Always answer the question as if you were a human and in full sentance. If you don't know the answer, just say that you don't know, don't try to make up an answer. Only use information from the datasource.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Function to generate the fluent output using the LLM
def generate_fluent_output(query, retriever, chat, custom_prompt_template):
    results = retriever.get_relevant_documents(query)
    
    # Combine the contents and metadata of the results
    combined_content = "\n\n".join([f"{doc.page_content}\nMetadata: {doc.metadata}" for doc in results])
    
    # Create a prompt for the LLM to transform the output
    formatted_prompt = custom_prompt_template.format(context=combined_content, question=query)
    messages = [
        HumanMessage(content=formatted_prompt)
    ]
    
    # Get the LLM response
    response = chat(messages)
    fluent_output = response.content
    
    # Print the fluent output
    print(fluent_output)


In [23]:
# Your query
query = "Which entities are in article Nature and Nurture: The A Temporally Situated Self-Agency Theory of Information Technology Reinvention."

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)


The entities mentioned in the article "Nature and Nurture: The A Temporally Situated Self-Agency Theory of Information Technology Reinvention" are not explicitly stated.


In [72]:
query = "Does this article has technology adoptation in keywords: The Integrative Framework of Technology Use: An Extension and Test?"

# Generate and print the fluent output
generate_fluent_output(query, retriever, chat, custom_prompt_template)

No, the article does not have "technology adoption" in its keywords, but it does have "technology use" and "continued use".
