In [1]:
pip install --quiet langchain_community tiktoken langchain-openai langchainhub langchain neo4j python-dotenv

In [2]:
pip install --upgrade --quiet langchain-together

In [3]:
pip install --quiet langchain_experimental langchain_openai langchain_core

In [39]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector
import openai
from openai import OpenAI
import os
import json
from dotenv import load_dotenv
from operator import itemgetter
from neo4j import GraphDatabase

from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import ConfigurableField, RunnableParallel
from langchain_together import Together


ALL KEYS

In [17]:
# Load environment variables from .env file
#load_dotenv('.env')

openai_api_key = '<your OPENAI API KEY>'
uri = "<YOUR NEO4J URI>"  # replace with your actual AuraDB URI
username = "<YOUR neo4j>"  # default is usually 'neo4j'
password = "<YOUR NEO4J_PASSWORD>"

HF_token = "<YOUR HF TOKEN>"

togetherai_api_key = "<YOUR TOGETHER AI API>"

NEO4J RETRIEVERS

In [18]:

# Typical RAG retriever

typical_rag = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(openai_api_key=openai_api_key), index_name="typical_rag",
    url=uri,
    username=username,
    password=password)

# Parent retriever

parent_query = """
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata LIMIT 1
"""

parent_vectorstore = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(openai_api_key=openai_api_key),
    index_name="parent_document",
    retrieval_query=parent_query,
    url=uri,
    username=username,
    password=password
)

# Hypothetic questions retriever

hypothetic_question_query = """
MATCH (node)<-[:HAS_QUESTION]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata
"""

hypothetic_question_vectorstore = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(openai_api_key=openai_api_key),
    index_name="hypothetical_questions",
    retrieval_query=hypothetic_question_query,
    url=uri,
    username=username,
    password=password
)
# Summary retriever

summary_query = """
MATCH (node)<-[:HAS_SUMMARY]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata
"""

summary_vectorstore = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(openai_api_key=openai_api_key),
    index_name="summary",
    retrieval_query=summary_query,
    url=uri,
    username=username,
    password=password
)

CHAT MODELS

In [26]:
llmtype = 'gpt4'

if llmtype == 'gpt4':
  llm = ChatOpenAI(temperature=0,
                   openai_api_key=openai_api_key,
                   model='gpt-4-turbo-preview')
elif llmtype == 'llama2':
  llm = Together(
      model="togethercomputer/LLaMA-2-7B-32K",
      temperature=0.5,
      max_tokens=1024,
      together_api_key=togetherai_api_key)
elif llmtype == 'llama3':
  llm = Together(
      model="meta-llama/Llama-3-8b-chat-hf",
      temperature=0.5,
      max_tokens=1024,
      together_api_key=togetherai_api_key)
elif llmtype == 'Qwen':
  llm = Together(
      model="Qwen/Qwen1.5-1.8B-Chat",
      temperature=0.5,
      max_tokens=1024,
      together_api_key=togetherai_api_key)
else:
  llm = Together(
      model="mistralai/Mixtral-8x22B",
      temperature=0.5,
      max_tokens=1024,
      together_api_key=togetherai_api_key)

PROMPT CHAIN

In [None]:
index_name = "typical_rag"  #typical_rag, parent_document, summary, hypothetical_questions

template = """
Answer the question based only on the following CONTEXT:
{context}

Question: {question}

Please be truthful. Keep in mind, you will lose the job, if you answer out of CONTEXT questions.
If the responses are irrelevant to the question then respond by saying that I couldn't find a good response to your query in the database.
"""
prompt = ChatPromptTemplate.from_template(template)

retriever = typical_rag.as_retriever().configurable_alternatives(
    ConfigurableField(id="strategy"),
    default_key=index_name,
    parent_strategy=parent_vectorstore.as_retriever(),
    hypothetical_questions=hypothetic_question_vectorstore.as_retriever(),
    summary_strategy=summary_vectorstore.as_retriever(),
)

# Function to print and pass the context

chain = (
    RunnableParallel(
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)


# Add typing for input
class Question(BaseModel):
    question: str


chain = chain.with_types(input_type=Question)
#print(chain)

GENERATE ANSWERS

In [109]:
# Path to the JSON file
file_path = 'input/questions.json'  #replace with your own path after uploading file to google drive

# Open the file and load the data
with open(file_path, 'r') as file:
    data = json.load(file)


questions = data['question']
print(questions)

['How long is the term of office for the President in the Federation of Pakistan?', "Which country's constitution is being discussed in the text?", 'How many members can the Cabinet of the Provincial Assembly have according to the Constitution?']


In [116]:
import re
question_responses = {}
#original_query = "How many members can the Cabinet of the Provincial Assembly have according to the Constitution?"
for question in questions:
  answer = chain.invoke({"question": question},{"configurable": {"strategy": index_name}},)
  retrieves = retriever.invoke(question)
  #print(retrieves)
  data_str = str(retrieve)
  page_contents = re.findall(r"Document\(page_content='(.*?)'\)", data_str, re.DOTALL)
  context = ", ".join(page_contents)

  question_responses[question] = {
      'retrieved_content': context,
      'response': answer
      }

  print(question_responses[question])



{'retrieved_content': '', 'response': 'The term of office for the President in the Federation of Pakistan is five years from the day he enters upon his office.'}
{'retrieved_content': '', 'response': 'The constitution being discussed in the text is that of the Islamic Republic of Pakistan.'}
{'retrieved_content': '', 'response': 'The Cabinet of the Provincial Assembly can have up to fifteen members or eleven percent of the total membership of the Provincial Assembly, whichever is higher.'}


In [117]:
import re
question_responses = {}
for question in questions:
    # Assuming the correct method to invoke the chain and retriever
    answer = chain.invoke({"question": question},{"configurable": {"strategy": index_name}},)
    retrieves = retriever.invoke(question)

    # Convert the retrieves result to a string
    data_str = str(retrieves)

    # Extracting page content using regex
    page_contents = re.findall(r"Document\(page_content='(.*?)'\)", data_str, re.DOTALL)

    # Combine page contents into a single string separated by commas
    context = ", ".join(page_contents)

    # Storing results in a dictionary
    question_responses[question] = {
        'retrieved_content': context,
        'response': answer
    }

    # Print the current state of question_responses to see progressive updates
    print(question_responses[question])

{'retrieved_content': 'PART III \\nThe Federation of Pakistan \\nCHAPTER 1.–T HE PRESIDENT \\n41.(1)There shall be a President of Pakistan who shall be the \\nHead of State and shall represent the unity of the Republic.\\n(2)A person shall not be qualified for election as \\nPresident unless he is a Muslim of not less than forty-five years \\nof age and is qualified to be elected as member of the National\\nAssembly.\\n1[(3) The President 2* * * shall be elected in accordance with \\nthe provisions of the Second Schedule by the members of an \\nelectoral college consisting of–– \\n(a)the members of both Houses; and \\n(b)the members of the Provincial Assemblies.]\\n(4)Election to the office of President shall be held not\\nearlier than sixty days and not later than thirty days before the \\nexpiration of the term of the President in office:\\nProvided that, if the election cannot be held within the \\nperiod aforesaid because the National Assembly is dissolved, it\\nshall be held withi

In [120]:
def save_to_json(question_responses,json_output_file):
    # Create a list to hold the results in the required structure
    results = {"questions": [], "answers": [], "contexts": []}
    # Iterate through the question_responses dictionary
    for question, data in question_responses.items():
        results["questions"].append(question)
        results["answers"].append(data['response'])
        results["contexts"].append(data['retrieved_content'])


    # Write the results to a JSON file
    with open(json_output_file, 'w') as file:
        json.dump(results, file, indent=4)


In [121]:
# Output JSON file path
json_output_file = 'output/qa_results_neo4j.json'    #replace with your directory

# Execute the function
save_to_json(question_responses, json_output_file)