In [1]:
! pip install -q python-dotenv
! pip install -q neo4j
! pip install -q langchain
! pip install -q langchain-openai
! pip install -q tiktoken


[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import dotenv
import os

dotenv.load_dotenv()

os.environ["NEO4J_URI"] = os.getenv("URL")
os.environ["NEO4J_USERNAME"]= os.getenv("USERNAME")
os.environ["NEO4J_PASSWORD"] = os.getenv("PASSWORD2")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAIKEY")

In [3]:
from langchain_community.graphs import Neo4jGraph
import sys
sys.path.append('../utils')  
from helper import write_chunks_to_df


In [4]:
graphDB = Neo4jGraph()

In [None]:
#graphDB.query("SHOW INDEXES")

In [None]:
#
# Initialize Semantic Vector Index
#
# DOCS: https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/
#

vector_index_query=""" 
CALL db.index.vector.createNodeIndex(
  'accreditation_index',
  'Chunk',
  'embedding',
   1536,
  'cosine'
)
"""

graphDB.query(vector_index_query)

In [None]:
graphDB.query("SHOW INDEXES")

### Generate Vector Embeddings

In [None]:
#
#  Query All Chunks without embedding
#

all_chunks_query = """ 
MATCH (c:Chunk) 
WHERE c.embedding IS null OR c.embedding = 0
RETURN c
"""


# attach embedding to chunk.embedding
vector_to_chunk_query = """ 
MATCH (c:Chunk {UUID: $UUID})
SET c.embedding = $vector
"""

In [None]:
result = graphDB.query(all_chunks_query)

In [None]:
#
# Generate Vectors for Chunk.text and update Chunk.embedding
#

chunk_dataframe = write_chunks_to_df(result)

In [None]:
#chunk_dataframe.head()
chunk_dataframe.info()

In [5]:
#
# Chunk embedding function
# 
# DOCS: https://platform.openai.com/docs/guides/embeddings/use-cases
#


from openai import OpenAI

client =OpenAI()

#MODEL =  "text-embedding-3-small"
MODEL = "text-embedding-ada-002"

def get_embedding(text, model = MODEL):

    text = text.replace("\n"," ")

    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [None]:
chunk_dataframe['vector'] = chunk_dataframe['text'].apply(lambda x:get_embedding(x))

In [None]:
chunk_dataframe.head()

In [None]:
for i, row in chunk_dataframe.iterrows():

    graphDB.query(vector_to_chunk_query, 
                params ={
                    'UUID':row['UUID'],
                    'vector':row['vector']
                })

### Queries

In [7]:
semantic_index_query = """

CALL db.index.vector.queryNodes('accreditation_index', 2, $inputVector)
YIELD node AS responseNode, score

RETURN responseNode.text, score 
"""

In [None]:
# query_text = "what are aacsb standards"
# query_vector = get_embedding(query_text)
# query_result = graphDB.query(semantic_index_query, 
#                                 params={
#                                     "inputVector":query_vector
#                                 })

In [None]:
# query_result

In [8]:
#
# Standard QA
#
from langchain_community.vectorstores import Neo4jVector

VECTOR_INDEX_NAME = 'accreditiation-index'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY='embedding'


In [None]:
#https://python.langchain.com/docs/modules/data_connection/retrievers/MultiQueryRetriever/


In [9]:
import logging
from datetime import datetime
import json

log_dir = os.path.abspath('../logs')
os.makedirs(log_dir, exist_ok=True)

#Info logger
info_logger = logging.getLogger('info_logger')
info_logger.setLevel(logging.INFO)
info_handler = logging.FileHandler(os.path.join(log_dir, 'info.log'))
info_logger.addHandler(info_handler)

#Error logger
error_logger = logging.getLogger('error_logger')
error_logger.setLevel(logging.ERROR)
error_handler = logging.FileHandler(os.path.join(log_dir, 'error.log'))
error_logger.addHandler(error_handler)



In [None]:
# sucessful test
#info_logger.info("testing info logger")
#error_logger.error("testing error logger")

In [10]:

#
# Sourced from documentation and modified: https://python.langchain.com/docs/integrations/graphs/neo4j_cypher/
#

CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.

Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

Perfect Syntax: Your queries must be in the correct Cypher syntax, at all costs you should avoide
'CypherSyntaxError' and 'ValueError' resulting from your query ie:
ValueError: Generated Cypher Statement is not valid
code: Neo.ClientError.Statement.SyntaxError message: Invalid input 'objective': expected ")", "WHERE", or a parameter (line 1, column 19 (offset: 18))
"MATCH (n:Learning objective) RETURN n.name"

Examples: Here are a few examples of generated Cypher statements for particular questions:

## AACSB STANDARD EXAMPLE
# Which standards deal with staff resources?
MATCH (n)
WHERE n.nodeCat = 'AACSB' AND (n.text CONTAINS 'staff' AND n.text CONTAINS 'resources' OR n.text CONTAINS 'staff resources')
RETURN n

# How is standard 2 documented
MATCH (d:Documentation) WHERE d.parentStandardNum = 2 RETURN d.text

## INSTITUTION EXAMPLE -- schema may vary from example, reference schema
# Which learning objectives did undergradute and graduate program evaluate
MATCH (p:Program)-[]->(l:`Learning objective`)
WHERE (p.name CONTAINS 'undergraduate' OR p.name CONTAINS 'graduate') 
RETURN l.name






The question is:
{question}"""

In [None]:
#
# RExample of returning nodes:
# MATCH (p:Program)-[]->(l:`Learning objective`)
# WHERE (p.name CONTAINS 'undergraduate' OR p.name CONTAINS 'graduate') 
# RETURN l,p
#

In [11]:
graphDB.refresh_schema()
schema = graphDB.schema


In [None]:
import pprint

pp = pprint.PrettyPrinter(indent = 2)
pp.pprint(schema)

In [12]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate



graphDB.refresh_schema()

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)


cypher_chain = GraphCypherQAChain.from_llm(
    graph=graphDB,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose= True,
    return_intermediate_steps=True
)

In [None]:
# #cypher_chain.invoke({"query": "How many learning objectives are assessed"})
# #result = cypher_chain.invoke({"query": "Which student learning goals were identified"}) # does not know
# result = cypher_chain.invoke({"query": "What is standard 5 about"}) # does not know #works well

# #cypher_chain.invoke({"query": "What are the descriptions of the Learning goal"}) ## does not know , see still Learning goal example
# #cypher_chain.invoke({"query": "Tell me about AACSB standards"}) ## does not know , see still Learning goal example
# print(result['result'])
# print(result['intermediate_steps'])



Out of the box LLM genreate cypher queries are not producing any results, OR are producing errors.Provide some examples to the mode an oppopriate queries based on the schema. 

In [None]:
# from langchain.chains import RetrievalQAWithSourcesChain

In [None]:
# chain  = RetrievalQAWithSourcesChain.from_chain_type(

# )

In [None]:
#
#  LangChain Docs: https://python.langchain.com/docs/modules/data_connection/retrievers/MultiQueryRetriever/
#     



In [13]:

import os
import re
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)


def extract_list(llm_result):
    content = llm_result.content
    #return content.split("\\n") # bug?
    result = content.split("[SEP]")
    if not isinstance(result, list):
            raise TypeError(f"Expected a list, but received {type(content)} instead.")
    
    return result

# NOT USED IN PIPELINE
def generate_multi_question(question):

    system_prompt = f""" 

    # Instruction
    You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide these alternative questions separated by new line. 
    
    # Format Rules
    DO NOT NUMBER THE LIST
    Original question: {question}
    """ 
    
    llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
    result = llm.invoke(system_prompt)
    query_list = extract_list(result)
     



    return {'original_query': question, 'generated_query_list': query_list}

In [14]:
from typing import List, Dict





def generate_multi_question_aacsb(question: str) -> Dict[str, any]:
    """
    For queries relating to the AACSB standards, generates list three different versions
    of the user's input query. Used downstream for multiquery retrieval.

    Args:
        question (str): The user's input query.

    Returns:
        Dict[str, any]: A dictionary containing the original user question and the generated list of alternative 
        questions. Keys include 'original_query' for the original question and 'generated_query_list' for the list 
        of alternative questions.

    """


    system_prompt = f""" 

    # Instruction
    You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search, by providing precise questions.
    Provide these alternative questions separated by "[SEP]" TOKEN.

    # List Delimiter
    Each question in the list must be seperated by the "[SEP]" TOKEN

    # Important Context: AACSB
    The questions you generate are directly related to extracting useful information on the 
    AACSB accrediation standards. The questions you generate will be used as vector database index
    queries that contain information on:

    - formal AACSB descriptions
    - documentation that supports each standard 
    - basis for evaluation of the standards 
    - relevent definitions of terms used in the standard descriptions.

    The AACSB Website provides the following summary of their work:
    AACSB accreditation is known, worldwide, as the longest-standing, most recognized form of 
    specialized accreditation that an institution and its business programs can earn. 
    Accreditation is a voluntary, nongovernmental process that includes a rigorous external review 
    of a school's mission, faculty qualifications, curricula, and ability to provide the highest-quality programs.
    
    # Format Rules
    DO NOT NUMBER THE LIST
    DO NOT ANSWER THE QUESTION
    DELIMITER USING "[SEP]" token
    Original question: {question}
    """ 
    
    llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
    result = llm.invoke(system_prompt)
    query_list = extract_list(result)  

    log_data = {
        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
        'level': 'INFO',
        'function_name': 'generate_multi_question_institution',
        'original_query': question,
        'generated_query_list': query_list
    }
    info_logger.info(json.dumps(log_data))

    return {'original_query': question, 'generated_query_list': query_list, 'cat':'AACSB'}

In [15]:
from typing import List, Dict


def generate_multi_question_institution(question: str) -> Dict[str, any]:
    """
    For school specific, or academic institution specific, queries. Generates list three different versions
    of the user's input query. Used downstream for multiquery retrieval.

    Args:
        question (str): The user's input query.

    Returns:
        Dict[str, any]: A dictionary containing the original user question and the generated list of alternative 
        questions. Keys include 'original_query' for the original question and 'generated_query_list' for the list 
        of alternative questions.
    """


    system_prompt = f""" 

    # Instruction
    You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search, by providing precise questions.
    Provide these alternative questions separated by "[SEP]" TOKEN.

    # List Delimiter
    Each question in the list must be seperated by the "[SEP]" TOKEN

    # Important Context: Academic Institution
    The questions you generate are directly related to extracting useful information about a School
    of Business.  The questions you generate will be used as vector database index
    queries that contain information on:

    - Strategic Plan, Mission and Fiscal Resources
    - Academic Departments in the School of Business inluding not limited to : Accounting, Marketing, Management, Finance, Entreprenuership
    - Student Services and Student Organizations 
    - Program Goals, Learning Objectives and Curriculum Assessment
    - Continuous Improvement


    # Format Rules
    DO NOT NUMBER THE LIST
    DO NOT ANSWER THE QUESTION
    DELIMITER USING "[SEP]" token
    Original question: {question}
    """ 
    
    llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
    result = llm.invoke(system_prompt)
    query_list = extract_list(result)  

    log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
    'level': 'INFO',
    'function_name': 'generate_multi_question_institution',
    'original_query': question,
    'generated_query_list': query_list
        }
    info_logger.info(json.dumps(log_data))                                                                      

    return {'original_query': question, 'generated_query_list': query_list, 'cat':'INSTITUTION'}

In [16]:
from typing import List, Dict


def generate_sub_questions_hybrid(question: str) -> Dict[str, any]:
    """
    For queries relating to both AACSB accreditation AND Academic Institution (School) information .
    Generates two distinct sub questions based on the user's input query. Used downstream for multiquery retrieval.

    Args:
        question (str): The user's input query.

    Returns:
        Dict[str, any]: A dictionary containing the original user question and the generated list of alternative 
        questions. Keys include 'original_query' for the original question and 'generated_query_list' for the list 
        of alternative questions.
    """


    system_prompt = f""" 

    # Instruction
    You are an AI language model assistant. Your task is to evalute the users input query and 
    divide the query into two and ONLY TWO sub questions. The first sub questions should address the portion
    of the user query that relates to the AACSB Standards, the second subquestion should relate the institution
    specific portion of the query. Your overall objective is to break down the complex user query into the 
    two distinct sub questions.  Provide these alternative questions separated by "[SEP]" TOKEN.

    # List Delimiter
    Each question in the list must be seperated by the "[SEP]" TOKEN

    # Important Context: Sub 
    
    ## 1. AACSB Standard sub question:
    AACSB sub question may related to accreditation content such as:

    - formal AACSB descriptions
    - documentation that supports each standard 
    - basis for evaluation of the standards 
    - relevent definitions of terms used in the standard descriptions.

    The AACSB Website provides the following summary of their work:
    AACSB accreditation is known, worldwide, as the longest-standing, most recognized form of 
    specialized accreditation that an institution and its business programs can earn. 
    Accreditation is a voluntary, nongovernmental process that includes a rigorous external review 
    of a school's mission, faculty qualifications, curricula, and ability to provide the highest-quality programs.


    ## 2. Academic Instiution sub question:
    Academic Instiution sub question may relate to School of Business content such as:

    - Strategic Plan, Mission and Fiscal Resources
    - Academic Departments in the School of Business inluding not limited to : Accounting, Marketing, Management, Finance, Entreprenuership
    - Student Services and Student Organizations 
    - Program Goals, Learning Objectives and Curriculum Assessment
    - Continuous Improvement


    # Format Rules
    DO NOT NUMBER THE LIST
    DO NOT ANSWER THE QUESTION
    DELIMITER USING "[SEP]" token
    Original question: {question}
    """ 
    
    llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
    result = llm.invoke(system_prompt)
    query_list = extract_list(result)  

    log_data = {
        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
        'level': 'INFO',
        'function_name': 'generate_multi_question_institution',
        'original_query': question,
        'generated_query_list': query_list
    }
    info_logger.info(json.dumps(log_data))

    return {'original_query': question, 'generated_query_list': query_list, 'cat':'HYBRID'}

In [None]:
# test_query = "how should I prepare for extended travel"

# result = generate_multi_question(test_query)

In [None]:
# for r in result['generated_query_list']:
#     print (r)

In [None]:

# test_query = "Our accountind department recently  updated the curriculum to include carbon footprint, does this reflect the sustainability standard "
# test_hybrid_query = generate_sub_questions_hybrid(test_query)

In [None]:
# test_hybrid_query

In [17]:
from openai import OpenAI
import json

client = OpenAI()



def optimize_query_function_route(user_query:str)->str:
    """
    Run a conversation with OpenAI's language model, providing the user query and available functions to the model.

    Args:
        user_query (str): The user's input query.

    Returns:
        str: The response from the language model.

    Step 1: Send the conversation and available functions to the model.
        - Each function is described with its name, description, and parameters.
        - Three functions are available:
            1. generate_multi_question_aacsb: Generates three different versions of the user's input query
               related to AACSB standards.
            2. generate_multi_question_institution: Generates three different versions of the user's input query
               for school-specific or academic institution-specific queries.
            3. generate_sub_questions_hybrid: Generates two distinct sub-questions based on the user's input query,
               relating to both AACSB accreditation and academic institution (school) information.

    Returns the response from the language model, specifically the first choice message from the available choices.
    """


    messages = [{"role": "user", "content": user_query}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "generate_multi_question_aacsb",
                "description": "For queries relating to the AACSB standards, generates list three different versions of the user's input query. Used downstream for multiquery retrieval.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "question": {
                            "type": "string",
                            "description": "Input user query",
                        }
                    },
                    "required": ["question"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "generate_multi_question_institution",
                "description": "For school specific, or academic institution specific, queries. Generates list three different versions of the user's input query. Used downstream for multiquery retrieval.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "question": {
                            "type": "string",
                            "description": "Input user query",
                        }
                    },
                    "required": ["question"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "generate_sub_questions_hybrid",
                "description": "For queries relating to both AACSB accreditation AND Academic Institution (School) information. Generates two distinct sub questions based on the user's input query. Used downstream for multiquery retrieval.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "question": {
                            "type": "string",
                            "description": "Input user query",
                        }
                    },
                    "required": ["question"],
                },
            },
        }

    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        tools=tools,
        tool_choice="auto",  # auto is default
    )
    return response.choices[0].message
    


In [None]:
# input_query = "AACSB standards on sustainability"
# result = optimize_query_function_route(input_query)

In [None]:
# import json

# function_name =  result.tool_calls[0].function.name
# function_args = json.loads(result.tool_calls[0].function.arguments)['question']
# print(f"{function_name} , {function_args}, ")

In [None]:
# function = globals()[function_name]
# print(function(function_args))

In [18]:
import json

def execute_route_function(conv_result: any)->any:
    """
    Execute a route function based on the provided conversation result.

    Args:
        conv_result (any): The conversation result containing information about the function to execute.

    Returns:
        any: The result of executing the route function.

    Extracts the name and arguments of the route function from the conversation result and then
    dynamically executes the function using the extracted information.

    Note:
        - The function_name and function_args are extracted from the tool_calls attribute of the conversation result.
        - The function_name is used to retrieve the actual function from the global namespace.
        - The function_args are passed as arguments to the retrieved function.
    """

    function_name =  conv_result.tool_calls[0].function.name
    function_args = json.loads(conv_result.tool_calls[0].function.arguments)['question']
    function = globals()[function_name]
    return function(function_args)





In [19]:
def query_routing_pipeline(user_query):
    """
    Run a conversation with OpenAI's language model using the provided user query and then execute the route function.

    Args:
        user_query (str): The user's input query.

    Returns:
        dict: The result of executing the route function.

    This function serves as a pipeline for querying and routing based on the user's input.
    It first runs a conversation with OpenAI's language model using the provided user query.
    The result of the conversation is then passed to the execute_route_function to determine and execute the appropriate route function.
    The result of executing the route function is returned as a dictionary.
    """
    result = optimize_query_function_route(user_query)
    return execute_route_function(result) #dict



In [None]:

# test_query_in = "AACSB standards on sustainability"
# pipeline_result = query_routing_pipeline(test_query_in)

In [None]:
# pipeline_result

In [20]:
def retriever(query_dict, cypher_chain=cypher_chain):

    
    context_data = []

    if query_dict['cat'] != 'HYBRID':
        # multiqueries -- case A
        # single cypher query for original query
        for q in query_dict['generated_query_list']:
            query_vector = get_embedding(q)
            query_result = graphDB.query(semantic_index_query, 
                                params={
                                    "inputVector":query_vector
                                })
            query_text = [text['responseNode.text'] for text in query_result]

            log_data = {
            'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
            'level': 'INFO',
            'function_name': 'retriever() - MULTI',
            'original_query': query_dict['original_query'],
            'current_query': q,
            'semantic_query_result': query_text,

                }
            info_logger.info(json.dumps(log_data))



            context_data.extend(query_text)
        
        #cypher_result = cypher_chain.invoke({"query": query_dict['original_query']})

        try:
            cypher_result = cypher_chain.invoke({"query": query_dict['original_query']})
      
            log_data = {
                        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                        'level': 'INFO',
                        'function_name': 'retriever() - MULTI CYPHER',
                        'original_query': query_dict['original_query'],
                        'cypher_query': cypher_result['result'],
                        'cypher_steps': cypher_result['intermediate_steps']

                            }
            info_logger.info(json.dumps(log_data))
                
      
      
      
      
        except Exception as e:
                log_data = {
                        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                        'level': 'ERROR',
                        'function_name': 'retriever()- MULTI CYPHER',
                        'original_query': query_dict['original_query'],
                        'cypher_query_1': cypher_result,
                        'ErrorException': e
                        }
                error_logger.error(json.dumps(log_data))
                cypher_result = "" #set empty for context string down steam

        if "I don't know the answer" not in cypher_result['result']:
            context_data.append(cypher_result['result'])
            

    
    else:
        # subqueries -- case B
        # cypher query for each subquery
        for q in query_dict['generated_query_list']:
            query_vector = get_embedding(q)
            query_result = graphDB.query(semantic_index_query, 
                                params={
                                    "inputVector":query_vector
                                })
            query_text = [text['responseNode.text'] for text in query_result]

            log_data = {
            'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
            'level': 'INFO',
            'function_name': 'retriever() - SUB',
            'original_query': query_dict['original_query'],
            'current_query':q,
            'semantic_query_result': query_text,

            }
            info_logger.info(json.dumps(log_data))
            context_data.extend(query_text)

            try:
                cypher_result = cypher_chain.invoke({"query": q})
                log_data = {
                        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                        'level': 'INFO',
                        'function_name': 'retriever() -SUB CYPHER',
                        'original_query': query_dict['original_query'],
                        'cypher_query': cypher_result['result'],
                        'cypher_steps': cypher_result['intermediate_steps']

                            }
                info_logger.info(json.dumps(log_data))
                
            except Exception as e:
                log_data = {
                        'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                        'level': 'ERROR',
                        'function_name': 'retriever() - SUB-CYPHER',
                        'original_query': query_dict['original_query'],
                        'cypher_query_2': cypher_result
                            }
                error_logger.error(json.dumps(log_data))
                cypher_result = "" #set empty for context string down steam

            if "I don't know the answer" not in cypher_result['result']:
                context_data.append(cypher_result['result'])
                
    # Final Context Str
    context_str = ', '.join(str(item) for item in context_data)

    # Log context_str 
  
    log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
    'level': 'INFO',
    'function_name': 'retriever()',
    'original_query': query_dict['original_query'],
    'context_string': context_str
    }

    info_logger.info(json.dumps(log_data))



    #if not hybrid
    ### loop through multiqueries
    #### call embedding function on each and retrieve data from db
    #### append to  context data
    ### call cypher query, append results to cyher data

    #else:
    ## loop through sub queries
    ### call embedding and retrieve append resutls to context data
    ### call cypher, append results to context data

    return context_str #return context data as a string


In [None]:
#ret_result = retriever(pipeline_result)

In [None]:
# ret_result

In [21]:
def generator(original_query, context_string, model = "gpt-3.5-turbo-16k"):

    system_prompt = f"""
    # Instruction:
    You are an AI assistant generating a thorough and thoughtful response to a user query.
    You are to generate a response that answers their query solely based on the context provided below
    do not use any other outside information. The grounding context information has been retrieved from
    a database which is the absolute knowledge source. 

    # User Query:
    {original_query}

    # Grounding Context:
    {context_string}

    # Output Style
    Your tone should be professional. And your response should be detailed, as this information
    will be used to generate reports. 

    """

    if not context_string:
        log_data = {
                    'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                    'level': 'ERROR',
                    'function_name': 'generator()',
                    'original_query': original_query,
                    'context_string': 0

                        }
        error_logger.error(json.dumps(log_data))
        return "Generator Issue: context string is empty, unable to generate response, please rephrase your question and try again..."

    llm = ChatOpenAI(model=model, temperature=0)
    result = llm.invoke(system_prompt)
    content = result.content

    # Log Response
    log_data = {
                    'timestamp': datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                    'level': 'INFO',
                    'function_name': 'generator()',
                    'original_query': original_query,
                    'context_string': context_string,
                    'generatedResponse': content

                        }
    info_logger.info(json.dumps(log_data))


    return content

In [22]:
test_query = [
    "AACSB Standards on sustainability", #aacsb standards
    "Our accounting department updated curriculum to include environmental impact in business risk evaluation, does this reflect the sustanability standard", #hybrid subquestion()
    "Standard 5 specifies a systematic process for assurance of learning. What do peer review teams usually expect in determining whether this standard is met?",
    "What are intellectual contributions",
    "Must faculty members publish in order to be qualified as scholarly academic, practice academic, or scholarly practitioner?", #FAQ
    "Who is the dean of my school",
    "What are the management courses",
    "Which leaning objectives are assessed in the undergradute business program ",
    "How is standard 8 defined?",
    "What are MACC program students expected to demonstrate"
]


In [None]:
#gen = generator(test_query_in, ret_result)

In [23]:
def advRagPipeline(query):
    try:
        query_pipeline_result = query_routing_pipeline(query)
        context_str = retriever(query_pipeline_result)
        output = generator(query,context_str)

    except Exception as e:
        raise e

    return output


In [24]:
test_out = []
#
#
# Stalls 30mins +

# for q in test_query:
#     output = advRagPipeline(q)
#     test_out.append(output)


def run(query):
    output = advRagPipeline(query)
    test_out.append(output)

run(test_query[0])
run(test_query[1])











[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:Standard {title: "AACSB Standards on sustainability"}) RETURN s[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (c:Curriculum)-[:INCLUDES_COURSE]->(co:Course {name: 'Environmental Impact in Business Risk Evaluation'}), (s:Standard {title: 'Sustainability Standard'}) RETURN c, co, s[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


In [25]:
run(test_query[2])
run(test_query[3])





[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:Standard {standardNum: 5})<-[:BASIS_OF]-(b:Basis) RETURN b.text[0m
Full Context:
[32;1m[1;3m[{'b.text': '5.1 assurance learning processes school identifies learning competencies business degree program well appropriate direct indirect measures systematically assessed demonstrate learning competencies achieved across degree programs . competencies derive consonant school mission strategies expected outcomes reported degree level opposed major level . competencies curriculum management processes reflect currency knowledge expectations stakeholders including limited organizations employing graduates alumni learners university community policymakers . competencies largely achieved . competencies achieved school provides evidence actions taken remediate deficiencies . direct indirect measures employed school expected include types measures across entire portfolio assessment degree programs . prop

In [26]:
run(test_query[4])
run(test_query[5])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mThis question cannot be answered as the provided schema does not contain any information or relationship types related to a "dean" or a "school".[0m


UnboundLocalError: local variable 'cypher_result' referenced before assignment

In [27]:

run(test_query[6])
run(test_query[7])




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


In [28]:
run(test_query[8])
run(test_query[9])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (s:Standard {standardNum: 8})<-[:TERM_DEFINITIONS_FOR]-(d:Definitions) RETURN d.text[0m
Full Context:
[32;1m[1;3m[{'d.text': 'society context refers external stakeholders relevance business school given mission . examples include nonprofit private sector organizations business government community groups broader social economic business physical environments . external stakeholders broader environments may local regional national international scale . thought leadership evidenced business school recognized highly respected authority area areas expertise thus sought relevant stakeholders . aligned school mission stakeholders include learners business academics government nonprofits non governmental organizations broader society . predatory journals publishers defined entities prioritize self interest expense scholarship characterized false misleading information deviation best editorial publicat

UnboundLocalError: local variable 'cypher_result' referenced before assignment

In [29]:
for i, (q,o) in enumerate(zip(test_query,test_out)):
    print("======================================================")
    print("======================================================")
    print(f" {i}: {q} \n")
    print(f" {i}: {o} \n")
    print("======================================================")
    print("======================================================")



 0: AACSB Standards on sustainability 

 0: The AACSB (Association to Advance Collegiate Schools of Business) has established standards on sustainability that schools must adhere to in order to attain and maintain participating supporting status. These standards are periodically reviewed to ensure they reflect a focus on continuous improvement.

One of the criteria for attaining participating supporting status is the sufficiency of faculty members. The school must have faculty members who are actively participating and supporting the mission of the school. The criteria for faculty sufficiency should be documented and consistent with the mission of the school. The school may adapt guidance based on their specific situation in developing and implementing criteria that indicate they are meeting the spirit and intent of the standard.

In addition to faculty sufficiency, the criteria for attaining participating supporting status also address the depth and breadth of activities expected with

In [30]:
data = []

for i, (q, o) in enumerate(zip(test_query, test_out)):
    entry = {"query": q, "output": o, "idx": i}
    data.append(entry)

# Write data to JSON file
with open(f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_advRAG_output.json", "w") as outfile:
    json.dump(data, outfile, indent=4)
