In [1]:
! pip install -q python-dotenv
! pip install -q neo4j
! pip install -q langchain
! pip install -q langchain-openai
! pip install -q tiktoken


[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import dotenv
import os

# Pull settings from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Target variable name -> name used for the same value in this project's .env file.
# NOTE(review): on Windows, USERNAME is normally pre-set to the OS login name and
# load_dotenv() does not override existing variables by default -- confirm that the
# Neo4j user really comes from .env and not from the operating system.
_ENV_ALIASES = {
    "NEO4J_URI": "URL",
    "NEO4J_USERNAME": "USERNAME",
    "NEO4J_PASSWORD": "PASSWORD2",
    "OPENAI_API_KEY": "OPENAIKEY",
}

# Fail early with the offending names instead of the opaque
# "TypeError: str expected, not NoneType" that os.environ raises for None.
_missing = [_source for _source in _ENV_ALIASES.values() if os.getenv(_source) is None]
if _missing:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

for _target, _source in _ENV_ALIASES.items():
    os.environ[_target] = os.getenv(_source)

In [3]:
from langchain_community.graphs import Neo4jGraph


In [4]:
# Graph connection object shared by every later cell. No arguments are passed here;
# presumably the NEO4J_* environment variables set above supply the connection
# details -- TODO confirm against the Neo4jGraph documentation.
graphDB = Neo4jGraph()

In [5]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  # Both fields are required strings: Field(...) declares no default value.
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Base graph-document node whose `properties` field is redeclared as an optional list
# of Property entries.
# NOTE(review): documented with comments rather than a docstring because a docstring
# may end up in the schema generated from this model -- confirm before adding one.
class Node(BaseNode):
    # Defaults to None when no properties are supplied.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Base graph-document relationship whose `properties` field is redeclared as an
# optional list of Property entries.
# NOTE(review): documented with comments rather than a docstring because a docstring
# may end up in the schema generated from this model -- confirm before adding one.
class Relationship(BaseRelationship):
    # Defaults to None when no properties are supplied.
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Output model handed to create_structured_output_chain in get_extraction_chain.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # Required list of Node objects.
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    # Required list of Relationship objects; note the field is named `rels`.
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [6]:
def format_property_key(s: str) -> str:
    """Normalise a property key to camelCase.

    Args:
        s: Raw key text, e.g. ``"birth date"``, ``"BirthDate"`` or ``"birthDate"``.

    Returns:
        The camelCase form of ``s``. An empty or whitespace-only string is
        returned unchanged.

    Several whitespace-separated words are joined with the first word lowered
    and every following word capitalised. A single token that already carries
    inner capitals (``"birthDate"``) keeps them, because the extraction prompt
    asks for camelCase keys and lowering the whole token would destroy them.
    Only the first character is lowered in that case. An all-caps single token
    (``"NAME"``) is lowered entirely.
    """
    words = s.split()
    if not words:
        return s

    head, *tail = words
    if tail:
        return head.lower() + "".join(word.capitalize() for word in tail)

    # Single token: preserve existing camelCase, but do not leave "NAME" as "nAME".
    if head.isupper():
        return head.lower()
    return head[0].lower() + head[1:]

def props_to_dict(props) -> dict:
    """Build a plain dict from property objects exposing ``.key`` and ``.value``.

    Each key is passed through ``format_property_key``. A falsy ``props``
    (``None`` or an empty collection) yields an empty dict. If two entries
    normalise to the same key, the later entry wins.
    """
    return {
        format_property_key(item.key): item.value
        for item in (props or [])
    }

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into the base graph-document node.

    The id is title-cased and the type capitalised. The title-cased id is also
    stored under the ``name`` property, replacing any extracted ``name`` value.
    """
    display_id = node.id.title()
    attributes = props_to_dict(node.properties)
    # Add name property for better Cypher statement generation
    attributes["name"] = display_id
    return BaseNode(id=display_id, type=node.type.capitalize(), properties=attributes)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into the base graph-document relationship.

    Both endpoints go through ``map_to_base_node``. The relationship type is
    passed on unchanged, and the property list becomes a plain dict.
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=props_to_dict(rel.properties),
    )

In [7]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Chat model used by get_extraction_chain below; created once with temperature=0.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a ``KnowledgeGraph`` from text.

    Args:
        allowed_nodes: Optional node labels. When given, they are listed in the
            system prompt on an "Allowed Node Labels" line.
        allowed_rels: Optional relationship types. When given, they are listed
            in the system prompt on an "Allowed Relationship Types" line.

    Returns:
        The result of ``create_structured_output_chain`` for ``KnowledgeGraph``,
        the module-level ``llm`` and the prompt assembled here. The prompt
        expects a single ``input`` variable.
    """
    # Optional prompt lines; each stays an empty line when no restriction is given.
    node_rule = ""
    if allowed_nodes:
        node_rule = '- **Allowed Node Labels:**' + ", ".join(allowed_nodes)
    rel_rule = ""
    if allowed_rels:
        rel_rule = '- **Allowed Relationship Types**:' + ", ".join(allowed_rels)

    system_text = f"""# Knowledge Graph Instructions for GPT-3.5 Turbo
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. You are evaluating information from an academic institution, creating a knowledge graph of concepts related to AACSB accreditation.
You should be able to identify learning goals (eg. Written Communication, Oral Communication, Critical Thinking, Ethics, Globalization, Information Technology),
along with how and when they are assessed.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_rule}
{rel_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """

    messages = [
        ("system", system_text),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [None]:
# import os
# from langchain.chains.openai_functions import (
#     create_openai_fn_chain,
#     create_structured_output_chain,
# )
# from langchain_openai import ChatOpenAI
# from langchain.prompts import ChatPromptTemplate

# llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

# standard_classification = {1:'strategic planning',
#                            2:'physical, virtual and financial resources',
#                            3: 'faculty and professional staff resources',
#                            4: 'curriculum',
#                            5: 'assurance of learning',
#                            6:'learner progression',
#                            7: 'teaching effectiveness and impact',
#                            8: 'impact of scholarship',
#                            9: 'engagement and societal impact',
#                            0: 'general institution information',
#                            }

# def get_extraction_chain(
#     allowed_nodes: Optional[List[str]] = None,
#     allowed_rels: Optional[List[str]] = None
#     ):
#     prompt = ChatPromptTemplate.from_messages(
#         [(
#           "system",
#           f"""# Knowledge Graph Instructions for GPT-3
# ## 1. Overview
# You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
# - **Nodes** represent entities and concepts. You are evaluating information from an academic institution, creating a knowledge graph of concepts related to AACSB accreditation.
# - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
# ## 2. Labeling Nodes
# - **Consistency**: Ensure you use basic or elementary types for node labels.
#   - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
# - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
# {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
# {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
# ## 3. Handling Numerical Data and Dates
# - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
# - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
# - **Property Format**: Properties must be in a key-value format. Each concept node should have an attribute key called 'supportsStandard', with an integer value classification. This is the classification value map {standard_classification}

# - **Quotation Marks**: Never use escaped single or double quotes within property values.
# - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
# ## 4. Coreference Resolution
# - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
# If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
# always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
# Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
# ## 5. Strict Compliance
# Adhere to the rules strictly. Non-compliance will result in termination.
#           """),
#             ("human", "Use the given format to extract information from the following input: {input}"),
#             ("human", "Tip: Make sure to answer in the correct format"),
#         ])
#     return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)


In [8]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    """Extract a knowledge graph from one document and write it to the graph store.

    Args:
        document: Document whose ``page_content`` is sent to the extraction chain.
            It is also attached as the ``source`` of the stored graph document.
        nodes: Optional allowed node labels, forwarded to ``get_extraction_chain``.
        rels: Optional allowed relationship types, forwarded likewise.
    """
    chain = get_extraction_chain(nodes, rels)
    # The chain's result is read from its 'function' entry.
    extracted = chain.invoke(document.page_content)['function']

    base_nodes = [map_to_base_node(item) for item in extracted.nodes]
    base_relationships = [map_to_base_relationship(item) for item in extracted.rels]

    # Persist through the module-level graphDB connection.
    graphDB.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_relationships, source=document)]
    )

In [9]:
! pip install -q pypdf
#! pip install -q wikipedia


[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime



# Timestamp taken when this cell runs; load_in_document passes it as $importDate.
import_date = datetime.now()
# Relative, backslash-separated path to the source PDF. The same string is later used
# as the Docsource key ($docSource), so keep it as a plain string.
path =r'..\data\institution\sample\GCSU\appendix\gcsu_assurance_of_learning.pdf'

# Load the PDF. Note that `raw_documents` holds the loader object, not the documents.
raw_documents = PyPDFLoader(path)
pages = raw_documents.load_and_split()


# Define chunking strategy: recursive character splitting, chunk length measured with len()
# Alternative token-based splitter, kept for reference:
#text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000, # maximum chunk length
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

# Only a slice of the loaded pages is chunked: list items 5 through 9 of `pages`.
# `documents` holds the resulting chunks.
documents = text_splitter.split_documents(pages[5:10])



In [None]:
# Sanity check: container type of the chunk list and the 'page' metadata of the first chunk.
print(type(documents))
print(documents[0].metadata['page'])
#print(type(pages[0]))

In [11]:
# Get-or-create the Docsource node keyed on `source`. importDate, nodeType and nodeCat
# are only set when the node is newly created (ON CREATE).
# Parameters: $docSource, $importDate. Returns the node and its elementId.
doc_source_query="""
MERGE(d:Docsource {source: $docSource})
    ON CREATE SET
        d.importDate = $importDate,
        d.nodeType = 'DOCSOURCE',
        d.nodeCat = 'INSTITUTION'
RETURN d, elementID(d) as elementId
"""




# document_query ="""
# MERGE(d:Document {parentDocSourceId: $docSourceId})
#    ON CREATE SET
#         d.bertSummary = $modelSummary,
#         d.standardClassification = $modelClassification,
#         d.nodeType = 'DOCUMENT',
#         d.nodeCat='INSTITUTION',
#         d.pageIdxNum = $page,
#         d.source = $source
# RETURN d, elementID(d) as elementId
# """

# doc_chunk_query = """
# MERGE(c:Chunk {parentDocId: $parentDocId})
#     ON CREATE SET
#         c.text = $documentText,
#         c.embedding = NULL,
#         c.nodeType = 'DOCUMENTCHUNK',
#         c.nodeCat = 'INSTITUTION'
# RETURN c, elementID(c) as elementId
# """



# Creates a new Document node on every execution (CREATE, not MERGE), so re-running
# the load produces duplicate Document nodes.
# Parameters: $docSourceId, $modelSummary, $modelClassification, $page, $source, $idx.
# Returns the node, its elementId and its numeric ID.
document_query = """
CREATE (d:Document {
    parentDocSourceId: $docSourceId,
    bertSummary: $modelSummary,
    standardClassification: $modelClassification,
    nodeType: 'DOCUMENT',
    nodeCat: 'INSTITUTION',
    sourcePageIdxNum: $page,
    source: $source,
    inputIdx: $idx
})
RETURN d, elementId(d) as elementId, ID(d) as idNum
"""

# Creates a new Chunk node on every execution (CREATE, not MERGE). The UUID comes from
# apoc.create.uuid(), so the APOC procedures must be available on the server.
# `embedding` starts as null and is filled in by a later cell.
# Parameters: $parentDocId, $documentText, $page, $source, $idx.
doc_chunk_query = """
CREATE (c:Chunk {
    UUID: apoc.create.uuid(),
    parentDocId: $parentDocId,
    text: $documentText,
    embedding: null,
    nodeType: 'DOCUMENTCHUNK',
    nodeCat: 'INSTITUTION',
    sourcePageIdxNum: $page,
    source: $source,
    inputIdx: $idx
})
RETURN c, elementId(c) as elementId
"""


In [12]:
# Collects the Document nodes whose parentDocSourceId equals $docSourceId, ordered by
# inputIdx, and passes the list to apoc.nodes.link with relationship type "NEXT" and
# avoidDuplicates enabled. Requires APOC. Returns the number of documents collected.
document_link_query = """
MATCH (from_same_section:Document)
WHERE from_same_section.parentDocSourceId = $docSourceId
WITH from_same_section
    ORDER BY from_same_section.inputIdx ASC
WITH collect(from_same_section) as doc_list
    call apoc.nodes.link(
        doc_list,
        "NEXT",
        {avoidDuplicates: true}
    )
    RETURN size(doc_list)
"""

# Links every Chunk whose stored parentDocId equals $docParentId to the Document that
# has exactly that elementId, via a PART_OF relationship (MERGE, so repeatable).
#
# The parent is resolved by elementId rather than by comparing page/source/inputIdx:
# Document nodes are CREATEd on every load, so after a second load those three values
# also match the older duplicate Documents and the chunk would be attached to all of them.
# Parameter: $docParentId. Returns num_links_created.
link_doc_chunk_to_parent_doc = """
MATCH (d:Document)
WHERE elementId(d) = $docParentId
MATCH (c:Chunk)
WHERE c.parentDocId = $docParentId
MERGE (c)-[newRelationship:PART_OF]->(d)
RETURN count(newRelationship) as num_links_created
"""

# Links Document nodes whose parentDocSourceId equals $docSourceId to the Docsource node
# that has the same `source` value, via SOURCE_FROM (MERGE, so repeatable).
# NOTE(review): the Docsource is matched on the `source` string, not on $docSourceId,
# so the link is only made when both nodes store identical source strings.
link_to_source_doc = """ 
MATCH(d:Document),(s:Docsource)
WHERE d.parentDocSourceId = $docSourceId
    AND d.source = s.source
MERGE (d)-[newRelationship:SOURCE_FROM]->(s)
RETURN count(newRelationship) AS num_links_created
"""

In [13]:
import time


def load_in_document(path,documents):
    """Write a document source, its documents and their chunks to the graph database.

    Parameters:
        path (str): Value stored as the ``source`` key of the Docsource node.
        documents (list): Document objects exposing ``page_content`` and
            ``metadata['page']`` / ``metadata['source']``.

    Returns:
        None

    Uses the module-level ``graphDB`` connection, the ``import_date`` timestamp
    and the Cypher strings defined in the earlier cells. Steps:
    1. Get or create the Docsource node for ``path``.
    2. For each document, in input order: create a Document node, create a
       Chunk node holding the text, link the chunk to the document, then
       sleep one second (intended to throttle the database calls).
    3. After the loop, chain the documents of this source with NEXT
       relationships and link them to the Docsource node.

    Any exception raised by a query propagates to the caller unchanged.
    """
    source_rows = graphDB.query(
        doc_source_query,
        params={"docSource": path, "importDate": import_date},
    )
    # elementId of the Docsource node; the same value is used by every query below.
    doc_source_id = source_rows[0]['elementId']

    for i, document in enumerate(documents):
        print(f'Loop Num: {i}')
        page_number = document.metadata['page']
        source_name = document.metadata['source']

        document_rows = graphDB.query(
            document_query,
            params={
                'docSourceId': doc_source_id,
                'modelSummary': 'placeholder model summary',
                'modelClassification': '00',
                'page': page_number,
                'source': source_name,
                'idx': i,
            },
        )
        document_id = document_rows[0]['elementId']

        #
        # ADD LLM FUNCTION
        #

        graphDB.query(
            doc_chunk_query,
            params={
                'parentDocId': document_id,
                'documentText': document.page_content,
                'page': page_number,
                'source': source_name,
                'idx': i,
            },
        )
        graphDB.query(
            link_doc_chunk_to_parent_doc,
            params={'docParentId': document_id},
        )
        time.sleep(1)

    graphDB.query(document_link_query, params={'docSourceId': doc_source_id})
    graphDB.query(link_to_source_doc, params={'docSourceId': doc_source_id})

In [14]:
#
# Run the loader on the selected chunks. This writes nodes and relationships to the
# connected database and prints one "Loop Num" line per chunk.
#
load_in_document(path,documents)

Loop Num: 0
Loop Num: 1
Loop Num: 2
Loop Num: 3
Loop Num: 4
Loop Num: 5
Loop Num: 6
Loop Num: 7
Loop Num: 8
Loop Num: 9


In [None]:
# doc_link_result = graphDB.query(document_link_query,
#                                 params ={
#                                     'docSourceId':doc_source_result[0]['elementId']
#                                 })

In [None]:
# link_docs_to_source_docs_result = graphDB.query(link_to_source_doc, 
#                                                 params ={
#                                                    'docSourceId':doc_source_result[0]['elementId']
#                                                 })

In [15]:
#
# Select the chunks that do not have an embedding yet.
# Despite the variable name, chunks that already carry an embedding are NOT returned.
#

query_all_chunks = """ 
MATCH (c:Chunk) WHERE c.embedding IS null
RETURN c
"""

In [16]:
# Rows returned for the chunks that still lack an embedding; each row is read through its 'c' key below.
result = graphDB.query(query_all_chunks)

In [17]:
import json
import pandas as pd
#TODO move to helper.py

def write_chunks_to_df(chunks):
    """Flatten chunk query rows into a DataFrame with a fixed set of columns.

    Args:
        chunks: Iterable of row mappings. Each row holds the chunk's properties
            under the key ``'c'``, and those properties must include
            ``'text'``, ``'nodeType'`` and ``'UUID'``.

    Returns:
        pandas.DataFrame with the columns ``text``, ``nodeType`` and ``UUID``,
        in that order. The columns are present even when ``chunks`` is empty,
        so later column access such as ``df['text']`` does not raise ``KeyError``.

    Raises:
        KeyError: If a row lacks ``'c'`` or one of the three properties.
    """
    columns = ['text', 'nodeType', 'UUID']
    records = [
        {column: row['c'][column] for column in columns}
        for row in chunks
    ]
    # Passing `columns` explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)


In [18]:
# Output folder for a JSON export; in the cells shown here it is only referenced by the
# commented-out call below.
out_path = r'..\data\content_chunks'
#write_chunks_to_json(result,out_path)

# Despite the name, this is the pandas DataFrame returned by write_chunks_to_df.
json_dataframe = write_chunks_to_df(result)

In [19]:
# Display the chunk table (text, nodeType, UUID).
json_dataframe

Unnamed: 0,text,nodeType,UUID
0,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,e165d4cf-ccb2-4d45-91bf-085e3fa3e50b
1,4.3. Developing initiatives for student profes...,DOCUMENTCHUNK,258d3cf6-5990-43b3-969d-adc9ec262b05
2,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,ec725ffb-a27f-4866-b11f-9532bccc2413
3,taking for -credit internships. The department...,DOCUMENTCHUNK,b3235929-5054-4e02-a293-7bd7230c791f
4,the balance sheet. The results showed that on...,DOCUMENTCHUNK,20e8de4e-bd49-4532-aa6d-5310b4114dd9
5,"time, an interactive classroom tool will be in...",DOCUMENTCHUNK,8d6b206d-1389-4710-b202-1d14bad5449d
6,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,cad8f788-3b72-4715-a6e4-63ec68c1abbb
7,BBA Objective 2 .3: Demonstrate basic function...,DOCUMENTCHUNK,5545f05e-9990-4ffe-ba9e-34bc82cab2f1
8,core business knowledge so the LENB 3135 and M...,DOCUMENTCHUNK,6b6e13c3-63c7-4d04-a2ad-f4e326fb0b1b
9,year. \n The assessment of student knowledge ...,DOCUMENTCHUNK,33b6f7f0-dee5-4102-8904-660a940cff8b


In [20]:
#
# Documentation: https://platform.openai.com/docs/guides/embeddings/use-cases
#


from openai import OpenAI

# OpenAI client used by get_embedding. No arguments are passed; presumably the API key
# is read from the OPENAI_API_KEY environment variable set earlier -- TODO confirm.
client =OpenAI()

def get_embedding(text, model = "text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the module-level OpenAI client.

    Newlines in ``text`` are replaced with single spaces before the request.
    The text is sent as a one-element input list, and the embedding of that
    single element is returned.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [21]:
# One embedding request per row of the 'text' column; results are stored in a new 'vector' column.
json_dataframe['vector'] = json_dataframe['text'].apply(get_embedding)

In [22]:
# Display the chunk table again, now including the 'vector' column.
json_dataframe

Unnamed: 0,text,nodeType,UUID,vector
0,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,e165d4cf-ccb2-4d45-91bf-085e3fa3e50b,"[-0.002387269865721464, 0.030054964125156403, ..."
1,4.3. Developing initiatives for student profes...,DOCUMENTCHUNK,258d3cf6-5990-43b3-969d-adc9ec262b05,"[-0.022018861025571823, 0.014942693524062634, ..."
2,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,ec725ffb-a27f-4866-b11f-9532bccc2413,"[-0.021682489663362503, 0.03815522789955139, 0..."
3,taking for -credit internships. The department...,DOCUMENTCHUNK,b3235929-5054-4e02-a293-7bd7230c791f,"[-0.02500932849943638, 0.023677470162510872, 0..."
4,the balance sheet. The results showed that on...,DOCUMENTCHUNK,20e8de4e-bd49-4532-aa6d-5310b4114dd9,"[0.02478775382041931, 0.049934349954128265, 0...."
5,"time, an interactive classroom tool will be in...",DOCUMENTCHUNK,8d6b206d-1389-4710-b202-1d14bad5449d,"[-0.043007466942071915, 0.0022174373734742403,..."
6,The J. Whitney Bunting College of Business 201...,DOCUMENTCHUNK,cad8f788-3b72-4715-a6e4-63ec68c1abbb,"[0.018656976521015167, 0.058810919523239136, 0..."
7,BBA Objective 2 .3: Demonstrate basic function...,DOCUMENTCHUNK,5545f05e-9990-4ffe-ba9e-34bc82cab2f1,"[-0.02717341110110283, 0.022137731313705444, 0..."
8,core business knowledge so the LENB 3135 and M...,DOCUMENTCHUNK,6b6e13c3-63c7-4d04-a2ad-f4e326fb0b1b,"[-0.011547845788300037, 0.03375094756484032, 0..."
9,year. \n The assessment of student knowledge ...,DOCUMENTCHUNK,33b6f7f0-dee5-4102-8904-660a940cff8b,"[-0.020592421293258667, 0.0009046140476129949,..."


In [23]:
# Stores $vector in the `embedding` property of the Chunk whose UUID equals $UUID.
# The query has no RETURN clause.
vector_to_chunk_query = """ 
MATCH (c:Chunk {UUID: $UUID})
SET c.embedding = $vector
"""

In [24]:
# Write each embedding back to its chunk node, one query per row, matched on the chunk UUID.
for chunk_uuid, chunk_vector in zip(json_dataframe['UUID'], json_dataframe['vector']):
    graphDB.query(
        vector_to_chunk_query,
        params={'UUID': chunk_uuid, 'vector': chunk_vector},
    )

In [25]:
#
# Vector Index and Embeddings (may move to a different notebook)
# https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/


# Create a vector index named 'accreditation-index' on Chunk.embedding (cosine similarity).
# The dimension argument (1536) must equal the length of the vectors produced by
# get_embedding -- TODO confirm if the embedding model is changed.
# NOTE(review): the call has no "if not exists" guard; presumably re-running this cell
# against a database that already has the index fails -- confirm.

vector_index_query=""" 
CALL db.index.vector.createNodeIndex(
  'accreditation-index',
  'Chunk',
  'embedding',
   1536,
  'cosine'
)
"""

graphDB.query(vector_index_query)

[]

In [26]:
# List the database indexes to check that 'accreditation-index' exists and is ONLINE.
graphDB.query("SHOW INDEXES")

[{'id': 3,
  'name': 'accreditation-index',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['embedding'],
  'indexProvider': 'vector-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 4, 15, 8, 8, 55, 947000000, tzinfo=<UTC>),
  'readCount': 35},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [30]:
#graphDB.query("SHOW PROCEDURES YIELD *")

# Similarity search against 'accreditation-index': asks for the 2 nearest Chunk nodes to
# $inputVector and returns each node's text together with its score.
semantic_index_query = """

CALL db.index.vector.queryNodes('accreditation-index', 2, $inputVector)
YIELD node AS responseNode, score

RETURN responseNode.text, score 
"""

In [31]:
# Embed the question with the same helper used for the chunks, then run the
# vector-index search with that embedding as $inputVector.
query_text = "what are the marketing courses"
query_vector = get_embedding(query_text)

query_result = graphDB.query(semantic_index_query, 
                                params={
                                    "inputVector":query_vector
                                })

In [32]:
# Display the rows returned by the similarity search (chunk text and score).
query_result

[{'responseNode.text': 'BBA Objective 2 .3: Demonstrate basic functional abilities across core business subjects  \nBecause this goal covers a large area of knowledge, two courses taken by all business majors w as \nassessed in addition to using the ETS exam as an overall assessment.   \n LENB 3135 was used to assess how well students could discuss the laws that relate to contracts, \nincluding the UCC. In 14- 15, the fall semester students met or exceed the target at an 80% rate while the \nspring semester seemed to be an anomaly as only 62% met or exceeded the target. The same assessment was used in 15- 16. For fall semester, 74% met or exceeded expectations, and in spring 16, \n72% met or exceeded expectations . \n \nMKTG 3161 was used to assess how well students identify key marketing concepts and apply them to \nreal-world business problems. In 13- 14, 68% met the target, but another 16% only missed by 1 question. \nThis assessment was  continued with continued emphasis on repetit

In [None]:
#
# Documentation References: 
# https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/#:~:text=As%20of%20Neo4j%205.13%2C%20you,IEEE%20754%20single%20precision%20values
# https://neo4j.com/labs/genai-ecosystem/apoc-genai/
#


# chunk_embedding_query = """
#     MATCH (c:Chunk) WHERE c.embedding IS null
#     WITH c, apoc.ml.openai.embedding(
#       c.text, 
#       "OpenAI", 
#       {
#         token: $openAiApiKey 
#         }) AS vector
#     CALL db.create.setVectorProperty(c, "embedding", vector)
#     YIELD node
#     """

# chunk_embedding_query = """
#     MATCH (c:Chunk) WHERE c.embedding IS null
#     WITH c, apoc.ml.openai.embedding([c.text], $openAiApiKey, {}) AS result
#     CALL {
#         WITH c, result
#         CALL db.create.setVectorProperty(c, "embedding", result.embedding[0])
#         YIELD node
#         RETURN node
#     }
#     RETURN c
#     """


In [None]:
# graphDB.query(chunk_embedding_query, 
#               params={
#                   'openAiApiKey': os.environ["OPENAI_API_KEY"]
#               })

In [None]:
# retrieve_chunks_for_embed_query = """ 
# MATCH(c:Chunk) WHERE c.embedding is null RETURN count(c) AS numNodes
# """

In [None]:
#graphDB.query(retrieve_chunks_for_embed_query)

In [None]:
from tqdm import tqdm

# Run the LLM graph extraction chunk by chunk. A failure on one chunk is reported
# and the loop continues with the next chunk.
for i, chunk_doc in tqdm(enumerate(documents), total=len(documents)):
    try:
        extract_and_store_graph(chunk_doc)
    except Exception as e:
        print(f"An error occurred for document {i}: {e}")

In [None]:
# WARNING: destructive. This matches every node in the database and detach-deletes it,
# which also removes the Docsource / Document / Chunk nodes and embeddings created above.
# NOTE(review): with "Run All" this cell executes unconditionally -- consider guarding it.
graphDB.query("MATCH (n) DETACH DELETE n")


In [None]:
# Specify which node labels should be extracted by the LLM
# NOTE(review): these labels (Movie, Award, ...) look like leftovers from a generic
# example rather than the accreditation domain named in the prompt -- confirm.
allowed_nodes = ["Person", "Company", "Location", "Event", "Movie", "Service", "Award"]

# Unlike the earlier extraction loop, this one has no try/except: the first failing
# chunk stops the loop. `tqdm` comes from the import in the earlier cell.
for i, d in tqdm(enumerate(documents), total=len(documents)):
    extract_and_store_graph(d, allowed_nodes)

In [None]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain


# Refresh the schema held by the graph object before building the chain, so that it
# reflects the nodes and relationships written by the cells above.
graphDB.refresh_schema()

# Question-answering chain over the graph: one model is passed for writing Cypher
# (cypher_llm) and a different one for phrasing the answer (qa_llm).
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graphDB,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [None]:
# Example question 1: a count-style question answered through generated Cypher.
cypher_chain.invoke({"query": "How many learning objectives are assessed"})


In [None]:
cypher_chain.invoke({"query": "Which student learning goals were identified"}) # observed: the chain answered that it does not know


In [None]:
cypher_chain.invoke({"query": "What are the descriptions of the Learning goal"}) # observed: the chain does not know; original note also says "see still Learning goal example"
