In [1]:
! pip install -q python-dotenv
! pip install -q neo4j
! pip install -q langchain
! pip install -q langchain-openai
! pip install -q tiktoken
! pip install -q torch
! pip install -q transformers[torch]


[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import dotenv
import os

dotenv.load_dotenv()

# The graph client and the chat model are constructed later without explicit
# credentials, so the values stored under this project's .env names are copied
# onto the NEO4J_* / OPENAI_API_KEY names here.
# NOTE(review): load_dotenv() does not override variables that already exist by
# default, and Windows pre-defines USERNAME as the logged-in account name --
# confirm the Neo4j user really comes from .env and not from the OS.
_env_aliases = {
    "NEO4J_URI": "URL",
    "NEO4J_USERNAME": "USERNAME",
    "NEO4J_PASSWORD": "PASSWORD2",
    "OPENAI_API_KEY": "OPENAIKEY",
}

# Fail early with a readable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
_missing_env = [src for src in _env_aliases.values() if os.getenv(src) is None]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

for _target_name, _source_name in _env_aliases.items():
    os.environ[_target_name] = os.getenv(_source_name)

In [3]:
from langchain_community.graphs import Neo4jGraph


In [4]:
# Shared Neo4j handle used by every query in this notebook. No arguments are
# passed; presumably the connection details are picked up from the NEO4J_*
# environment variables set earlier -- verify against the Neo4jGraph docs.
graphDB = Neo4jGraph()

In [5]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# One key/value pair attached to a node or relationship in the extraction schema.
# NOTE(review): the class docstring and Field descriptions presumably end up in
# the schema handed to the LLM, so they are left untouched -- confirm before
# rewording them.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extraction-schema node: the base graph Node with `properties` redeclared as an
# optional list of Property key/value pairs. map_to_base_node converts that list
# into a plain dict before the node is stored.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-schema relationship: the base graph Relationship with `properties`
# redeclared as an optional list of Property key/value pairs.
# map_to_base_relationship converts that list into a plain dict before storage.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Top-level structured-output schema passed to create_structured_output_chain:
# the model is asked to return a list of nodes and a list of relationships.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [6]:
def format_property_key(s: str) -> str:
    """Collapse a whitespace-separated key into a single camelCase-style token.

    The first word is lower-cased in full and every following word is passed
    through ``str.capitalize`` (first letter upper, remainder lower) before the
    pieces are concatenated. A string with no words (empty or whitespace only)
    is returned unchanged.

    Note that a single-word key is simply lower-cased, so ``"parentDocUUID"``
    becomes ``"parentdocuuid"``.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s

    pieces = [tokens[0].lower()]
    for token in tokens[1:]:
        pieces.append(token.capitalize())
    return "".join(pieces)

def props_to_dict(props) -> dict:
    """Turn a sequence of key/value property objects into a plain dict.

    Each item must expose ``.key`` and ``.value``; keys are normalised with
    ``format_property_key``. A falsy ``props`` (``None`` or empty) yields an
    empty dict, and when two items normalise to the same key the later one wins.
    """
    if not props:
        return {}
    return {format_property_key(prop.key): prop.value for prop in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extraction-schema Node into the base graph Node.

    The property list is flattened into a dict, a ``name`` property holding the
    title-cased id is added (it helps Cypher statement generation), the id is
    title-cased and the type is capitalised.
    """
    if node.properties:
        node_props = props_to_dict(node.properties)
    else:
        node_props = {}

    title_id = node.id.title()
    node_props["name"] = title_id
    return BaseNode(
        id=title_id,
        type=node.type.capitalize(),
        properties=node_props,
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extraction-schema Relationship into the base graph Relationship.

    Both endpoints are run through ``map_to_base_node`` and the property list is
    flattened into a dict; the relationship type is passed through unchanged.
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=props_to_dict(rel.properties) if rel.properties else {},
    )

In [7]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Chat model shared by every extraction chain built in get_extraction_chain;
# temperature is pinned to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    parentDocUUID: str,
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph from text.

    The system message is an f-string, so ``parentDocUUID`` and the optional
    allowed label/type lists are baked into the prompt when this function is
    called; only ``{input}`` in the first human message is left as a template
    variable to be filled at invocation time.

    Parameters:
        parentDocUUID: Value the model is told to attach to every node under
            the property key "parentDocUUID".
        allowed_nodes: Optional node labels listed in the prompt; when falsy
            the corresponding prompt line is left empty.
        allowed_rels: Optional relationship types listed in the prompt; when
            falsy the corresponding prompt line is left empty.

    Returns:
        The chain returned by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and this prompt.

    NOTE(review): the interpolated system text is handed to a prompt template
    afterwards, so literal braces inside ``parentDocUUID`` or the allowed lists
    would presumably be read as template variables -- confirm inputs never
    contain them.
    """
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-3.5 Turbo
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. You are evaluating information from an academic institution, creating a knowledge graph of concepts related to AACSB accreditation.
You should be able to identify learning goals (eg. Written Communication, Oral Communication, Critical Thinking, Ethics, Globalization, Information Technology),
along with how and when they are assessed.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- ** Provided REQUIRED Property**: Each node must have a the property key "parentDocUUID", this is the value {parentDocUUID}. DO NOT CREATE a VALUE only use the provided value.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    # The chain's output is a dict; extract_and_store_graph reads its 'function' key.
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [8]:
def extract_and_store_graph(
    document: Document,
    parentDocUUID: str,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from one document and write it to Neo4j.

    Parameters:
        document: Source document; its ``page_content`` is sent to the LLM and
            the document itself is recorded as the graph document's source.
        parentDocUUID: Forwarded to ``get_extraction_chain`` for the prompt.
        nodes: Optional allowed node labels forwarded to the chain builder.
        rels: Optional allowed relationship types forwarded to the chain builder.
    """
    # Run the structured-output chain; the parsed KnowledgeGraph sits under 'function'.
    chain = get_extraction_chain(parentDocUUID, nodes, rels)
    chain_output = chain.invoke(document.page_content)
    knowledge_graph = chain_output['function']

    # Translate the extraction-schema objects into base graph objects.
    base_nodes = [map_to_base_node(kg_node) for kg_node in knowledge_graph.nodes]
    base_rels = [map_to_base_relationship(kg_rel) for kg_rel in knowledge_graph.rels]

    # Persist everything as a single graph document.
    graphDB.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

In [9]:
! pip install -q pypdf



[notice] A new release of pip available: 22.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from langchain_community.document_loaders import PyPDFLoader
#from langchain.text_splitter import TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime



from pathlib import Path

# Timestamp stored on the Docsource node when it is first created.
import_date = datetime.now()

# Source PDF, relative to the notebook's working directory. Built from path
# components so the cell works on both Windows and POSIX; kept as a str because
# it is passed to the loader and later used as a Cypher query parameter.
path = str(
    Path(
        "..", "data", "institution", "sample", "GCSU", "appendix",
        "gcsu_assurance_of_learning.pdf",
    )
)

# Load the PDF. Despite its name, raw_documents is the loader object; `pages`
# holds the documents it produces.
raw_documents = PyPDFLoader(path)
pages = raw_documents.load_and_split()

# Define chunking strategy
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Only entries 5-9 of `pages` are imported; they are re-split into the chunks
# that load_in_document processes.
documents = text_splitter.split_documents(pages[5:10])



In [None]:
#print(type(documents))
#print(documents[0].metadata['page'])
#print(type(pages[0]))

In [11]:
# Upsert the Docsource node for a source path. importDate / nodeType / nodeCat
# are only written when the node is first created. Returns the node and its
# element id (as `elementId`).
doc_source_query="""
MERGE(d:Docsource {source: $docSource})
    ON CREATE SET
        d.importDate = $importDate,
        d.nodeType = 'DOCSOURCE',
        d.nodeCat = 'INSTITUTION'
RETURN d, elementID(d) as elementId
"""




# Earlier MERGE-based versions of the two queries below, kept for reference;
# they are superseded by the CREATE-based definitions that follow.
# document_query ="""
# MERGE(d:Document {parentDocSourceId: $docSourceId})
#    ON CREATE SET
#         d.bertSummary = $modelSummary,
#         d.standardClassification = $modelClassification,
#         d.nodeType = 'DOCUMENT',
#         d.nodeCat='INSTITUTION',
#         d.pageIdxNum = $page,
#         d.source = $source
# RETURN d, elementID(d) as elementId
# """

# doc_chunk_query = """
# MERGE(c:Chunk {parentDocId: $parentDocId})
#     ON CREATE SET
#         c.text = $documentText,
#         c.embedding = NULL,
#         c.nodeType = 'DOCUMENTCHUNK',
#         c.nodeCat = 'INSTITUTION'
# RETURN c, elementID(c) as elementId
# """



# Create one Document node per input document. This is a CREATE, not a MERGE,
# so running the import again adds new Document nodes rather than reusing old
# ones. docUUID comes from apoc.create.uuid().
# NOTE: the column aliased `elementId` here is d.docUUID, not Neo4j's element id.
document_query = """
CREATE (d:Document {
    parentDocSourceId: $docSourceId,
    docUUID: apoc.create.uuid(),
    modSummary: $modelSummary,
    standardClassification: $modelClassification,
    nodeType: 'DOCUMENT',
    nodeCat: 'INSTITUTION',
    sourcePageIdxNum: $page,
    source: $source,
    inputIdx: $idx
})
RETURN d, d.docUUID as elementId, ID(d) as idNum
"""

# Create the Chunk node holding a document's text. parentDocId is expected to be
# the parent Document's docUUID (that is what load_in_document passes in).
doc_chunk_query = """
CREATE (c:Chunk {
    UUID: apoc.create.uuid(),
    parentDocId: $parentDocId,
    text: $documentText,
    embedding: null,
    nodeType: 'DOCUMENTCHUNK',
    nodeCat: 'INSTITUTION',
    sourcePageIdxNum: $page,
    source: $source,
    inputIdx: $idx
})
RETURN c, elementId(c) as elementId
"""


In [12]:
# Collect every Document whose parentDocSourceId equals $docSourceId, ordered by
# inputIdx, and pass the ordered list to apoc.nodes.link with relationship type
# "NEXT" (avoidDuplicates: true). Returns the number of documents collected.
# NOTE(review): Documents from earlier imports of the same source share the same
# parentDocSourceId and would be included in the list -- confirm that is intended.
document_link_query = """
MATCH (from_same_section:Document)
WHERE from_same_section.parentDocSourceId = $docSourceId
WITH from_same_section
    ORDER BY from_same_section.inputIdx ASC
WITH collect(from_same_section) as doc_list
    call apoc.nodes.link(
        doc_list,
        "NEXT",
        {avoidDuplicates: true}
    )
    RETURN size(doc_list)
"""

# Link the chunk(s) created for one document to that document with PART_OF.
# $docParentId is the parent Document's docUUID (stored on the chunk as
# parentDocId). The Document is matched on that UUID as well: Document nodes
# are CREATEd on every import, so matching on source/page/inputIdx alone would
# also attach the chunk to Documents left over from earlier imports of the
# same file.
link_doc_chunk_to_parent_doc = """
MATCH (c:Chunk), (d: Document)
WHERE c.parentDocId = $docParentId
    AND d.docUUID = c.parentDocId
    AND c.sourcePageIdxNum = d.sourcePageIdxNum
    AND c.source = d.source
    AND c.inputIdx = d.inputIdx
MERGE (c)-[newRelationship:PART_OF]->(d)
RETURN count(newRelationship) as num_links_created
"""

# Link every Document whose parentDocSourceId equals $docSourceId to the
# Docsource node with the same `source` value via SOURCE_FROM. MERGE keeps the
# relationship from being duplicated when the query is run again.
link_to_source_doc = """ 
MATCH(d:Document),(s:Docsource)
WHERE d.parentDocSourceId = $docSourceId
    AND d.source = s.source
MERGE (d)-[newRelationship:SOURCE_FROM]->(s)
RETURN count(newRelationship) AS num_links_created
"""

# Link every node carrying a `parentdocuuid` property to the Document whose
# docUUID equals it, via CONTENT_FROM_DOC. The property name is all lower-case
# on purpose: the prompt asks for the key "parentDocUUID", and
# format_property_key lower-cases a single-word key before it is stored.
# The query takes no parameters and matches across the whole graph.
link_kg_nodes_to_source_doc = """ 
MATCH (n), (d:Document) 
WHERE d.docUUID = n.parentdocuuid 
MERGE (n)-[newRelationship: CONTENT_FROM_DOC]->(d) 
RETURN count(newRelationship)
"""

In [13]:
import time
import sys
sys.path.append('../utils')  
from input_processing import classify_text

  from .autonotebook import tqdm as notebook_tqdm


In [14]:



def load_in_document(path,documents, allowed_nodes = None):
    """
    Loads documents into the graph database.

    Parameters:
        path (str): The path of the document source; used as the Docsource key.
        documents (list): A list of document objects to load. Each must provide
            ``page_content`` and ``metadata['page']`` / ``metadata['source']``.
        allowed_nodes (list, optional): Node labels the LLM extraction is told
            to use; forwarded to ``extract_and_store_graph``.

    Returns:
        None

    Relies on notebook-level names: ``graphDB``, ``import_date``,
    ``classify_text``, ``extract_and_store_graph`` and the Cypher query strings.

    Steps:
    1. Creates the document source node if it doesn't exist.
    2. For each document, in order:
       a. Classifies and summarises the text with ``classify_text``.
       b. Creates a Document node carrying that summary and classification.
       c. Extracts a knowledge graph with the LLM and stores it.
       d. Creates a Chunk node for the document text.
       e. Links the chunk to its parent Document node.
       f. Sleeps for 1 second before the next document.
    3. Links the documents of this source to each other (NEXT) and to the
       document source node.
    4. Links all knowledge graph nodes to their parent Document node.

    Any exception raised by a query or by the LLM propagates to the caller
    unchanged; nodes written before the failure are not rolled back.
    """
    doc_source_result = graphDB.query(
        doc_source_query,
        params={
            "docSource": path,
            "importDate": import_date,
        },
    )
    doc_source_id = doc_source_result[0]['elementId']

    for i, document in enumerate(documents):
        print(f'Loop Num: {i}')
        classification, summary = classify_text(document.page_content)
        page = document.metadata['page']
        source = document.metadata['source']

        doc_result = graphDB.query(
            document_query,
            params={
                'docSourceId': doc_source_id,
                'modelSummary': summary,
                'modelClassification': classification,
                'page': page,
                'source': source,
                'idx': i,
            },
        )
        # document_query returns the new node's docUUID under the alias 'elementId'.
        doc_uuid = doc_result[0]['elementId']

        extract_and_store_graph(document, doc_uuid, allowed_nodes)

        graphDB.query(
            doc_chunk_query,
            params={
                'parentDocId': doc_uuid,
                'documentText': document.page_content,
                'page': page,
                'source': source,
                'idx': i,
            },
        )

        graphDB.query(
            link_doc_chunk_to_parent_doc,
            params={'docParentId': doc_uuid},
        )
        # Pause between documents, as in the original import loop.
        time.sleep(1)

    graphDB.query(document_link_query, params={'docSourceId': doc_source_id})
    graphDB.query(link_to_source_doc, params={'docSourceId': doc_source_id})
    graphDB.query(link_kg_nodes_to_source_doc)

In [15]:
#
# Node labels the LLM is allowed to use during extraction
# ("Undergraduate" was previously misspelled, which put the typo in the prompt).
#
allowed_nodes = ["Department", "Curriculum", "Learning Objective", "Program", "Course", "Service", "Assessment", "Teaching","Business", "School of Business", "College", "University", "Staff",
            "Undergraduate", "Graduate", "Committee", "Mission", "Budget", "Performance Outcomes", "Students", "Faculty", "News", "Strategic Development", "Research", "Activity", "Goals"]

# Run the import for the chunks prepared above: writes to Neo4j and calls the
# OpenAI API once per chunk.
load_in_document(path,documents,allowed_nodes)

Loop Num: 0


Input ids are automatically padded from 459 to 1024 to be a multiple of `config.attention_window`: 1024
  warn_deprecated(


Loop Num: 1


Your max_length is set to 256, but your input_length is only 190. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)
Input ids are automatically padded from 190 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 449 to 1024 to be a multiple of `config.attention_window`: 1024


Loop Num: 2
Loop Num: 3


Input ids are automatically padded from 410 to 1024 to be a multiple of `config.attention_window`: 1024
Your max_length is set to 256, but your input_length is only 66. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Input ids are automatically padded from 66 to 1024 to be a multiple of `config.attention_window`: 1024


Loop Num: 4
Loop Num: 5


Your max_length is set to 256, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Input ids are automatically padded from 16 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 497 to 1024 to be a multiple of `config.attention_window`: 1024


Loop Num: 6
Loop Num: 7


Your max_length is set to 256, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Input ids are automatically padded from 46 to 1024 to be a multiple of `config.attention_window`: 1024


Loop Num: 8


Your max_length is set to 256, but your input_length is only 152. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=76)
Input ids are automatically padded from 152 to 1024 to be a multiple of `config.attention_window`: 1024


Loop Num: 9


In [None]:
# doc_link_result = graphDB.query(document_link_query,
#                                 params ={
#                                     'docSourceId':doc_source_result[0]['elementId']
#                                 })

In [None]:
# link_docs_to_source_docs_result = graphDB.query(link_to_source_doc, 
#                                                 params ={
#                                                    'docSourceId':doc_source_result[0]['elementId']
#                                                 })

In [19]:
# Link each Document to the Standard node whose standardNum equals the
# document's standardClassification. No parameters; matches across the whole
# graph, and MERGE avoids duplicating the relationship on later runs.
link_docs_to_standards_query = """
MATCH(d: Document), (s: Standard)
WHERE d.standardClassification = s.standardNum
MERGE (d)-[newRelationship:ASSOCIATED_WITH_STANDARD]->(s)
RETURN count(newRelationship) AS num_links_created
"""

# Link every INSTITUTION Docsource node to the Root node with
# nodeCat 'INSTITUTION' and nodeType 'ROOT'.
link_docsource_to_inst_root_query = """
MATCH (d: Docsource), (r: Root)
WHERE r.nodeCat = 'INSTITUTION' AND d.nodeCat = 'INSTITUTION' AND r.nodeType = 'ROOT'
MERGE (d)-[newRelationship: PRIMARY_DOCSOURCE_FOR ]->(r)
RETURN count(newRelationship) as num_links_created

"""

In [21]:
# Create the Document -> Standard links; the cell output shows the returned count.
graphDB.query(link_docs_to_standards_query)

[{'num_links_created': 5}]

In [20]:
# Create the Docsource -> institution Root link; the cell output shows the returned count.
graphDB.query(link_docsource_to_inst_root_query)

[{'num_links_created': 1}]

In [22]:
#
# INSTITUTION DATA LOAD COMPLETE --> Go to the next notebook: retriverQA.ipynb
#

In [None]:
# import json
# import pandas as pd
# #[DONE] move to helper.py

# def write_chunks_to_df(chunks):
#     chunk_data = []
#     for chunk_info in chunks:
#         chunk = chunk_info['c']
#         chunk_entry = {
#             'text': chunk['text'],
#             'nodeType': chunk['nodeType'],
#             'UUID': chunk['UUID'],
#         }
#         chunk_data.append(chunk_entry)

#     # with open(file_path, 'w') as json_file:
#     #     json.dump(chunk_data, json)
#     return pd.DataFrame(chunk_data)
