### Imports

In [1]:
import nest_asyncio
import os
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_parse import LlamaParse
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex
from llama_index.graph_stores.neo4j import Neo4jPGStore
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.indices.property_graph import VectorContextRetriever
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore
from typing import List
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import Literal

In [2]:
# Read key/value pairs from a local .env file into the process environment so
# the os.getenv lookup for the OpenAI key can find it.
load_dotenv()
# Permit nested asyncio event loops. Presumably required because the notebook
# kernel already runs a loop while the parsing/extraction calls start their
# own -- confirm against the library versions in use.
nest_asyncio.apply()

In [3]:
# OpenAI key taken from the environment; None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### LLM & Embedding Model Setup

In [4]:
# Build the chat model and the embedding model once, and register each as the
# llama_index global default so later components pick them up implicitly.
Settings.llm = llm = OpenAI(model="gpt-4o")
Settings.embed_model = embed_model = OpenAIEmbedding(model="text-embedding-3-small")

### Parsing instruction

In [5]:
# Instruction prompt handed to the parser. It is a *raw* string on purpose: the
# LaTeX examples contain backslash sequences (\text, \frac, \pi, \Omega) that a
# normal string literal would either turn into control characters (\t -> TAB,
# \f -> FORM FEED) or flag as invalid escapes.
parsing_instruction = r"""
1. **Text**: Identify and extract all text content, including explanations, descriptions, and definitions.
2. **Image**: Identify and extract all images, ensuring to capture diagrams, illustrations, and visual representations.
3. **Graph**: Identify and extract all graphs, including line graphs, bar graphs, and any other graphical data representations.
4. **Table**: Identify and extract all tables, capturing data and structured information presented in tabular form.
5. **Practice Problems**: Identify and extract all practice problems, including exercises and questions for students to solve.
6. **Examples**: Identify and extract all worked examples that demonstrate how to solve problems or apply concepts.

#### Tagging Requirements:

For each extracted element, tag the following:

1. **Chapter**: The overarching division of the textbook in which the content is located.
2. **Concept**: The specific subsection or topic within a chapter that the content pertains to.
3. **Learning Outcome Name**: The most fundamental unit of learning, which is a combination of the concept, the cognitive ability (based on Bloom's taxonomy), and the context. For example:
    - **Learning Outcome**: "Calculating resistance using VI graph"
        - **Cognitive Ability**: Analyse
        - **Concept**: Calculating resistance
        - **Context**: VI graph

4. **Cognitive Ability**: The level of cognitive ability required, based on Bloom's taxonomy (e.g., Remember, Understand, Apply, Analyze, Evaluate, Create).

#### Special Instructions for Formulas and Equations:

- For any formulas or equations encountered, ensure they are provided in LaTeX format, enclosed in `$$`. For example:
    - Equation: $$ E = mc^2 $$
    - Formula: $$ \text{Area of a circle} = \pi r^2 $$

#### Additional Notes:

- **Multiple Concepts and Learning Outcomes**: Each page can have multiple concepts, and each concept can have multiple learning outcomes.
- **Contextual Awareness**: Ensure the context of each concept and learning outcome is accurately captured, reflecting the specific example, problem, or explanation provided.

#### Example of Tagged Content:

**Chapter**: Electricity  
**Concept**: Ohm's Law  
**Learning Outcome Name**: Calculating resistance using VI graph  
- **Cognitive Ability**: Analyze  
- **Context**: VI graph

**Text**: "Ohm's Law states that the current through a conductor between two points is directly proportional to the voltage across the two points."

**Image**: [Image of a circuit diagram]

**Graph**: [Graph showing Voltage vs. Current]

**Table**: 
| Voltage (V) | Current (I) |
|-------------|-------------|
| 1           | 0.2         |
| 2           | 0.4         |
| 3           | 0.6         |

**Practice Problems**:
1. "Calculate the resistance if the voltage is 5V and the current is 1A."

**Examples**:
- **Example 1**: "If the voltage across a resistor is 10V and the current is 2A, the resistance can be calculated using Ohm's Law as follows: $$ R = \frac{V}{I} = \frac{10V}{2A} = 5 \Omega $$"
"""

  parsing_instruction = """


### Adjust the target pages here (set both start_page and end_page to None to parse all pages)

In [6]:
# Inclusive page window to parse; leave either bound as None to parse everything.
start_page = 14
end_page = 22

# Comma-separated page list for the parser, or None to select all pages.
target_pages = (
    ",".join(str(page) for page in range(start_page, end_page + 1))
    if start_page is not None and end_page is not None
    else None
)

target_pages

'14,15,16,17,18,19,20,21,22'

In [7]:
# Parse the selected pages of book.pdf into markdown, steering the parser with
# the tagging instructions and routing it through the OpenAI multimodal model.
docs = LlamaParse(
    parsing_instruction=parsing_instruction,
    target_pages=target_pages,
    result_type="markdown",
    vendor_multimodal_api_key=OPENAI_API_KEY,
    vendor_multimodal_model_name="openai-gpt4o",
    use_vendor_multimodal_model=True,
).load_data("book.pdf")

Started parsing the file under job_id a992e34b-4f69-40a6-a705-9c32f7118429


In [8]:
# Stamp each parsed document with its page number (counting up from
# start_page, or 0 when no start page is set) and a fixed course id.
first_page = start_page or 0
for offset, doc in enumerate(docs):
    doc.metadata = {"page": first_page + offset, "course_id": "1"}
# Same final counter value the running-increment form would leave behind.
current_page = first_page + len(docs)

In [9]:
# Concatenate the text of every parsed document for a quick visual check.
text = "".join(doc.text for doc in docs)
text

'# Real Numbers\n\n## 1.1 Introduction\n\nIn Class IX, you began your exploration of the world of real numbers and encountered irrational numbers. We continue our discussion on real numbers in this chapter. We begin with two very important properties of positive integers in Sections 1.2 and 1.3, namely the Euclid’s division algorithm and the Fundamental Theorem of Arithmetic.\n\nEuclid’s division algorithm, as the name suggests, has to do with divisibility of integers. Stated simply, it says any positive integer \\( a \\) can be divided by another positive integer \\( b \\) in such a way that it leaves a remainder \\( r \\) that is smaller than \\( b \\). Many of you probably recognise this as the usual long division process. Although this result is quite easy to state and understand, it has many applications related to the divisibility properties of integers. We touch upon a few of them, and use it mainly to compute the HCF of two positive integers.\n\nThe Fundamental Theorem of Arith

In [10]:
def get_sub_docs(docs):
    """Break each document into smaller documents at the page separator.

    A document's text is split wherever a line consisting solely of ``---``
    appears. Every resulting chunk becomes its own ``Document`` that carries an
    independent deep copy of the parent's metadata.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.

    Returns:
        A flat list of the chunk documents, in source order.
    """
    return [
        Document(text=chunk, metadata=deepcopy(parent.metadata))
        for parent in docs
        for chunk in parent.text.split("\n---\n")
    ]

In [11]:
# Split the parsed documents into separator-delimited chunks for indexing.
sub_docs = get_sub_docs(docs)

In [13]:
# Sanity check: the first chunk should carry the page/course metadata of its parent.
sub_docs[0].metadata

{'page': 14, 'course_id': '1'}

In [14]:
# Neo4j connection settings come from the environment (which load_dotenv has
# already populated from .env) so real credentials never need to be committed
# with the notebook. The fallbacks are the original local-development values.
graph_store = Neo4jPGStore(
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "password"),
    url=os.getenv("NEO4J_URL", "bolt://localhost:7687"),
    database=os.getenv("NEO4J_DATABASE", "rag"),
)
vec_store = None



### Run this only the first time, when the graph has not been created yet

In [15]:
# Entity and relation labels the extractor is allowed to emit.
entities = Literal["CONCEPT", "LEARNING_OUTCOME", "QUESTION"]
relations = Literal["HAS_OUTCOME", "PREREQ_FOR", "SUPPORTS"]

# Allowed (source entity, relation, target entity) triples.
validation_schema = [
    ("CONCEPT", "HAS_OUTCOME", "LEARNING_OUTCOME"),
    ("LEARNING_OUTCOME", "PREREQ_FOR", "LEARNING_OUTCOME"),
    ("LEARNING_OUTCOME", "SUPPORTS", "QUESTION"),
]

# Schema-guided extractor. NOTE(review): strict=False is documented as allowing
# triples outside the schema, so the schema guides rather than filters -- confirm
# that this is the intended behaviour.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=10,
    strict=False,
    possible_entities=entities,
    possible_relations=relations,
    possible_relation_props=[
        "extra_description"
    ],
    possible_entity_props=[
        "cognitive_ability", "text_snippet", "description"
    ],
    num_workers=4,
    kg_validation_schema=validation_schema,
)

# Extract the graph from the chunked documents and persist it in the graph
# store, embedding with the same model instance configured for the notebook.
index = PropertyGraphIndex.from_documents(
    sub_docs,
    embed_model=embed_model,
    kg_extractors=[kg_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 10/10 [00:00<00:00, 2133.96it/s]
Extracting paths from text with schema: 100%|██████████| 10/10 [00:33<00:00,  3.32s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Generating embeddings: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]


### Run this instead if the data has already been converted into a graph

In [16]:
# Attach to the graph already stored in graph_store instead of re-extracting
# it. The shared embed_model is reused so query embeddings are produced by the
# same model configuration as everywhere else in the notebook.
index = PropertyGraphIndex.from_existing(
    graph_store,
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    show_progress=True,
)

### Retriever

In [17]:
# Graph retriever over the index's property graph store. It reuses the shared
# embed_model rather than constructing another embedding client.
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    embed_model=embed_model,
    similarity_top_k=2,
    path_depth=1,
    include_text=True,
)

### Query

In [18]:
# Search text used by the retrievers below (surrounding newlines are part of the value).
query = "\nFundamental Theorem of Arithmetic\n"

### Print Nodes Response

In [20]:
# Run the graph retriever and list every hit with its content and metadata.
nodes = kg_retriever.retrieve(query)
print(len(nodes))
for position, hit in enumerate(nodes):
    content = hit.get_content()
    print(f">> IDX: {position}, {content}")
    print("metadata", hit.metadata)

5
>> IDX: 0, Here are some facts extracted from the provided text:

The Fundamental Theorem of Arithmetic ({'page': 22, 'name': 'The Fundamental Theorem of Arithmetic', 'course_id': '1', 'triplet_source_id': 'cdd15857-c8e5-49ec-a2d1-c356245020d4'}) -> HAS_OUTCOME ({'page': 22, 'course_id': '1', 'triplet_source_id': 'cdd15857-c8e5-49ec-a2d1-c356245020d4'}) -> The factorisation of composite numbers is unique apart from the order of prime factors ({'page': 22, 'name': 'The factorisation of composite numbers is unique apart from the order of prime factors', 'course_id': '1', 'triplet_source_id': 'cdd15857-c8e5-49ec-a2d1-c356245020d4'})
The Fundamental Theorem of Arithmetic ({'page': 22, 'name': 'The Fundamental Theorem of Arithmetic', 'course_id': '1', 'triplet_source_id': 'cdd15857-c8e5-49ec-a2d1-c356245020d4'}) -> HAS_OUTCOME ({'page': 22, 'course_id': '1', 'triplet_source_id': 'cdd15857-c8e5-49ec-a2d1-c356245020d4'}) -> Every composite number can be expressed as a product of primes ({'p

### Naive retriever for comparison

In [None]:
# Plain vector-similarity baseline built over the same chunked documents.
base_index = VectorStoreIndex.from_documents(
    documents=sub_docs,
    embed_model=embed_model,
)
base_retriever = base_index.as_retriever(similarity_top_k=2)
base_query_engine = RetrieverQueryEngine(retriever=base_retriever)

### Print naive retriever response

In [None]:
# Answer the query with the baseline engine; print() applies str() itself.
response = base_query_engine.query(query)
print(response)

### Custom retriever that combines the graph and naive approaches (returns unique nodes)

In [None]:
class CustomRetriever(BaseRetriever):
    """Retriever that merges knowledge-graph hits with plain vector hits.

    Both wrapped retrievers are queried with the same query bundle and the
    results are de-duplicated by ``node_id``.
    """

    def __init__(self, kg_retriever: BaseRetriever, vector_retriever: BaseRetriever):
        """Store the two wrapped retrievers and initialise the base class.

        Args:
            kg_retriever: Retriever backed by the property graph.
            vector_retriever: Retriever backed by the plain vector index.
        """
        self._kg_retriever = kg_retriever
        self._vector_retriever = vector_retriever
        # BaseRetriever keeps its own state; without this call that state is
        # never set up on the instance.
        super().__init__()

    def _retrieve(self, query_bundle) -> List[NodeWithScore]:
        """Retrieve nodes from both sources and return each node id once.

        Args:
            query_bundle: The query passed down by ``BaseRetriever.retrieve``.

        Returns:
            Graph hits followed by vector-only hits. When both sources return
            the same ``node_id``, the vector hit replaces the graph hit while
            keeping the graph hit's position.
        """
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # Dict insertion order preserves ranking; later updates overwrite the
        # value for an existing key without moving it.
        unique_nodes = {n.node_id: n for n in kg_nodes}
        unique_nodes.update({n.node_id: n for n in vector_nodes})
        return list(unique_nodes.values())

In [None]:
# Combine the graph retriever with the naive vector retriever.
custom_retriever = CustomRetriever(kg_retriever, base_retriever)

In [None]:
# Fetch the de-duplicated union of graph and vector hits for the query.
nodes = custom_retriever.retrieve(query)

### Using this *Retriever* as *Tool* for the **Agent**

In [None]:
# Wrap the combined retriever in a query engine and expose it as an agent tool.
kg_query_engine = RetrieverQueryEngine(retriever=custom_retriever)
kg_query_tool = QueryEngineTool(
    metadata=ToolMetadata(
        description="Use this tool to get contextual data from the book. Provided information about the concepts from Maths NCERT 10th grade. ",
        name="query_tool",
    ),
    query_engine=kg_query_engine,
)

# Function-calling agent limited to that single tool, one tool call at a time.
agent_worker = FunctionCallingAgentWorker.from_tools(
    tools=[kg_query_tool],
    allow_parallel_tool_calls=False,
    verbose=True,
    llm=llm,
)
agent = agent_worker.as_agent()

### Agent Query

In [None]:
# Prompt sent to the agent (surrounding newlines are part of the value).
agent_query = "\nExplain Theorem 1.2 with examples\n"

In [None]:
# Ask the agent and show its answer; print() applies str() itself.
response = agent.chat(agent_query)
print(response)