### Imports

In [1]:
import nest_asyncio
import os
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_parse import LlamaParse
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex
from llama_index.graph_stores.neo4j import Neo4jPGStore
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.indices.property_graph import VectorContextRetriever
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore
from typing import List
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RetrieverQueryEngine

In [2]:
# Load variables from a local .env file into the process environment; the
# os.getenv(...) calls further down read their keys and credentials from there.
load_dotenv()
# Patch asyncio so nested event loops are allowed.
# NOTE(review): presumably needed because the notebook kernel already runs a loop — confirm.
nest_asyncio.apply()

In [3]:
# OpenAI key taken from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### LLM & Embedding Model Setup

In [4]:
# Chat model and embedding model used throughout the notebook.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on the global LlamaIndex Settings object as well, in addition
# to the explicit `llm=` / `embed_model=` arguments passed in later cells.
Settings.llm = llm
Settings.embed_model = embed_model

### Parsing instruction

In [5]:
# Prompt handed to LlamaParse describing what to extract and how to tag it.
# This must be a raw string: the LaTeX examples contain backslashes, and in a
# normal string literal "\t" (from \text) becomes a TAB and "\f" (from \frac)
# becomes a FORM FEED, while "\p" and "\O" trigger invalid-escape warnings.
parsing_instruction = r"""
1. **Text**: Identify and extract all text content, including explanations, descriptions, and definitions.
2. **Image**: Identify and extract all images, ensuring to capture diagrams, illustrations, and visual representations.
3. **Graph**: Identify and extract all graphs, including line graphs, bar graphs, and any other graphical data representations.
4. **Table**: Identify and extract all tables, capturing data and structured information presented in tabular form.
5. **Practice Problems**: Identify and extract all practice problems, including exercises and questions for students to solve.
6. **Examples**: Identify and extract all worked examples that demonstrate how to solve problems or apply concepts.

#### Tagging Requirements:

For each extracted element, tag the following:

1. **Chapter**: The overarching division of the textbook in which the content is located.
2. **Concept**: The specific subsection or topic within a chapter that the content pertains to.
3. **Learning Outcome Name**: The most fundamental unit of learning, which is a combination of the concept, the cognitive ability (based on Bloom's taxonomy), and the context. For example:
    - **Learning Outcome**: "Calculating resistance using VI graph"
        - **Cognitive Ability**: Analyse
        - **Concept**: Calculating resistance
        - **Context**: VI graph

4. **Cognitive Ability**: The level of cognitive ability required, based on Bloom's taxonomy (e.g., Remember, Understand, Apply, Analyze, Evaluate, Create).

#### Special Instructions for Formulas and Equations:

- For any formulas or equations encountered, ensure they are provided in LaTeX format, enclosed in `$$`. For example:
    - Equation: $$ E = mc^2 $$
    - Formula: $$ \text{Area of a circle} = \pi r^2 $$

#### Additional Notes:

- **Multiple Concepts and Learning Outcomes**: Each page can have multiple concepts, and each concept can have multiple learning outcomes.
- **Contextual Awareness**: Ensure the context of each concept and learning outcome is accurately captured, reflecting the specific example, problem, or explanation provided.

#### Example of Tagged Content:

**Chapter**: Electricity  
**Concept**: Ohm's Law  
**Learning Outcome Name**: Calculating resistance using VI graph  
- **Cognitive Ability**: Analyze  
- **Context**: VI graph

**Text**: "Ohm's Law states that the current through a conductor between two points is directly proportional to the voltage across the two points."

**Image**: [Image of a circuit diagram]

**Graph**: [Graph showing Voltage vs. Current]

**Table**: 
| Voltage (V) | Current (I) |
|-------------|-------------|
| 1           | 0.2         |
| 2           | 0.4         |
| 3           | 0.6         |

**Practice Problems**:
1. "Calculate the resistance if the voltage is 5V and the current is 1A."

**Examples**:
- **Example 1**: "If the voltage across a resistor is 10V and the current is 2A, the resistance can be calculated using Ohm's Law as follows: $$ R = \frac{V}{I} = \frac{10V}{2A} = 5 \Omega $$"
"""

  parsing_instruction = """


### Adjust the target pages here (set to `None` to parse all pages)

In [6]:
# Comma-separated list of the page numbers handed to the parser (15 through 21).
# NOTE(review): LlamaParse documents page numbers as 0-based — confirm these are the intended pages.
target_pages = ",".join(str(page_number) for page_number in range(15, 22))

In [7]:
# Parse the selected pages of book.pdf with LlamaParse and return markdown.
# The custom parsing instruction is forwarded, and the vendor multimodal model
# "openai-gpt4o" is enabled using the OpenAI key read from the environment.
docs = LlamaParse(
  result_type="markdown", 
  target_pages=target_pages, 
  parsing_instruction=parsing_instruction,
  use_vendor_multimodal_model=True,
  vendor_multimodal_model_name="openai-gpt4o",
  vendor_multimodal_api_key=OPENAI_API_KEY,
).load_data("book.pdf")

Started parsing the file under job_id 70d31837-9fb7-4ef9-b1ec-df3aac31caa4


In [8]:
def get_sub_docs(docs):
    """Break every document into one sub-document per separator-delimited chunk.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.

    Returns:
        A flat list of ``Document`` objects, in input order. Each one holds a
        single chunk of text and its own deep copy of the parent's metadata.
    """
    page_separator = "\n---\n"
    return [
        Document(text=page_text, metadata=deepcopy(parent.metadata))
        for parent in docs
        for page_text in parent.text.split(page_separator)
    ]

In [9]:
# Split the parsed documents on the "---" separator into individual sub-documents.
sub_docs = get_sub_docs(docs)

In [10]:
# Neo4j-backed property graph store; connection details come from the environment.
graph_store = Neo4jPGStore(
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
    url=os.getenv("NEO4J_URL"),
    database=os.getenv("NEO4J_DATABASE"),
)
# No separate vector store is configured; this name is not referenced again in
# the visible cells.
vec_store = None



### Run this only the first time, when the graph has not been created yet

In [11]:
# Build the property graph from the per-page documents and store it in Neo4j.
# The shared `embed_model` defined in the setup cell is reused (instead of a
# fresh OpenAIEmbedding instance) so every cell embeds with the same model.
index = PropertyGraphIndex.from_documents(
    sub_docs,
    embed_model=embed_model,
    kg_extractors=[
        # Paths derived from existing node relationships.
        # NOTE(review): presumably no LLM calls are involved here — confirm.
        ImplicitPathExtractor(),
        # Paths extracted from the chunk text by an LLM, capped per chunk.
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    property_graph_store=graph_store,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 7/7 [00:00<00:00, 1629.76it/s]
Extracting implicit paths: 100%|██████████| 7/7 [00:00<00:00, 67963.26it/s]
Extracting paths from text: 100%|██████████| 7/7 [00:05<00:00,  1.33it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


### Run this if the data has already been converted into a graph

In [None]:
# Attach to the graph already stored in Neo4j instead of rebuilding it.
# The shared `embed_model` defined in the setup cell is reused (instead of a
# fresh OpenAIEmbedding instance) so every cell embeds with the same model.
index = PropertyGraphIndex.from_existing(
    graph_store,
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    show_progress=True,
)

### Retriever

In [12]:
# Retriever over the property graph: finds the top-2 most similar graph nodes
# and expands one hop around them, including the source text.
# The query must be embedded with the same model used at indexing time, so the
# shared `embed_model` from the setup cell is reused rather than rebuilt here.
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    embed_model=embed_model,
    similarity_top_k=2,
    path_depth=1,
    include_text=True,
)

### Query

In [13]:
# Search text shared by the retriever comparisons below; the surrounding
# newlines are part of the value.
query = "\nFundamental Theorem of Arithmetic\n"

### Print Nodes Response

In [14]:
# Run the graph retriever on the query, then print the hit count followed by
# the content of every returned node with its position.
nodes = kg_retriever.retrieve(query)
print(len(nodes))
for idx, node in enumerate(nodes):
    print(f">> IDX: {idx}, {node.get_content()}")

1
>> IDX: 0, Here are some facts extracted from the provided text:

Fundamental theorem of arithmetic ({'name': 'Fundamental theorem of arithmetic', 'triplet_source_id': '80a61887-b887-4251-ab71-980e4c30ca70'}) -> Is -> Part ({'name': 'Part', 'triplet_source_id': 'ea76798a-8466-4c34-9dd2-ecd5de2fe6ea'})
Fundamental theorem of arithmetic ({'name': 'Fundamental theorem of arithmetic', 'triplet_source_id': '80a61887-b887-4251-ab71-980e4c30ca70'}) -> Is -> Uniqueness part ({'name': 'Uniqueness part', 'triplet_source_id': 'ea76798a-8466-4c34-9dd2-ecd5de2fe6ea'})
Fundamental theorem of arithmetic ({'name': 'Fundamental theorem of arithmetic', 'triplet_source_id': '80a61887-b887-4251-ab71-980e4c30ca70'}) -> Is based on -> Proof ({'name': 'Proof', 'triplet_source_id': 'ea76798a-8466-4c34-9dd2-ecd5de2fe6ea'})
Fundamental theorem of arithmetic ({'name': 'Fundamental theorem of arithmetic', 'triplet_source_id': '80a61887-b887-4251-ab71-980e4c30ca70'}) -> Follows -> That p is one of the prime fact

### Naive retriever for comparison

In [15]:
# Baseline: a plain vector index over the same sub-documents, using the shared
# embedding model, with a top-2 retriever and a query engine on top of it.
base_index = VectorStoreIndex.from_documents(sub_docs, embed_model=embed_model)
base_retriever = base_index.as_retriever(similarity_top_k=2)
base_query_engine = RetrieverQueryEngine(base_retriever)

### Print naive retriever response

In [16]:
# Ask the baseline engine the same query and show its answer as text.
response = base_query_engine.query(query)
print(response)

The Fundamental Theorem of Arithmetic states that every composite number can be expressed as a product of prime numbers, and this factorization is unique, apart from the order in which the prime factors occur. This means that for any given composite number, there is only one way to write it as a product of primes, disregarding the sequence in which the primes are listed.


### Custom retriever that combines the graph and naive approaches (returns unique nodes)

In [17]:
class CustomRetriever(BaseRetriever):
    """Retriever that combines knowledge-graph search with direct vector search.

    Both wrapped retrievers receive the same query bundle; their results are
    merged and de-duplicated by ``node_id``.
    """

    def __init__(self, kg_retriever, vector_retriever):
        """Store the two retrievers to combine.

        Args:
            kg_retriever: Retriever queried first (the graph retriever here).
            vector_retriever: Retriever queried second (the plain vector retriever here).
        """
        self._kg_retriever = kg_retriever
        self._vector_retriever = vector_retriever
        # Run the base-class initialiser so the attributes it sets up exist
        # before retrieve() is called on this instance.
        super().__init__()

    def _retrieve(self, query_bundle) -> List[NodeWithScore]:
        """Retrieve nodes from both retrievers for the given query.

        Args:
            query_bundle: Query passed unchanged to both wrapped retrievers.

        Returns:
            Unique scored nodes: KG results first, then vector results not
            already present.
        """
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        vector_nodes = self._vector_retriever.retrieve(query_bundle)

        # Keyed by node_id so duplicates collapse; on a shared id the vector
        # result replaces the KG one while keeping the KG position.
        unique_nodes = {n.node_id: n for n in kg_nodes}
        unique_nodes.update({n.node_id: n for n in vector_nodes})
        return list(unique_nodes.values())

In [18]:
# Combine the graph retriever and the baseline vector retriever.
custom_retriever = CustomRetriever(kg_retriever, base_retriever)

In [19]:
# Retrieve with the combined retriever; this rebinds `nodes` from the earlier graph-only cell.
nodes = custom_retriever.retrieve(query)

### Using this *Retriever* as *Tool* for the **Agent**

In [20]:
# Wrap the combined retriever in a query engine and expose it to the agent as
# a tool named "query_tool".
kg_query_engine = RetrieverQueryEngine(custom_retriever)
kg_query_tool = QueryEngineTool(
    query_engine=kg_query_engine,
    metadata=ToolMetadata(
        name="query_tool",
        description="Use this tool to get contextual data from the book. Provided information about the concepts from Maths NCERT 10th grade. ",
    ),
)

# Function-calling agent backed by the gpt-4o `llm`, with this single tool;
# verbose logging is on and parallel tool calls are disabled.
agent_worker = FunctionCallingAgentWorker.from_tools(
    [kg_query_tool],
    llm=llm,
    verbose=True,
    allow_parallel_tool_calls=False,
)
agent = agent_worker.as_agent()

### Agent Query

In [29]:
# Question posed to the agent; the surrounding newlines are part of the value.
agent_query = "\nExplain Theorem 1.2 with examples\n"

In [30]:
# Send the question to the agent and show its final answer as text.
response = agent.chat(agent_query)
print(response)

Added user message to memory: 
Explain Theorem 1.2 with examples

=== Calling Function ===
Calling function: query_tool with args: {"input": "Explain Theorem 1.2 with examples from Maths NCERT 10th grade textbook"}
=== Function Output ===
Theorem 1.2 states: "Let \( p \) be a prime number. If \( p \) divides \( a^2 \), then \( p \) divides \( a \), where \( a \) is a positive integer."

To understand this theorem, let's consider an example. Suppose \( p = 3 \) and \( a = 6 \). According to the theorem, if 3 divides \( 6^2 \), then 3 must also divide 6.

First, calculate \( 6^2 \):
\[ 6^2 = 36 \]

Since 3 divides 36 (because \( 36 \div 3 = 12 \)), according to the theorem, 3 must also divide 6. Indeed, 3 divides 6 (because \( 6 \div 3 = 2 \)).

Another example can be with \( p = 5 \) and \( a = 10 \). If 5 divides \( 10^2 \), then 5 must also divide 10.

First, calculate \( 10^2 \):
\[ 10^2 = 100 \]

Since 5 divides 100 (because \( 100 \div 5 = 20 \)), according to the theorem, 5 must a