# 1. Preparation

## 1.1 Prepare for LLM

In [None]:
# %pip install llama-index-llms-azure-openai
# %pip install llama-index-graph-stores-nebula
# %pip install llama-index-llms-openai
# %pip install llama-index-embeddings-azure-openai

In [9]:
# For OpenAI

import os

# os.environ["OPENAI_API_KEY"] = "INSERT YOUR KEY"

import logging
import sys

# Emit log records at INFO level and above on stdout
# (switch to logging.DEBUG for more verbose output).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from llama_index.core import (
    KnowledgeGraphIndex,
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate,
    QueryBundle
)
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever

from llama_index.graph_stores.nebula import NebulaGraphStore

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from IPython.display import Markdown, display

from finllmqa.api.core import LLM_API_URL

from llama_index.core import Settings

# Chat model and embedding model, both served from the endpoint in LLM_API_URL.
llm = OpenAI(
    model="gpt-3.5-turbo",
    api_base=LLM_API_URL,
    api_key='null',
)
embed_model = OpenAIEmbedding(
    api_base=LLM_API_URL,
    api_key='null',
)

# Register both models on the shared LlamaIndex `Settings` object.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
# For Azure OpenAI

import os
import json
import openai
from langchain.embeddings import OpenAIEmbeddings
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex
)

from llama_index.core.storage.storage_context import StorageContext
from llama_index.graph_stores.nebula import NebulaGraphStore

import logging
import sys

from IPython.display import Markdown, display

from finllmqa.api.core import LLM_API_URL

# Emit log records at INFO level and above on stdout (logging.DEBUG for more verbose output).
# basicConfig already attaches a stdout StreamHandler to the root logger when it has
# none, so no second handler is added here: an extra one prints every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Module-level `openai` settings; the model objects below read their values from here.
openai.api_type = "azure"
openai.api_base = LLM_API_URL
openai.api_version = "2024-03-01"
# os.environ["OPENAI_API_KEY"] = "youcannottellanyone"
# openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = 'null'

# Connection details handed to the chat model through `model_kwargs`.
azure_connection_kwargs = {
    "api_key": openai.api_key,
    "api_base": openai.api_base,
    "api_type": openai.api_type,
    "api_version": openai.api_version,
}
llm = AzureOpenAI(
    engine="<foo-bar-deployment>",
    temperature=0,
    openai_api_version=openai.api_version,
    model_kwargs=azure_connection_kwargs,
)

# You need to deploy your own embedding model as well as your own chat completion model
langchain_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="<foo-bar-deployment>",
    openai_api_key=openai.api_key,
    openai_api_base=openai.api_base,
    openai_api_type=openai.api_type,
    openai_api_version=openai.api_version,
)
embedding_llm = LangchainEmbedding(langchain_embeddings, embed_batch_size=1)

# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embedding_llm,
# )

# set_global_service_context(service_context)

In [None]:
from llama_index.core import Settings

# Register the chat model and the embedding model on the shared `Settings` object.
# NOTE: `embedding_llm` is only defined by the "For Azure OpenAI" cell above; the
# "For OpenAI" cell names its embedding model `embed_model` and sets `Settings` itself.
Settings.llm = llm
Settings.embed_model = embedding_llm

## 1.2. Prepare for NebulaGraph as Graph Store


In [None]:
# %pip install nebula3-python ipython-ngql

In [16]:
# Connection settings for NebulaGraph. `setdefault` keeps any value that is already
# present in the environment, so real credentials can be supplied from outside the
# notebook; the literals below are only fallbacks for a default local install.
os.environ.setdefault('NEBULA_USER', "root")
os.environ.setdefault('NEBULA_PASSWORD', "nebula")  # default password
os.environ.setdefault('NEBULA_ADDRESS', "127.0.0.1:9669")

## 2. Build the Knowledge Graph and Persist

In this work, the Knowledge Graph is created with an LLM.

We do so by leveraging the `KnowledgeGraphIndex` from LlamaIndex: when the index is created, triplets are extracted by the LLM and eventually persisted into `NebulaGraphStore`.

### 2.1 Load Data

In [10]:
from llama_index.core import SimpleDirectoryReader

# change path to where you save the teaching resources
document_path = 'books/'
# Prefix every book file name with the directory to get the paths handed to the reader.
file_name_ls = [document_path + book_name for book_name in ['微观经济学.pdf']]

# Read the listed files into `documents`.
reader = SimpleDirectoryReader(input_files=file_name_ls)
documents = reader.load_data()

Exception ignored in: <function NebulaGraphStore.__del__ at 0x00000278ECF41AB0>
Traceback (most recent call last):
  File "e:\anaconda\app\envs\chatglm3\lib\site-packages\llama_index\graph_stores\nebula\base.py", line 242, in __del__
    self._session_pool.close()
AttributeError: 'NoneType' object has no attribute 'close'
Exception ignored in: <function NebulaGraphStore.__del__ at 0x00000278ECF41AB0>
Traceback (most recent call last):
  File "e:\anaconda\app\envs\chatglm3\lib\site-packages\llama_index\graph_stores\nebula\base.py", line 242, in __del__
    self._session_pool.close()
AttributeError: 'NoneType' object has no attribute 'close'


### 2.2 Split Documents

In [2]:
from llama_index.core.node_parser import SentenceSplitter


# Split the documents once per (chunk size, overlap) combination; the overlap is
# expressed as a fraction of the chunk size.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/8, 1/4]
split_document_dc = {}
size_overlap_pairs = [
    (size, int(size * pct))
    for size in chunk_size_ls
    for pct in chunk_overlap_pct_ls
]
for chunk_size, chunk_overlap in size_overlap_pairs:
    # The key encodes the configuration, e.g. "size_256_overlap_32".
    nodes_group = f'size_{chunk_size}_overlap_{chunk_overlap}'
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_document_dc[nodes_group] = splitter.get_nodes_from_documents(documents=documents)
    print(f'chunk_size: {chunk_size}; chunk_overlap: {chunk_overlap} len_chunks: {len(split_document_dc[nodes_group])}')

chunk_size: 256; chunk_overlap: 32 len_chunks: 2941
chunk_size: 256; chunk_overlap: 64 len_chunks: 3277
chunk_size: 512; chunk_overlap: 64 len_chunks: 1508
chunk_size: 512; chunk_overlap: 128 len_chunks: 1588
chunk_size: 1024; chunk_overlap: 128 len_chunks: 871
chunk_size: 1024; chunk_overlap: 256 len_chunks: 874


### 2.3 Extract Triplets and Save to NebulaGraph

In [None]:
# Few-shot prompt (in Chinese) for triplet extraction. It exposes two placeholders:
# {max_knowledge_triplets} and {text}.
kg_extract_template = PromptTemplate(
    """
    下面提供了一些文本。根据文本，提取最多 {max_knowledge_triplets} 个三元组的知识，形式为(实体,关系,实体)，具体可以是(主语,谓语,宾语)或者其他类型，注意避开停用词。
    请忽略page_label和file_path
    ---------------------
    示例：
    文本：小红是小明的母亲.
    三元组：
    (小红,是母亲,小明)
    文本:瑞幸是2017年在厦门创立的咖啡店。
    三元组：
    (瑞幸,是,咖啡店)
    (瑞幸,创立于,厦门)
    (瑞幸,创立于,2017)
    文本:在长期中，物价总水平会调整到使货币需求等于货币供给的水平。
    三元组：
    (物价总水平,长期调整使等于,货币需求等于货币供给的水平)
    ---------------------
    文本：{text}
    三元组："""
)

This cell will take some time: it extracts entities and relationships and stores them in NebulaGraph.

In [None]:
import time

# Build one knowledge-graph index per chunking configuration, each in its own
# NebulaGraph space, and persist it under ../storage/storage_graph/.
kg_index_ls = []

for nodes_group, nodes in split_document_dc.items():
    started_at = time.time()
    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    # Graph schema shared by the graph store and the index.
    # default, could be omit if create from an empty kg
    graph_schema = {
        "space_name": f"book_微观经济学_{nodes_group}",
        "edge_types": ["关系"],
        "rel_prop_names": ["关系"],
        "tags": ["实体"],
    }

    graph_store = NebulaGraphStore(**graph_schema)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    kg_index = KnowledgeGraphIndex(
        nodes=nodes,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        include_embeddings=True,
        kg_triple_extract_template=kg_extract_template,
        **graph_schema,
    )

    # Report the build time in whole minutes (persisting is not included).
    elapsed_min = (time.time() - started_at) // 60
    print(f'{nodes_group} takes {elapsed_min} min')
    kg_index_ls.append(kg_index)

    # store index
    kg_index.storage_context.persist(persist_dir=f'../storage/storage_graph/{nodes_group}')

## 3 Create VectorStoreIndex for RAG and Persist

To compare with/work together with VectorDB based RAG, let's also create a `VectorStoreIndex`.

During creation, the same data source is split into chunks and an embedding is created for each chunk. At RAG query time, the top-k most similar chunk embeddings are retrieved by vector search against the embedding of the question.

In [None]:
# Build one vector index per chunking configuration and persist it under
# ../storage/storage_vector/.
vector_index_ls = []

for nodes_group, nodes in split_document_dc.items():
    persist_dir = f'../storage/storage_vector/{nodes_group}'
    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    vector_index = VectorStoreIndex(nodes=nodes)
    vector_index_ls.append(vector_index)

    # store index
    vector_index.storage_context.persist(persist_dir=persist_dir)

## 4. Multithreading

In [None]:
%%writefile create_diff_chunk_index.py
import os
import logging
import sys

# NebulaGraph connection settings used by this script.
os.environ.update({
    'NEBULA_USER': "root",
    'NEBULA_PASSWORD': "nebula",  # default password
    'NEBULA_ADDRESS': "192.168.30.158:9669",
})

# Emit log records at INFO level and above on stdout
# (switch to logging.DEBUG for more verbose output).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from llama_index.core import (
    KnowledgeGraphIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate)
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from finllmqa.api.core import LLM_API_URL
from llama_index.core import Settings

# Chat model and embedding model, both served from the endpoint in LLM_API_URL.
llm = OpenAI(
    model="gpt-3.5-turbo",
    api_base=LLM_API_URL,
    api_key='null',
)
embed_model = OpenAIEmbedding(
    api_base=LLM_API_URL,
    api_key='null',
)

# Register both models on the shared LlamaIndex `Settings` object.
Settings.llm = llm
Settings.embed_model = embed_model

# change path to where you save the teaching resources
document_path = 'books/'
# Prefix every book file name with the directory to get the paths handed to the reader.
file_name_ls = [document_path + book_name for book_name in ['微观经济学.pdf']]

# Read the listed files into `documents`.
reader = SimpleDirectoryReader(input_files=file_name_ls)
documents = reader.load_data()

from llama_index.core.node_parser import SentenceSplitter


# Split the documents once per (chunk size, overlap) combination; the overlap is
# expressed as a fraction of the chunk size.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/8, 1/4]
split_document_dc = {}
size_overlap_pairs = [
    (size, int(size * pct))
    for size in chunk_size_ls
    for pct in chunk_overlap_pct_ls
]
for chunk_size, chunk_overlap in size_overlap_pairs:
    nodes_group = f'size_{chunk_size}_overlap_{chunk_overlap}'
    # Only split configurations whose graph storage directory does not exist yet.
    if not os.path.exists(f'../storage/storage_graph/{nodes_group}'):
        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        split_document_dc[nodes_group] = splitter.get_nodes_from_documents(documents=documents)
        print(f'chunk_size: {chunk_size}; chunk_overlap: {chunk_overlap} len_chunks: {len(split_document_dc[nodes_group])}')
# Show which configurations are left to process.
print(split_document_dc.keys())
from threading import Thread

def create_and_store_kg_index(nodes_group, nodes):
    """Build and persist the knowledge-graph and vector indexes for one node group.

    A ``KnowledgeGraphIndex`` is built over ``nodes`` with the few-shot triplet
    extraction prompt defined below, backed by the NebulaGraph space
    ``books_content_<nodes_group>``. A ``VectorStoreIndex`` is then built over
    the same nodes, and both storage contexts are persisted under
    ``../storage/``.

    Args:
        nodes_group: Label of the chunking configuration; used in the space
            name and in both persist directory names.
        nodes: Nodes to index.
    """
    prompt_text = """
    下面提供了一些文本。根据文本，提取最多 {max_knowledge_triplets} 个三元组的知识，形式为(实体,关系,实体)，具体可以是(主语,谓语,宾语)或者其他类型，注意避开停用词。
    请忽略page_label和file_path
    ---------------------
    示例：
    文本：小红是小明的母亲.
    三元组：
    (小红,是母亲,小明)
    文本:瑞幸是2017年在厦门创立的咖啡店。
    三元组：
    (瑞幸,是,咖啡店)
    (瑞幸,创立于,厦门)
    (瑞幸,创立于,2017)
    文本:在长期中，物价总水平会调整到使货币需求等于货币供给的水平。
    三元组：
    (物价总水平,长期调整使等于,货币需求等于货币供给的水平)
    ---------------------
    文本：{text}
    三元组："""
    triplet_prompt = PromptTemplate(prompt_text)

    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    # Graph schema shared by the graph store and the index.
    # default, could be omit if create from an empty kg
    graph_schema = {
        "space_name": f"books_content_{nodes_group}",
        "edge_types": ["relationship"],
        "rel_prop_names": ["relationship"],
        "tags": ["entity"],
    }

    graph_store = NebulaGraphStore(**graph_schema)
    kg_storage = StorageContext.from_defaults(graph_store=graph_store)
    kg_index = KnowledgeGraphIndex(
        nodes=nodes,
        storage_context=kg_storage,
        max_triplets_per_chunk=10,
        include_embeddings=True,
        kg_triple_extract_template=triplet_prompt,
        **graph_schema,
    )

    vector_index = VectorStoreIndex(nodes=nodes)

    # store index
    graph_dir = f'../storage/storage_graph/{nodes_group}'
    vector_dir = f'../storage/storage_vector/{nodes_group}'
    kg_index.storage_context.persist(persist_dir=graph_dir)
    vector_index.storage_context.persist(persist_dir=vector_dir)

# Start one worker thread per remaining chunking configuration.
for group_name, group_nodes in split_document_dc.items():
    Thread(target=create_and_store_kg_index, args=(group_name, group_nodes)).start()