# 1. Preparation

## 1.1 Prepare for LLM

In [None]:
# %pip install llama-index-llms-azure-openai
# %pip install llama-index-graph-stores-nebula
# %pip install llama-index-llms-openai
# %pip install llama-index-embeddings-azure-openai

In [None]:
# For OpenAI

import os

# os.environ["OPENAI_API_KEY"] = "INSERT YOUR KEY"

import logging
import sys

# Send root-logger output to stdout at INFO level.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output

from llama_index.core import (
    KnowledgeGraphIndex,
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate,
    QueryBundle
)
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever

from llama_index.graph_stores.nebula import NebulaGraphStore

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from IPython.display import Markdown, display

from finllmqa.api.core import LLM_API_URL

from llama_index.core import Settings

# The chat model and the embedding model share the endpoint given by
# LLM_API_URL; api_key is the literal string 'null'.
_endpoint_kwargs = {"api_base": LLM_API_URL, "api_key": 'null'}

llm = OpenAI(model="gpt-3.5-turbo", **_endpoint_kwargs)
embed_model = OpenAIEmbedding(**_endpoint_kwargs)

# Register both objects on the llama-index global Settings.
Settings.llm, Settings.embed_model = llm, embed_model

In [None]:
# For Azure OpenAI

import os
import json
import openai
from langchain.embeddings import OpenAIEmbeddings
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex
)

from llama_index.core.storage.storage_context import StorageContext
from llama_index.graph_stores.nebula import NebulaGraphStore

import logging
import sys

from IPython.display import Markdown, display

from finllmqa.api.core import LLM_API_URL

# Send log records to stdout at INFO level.  basicConfig() does nothing when
# the root logger already has handlers, so a stdout handler is attached
# explicitly -- but only when none is present yet.  Attaching it
# unconditionally makes every record print twice, plus one more copy each
# time this cell is re-run.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output
_root_logger = logging.getLogger()
if not any(
    isinstance(_handler, logging.StreamHandler)
    and getattr(_handler, "stream", None) is sys.stdout
    for _handler in _root_logger.handlers
):
    _root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))

# Module-level settings on the `openai` package; the same values are reused
# below when the LLM and embedding clients are constructed.
openai.api_type = "azure"
openai.api_base = LLM_API_URL
openai.api_version = "2024-03-01"
# os.environ["OPENAI_API_KEY"] = "youcannottellanyone"
# openai.api_key = os.getenv("OPENAI_API_KEY")
# The key is the literal string 'null'; the environment-variable variant above
# is left disabled.
openai.api_key = 'null'

# Connection settings handed to the model through `model_kwargs`.
_azure_model_kwargs = {
    "api_key": openai.api_key,
    "api_base": openai.api_base,
    "api_type": openai.api_type,
    "api_version": openai.api_version,
}

# "<foo-bar-deployment>" is a placeholder for the chat deployment name.
llm = AzureOpenAI(
    engine="<foo-bar-deployment>",
    temperature=0,
    openai_api_version=openai.api_version,
    model_kwargs=_azure_model_kwargs,
)

# You need to deploy your own embedding model as well as your own chat completion model
# ("<foo-bar-deployment>" is a placeholder for the embedding deployment name).
_langchain_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="<foo-bar-deployment>",
    openai_api_key=openai.api_key,
    openai_api_base=openai.api_base,
    openai_api_type=openai.api_type,
    openai_api_version=openai.api_version,
)

# Wrap the LangChain embeddings object for llama-index, one text per batch.
embedding_llm = LangchainEmbedding(_langchain_embeddings, embed_batch_size=1)

# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embedding_llm,
# )

# set_global_service_context(service_context)

In [None]:
from llama_index.core import Settings

# Register the current `llm` and `embedding_llm` on the llama-index global
# Settings; this replaces whatever Settings.llm / Settings.embed_model held
# before this cell ran.
Settings.llm = llm
Settings.embed_model = embedding_llm

## 1.2. Prepare for NebulaGraph as Graph Store


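❗ Use the NebulaGraph Console to **create the space** and **graph schema** before running the cells below. The example uses a space named `guardians`; the indexing code in section 2.3 connects to spaces named `books_content_<nodes_group>` (for example `books_content_size_256_overlap_32`), so create one space with this schema for each chunking configuration you index.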

```sql
CREATE SPACE guardians(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);
:sleep 10;
USE guardians;
CREATE TAG entity(name string);
CREATE EDGE relationship(relationship string);
:sleep 10;
CREATE TAG INDEX entity_index ON entity(name(256));
```

In [None]:
# %pip install nebula3-python ipython-ngql

In [None]:
# Connection settings for NebulaGraph, exported as environment variables.
# NOTE(review): user, password and host are hard-coded here. Presumably
# NebulaGraphStore picks them up from the environment (it is constructed later
# without explicit credentials) -- confirm, and consider supplying real
# credentials from outside the notebook.
os.environ['NEBULA_USER'] = "root"
os.environ['NEBULA_PASSWORD'] = "nebula" # default password
os.environ['NEBULA_ADDRESS'] = "192.168.30.158:9669" 

## 2. Build the Knowledge Graph and Persist

In my work, the Knowledge Graph was created with LLM.

We do this by leveraging the `KnowledgeGraphIndex` from LlamaIndex: when the index is created, triplets are extracted with the LLM and eventually persisted into `NebulaGraphStore`.

### 2.1 Load Data

In [None]:
from llama_index.core import SimpleDirectoryReader

# Folder that holds the teaching resources -- change it to where you saved them.
document_path = 'books/'
# Bare file names, each prefixed with the folder above.
file_name_ls = [f'{document_path}{book_name}' for book_name in ['微观经济学.pdf']]

reader = SimpleDirectoryReader(input_files=file_name_ls)
documents = reader.load_data()

### 2.2 Split Documents

In [None]:
from llama_index.core.node_parser import SentenceSplitter


# Chunking grid: every chunk size is combined with every overlap fraction.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/8, 1/4]

# Maps a label such as 'size_256_overlap_32' to the nodes produced with that setting.
split_document_dc = {}
for chunk_size in chunk_size_ls:
    # Overlap is expressed as a fraction of the chunk size and truncated to an int.
    overlap_ls = [int(chunk_size * pct) for pct in chunk_overlap_pct_ls]
    for chunk_overlap in overlap_ls:
        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        split_document = splitter.get_nodes_from_documents(documents=documents)
        nodes_group = f'size_{chunk_size}_overlap_{chunk_overlap}'
        split_document_dc[nodes_group] = split_document
        print(f'chunk_size: {chunk_size}; chunk_overlap: {chunk_overlap} len_chunks: {len(split_document)}')

### 2.3 Extract Triplets and Save to NebulaGraph

In [None]:
# Chinese-language extraction prompt. It asks for at most
# {max_knowledge_triplets} (entity, relation, entity) triplets from {text},
# tells the model to ignore page_label and file_path, and gives three worked
# examples. The text is passed straight into PromptTemplate.
kg_extract_template = PromptTemplate("""
    下面提供了一些文本。根据文本，提取最多 {max_knowledge_triplets} 个三元组的知识，形式为(实体,关系,实体)，具体可以是(主语,谓语,宾语)或者其他类型，注意避开停用词。
    请忽略page_label和file_path
    ---------------------
    示例：
    文本：小红是小明的母亲.
    三元组：
    (小红,是母亲,小明)
    文本:瑞幸是2017年在厦门创立的咖啡店。
    三元组：
    (瑞幸,是,咖啡店)
    (瑞幸,创立于,厦门)
    (瑞幸,创立于,2017)
    文本:在长期中，物价总水平会调整到使货币需求等于货币供给的水平。
    三元组：
    (物价总水平,长期调整使等于,货币需求等于货币供给的水平)
    ---------------------
    文本：{text}
    三元组：""")

This cell will take some time: it extracts entities and relationships and stores them in NebulaGraph.

In [None]:
import time
# One KnowledgeGraphIndex per chunking configuration, in the iteration order of split_document_dc.
kg_index_ls = []

for nodes_group, nodes in split_document_dc.items():
    start = time.time()
    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    # Each node group is written to its own NebulaGraph space.
    space_name = f"books_content_{nodes_group}"
    # Schema names below are the defaults; they could be omitted when creating from an empty KG.
    edge_types = ["relationship"]
    rel_prop_names = ["relationship"]
    tags = ["entity"]
    schema_kwargs = dict(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
    )

    graph_store = NebulaGraphStore(**schema_kwargs)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    kg_index = KnowledgeGraphIndex(
        nodes=nodes,
        storage_context=storage_context,
        max_triplets_per_chunk=10,
        include_embeddings=True,
        kg_triple_extract_template=kg_extract_template,
        **schema_kwargs,
    )

    # Elapsed time is reported in whole minutes (floor division).
    elapsed_min = (time.time() - start) // 60
    print(f'{nodes_group} takes {elapsed_min} min')
    kg_index_ls.append(kg_index)

    # store index
    kg_index.storage_context.persist(persist_dir=f'../storage/storage_graph/{nodes_group}')

#### Multithreading

In [None]:
%%writefile create_kg_index.py
import os
import logging
import sys

# Connection settings for NebulaGraph, exported as environment variables.
# NOTE(review): user, password and host are hard-coded here; presumably
# NebulaGraphStore reads them from the environment (it is constructed below
# without explicit credentials) -- confirm.
os.environ['NEBULA_USER'] = "root"
os.environ['NEBULA_PASSWORD'] = "nebula" # default password
os.environ['NEBULA_ADDRESS'] = "192.168.30.158:9669" 

# Send root-logger output to stdout at INFO level.
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output

from llama_index.core import (
    KnowledgeGraphIndex,
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate)
from llama_index.graph_stores.nebula import NebulaGraphStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from finllmqa.api.core import LLM_API_URL
from llama_index.core import Settings

# The chat model and the embedding model share the endpoint given by
# LLM_API_URL; api_key is the literal string 'null'.
_endpoint_kwargs = {"api_base": LLM_API_URL, "api_key": 'null'}

llm = OpenAI(model="gpt-3.5-turbo", **_endpoint_kwargs)
embed_model = OpenAIEmbedding(**_endpoint_kwargs)

# Register both objects on the llama-index global Settings.
Settings.llm, Settings.embed_model = llm, embed_model

# Folder that holds the teaching resources -- change it to where you saved them.
document_path = 'books/'
# Bare file names, each prefixed with the folder above.
file_name_ls = [f'{document_path}{book_name}' for book_name in ['微观经济学.pdf']]

reader = SimpleDirectoryReader(input_files=file_name_ls)
documents = reader.load_data()

from llama_index.core.node_parser import SentenceSplitter


# Chunking grid: every chunk size is combined with every overlap fraction.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/8, 1/4]

# Only node groups without an existing '../storage/storage_graph/<group>'
# directory are split and kept; groups that already have one are skipped.
split_document_dc = {}
for chunk_size in chunk_size_ls:
    for chunk_overlap in [int(chunk_size * pct) for pct in chunk_overlap_pct_ls]:
        nodes_group = f'size_{chunk_size}_overlap_{chunk_overlap}'
        already_persisted = os.path.exists(f'../storage/storage_graph/{nodes_group}')
        if not already_persisted:
            splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            split_document = splitter.get_nodes_from_documents(documents=documents)
            split_document_dc[nodes_group] = split_document
            print(f'chunk_size: {chunk_size}; chunk_overlap: {chunk_overlap} len_chunks: {len(split_document)}')
print(split_document_dc.keys())
from threading import Thread

def create_and_store_kg_index(nodes_group, nodes):
    """Build a knowledge-graph index for one node group and persist it.

    A KnowledgeGraphIndex is created from ``nodes`` with the Chinese triplet
    extraction prompt defined below, backed by a NebulaGraphStore on the space
    ``books_content_<nodes_group>``. The index's storage context is then
    persisted under ``../storage/storage_graph/<nodes_group>``.

    Args:
        nodes_group: Label of the chunking configuration; used in the space
            name and in the persist directory.
        nodes: Nodes handed to ``KnowledgeGraphIndex``.

    Returns:
        None.
    """
    # Prompt asking for at most {max_knowledge_triplets} triplets from {text}.
    extract_prompt = PromptTemplate("""
    下面提供了一些文本。根据文本，提取最多 {max_knowledge_triplets} 个三元组的知识，形式为(实体,关系,实体)，具体可以是(主语,谓语,宾语)或者其他类型，注意避开停用词。
    请忽略page_label和file_path
    ---------------------
    示例：
    文本：小红是小明的母亲.
    三元组：
    (小红,是母亲,小明)
    文本:瑞幸是2017年在厦门创立的咖啡店。
    三元组：
    (瑞幸,是,咖啡店)
    (瑞幸,创立于,厦门)
    (瑞幸,创立于,2017)
    文本:在长期中，物价总水平会调整到使货币需求等于货币供给的水平。
    三元组：
    (物价总水平,长期调整使等于,货币需求等于货币供给的水平)
    ---------------------
    文本：{text}
    三元组：""")

    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    # Schema names are the defaults; they could be omitted when creating from an empty KG.
    schema_kwargs = dict(
        space_name=f"books_content_{nodes_group}",
        edge_types=["relationship"],
        rel_prop_names=["relationship"],
        tags=["entity"],
    )

    store = NebulaGraphStore(**schema_kwargs)
    context = StorageContext.from_defaults(graph_store=store)
    index = KnowledgeGraphIndex(
        nodes=nodes,
        storage_context=context,
        max_triplets_per_chunk=10,
        include_embeddings=True,
        kg_triple_extract_template=extract_prompt,
        **schema_kwargs,
    )

    # store index
    index.storage_context.persist(persist_dir=f'../storage/storage_graph/{nodes_group}')

# Start one worker thread per node group, then wait for all of them.
# Without the join() calls, control returns to the caller (e.g. `%run` in a
# notebook) while extraction is still in progress, so nothing signals when the
# persisted indexes are complete.
threads = [
    Thread(target=create_and_store_kg_index, args=(nodes_group, nodes))
    for nodes_group, nodes in split_document_dc.items()
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

In [None]:
# Execute the script written by the previous cell inside this kernel.
# NOTE: after %run finishes, the script's top-level variables are copied into
# the notebook namespace, so `split_document_dc` here is replaced by the
# script's version, which leaves out node groups that already have a
# persisted graph index.
%run create_kg_index.py

## 3 Create VectorStoreIndex for RAG and Persist

To compare with/work together with VectorDB based RAG, let's also create a `VectorStoreIndex`.

During creation, the same data source is split into chunks and an embedding is created for each chunk. At RAG query time, the top-k most related chunks are retrieved by vector search against the embedding of the question.

In [None]:
# One VectorStoreIndex per node group, in the iteration order of split_document_dc.
vector_index_ls = []

for nodes_group in split_document_dc:
    nodes = split_document_dc[nodes_group]
    print(f'\n\nstart extract {nodes_group} nodes...\n\n')

    vector_index = VectorStoreIndex(nodes=nodes)
    vector_index_ls.append(vector_index)

    # store index
    persist_dir = f'../storage/storage_vector/{nodes_group}'
    vector_index.storage_context.persist(persist_dir=persist_dir)

### 6.5 Overall Comparison

Let's compare the results of them.

First, check the information that was covered by the different approaches:

In [None]:
# Ask the LLM to tabulate the facts covered by each of the four answers.
comparison_prompt = f"""
Compare the QA results on "Tell me about Peter Quill.", list the knowledge facts between them, to help evalute them. Output in markdown table.

Result text2GraphQuery: {response_nl2kg}
---
Result Graph: {response_graph_rag}
---
Result Vector: {response_vector_rag}
---
Result Graph+Vector: {response_graph_vector_rag}
---

"""

# llama-index LLM objects are not callable; complete() returns a response
# object whose `.text` attribute holds the generated string that Markdown()
# needs.
display(Markdown(llm.complete(comparison_prompt).text))

**Conclusion**

- The pure **KG** approaches (both text2GraphQuery and Graph RAG) come with **concise** results and much **lower cost** (for a cost comparison see our previous result [here](https://gpt-index.readthedocs.io/en/latest/examples/index_structs/knowledge_graph/KnowledgeGraphIndex_vs_VectorStoreIndex_vs_CustomIndex_combined.html#comparison-of-results)).
- The **Graph+Vector** RAG could be more **comprehensive** in case the question involves fine-grained knowledge that is **spread** across more chunks than top-K search retrieves.


| QueryEngine | Knowledge Graph query engine                                 | Graph RAG query engine                                       | Vector RAG query engine                                      | Graph Vector RAG query engine                                |
| ----------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| Mechanism   | 1. **Text-to-GraphQuery** based on KG<br />2. Query KG with the result<br />3. Answer synthesis based on query result | 1. Get related entities of the question<br />2. Get n-depth **SubGraphs** of related entities from KG<br />3. Answer synthesis based on related SubGraphs | 1. Create embedding of question<br />2. Semantic search **top-k related doc chunks**<br />3. Answer synthesis based on related doc chunks | 1. Do retrieval as Vector and Graph RAG <br />2. Answer synthesis based on **both related chunks and SubGraphs** |
| Performance | Concise                                                      | Concise                                                      | Fruitful                                                     | Fruitful, could be more comprehensive                        |
| Cost        | Low                                                          | Low                                                          | High                                                         | High                                                         |


**Conclusion**

For those tasks:

- The task potentially cares more about relational knowledge
- The KG schema is so sophisticated that the task is hard to express with text2cypher
- KG quality isn't good enough
- Multiple "starting entities" are involved

Graph RAG could be a better approach to start with.

## 7. Financial Evaluation on four types of engines

### 7.1 FinEval on query engines based on nodes of different chunk sizes and chunk overlaps

In [None]:
# Rebuild the node-group labels ('size_<chunk_size>_overlap_<chunk_overlap>')
# for every chunk size / overlap fraction combination, sizes outermost.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/8, 1/4]
nodes_group_ls = [
    f'size_{size}_overlap_{int(size * overlap_pct)}'
    for size in chunk_size_ls
    for overlap_pct in chunk_overlap_pct_ls
]
# Build the query engines from the index folders persisted earlier, for all
# node groups listed above.
# NOTE(review): get_all_query_engine_from_cache_index is defined outside this
# section; presumably it returns a mapping of query engines -- confirm its
# return shape against the helper's definition.
query_engine_dc = get_all_query_engine_from_cache_index(kg_index_folder_path='../storage/storage_graph',
                                                        vector_index_folder_path='../storage/storage_vector',
                                                        nodes_group=nodes_group_ls)

In [None]:
import os
import pandas as pd
import json
import time

choices = ["A", "B", "C", "D"]
eval_path = 'financial_eval'

def fineval(args, evaluator, take):
    """Run ``evaluator`` over every subject under ``eval_path`` and summarise accuracy.

    Subjects are discovered from the ``*_val.csv`` files in
    ``<eval_path>/data/val``. For each subject the validation split (or the
    test split, unless ``args.do_test is False``) is passed to
    ``evaluator.eval_subject``. Per-subject, per-group and overall scores are
    printed, and ``submission.json`` and ``summary.json`` are written to
    ``<args.output_dir>/take<take>``.

    Args:
        args: Run options, either an attribute-style object or a plain dict.
            Fields read: ``output_dir``, ``model_name``, ``do_test``,
            ``few_shot``, ``cot``, ``do_save_csv`` and, optionally,
            ``nodes_group`` (only printed).
        evaluator: Object providing ``eval_subject(subject_name, val_df,
            dev_df, save_result_dir=..., few_shot=..., cot=...)`` that returns
            ``(correct_ratio, answers)``. ``correct_ratio`` is treated as a
            percentage when the number of correct answers is derived.
        take: Index of this repetition; used in the output directory name.

    Returns:
        dict: one entry per subject (``score``, ``num``, ``correct``) plus
        ``'grouped'`` (per-group totals and score) and ``'All'``.

    Raises:
        AssertionError: if ``<eval_path>/subject_mapping.json`` is missing.
    """
    from types import SimpleNamespace

    # Accept a plain dict as well, since the fields are read by attribute below.
    if isinstance(args, dict):
        args = SimpleNamespace(**args)

    # Join with os.path so the result does not depend on eval_path ending in a separator.
    mapping_path = os.path.join(eval_path, "subject_mapping.json")
    assert os.path.exists(mapping_path), "subject_mapping.json not found!"
    with open(mapping_path, encoding="utf-8") as f:
        subject_mapping = json.load(f)

    # The same data directory is used for listing and for reading the CSV files.
    data_dir = os.path.join(eval_path, "data")
    val_suffix = "_val.csv"
    # Ignore stray files and sort for a deterministic subject order.
    subject_list = sorted(
        filename[:-len(val_suffix)]
        for filename in os.listdir(os.path.join(data_dir, "val"))
        if filename.endswith(val_suffix)
    )
    accuracy, summary = {}, {}

    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    save_result_dir = os.path.join(args.output_dir, f"take{take}")
    os.makedirs(save_result_dir, exist_ok=True)

    print(f'############# nodes group: {getattr(args, "nodes_group", None)} ###############')

    all_answers = {}
    for index, subject_name in enumerate(subject_list):
        print(
            f"{index / len(subject_list)} Inference starts at {run_date} on {args.model_name} with subject of {subject_name}!")
        val_file_path = os.path.join(data_dir, 'val', f'{subject_name}_val.csv')
        dev_file_path = os.path.join(data_dir, 'dev', f'{subject_name}_dev.csv')
        test_file_path = os.path.join(data_dir, 'test', f'{subject_name}_test.csv')

        val_df = pd.read_csv(val_file_path) if args.do_test is False else pd.read_csv(test_file_path)
        dev_df = pd.read_csv(dev_file_path) if args.few_shot else None

        correct_ratio, answers = evaluator.eval_subject(subject_name, val_df, dev_df,
                                                        save_result_dir=save_result_dir if args.do_save_csv else None,
                                                        few_shot=args.few_shot,
                                                        cot=args.cot,
                                                        )
        print(f"Subject: {subject_name}")
        print(f"Acc: {correct_ratio}")
        accuracy[subject_name] = correct_ratio
        summary[subject_name] = {"score": correct_ratio,
                                 "num": len(val_df),
                                 "correct": correct_ratio * len(val_df) / 100}
        all_answers[subject_name] = answers

    # Context managers close the files; UTF-8 is explicit because ensure_ascii=False
    # may write non-ASCII characters.
    with open(os.path.join(save_result_dir, 'submission.json'), 'w', encoding="utf-8") as f:
        json.dump(all_answers, f, ensure_ascii=False, indent=4)
    print("Accuracy:")
    for k, v in accuracy.items():
        print(k, ": ", v)

    total_num = 0
    total_correct = 0
    grouped = {
        "Accounting": {"correct": 0.0, "num": 0},
        "Finance": {"correct": 0.0, "num": 0},
        "Economy": {"correct": 0.0, "num": 0},
        "Certificate": {"correct": 0.0, "num": 0}
    }
    for subj, info in subject_mapping.items():
        # Subjects listed in the mapping but not evaluated have nothing to add.
        if subj not in summary:
            continue
        # The group name is the third element of each mapping entry.
        group_totals = grouped.setdefault(info[2], {"correct": 0.0, "num": 0})
        group_totals["num"] += summary[subj]['num']
        group_totals["correct"] += summary[subj]['correct']
    for group, info in grouped.items():
        # Groups without questions score 0.0 instead of dividing by zero.
        info['score'] = info["correct"] / info["num"] if info["num"] else 0.0
        total_num += info["num"]
        total_correct += info["correct"]
    summary['grouped'] = grouped
    summary['All'] = {"score": total_correct / total_num if total_num else 0.0,
                      "num": total_num,
                      "correct": total_correct}

    print('-' * 80)
    print("Accuracy_subject:")
    for k, v in accuracy.items():
        print(k, ": ", v)
    print('-' * 80)
    print("Accuracy_grouped:")
    for k, v in summary['grouped'].items():
        print(k, ": ", v['score'])

    print("Avg: ")
    print(summary['All']['score'])

    with open(os.path.join(save_result_dir, 'summary.json'), 'w', encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    return summary

In [None]:
from types import SimpleNamespace

# Evaluation options; every value is also exposed on `args` below.
cot = False
few_shot = False
ntrain = 5
n_times = 1
do_save_csv = False
# Joined with os.path so the folder lands inside eval_path instead of being
# glued onto its name.
output_dir = os.path.join(eval_path, 'output')
model_name = 'chatglm'
do_test = False

# The evaluation code reads these options by attribute (args.ntrain,
# args.output_dir, ...), which a plain dict does not support, so a namespace
# is used. `nodes_group` is included because fineval prints args.nodes_group;
# it carries the node-group labels built in the cell above.
args = SimpleNamespace(
    cot=cot,
    few_shot=few_shot,
    ntrain=ntrain,
    n_times=n_times,
    do_save_csv=do_save_csv,
    output_dir=output_dir,
    model_name=model_name,
    do_test=do_test,
    nodes_group=nodes_group_ls,
)

# Summary prompt (Chinese): reference context followed by the question.
# Placeholders: {context_str}, {query_str}.
tree_summary_template = (
    "从不同来源获取的参考信息如下:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "题目:{query_str}"
)

# Text-to-nGQL prompt. Placeholders: {schema}, {query_str}.
# NOTE(review): the example query contains literal braces ({name:"Peter Quill"});
# if this text is rendered with str.format-style substitution they may need to
# be doubled -- confirm against how the template is consumed.
graph_query_synthesis_prompt = """Task:Generate nGQL statement to query a Nebula graph database.
    Instructions:
    Use only the provided relationship types and properties in the schema.
    Do not use any other relationship types or properties that are not provided.
    Schema:
    {schema}
    Note: Do not include any explanations or apologies in your responses.
    Do not respond to any questions that might ask anything else than for you to construct a nGQL statement.
    Do not include any text except the generated nGQL statement.
    Examples: Here are a few examples of generated nGQL statements for particular questions:
    # Tell me about Peter Quill?
    MATCH (m:entity {name:"Peter Quill"})<-[:relationship]-()
    RETURN count(*) AS numberOfActors

    The question is:
    {query_str}"""

# Answer prompt (Chinese) that presents the generated graph query and its
# result as reference. Placeholders: {kg_query_str}, {kg_response_str}, {query_str}.
graph_response_answer_prompt = """
    原问题被转化成了查询语句，查询语句和查询结果将作为参考信息，如下:

    查询语句: {kg_query_str}
    查询结果: {kg_response_str}
    题目: {query_str}
    """

In [None]:
# One evaluator instance is shared by all repetitions.
evaluator = QueryEngineEvaluator(choices=choices, k=args.ntrain, model_name=args.model_name)

# Repeat the benchmark args.n_times times; the loop index is passed as `take`.
for take_idx in range(args.n_times):
    fineval(args, evaluator=evaluator, take=take_idx)