# 1. Preparation

## 1.1 Prepare for LLM

In [None]:
# %pip install llama-index-llms-azure-openai
# %pip install llama-index-graph-stores-nebula
# %pip install llama-index-llms-openai
# %pip install llama-index-embeddings-azure-openai

In [None]:
# For OpenAI
import os
import logging
import sys
from typing import List

# logging.basicConfig(
#     stream=sys.stdout, level=logging.INFO
# )  # logging.DEBUG for more verbose output

from llama_index.core import (
    KnowledgeGraphIndex,
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate,
    load_index_from_storage
)
from llama_index.core.query_engine import KnowledgeGraphQueryEngine

from llama_index.graph_stores.nebula import NebulaGraphStore

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from IPython.display import Markdown, display

from finllmqa.api.core import LLM_API_URL

from llama_index.core import Settings

# Chat model and embedding model both talk to the same api_base; the API key is
# the placeholder string 'null'.
# NOTE(review): LLM_API_URL is imported from finllmqa.api.core above but is not
# used here; the endpoint is hard-coded instead — confirm which one is intended.
llm = OpenAI(model="gpt-3.5-turbo", api_base='http://gemini2.sufe.edu.cn:27282/v1', api_key='null')
embed_model = OpenAIEmbedding(api_base='http://gemini2.sufe.edu.cn:27282/v1', api_key='null')

# Register both models on the llama_index Settings object so later cells do not
# have to pass them explicitly.
Settings.llm = llm
Settings.embed_model = embed_model

## 1.2. Prepare for NebulaGraph as Graph Store


In [None]:
# %pip install nebula3-python ipython-ngql

In [None]:
# Connection settings for NebulaGraph (presumably read by NebulaGraphStore from
# the environment — verify against the graph-store package).
# setdefault keeps any value that was already exported, so real credentials can
# be supplied from outside the notebook; the literals below are only the local
# defaults.
os.environ.setdefault('NEBULA_USER', "root")
os.environ.setdefault('NEBULA_PASSWORD', "nebula")  # default password
os.environ.setdefault('NEBULA_ADDRESS', "127.0.0.1:9669")

## 2. Load Llama Indexes from disk

In [None]:
from llama_index.core import load_index_from_storage

# Root folders of the persisted indexes; each holds one sub-folder per nodes group.
graph_storage_dir = os.path.join('..', 'storage', 'storage_graph')
vector_storage_dir = os.path.join('..', 'storage', 'storage_vector')

assert os.path.exists(graph_storage_dir), 'Do not have graph storage_context in disk'
assert os.path.exists(vector_storage_dir), 'Do not have vector storage_context in disk'

# The nodes groups are the sub-folders of the graph storage folder (not of the
# notebook's working directory). Sorted so the two index lists have a stable order.
entries = os.listdir(graph_storage_dir)
folders = sorted(entry for entry in entries if os.path.isdir(os.path.join(graph_storage_dir, entry)))

kg_index_ls = []
vector_index_ls = []
for nodes_group in folders:
    space_name = f"books_content_{nodes_group}"
    edge_types, rel_prop_names = ["relationship"], ["relationship"] # default, could be omit if create from an empty kg
    tags = ["entity"] # default, could be omit if create from an empty kg

    # Knowledge-graph index: persisted index data plus the NebulaGraph space.
    graph_store = NebulaGraphStore(
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
    )
    storage_context = StorageContext.from_defaults(
        persist_dir=os.path.join(graph_storage_dir, nodes_group),
        graph_store=graph_store,
    )
    kg_index = load_index_from_storage(
        storage_context=storage_context,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        include_embeddings=True,
    )
    kg_index_ls.append(kg_index)

    # Vector index for the same nodes group, read from the folder asserted above.
    storage_context_vector = StorageContext.from_defaults(
        persist_dir=os.path.join(vector_storage_dir, nodes_group)
    )
    vector_index = load_index_from_storage(
        storage_context=storage_context_vector
    )
    vector_index_ls.append(vector_index)

## 3. Prepare for different query approaches

We will compare three query approaches built on the LLM, the knowledge graph (KG) and the vector DB: Graph RAG, Vector RAG, and Graph+Vector RAG.


### 3.1 Graph RAG query engine

Graph RAG takes SubGraphs related to entities of the task/question as Context.


In [None]:
# Build one Graph RAG query engine per loaded knowledge-graph index, all with
# the same options.
kg_rag_engine_options = dict(
    include_text=False,
    retriever_mode="hybrid",
    response_mode="tree_summarize",
)

kg_rag_qg_ls = []
for kg_index in kg_index_ls:
    kg_rag_query_engine = kg_index.as_query_engine(**kg_rag_engine_options)
    kg_rag_qg_ls += [kg_rag_query_engine]

### 3.2 Vector RAG query engine

Vector RAG is the common approach: find the top-K semantically related document chunks and use them as context to synthesize the answer.

In [None]:
# Build one Vector RAG query engine per loaded vector index.
# The loop variable must be `vector_index`: with the previous misspelling the
# body kept using the last index left over from the loading cell, so every
# engine in the list pointed at the same index.
vector_rag_qg_ls = []
for vector_index in vector_index_ls:
    vector_rag_query_engine = vector_index.as_query_engine()
    vector_rag_qg_ls.append(vector_rag_query_engine)

### 3.3 Graph+Vector RAG query engine

This is a combined Graph+Vector Based RAG, where we will retrieve both VectorDB and KG SubGraphs as the context, for synthesis of the answer.

In LlamaIndex, we set `include_text=True` on the `KGTableRetriever` to combine the KG and vector contexts.

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Graph+Vector RAG engine for a single nodes group of the microeconomics book.
kg_vec_rag_qg_ls = []
nodes_group = 'size_512_overlap_64'
space_name = f"book_微观经济学_{nodes_group}"
edge_types, rel_prop_names = ["关系"], ["关系"] # default, could be omit if create from an empty kg
tags = ["实体"] # default, could be omit if create from an empty kg

graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context_kg = StorageContext.from_defaults(persist_dir='../storage/storage_graph' + f'/{nodes_group}', graph_store=graph_store)

# Load from the storage context created just above (not the `storage_context`
# left over from the loading loop in section 2).
kg_index = load_index_from_storage(
    storage_context=storage_context_kg,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    include_embeddings=True,
)
kg_vector_rag_query_engine = kg_index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize"
)
# Keep the engine in the list declared at the top of the cell, mirroring the
# Graph RAG and Vector RAG cells.
kg_vec_rag_qg_ls.append(kg_vector_rag_query_engine)

### 3.4 General function to load indexes from disk and build the query engines

In [None]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
def get_all_query_engine_from_cache_index(kg_index_folder_path, vector_index_folder_path, nodes_group: str|List[str]):
    if isinstance(nodes_group, str):
        nodes_group_ls = [nodes_group]
    else:
        nodes_group_ls = nodes_group
    query_engine_dc = {
        # 'nl2kg': {},
        'kg_rag': {},
        'vec_rag': {},
        'kg_vec_rag': {}
    }
    for nodes_group in nodes_group_ls:
        space_name = f"book_{nodes_group}"
        edge_types, rel_prop_names = ["关系"], ["关系"] # default, could be omit if create from an empty kg
        tags = ["实体"] # default, could be omit if create from an empty kg

        graph_store = NebulaGraphStore(
            space_name=space_name,
            edge_types=edge_types,
            rel_prop_names=rel_prop_names,
            tags=tags,
        )
        storage_context_kg = StorageContext.from_defaults(persist_dir=kg_index_folder_path + f'/{nodes_group}', graph_store=graph_store)
        kg_index = load_index_from_storage(
            storage_context=storage_context_kg,
            space_name=space_name,
            edge_types=edge_types,
            rel_prop_names=rel_prop_names,
            tags=tags,
            include_embeddings=True,
        )

        storage_context_vector = StorageContext.from_defaults(persist_dir=vector_index_folder_path + f'/{nodes_group}')
        vector_index = load_index_from_storage(
            storage_context=storage_context_vector
        )

        # # text2cypher query engine
        # nl2kg_query_engine = KnowledgeGraphQueryEngine(
        #     storage_context=storage_context_kg,
        #     verbose=True
        # )
        # query_engine_dc['nl2kg'].append(nl2kg_query_engine)
        
        # kg_rag query engine
        kg_rag_query_engine = kg_index.as_query_engine(
            include_text=False,
            response_mode="tree_summarize"
        )
        query_engine_dc['kg_rag'][nodes_group] = kg_rag_query_engine
        # vec_rag query engine
        vec_rag_query_engine = vector_index.as_query_engine(response_mode="tree_summarize")
        query_engine_dc['vec_rag'][nodes_group] =  vec_rag_query_engine
        # kg_vec_rag query engine
        kg_vector_rag_query_engine = kg_index.as_query_engine(
            include_text=True,
            response_mode="tree_summarize"
        )
        query_engine_dc['kg_vec_rag'][nodes_group] = kg_vector_rag_query_engine
    return query_engine_dc


## 4. Base Query with all the Engines

### 4.1 Text-to-GraphQuery

In [None]:
# Text-to-graph-query engine. It was referenced here without ever being created
# (its only construction was commented out in section 3.5), so build it from the
# KG storage context prepared in section 3.3.
nl2kg_query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context_kg,
    verbose=True,
)

response_nl2kg = nl2kg_query_engine.query("什么是经济学十大原理.")


display(Markdown(f"<b>{response_nl2kg}</b>"))

# Cypher:

print("Cypher Query:")

# Ask the engine for the graph query it generates for the question, then break
# it onto separate lines before the WHERE and RETURN clauses for readability.
graph_query = nl2kg_query_engine.generate_query(
    "什么是经济学十大原理",
)
graph_query = graph_query.replace("WHERE", "\n  WHERE").replace("RETURN", "\nRETURN")

display(
    Markdown(
        f"""
```cypher
{graph_query}
```
"""
    )
)

### 4.2 Graph RAG

In [None]:
# Graph RAG engine for the 'size_512_overlap_64' nodes group.
# NOTE(review): query_engine_dc is only assigned in section 5.1 further down, so
# on a fresh kernel that cell has to run before this one.
kg_rag_query_engine = query_engine_dc['kg_rag']['size_512_overlap_64']

In [None]:
# Ask the Graph RAG engine a sample question and render the answer in bold.
response_graph_rag = kg_rag_query_engine.query("生产要素分为哪几种")

display(Markdown(f"<b>{response_graph_rag}</b>"))

In [None]:
# Show the text of the first source node attached to the response.
print(response_graph_rag.source_nodes[0].text)

### 4.3 Vector RAG

In [None]:
# Vector RAG engine for the 'size_256_overlap_16' nodes group.
# NOTE(review): the Graph RAG and Graph+Vector cells use 'size_512_overlap_64';
# confirm the different group here is intentional. query_engine_dc itself is
# only assigned in section 5.1 further down.
vector_rag_query_engine = query_engine_dc['vec_rag']['size_256_overlap_16']

In [None]:
# Ask the Vector RAG engine the same sample question and render the answer in bold.
response_vector_rag = vector_rag_query_engine.query("生产要素分为哪几种")

display(Markdown(f"<b>{response_vector_rag}</b>"))

### 4.4 Graph + Vector RAG

In [None]:
# Graph+Vector RAG engine for the 'size_512_overlap_64' nodes group.
# NOTE(review): query_engine_dc is only assigned in section 5.1 further down.
graph_vector_rag_query_engine = query_engine_dc['kg_vec_rag']['size_512_overlap_64']

In [None]:
# Ask the Graph+Vector RAG engine a sample question and render the answer in bold.
response_graph_vector_rag = graph_vector_rag_query_engine.query("厂商的要素需求曲线向右下方倾斜的原因在于？")

display(Markdown(f"<b>{response_graph_vector_rag}</b>"))

In [None]:
# Show the text of the first source node attached to the response.
print(response_graph_vector_rag.source_nodes[0].text)

## 5. Financial Evaluation

In [None]:
import os
import pandas as pd
import json
import time

choices = ["A", "B", "C", "D"]
eval_path = ''

def fineval(args, evaluator, take, subject: str | List[str] = None):
    assert 'nodes_group' in args.keys(), 'you must assign nodes_group in args!'
    nodes_group = args['nodes_group']
    if subject is not None:
        if isinstance(subject, str):
            subject_list = [subject]
        else:
            subject_list = subject
    assert os.path.exists(eval_path + "subject_mapping.json"), "subject_mapping.json not found!"
    with open(eval_path+ "subject_mapping.json") as f:
        subject_mapping = json.load(f)
    if subject_list is not None:
        subject_mapping_tmp = subject_mapping.copy()
        for subject in subject_mapping_tmp.keys():
            if subject not in subject_list:
                del subject_mapping[subject]
    filenames = os.listdir(eval_path + "data/val")
    subject_list = [val_file.replace("_val.csv", "") for val_file in filenames if val_file.replace("_val.csv", "") in subject_list]
    accuracy, summary = {}, {}

    run_date = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    output_dir = args['output_dir']
    save_result_dir = os.path.join(output_dir, take)
    if not os.path.exists(save_result_dir):
        os.makedirs(save_result_dir, exist_ok=True)

    print(f"############# nodes group: {nodes_group} ###############")

    all_answers = {}
    for index, subject_name in enumerate(subject_list):
        print(
            f"{index / len(subject_list)} Inference starts at {run_date} on {args['model_name']} with subject of {subject_name}!")
        val_file_path = os.path.join('data/val', f'{subject_name}_val.csv')
        dev_file_path = os.path.join('data/dev', f'{subject_name}_dev.csv')
        test_file_path = os.path.join('data/test', f'{subject_name}_test.csv')

        val_df = pd.read_csv(val_file_path) if args['do_test'] is False else pd.read_csv(test_file_path)
        dev_df = pd.read_csv(dev_file_path) if args['few_shot'] else None

        correct_ratio, answers = evaluator.eval_subject(subject_name, val_df, dev_df,
                                                        save_result_dir=save_result_dir if args['do_save_csv'] else None,
                                                        few_shot=args['few_shot'],
                                                        cot=args['cot'],
                                                        )
        print(f"Subject: {subject_name}")
        print(f"Acc: {correct_ratio}")
        accuracy[subject_name] = correct_ratio
        summary[subject_name] = {"score": correct_ratio,
                                 "num": len(val_df),
                                 "correct": correct_ratio * len(val_df) / 100}
        all_answers[subject_name] = answers

    # json.dump(all_answers, open(save_result_dir + f'/{nodes_group}_submission.json', 'w'), ensure_ascii=False, indent=4)
    print("Accuracy:")
    for k, v in accuracy.items():
        print(k, ": ", v)

    total_num = 0
    total_correct = 0
    summary['grouped'] = {
        "Accounting": {"correct": 0.0, "num": 0},
        "Finance": {"correct": 0.0, "num": 0},
        "Economy": {"correct": 0.0, "num": 0},
        "Certificate": {"correct": 0.0, "num": 0}
    }
    for subj, info in subject_mapping.items():
        group = info[2]
        summary['grouped'][group]["num"] += summary[subj]['num']
        summary['grouped'][group]["correct"] += summary[subj]['correct']
    for group, info in summary['grouped'].items():
        info['score'] = info["correct"] / info["num"] if info["num"] != 0 else 0
        total_num += info["num"]
        total_correct += info["correct"]
    summary['All'] = {"score": total_correct / total_num, "num": total_num, "correct": total_correct}

    print('-' * 80)
    print("Accuracy_subject:")
    for k, v in accuracy.items():
        print(k, ": ", v)
    print('-' * 80)
    print("Accuracy_grouped:")
    for k, v in summary['grouped'].items():
        print(k, ": ", v['score'])

    print("Avg: ")
    print(summary['All']['score'])

    json.dump(summary, open(save_result_dir + f'/{nodes_group}_summary.json', 'w'), ensure_ascii=False, indent=2)
    return summary

In [None]:
# Default evaluation options; the evaluation cells below overwrite 'cot',
# 'few_shot' and 'nodes_group' in `args` for each run.
cot, few_shot = False, False      # prompt switches
ntrain, n_times = 5, 1            # ntrain is handed to the evaluators as k
do_save_csv, do_test = False, False
output_dir = eval_path + 'output'
model_name = 'chatglm'

args = {
    'cot': cot,
    'few_shot': few_shot,
    'ntrain': ntrain,
    'n_times': n_times,
    'do_save_csv': do_save_csv,
    'output_dir': output_dir,
    'model_name': model_name,
    'do_test': do_test,
}

In [None]:
# The parameter groups are compared on a single subject; 'microeconomics' is
# used as the example.
subject = 'microeconomics'

# Collects one summary record (a dict) per evaluated configuration.
all_summary_records = []


#### ChatGLM3-6B baseline

In [None]:
# Baseline summaries keyed by prompt type; filled by the evaluation cell below
# or reloaded from disk by the cell after it.
benchmark_summary_dc = {}

In [None]:
from openai_evaluator import OpenAI_Evaluator
# Baseline run: evaluate the bare model for every cot / few-shot combination.
nodes_group = 'benchmark'
for cot in (False, True):
    for few_shot in (False, True):
        args.update(cot=cot, few_shot=few_shot, nodes_group=nodes_group)
        evaluator = OpenAI_Evaluator(choices=choices, k=args['ntrain'], model_name=args['model_name'])
        # The prompt type names the output folder, e.g. 'no_cot_but_few_shot'.
        cot_prefix = 'cot_' if cot else 'no_cot_'
        joiner = 'and_' if cot == few_shot else 'but_'
        shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
        prompt_type = cot_prefix + joiner + shot_suffix
        take = f'{nodes_group}/{prompt_type}'
        benchmark_summary = fineval(args=args, evaluator=evaluator, take=take,
                                    subject=subject)
        benchmark_summary_dc[prompt_type] = benchmark_summary

In [None]:
nodes_group = 'benchmark'

# If the baseline was not evaluated in this session, reload the saved summaries.
if len(benchmark_summary_dc) == 0:
    for cot in (False, True):
        for few_shot in (False, True):
            cot_prefix = 'cot_' if cot else 'no_cot_'
            joiner = 'and_' if cot == few_shot else 'but_'
            shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
            prompt_type = cot_prefix + joiner + shot_suffix
            take = f'{nodes_group}/{prompt_type}'
            summary_file = f"{os.path.join(args['output_dir'], take)}/{nodes_group}_summary.json"
            with open(summary_file) as f:
                benchmark_summary = json.load(f)
            benchmark_summary_dc[prompt_type] = benchmark_summary

# One record for the baseline: the subject score under each prompt type.
benchmark_summary_record = dict(method=nodes_group, nodes_group=nodes_group, retrieve_mode='not retrieve')
for cot in (False, True):
    for few_shot in (False, True):
        cot_prefix = 'cot_' if cot else 'no_cot_'
        joiner = 'and_' if cot == few_shot else 'but_'
        shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
        prompt_type = cot_prefix + joiner + shot_suffix
        benchmark_summary_record[prompt_type] = benchmark_summary_dc[prompt_type][subject]['score']
all_summary_records.append(benchmark_summary_record)

### 5.1 Compare different query engines based on retrieve mode, chunk size and chunk overlap

#### Query Engine

In [None]:
# Every chunk size is paired with every overlap ratio; the overlap is the chunk
# size times the ratio, truncated to an int.
chunk_size_ls = [256, 512, 1024]
chunk_overlap_pct_ls = [1/16, 1/8]

nodes_group_ls = []
for chunk_size in chunk_size_ls:
    for chunk_overlap_pct in chunk_overlap_pct_ls:
        chunk_overlap = int(chunk_size * chunk_overlap_pct)
        nodes_group = 'size_{}_overlap_{}'.format(chunk_size, chunk_overlap)
        nodes_group_ls += [nodes_group]

# Load the persisted indexes of every nodes group and build all query engines.
query_engine_dc = get_all_query_engine_from_cache_index(
    kg_index_folder_path='../storage/storage_graph',
    vector_index_folder_path='../storage/storage_vector',
    nodes_group=nodes_group_ls,
)

In [None]:
# Prompt template for the summary step: the retrieved context between two rules,
# followed by the question.
tree_summary_template = "\n".join([
    "从不同来源获取的参考信息如下:",
    "---------------------",
    "{context_str}",
    "---------------------",
    "题目:{query_str}",
])

# A single user message that carries the template.
prompt_dict = {
    'summary_template': [
        {'role': 'user', 'content': tree_summary_template},
    ],
}

In [None]:
# Nested results filled by the next cell:
# engine type -> prompt type -> retrieve mode -> nodes group -> summary.
query_engine_summary_dc = {}

In [None]:
from query_engine_evaluator import QueryEngineEvaluator
# Evaluate every engine type under each prompt type, retrieve mode and nodes group.
for query_engine_type in query_engine_dc.keys():
    summaries_by_prompt = query_engine_summary_dc.setdefault(query_engine_type, {})
    for cot in (False, True):
        for few_shot in (False, True):
            cot_prefix = 'cot_' if cot else 'no_cot_'
            joiner = 'and_' if cot == few_shot else 'but_'
            shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
            prompt_type = cot_prefix + joiner + shot_suffix
            summaries_by_mode = summaries_by_prompt.setdefault(prompt_type, {})
            for retrieve_mode in ('retrieve_only_question', 'retrieve_with_choices'):
                summaries_by_group = summaries_by_mode.setdefault(retrieve_mode, {})
                for nodes_group in nodes_group_ls:
                    summaries_by_group.setdefault(nodes_group, {})
                    args.update(nodes_group=nodes_group, cot=cot, few_shot=few_shot)
                    query_engine = query_engine_dc[query_engine_type][nodes_group]
                    retrieve_choice = (retrieve_mode == 'retrieve_with_choices')
                    evaluator = QueryEngineEvaluator(
                        query_engine=query_engine,
                        prompt_dict=prompt_dict,
                        retrieve_choice=retrieve_choice,
                        choices=choices,
                        k=args['ntrain'],
                        model_name=args['model_name'],
                    )

                    # Output folder; fineval names the file after the nodes group.
                    take = '/'.join([query_engine_type, prompt_type, retrieve_mode])
                    summary = fineval(args=args, evaluator=evaluator, take=take, subject=subject)
                    summaries_by_group[nodes_group] = summary

In [None]:
# If nothing was evaluated in this session, reload the saved summaries from disk.
if not query_engine_summary_dc:
    for query_engine_type in query_engine_dc.keys():
        summaries_by_prompt = query_engine_summary_dc.setdefault(query_engine_type, {})
        for cot in [False, True]:
            for few_shot in [False, True]:
                prompt_type = ('' if cot else 'no_') + 'cot_' + \
                        (('and_' if cot else 'but_') if few_shot else ('but_' if cot else 'and_') + 'no_') + 'few_shot'
                summaries_by_mode = summaries_by_prompt.setdefault(prompt_type, {})
                for retrieve_mode in ['retrieve_only_question', 'retrieve_with_choices']:
                    summaries_by_group = summaries_by_mode.setdefault(retrieve_mode, {})
                    for nodes_group in nodes_group_ls:
                        take = f"{query_engine_type}/{prompt_type}/{retrieve_mode}"
                        with open(f"{args['output_dir']}/{take}/{nodes_group}_summary.json") as f:
                            summary = json.load(f)
                        summaries_by_group[nodes_group] = summary

# Flatten the nested summaries into one record per (engine, retrieve mode,
# nodes group). prompt_type_ls lists each prompt-type column once; appending on
# every pass used to repeat the same four names once per record.
prompt_type_ls = []
for query_engine_type in query_engine_dc.keys():
    for retrieve_mode in ['retrieve_only_question', 'retrieve_with_choices']:
        for nodes_group in nodes_group_ls:
            query_engine_summary_record = {'method': query_engine_type, 'nodes_group': nodes_group, 'retrieve_mode': retrieve_mode}
            for cot in [False, True]:
                for few_shot in [False, True]:
                    prompt_type = ('' if cot else 'no_') + 'cot_' + \
                            (('and_' if cot else 'but_') if few_shot else ('but_' if cot else 'and_') + 'no_') + 'few_shot'
                    if prompt_type not in prompt_type_ls:
                        prompt_type_ls.append(prompt_type)
                    query_engine_summary_record[prompt_type] = \
                        query_engine_summary_dc[query_engine_type][prompt_type][retrieve_mode][nodes_group][subject]['score']
            all_summary_records.append(query_engine_summary_record)

In [None]:
import numpy as np
# One row per summary record, plus the row-wise mean and max over the
# prompt-type score columns.
df_all_summary = pd.DataFrame(all_summary_records)
prompt_scores = df_all_summary[prompt_type_ls]
df_all_summary['Avg'] = prompt_scores.mean(axis=1)
df_all_summary['Max'] = prompt_scores.max(axis=1)

**Compare different chunk size and chunk overlap**

In [None]:
# Average the scores per method, keeping only the baseline rows and the rows
# retrieved with the answer choices.
compared_modes = ['not retrieve', 'retrieve_with_choices']
(
    df_all_summary[df_all_summary['retrieve_mode'].isin(compared_modes)]
    .drop(['retrieve_mode', 'nodes_group'], axis=1)
    .groupby(['method'])
    .mean()
    .round(2)
    .reset_index(drop=False)
)

### 5.2 Compare retriever parameters

**Graph RAG**

In [None]:
# Reload the saved baseline summaries and build the baseline record for the
# subgraph-size comparison in a single pass.
nodes_group = 'benchmark'
benchmark_summary_dc = {}
benchmark_summary_record = dict(method=nodes_group, subgraph_size='no_subgraph')
for cot in (False, True):
    for few_shot in (False, True):
        cot_prefix = 'cot_' if cot else 'no_cot_'
        joiner = 'and_' if cot == few_shot else 'but_'
        shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
        prompt_type = cot_prefix + joiner + shot_suffix
        take = f'{nodes_group}/{prompt_type}'
        summary_file = f"{os.path.join(args['output_dir'], take)}/{nodes_group}_summary.json"
        with open(summary_file) as f:
            benchmark_summary = json.load(f)
        benchmark_summary_dc[prompt_type] = benchmark_summary
        benchmark_summary_record[prompt_type] = benchmark_summary[subject]['score']
all_summary_records.append(benchmark_summary_record)

In [None]:
# Load the KG index of the microeconomics book (512 / 64 chunking).
nodes_group = '微观经济学'
space_name = "book_" + nodes_group + "_size_512_overlap_64"
edge_types = ["关系"]       # default, may be omitted when creating from an empty kg
rel_prop_names = ["关系"]   # default, may be omitted when creating from an empty kg
tags = ["实体"]             # default, may be omitted when creating from an empty kg

# The same space and schema names go to the graph store and the index loader.
nebula_schema = dict(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
graph_store = NebulaGraphStore(**nebula_schema)
storage_context_kg = StorageContext.from_defaults(
    persist_dir='../storage/storage_graph/' + nodes_group,
    graph_store=graph_store,
)
kg_index = load_index_from_storage(
    storage_context=storage_context_kg,
    include_embeddings=True,
    **nebula_schema,
)


In [None]:
# Prompt template for the summary step: the retrieved context between two rules,
# followed by the question.
tree_summary_template = "\n".join([
    "从不同来源获取的参考信息如下:",
    "---------------------",
    "{context_str}",
    "---------------------",
    "题目:{query_str}",
])

# A single user message that carries the template.
prompt_dict = {
    'summary_template': [
        {'role': 'user', 'content': tree_summary_template},
    ],
}

# Start the subgraph-size comparison with an empty result store.
query_engine_summary_dc = dict()

In [None]:
from query_engine_evaluator import QueryEngineEvaluator
# (depth, breadth) pairs to compare; breadth shrinks as depth grows.
subgraph_settings = list(zip([1, 2, 3, 4], [60, 30, 20, 15]))

# One Graph RAG engine per subgraph size.
query_engine_dc = {}
for depth, breadth in subgraph_settings:
    subgraph_size = f'depth_{depth}_breadth_{breadth}'
    # kg_rag query engine
    kg_rag_query_engine = kg_index.as_query_engine(
        include_text=False,
        graph_store_query_depth=depth,
        max_knowledge_sequence=breadth,
        response_mode="tree_summarize"
    )
    query_engine_dc[subgraph_size] = kg_rag_query_engine

# Evaluate each engine under every cot / few-shot combination.
for depth, breadth in subgraph_settings:
    subgraph_size = f'depth_{depth}_breadth_{breadth}'
    summaries_by_prompt = query_engine_summary_dc.setdefault(subgraph_size, {})
    for cot in [False, True]:
        for few_shot in [False, True]:
            prompt_type = ('' if cot else 'no_') + 'cot_' + \
                    (('and_' if cot else 'but_') if few_shot else ('but_' if cot else 'and_') + 'no_') + 'few_shot'
            summaries_by_prompt.setdefault(prompt_type, {})
            # fineval names the summary file after args['nodes_group'].
            args['nodes_group'] = subgraph_size
            args['cot'] = cot
            args['few_shot'] = few_shot
            query_engine = query_engine_dc[subgraph_size]
            # retrieve_choice is a real bool here, as in section 5.1 (it used to
            # be the string 'True').
            evaluator = QueryEngineEvaluator(query_engine=query_engine, prompt_dict=prompt_dict, retrieve_choice=True,
                                            choices=choices, k=args['ntrain'], model_name=args['model_name'])

            take = f'kg_rag/{prompt_type}/subgraph_size'
            summary = fineval(args=args, evaluator=evaluator, take=take, subject=subject)
            summaries_by_prompt[prompt_type] = summary

**Vector RAG**

In [None]:
# Load the persisted vector index of the microeconomics book.
# No graph store is attached: the other vector-index loads in this notebook do
# not pass one either, and dropping it removes this cell's dependency on the
# `graph_store` variable created in the Graph RAG cell.
nodes_group = '微观经济学'
storage_context_vec = StorageContext.from_defaults(persist_dir=f'../storage/storage_vector/{nodes_group}')
vector_index = load_index_from_storage(
    storage_context=storage_context_vec
)

In [None]:
# Prompt template for the summary step: the retrieved context between two rules,
# followed by the question.
tree_summary_template = "\n".join([
    "从不同来源获取的参考信息如下:",
    "---------------------",
    "{context_str}",
    "---------------------",
    "题目:{query_str}",
])

# A single user message that carries the template.
prompt_dict = {
    'summary_template': [
        {'role': 'user', 'content': tree_summary_template},
    ],
}

# Start the top-k comparison with an empty result store.
query_engine_summary_dc = dict()

In [None]:
from query_engine_evaluator import QueryEngineEvaluator
# similarity_top_k values to compare.
similarity_top_k_ls = [3, 4, 5, 6]

# One Vector RAG engine per top-k value.
query_engine_dc = {}
for similarity_top_k in similarity_top_k_ls:
    top_k = f'top_{similarity_top_k}'
    vec_rag_query_engine = vector_index.as_query_engine(
        similarity_top_k = similarity_top_k,
        response_mode="tree_summarize"
    )
    query_engine_dc[top_k] = vec_rag_query_engine

# Evaluate each engine under every cot / few-shot combination.
for similarity_top_k in similarity_top_k_ls:
    top_k = f'top_{similarity_top_k}'
    summaries_by_prompt = query_engine_summary_dc.setdefault(top_k, {})
    for cot in [False, True]:
        for few_shot in [False, True]:
            prompt_type = ('' if cot else 'no_') + 'cot_' + \
                    (('and_' if cot else 'but_') if few_shot else ('but_' if cot else 'and_') + 'no_') + 'few_shot'
            summaries_by_prompt.setdefault(prompt_type, {})
            # fineval names the summary file after args['nodes_group']. This
            # must be the current top_k label: the stale `subgraph_size` from
            # the Graph RAG cell made every run write the same file.
            args['nodes_group'] = top_k
            args['cot'] = cot
            args['few_shot'] = few_shot
            query_engine = query_engine_dc[top_k]
            # retrieve_choice is a real bool here, as in section 5.1 (it used to
            # be the string 'True').
            evaluator = QueryEngineEvaluator(query_engine=query_engine, prompt_dict=prompt_dict, retrieve_choice=True,
                                            choices=choices, k=args['ntrain'], model_name=args['model_name'])

            take = f'vec_rag/{prompt_type}/top_k'
            summary = fineval(args=args, evaluator=evaluator, take=take, subject=subject)
            summaries_by_prompt[prompt_type] = summary

In [None]:
# Start a fresh record list for the top-k comparison table.
all_summary_records = []

In [None]:
# Reload the saved baseline summaries and build the baseline record for the
# top-k comparison in a single pass.
nodes_group = 'benchmark'
benchmark_summary_dc = {}
benchmark_summary_record = dict(method=nodes_group, top_k='0')
for cot in (False, True):
    for few_shot in (False, True):
        cot_prefix = 'cot_' if cot else 'no_cot_'
        joiner = 'and_' if cot == few_shot else 'but_'
        shot_suffix = 'few_shot' if few_shot else 'no_few_shot'
        prompt_type = cot_prefix + joiner + shot_suffix
        take = f'{nodes_group}/{prompt_type}'
        summary_file = f"{os.path.join(args['output_dir'], take)}/{nodes_group}_summary.json"
        with open(summary_file) as f:
            benchmark_summary = json.load(f)
        benchmark_summary_dc[prompt_type] = benchmark_summary
        benchmark_summary_record[prompt_type] = benchmark_summary[subject]['score']
all_summary_records.append(benchmark_summary_record)

In [None]:
# One record per top-k value with the subject score under each prompt type.
nodes_group = 'vector_rag'
# Lists each prompt-type column once; appending on every pass used to repeat the
# same four names for every top-k value.
prompt_type_ls = []
for similarity_top_k in [3,4,5,6]:
    top_k = f'top_{similarity_top_k}'
    vec_record = {'method': nodes_group, 'top_k': top_k}
    for cot in [False, True]:
        for few_shot in [False, True]:
            prompt_type = ('' if cot else 'no_') + 'cot_' + \
                (('and_' if cot else 'but_') if few_shot else ('but_' if cot else 'and_') + 'no_') + 'few_shot'
            if prompt_type not in prompt_type_ls:
                prompt_type_ls.append(prompt_type)
            vec_record[prompt_type] = query_engine_summary_dc[top_k][prompt_type][subject]['score']
    all_summary_records.append(vec_record)

In [None]:
import numpy as np
# One row per record, plus the row-wise mean and max over the prompt-type score
# columns; the frame is the cell output.
df_all_summary = pd.DataFrame(all_summary_records)
prompt_scores = df_all_summary[prompt_type_ls]
df_all_summary['Avg'] = prompt_scores.mean(axis=1)
df_all_summary['Max'] = prompt_scores.max(axis=1)
df_all_summary

### 5.3 Systematic Comparison

In [None]:
# Maps each book title to a subject label.
# NOTE(review): most values are the capitalised group names used elsewhere in
# this notebook ('Finance', 'Accounting', 'Economy'), but 'accounting' and
# 'auditing' are lower-case — confirm whether both should be 'Accounting'.
book_subject_mapping = {
    '金融学': 'Finance',
    '投资学': 'Finance',
    '货币金融学': 'Finance',
    '公司理财': 'Finance',
    'CPA战略': 'Finance',
    'CPA会计': 'accounting',
    'CPA审计': 'auditing',
    'CPA税法': 'Accounting',
    'CPA财务成本管理': 'Accounting',
    'CPA经济法': 'Economy',
    '宏观经济学': 'Economy',
    '微观经济学': 'Economy',
    '计量经济学': 'Economy',
}
# The keys are non-ASCII and ensure_ascii=False writes them verbatim, so the
# file encoding is set explicitly instead of depending on the platform default.
with open('book_subject_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(book_subject_mapping, f, ensure_ascii=False, indent=2)

In [None]:
# Subjects left out of the systematic comparison.
excluded_subjects = [
    'banking_practitioner_qualification_certificate',
    'fund_qualification_certificate',
    'futures_practitioner_qualification_certificate',
    'securities_practitioner_qualification_certificate',
    'statistics',
    'financial_engineering',
    'investments',
    'monetary_finance',
]

with open('subject_mapping.json', 'r') as f:
    subject_mapping = json.load(f)

# Keep the remaining subjects in the order they appear in the mapping file.
subjects = [name for name in subject_mapping if name not in excluded_subjects]

In [None]:
# Reset the record list and both summary stores before the systematic comparison.
all_summary_records = []
benchmark_summary_dc = {}
query_engine_summary_dc = {}

In [None]:
from openai_evaluator import OpenAI_Evaluator

# Benchmark run: evaluate with OpenAI_Evaluator under every combination of
# the cot / few_shot flags and keep each returned summary keyed by its
# prompt-variant name.
nodes_group = 'benchmark'
for cot in (False, True):
    for few_shot in (False, True):
        # fineval receives the run configuration through `args`
        args['cot'] = cot
        args['few_shot'] = few_shot
        args['nodes_group'] = nodes_group
        evaluator = OpenAI_Evaluator(
            choices=choices,
            k=args['ntrain'],
            model_name=args['model_name'],
        )
        # The file path is determined by whether cot / few-shot is used in the
        # prompt: 'and_' joins flags that agree, 'but_' joins flags that differ.
        connector = 'and_' if cot == few_shot else 'but_'
        prompt_type = f"{'' if cot else 'no_'}cot_{connector}{'' if few_shot else 'no_'}few_shot"
        take = f'{nodes_group}/{prompt_type}'
        benchmark_summary = fineval(
            args=args,
            evaluator=evaluator,
            take=take,
            subject=subjects,
        )
        benchmark_summary_dc[prompt_type] = benchmark_summary

In [None]:
# Benchmark results: reload the per-prompt summaries from disk when none are
# in memory, then append one record per subject group to all_summary_records.

# Set explicitly so this cell does not depend on which cell assigned
# `nodes_group` last (the reload path is meant for when evaluation was skipped).
nodes_group = 'benchmark'

# The four prompt variants, ordered by (cot, few_shot) = FF, FT, TF, TT.
benchmark_prompt_types = []
for cot in [False, True]:
    for few_shot in [False, True]:
        # 'and_' when both flags agree, 'but_' when they differ
        connector = 'and_' if cot == few_shot else 'but_'
        benchmark_prompt_types.append(
            ('' if cot else 'no_') + 'cot_' + connector + ('' if few_shot else 'no_') + 'few_shot'
        )

if not benchmark_summary_dc:
    for prompt_type in benchmark_prompt_types:
        # The file path is determined by whether cot / few-shot is used in the prompt
        take = f'{nodes_group}/' + prompt_type
        summary_path = os.path.join(args['output_dir'], take) + f'/{nodes_group}_summary.json'
        with open(summary_path) as f:
            benchmark_summary = json.load(f)
        benchmark_summary_dc[prompt_type] = benchmark_summary

subject_group_ls = ['Finance', 'Accounting', 'Economy', 'Certificate']
# NOTE(review): the *_certificate subjects are filtered out of `subjects`
# earlier in this section; confirm the summaries still contain a
# 'Certificate' entry under 'grouped', otherwise this lookup raises KeyError.
for subject_group in subject_group_ls:
    # A fresh, labelled record per group. Reusing one record across groups
    # would overwrite the scores and keep only the last group.
    benchmark_summary_record = {'method': nodes_group, 'subject_group': subject_group}
    for prompt_type in benchmark_prompt_types:
        benchmark_summary_record[prompt_type] = \
            benchmark_summary_dc[prompt_type]['grouped'][subject_group]['score']
    all_summary_records.append(benchmark_summary_record)

In [None]:
# Summary prompt text (Chinese): a header line announcing reference
# information gathered from different sources, the retrieved context between
# two dashed rules, then the question. `{context_str}` and `{query_str}` are
# left as placeholders.
tree_summary_template = (
    "从不同来源获取的参考信息如下:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "题目:{query_str}"
)

# The template wrapped as a single user-role chat message.
prompt_dict = {
    'summary_template': [
        {'role': 'user', 'content': tree_summary_template},
    ],
}

In [None]:
# Build the query engines from the graph and vector index folders stored one
# directory above the notebook, for the 'all' nodes group.
query_engine_dc = get_all_query_engine_from_cache_index(
    kg_index_folder_path='../storage/storage_graph',
    vector_index_folder_path='../storage/storage_vector',
    nodes_group='all',
)

In [None]:
from query_engine_evaluator import QueryEngineEvaluator

# Evaluate every query engine under each combination of the cot / few_shot
# flags. Each summary is stored at
# query_engine_summary_dc[query_engine_type][nodes_group][prompt_type].
nodes_group = 'all'
retrieve_mode = 'retrieve_with_choices'
for query_engine_type in query_engine_dc:
    engine_summaries = query_engine_summary_dc.setdefault(query_engine_type, {})
    for cot in (False, True):
        for few_shot in (False, True):
            # 'and_' joins flags that agree, 'but_' joins flags that differ
            connector = 'and_' if cot == few_shot else 'but_'
            prompt_type = f"{'' if cot else 'no_'}cot_{connector}{'' if few_shot else 'no_'}few_shot"
            # Create the nested slots (with an empty placeholder) before the
            # evaluation runs, so they exist even if it raises.
            group_summaries = engine_summaries.setdefault(nodes_group, {})
            group_summaries.setdefault(prompt_type, {})
            # fineval receives the run configuration through `args`
            args['nodes_group'] = nodes_group
            args['cot'] = cot
            args['few_shot'] = few_shot
            query_engine = query_engine_dc[query_engine_type][nodes_group]
            retrieve_choice = (retrieve_mode == 'retrieve_with_choices')
            evaluator = QueryEngineEvaluator(
                query_engine=query_engine,
                prompt_dict=prompt_dict,
                retrieve_choice=retrieve_choice,
                choices=choices,
                k=args['ntrain'],
                model_name=args['model_name'],
            )

            take = f'{query_engine_type}/{prompt_type}/{nodes_group}'
            summary = fineval(args=args, evaluator=evaluator, take=take, subject=subjects)
            group_summaries[prompt_type] = summary

In [None]:
# Query-engine results: reload the per-engine summaries from disk when none
# are in memory, then append one record per (query engine, subject group) to
# all_summary_records.

# Set explicitly so this cell does not depend on which cell assigned
# `nodes_group` last (the reload path is meant for when evaluation was skipped).
nodes_group = 'all'

# The four prompt variants, ordered by (cot, few_shot) = FF, FT, TF, TT.
# Built once so the list carries no duplicates.
prompt_type_ls = []
for cot in [False, True]:
    for few_shot in [False, True]:
        # 'and_' when both flags agree, 'but_' when they differ
        connector = 'and_' if cot == few_shot else 'but_'
        prompt_type_ls.append(
            ('' if cot else 'no_') + 'cot_' + connector + ('' if few_shot else 'no_') + 'few_shot'
        )

if not query_engine_summary_dc:
    for query_engine_type in query_engine_dc.keys():
        for prompt_type in prompt_type_ls:
            take = f"{query_engine_type}/{prompt_type}/{nodes_group}"
            with open(f"{args['output_dir']}/{take}/{nodes_group}_summary.json") as f:
                summary = json.load(f)
            # Create the nested levels on demand and store the loaded summary
            query_engine_summary_dc \
                .setdefault(query_engine_type, {}) \
                .setdefault(nodes_group, {})[prompt_type] = summary

subject_group_ls = ['Finance', 'Accounting', 'Economy', 'Certificate']
# NOTE(review): the *_certificate subjects are filtered out of `subjects`
# earlier in this section; confirm the summaries still contain a
# 'Certificate' entry under 'grouped', otherwise this lookup raises KeyError.
for query_engine_type in query_engine_dc.keys():
    for subject_group in subject_group_ls:
        # A fresh record per (engine, group). Reusing one record per engine
        # would overwrite the scores and keep only the last group.
        query_engine_summary_record = {
            'method': query_engine_type,
            'subject_group': subject_group,
        }
        for prompt_type in prompt_type_ls:
            query_engine_summary_record[prompt_type] = \
                query_engine_summary_dc[query_engine_type][nodes_group][prompt_type]['grouped'][subject_group]['score']
        all_summary_records.append(query_engine_summary_record)