# Paper Savior 2

Auto-explorative research with RAG critics cluster

In [1]:
# %pip install lionagi llama-index unstructured pypdf wikipedia google-search 'unstructured[pdf]'

In [2]:
# if you would like to ignore logging

# import logging
# logging.getLogger().setLevel(logging.ERROR)

In [3]:
# Research topic; also passed to the arXiv loader as the search query.
topic = "Large Language Model Applications in Blockchain"
# Research question text (not referenced by any other visible cell).
question = "Research on building a system of trust integrating Large Language Model with blockchain"
# Upper bound on the number of arXiv results requested from the loader.
num_papers = 20

# 1. Setup

#### a. ArXiv

In [4]:
# download paper and build index
from llama_index import download_loader

# Fetch the ArxivReader loader class and create an instance of it.
ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()

# Search arXiv for `topic` (at most `num_papers` results); the call returns the
# paper documents and, separately, their abstracts.
documents, abstracts = loader.load_papers_and_abstracts(search_query=topic, max_results=num_papers)

In [6]:
from llama_index import  VectorStoreIndex, ServiceContext
from llama_index.text_splitter import SentenceSplitter
from llama_index.llms import OpenAI

# Splitter configured with chunk_size=512 and chunk_overlap=10.
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)
llm = OpenAI(model='gpt-4-turbo-preview', temperature=0.1)

# Bundle the splitter and the LLM; passed to the index build below.
service_context = ServiceContext.from_defaults(
    text_splitter=text_splitter, llm=llm)

# Build a vector index over the downloaded arXiv documents.
arxiv_index = VectorStoreIndex.from_documents(
    documents, service_context=service_context)

# Query engine over the arXiv index (similarity_top_k=3, "tree_summarize" response mode).
arxiv_engine = arxiv_index.as_query_engine(
    similarity_top_k=3, response_mode= "tree_summarize")

# Persist the index so it can be reloaded from this directory later.
arxiv_index.storage_context.persist('.storage/arxiv/')

build from storage

In [10]:
# # you can build from storage like this, just need to find the index_id in the index_store file
# from llama_index import ServiceContext, StorageContext, load_index_from_storage
# from llama_index.llms import OpenAI

# llm = OpenAI(model='gpt-4-turbo-preview', temperature=0.1)
# service_context = ServiceContext.from_defaults(llm=llm)

# storage_context = StorageContext.from_defaults(persist_dir='.storage/arvix1/')
# index_id = '789e33ff-f14c-4d36-a7eb-827896b2bd3b'

# arxiv_index = load_index_from_storage(
#     storage_context=storage_context, index_id=index_id, service_context=service_context)

# arxiv_engine = arxiv_index.as_query_engine(similarity_top_k=3, response_mode= "tree_summarize")

#### b. Textbooks

In [14]:
book1 = "d2l-en.pdf"
book2 = "Blockchain basic.pdf"

In [16]:
from llama_index import download_loader, Document

UnstructuredReader = download_loader("UnstructuredReader")
loader = UnstructuredReader()

# Parse both textbooks with the UnstructuredReader loader.
documents1 = loader.load_data(book1)
documents2 = loader.load_data(book2)

# Collapse everything the loader returned for a book into one Document by
# concatenating the pieces' text (no separator is inserted between pieces).
documents1 = [Document(text="".join(piece.text for piece in documents1))]
documents2 = [Document(text="".join(piece.text for piece in documents2))]

[nltk_data] Downloading package punkt to /Users/lion/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lion/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


##### Customized triplet extraction function using Transformers

Build a triplet extraction function using `Transformers`

- [Transformers Doc](https://huggingface.co/docs/transformers/index)
- [Build KG with Wikipedia filtering](https://docs.llamaindex.ai/en/stable/examples/index_structs/knowledge_graph/knowledge_graph2.html)
- [Make Meaningful KG from Open Source REBEL model](https://medium.com/@haiyangli_38602/make-meaningful-knowledge-graph-from-opensource-rebel-model-6f9729a55527)

In [17]:
from transformers import pipeline

import torch

# Pick the best available accelerator instead of hard-coding Apple's MPS backend,
# so the same cell runs on Apple silicon, on CUDA machines and on CPU-only hosts.
# `getattr` guards against torch builds that do not ship the MPS backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    rebel_device = "mps:0"
elif torch.cuda.is_available():
    rebel_device = "cuda:0"
else:
    rebel_device = "cpu"

# REBEL text2text pipeline; its generated text is parsed into triplets by extract_triplets.
triplet_extractor = pipeline(
    "text2text-generation",
    model="Babelscape/rebel-large",
    tokenizer="Babelscape/rebel-large",
    device=rebel_device,
)

def extract_triplets(input_text):
    """Run the REBEL pipeline on ``input_text`` and parse its output into triplets.

    The pipeline is asked for token ids rather than text; those ids are decoded
    with the pipeline's tokenizer so the ``<triplet>``, ``<subj>`` and ``<obj>``
    markers are present in the string that is parsed below.

    Args:
        input_text: Text handed to ``triplet_extractor``.

    Returns:
        list[tuple[str, str, str]]: ``(subject, relation, object)`` tuples, each
        element stripped of surrounding whitespace.
    """
    # Decode the first generated sequence back to a string.
    text = triplet_extractor.tokenizer.batch_decode(
        [
            triplet_extractor(
                input_text, return_tensors=True, return_text=False
            )[0]["generated_token_ids"]
        ]
    )[0]

    triplets = []
    relation, subject, relation, object_ = "", "", "", ""
    text = text.strip()
    # `current` records which field plain tokens are appended to:
    # "t" -> subject, "s" -> object_, "o" -> relation, "x" -> nothing yet.
    current = "x"
    for token in (
        text.replace("<s>", "")
        .replace("<pad>", "")
        .replace("</s>", "")
        .split()
    ):
        if token == "<triplet>":
            # A new triplet starts: emit the pending one (if it has a relation)
            # and reset the subject.
            current = "t"
            if relation != "":
                triplets.append(
                    (subject.strip(), relation.strip(), object_.strip())
                )
                relation = ""
            subject = ""
        elif token == "<subj>":
            # Emit the pending triplet (if it has a relation), keep the current
            # subject and start collecting a new object.
            current = "s"
            if relation != "":
                triplets.append(
                    (subject.strip(), relation.strip(), object_.strip())
                )
            object_ = ""
        elif token == "<obj>":
            # Tokens after this marker are accumulated into the relation.
            current = "o"
            relation = ""
        else:
            if current == "t":
                subject += " " + token
            elif current == "s":
                object_ += " " + token
            elif current == "o":
                relation += " " + token

    # Emit the last triplet only when all three parts are non-empty.
    if subject != "" and relation != "" and object_ != "":
        triplets.append((subject.strip(), relation.strip(), object_.strip()))

    return triplets

import wikipedia

class WikiFilter:
    """Resolve candidate entity names to Wikipedia page titles, with a cache.

    Successful lookups are cached under both the queried name and the resolved
    page title, so repeated entities do not trigger another network call.
    Failed lookups are not cached.
    """

    def __init__(self):
        # name (queried or resolved title) -> {"title", "url", "summary"}
        self.cache = {}

    def filter(self, candidate_entity):
        """Return the Wikipedia page title for ``candidate_entity``, or ``None``.

        Args:
            candidate_entity: Entity name to look up (exact title, no auto-suggest).

        Returns:
            The page title when the page could be loaded, otherwise ``None``.
        """
        # check the cache to avoid network calls
        if candidate_entity in self.cache:
            return self.cache[candidate_entity]["title"]

        # pull the page from wikipedia -- if it exists
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
            }
        except Exception:
            # Any lookup failure means "not linkable". Unlike a bare `except:`,
            # this lets KeyboardInterrupt/SystemExit through so a long-running
            # knowledge-graph build can still be interrupted.
            return None

        # cache the page title and original entity
        self.cache[candidate_entity] = entity_data
        self.cache[page.title] = entity_data

        return entity_data["title"]

wiki_filter = WikiFilter()

def extract_triplets_wiki(text):
    """Extract triplets from ``text`` and keep those with a Wikipedia-linked entity.

    Each triplet produced by ``extract_triplets`` has its subject and object
    looked up through ``wiki_filter``. A triplet is dropped only when neither
    entity resolves; a resolved entity is replaced by its Wikipedia title, an
    unresolved one keeps its original text.

    Returns:
        list[tuple]: ``(subject, relation, object)`` tuples that passed the filter.
    """
    linked = []
    for subj, rel, obj in extract_triplets(text):
        wiki_subj = wiki_filter.filter(subj)
        wiki_obj = wiki_filter.filter(obj)

        # keep the triplet as long as at least one side resolves on Wikipedia
        if wiki_subj is not None or wiki_obj is not None:
            linked.append((wiki_subj or subj, rel, wiki_obj or obj))

    return linked

##### Build KG

This will take quite a while. I suggest leaving the notebook running in the background for a couple of hours, or switching to a shorter source document.

In [18]:
from llama_index import ServiceContext, KnowledgeGraphIndex
from llama_index.graph_stores import SimpleGraphStore
from llama_index.storage.storage_context import StorageContext
from llama_index.llms import OpenAI

# Service context for the knowledge-graph builds (chunk_size=256).
# Note: this rebinds `llm` and `service_context` from the arXiv section.
llm = OpenAI(temperature=0.1, model="gpt-4-turbo-preview")
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=256)

# Storage context backed by a SimpleGraphStore; both knowledge-graph builds
# below are given this same storage_context.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [None]:
# Build the knowledge-graph index for the first textbook (documents1), using the
# REBEL + Wikipedia triplet extractor instead of the default extraction.
d2l_index = KnowledgeGraphIndex.from_documents(
    documents1,
    max_triplets_per_chunk=3,
    kg_triplet_extract_fn=extract_triplets_wiki,
    storage_context=storage_context,
    service_context=service_context,
    include_embeddings=True,
)

# The engine must query the textbook index itself; it used to be created from
# arxiv_index, so "d2l" queries were silently answered from the arXiv papers.
d2l_engine = d2l_index.as_query_engine(similarity_top_k=3, response_mode= "tree_summarize")
d2l_index.storage_context.persist('.storage/d2l/')

In [19]:
# Build the knowledge-graph index for the blockchain textbook (documents2).
# NOTE(review): this reuses the same `storage_context` as the d2l build, so both
# indexes presumably end up in one graph store -- confirm this is intended.
bc_index = KnowledgeGraphIndex.from_documents(
    documents2,
    max_triplets_per_chunk=3,
    kg_triplet_extract_fn=extract_triplets_wiki,
    storage_context=storage_context,
    service_context=service_context,
    include_embeddings=True,
)

# Expose the blockchain index under its own engine name. The original line
# rebuilt `d2l_engine` from arxiv_index, leaving `bc_engine` (used by query_bc)
# undefined unless the index was reloaded from storage.
bc_engine = bc_index.as_query_engine(similarity_top_k=3, response_mode= "tree_summarize")
bc_index.storage_context.persist('.storage/bc/')



  lis = BeautifulSoup(html).find_all('li')


##### From Storage

In [None]:
# from llama_index import load_index_from_storage
# from llama_index.storage.storage_context import StorageContext
# from llama_index.graph_stores import SimpleGraphStore

# storage_context = StorageContext.from_defaults(
#     graph_store=SimpleGraphStore(), persist_dir= '.storage/d2l/')

# # <change to your own index id, can find it in index store>
# index_id = 'cac9b853-6b30-4fc3-9a18-8592901df717' 

# d2l_index = load_index_from_storage(storage_context=storage_context, index_id=index_id)
# d2l_engine = d2l_index.as_query_engine(similarity_top_k=3, response_mode= "tree_summarize")

In [1]:
from llama_index import load_index_from_storage
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import SimpleGraphStore

# Storage context pointing at the directory the blockchain index was persisted to.
storage_context = StorageContext.from_defaults(
    graph_store=SimpleGraphStore(), persist_dir= '.storage/bc/')

# <change to your own index id, can find it in index store>
index_id = 'bcd41f43-7af0-4001-a3c0-c1a753cdeaac' 

# Reload the index by id and build the query engine used by query_bc.
# No service_context is passed here, so library defaults apply.
bc_index = load_index_from_storage(storage_context=storage_context, index_id=index_id,)
bc_engine = bc_index.as_query_engine(similarity_top_k=3, response_mode= "tree_summarize")

#### c. google and wikipedia

In [2]:
import os

google_key_scheme = 'GOOGLE_API_KEY'
google_engine_scheme = 'GOOGLE_CSE_ID'

In [3]:
def create_google_engine(
    google_api_key=os.getenv(google_key_scheme),
    google_engine=os.getenv(google_engine_scheme),
    verbose=False
):
    """Create an OpenAI agent that can search Google through a load-and-search tool.

    Args:
        google_api_key: Google API key; defaults to the value the environment
            variable named by ``google_key_scheme`` had when this function was defined.
        google_engine: Custom search engine id; defaults likewise from
            ``google_engine_scheme``.
        verbose: Forwarded to ``OpenAIAgent.from_tools``.

    Returns:
        The agent built from the wrapped Google search tool.

    Raises:
        ImportError: If one of the required llama_index / llama_hub modules
            cannot be imported. Other failures (e.g. while building the tool
            spec or the agent) propagate unchanged instead of being mislabelled
            as import errors.
    """
    # Only the imports are guarded: the error message below is about importing,
    # so it must not be raised for unrelated runtime failures.
    try:
        from llama_index.agent import OpenAIAgent
        from llama_index.tools.tool_spec.load_and_search.base import LoadAndSearchToolSpec
        from llama_hub.tools.google_search.base import GoogleSearchToolSpec
    except ImportError as e:
        raise ImportError(f"Error in importing OpenAIAgent from llama_index: {e}") from e

    google_spec = GoogleSearchToolSpec(key=google_api_key, engine=google_engine)

    # Wrap the google search tool as it returns large payloads
    tools = LoadAndSearchToolSpec.from_defaults(
        google_spec.to_tool_list()[0],
    ).to_tool_list()

    # Create the Agent with our tools
    return OpenAIAgent.from_tools(tools, verbose=verbose)

In [4]:
def wiki_query(query: str, lang: str = 'en'):
    """Answer ``query`` from the top Wikipedia search result.

    The first search hit is loaded, indexed in a throw-away vector index and
    queried with the same ``query`` text.

    Args:
        query: Natural-language query; used both for the Wikipedia search and
            for querying the page content.
        lang: Wikipedia language code passed to ``wikipedia.set_lang``.

    Returns:
        str: The query engine's answer, ``"No search results."`` when the search
        returns nothing, or ``"Unable to load page <title>."`` when the page
        is missing or is a disambiguation page.
    """
    import wikipedia
    from llama_index import Document, VectorStoreIndex

    wikipedia.set_lang(lang)

    res = wikipedia.search(query, results=1)
    if len(res) == 0:
        return "No search results."
    try:
        wikipedia_page = wikipedia.page(res[0], auto_suggest=False)
    except (wikipedia.PageError, wikipedia.DisambiguationError):
        # An ambiguous title is as unusable as a missing page; report it the
        # same way instead of letting the exception escape into the tool call.
        return f"Unable to load page {res[0]}."
    content = wikipedia_page.content

    # Index just this page and let the default query engine answer the question.
    documents = [Document(text=content)]
    index = VectorStoreIndex.from_documents(documents)
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    return response.response

# 2. Tools

In [None]:
responses_arxiv = []
responses_d2l = []
responses_bc = []

def query_arxiv(query: str):
    """
    Query a vector index built with papers from arxiv. It takes 
    natural language query, and give natural language response. 

    Args:
        query (str): The natural language query to get an answer from the index

    Returns:
        str: The query response from index
    """
    # NOTE(review): the docstring above is presumably what li.func_to_tool turns
    # into the tool description -- treat edits to it as behavior changes; confirm.
    response = arxiv_engine.query(query)
    # Keep the full response object so each query can be traced back after the run.
    responses_arxiv.append(response)
    
    return str(response.response)

def query_d2l(query: str):
    """
    Query a index built from machine learning textbooks. It takes 
    natural language query, and give natural language response. 

    Args:
        query (str): The natural language query to get an answer from the index

    Returns:
        str: The query response from index
    """
    # NOTE(review): the docstring above is presumably what li.func_to_tool turns
    # into the tool description -- treat edits to it as behavior changes; confirm.
    # NOTE(review): `d2l_engine` is only bound by the KG build cells or the
    # commented-out "From Storage" cell; verify it exists on a fresh kernel.
    response = d2l_engine.query(query)
    # Keep the full response object so each query can be traced back after the run.
    responses_d2l.append(response)
    return str(response.response)
        
def query_bc(query: str):
    """
    Query a index built from blockchain textbooks. It takes 
    natural language query, and give natural language response. 

    Args:
        query (str): The natural language query to get an answer from the index

    Returns:
        str: The query response from index
    """
    # NOTE(review): the docstring above is presumably what li.func_to_tool turns
    # into the tool description -- treat edits to it as behavior changes; confirm.
    response = bc_engine.query(query)
    # Keep the full response object so each query can be traced back after the run.
    responses_bc.append(response)
    return str(response.response)

In [14]:
responses_google = []
responses_wiki = []

# ask gpt to write you google format docstring
def query_google(query: str):
    """
    Search Google and retrieve a natural language answer to a given query.

    Args:
        query (str): The search query to find an answer for.

    Returns:
        str: A natural language answer obtained from Google search results.

    Raises:
        Exception: If there is an issue with making the request or parsing the response.
    """
    # A new agent is built on every call via create_google_engine().
    google_agent = create_google_engine()
    response = google_agent.chat(query)
    # Keep the agent's response object for later inspection.
    responses_google.append(response)
    return str(response)

def query_wiki(query: str):
    """
    Search Wikipedia and retrieve a natural language answer to a given query.

    Args:
        query (str): The search query to find an answer for.

    Returns:
        str: A natural language answer obtained from the top Wikipedia search result.

    Raises:
        Exception: If there is an issue with making the request or parsing the response.
    """
    # The docstring previously said the answer came from "Google search results",
    # which misdescribes this tool wherever the docstring is shown.
    response = wiki_query(query)
    # Keep every answer so the tool's usage can be reviewed after the run.
    responses_wiki.append(response)
    return str(response)

Make them into LionAGI tool objects

In [16]:
import lionagi as li

# Order matters: critic_configs later picks each critic's tool by its position
# in this list (tools[0] .. tools[4]).
funcs = [query_google, query_wiki, query_arxiv, query_d2l, query_bc]
# Apply li.func_to_tool to every function to obtain the LionAGI tool objects.
tools = li.lcall(funcs, li.func_to_tool)

# 3. Prompts

#### a. PROMPTS - Researcher

In [None]:
# Deliverable formats for the four researcher steps. Each format extends the
# previous one via ** unpacking, so later steps carry all earlier fields.

# Prompt 1: Abstract Summary
json_format1 = {
    "summary": "Brief summary of paper abstract.",
    "core points": "Key points from the paper.",
    "relevance to research question": 
        "Explanation of paper's relevance to a specific research question."
}

# Prompt 2: Reflections and Evaluation
json_format2 = {
    **json_format1,
    "reflections": "Reflections on the feedback received and improvements made."
}

# Prompt 3: Brainstorming for Research Question
json_format3 = {
    **json_format2,
    "Paper": "Name of the paper.",
    "Authors": "List of authors.",
    "key points for further investigation": "Points to explore further.",
    "reasoning": "Reasoning behind the selection of these points."
}

# Prompt 4: Final Deliverable Presentation
json_format4 = {
    **json_format3,
    "next steps": "Proposed next steps based on the research."
}

In [17]:
# Researcher System Configuration
researcher_system = {
    "persona": "World-class researcher",
    "requirements": "Clear, precise answers with a confident tone.",
    "responsibilities": "Researching a specific topic and question.",
    "notice": "Collaboration with critics for alignment with project goals."
}

# Instruction Set for Researcher

def _researcher_instruction(step, name, objective, output_format):
    """Assemble one researcher instruction with a mandatory deliverable in ``output_format``."""
    return {
        "task step": step,
        "task name": name,
        "task objective": objective,
        "deliverable": {
            "delivery required": "yes",
            "format": output_format,
        },
    }


# Step 1: first pass over the paper abstract.
researcher_instruct1 = _researcher_instruction(
    "1", "Read Paper Abstracts", "Initial understanding of papers.", json_format1
)

# Step 2: incorporate the critics' feedback.
researcher_instruct2 = _researcher_instruction(
    "2", "Reflect on Feedback", "Improve understanding and relevance.", json_format2
)

# Step 3: generate ideas that help with the research question.
researcher_instruct3 = _researcher_instruction(
    "3",
    "Brainstorm for Research Question",
    "Ideas for research question assistance.",
    json_format3,
)

# Step 4: present the final deliverable.
researcher_instruct4 = _researcher_instruction(
    "4", "Final Deliverable Presentation", "Present final research output.", json_format4
)


#### b. PROMPTS - critic cluster

In [18]:
def get_critic_system(name_):
    """Build the system-message dict for the critic attached to resource ``name_``.

    Args:
        name_: Short resource name; it is interpolated into the critic's name,
            resource label, requirements and tool notice.

    Returns:
        dict: Keys "name", "resource", "persona", "responsibilities",
        "requirements" and "tools" (the latter holding the tool notice).
    """
    # Usage note for the critic's query tool, including the per-task query limit.
    tool_manual = {
        "notice": f"""
            Use the QA bot for each task. It queries an index of {name_} 
            and allows up to 3 queries per task.
        """
    }
    system = {
        "name": f"Critic_{name_}",
        "resource": f"Resource: {name_}",
        "persona": "World-class researcher",
        "responsibilities": """
            Check the quality of plans and code. Give feedback.
        """,
        "requirements": f"""
            As Critic_{name_}, verify the accuracy of plans and claims.
        """,
        "tools": tool_manual
    }
    return system

def get_critic_stage1(name_):
    """Return the step-1 ("Verify Claim") instruction for the critic using the ``name_`` index.

    Args:
        name_: Resource name interpolated into the task description.

    Returns:
        dict: Keys "task step" (always "1"), "task name", "description", "deliverable".
    """
    return {
        "task step": "1",
        "task name": "Verify Claim",
        "description": f"""
            Research on a specific topic using the {name_} index. Verify claims and report findings.
        """,
        "deliverable": """
            Provide a detailed report with your findings. Include references and your name.
        """
    }

def get_critic_stage2(step_num):
    """Return the peer-review instruction, labelled with ``step_num``.

    The description text is the same for every ``step_num``; only the
    "task step" value changes.

    Args:
        step_num: Step number; stored as a string under "task step".

    Returns:
        dict: Keys "task step", "task name", "description", "deliverable".
    """
    return {
        "task step": str(step_num),
        "task name": "Peer-Review for Improvement",
        "description": """
            Review and argue against findings from steps 2, 3, and 4. Use facts and sources.
        """,
        "deliverable": """
            Provide detailed reviews for each critic. Include references and your name.
        """
    }

def get_critic_stage3(step_num):
    """Return the final-review instruction, labelled with ``step_num``.

    Args:
        step_num: Step number; stored as a string under "task step".

    Returns:
        dict: Keys "task step", "task name", "description", "deliverable".
    """
    return {
        "task step": str(step_num),
        "task name": "Finalize and Review",
        "description": """
            Write your final peer review. Consider all feedback and research relevance.
        """,
        "deliverable": """
            Present a comprehensive report of 1200+ words. Include your findings and name.
        """
    }

def get_critic_config(name_, tools):
    """Collect everything one critic branch needs: name, system message, steps, tools.

    Args:
        name_: Resource name of the critic (e.g. the index it queries).
        tools: Tool object(s) stored unchanged under the "tools" key.

    Returns:
        dict: "name", "system", "step1" .. "step5" and "tools", in that order.
    """
    config = {
        "name": f"critic_{name_}",
        "system": get_critic_system(name_),
        "step1": get_critic_stage1(name_),
    }
    # steps 2-4 all use the peer-review template, differing only in step number
    for step_num in (2, 3, 4):
        config[f"step{step_num}"] = get_critic_stage2(step_num)
    config["step5"] = get_critic_stage3(5)
    config["tools"] = tools
    return config

# One config per critic, keyed "critic_<source>". `tools` is positionally aligned
# with funcs: [query_google, query_wiki, query_arxiv, query_d2l, query_bc]
critic_configs = {
    f"critic_{source}": get_critic_config(source, tools[position])
    for position, source in enumerate(("google", "wiki", "arxiv", "d2l", "bc"))
}

# 4. Drafting Workflow

#### Set up session and branches

In [19]:
# each critic will be in a branch of the session
# and they each have access to one corresponding query engine

tool_names = ['google', 'wiki', 'arxiv', 'd2l', 'bc']
critic_names = [f"critic_{t_}" for t_ in tool_names]

In [20]:
# create a researcher session
# researcher will be in the default branch called 'main'
researcher = li.Session(system=researcher_system)       # system message for the researcher
                                                        
# create a new branch for each critic
# (`idx` from enumerate is not used inside the loop)
for idx, name_ in enumerate(critic_names):
    researcher.new_branch(
        branch_name=li.nget(critic_configs, [name_, "name"]),       # name of the branch
        tools=li.nget(critic_configs, [name_, "tools"]),            # tools to use for the branch
        system=li.nget(critic_configs, [name_, "system"])           # branch system message for the critic
    )

#### Compose Critic cluster workflow

In [21]:
import pandas as pd

async def critic_workflow(context):
    """Run the five-step critic cluster on ``context`` and return the group chat.

    Every critic branch works on step 1 with ``context``; after each step the
    critics' assistant messages are shared with all critics and with a temporary
    'critic_chat' branch. Steps 2-5 run without extra context, since each branch
    keeps its own message history.

    Args:
        context: Context given to every critic for step 1.

    Returns:
        A copy of the 'critic_chat' branch messages collected over all steps.
    """
    researcher.new_branch('critic_chat')

    async def _run_step(step_num, step_context=None):
        """Have every critic work on its ``step<step_num>`` instruction concurrently."""

        async def _run_critic(name_):
            config_ = {
                'instruction': li.nget(critic_configs, [f"{name_}", f"step{step_num}"]),
                'num': 5, 'to_': name_, 'temperature': 0.3, 'tools': True,
            }
            if step_context is not None:
                config_['context'] = step_context
            try:
                return await researcher.auto_followup(**config_)
            except Exception:
                # Best effort: one failing critic must not abort the others.
                # `Exception` (not a bare except) lets task cancellation and
                # KeyboardInterrupt stop the workflow.
                return 'somehow failed'

        # run the step for all critics asynchronously
        await li.alcall(critic_names, _run_critic)

    def update_group_chat():
        """Share each critic's assistant messages with all critics and the group chat."""
        # Only finished assistant outputs are shared (no action requests/responses).
        # Each branch is filtered once instead of three times.
        frames = []
        for name_ in critic_names:
            replies = researcher.branches[name_].filter_messages_by(sender='assistant')
            if replies is not None and len(replies) > 0:
                frames.append(replies.copy())

        # pd.concat raises ValueError on an empty list (e.g. when every critic
        # failed in this step), so there is simply nothing to share.
        if not frames:
            return

        dfs = pd.concat(frames)

        researcher.branches['critic_chat'].extend(dfs)
        for name_ in critic_names:
            researcher.branches[name_].extend(dfs)

    try:
        # the first step needs the researcher's work as context
        await _run_step(1, context)
        update_group_chat()

        # the context is only needed once; each branch keeps track of its messages
        for step_num in range(2, 6):
            await _run_step(step_num)
            update_group_chat()

        # the whole group chat history is the result of the workflow
        msgs = researcher.branches['critic_chat'].messages.copy()
    finally:
        # Always remove the temporary branch: branch names must be unique within
        # a session, so a leftover 'critic_chat' would break the next run.
        researcher.delete_branch('critic_chat')

    return msgs

# 5. Run

depending on your prompts, this might take a long while as well

#### Researcher Instruction 1: read abstract

In [22]:
# we provide the researcher with the abstract of one paper as a starting point
# (index 1, i.e. the second entry of `abstracts`)
r_context1 = str(abstracts[1])

r1 = await researcher.chat(
    instruction=researcher_instruct1, 
    context=r_context1, 
    temperature=0.3
)

In [23]:
# then we provide both the abstract and researcher's response
# as context for the critic cluster 
c_context1 = {
    "paper_summary": r_context1, 
    "researcher_work": str(r1)
}

# run critic workflow, the outputs are all messages from the critic cluster group chat
reports1 = await critic_workflow(c_context1)



Branch critic_chat is deleted.


#### Researcher Instruction 2: reflect

In [24]:
# we get the last 10 valid messages from the group chat as feedback to the researcher
r_context2 = str(
    [i.content for _, i in reports1.iloc[-10:].iterrows()])

r2 = await researcher.chat(
    instruction=researcher_instruct2, 
    context=r_context2,
    temperature=0.5)

c_context2 = {"researcher_work": str(r2)}
reports2 = await critic_workflow(c_context2)



Branch critic_chat is deleted.


#### Researcher Instruction 3: brainstorm

In [25]:
r_context3 = str(
    [i.content for _, i in reports1.iloc[-10:].iterrows()])

r3 = await researcher.chat(
    instruction=researcher_instruct3, 
    context=r_context3)

c_context3 = {"researcher_work": str(r3)}
reports3 = await critic_workflow(c_context3)



Branch critic_chat is deleted.


#### Researcher Instruction 4: output

In [26]:
r_context4 = str(
    [i.content for _, i in reports1.iloc[-10:].iterrows()])

r4 = await researcher.chat(
    instruction=researcher_instruct4, 
    context=r_context4)

r4_2 = await researcher.chat(
    instruction = """
        integrate conversation, reflect on your work and present 
        final output to user, be as long as possible
    """
)

# 6. Results

#### a. Check the overall stats

In [27]:
# Collect the message frame of every branch that holds more than one message,
# dropping rows in which every column is empty.
ttl_messages = [
    i.messages.copy().dropna(how='all') for _, i in researcher.branches.items() 
    if len(i.messages) > 1
]
# Merge all branches and keep one row per (node_id, content) pair.
ttl_df = pd.concat(ttl_messages).drop_duplicates(subset=['node_id', 'content'])

In [32]:
ttl_df.to_csv('all_critics_cluster.csv')

In [33]:
# there are 396 messages unique in total across all branches
# covering all critics, researcher, system as well as user instructions
ttl_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, 0 to 106
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   node_id    396 non-null    object        
 1   role       396 non-null    object        
 2   sender     396 non-null    object        
 3   timestamp  396 non-null    datetime64[ns]
 4   content    396 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 18.6+ KB


In [34]:
# number of responses from all assistants

len(ttl_df[ttl_df.sender == 'assistant'])

76

In [36]:
# number of queries answered

len(ttl_df[ttl_df.sender == 'action_response'])

94

#### check each tool uses

In [37]:
# you can trace back the source for each single successful query

print(f"total number of arxiv queries: {len(responses_arxiv)}")
print(f"total number of d2l queries: {len(responses_d2l)}")
print(f"total number of bc queries: {len(responses_bc)}")
print(f"total number of google queries: {len(responses_google)}")
print(f"total number of wiki queries: {len(responses_wiki)}")

total number of arxiv queries: 25
total number of d2l queries: 31
total number of bc queries: 9
total number of google queries: 18
total number of wiki queries: 11


# 7. Improve outputs

if the results quality is less than expected, you can chat with the researcher to further improve

In [None]:
improvements = '''
    please be more specific in terms of technical details, use your conversation context to supplement. for example, introduce which technologies, which references should be used where and include some reasoning. you can do it
'''

r5 = await researcher.chat(improvements)

In [44]:
print(r5)

I appreciate your encouragement and the opportunity to delve into a more technically detailed research proposal. Utilizing the context of our previous conversations, let’s articulate a comprehensive plan that integrates Large Language Models (LLMs) with blockchain technology to foster a system of trust.

**Title:**
"Design and Implementation of a Trust-Enhancing Framework via LLM-Blockchain Integration"

**Abstract:**
This research investigates the technical feasibility and implementation strategies for integrating LLMs with blockchain networks to construct a robust system of trust. By harnessing the linguistic and cognitive capabilities of LLMs and the immutable record-keeping features of blockchain, we aim to develop a platform that ensures data integrity, facilitates transparent transactions, and mitigates the risks of malicious activities across blockchain systems.

**Introduction:**
Blockchain technology has laid the foundation for creating secure and decentralized digital ledgers