In [1]:
from langchain.chains import SimpleSequentialChain

In [2]:
import os
import eunomia
import pickle




In [14]:
# Pick the paper to analyse and load its PDF through eunomia.
paper_id = '1'
docs_processor = eunomia.LoadDoc('../Data/SI_PDF/', paper_id=paper_id)

# Chunking settings forwarded to `process` as keyword arguments.
chunking_options = dict(chunk_size=1000, chunk_overlap=10, chunking_type='fixed-size')

# The first positional argument is a list of section-heading strings.
# NOTE(review): presumably these mark trailing sections to cut from the text —
# confirm against the eunomia `process` implementation.
sliced_pages = docs_processor.process(
    ['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
    **chunking_options,
)

In [26]:
from langchain.chains import RetrievalQA

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

k = 5  # number of chunks the retriever hands back per query
# Embedding_model = 'text-davinci-003' # text-embedding-ada-002
Embedding_model = 'text-embedding-ada-002' # text-embedding-ada-002


faiss_index_path = f"../Data/chosen_papers/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
# Set to True to reuse a previously pickled index instead of re-embedding the paper.
just_load = False
if os.path.isfile(faiss_index_path) and just_load:
    print('loadded Embeddings')
    # NOTE: unpickling executes arbitrary code — only load index files produced by this notebook.
    with open(faiss_index_path, "rb") as f:
        faiss_index = pickle.load(f)
else:
    # Build the index BEFORE touching the cache file. Opening the file in "wb" mode first
    # truncates it, so a failed embedding request would leave an empty, unloadable pickle
    # behind at the cache path.
    faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))
    # The cache directory is keyed by embedding model and may not exist yet.
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    with open(faiss_index_path, "wb") as f:
        pickle.dump(faiss_index, f)

# faiss_index = LanceDB.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

# Maximum Marginal Relevance retrieval: `k` results are requested, with `fetch_k` set to 50.
retriever = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k": k, 'fetch_k': 50})


In [89]:
from langchain.llms import OpenAI
from langchain.agents import Tool, tool
from langchain.chat_models  import ChatOpenAI
from langchain.chains import RetrievalQA
# gpt-3.5-turbo-16k is a chat-completions model, so it is wrapped with ChatOpenAI
# (already imported in this cell) instead of the completion-style `OpenAI` class.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

# Retrieval-QA chain using the "stuff" chain type over the notebook's retriever;
# only the answer is returned (source documents are not included).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

In [90]:
# Water-stability rubric that gets interpolated into the extraction prompt.
# Every continued line ends with a space before the backslash: a backslash-newline
# joins lines with nothing in between, which previously fused sentences together
# (e.g. "3. Not provided" + "For 1. Stable," became "Not providedFor 1. Stable,").
# The string has no placeholders, so it is a plain literal rather than an f-string.
Rule = """
There are only 3 options for water stability: \
1. Stable \
2. Unstable \
3. Not provided \
For 1. Stable, \
No change after exposure, soaking or boiling in water or an aqueous solution. \
Water adsorption isotherm should have a steep uptake. \
For 2. Unstable, \
The MOF will decompose after exposure \
to humid environment or partially dissolves in water. \
For 3. Not provided, \
If you don't know or cannot justfy 1 or 2
"""

# Steep uptake behavior is from this paper: https://pubs.acs.org/doi/epdf/10.1021/ja306812r

In [91]:
# Extraction prompt passed to the retrieval-QA chain and to the agents in later cells.
# It asks for each material's full name, a water-stability label chosen according to
# `Rule` (interpolated into the text), a confidence score in [0, 1], and a verbatim
# justification quoted from the document, with the whole answer returned as JSON.
# NOTE(review): the text says "JSON format as described below", but no format
# description follows inside this string — confirm whether a schema was meant to be appended.
# NOTE(review): lines ending in a backslash are joined to the next line; the three lines
# without one keep their newline. Verify that this mixed layout is intentional.
prompt3a = f"""
    You are an expert chemist. This doc describes the water stability properties of a few materials including \
    Metal-Organic Frameworks(MOF) or coordination polymers, Coordination Networks or ZIF compounds. Please find \
    the full name for materials, and their \
    stability in water. \
    Please use the following rules to identify\
    the water stability of each MOF. \
    Your second output for each mof should be a probability score ranging between [0, 1].\
    This probability score shows how certain you are in your answer.\
    Your third output should be exact sentences without any changes from the document that justifies your decision.\
    This should be "Not provided" if you cannot find water stability.\
    Rule: {Rule}\
    If there are multiple MOFs, please list them all and put "MOF name" before them. \
    You should only respond in JSON format as described below. 
    Make sure it is parsable using Python json.load.
    Make sure "MOF name"s do not have duplicates
    If information is not provided, state it as not provided.\
    """

In [30]:
# Rebuild the retrieval-QA chain from the current `llm` and `retriever`.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# Run the extraction prompt once; the returned value is shown as the cell output.
qa.run(prompt3a)

'{\n  "MOF-11": {\n    "stability": "Unstable",\n    "probability": 1,\n    "justification": "Bio-MOF-11 dissolved rapidly in water, as evidenced by PXRD (Fig. 10); SEM images could not be obtained."\n  },\n  "MOF-12": {\n    "stability": "Unstable",\n    "probability": 1,\n    "justification": "Bio-MOF-12 dissolved partially in water. The intensity of their signature diﬀraction lines decreased substantially (Fig. 10); SEM images of the soaked samples revealed significant crystallite fragmentation and pitting of the crystal surfaces (Fig. S21 †)."\n  },\n  "MOF-13": {\n    "stability": "Unstable",\n    "probability": 1,\n    "justification": "Bio-MOF-13 dissolved partially in water. The intensity of their signature diﬀraction lines decreased substantially (Fig. 10); SEM images of the soaked samples revealed significant crystallite fragmentation and pitting of the crystal surfaces (Fig. S21 †)."\n  },\n  "MOF-14": {\n    "stability": "Stable",\n    "probability": 1,\n    "justification"

In [20]:
@tool
def read_context(input):
    '''
    use this as the primary source of context information when you are asked the input.
    Always search for the answers using this tool first, don't make up answers yourself
    '''
    # The docstring above is deliberately left untouched: langchain's @tool decorator
    # uses it as the tool description, so its wording is part of runtime behaviour.
    # A fresh LLM wrapper and retrieval-QA chain are built on every call, over the
    # notebook-level `retriever`.
    chain = RetrievalQA.from_chain_type(
        llm=OpenAI(temperature=0, model_name='gpt-3.5-turbo-16k'),
        chain_type="stuff",
        retriever=retriever,
    )
    # The chain is invoked by calling it with a {"query": ...} mapping, and whatever
    # that call returns is handed back unchanged.
    return chain({"query": input})

# NOTE(review): this tool cannot do what its docstring says as written.
#   * The function has no parameter for the justification text; its only parameter
#     is `model`, which is forwarded as the model name of the API request.
#   * `prompt` is a fixed question with nothing interpolated, so the model is never
#     shown any sentence to judge.
# Presumably a text parameter was intended and should be included in the prompt —
# confirm and fix the signature before adding this tool to a `tools` list.
# The docstring is left as-is because @tool uses it as the tool description.
@tool
def eval_justification(model="gpt-3.5-turbo"):
    '''
    This function checks if the justification talks about the water stability. 
    '''
    # Imported locally, so the dependency is only needed when the tool actually runs.
    import openai
    # f-string without placeholders: the question below is constant.
    prompt = f"""Does this sentence describe water stability?"""
    # Single-turn chat payload containing only the constant question.
    messages = [{"role": "user", "content": prompt}]
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 openai client
    # interface — confirm the installed openai version supports it.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, 
    )
    # Return the text content of the first choice.
    return response.choices[0].message["content"]

In [52]:
from langchain.agents.agent_toolkits import create_retriever_tool

# Expose the retriever directly to the agent as a tool named "Read_paper".
read_context_tool = create_retriever_tool(
    retriever=retriever,
    name="Read_paper",
    description="Use this as the primary source of context information when you are asked the question about MOFs. Always search for the answers using this tool first, don't make up answers yourself",
)

# The retriever tool is the only tool handed to the agent in this cell.
tools = [read_context_tool]


In [53]:
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
# Conversational retrieval agent built from the notebook-level `llm` and the current
# `tools` list; verbose mode prints the intermediate agent steps.
agent_executor = create_conversational_retrieval_agent(
    llm=llm,
    tools=tools,
    verbose=True,
)

In [55]:
from langchain.callbacks import get_openai_callback
# Run the conversational agent on the extraction prompt inside the OpenAI callback
# context, then print the callback object.
# NOTE(review): presumably `cb` reports token usage and cost for the calls made
# inside the block — confirm against the langchain callback documentation.
with get_openai_callback() as cb:
    agent_executor(prompt3a)
    print(cb)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Read_paper` with `MOFs`


[0m[36;1m[1;3m[Document(page_content='shown in Fig. 9. Previous calculations have demonstrated howCO\n2can act as both a Lewis acid and a Lewis base when inter-\nacting with CO 2-soluble polymers,53so it is not surprising that\nsimilar interactions could be important in MOFs. Our calcula-tions show that CO\n2is arranged so that it makes three Lewis-\nacid –Lewis-base interactions with slightly acidic protons on the\nframework, having O –H bond distances of 2.72, 2.75, and 2.85\n˚A. The binding energy of this con \ue103guration is 36 kJ mol/C01,\nwhich is in good agreement with the experimentally calculatedisosteric heats for bio-MOF-12 .\nWater stability studies\nHaving shown that increasing the length of the aliphatic group\ncan positively impact CO\n2:N2selectivity, we next explored\nFig. 8 Adsorption isotherms at 298 K of CO 2inbio-MOF-14 from grand\ncanonical Monte Carlo Simulation

In [96]:
# Wrap the retrieval-QA chain's `run` method as a single agent tool.
mof_stability_tool = Tool(
    name="MOF_stability",
    func=qa.run,
    description="Use the original prompt for this tool to get context about water stability of MOFs.",
)

# Rebinds `tools`, replacing whatever list an earlier cell assigned to that name.
tools = [mof_stability_tool]

In [97]:
# qa.run(prompt3a)

In [98]:
from llama_index import GPTListIndex, GPTIndexMemory
from langchain.agents import initialize_agent
from langchain.agents import AgentType
# index = GPTListIndex([])
# memory = GPTIndexMemory(index=GPTListIndex([]), memory_key="chat_history", query_kwargs={"response_mode": "compact"})
# Chat zero-shot ReAct agent over the current `tools` list, with no conversation
# memory and with parsing-error handling switched on.
agent_chain = initialize_agent(
    tools,
    llm,
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=None,
    handle_parsing_errors=True,
)

In [101]:
# Run the ReAct agent on the extraction prompt inside the OpenAI callback context,
# then print the callback object.
# `get_openai_callback` is imported in an earlier cell, so that cell must have been
# executed before this one.
with get_openai_callback() as cb:
    agent_chain.run(prompt3a)
    print(cb)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the MOF_stability tool to find the water stability properties of the materials mentioned in the document. I should provide the document text as input to the tool.

Action:
```
{
  "action": "MOF_stability",
  "action_input": "This doc describes the water stability properties of a few materials including Metal-Organic Frameworks(MOF) or coordination polymers, Coordination Networks or ZIF compounds. Please find the full name for materials, and their stability in water. Please use the following rules to identify the water stability of each MOF. Your second output for each mof should be a probability score ranging between [0, 1]. This probability score shows how certain you are in your answer. Your third output should be exact sentences without any changes from the document that justifies your decision. This should be 'Not provided' if you cannot find water stability. Rule: There are only 3 options for wate