In [1]:
import os
import eunomia
import pickle


In [2]:
# Identifier of the paper under analysis; later cells reuse it for the
# embedding-cache path and when parsing the agent's answer.
paper_id = '1'

# Loader bound to this paper inside ../Data/SI_PDF/.
docs_processor = eunomia.LoadDoc('../Data/SI_PDF/', paper_id=paper_id)

# Produce fixed-size chunks of the document.
# NOTE(review): the keyword list is presumably used to cut off the trailing
# references / acknowledgement sections -- confirm against eunomia's `process`.
sliced_pages = docs_processor.process(
    ['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
    chunk_size=1000,
    chunk_overlap=10,
    chunking_type='fixed-size',
)

In [3]:
from langchain.chains import RetrievalQA

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

k = 5  # number of chunks the retriever hands back per query
Embedding_model = 'text-embedding-ada-002'

# One pickled index per (embedding model, paper) pair.
faiss_index_path = f"../Data/chosen_papers/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
just_load = True  # set to False to force re-embedding even when a cached index exists
if os.path.isfile(faiss_index_path) and just_load:
    print('loadded Embeddings')
    # SECURITY NOTE: pickle.load can execute arbitrary code; only load index
    # files that this notebook wrote itself.
    with open(faiss_index_path, "rb") as f:
        faiss_index = pickle.load(f)
else:
    # Build the index BEFORE touching the cache file. Opening the target in "wb"
    # mode first would leave an empty/truncated pickle behind if the embedding
    # call fails, and the next run would then try (and fail) to load it.
    faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

    # The per-model cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)

    # Write to a temporary file and rename it into place so an interrupted dump
    # never leaves a partial file at the final cache path.
    tmp_index_path = f"{faiss_index_path}.tmp"
    with open(tmp_index_path, "wb") as f:
        pickle.dump(faiss_index, f)
    os.replace(tmp_index_path, faiss_index_path)

# Maximum Marginal Relevance retrieval: fetch 50 candidates, return the k selected ones.
retriever = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k": k, 'fetch_k': 50})


loadded Embeddings


In [4]:
from langchain.llms import OpenAI
from langchain.agents import Tool, tool
from langchain.chat_models  import ChatOpenAI
from langchain.chains import RetrievalQA
# gpt-3.5-turbo-16k is a chat model, so it is wrapped with ChatOpenAI (imported
# just above) instead of the completion-style OpenAI class, which only accepts
# chat model names through a deprecated compatibility path. This also matches
# the ChatOpenAI wrapper used for the agent LLM later in the notebook.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

# Retrieval QA chain: retrieved chunks are "stuffed" into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)



In [5]:
# Classification rules that get interpolated into the extraction prompts.
# Each item sits on its own line: the previous backslash continuations on
# unindented lines glued neighbouring sentences together (e.g.
# "3. Not providedFor 1. Stable," and "solution.Water adsorption"), which
# garbled the option list sent to the model. The string has no placeholders,
# so it is a plain (non-f) string.
Rule = """
There are only 3 options for water stability:
1. Stable
2. Unstable
3. Not provided
For 1. Stable,
No change after exposure, soaking or boiling in water or an aqueous solution.
Water adsorption isotherm should have a steep uptake.
For 2. Unstable,
The MOF will decompose after exposure to humid environment or partially dissolves in water.
For 3. Not provided,
If you don't know or cannot justify 1 or 2
"""

# Steep uptake behavior is from this paper: https://pubs.acs.org/doi/epdf/10.1021/ja306812r

In [6]:
# Extraction prompt (first version). Asks, for every MOF found, for its water
# stability, a confidence score in [0, 1] and verbatim supporting sentences;
# the classification rules from `Rule` are interpolated into the text.
# Lines ending in a backslash are joined with the following line; the lines
# without one keep their newline.
# NOTE(review): the prompt says the JSON format is "described below", but no
# schema follows in this string. It also mentions `json.load`, which reads from
# a file object; a string reply would be parsed with `json.loads` -- confirm
# the intended wording before changing the prompt.
prompt3a = f"""
    You are an expert chemist. The doc tool describes the water stability properties of a few materials including \
    Metal-Organic Frameworks(MOF) or coordination polymers, Coordination Networks or ZIF compounds. Please find \
    the full name for materials, and their \
    stability in water. \
    Your final answer should contain the following:\
    Your first output is the water stability of each MOF. \
    Your second output for each mof should be a probability score ranging between [0, 1].\
    This probability score shows how certain you are in your answer.\
    Your third output should be exact sentences without any changes from the document that justifies your decision.\
    This should be "Not provided" if you cannot find water stability.\
    Rule: {Rule}\
    If there are multiple MOFs, please list them all and put "MOF name" before them. \
    You should only respond in JSON format as described below. 
    Make sure it is parsable using Python json.load.
    Make sure "MOF name"s do not have duplicates
    If information is not provided, state it as not provided.\
    """

In [7]:
# Baseline without the agent: build the "stuff" retrieval chain over the paper's
# index and ask the extraction prompt directly. The returned string is the
# cell's displayed output.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
qa.run(prompt3a)

'{\n  "MOF-11": {\n    "water_stability": "Unstable",\n    "probability_score": 0.9,\n    "justification": "Bio-MOF-11 dissolved rapidly in water, as evidenced by PXRD (Fig. 10); SEM images could not be obtained."\n  },\n  "MOF-12": {\n    "water_stability": "Unstable",\n    "probability_score": 0.8,\n    "justification": "Bio-MOF-12 dissolved partially in water. The intensity of their signature diﬀraction lines decreased substantially (Fig. 10); SEM images of the soaked samples revealed significant crystallite fragmentation and pitting of the crystal surfaces (Fig. S21 †)."\n  },\n  "MOF-13": {\n    "water_stability": "Unstable",\n    "probability_score": 0.7,\n    "justification": "Bio-MOF-13 dissolved partially in water. The intensity of their signature diﬀraction lines decreased substantially (Fig. 10); SEM images of the soaked samples revealed significant crystallite fragmentation and pitting of the crystal surfaces (Fig. S21 †). However, it was clear from comparing PXRD patterns 

In [8]:
from langchain.llms import OpenAI

# gpt-3.5-turbo-16k is a chat model, so it is wrapped with ChatOpenAI (imported
# in an earlier cell) instead of the completion-style OpenAI class, which only
# accepts chat model names through a deprecated compatibility path.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

# Retrieval QA chain: retrieved chunks are "stuffed" into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

In [9]:
from langchain.agents import Tool, tool

@tool
def eval_justification(justification):
    '''
    Always use this tool at the last step. This function checks if the justification talks about the water stability perdiction makes sense. 
    '''
    # The docstring above is kept verbatim: it presumably doubles as the tool
    # description shown to the agent (confirm against langchain's `tool` decorator).
    import openai  # imported locally, only needed when the tool is invoked

    # Ask the model whether the supplied justification really addresses water
    # stability for every MOF; the prompt text is sent exactly as written.
    review_prompt = f"""
            Does the below sentence talk about water stability on all found MOFS?\
            If not try to find a better justification for the MOF that does not.\
            
            "{justification}"\

            To do this, you should be looking at information on steep uptakes, change or desolving in water and water stability in the sentence.\
            Do not make up answers.\
            """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": review_prompt}],
        temperature=0,
    )
    # Only the text of the first choice is returned to the agent.
    first_message = completion.choices[0].message
    return first_message["content"]


# tools = [doc_retrival]


In [10]:

# Extraction prompt (second version); this assignment replaces the `prompt3a`
# defined in an earlier cell. Compared with that version it says "The document"
# instead of "The doc tool", asks for the "exact full name", and adds the
# instruction not to invent a justification. `Rule` is interpolated as before.
# This is the text that `read_doc` sends to the retrieval chain.
# NOTE(review): the prompt says the JSON format is "described below", but no
# schema follows in this string -- confirm whether one was meant to be included.
prompt3a = f"""
    You are an expert chemist. The document describes the water stability properties of a few materials including \
    Metal-Organic Frameworks(MOF) or coordination polymers, Coordination Networks or ZIF compounds. Please find \
    the exact full name for materials, and their \
    stability in water. \
    Your final answer should contain the following:\
    Your first output is the water stability of each MOF. \
    Your second output for each mof should be a probability score ranging between [0, 1].\
    This probability score shows how certain you are in your answer.\
    Your third output should be exact sentences without any changes from the document that justifies your decision.\
    Do not make up a justification that does not exist in the doc.\
    This should be "Not provided" if you cannot find water stability.\
    Rule: {Rule}\
    If there are multiple MOFs, please list them all and put "MOF name" before them. \
    You should only respond in JSON format as described below. 
    Make sure it is parsable using Python json.load.
    Make sure "MOF name"s do not have duplicates
    If information is not provided, state it as not provided.\
    """

In [11]:
# gpt-3.5-turbo-16k is a chat model, so it is wrapped with ChatOpenAI (imported
# in an earlier cell) instead of the completion-style OpenAI class, which only
# accepts chat model names through a deprecated compatibility path.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

# Retrieval QA chain: retrieved chunks are "stuffed" into a single prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
# qa_chain.run(prompt3a)

In [12]:
@tool
def read_doc(input):
    '''
    Input the users original prompt and get context on water stability of metal organic frameworks.
    Always search for the answers using this tool first, don't make up answers yourself.
    '''
    # The docstring above is kept verbatim: it presumably doubles as the tool
    # description shown to the agent (confirm against langchain's `tool` decorator).
    #
    # NOTE: `input` is not used. Whatever the agent passes in, the tool always
    # sends the fixed extraction prompt `prompt3a` (as defined at call time) to
    # the retrieval chain and returns the chain's 'result' string.
    from langchain.chat_models import ChatOpenAI  # chat wrapper for the chat model below

    # gpt-3.5-turbo-16k is a chat model, so use ChatOpenAI rather than the
    # completion-style OpenAI class (deprecated path for chat model names).
    llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    return qa_chain({"query": prompt3a})['result']

# Tools offered to the agent: document lookup first, justification check second.
tools = [read_doc, eval_justification]


In [13]:
from langchain.memory import ConversationBufferMemory
from langchain.chat_models  import ChatOpenAI
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.callbacks import get_openai_callback

# Chat model that drives the agent's reasoning steps (deterministic: temperature 0).
agent_llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

# Conversation buffer stored under the "chat_history" key.
# NOTE(review): confirm that the zero-shot ReAct prompt actually consumes a
# "chat_history" variable; otherwise this memory has no effect on the prompt.
memory = ConversationBufferMemory(memory_key="chat_history")

# handle_parsing_errors=True lets the run continue when the model's reply is
# malformed (see the "Invalid Format" observations in the recorded run).
agent_chain = initialize_agent(
    tools,
    agent_llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True,
    prefix=None,
    suffix=None,
    output_parser=None,
)

In [17]:
# Task given to the agent; the tools decide how the paper is actually queried.
input_prompt = 'You are an expert chemist. Read paper 1.pdf, find all the mentioned MOFs and their water stability with a justification of your answers.'

# Run the agent while the OpenAI callback records usage for the calls made
# inside the `with` block.
with get_openai_callback() as cb:
    result = agent_chain.run(input=input_prompt)

# Usage summary for the run; `cb` keeps its totals after the context exits.
print(cb)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should start by reading the paper to gather information on the mentioned MOFs and their water stability.
Action: read_doc("paper 1.pdf")[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI need to provide the input for the read_doc action.
Action Input: "paper 1.pdf"[0m
Observation: Invalid Format: Missing 'Action:' after 'Thought:
Thought:[32;1m[1;3mI need to provide the action for the read_doc action.
Action: read_doc
Action Input: "paper 1.pdf"[0m
Observation: [36;1m[1;3m{
  "MOF-11": {
    "water_stability": "Unstable",
    "probability_score": 1,
    "justification": "Bio-MOF-11 dissolved rapidly in water, as evidenced by PXRD (Fig. 10); SEM images could not be obtained."
  },
  "MOF-12": {
    "water_stability": "Unstable",
    "probability_score": 1,
    "justification": "Bio-MOF-12 dissolved partially in water. The intensity of their signature diﬀraction lines decrea

In [18]:
# Convert the agent's final answer into a per-MOF dictionary tagged with the paper id.
# NOTE(review): the recorded output shows 'Score': 0.0 for every MOF although the
# read_doc observation reported probability_score 1 -- confirm whether the agent's
# final answer dropped the scores or parse_to_dict failed to read them.
parsed_stability = eunomia.parse_to_dict(result, paper_id)
parsed_stability

{'MOF-11': {'Predicted Stability': 'Unstable',
  'Score': 0.0,
  'Justification': 'Rapid dissolution in water',
  'Paper id': '1'},
 'MOF-12': {'Predicted Stability': 'Unstable',
  'Score': 0.0,
  'Justification': 'Partial dissolution, significant crystallite fragmentation and pitting',
  'Paper id': '1'},
 'MOF-13': {'Predicted Stability': 'Unstable',
  'Score': 0.0,
  'Justification': 'Partial dissolution, significant crystallite fragmentation and pitting, but more stable than MOF-12',
  'Paper id': '1'},
 'MOF-14': {'Predicted Stability': 'Stable',
  'Score': 0.0,
  'Justification': 'No loss of crystallinity, minimal pitting on crystal surface, water does not affect porosity',
  'Paper id': '1'}}