In [1]:
import openai
import numpy as np
import os
import pickle
#os.environ["OPENAI_API_KEY"] = "ADD YOUR KEY"

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

In [3]:
from typing import List, Optional
from langchain.schema import Document
from copy import deepcopy

def cut_text(text: str, keywords: List[str]) -> str:
    lower_text = text.lower()
    indices = [lower_text.find(keyword) for keyword in keywords if lower_text.find(keyword) != -1]
    min_index = min(indices)
    return text[:min_index].strip()  # remove any trailing spaces

def find_in_document(document: Document, search_strings: List[str]) -> bool:
    return any(search_string.lower() in document.page_content.lower() for search_string in search_strings)

def filter_documents(documents: List[Document], search_strings: List[str]) -> List[Document]:
    filtered_documents = deepcopy(documents)  # Create a deep copy of documents
    for i, doc in enumerate(filtered_documents):
        if find_in_document(doc, search_strings):
            filtered_documents[i].page_content = cut_text(filtered_documents[i].page_content, keywords=search_strings)
            filtered_documents = filtered_documents[:i+1]
            break
    return filtered_documents

# Method : RetrievalQA

In [4]:
paper_id = '2'
mofpaper_path = 'Data/SI_PDF/' + paper_id + '.pdf'
loader = PyPDFLoader(mofpaper_path)
pages = loader.load_and_split()

In [51]:
sliced_pages = filter_documents(pages, ['references ', 'acknowledgement', 'acknowledgments'])
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
sliced_pages_chunks = text_splitter.split_documents(sliced_pages)

In [52]:
print(len(pages), len(sliced_pages))

15 15


In [53]:
from langchain.chains import RetrievalQA
faiss_index_path = f"Data/faiss/faiss_index_0{paper_id}.pkl"
just_load = False
if os.path.isfile(faiss_index_path) and just_load:
    print('loadded Embeddings')
    with open(faiss_index_path, "rb") as f:
        faiss_index = pickle.load(f)
else:
    with open(faiss_index_path, "wb") as f:
        faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model='text-embedding-ada-002'))    # 'text-davinci-003
        pickle.dump(faiss_index, f)
retriever  = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k":5}) # Maximum Marginal Relevance Retrieval


In [54]:
from langchain.llms import OpenAI
from langchain.chat_models  import ChatOpenAI
llm = OpenAI(temperature=0, model_name='gpt-4') # gpt-3.5-turbo-16k
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False,)

In [55]:
query = 'What is DOI, paper title, published journal?'

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    result = qa({"query": query})
    print(result)
    print(cb)

{'query': 'What is DOI, paper title, published journal?', 'result': 'The document does not provide information on the DOI, paper title, or the journal in which it was published.'}
Tokens Used: 4589
	Prompt Tokens: 4567
	Completion Tokens: 22
Successful Requests: 1
Total Cost (USD): $0.13832999999999998


In [56]:
Rule = f"""

There are only 2 options for water stability: \
1. Stable \
2. Unstable \

For 1. Stable, \
The MOF should be stable to keep structure integrity \
after exposure to aqueous solution. \

For 2. Unstable, \
The MOF will decompose after exposure \
to humid environment. \

"""

In [57]:
prompt3a = f"""
    This doc describes the water stability properties of afew \
    Metal-Organic Framework(MOF) compound. Please find \
    the full name for all MOFs, and the \
    water stability corresponding to each MOF. \
    Please use the following rules to identify\
    the water stability of each MOF. \

    Rule: ```{Rule}``` \

    If there are multiple MOFs, please list them all and put "MOF name" before them. \

    Your answer should be in json format. 

    If information is not provided, state it as not provided.\

    """

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    result = qa({"query": prompt3a})
    print(result)
    print(cb)

{'query': '\n    This doc describes the water stability properties of afew     Metal-Organic Framework(MOF) compound. Please find     the full name for all MOFs, and the     water stability corresponding to each MOF.     Please use the following rules to identify    the water stability of each MOF. \n    Rule: ```\n\nThere are only 2 options for water stability: 1. Stable 2. Unstable \nFor 1. Stable, The MOF should be stable to keep structure integrity after exposure to aqueous solution. \nFor 2. Unstable, The MOF will decompose after exposure to humid environment. \n``` \n    If there are multiple MOFs, please list them all and put "MOF name" before them. \n    Your answer should be in json format. \n\n    If information is not provided, state it as not provided.\n    ', 'result': '{\n"MOF-69C": "Unstable",\n"MOF-5": "Unstable",\n"MOF-508b": "Stable",\n"HKUST-1": "Stable",\n"MOF-74": "Stable",\n"ZIF-8": "Stable",\n"Zn-MOF-74": "Not Provided",\n"MOF-508b": "Not Provided",\n"Zn-BDC-DABC

In [58]:
def parse_to_dict(result):
    from pydantic import BaseModel, Field, validator
    from langchain.output_parsers import OutputFixingParser
    class MOF(BaseModel):
        name: List[str] = Field(description="name of a MOF")
        stability: List[str] = Field(description="choose one: stable or unstable")

    from langchain.output_parsers import PydanticOutputParser
    parser = PydanticOutputParser(pydantic_object=MOF)

    llm_parser = OutputFixingParser.from_llm(parser=parser, llm=ChatOpenAI(temperature=0))
    parsed_result = llm_parser.parse(result['result'])
    dict_response = dict(zip(parsed_result.name, parsed_result.stability))
    new_dict = {}
    for i, (key, value) in enumerate(dict_response.items(), start=1):
        new_dict[f"MOF {i}"] = {'MOF name': key, 'Predicted Stability': value}
    return new_dict


In [59]:
parse_to_dict(result)

{'MOF 1': {'MOF name': 'MOF-69C', 'Predicted Stability': 'Unstable'},
 'MOF 2': {'MOF name': 'MOF-5', 'Predicted Stability': 'Unstable'},
 'MOF 3': {'MOF name': 'MOF-508b', 'Predicted Stability': 'Not Provided'},
 'MOF 4': {'MOF name': 'HKUST-1', 'Predicted Stability': 'Not Provided'},
 'MOF 5': {'MOF name': 'MOF-74', 'Predicted Stability': 'Stable'},
 'MOF 6': {'MOF name': 'ZIF-8', 'Predicted Stability': 'Stable'},
 'MOF 7': {'MOF name': 'Zn-MOF-74', 'Predicted Stability': 'Not Provided'},
 'MOF 8': {'MOF name': 'Zn-BDC-DABCO', 'Predicted Stability': 'Not Provided'},
 'MOF 9': {'MOF name': 'Cr-MIL-101', 'Predicted Stability': 'Stable'},
 'MOF 10': {'MOF name': 'Al-MIL-110', 'Predicted Stability': 'Not Provided'}}