In [48]:
import os
import eunomia
import pickle


In [61]:
paper_id = '1'
docs_processor = eunomia.LoadDoc('../Data/SI_PDF/', paper_id=paper_id)
sliced_pages = docs_processor.process(['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
                                             chunk_size=200, chunk_overlap=10, chunking_type='fixed-size')

In [62]:
# from langchain.document_loaders import PyPDFLoader
# loader = PyPDFLoader(f'../Data/SI_PDF/{paper_id}.pdf')
# docs_processor.filter_documents([loader.load_and_split()[-2]], ['references ', 'acknowledgement', 'acknowledgments'])

In [63]:
print(len(docs_processor.pages), len(sliced_pages))

15 12


In [64]:
from langchain.chains import RetrievalQA

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

k = 5
# Embedding_model = 'text-davinci-003' # text-embedding-ada-002
Embedding_model = 'text-embedding-ada-002' # text-embedding-ada-002


faiss_index_path = f"../Data/SI_PDF/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
just_load = False
if os.path.isfile(faiss_index_path) and just_load:
    print('loadded Embeddings')
    with open(faiss_index_path, "rb") as f:
        faiss_index = pickle.load(f)
else:
    with open(faiss_index_path, "wb") as f:  # text-embedding-ada-002
        faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))    # 'text-davinci-003
        pickle.dump(faiss_index, f)

# faiss_index = LanceDB.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

retriever  = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k":k}) # Maximum Marginal Relevance Retrieval


In [65]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings_filter = EmbeddingsFilter(embeddings=OpenAIEmbeddings(model=Embedding_model), similarity_threshold=0.4)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

In [66]:
from langchain.llms import OpenAI
from langchain.chat_models  import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

llm = OpenAI(temperature=0, model_name='gpt-3.5-turbo-16k') # gpt-3.5-turbo-16k
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True,)

In [67]:
query = 'What is DOI, paper title, published journal?'

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    result = qa({"question": query})
    print(result)
#     print(result['result'])
#     print(cb)

{'question': 'What is DOI, paper title, published journal?', 'answer': 'DOI: 10.1039/c3sc22207a\nPaper title: Systematic modulation and enhancement of CO2:N2 selectivity and water stability in an isoreticular series of bio-MOF-11 analogues\nPublished journal: Chemical Science\n', 'sources': '../Data/SI_PDF/1.pdf', 'source_documents': [Document(page_content="14. We modulate the porosity and hydrophobicity within thisaNational Energy Technology Laboratory, Regional University Alliance (NETL-RUA),\nUSA\nbDepartment of Chemistry, University of Pittsburgh, 219 Parkman Ave., Pittsburgh,\nPennsylvania 15260, USA. E-mail: nrosi@pitt.edu\ncDepartment of Chemical and Petroleum Engineering, University of Pittsburgh, 3700\nO'Hara Street, Pittsburgh, Pennsylvania 15261, USA\ndUnited States Department of Energy, National Energy Technology Laboratory, 626\nCochrans Mill Road, Pittsburgh, Pennsylvania 15236, USA\n†Electronic supplementary information (ESI) available: Additional isotherms,\nsingle crys

In [68]:
Rule = f"""
There are only 2 options for water stability: \
1. Stable \
2. Unstable \
For 1. Stable, \
No change after exposure, soaking or boiling in water or an aqueous solution.\
For 2. Unstable, \
The MOF will decompose after exposure \
to humid environment or dissolves in water. \
"""

In [71]:
prompt3a = f"""
    You are an expert chemist. This doc describes the water stability properties of a few \
    Metal-Organic Frameworks(MOF) or ZIF compounds. Please find \
    the full name for all MOFs or ZIFs, and their \
    stability in water. \
    Please use the following rules to identify\
    the water stability of each MOF. \
    Your second output for each mof should be a probability score ranging between [0, 1].\
    This probability score shows how certain you are in your answer.\
    Rule: {Rule}\
    If there are multiple MOFs, please list them all and put "MOF name" before them. \
    You should only respond in JSON format as described below. 
    Make sure it is parsable using Python json.load.
    Make sure "MOF name"s do not have duplicates
    If information is not provided, state it as not provided.\
    """

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    result = qa({"question": prompt3a})
    print(result)
    print(cb)

{'question': '\n    You are an expert chemist. This doc describes the water stability properties of a few     Metal-Organic Frameworks(MOF) or ZIF compounds. Please find     the full name for all MOFs or ZIFs, and their     stability in water.     Please use the following rules to identify    the water stability of each MOF.     Your second output for each mof should be a probability score ranging between [0, 1].    This probability score shows how certain you are in your answer.    Rule: \nThere are only 2 options for water stability: 1. Stable 2. Unstable For 1. Stable, No change after exposure, soaking or boiling in water or an aqueous solution.For 2. Unstable, The MOF will decompose after exposure to humid environment or dissolves in water.     If there are multiple MOFs, please list them all and put "MOF name" before them.     You should only respond in JSON format as described below. \n    Make sure it is parsable using Python json.load.\n    Make sure "MOF name"s do not have dup

In [72]:
result['answer']

'{\n  "MOFs": [\n    {\n      "MOF name": "bio-MOF-11",\n      "Water stability": "Unstable",\n      "Probability score": 1\n    },\n    {\n      "MOF name": "bio-MOF-12",\n      "Water stability": "Unstable",\n      "Probability score": 1\n    },\n    {\n      "MOF name": "bio-MOF-13",\n      "Water stability": "Unstable",\n      "Probability score": 1\n    },\n    {\n      "MOF name": "bio-MOF-14",\n      "Water stability": "Stable",\n      "Probability score": 1\n    }\n  ]\n}\n'

In [57]:
from typing import List
from pydantic import BaseModel, Field, validator, ValidationError
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
# def parse_to_dict(result, paper_id):

#     class MOF(BaseModel):
#         name: str = Field(description="name of a MOF")
#         stability: str = Field(description="choose only one: stable or unstable or not provided")

#         def name_not_provided(cls, field):
#             if field == "Not Provided":
# #                 raise ValueError("Dummy-MOF!")
#                 field = 'Dummy-MOF'
#             return field
    
#     class MOFList(BaseModel):
#         MOFs: List[MOF]

#     from langchain.output_parsers import PydanticOutputParser
#     parser = PydanticOutputParser(pydantic_object=MOF)
#     llm_parser = OutputFixingParser.from_llm(parser=parser, llm=ChatOpenAI(temperature=0))  
    
# #     print(result['result'])
#     parsed_result = llm_parser.parse(result['result'].strip("'").replace("\\", ""))
#     dict_response = dict(zip(parsed_result.name, parsed_result.stability))
#     new_dict = {}
#     for i, (key, value) in enumerate(dict_response.items(), start=1):
#         new_dict[key] = {'Predicted Stability': value, 'Paper id': paper_id}
#     return new_dict

def parse_to_dict(result, paper_id):
    from typing import List
    from langchain.output_parsers import PydanticOutputParser

    class MOF(BaseModel):
        name: str = Field(description="name of a MOF")
        stability: str = Field(description="choose only one: stable or unstable or not provided")

    class MOFList(BaseModel):
        MOFs: List[MOF]

    parser = PydanticOutputParser(pydantic_object=MOFList)
    llm_parser = OutputFixingParser.from_llm(parser=parser, llm=ChatOpenAI(temperature=0))

    try:
        parsed_result = llm_parser.parse(result['result'])
    except OutputParserException as e:
        raise OutputParserException(f"Failed to parse MOF: {e}", llm_output=result['result'])

    dict_response = {mof.name: mof.stability for mof in parsed_result.MOFs}
    new_dict = {key: {'Predicted Stability': value, 'Paper id': paper_id} for key, value in dict_response.items()}
    return new_dict

In [58]:
predictions_dict = parse_to_dict(result, paper_id)
predictions_dict

NameError: name 'OutputParserException' is not defined

In [13]:
import os
import pickle
from langchain.callbacks import get_openai_callback
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chat_models  import ChatOpenAI


def retriev_QA(paper_id, LLM_model, Embedding_model, k, temperature):
    mofpaper_path = '../Data/SI_PDF/' + paper_id + '.pdf'
    print(mofpaper_path)
    with get_openai_callback() as cb:
        docs_processor = eunomia.LoadDoc('../Data/SI_PDF/', paper_id=str(paper_id))
        sliced_pages = docs_processor.process(['references ', 'acknowledgement', 'acknowledgments'],
                                                     chunk_size=500, chunk_overlap=10)
        print(len(docs_processor.pages), len(sliced_pages))
        faiss_index_path = f"../Data/SI_PDF/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
        if os.path.isfile(faiss_index_path):
            with open(faiss_index_path, "rb") as f:
                faiss_index = pickle.load(f)
        else:
            with open(faiss_index_path, "wb") as f:
                faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))   #text-davinci-003
                pickle.dump(faiss_index, f)
        retriever  = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k":k}) # Maximum Marginal Relevance Retrieval        
#         retriever  = faiss_index.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .8}) # Maximum Marginal Relevance Retrieval        
        qa = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(model=LLM_model, temperature=temperature), chain_type="stuff", retriever=retriever, return_source_documents=False)
        answer = qa({"query": prompt3a})
        predictions_dict = parse_to_dict(answer, paper_id)
        return predictions_dict, cb.total_cost

In [14]:
from tqdm import tqdm
import numpy as np
import json
predictions = []
costs = []


LLM_model = 'gpt-4'
Embedding_model = 'text-davinci-003'#'text-davinci-003' text-embedding-ada-002
k = 8
temp = 0

for i in tqdm(range(1,112)):
    print(f'Paper {i}')
    p, c = retriev_QA(str(i), LLM_model, Embedding_model, k, temperature=temp)
    costs.append(c)
    predictions.append(p)
    print(f'\n Total cost: ${np.sum(costs)}\n') 
    results_index_path = f"../Data/SI_PDF/faiss/results/{LLM_model}/faiss-{Embedding_model}/k_{k}/t_{temp}/results_paper_{i}.json"
    with open(results_index_path, 'w') as f:
        json.dump(p, f)
#     break

  0%|                                                                                                                                                        | 0/111 [00:00<?, ?it/s]

Paper 1
../Data/SI_PDF/1.pdf
15 12


  1%|█▎                                                                                                                                              | 1/111 [00:15<28:16, 15.42s/it]


 Total cost: $0.20141849999999997

Paper 2
../Data/SI_PDF/2.pdf
15 15


  2%|██▌                                                                                                                                             | 2/111 [00:44<42:29, 23.39s/it]


 Total cost: $0.423879

Paper 3
../Data/SI_PDF/3.pdf
8 8


  3%|███▉                                                                                                                                            | 3/111 [00:51<29:03, 16.14s/it]


 Total cost: $0.63124

Paper 4
../Data/SI_PDF/4.pdf
11 11


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p


 Total cost: $0.8479140000000001

Paper 5
../Data/SI_PDF/5.pdf
31 25


  5%|██████▍                                                                                                                                         | 5/111 [01:58<42:38, 24.14s/it]


 Total cost: $1.0158900000000002

Paper 6
../Data/SI_PDF/6.pdf
10 9


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p


 Total cost: $1.2088495000000001

Paper 7
../Data/SI_PDF/7.pdf
6 4


  6%|█████████                                                                                                                                       | 7/111 [02:52<41:30, 23.95s/it]


 Total cost: $1.3204495

Paper 8
../Data/SI_PDF/8.pdf
15 13


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p


 Total cost: $1.5449835

Paper 9
../Data/SI_PDF/9.pdf
8 5


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p


 Total cost: $1.6807325

Paper 10
../Data/SI_PDF/10.pdf
7 6


  9%|████████████▉                                                                                                                                  | 10/111 [04:24<45:39, 27.13s/it]


 Total cost: $1.818457

Paper 11
../Data/SI_PDF/11.pdf
6 6


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
 10%|██████████████▏                                                                                                                                | 11/111 [04:33<36:09, 21.69s/it]


 Total cost: $1.9789355

Paper 12
../Data/SI_PDF/12.pdf
120 16


 11%|███████████████▍                                                                                                                               | 12/111 [05:06<41:28, 25.14s/it]


 Total cost: $2.0811165

Paper 13
../Data/SI_PDF/13.pdf
16 13


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p


 Total cost: $2.271349

Paper 14
../Data/SI_PDF/14.pdf
8 8


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-ch7xZkxfNhNOfiyd4gkEf754 on tokens p

InvalidRequestError: This model's maximum context length is 8192 tokens. However, your messages resulted in 8336 tokens. Please reduce the length of the messages.

## Two-step LLM

In [45]:
chain_1_prompt = f"""
    This doc describes the water stability properties of a few \
    Metal-Organic Framework(MOF) compound. Please find \
    the full name for all MOFs.
    Please use the following rules:

    Rules: \

    If there are multiple MOFs, please list them all and put "MOF name" before them. \

    Your output sould be a Python list.
    
    Make sure "MOF name"s do not have duplicates.

    """




def two_step_retriev_QA_(paper_id, LLM_model, Embedding_model, k, temperature):
    mofpaper_path = '../Data/SI_PDF/' + paper_id + '.pdf'
    print(mofpaper_path)
    with get_openai_callback() as cb:
        docs_processor = eunomia.LoadDoc('../Data/SI_PDF/', paper_id=str(paper_id))
        sliced_pages = docs_processor.process(['references ', 'acknowledgement', 'acknowledgments',
                                               'acknowledgments\n', 'references\n '],
                                                     chunk_size=None, chunk_overlap=20)
        print(len(docs_processor.pages), len(sliced_pages))
        faiss_index_path = f"../Data/SI_PDF/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
        if os.path.isfile(faiss_index_path):
            with open(faiss_index_path, "rb") as f:
                faiss_index = pickle.load(f)
        else:
            with open(faiss_index_path, "wb") as f:
                faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))   #text-davinci-003
                pickle.dump(faiss_index, f)
        retriever  = faiss_index.as_retriever(search_type="mmr", search_kwargs={"k":k}) # Maximum Marginal Relevance Retrieval        
#         retriever  = faiss_index.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .8}) # Maximum Marginal Relevance Retrieval        
        qa = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(model=LLM_model, temperature=temperature), chain_type="stuff",
            retriever=retriever, return_source_documents=False)
        answer_1 = qa({"query": chain_1_prompt})
        mof_list = answer_1['result']
        chain_2_prompt = f"""
        You are an expert chemist. This doc describes the water stability properties of a few \
        Metal-Organic Frameworks(MOF) listed as {mof_list}. Find \
        the water stability corresponding to each MOF using the rules below:\

        Rules: \
        There are only 2 options for water stability: \
        1. Stable \
        2. Unstable \
        For 1. Stable, \
        No change after exposure, soaking or boiling in water or an aqueous solution.\
        For 2. Unstable, \
        The MOF will decompose after exposure \
        to humid environment or dissolves in water. \
        
        If there are multiple MOFs, please list them all and put "MOF name" before them. \
        You should only respond in JSON format as described below. \
        Make sure it is parsable using Python json.load.\
        If information is not provided, state it as not provided.\
        """
        answer_2 = qa(chain_2_prompt)
        predictions_dict = parse_to_dict(answer_2, paper_id)
        return predictions_dict, cb.total_cost

In [47]:
from tqdm import tqdm
import numpy as np
import json
predictions = []
costs = []

LLM_model = 'gpt-3.5-turbo-16k'
Embedding_model = 'text-davinci-003'#'text-davinci-003' text-embedding-ada-002
k = 12
temp = 0
# 112
for i in tqdm(range(21,112)):
    print(f'Paper {i}')
    p, c = two_step_retriev_QA_(str(i), LLM_model, Embedding_model, k, temperature=temp)
    costs.append(c)
    predictions.append(p)
    print(f'\n Total cost: ${np.sum(costs)}\n') 
    results_index_path = f"../Data/SI_PDF/faiss/results/{LLM_model}/faiss-{Embedding_model}/k_{k}/t_{temp}/two-step-no-chunk/results_paper_{i}.json"
    with open(results_index_path, 'w') as f:
        json.dump(p, f)
#     break

  0%|                                                                                                                                                    | 0/91 [00:00<?, ?it/s]

Paper 21
../Data/SI_PDF/21.pdf
28 23


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': 

APIError: Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Thu, 27 Jul 2023 14:09:41 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7ed56eb05801387e-YYZ', 'alt-svc': 'h3=":443"; ma=86400'}

In [None]:
from typing import List
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser

class MOF(BaseModel):
    name: List[str] = Field(description="name of a MOF")
    stability: List[str] = Field(description="choose only one: stable or unstable or not provided")
    @validator("name", pre=True)
    def name_not_provided(cls, field):
        if field == "Not Provided":
#                 raise ValueError("Dummy-MOF!")
            field = 'Dummy-MOF'
        return field

from langchain.output_parsers import PydanticOutputParser
parser = PydanticOutputParser(pydantic_object=MOF)
#     try:
llm_parser = OutputFixingParser.from_llm(parser=parser, llm=ChatOpenAI(temperature=0))

test_dict = {
  "MOFs": [
    {
      "MOF name": "DMOF",
      "Water stability": "Stable"
    },
    {
      "MOF name": "MOF-508",
      "Water stability": "Stable"
    },
    {
      "MOF name": "DMOF-TM",
      "Water stability": "Not provided"
    },
    {
      "MOF name": "MOF-508-TM",
      "Water stability": "Not provided"
    }
  ]
}

In [None]:
parsed_result = parse_to_dict(test_dict,65)