## Loading test data obtained from matscholar

In [14]:
import json
from tqdm import tqdm
import numpy as np
import copy

def read_jsonl(filename):
    """
    Read a jsonlines file into a list of dictionaries.

    Args:
        filename (str): Path to input file.

    Returns:
        ([dict]): List of dictionaries.
    """
    with open(filename, "r") as f:
        lines = f.readlines()
        samples = []
        for l in lines:
            samples.append(json.loads(l))
        return samples

In [2]:
general_annotations_file = '../data/NERRE/mof_results/run_1.jsonl'
run = read_jsonl(general_annotations_file)
run[1].keys()

dict_keys(['prompt', 'completion', 'gpt3_completion'])

In [3]:
text_input = run[3]['prompt']
text_input

'Bimetallic MOFs (H3O)x[Cu(MF6)(pyrazine)2]·(4 − x)H2O (M = V4+, x = 0; M = Ga3+, x = 1): co-existence of ordered and disordered quantum spins in the V4+ system\nThe title compounds are bimetallic MOFs containing [Cu(pyz)2]2+ square lattices linked by MF6n− octahedra. In each, only the Cu2+ spins exhibit long-range magnetic order below 3.5 K (M = V4+) and 2.6 K (M = Ga3+). The V4+ spins remain disordered down to 0.5 K.\n\n###\n\n'

In [4]:
import eunomia
docs_processor = eunomia.LoadDoc(text_input=text_input)
sliced_pages = docs_processor.process(chunk_size=2000, chunk_overlap=5, chunking_type='fixed-size')

In [5]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-ada-002'
faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

In [94]:
ENTS_FROZEN = ['name_of_mof', 'mof_formula', 'mof_description', 'guest_species', 'applications']
ROOT = ("name_of_mof",)
LINK_DELIMITER = "|||"


ENTS_FROZEN_NOROOT = [e for e in ENTS_FROZEN if e not in ROOT]
ENTS_LINKS_FROZEN = [f"{root}{LINK_DELIMITER}{e}" for e in ENTS_FROZEN_NOROOT for root in ROOT]

def evlaute_model_instance(prompt ,test_json, gold_json, model='Eunomia'):

    exact_matches = 0
    unparsable = 0
    total = 0
    
    ent_scores_test = {e: [] for e in ENTS_FROZEN}
    ent_scores_gold = {e: [] for e in ENTS_FROZEN}
    
    subdict = {"test_correct_triplets": 0, "test_retrieved_triplets": 0, "gold_retrieved_triplets": 0}
    links_scores = {el: copy.deepcopy(subdict) for el in ENTS_LINKS_FROZEN}

    # if test_json:
    #     test_accounting = ent_json_to_word_basis_sets(test_json)
    # else:
    #     test_accounting = ent_json_to_word_basis_sets({}, return_empty=True)
    # with open(path, 'r') as f:
    #     result = json.load(f)
    # test_json = result[model]
    # gold_json = result['ground-truth']
    test_accounting = ent_json_to_word_basis_sets(test_json)
    # gold_json = json.loads(gold_json.replace("\n\nEND\n\n", "").strip())
    gold_accounting = ent_json_to_word_basis_sets(gold_json)
    for etype in ENTS_FROZEN:
        ent_accounting_copy = copy.deepcopy(test_accounting[etype])
        
        n_prompt_words = len([w for w in prompt.split(" ") if w])
        n_unlabelled_words = copy.deepcopy(n_prompt_words)
        for ew in gold_accounting[etype]:
    
            # Account for true positives
            if ew in test_accounting[etype]:
                ent_scores_test[etype].append(1)
                ent_scores_gold[etype].append(1)
                ent_accounting_copy.remove(ew)
                n_unlabelled_words -= 1
            # account for false negatives
            else:
                ent_scores_test[etype].append(0)
                ent_scores_gold[etype].append(1)
                n_unlabelled_words -= 1
    
        # Among the remaining test accounting words, only false positives
        # should remain in the set
        for ew in ent_accounting_copy:
            ent_scores_test[etype].append(1)
            ent_scores_gold[etype].append(0)
            n_unlabelled_words -= 1
    
        # the only labels remaining are true negatives
        ent_scores_test[etype] += [0] * n_unlabelled_words
        ent_scores_gold[etype] += [0] * n_unlabelled_words
    
    for elinktype in ENTS_LINKS_FROZEN:
        gold_triples = gold_accounting[elinktype]
        test_triples = test_accounting[elinktype]
    
        n_correct_triples = len([e for e in test_triples if e in gold_triples])
        links_scores[elinktype]["test_correct_triplets"] += n_correct_triples
        links_scores[elinktype]["test_retrieved_triplets"] += len(test_triples)
        links_scores[elinktype]["gold_retrieved_triplets"] += len(gold_triples)

    classification_results = {"ents": {}, "links": {}}
    from sklearn.metrics import f1_score, recall_score, precision_score

    from sklearn.metrics import f1_score, recall_score, precision_score

    for etype in ENTS_FROZEN:
        gold_arr = ent_scores_gold[etype]
        test_arr = ent_scores_test[etype]
            # Check if both arrays are zeros. If so, set recall to 1.
        if sum(test_arr) == 0 and sum(gold_arr) == 0:
            print(gold_arr , etype)
            print(test_arr , etype)
            recall = 1.0
            precision = 1.0
            f1 = 1.0
        else:
            recall = np.round(recall_score(gold_arr, test_arr),3)
            percision = np.round(precision_score(gold_arr, test_arr),3)
            f1 = np.round(f1_score(gold_arr, test_arr),3)
        subdict = {"recall": 0, "precision": 0, "f1": 0}
        subdict["recall"] = recall
        subdict["precision"] = percision
        subdict["f1"] = f1
        classification_results["ents"][etype] = subdict
    return classification_results

def check_equivalence_of_entries(gold_entry, test_entry):
    ## Entries are a list of dictionaries
    ## We first order each list, then each dictionary, then compare strings


    ### order list by formula key
    gold_entry = sorted(gold_entry, key=lambda x: x.get('formula', ''))
    test_entry = sorted(test_entry, key=lambda x: x.get('formula', ''))

    ### order each dictionary by keys
    gold_entry = [dict(sorted(d.items())) for d in gold_entry]
    test_entry = [dict(sorted(d.items())) for d in test_entry]

    ### compare strings
    return str(gold_entry) == str(test_entry)

def ent_json_to_word_basis_sets(ent_json, return_empty=False):
    """
    Where ent_json is multiple entries in a list

    Return all entities and links in a set-based word basis
    """
    # Must account for these in a weird way because the entries are not ordered :(
    to_account = {e: set() for e in ENTS_FROZEN + ENTS_LINKS_FROZEN}

    if return_empty:
        return to_account

    for entry in ent_json:
        root_accounting = {root: set() for root in ROOT}
        for etype in ENTS_FROZEN:
            ent_strs = entry[etype]
            if isinstance(ent_strs, str):
                for w in ent_str_to_words(ent_strs):
                    to_account[etype].add(w)
                if etype in ROOT and ent_strs:
                    # Formulae/roots must be counted as single words
                    root_accounting[etype].add(ent_strs)
                    # root_accounting[etype] = root_accounting[etype].union(set(ent_str_to_words(ent_strs)))
            elif isinstance(ent_strs, list):
                for ent_str in ent_strs:
                    for w in ent_str_to_words(ent_str):
                        to_account[etype].add(w)
            else:
                raise ValueError(f"Ent strings was a weird type: {type(ent_strs)}, {ent_strs}")

        # Add links
        for root, accounting in root_accounting.items():
            if accounting:
                for e in ENTS_FROZEN_NOROOT:
                    ent_strs = entry[e]
                    words = []
                    if isinstance(ent_strs, str):
                        words = ent_str_to_words(ent_strs)
                    elif isinstance(ent_strs, list):
                        for ent_str in ent_strs:
                            words += ent_str_to_words(ent_str)
                    else:
                        raise ValueError(f"Ent strings was a weird type: {type(ent_strs)}, {ent_strs}")

                    if words:
                        for f in accounting:
                            for w in words:
                                # avoid self-links
                                if f != w:
                                    to_account[f"{root}{LINK_DELIMITER}{e}"].add(f"{f}{LINK_DELIMITER}{w}")
    return to_account

def ent_str_to_words(ent):
    stripped =  [e.strip() for e in ent.split(" ")]
    return [e for e in stripped if e]

## General Schema

In [3]:
from langchain.agents import Tool, tool


# this is the general case
prompt = """You are an expert chemist. Read context and answer the following questions:
name of material? chemical formula? applications? material description? guest species?


There may be more than a single material in the context. If so, answer all the questions for each one you find.

Your findings should be structured like this:

[{"acronym": other names mentioned for a material/formula that actually exists in the context provided.

"applications" : list of applications or high-level use cases or major property classes for the material. 
Applications may represent different levels of device-level implementation.

"name": name of the material.

"formula": chemical formula of the material.

"structure_or_phase": list of entities that directly imply the crystalline structure or symmetry of the material. 
Crystal systems such as "cubic" or "tetragonal", structure names such as "rutile" or "NASICON", and space groups 
such as "Fd3m" or "space group No. 93" are all extracted in this field.

"description": list of details about a material’s processing history, defects, modifications, or the sample’s morphology.}]

If the context don't mention each item, leave them empty. Do not make up answers and use the exact words from the context. 
Format the output into a python list of JSONs.

"""

@tool
def read_context(input):
    '''
    Input the users original prompt and get context to answer to questions.
    Always search for the answers using this tool first, don't make up answers yourself.
    '''
    from langchain.llms import OpenAI
    k = 5
    min_k = 4  # Minimum limit for k
    llm = OpenAI(temperature=0, model_name='gpt-4')  
    result = eunomia.RetrievalQABypassTokenLimit(prompt, faiss_index, k=k, min_k=min_k, llm=llm,
                         search_type="mmr", fetch_k=50, chain_type="stuff", memory=None)
    return result

    

import json
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
from pydantic import BaseModel, Field

# @tool
def format_output(answer):
    """
    Format the agent's answer into a python list of JSONs
    """

    class Material(BaseModel):
        """Pydantic data model for a Material."""
        acronym: str = Field(description="acronym of a Material")
        applications: list = Field(description="applications of a Material")
        name: str = Field(description="name of a Material")
        formula: str = Field(description="formula of a Material")
        structure_or_phase: list = Field(description="structure_or_phase of a Material")
        description: list = Field(description="description of a Material")

    class MaterialList(BaseModel):
        """Pydantic data model for a list of Materials."""

        Materials: List[Material]

    # Initialize a Pydantic output parser for the Material list model
    parser = PydanticOutputParser(pydantic_object=MaterialList)

    # Enhance the parser with capabilities to handle specific LLM outputs
    llm_parser = OutputFixingParser.from_llm(
        parser=parser, llm=ChatOpenAI(temperature=0, model="gpt-4")
    )

    # Check if result is a valid JSON string
    try:
        json.loads(answer, strict=False)
    except json.JSONDecodeError:
        # If not, convert it to a valid JSON string
        answer = json.dumps({"dummy_key": answer})
    # Try parsing the answer using the enhanced parser
    try:
        parsed_result = llm_parser.parse(answer)
    except ValueError as e:
        raise ValueError(f"Failed to parse Material: {e}")

    # Construct a dictionary with materials' details using comprehension
    parsed_response = [
        {   "acronym": material.acronym,
            "applications": material.applications,
            "name": material.name,
            "formula": material.formula,
            "structure_or_phase": material.structure_or_phase,
            "description": material.description
        }
        for material in parsed_result.Materials
    ]

    return parsed_response



from eunomia.agents import Eunomia
agent = Eunomia(tools=[read_context],
                model='gpt-4', get_cost=True, temp=0)



## Metal–organic framework (MOF) schema

In [133]:
mof_annotations_file = '../data/NERRE/mof_results/run_1.jsonl'
run = read_jsonl(mof_annotations_file)
m = 10
text_input = run[m]['prompt']
ground_truth = run[m]['completion']
prompt = run[m]["prompt"].replace("\n\n###\n\n", "").strip()
text_input

'Nanoporous Ag2O photocatalysts based on copper terephthalate metal–organic frameworks\nWe report the nanoporous Ag2O based on the copper terephthalate metal–organic frameworks (Ag2O/MOF) photocatalyst. Ag2O/MOF nanostructure was formed via oxygen treatment of Ag/MOF nanoparticles. The resulting Ag2O/MOF photocatalysts were characterised by various techniques. Results showed that the synthesised Ag2O/MOF nanocomposite exhibited dramatic separation of photoinduced electron/hole and excellent photodegradation activity under visible light irradiation. The degradation rate of acid blue 92 using Ag2O/MOF nanocomposite is found to be higher than that using pure MOF and Ag/MOF. It was divulged that the photodegradation rate is increased by oxygen treatment of Ag in Ag/MOF structure. The possible mechanism for the enhanced photocatalytic properties of the Ag2O/MOF nanocomposite was also discussed. Similar to the mechanism proposed in the photodegradation by the other semiconductors, we propose

In [156]:
import eunomia
docs_processor = eunomia.LoadDoc(text_input=text_input)
sliced_pages = docs_processor.process(chunk_size=600, chunk_overlap=50, chunking_type='NLTK')
sliced_pages

[Document(page_content='Nanoporous Ag2O photocatalysts based on copper terephthalate metal–organic frameworks\nWe report the nanoporous Ag2O based on the copper terephthalate metal–organic frameworks (Ag2O/MOF) photocatalyst.\n\nAg2O/MOF nanostructure was formed via oxygen treatment of Ag/MOF nanoparticles.\n\nThe resulting Ag2O/MOF photocatalysts were characterised by various techniques.\n\nResults showed that the synthesised Ag2O/MOF nanocomposite exhibited dramatic separation of photoinduced electron/hole and excellent photodegradation activity under visible light irradiation.', metadata={'source': 'local'}),
 Document(page_content='The degradation rate of acid blue 92 using Ag2O/MOF nanocomposite is found to be higher than that using pure MOF and Ag/MOF.\n\nIt was divulged that the photodegradation rate is increased by oxygen treatment of Ag in Ag/MOF structure.\n\nThe possible mechanism for the enhanced photocatalytic properties of the Ag2O/MOF nanocomposite was also discussed.', 

In [157]:
# from langchain.schema import Document
# pages = Document(page_content=text_input, metadata={"source": "local"})
# pages

In [158]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
Embedding_model = 'text-embedding-ada-002'
faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

In [159]:
# [{"name_of_mof": name of the MOF

# "mof_formula": chemical formula of the MOF.

# "mof_description": list of details about a MOF’s processing history, defects, modifications, or the sample’s morphology.

# "guest_species": list of chemical species that have been incorporated, stored, or absorbed in the MOF.

# "applications" : list of applications or high-level use cases or major property classes for the MOF.
# Applications may represent different levels of device-level implementation. Make the answer short like: "gas-separation", "heterogeneous
# catalyst", "Diels-Alder reactions"}]

In [160]:
from langchain.agents import Tool, tool

# this is the MOF case
prompt_mof = """You are an expert chemist. Read context, find all the MOFs and answer the following questions for each one:
name of MOF? MOF chemical formula? MOF description? guest species? applications?

Use this rules: 
MOF names cannot be general like: "copper terephthalate metal–organic frameworks".
Valid chemical formula should be similar to: "Zn3(BTC)2", "Zn4O(1,4-benzenedicarboxylate)3" or "[Ln(2,5-pzdc)(ox)0.5(H2O)2] 4.5H2O (1-Ln; Ln = Nd, Sm, Eu, Gd, Tb, Dy)".
MOF description is a list of details about a MOF’s processing history, defects, modifications, or the sample’s morphology.
Guest species is a list of chemical molecules (like water, hydrogen, CO2, CH4, N2, H2, VOCs, etc.) that have been incorporated, stored, or absorbed in the MOF.
Applications is a list summary of applications or high-level use cases or major property classes for the MOF.
Applications may represent different levels of device-level implementation. Answer shortly short like: "gas-separation", "heterogeneous
catalyst", "Diels-Alder reactions"

There may be more than a single MOF in the context. If so, answer all the questions for each one you find.

Your findings should be structured like this:

[{"name_of_mof": ...

"mof_formula": ...

"mof_description": [...]

"guest_species": [...]

"applications" : [...]}]


If the context don't mention each item, leave them empty and use the following format:
"name_of_mof": ""
"mof_formula": ""
"mof_description": [""]
"guest_species": [""]
"applications" : [""]

Do not make up answers and use the exact words from the context. 
Format the output into a python list of JSONs.

"""

@tool
def read_context(input):
    '''
    Input the users original prompt and get context to answer to questions.
    Always search for the answers using this tool first, don't make up answers yourself.
    '''
    from langchain.llms import OpenAI
    k = 10
    min_k = 4  # Minimum limit for k
    llm = OpenAI(temperature=0, model_name='gpt-4')  
    result = eunomia.RetrievalQABypassTokenLimit(prompt_mof, faiss_index, k=k, min_k=min_k, llm=llm,
                         search_type="mmr", fetch_k=50, chain_type="stuff", memory=None)
    return result

    

import json
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
from pydantic import BaseModel, Field

# @tool
def format_output(answer):
    """
    Format the agent's answer into a python list of JSONs
    """

    class MOF(BaseModel):
        """Pydantic data model for a MOF."""
        name: str = Field(description="name of a MOF")
        formula: str = Field(description="formula of a MOF")
        description: list = Field(description="description of a MOF")
        guest_species: list = Field(description="guest_species of a MOF")
        applications: list = Field(description="applications of a MOF")

    class MOFList(BaseModel):
        """Pydantic data model for a list of MOFs."""

        MOFs: List[MOF]

    # Initialize a Pydantic output parser for the MOF list model
    parser = PydanticOutputParser(pydantic_object=MOFList)

    # Enhance the parser with capabilities to handle specific LLM outputs
    llm_parser = OutputFixingParser.from_llm(
        parser=parser, llm=ChatOpenAI(temperature=0, model="gpt-4")
    )

    # Check if result is a valid JSON string
    try:
        json.loads(answer, strict=False)
    except json.JSONDecodeError:
        # If not, convert it to a valid JSON string
        answer = json.dumps({"dummy_key": answer})
    # Try parsing the answer using the enhanced parser
    try:
        parsed_result = llm_parser.parse(answer)
    except ValueError as e:
        raise ValueError(f"Failed to parse MOF: {e}")

    # Construct a dictionary with MOFs' details using comprehension
    parsed_response = [
        {   
            "name_of_mof": MOF.name,
            "mof_formula": MOF.formula,
            "mof_description": MOF.description,
            "guest_species": MOF.guest_species,
            "applications": MOF.applications
        }
        for MOF in parsed_result.MOFs
    ]

    return parsed_response



from eunomia.agents import Eunomia
agent = Eunomia(tools=[read_context],
                model='gpt-4', get_cost=True, temp=0)



In [None]:
results = agent.run(prompt=prompt_mof)
json.loads(results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to read the context to find the information about MOFs.
Action: read_context
Action Input: You are an expert chemist. Read context, find all the MOFs and answer the following questions for each one:
name of MOF? MOF chemical formula? MOF description? guest species? applications?

Use this rules: 
MOF names cannot be general like: "copper terephthalate metal–organic frameworks".
Valid chemical formula should be similar to: "Zn3(BTC)2", "Zn4O(1,4-benzenedicarboxylate)3" or "[Ln(2,5-pzdc)(ox)0.5(H2O)2] 4.5H2O (1-Ln; Ln = Nd, Sm, Eu, Gd, Tb, Dy)".
MOF description is a list of details about a MOF’s processing history, defects, modifications, or the sample’s morphology.
Guest species is a list of chemical molecules (like water, hydrogen, CO2, CH4, N2, H2, VOCs, etc.) that have been incorporated, stored, or absorbed in the MOF.
Applications is a list summary of applications or high-level use cases or major property classes fo

In [131]:
results

'[{"name_of_mof": "", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications" : [""]}]'

In [132]:
json.loads(results)

[{'name_of_mof': '',
  'mof_formula': '',
  'mof_description': [''],
  'guest_species': [''],
  'applications': ['']}]

In [116]:
parsed_results = format_output(results)
parsed_results

[{'name_of_mof': 'Ag2O/MOF',
  'mof_formula': 'Ag2O',
  'mof_description': ['Nanoporous Ag2O photocatalysts based on copper terephthalate metal–organic frameworks',
   'Ag2O/MOF nanostructure was formed via oxygen treatment of Ag/MOF nanoparticles'],
  'guest_species': [''],
  'applications': ['degradation of organic pollutants for water purification']},
 {'name_of_mof': 'Ag/MOF',
  'mof_formula': 'Ag',
  'mof_description': ['Ag/MOF nanoparticles'],
  'guest_species': [''],
  'applications': ['']}]

In [33]:
prompt

'Topology-dependent emissive properties of zirconium-based porphyrin MOFs\nHighly ordered chromophoric linkers positioned within the metal–organic frameworks (MOFs) have the potential to mimic natural light-harvesting complexes. Herein we report topological control over the photophysical properties of MOFs via modular interchromophoric electronic coupling to manifest different steady-state singlet emission spectra and their corresponding fluorescence lifetimes.'

In [129]:
ground_truth

' [{"name_of_mof": "", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications": ["chromatography"]}]\n\nEND\n\n'

In [85]:
evlaute_model_instance(resultprompt, parsed_results, ground_truth)

NameError: name 'resultprompt' is not defined

In [121]:
run_ID =  1
i = 10
file = f'../data/NERRE/mof_results/{run_ID}/run_{run_ID}_{i}.json'
with open(file, 'r') as f:
    result = json.load(f)

In [122]:
result['prompt']

'Nanoporous Ag2O photocatalysts based on copper terephthalate metal–organic frameworks\nWe report the nanoporous Ag2O based on the copper terephthalate metal–organic frameworks (Ag2O/MOF) photocatalyst. Ag2O/MOF nanostructure was formed via oxygen treatment of Ag/MOF nanoparticles. The resulting Ag2O/MOF photocatalysts were characterised by various techniques. Results showed that the synthesised Ag2O/MOF nanocomposite exhibited dramatic separation of photoinduced electron/hole and excellent photodegradation activity under visible light irradiation. The degradation rate of acid blue 92 using Ag2O/MOF nanocomposite is found to be higher than that using pure MOF and Ag/MOF. It was divulged that the photodegradation rate is increased by oxygen treatment of Ag in Ag/MOF structure. The possible mechanism for the enhanced photocatalytic properties of the Ag2O/MOF nanocomposite was also discussed. Similar to the mechanism proposed in the photodegradation by the other semiconductors, we propose

In [76]:
test_accounting = ent_json_to_word_basis_sets(parsed_results)
gold_accounting = ent_json_to_word_basis_sets(result['ground-truth'])
print(test_accounting)

{'name_of_mof': {'Ag/MOF', 'Ag2O/MOF'}, 'mof_formula': {'Ag2O', 'Ag'}, 'mof_description': {'formed', 'oxygen', 'treatment', 'on', 'of', 'terephthalate', 'photocatalysts', 'Nanoporous', 'frameworks', 'was', 'nanoparticles', 'Ag2O/MOF', 'Ag2O', 'Ag/MOF', 'metal–organic', 'based', 'via', 'nanostructure', 'copper'}, 'guest_species': set(), 'applications': {'for', 'purification', 'degradation', 'water', 'pollutants', 'of', 'organic'}, 'name_of_mof|||mof_formula': {'Ag/MOF|||Ag', 'Ag2O/MOF|||Ag2O'}, 'name_of_mof|||mof_description': {'Ag2O/MOF|||Nanoporous', 'Ag2O/MOF|||formed', 'Ag2O/MOF|||Ag2O', 'Ag2O/MOF|||based', 'Ag2O/MOF|||Ag/MOF', 'Ag2O/MOF|||on', 'Ag2O/MOF|||was', 'Ag2O/MOF|||of', 'Ag2O/MOF|||frameworks', 'Ag/MOF|||nanoparticles', 'Ag2O/MOF|||nanoparticles', 'Ag2O/MOF|||copper', 'Ag2O/MOF|||nanostructure', 'Ag2O/MOF|||terephthalate', 'Ag2O/MOF|||oxygen', 'Ag2O/MOF|||via', 'Ag2O/MOF|||treatment', 'Ag2O/MOF|||photocatalysts', 'Ag2O/MOF|||metal–organic'}, 'name_of_mof|||guest_species': s

In [78]:
print(gold_accounting)

{'name_of_mof': {'Ag/MOF', 'Ag2O/MOF'}, 'mof_formula': set(), 'mof_description': {'frameworks', 'terephthalate', 'copper', 'metal–organic'}, 'guest_species': set(), 'applications': {'photodegradation', 'photocatalytic', 'photocatalysts'}, 'name_of_mof|||mof_formula': set(), 'name_of_mof|||mof_description': {'Ag2O/MOF|||copper', 'Ag2O/MOF|||frameworks', 'Ag2O/MOF|||terephthalate', 'Ag2O/MOF|||metal–organic'}, 'name_of_mof|||guest_species': set(), 'name_of_mof|||applications': {'Ag/MOF|||photocatalysts', 'Ag/MOF|||photodegradation', 'Ag2O/MOF|||photodegradation', 'Ag2O/MOF|||photocatalysts', 'Ag2O/MOF|||photocatalytic'}}


In [89]:
result['ground-truth']

[{'name_of_mof': 'Ag2O/MOF',
  'mof_formula': '',
  'mof_description': ['copper terephthalate metal–organic frameworks'],
  'guest_species': [''],
  'applications': ['photocatalysts', 'photodegradation', 'photocatalytic ']},
 {'name_of_mof': 'Ag/MOF',
  'mof_formula': '',
  'mof_description': [''],
  'guest_species': [''],
  'applications': ['photocatalysts', 'photodegradation']}]

In [88]:
parsed_results

[{'name_of_mof': 'Ag2O/MOF',
  'mof_formula': 'Ag2O',
  'mof_description': ['Nanoporous Ag2O photocatalysts based on copper terephthalate metal–organic frameworks',
   'Ag2O/MOF nanostructure was formed via oxygen treatment of Ag/MOF nanoparticles'],
  'guest_species': [''],
  'applications': ['degradation of organic pollutants for water purification']},
 {'name_of_mof': 'Ag/MOF',
  'mof_formula': 'Ag',
  'mof_description': ['Ag/MOF nanoparticles'],
  'guest_species': [''],
  'applications': ['']}]

In [120]:
evlaute_model_instance(result['prompt'], parsed_results, result['ground-truth'])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] guest_species
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  _warn_prf(average, modifier, msg_start, len(result))


{'ents': {'name_of_mof': {'recall': 1.0, 'precision': 1.0, 'f1': 1.0},
  'mof_formula': {'recall': 0.0, 'precision': 0.0, 'f1': 0.0},
  'mof_description': {'recall': 1.0, 'precision': 0.211, 'f1': 0.348},
  'guest_species': {'recall': 1.0, 'precision': 0.211, 'f1': 1.0},
  'applications': {'recall': 0.0, 'precision': 0.0, 'f1': 0.0}},
 'links': {}}

In [119]:
calculate_metrics(result['ground-truth'], parsed_results)

{'ents': {'name_of_mof': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'mof_formula': {'precision': 0.0, 'recall': 0, 'f1': 0},
  'mof_description': {'precision': 0.0, 'recall': 0.0, 'f1': 0},
  'guest_species': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0},
  'applications': {'precision': 0.0, 'recall': 0.0, 'f1': 0}}}

In [100]:
def calculate_metrics(ground_truth, predicted):
    # Initialize results
    results = {"ents": {}}
    
    # Define keys to compare
    keys = ['name_of_mof', 'mof_formula', 'mof_description', 'guest_species', 'applications']

    for key in keys:
        TP, FP, FN = 0, 0, 0
        
        for gt, pred in zip(ground_truth, predicted):
            # Counting True Positives
            TP += sum([item in pred[key] for item in gt[key]])
            # Counting False Positives
            FP += sum([item not in gt[key] for item in pred[key]])
            # Counting False Negatives
            FN += sum([item not in pred[key] for item in gt[key]])

        # Calculate precision, recall, and F1 score
        precision = TP / (TP + FP) if (TP + FP) != 0 else 0
        recall = TP / (TP + FN) if (TP + FN) != 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        
        results["ents"][key] = {
            'precision': round(precision, 3),
            'recall': round(recall, 3),
            'f1': round(f1_score, 3)
        }

    return results

In [107]:
ground_truth

' [{"name_of_mof": "", "mof_formula": "", "mof_description": ["zirconium-based porphyrin MOFs"], "guest_species": [""], "applications": ["photophysical properties", "singlet emission", "fluorescence"]}]\n\nEND\n\n'

In [54]:
sample["completion"].replace("\n\nEND\n\n", "").strip()

'[{"name_of_mof": "NH2-MIL-125", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}, {"name_of_mof": "CdS/NH2-MIL-125@TiO2", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}]'

In [55]:
ground_truth_json = json.loads( run[0]["completion"].replace("\n\nEND\n\n", "").strip())


dict = {
    'prompt': run[0]['prompt'], 'Eunomia': parsed_results, 'NERRE': run[0]['gpt3_completion'], 'ground-truth': ground_truth_json
}

with open('../data/NERRE/mof_results/0/run_0_0.json', 'w') as f:
    json.dump(dict,f, indent = 4)

dict

{'prompt': 'A mild one-step method for enhancing optical absorption of amine-functionalized metal-organic frameworks\nMetal-organic frameworks (MOFs) are of significant interest for photocatalysis using visible light. Here we discuss HOMO/LUMO gap modification of amine-functionalized MOFs through a mild one-step method. DFT calculation reveals the formation of covalent bond between TiO2 and the amine from NH2-MIL-125, narrowing the HOMO/LUMO gap of NH2-MIL-125 by raising its HOMO level. After CdS quantum dots (QDs) deposited on this MOF@TiO2 core-shell structure, this composite catalyst can act as an efficient visible-light-driven catalyst for NO removal. The integral coating of amorphous TiO2 onto MOF octahedrons constructs a mesoporous protection shell upon MOF frameworks, providing a superior accommodation for embedding CdS QDs. Integrating MOF with TiO2 also reduces undesirable electron-hole recombination by facilitating charge transfer to amorphous TiO2. Possible mechanism of phot

In [34]:
json.load(run[0]['completion'])

AttributeError: 'str' object has no attribute 'read'

In [17]:
run[0]['prompt']

'A mild one-step method for enhancing optical absorption of amine-functionalized metal-organic frameworks\nMetal-organic frameworks (MOFs) are of significant interest for photocatalysis using visible light. Here we discuss HOMO/LUMO gap modification of amine-functionalized MOFs through a mild one-step method. DFT calculation reveals the formation of covalent bond between TiO2 and the amine from NH2-MIL-125, narrowing the HOMO/LUMO gap of NH2-MIL-125 by raising its HOMO level. After CdS quantum dots (QDs) deposited on this MOF@TiO2 core-shell structure, this composite catalyst can act as an efficient visible-light-driven catalyst for NO removal. The integral coating of amorphous TiO2 onto MOF octahedrons constructs a mesoporous protection shell upon MOF frameworks, providing a superior accommodation for embedding CdS QDs. Integrating MOF with TiO2 also reduces undesirable electron-hole recombination by facilitating charge transfer to amorphous TiO2. Possible mechanism of photocatalytic 

In [18]:
run[0]['completion']

' [{"name_of_mof": "NH2-MIL-125", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}, {"name_of_mof": "CdS/NH2-MIL-125@TiO2", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}]\n\nEND\n\n'

In [19]:
run[0]['gpt3_completion']

'[{"name_of_mof": "NH2-MIL-125", "mof_formula": "", "mof_description": ["amine-functionalized metal-organic frameworks (MOFs)"], "guest_species": [""], "applications": ["photocatalysis", "NO removal", "catalyst", "photocatalytic oxidation of NO"]}, {"name_of_mof": "CdS/NH2-MIL-125@TiO2", "mof_formula": "", "mof_description": ["amine-functionalized metal-organic frameworks (MOFs)"], "guest_species": [""], "applications": ["photocatalysis", "NO removal", "catalyst", "photocatalytic oxidation of NO"]}]'

In [10]:
run[3]['completion']

' [{"name_of_mof": "", "mof_formula": "(H3O)x[Cu(MF6)(pyrazine)2]·(4 − x)H2O (M = V4+, x = 0; M = Ga3+, x = 1)", "mof_description": [""], "guest_species": [""], "applications": ["magnetic"]}, {"name_of_mof": "", "mof_formula": "[Cu(pyz)2]2+", "mof_description": [""], "guest_species": [""], "applications": ["magnetic"]}]\n\nEND\n\n'

In [36]:
import jellyfish
import copy
import tqdm


general_annotations_file = '../data/NERRE/mof_results/run_0.jsonl'
run = read_jsonl(general_annotations_file)

ENTS_FROZEN = ['name_of_mof', 'mof_formula', 'mof_description', 'guest_species', 'applications']
ent_scores_test = {e: [] for e in ENTS_FROZEN}
ent_scores_gold = {e: [] for e in ENTS_FROZEN}

exact_matches = 0
unparsable = 0
total = 0
jaro_winkler_similarities = []

LINK_DELIMITER = "|||"
ROOT = ("name_of_mof",)

ENTS_FROZEN_NOROOT = [e for e in ENTS_FROZEN if e not in ROOT]
ENTS_LINKS_FROZEN = [f"{root}{LINK_DELIMITER}{e}" for e in ENTS_FROZEN_NOROOT for root in ROOT]

subdict = {"test_correct_triplets": 0, "test_retrieved_triplets": 0, "gold_retrieved_triplets": 0}
links_scores = {el: copy.deepcopy(subdict) for el in ENTS_LINKS_FROZEN}

In [37]:
run[0]['completion']

' [{"name_of_mof": "NH2-MIL-125", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}, {"name_of_mof": "CdS/NH2-MIL-125@TiO2", "mof_formula": "", "mof_description": ["amine-functionalized MOFs"], "guest_species": [""], "applications": ["photocatalysis", "photocatalysts"]}]\n\nEND\n\n'

In [38]:
def ent_json_to_word_basis_sets(ent_json, return_empty=False):
    """
    Where ent_json is multiple entries in a list

    Return all entities and links in a set-based word basis
    """
    # Must account for these in a weird way because the entries are not ordered :(
    to_account = {e: set() for e in ENTS_FROZEN + ENTS_LINKS_FROZEN}

    if return_empty:
        return to_account

    for entry in ent_json:
        root_accounting = {root: set() for root in ROOT}
        for etype in ENTS_FROZEN:
            ent_strs = entry[etype]
            if isinstance(ent_strs, str):
                for w in ent_str_to_words(ent_strs):
                    to_account[etype].add(w)
                if etype in ROOT and ent_strs:
                    # Formulae/roots must be counted as single words
                    root_accounting[etype].add(ent_strs)
                    # root_accounting[etype] = root_accounting[etype].union(set(ent_str_to_words(ent_strs)))
            elif isinstance(ent_strs, list):
                for ent_str in ent_strs:
                    for w in ent_str_to_words(ent_str):
                        to_account[etype].add(w)
            else:
                raise ValueError(f"Ent strings was a weird type: {type(ent_strs)}, {ent_strs}")

        # Add links
        for root, accounting in root_accounting.items():
            if accounting:
                for e in ENTS_FROZEN_NOROOT:
                    ent_strs = entry[e]
                    words = []
                    if isinstance(ent_strs, str):
                        words = ent_str_to_words(ent_strs)
                    elif isinstance(ent_strs, list):
                        for ent_str in ent_strs:
                            words += ent_str_to_words(ent_str)
                    else:
                        raise ValueError(f"Ent strings was a weird type: {type(ent_strs)}, {ent_strs}")

                    if words:
                        for f in accounting:
                            for w in words:
                                # avoid self-links
                                if f != w:
                                    to_account[f"{root}{LINK_DELIMITER}{e}"].add(f"{f}{LINK_DELIMITER}{w}")
    return to_account

def check_equivalence_of_entries(gold_entry, test_entry):
    ## Entries are a list of dictionaries
    ## We first order each list, then each dictionary, then compare strings


    ### order list by formula key
    gold_entry = sorted(gold_entry, key=lambda x: x.get('formula', ''))
    test_entry = sorted(test_entry, key=lambda x: x.get('formula', ''))

    ### order each dictionary by keys
    gold_entry = [dict(sorted(d.items())) for d in gold_entry]
    test_entry = [dict(sorted(d.items())) for d in test_entry]

    ### compare strings
    return str(gold_entry) == str(test_entry)



def ent_str_to_words(ent):
    stripped =  [e.strip() for e in ent.split(" ")]
    return [e for e in stripped if e]


In [39]:
for sample in tqdm.tqdm(run):
    gold_string = sample["completion"].replace("\n\nEND\n\n", "").strip()
    test_string = sample["gpt3_completion"].replace("\n\nEND\n\n", "").replace('\\', '').strip()
    # print(f"Gold string is {gold_string}")
    # print(f"Test string is {test_string}")
    gold_json = json.loads(gold_string)
    prompt = sample["prompt"].replace("\n\n###\n\n", "").strip()
    n_prompt_words = len([w for w in prompt.split(" ") if w])
    total += 1
    test_json = {}
    try:
        test_json = sample["gpt3_completion"]
        if isinstance(test_json, str):
            try:
                test_json = json.loads(test_json)
            except json.decoder.JSONDecodeError as jse:
                test_json = []
        for d in test_json:
            for key in ENTS_FROZEN:
                if key not in d:
                    if key in ["formula", "name", "acronym", "mof_formula", "name_of_mof"]:
                        d[key] = ""
                    else:
                        d[key] = [""]

            # remove extra keys as they are "parsable" but invalid
            extra_keys = []
            for key in d:
                if key not in ENTS_FROZEN:
                    extra_keys.append(key)
            for key in extra_keys:
                d.pop(key)
    except json.decoder.JSONDecodeError as jse:
                unparsable += 1

    if check_equivalence_of_entries(gold_json, test_json):
                exact_matches += 1

    jws = jellyfish.jaro_winkler_similarity(gold_string, test_string, long_tolerance=True)
    jaro_winkler_similarities.append(jws)

    gold_accounting = ent_json_to_word_basis_sets(gold_json)

    if test_json:
        test_accounting = ent_json_to_word_basis_sets(test_json)
    else:
        test_accounting = ent_json_to_word_basis_sets({}, return_empty=True)
    for etype in ENTS_FROZEN:
        ent_accounting_copy = copy.deepcopy(test_accounting[etype])
        n_unlabelled_words = copy.deepcopy(n_prompt_words)
        for ew in gold_accounting[etype]:

            # Account for true positives
            if ew in test_accounting[etype]:
                ent_scores_test[etype].append(1)
                ent_scores_gold[etype].append(1)
                ent_accounting_copy.remove(ew)
                n_unlabelled_words -= 1
            # account for false negatives
            else:
                ent_scores_test[etype].append(0)
                ent_scores_gold[etype].append(1)
                n_unlabelled_words -= 1

        # Among the remaining test accounting words, only false positives
        # should remain in the set
        for ew in ent_accounting_copy:
            ent_scores_test[etype].append(1)
            ent_scores_gold[etype].append(0)
            n_unlabelled_words -= 1

        # the only labels remaining are true negatives
        ent_scores_test[etype] += [0] * n_unlabelled_words
        ent_scores_gold[etype] += [0] * n_unlabelled_words

    for elinktype in ENTS_LINKS_FROZEN:
        gold_triples = gold_accounting[elinktype]
        test_triples = test_accounting[elinktype]

        n_correct_triples = len([e for e in test_triples if e in gold_triples])
        links_scores[elinktype]["test_correct_triplets"] += n_correct_triples
        links_scores[elinktype]["test_retrieved_triplets"] += len(test_triples)
        links_scores[elinktype]["gold_retrieved_triplets"] += len(gold_triples)
    
    break

results = {"ents": {}, "links": {}}
from sklearn.metrics import f1_score, recall_score, precision_score
for etype in ENTS_FROZEN:
    gold_arr = ent_scores_gold[etype]
    test_arr = ent_scores_test[etype]

    subdict = {"recall": 0, "precision": 0, "f1": 0}
    subdict["recall"] = recall_score(gold_arr, test_arr)
    subdict["precision"] = precision_score(gold_arr, test_arr)
    subdict["f1"] = f1_score(gold_arr, test_arr)
    results["ents"][etype] = subdict

  0%|                                                                                              | 0/51 [00:00<?, ?it/s]


TypeError: 'dict' object is not callable

In [41]:
gold_json

[{'name_of_mof': 'NH2-MIL-125',
  'mof_formula': '',
  'mof_description': ['amine-functionalized MOFs'],
  'guest_species': [''],
  'applications': ['photocatalysis', 'photocatalysts']},
 {'name_of_mof': 'CdS/NH2-MIL-125@TiO2',
  'mof_formula': '',
  'mof_description': ['amine-functionalized MOFs'],
  'guest_species': [''],
  'applications': ['photocatalysis', 'photocatalysts']}]

In [21]:
links_scores

{'name_of_mof|||mof_formula': {'test_correct_triplets': 0,
  'test_retrieved_triplets': 0,
  'gold_retrieved_triplets': 0},
 'name_of_mof|||mof_description': {'test_correct_triplets': 0,
  'test_retrieved_triplets': 0,
  'gold_retrieved_triplets': 0},
 'name_of_mof|||guest_species': {'test_correct_triplets': 0,
  'test_retrieved_triplets': 0,
  'gold_retrieved_triplets': 0},
 'name_of_mof|||applications': {'test_correct_triplets': 0,
  'test_retrieved_triplets': 0,
  'gold_retrieved_triplets': 0}}

In [36]:
true_answer = run[3]['completion']
true_answer

' [{"acronym": "", "applications": [""], "name": "post-perovskite", "formula": "MgSiO3", "structure_or_phase": [""], "description": [""]}, {"acronym": "", "applications": [""], "name": "MgO", "formula": "", "structure_or_phase": ["CsCl"], "description": [""]}, {"acronym": "", "applications": [""], "name": "", "formula": "MgSi2O5", "structure_or_phase": ["P21/c"], "description": [""]}, {"acronym": "", "applications": [""], "name": "SiO2", "formula": "", "structure_or_phase": ["Fe2P"], "description": [""]}]\n\nEND\n\n'

In [43]:
NERRE_answer = run[2]['gpt3_completion']
NERRE_answer

' [{"acronym": "", "applications": ["photovoltaics", "optoelectronics"], "name": "", "formula": "CH3NH3PbI3", "structure_or_phase": ["perovskite"], "description": ["single-crystalline"]}]'

In [52]:
run[0]['completion']

' [{"acronym": "", "applications": ["perovskite-based solar cells", "mesoporous scaffold", "solar cells"], "name": "", "formula": "Al2O3", "structure_or_phase": [""], "description": ["nanostructured"]}]\n\nEND\n\n'

In [4]:
for sample in tqdm(run):
    gold_string = sample["completion"].replace("\n\nEND\n\n", "").strip()
    test_string = sample["gpt3_completion"].replace("\n\nEND\n\n", "").replace('\\', '').strip()
    break

  0%|                                                                                                                                                             | 0/62 [00:00<?, ?it/s]


In [5]:
gold_string

'[{"acronym": "", "applications": ["perovskite-based solar cells", "mesoporous scaffold", "solar cells"], "name": "", "formula": "Al2O3", "structure_or_phase": [""], "description": ["nanostructured"]}]'

In [32]:
tool_names = list(eunomia.EunomiaTools.all_tools_dict.keys())
vectorstore = "../tests/test_files/test_vector_store.pkl"
a =eunomia.EunomiaTools(
    tool_names=tool_names, vectorstore=vectorstore
    ).get_tools()
a.index

<function list.index(value, start=0, stop=9223372036854775807, /)>

In [8]:
data_inference[0]['prompt']

'Sensitivity enhancement in planar microwave active-resonator using metal organic framework for CO2 detection\nThis work presents a proof of concepts of CO2 gas monitoring by a microwave sensor operating in microwave regime and study its sensitivity enhancement using an adsorbent bed of commercially available Zeolite 13X, and two synthesized MOF-199 (MOF-199-M1 and MOF-199-M2). The sensing principle of the sensor is based on the change in the dielectric properties of the bed, in response to CO2 concentration change in the dry CO2/He mixture. The sensor’s response is quantified in terms of change in the resonant frequency with respect to baseline for each material. The sensor shows maximum sensitivity of 24kHz/% CO2 for MOF-199-M2 and minimum of 10kHz/% CO2 for Zeolite 13 X. The sensitivity of the microwave senor to CO2 concentration, with MOF-199-M2 as the bed is higher than MOF-199-M1, which can be related to the presence of different amount of unsaturated Cu2+ ions in their framework

In [9]:
data_inference[2]['completion']

' [{"name_of_mof": "MOF-1004", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications": [""]}, {"name_of_mof": "MOF-1005", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications": [""]}, {"name_of_mof": "MOF-177", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications": [""]}, {"name_of_mof": "UiO-67", "mof_formula": "", "mof_description": [""], "guest_species": [""], "applications": [""]}]\n\nEND\n\n'

In [11]:
tools = EunomiaTools(tool_names = ['get_cif_from_COD', 'rename_cif']).get_tools()


agent = Eunomia(tools=tools, model='gpt-4', get_cost=True)
agent.run(prompt="Download cif files for this doi: 10.1021/cg301691d from COD and rename them.")

NameError: name 'EunomiaTools' is not defined

In [12]:
paper_id = '69'
docs_processor = eunomia.LoadDoc('../Data/chosen_papers/', paper_id=paper_id)
sliced_pages = docs_processor.process(['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
                                             chunk_size=1000, chunk_overlap=20, chunking_type='fixed-size')

NameError: name 'eunomia' is not defined

In [13]:
from langchain.schema import Document
from typing import List, Optional
from copy import deepcopy


class LoadDoc:
    """
    A class to handle the loading and processing of different Docs.

    Attributes:
    paper_id : str
        ID of the paper to be loaded.
    paper_path : str
        Path of the paper to be loaded.
    loader : PyPDFLoader
        Instance of PyPDFLoader to load the document.
    pages : list
        Pages of the loaded document.
    """

    def __init__(self, filename: str = None, text_input: str = None, **kwargs):
        """
        Parameters:
        filename : str
            Path to file.
        text_input : str
            Direct text input.
        **kwargs are passed to the CSVLoader class.
        """
        if filename is None and text_input is None:
            raise ValueError("Either 'filename' or 'text_input' must be provided.")
        elif filename and text_input:
            raise ValueError(
                "Only one of 'filename' or 'text_input' should be provided as input."
            )

        if filename:
            extension = filename.split(".")[-1].lower()
            self._check_extension(extension)
            self.doc_path = filename
            if self.type == "pdf":
                from langchain.document_loaders import PyPDFLoader

                self.loader = PyPDFLoader(filename)
            if self.type == "md":
                from langchain.document_loaders import UnstructuredMarkdownLoader

                self.loader = UnstructuredMarkdownLoader(filename)
            if self.type == "csv":
                from langchain.document_loaders.csv_loader import CSVLoader

                self.loader = CSVLoader(filename, **kwargs)
            if self.type == "txt":
                from langchain.document_loaders import TextLoader

                self.loader = TextLoader(filename)
            self.pages = self.loader.load_and_split()
            print(type(self.pages))
        else:
            self.pages = [Document(page_content=text_input, metadata={"source": "local"})]

    def _check_extension(self, extension: str):
        """
        Checks the provided file extension against the supported extensions.

        Parameters:
        extension : str
            File extension to check.

        Raises:
        Exception:
            If the file extension is not supported.
        NotImplementedError:
            If the file extension is 'xml', which is not yet implemented.
        """
        supported_extensions = {"pdf", "txt", "md", "csv"}
        if extension in supported_extensions:
            self.type = extension
        elif extension == "xml":
            raise NotImplementedError
        else:
            raise Exception(f"Eunomia supports {supported_extensions} doc files.")

    @staticmethod
    def cut_text(text: str, keywords: List[str]) -> str:
        """
        Cuts the given text up to the first found keyword.

        Parameters:
        text : str
            The text to be cut.
        keywords : List[str]
            List of keywords to find in the text.

        Returns:
        str
            The cut text.
        """
        lower_text = text.lower()
        indices = [
            lower_text.find(keyword)
            for keyword in keywords
            if lower_text.find(keyword) != -1
        ]
        min_index = min(indices)
        return text[:min_index].strip()  # remove any trailing spaces

    @staticmethod
    def find_in_document(document: Document, search_strings: List[str]) -> bool:
        """
        Searches for the given strings in the document content.

        Parameters:
        document : Document
            Document in which to search.
        search_strings : List[str]
            List of strings to search for.

        Returns:
        bool
            True if any of the search strings are found, False otherwise.
        """
        return any(
            search_string.lower() in document.page_content.lower()
            for search_string in search_strings
        )

    def filter_documents(
        self, documents: List[Document], search_strings: List[str]
    ) -> List[Document]:
        """
        Filters documents based on the presence of search strings.
        use this if you wish to remove "Acknowledgments or "References"
        in a long research article.

        Parameters:
        documents : List[Document]
            List of documents to filter.
        search_strings : List[str]
            List of strings to search for.

        Returns:
        List[Document]
            List of filtered documents.
        """
        filtered_documents = deepcopy(documents)  # Create a deep copy of documents
        for i, doc in enumerate(filtered_documents):
            if self.find_in_document(doc, search_strings):
                filtered_documents[i].page_content = self.cut_text(
                    filtered_documents[i].page_content,
                    keywords=search_strings,
                )
                filtered_documents = filtered_documents[: i + 1]
                break
        return filtered_documents

    def process(
        self,
        filter_words: List[str] = [],
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = 0,
        chunking_type="fixed-size",
    ) -> List[Document]:
        """
        Process the document pages based on the search strings. Additionally, this function
        will split the document into chunks if a chunk size is provided.

        Parameters:
        filter_words : List[str]
            List of words to search and filter.
        chunk_size : Optional[int]
            The size of the chunks in which the document will be split. If this parameter
            is not provided, the document will not be split into chunks.
        chunk_overlap : Optional[int]
            The size of the overlap between chunks. If chunk_size is not provided, this
            parameter will not be used.

        Returns:
        List[Document]
            List of processed document chunks.
        """
        sliced_pages = self.filter_documents(self.pages, filter_words)
        text_splitter = None
        if chunk_size is not None:
            if chunking_type == "fixed-size":
                from langchain.text_splitter import (
                    CharacterTextSplitter,
                )

                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            if chunking_type == "latex":
                from langchain.text_splitter import LatexTextSplitter

                text_splitter = LatexTextSplitter(
                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            if chunking_type == "NLTK":
                from langchain.text_splitter import NLTKTextSplitter

                text_splitter = NLTKTextSplitter(
                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            if chunking_type == "spacy":
                from langchain.text_splitter import SpacyTextSplitter

                text_splitter = SpacyTextSplitter(
                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )

            sliced_pages = text_splitter.split_documents(sliced_pages)

        return sliced_pages


In [26]:
import eunomia
extensions = ["csv", "txt", "md", "pdf"]
test_results = [['name:', 'description:'], ['thermospheric', 'models.'], ['Mehrad', 'Ansari.'], ['bioRxiv', 'preprint']]

for e in extensions:
    test_file_name = f"../tests/test_files/test_docs.{e}"
    docs_processor = eunomia.LoadDoc(file_name=test_file_name)
    doc_pages = docs_processor.process(
        filter_words=[
            "references ",
            "acknowledgement",
            "acknowledgments",
            "references\n",
        ],
        chunk_size=1000,
        chunk_overlap=20,
        chunking_type="fixed-size",
    )
    print(doc_pages[-1].page_content.split()[-2:])

<class 'list'>
['name:', 'description:']
<class 'list'>
['thermospheric', 'models.']
<class 'list'>
['Mehrad', 'Ansari.']
<class 'list'>
['bioRxiv', 'preprint']


['bioRxiv', 'preprint']


In [29]:
text_input = "Electrodynamics in superconductors explained by Proca equations\nA fully consistent model to study electrodynamics for superconductors in the stationary and non-stationary regimes has been developed based on Proca equations and a massive photon. In particular, this approach has been applied to study the electric field penetration depth in superconductors. The model shows a deviation from the charge contribution to an internal electric field compared to previous approaches"

with open("../tests/test_files/test_docs.txt", "rb") as f:
    text_input = f.read()

docs_processor = LoadDoc(text_input=text_input)
sliced_pages = docs_processor.process(filter_words=['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
                                             chunk_size=1000, chunk_overlap=20, chunking_type='fixed-size')
sliced_pages[-1].page_content.split()[-2:]

['thermospheric', 'models.']

In [131]:
import eunomia
paper_file = '../Data/chosen_papers/1.pdf'
# filename = 'OutdoorClothingCatalog_1000.csv'
docs_processor = LoadDoc(filename=filename, encoding="utf8")
sliced_pages = docs_processor.process(filter_words=['references ', 'acknowledgement', 'acknowledgments', 'references\n'],
                                             chunk_size=1000, chunk_overlap=20, chunking_type='fixed-size')

Created a chunk of size 1005, which is longer than the specified 1000
Created a chunk of size 1004, which is longer than the specified 1000


<class 'list'>


In [126]:
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredMarkdownLoader

loader = TextLoader("sample_text.txt")
doc =  Document(page_content="text", metadata={"source": "local"})
a = loader.load_and_split()

In [119]:
type(doc)

langchain.schema.document.Document

In [135]:
type(a[0])

langchain.schema.document.Document

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

Embedding_model = 'text-embedding-ada-002' # text-embedding-ada-002 text-davinci-003
faiss_index_path = f"../Data/chosen_papers/faiss/{Embedding_model}/faiss_index_{Embedding_model}_{paper_id}.pkl"
faiss_index = FAISS.from_documents(sliced_pages, OpenAIEmbeddings(model=Embedding_model))

In [4]:

with open('../data/NERRE/general_materials_information_annotations.jsonl', 'rb') as f:
    for line in f:
        general_annotations = json.loads(line)

general_annotations

{'prompt': 'Effect of ceria structural properties on the catalytic activity of Au–CeO2 catalysts for WGS reaction\nTwo gold based catalysts supported on ceria prepared by different methods (urea gelation coprecipitation, UGC, and coprecipitation, CP) have been synthesized and tested in the WGS reaction, showing quite different catalytic behaviors. Interestingly, the two catalysts have the same gold loading (3 wt% Au was inserted by deposition–precipitation) and the FTIR spectroscopy of the adsorbed CO revealed the same amount of gold exposed sites. With the aim to elucidate how the preparation method affects the properties of the support, a morphological, structural and textural characterization has been performed by HRTEM, XRD, BET and Raman analyses, as well as FTIR spectroscopy to probe both the Au and the support exposed sites. It was found that the UGC method gave rise to an enhancement of the defectivity of ceria and to an increase of the reactivity under reductive treatment. Fur

In [5]:
general_annotations.keys()

dict_keys(['prompt', 'completion'])