# MOF ChemUnity Property Extraction

This notebook demonstrates how property extraction is used in MOF ChemUnity. Before running it, you need the names of the MOFs whose properties you want to extract; these are produced by the Matching workflow.

In [61]:
from src.MOF_ChemUnity.Agents.ExtractionAgent import ExtractionAgent
from src.MOF_ChemUnity.utils.DocProcessor import DocProcessor
from src.MOF_ChemUnity.Extraction_Prompts import VERIFICATION, RECHECK, EXTRACTION
from src.MOF_ChemUnity.Water_Stability_Prompts import WATER_STABILITY, RULES_WATER_STABILITY, VERF_RULES_WATER_STABILITY, WATER_STABILITY_RE

### Preparation of MOF Names from Matching CSV

We need to read the matching CSV file and extract the MOF names from it, together with the CSD reference code and DOI recorded for each MOF.

In [62]:
import pandas as pd
import glob
import os

In [63]:
# Load the OpenAI API key from the local ".apikeys" file into the environment.
# The contents are stripped because a trailing newline or stray whitespace in
# the file would otherwise be stored as part of the key.
with open(".apikeys", 'r') as f:
    os.environ["OPENAI_API_KEY"] = f.read().strip()

from langchain_openai import ChatOpenAI

# Primary chat model; this is the one handed to the extraction agent.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.1,
)

# Second chat model on "gpt-4o-mini" with temperature 0.
parser_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

In [64]:
# Read a slice of the Batch 7 matching results. The first 102+101 lines of the
# file are skipped (presumably rows already handled in earlier parts of this
# batch -- TODO confirm), so no header row is read and the column names are
# supplied explicitly instead.
matching_csv_path = "/mnt/c/Users/Amro/Desktop/DOI LISTS/Processing Batches - results/Batch 7/matching.csv"
matching_columns = ["MOF Name", "CSD Ref Code", "Justification", "DOI"]

mof_names_df = pd.read_csv(
    matching_csv_path,
    skiprows=102+101,
    header=None,
    names=matching_columns,
)

# Preview the first rows as the cell output.
mof_names_df.head()

Unnamed: 0,MOF Name,CSD Ref Code,Justification,DOI
0,{[Cd2(tbip)(nic)2(H2O)4]·H2O}n<|>compound 3,PORZON,"The MOF {[Cd2(tbip)(nic)2(H2O)4]·H2O}n, referr...",10.1002/zaac.201400134
1,Cd(BIPA)(bpe)1.5<|>complex 1,LIDQIA,"The MOF Cd(BIPA)(bpe)1.5, referred to as compl...",10.1002/zaac.201100373
2,Cd(BIPA)(bpp)(H2O)<|>complex 2,not provided,"The MOF Cd(BIPA)(bpp)(H2O), referred to as com...",10.1002/zaac.201100373
3,[Mn2(L)(DMPU)3]n<|>compound 1,WUFSOI,"The MOF [Mn2(L)(DMPU)3]n, referred to as compo...",10.1002/zaac.201900299
4,"[Mn(L)0.5(4,4'-bipy)0.5(H2O)]n<|>compound 2",WUFSIC,"The MOF [Mn(L)0.5(4,4'-bipy)0.5(H2O)]n, referr...",10.1002/zaac.201900299


### Markdown files setup

In [65]:
from src.MOF_ChemUnity.Agents.BaseAgent import BaseAgent


# Root folder for Batch 7; the extraction loop appends a DOI-derived name to
# this path to locate each paper's vector store.
input_folder = "/mnt/c/Users/Amro/Desktop/DOI LISTS/Processing Batches - vs/Batch 7/vs"

# Immediate sub-directories of the root (the trailing "/" in the pattern
# restricts the matches to directories).
files = glob.glob(f"{input_folder}/*/")

### Running the Extraction Loop for General Property Extraction + CoV

In [66]:
# Column-oriented accumulator for every property extracted from every MOF:
# one independent, initially empty list per output column, in column order.
result = {
    column: []
    for column in (
        "MOF Name",
        "Ref Code",
        "Property",
        "Value",
        "Units",
        "Condition",
        "Summary",
        "Reference",
    )
}

In [67]:
# Column-oriented accumulator for the filtered subset of extracted properties.
# Each column starts as its own empty list; key order fixes the column order.
filtered_result = {
    "MOF Name": [],
    "Ref Code": [],
    "Property": [],
    "Value": [],
    "Units": [],
    "Condition": [],
    "Summary": [],
    "Reference": [],
}

In [68]:
# Column-oriented accumulator for the water-stability extraction results:
# a separate empty list for each output column, in column order.
ws_columns = (
    "MOF Name", "Ref Code", "Property", "Value",
    "Units", "Condition", "Summary", "Reference",
)
ws_result = {column: [] for column in ws_columns}

In [69]:
# Fill the placeholders of the generic prompt templates with the
# water-stability-specific rule text.
WS_READ = WATER_STABILITY.replace("{RULES}", RULES_WATER_STABILITY)
WS_CHECK = VERIFICATION.replace("{VERF_RULES}", VERF_RULES_WATER_STABILITY)

# The recheck prompt is built in two steps: first the rules go into the
# water-stability recheck text, then that text goes into the generic template.
ws_recheck_instructions = WATER_STABILITY_RE.replace("{RULES}", RULES_WATER_STABILITY)
WS_RECHECK = RECHECK.replace("{RECHECK_INSTRUCTIONS}", ws_recheck_instructions)

# Prompt bundle passed to the agent; each entry is a one-element list.
sp_dict = {
    "read_prompts": [WS_READ],
    "verification_prompts": [WS_CHECK],
    "recheck_prompts": [WS_RECHECK],
}

In [70]:
# Extraction agent built around the primary chat model `llm`; its
# `agent_response` method is called once per MOF in the extraction loop.
agent = ExtractionAgent(llm=llm)

In [None]:
from openai import RateLimitError


def _append_property_rows(table, props, mof, refcode, reference):
    """Append one row per extracted property to a column-oriented result dict.

    Args:
        table: dict mapping each output column name to a list; mutated in place.
        props: iterable of property objects exposing ``name``, ``units``,
            ``value``, ``condition`` and ``summary`` attributes.
        mof: MOF name recorded on every appended row.
        refcode: CSD reference code recorded on every appended row.
        reference: DOI recorded on every appended row.
    """
    for prop in props:
        table["MOF Name"].append(mof)
        table["Ref Code"].append(refcode)
        table["Reference"].append(reference)
        table["Property"].append(prop.name)
        table["Units"].append(prop.units)
        table["Value"].append(prop.value)
        table["Condition"].append(prop.condition)
        table["Summary"].append(prop.summary)


# Run general property extraction plus the water-stability extraction for every
# matched MOF. The whole loop is wrapped in one try block on purpose: on the
# first failure the run stops and whatever was collected so far is written out.
i = -1  # position of the row being processed; stays -1 if the loop never starts
try:
    for i in range(len(mof_names_df)):
        row = mof_names_df.iloc[i]

        mof = row["MOF Name"]
        refcode = row["CSD Ref Code"]
        reference = row["DOI"]

        # Empty CSV cells are read by pandas as float NaN. Without this guard
        # len(refcode) / reference.replace(...) raises on such a row and the
        # surrounding try block would abort the whole batch; skip the row instead.
        if not isinstance(refcode, str) or not isinstance(reference, str):
            continue
        # Skip rows without a usable ref code: anything longer than 8 characters,
        # which already covers the "not provided" placeholder (kept explicit
        # for readability).
        if len(refcode) > 8 or refcode.lower() == "not provided":
            continue

        # NOTE(review): the markdown file name replaces every "/" in the DOI,
        # while the vector-store path replaces only the first one. The two agree
        # for DOIs with a single "/"; confirm which form is intended otherwise.
        response = agent.agent_response(mof, reference.replace("/","_")+".md",
                                        EXTRACTION, ["Water Stability"], sp_dict, CoV=True, fuzz_threshold=85,
                                        vector_store=input_folder+f"/{reference.replace('/','_',1)}")

        # response[0] is the general extraction: (filtered properties, all properties).
        general_extraction = response[0]
        filtered_props = general_extraction[0]
        all_extracted = general_extraction[1]

        print(filtered_props)
        print(all_extracted)

        _append_property_rows(filtered_result, filtered_props, mof, refcode, reference)
        _append_property_rows(result, all_extracted.properties, mof, refcode, reference)

        # response[1] holds the specific extractions; only "Water Stability"
        # was requested, so its first entry is the water-stability result.
        specific_extraction = response[1]
        ws_props = specific_extraction[0]

        _append_property_rows(ws_result, ws_props, mof, refcode, reference)
except Exception as e:
    # Best-effort behaviour is kept (partial results are saved and the notebook
    # carries on), but the failure is now reported instead of being swallowed.
    print(f"Extraction stopped early at row {i}: {type(e).__name__}: {e}")

    all_props = pd.DataFrame(result)
    filtered = pd.DataFrame(filtered_result)
    ws = pd.DataFrame(ws_result)

    all_props.to_csv("/mnt/c/Users/Amro/Desktop/all-B7-P3.csv")
    filtered.to_csv("/mnt/c/Users/Amro/Desktop/fil-B7-P3.csv")
    ws.to_csv("/mnt/c/Users/Amro/Desktop/ws-B7-P3.csv")


# Final tables, built from whatever was accumulated (complete or partial).
all_props = pd.DataFrame(result)
filtered = pd.DataFrame(filtered_result)
ws = pd.DataFrame(ws_result)
    

Action: reading the document
finding all properties of name 1: ][Cd2(tbip)(nic)2(H2O)4]·H2O]n ---name 2: compound 3

Result: 
1.  -Property Name: Crystal System
    -Property Value: monoclinic
    -Value Units: N/A
    -Conditions: N/A
    -Summary: "Compound 3 crystallizes in the acentric space group Cc, with the Flack parameter[31] of –0.02(2) indicating enantiomeric purity within the crystal."

2.  -Property Name: Space Group
    -Property Value: Cc
    -Value Units: N/A
    -Conditions: N/A
    -Summary: "Compound 3 crystallizes in the acentric space group Cc, with the Flack parameter[31] of –0.02(2) indicating enantiomeric purity within the crystal."

3.  -Property Name: Flack Parameter
    -Property Value: -0.02(2)
    -Value Units: N/A
    -Conditions: N/A
    -Summary: "Compound 3 crystallizes in the acentric space group Cc, with the Flack parameter[31] of –0.02(2) indicating enantiomeric purity within the crystal."

4.  -Property Name: Formula Weight
    -Property Value: 779.3

In [None]:
# Write the three result tables to their CSV files, in the same order as
# before: all properties, filtered properties, water stability.
for frame, csv_path in (
    (all_props, "/mnt/c/Users/Amro/Desktop/all-B7-P3.csv"),
    (filtered, "/mnt/c/Users/Amro/Desktop/fil-B7-P3.csv"),
    (ws, "/mnt/c/Users/Amro/Desktop/ws-B7-P3.csv"),
):
    frame.to_csv(csv_path)