# Task 4 - Retrieve: Has PCN-46 ever been used for Carbon Capture?
In this notebook, we demonstrate the ability of knowledge graph-augmented LLMs to conduct an advanced literature search. By combining the pre-existing information in our knowledge graph, we have identified a promising MOF candidate for Carbon Capture (PCN-46) based on its similarity to HKUST-1, experimentally measured CO2 uptake, and text-mined expert recommendation. Now, we will use a cross-document agent to search for additional information outside of PCN-46's original synthesis paper

In [None]:
from MOF_ChemUnity.QueryAgent import QueryGenerationAgent

# Connect to graph
agent = QueryGenerationAgent()
data = agent.run_full_query("All experimental information for MOFs with name PCN-46 and HKUST-1")

✅ Connected to Neo4j.


Unnamed: 0,m.refcode,property_name,property_value,r.units,r.condition,r.reference,r.summary
0,LUYHAP,Density,0.618537,,,,
1,LUYHAP,Solvent-Accessible Volume,73.8,%,Calculated using the PLATON routine,10.1039/c002767g,"Calculated using the PLATON routine, PCN-46 ha..."
2,LUYHAP,Pore Volume,6.8,Å,Based on the Horvath–Kawazoe model,10.1039/c002767g,It has a uniform pore size around 6.8 A˚ based...
3,LUYHAP,Surface Area,2500,m2 g-1,Based on the N2 sorption isotherm,10.1039/c002767g,"Based on the N2 sorption isotherm, PCN-46 has ..."
4,LUYHAP,CO2 Uptake,21.0,mmol g-1,Saturation excess CO2 uptake at 30 bar,10.1039/c002767g,"As can be seen in Fig. 5, the saturation exces..."
5,LUYHAP,H2 uptake,71.6,mg g-1,Total uptake at 77 K,10.1039/c002767g,"Table 1 Ligand length, porosity and H2 uptake ..."
6,TISPAO,Density,1.01627,,,,
7,TISPAO,Surface Area,97,m²/g,After sulfur encapsulation,10.1021/cg401304x,"As shown in Figure S2, Supporting Information,..."
8,TISPAO,Color,Purple,,After synthesis,10.1021/cg401304x,"In fact, the HKUST-1/S mixture is still purple."
9,WULJUJ,Density,0.880841,g/cm³,Not specified,10.1021/ja904782h,"""HKUST-1 (2095) 201 177 23.4 0.21 19.3"""


## Cross Document Extraction for PCN-46

In [1]:
import requests
import openai
import numpy as np
import pandas as pd
import faiss
import time  
import os


In [4]:
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# Retrieve the Scopus API key
SCOPUS_API_KEY = os.getenv('SCOPUS_API_KEY')


In [8]:
import requests
import pandas as pd
import time

def search_scopus(mof_name, count=50, batch_size=100, wait_time=1, abstract_wait=1):
    """
    Searches Scopus for articles related to the given MOF name and synthesis,
    retrieves metadata (Title, DOI), and fetches full abstracts.

    Parameters:
        mof_name (str): The name of the MOF to search for.
        count (int): The total number of papers to retrieve (default=50).
        batch_size (int): Max papers per request (Scopus limit = 200, using 100 for better control).
        wait_time (float): Delay (seconds) between Scopus search requests to avoid rate limits.
        abstract_wait (float): Delay (seconds) between abstract retrieval requests.

    Returns:
        pd.DataFrame: A DataFrame containing retrieved papers (Title, Abstract, DOI).
    """
    search_url = "https://api.elsevier.com/content/search/scopus"
    abstract_url_base = "https://api.elsevier.com/content/abstract/doi/"
    headers = {"X-ELS-APIKey": SCOPUS_API_KEY, "Accept": "application/json"}

    papers = []
    retrieved = 0  # Track total retrieved papers

    while retrieved < count:
        remaining = count - retrieved
        current_batch_size = min(batch_size, remaining)  # Ensure we don’t exceed count
        
        params = {
            "query": f'TITLE-ABS-KEY("{mof_name}" AND Carbon Capture) AND DOCTYPE(ar)',
            "count": current_batch_size,
            "start": retrieved
        }

        time.sleep(wait_time)  # Prevent hitting rate limits
        response = requests.get(search_url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            break  # Stop if error

        data = response.json()

        if "search-results" in data and "entry" in data["search-results"]:
            for entry in data["search-results"]["entry"]:
                title = entry.get("dc:title", "")
                doi = entry.get("prism:doi", "N/A")
                abstract = entry.get("dc:description", "") or "No abstract available"

                # Retrieve full abstract if missing
                if abstract == "No abstract available" and doi != "N/A":
                    time.sleep(abstract_wait)  # Prevent rate limits
                    abstract_url = f"{abstract_url_base}{doi}"
                    abstract_response = requests.get(abstract_url, headers=headers)
                    
                    if abstract_response.status_code == 200:
                        abstract_data = abstract_response.json()
                        abstract = abstract_data.get("abstracts-retrieval-response", {}).get("coredata", {}).get("dc:description", "No abstract available")

                papers.append({"title": title, "abstract": abstract, "doi": doi})
                retrieved += 1

                if retrieved >= count:
                    break  # Stop when we reach `count` papers

            if len(data["search-results"]["entry"]) < current_batch_size:
                break  # Stop if no more papers left in Scopus

        else:
            print("No results found or incorrect response format")
            break

    # Convert to DataFrame
    papers_df = pd.DataFrame(papers, columns=["title", "abstract", "doi"])

    if len(papers_df) < count:
        print(f"Warning: Could not find {count} papers. Returning {len(papers_df)} instead.")

    return papers_df


In [9]:
# Function to get OpenAI embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    if not text.strip():  # Handle empty abstracts
        return np.zeros(1536)  # Default embedding size for OpenAI models
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [10]:
# Function to Perform vector search
def vector_search(query_text, index, abstracts, top_k=5):
    query_embedding = np.array(get_embedding(query_text)).reshape(1, -1)
    _, top_indices = index.search(query_embedding, top_k)
    return [(abstracts[i], i) for i in top_indices[0]]

In [11]:
# Search Scopus
MOF_NAME = "PCN-46"  # Change to the desired MOF
papers_df = search_scopus(MOF_NAME, count=500)
papers_df



Unnamed: 0,title,abstract,doi
0,,No abstract available,
