In [None]:
!pip install langchain langchain_community langchain_openai openai pubchempy rdkit-pypi gradio

In [None]:
import os
from getpass import getpass

# Keep the key out of the notebook source: reuse a key that is already present in
# the environment, and only prompt (without echoing) when none is set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain_openai.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from typing import List

# Structured result the LLM is asked to produce for a chemistry question:
# which compound to look up, how that compound is identified, and which
# attributes the user wants back.
# NOTE(review): documentation is kept in `#` comments rather than a class docstring
# because pydantic presumably copies a docstring into the generated schema's
# description, which would change what is sent to the model — confirm before adding one.
class ExtractedInfo(BaseModel):
    # Identifier of the compound exactly as it should be searched.
    search_key: str = Field(
        ...,
        description="The key to search for the compound, it should be compound name, formula, or SMILES."
    )
    # Kind of identifier held in `search_key`; a free-form string, not validated
    # against a fixed set of values.
    search_key_type: str = Field(
        ...,
        description="The type of key to search for the compound, is it a name, formula or SMILES ."
    )
    # Attribute names the user asked for. The allowed names are only listed in the
    # description text below; they are not enforced by the type.
    # NOTE(review): "canonical_smiles" appears twice in the list below.
    requested_attributes: List[str] = Field(
        ...,
        description=("The attribute being requested it should be one of the following: atom_stereo_count, atoms, bond_stereo_count, bonds, cactvs_fingerprint, canonical_smiles, charge, cid, complexity, conformer_id_3d, conformer_rmsd_3d, coordinate_type, covalent_unit_count, defined_atom_stereo_count, defined_bond_stereo_count, effective_rotor_count_3d, elements, exact_mass, feature_selfoverlap_3d, fingerprint, h_bond_acceptor_count, h_bond_donor_count, heavy_atom_count, inchi, inchikey, isomeric_smiles, isotope_atom_count, iupac_name, mmff94_energy_3d, mmff94_partial_charges_3d, molecular_formula, molecular_weight, monoisotopic_mass, multipoles_3d, pharmacophore_features_3d, record, rotatable_bond_count, shape_fingerprint_3d, shape_selfoverlap_3d, tpsa, undefined_atom_stereo_count, undefined_bond_stereo_count, volume_3d, xlogp, canonical_smiles, or compound.synonyms.")
    )

# OpenAI function schema derived from ExtractedInfo; it is passed to the chat
# model in extract_information_with_langchain.
function_schema = convert_pydantic_to_openai_function(ExtractedInfo)

def extract_information_with_langchain(user_input: str):
    """Extract the compound search key and requested attributes from a question.

    Sends ``user_input`` to GPT-4 together with the ``ExtractedInfo`` function
    schema and validates the model's JSON reply against ``ExtractedInfo``.

    Args:
        user_input: Free-text chemistry question from the user.

    Returns:
        An ``ExtractedInfo`` instance on success. If the reply cannot be parsed
        or validated, a string starting with
        ``"Error validating extracted information: "`` is returned instead, so
        callers should check the result with ``isinstance``.
    """
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    prompt = f"""You are a chemistry expert chatbot.
    Extract the search key and requested attributes from the following user input:
    '{user_input}'
    Format the result as JSON based on the schema provided."""

    message = llm.invoke(prompt, functions=[function_schema])

    # When the model answers through the offered function, the JSON lives in the
    # function-call arguments and the text content is empty; otherwise the JSON
    # is expected in the plain text content.
    function_call = message.additional_kwargs.get("function_call")
    if function_call:
        payload = function_call.get("arguments", "")
    else:
        payload = message.content

    try:
        return ExtractedInfo.model_validate_json(payload)
    except Exception as e:
        return f"Error validating extracted information: {str(e)}"


In [None]:
# Demo: a question that identifies the compound by its common name.
user_query = "What is the molecular weight and molecular formula of caffeine?"
result = extract_information_with_langchain(user_query)

if not isinstance(result, ExtractedInfo):
    # The extractor hands back an error message string on failure.
    print(result)
else:
    print(
        "Extracted Information:",
        f"Search Key: {result.search_key}",
        f"Search Key type: {result.search_key_type}",
        f"Requested Attributes: {', '.join(result.requested_attributes)}",
        sep="\n",
    )

Extracted Information:
Search Key: caffeine
Search Key type: name
Requested Attributes: molecular_weight, molecular_formula


In [None]:
# Demo: a question that identifies the compound by a SMILES string.
user_query = "What is this compound name CC(=O)OC1=CC=CC=C1C(=O)O and what is its molecular formula ?"
result = extract_information_with_langchain(user_query)

if not isinstance(result, ExtractedInfo):
    # The extractor hands back an error message string on failure.
    print(result)
else:
    print(
        "Extracted Information:",
        f"Search Key: {result.search_key}",
        f"Search Key type: {result.search_key_type}",
        f"Requested Attributes: {', '.join(result.requested_attributes)}",
        sep="\n",
    )

Extracted Information:
Search Key: CC(=O)OC1=CC=CC=C1C(=O)O
Search Key type: SMILES
Requested Attributes: iupac_name, molecular_formula


In [None]:
# Demo: a question that identifies the compound by its molecular formula.
user_query = "What is this compound name H2O and what is its SMILES formula ?"
result = extract_information_with_langchain(user_query)

if not isinstance(result, ExtractedInfo):
    # The extractor hands back an error message string on failure.
    print(result)
else:
    print(
        "Extracted Information:",
        f"Search Key: {result.search_key}",
        f"Search Key type: {result.search_key_type}",
        f"Requested Attributes: {', '.join(result.requested_attributes)}",
        sep="\n",
    )

Extracted Information:
Search Key: H2O
Search Key type: formula
Requested Attributes: canonical_smiles, iupac_name


In [None]:
# Expected extraction results, keyed by the question text.
# In each expectation, "search_key" holds the identifier *type* (name/formula),
# "search_value" holds the identifier itself, and "information_requested" lists
# the attribute names the extractor should return.
test_questions = {
    question: {
        "search_key": key_type,
        "search_value": key_value,
        "information_requested": attributes,
    }
    for question, key_type, key_value, attributes in [
        ("What is the molecular formula of Aspirin?", "name", "Aspirin", ["molecular_formula"]),
        ("Provide the canonical SMILES for Ibuprofen.", "name", "Ibuprofen", ["canonical_smiles"]),
        ("What is the molecular weight of caffeine?", "name", "caffeine", ["molecular_weight"]),
        ("Give me the InChIKey for Paracetamol.", "name", "Paracetamol", ["inchikey"]),
        ("What is the exact mass of water?", "name", "water", ["exact_mass"]),
        ("How many rotatable bonds does Ethanol have?", "name", "Ethanol", ["rotatable_bond_count"]),
        ("List all synonyms for Methane.", "name", "Methane", ["compound.synonyms"]),
        ("Provide the number of heavy atoms in Benzene.", "name", "Benzene", ["heavy_atom_count"]),
        ("Generate the chemical structure drawing for Acetone.", "name", "Acetone", ["canonical_smiles"]),
        ("What is the atom stereo count for Threonine?", "name", "Threonine", ["atom_stereo_count"]),
        ("Provide the molecular formula of Glucose.", "name", "Glucose", ["molecular_formula"]),
        ("Give me the canonical SMILES for Sulfuric acid.", "name", "Sulfuric acid", ["canonical_smiles"]),
        ("What is the molecular formula of NaCl?", "formula", "NaCl", ["molecular_formula"]),
        ("Provide the InChIKey for CH4.", "formula", "CH4", ["inchikey"]),
        ("What is the molecular weight of H2SO4?", "formula", "H2SO4", ["molecular_weight"]),
    ]
}
# The drawing request is the only case that carries an extra "action" entry.
test_questions["Generate the chemical structure drawing for Acetone."]["action"] = "draw_structure"


In [None]:
# Run every fixture question through the extractor and collect the questions
# whose search key (key_error) or requested attributes (info_error) differ
# from the expected values in test_questions.
key_error = []
info_error = []
for q, info in test_questions.items():

    print(f"Question: {q}")
    result = extract_information_with_langchain(q)

    if not isinstance(result, ExtractedInfo):
        # The extractor returns an error message string when validation fails;
        # it has no search_key/requested_attributes, so count the question as
        # wrong on both checks instead of crashing with AttributeError.
        print(result)
        key_error.append(q)
        info_error.append(q)
        continue

    if result.search_key != info["search_value"]:
        key_error.append(q)
    # List comparison is order-sensitive: attributes must match in order too.
    if result.requested_attributes != info["information_requested"]:
        info_error.append(q)


Question: What is the molecular formula of Aspirin?
Question: Provide the canonical SMILES for Ibuprofen.
Question: What is the molecular weight of caffeine?
Question: Give me the InChIKey for Paracetamol.
Question: What is the exact mass of water?
Question: How many rotatable bonds does Ethanol have?
Question: List all synonyms for Methane.
Question: Provide the number of heavy atoms in Benzene.
Question: Generate the chemical structure drawing for Acetone.
Question: What is the atom stereo count for Threonine?
Question: Provide the molecular formula of Glucose.
Question: Give me the canonical SMILES for Sulfuric acid.
Question: What is the molecular formula of NaCl?
Question: Provide the InChIKey for CH4.
Question: What is the molecular weight of H2SO4?


In [None]:
# Number of questions whose extracted search key differed from the expected value.
len(key_error)

0

In [None]:
# Number of questions whose extracted attribute list differed from the expected list.
len(info_error)

0