In [1]:
!pip install pubchempy

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13818 sha256=dcc50748f8721bdeaf4794c9437699052c32dae879d412c64f2b6fed34877be8
  Stored in directory: /root/.cache/pip/wheels/8b/e3/6c/3385b2db08b0985a87f5b117f98d0cb61a3ae3ca3bcbbd8307
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [2]:
!pip install langchain langchain_community langchain_openai openai rdkit-pypi gradio

Collecting langchain_community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.14-py3-none-any.whl.metadata (2.3 kB)
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting gradio
  Downloading gradio-5.26.0-py3-none-any.whl.metadata (16 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting ai

In [71]:
from langchain_openai.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from typing import List

import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Draw
import time

import os
os.environ["OPENAI_API_KEY"] = ""

In [4]:
class ExtractedInfo(BaseModel):
    search_key: str = Field(
        ...,
        description="The key to search for the compound, it should be compound name, formula, or SMILES."
    )
    search_key_type: str = Field(
        ...,
        description="The type of key to search for the compound, is it a name, formula or SMILES ."
    )
    requested_attributes: List[str] = Field(
        ...,
        description=("The attribute being requested it should be one of the following: atom_stereo_count, atoms, bond_stereo_count, bonds, cactvs_fingerprint, canonical_smiles, charge, cid, complexity, conformer_id_3d, conformer_rmsd_3d, coordinate_type, covalent_unit_count, defined_atom_stereo_count, defined_bond_stereo_count, effective_rotor_count_3d, elements, exact_mass, feature_selfoverlap_3d, fingerprint, h_bond_acceptor_count, h_bond_donor_count, heavy_atom_count, inchi, inchikey, isomeric_smiles, isotope_atom_count, iupac_name, mmff94_energy_3d, mmff94_partial_charges_3d, molecular_formula, molecular_weight, monoisotopic_mass, multipoles_3d, pharmacophore_features_3d, record, rotatable_bond_count, shape_fingerprint_3d, shape_selfoverlap_3d, tpsa, undefined_atom_stereo_count, undefined_bond_stereo_count, volume_3d, xlogp, canonical_smiles, or synonyms.")
    )

function_schema = convert_pydantic_to_openai_function(ExtractedInfo)

def extract_information_with_langchain(user_input: str):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    prompt = f"""You are a chemistry expert chatbot.
    Extract the search key and requested attributes from the following user input:
    '{user_input}'
    Format the result as JSON based on the schema provided."""

    response = llm.predict(prompt, functions=[function_schema])


    try:
        extracted_info = ExtractedInfo.model_validate_json(response)
        return extracted_info
    except Exception as e:
        return f"Error validating extracted information: {str(e)}"




  function_schema = convert_pydantic_to_openai_function(ExtractedInfo)


In [5]:
def retrive_info(search_key, key_type, attributes):

  retries = 3
  for i in range(retries):
    try:
       compounds = pcp.get_compounds(search_key, key_type)
    except TimeoutError:
      if i < retries - 1:
        time.sleep(2)
        print(f"Retrying PubChem request (attempt {i + 2} of {retries})...")
      else:
        raise

  smiles = compounds[0].canonical_smiles
  compound = compounds[0].to_dict()
  results = []
  for attribute in attributes:
    if attribute == "inchi":
      results.append(f"inchi: {compound['inchi'].split('/')[1]}")
    elif attribute == "elements":
      results.append(f"elements:{' , '.join(compounds[0]['elements'])}")
    elif attribute == "iupac_name":
      names = [comp.iupac_name for comp in compounds if comp.iupac_name != None]
      names = list(set(names))
      results.append(f"names:{' , '.join(names)}")
    elif attribute == "synonyms":
      synonyms = compounds[0].synonyms
      synonyms = list(set(synonyms))
      results.append(f"synonyms:{' , '.join(synonyms)}")
    else:
      results.append(f"{attribute}:{compound[attribute]}")

  final_results = "\n ".join(results)
  molecule = Chem.MolFromSmiles(smiles)

  Draw.MolToFile(molecule, 'output_draw.png')
  return final_results

In [6]:
user_q = "What is the molecular formula of Aspirin?"

In [7]:
extracted_info = extract_information_with_langchain(user_q)

  response = llm.predict(prompt, functions=[function_schema])


In [8]:
extracted_info

ExtractedInfo(search_key='Aspirin', search_key_type='name', requested_attributes=['molecular_formula'])

In [9]:
extracted_info.search_key

'Aspirin'

In [10]:
retrived_info = retrive_info(extracted_info.search_key, extracted_info.search_key_type, extracted_info.requested_attributes)

In [11]:
retrived_info

'molecular_formula:C17H19N3O3S'

In [12]:
def generate_answer_with_langchain(user_input: str, retrived_info: str):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    prompt = f"""You are a chemistry expert chatbot, and you will get retrived information to answer the user questions
    generate an expert answer for the following user question, if the user asked for a drawing of generating a structure just tell them that 'it is shown below':
    '{user_input}'
    using tne following extracted information:
    '{retrived_info}'

   after you finish the answer, add to it that the user can check the structure drawing, because it will be shown
    """

    response = llm.predict(prompt)

    return response

In [13]:
generate_answer_with_langchain(user_q, retrived_info)

'The molecular formula of Aspirin is C9H8O4. Please note that the information you provided seems to be for a different compound. You can check the structure drawing of Aspirin as it is shown below.'

In [72]:
from langchain_openai.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from typing import List

import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Draw

import os
os.environ["OPENAI_API_KEY"] = "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"

In [73]:
class ExtractedInfo(BaseModel):
    search_key: str = Field(
        ...,
        description="The key to search for the compound, it should be compound name, formula, or SMILES."
    )
    search_key_type: str = Field(
        ...,
        description="The type of key to search for the compound, is it a name, formula or SMILES ."
    )
    requested_attributes: List[str] = Field(
        ...,
        description=("The attribute being requested it should be one of the following: atom_stereo_count, atoms, bond_stereo_count, bonds, cactvs_fingerprint, canonical_smiles, charge, cid, complexity, conformer_id_3d, conformer_rmsd_3d, coordinate_type, covalent_unit_count, defined_atom_stereo_count, defined_bond_stereo_count, effective_rotor_count_3d, elements, exact_mass, feature_selfoverlap_3d, fingerprint, h_bond_acceptor_count, h_bond_donor_count, heavy_atom_count, inchi, inchikey, isomeric_smiles, isotope_atom_count, iupac_name, mmff94_energy_3d, mmff94_partial_charges_3d, molecular_formula, molecular_weight, monoisotopic_mass, multipoles_3d, pharmacophore_features_3d, record, rotatable_bond_count, shape_fingerprint_3d, shape_selfoverlap_3d, tpsa, undefined_atom_stereo_count, undefined_bond_stereo_count, volume_3d, xlogp, canonical_smiles, or synonyms.")
    )

function_schema = convert_pydantic_to_openai_function(ExtractedInfo)

def extract_information_with_langchain(user_input: str):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    prompt = f"""You are a chemistry expert chatbot.
    Extract the search key and requested attributes from the following user input:
    '{user_input}'
    Format the result as JSON based on the schema provided."""

    response = llm.predict(prompt, functions=[function_schema])


    try:
        extracted_info = ExtractedInfo.model_validate_json(response)
        return extracted_info
    except Exception as e:
        return f"Error validating extracted information: {str(e)}"




def retrive_info(search_key, key_type, attributes):

  retries = 3
  for i in range(retries):
    try:
       compounds = pcp.get_compounds(search_key, key_type)
    except TimeoutError:
      if i < retries - 1:
        time.sleep(2)
        print(f"Retrying PubChem request (attempt {i + 2} of {retries})...")
      else:
        raise

  smiles = compounds[0].canonical_smiles
  compound = compounds[0].to_dict()
  results = []
  for attribute in attributes:
    if attribute == "inchi":
      results.append(f"inchi: {compound['inchi'].split('/')[1]}")
    elif attribute == "elements":
      results.append(f"elements:{' , '.join(compounds[0]['elements'])}")
    elif attribute == "iupac_name":
      names = [comp.iupac_name for comp in compounds if comp.iupac_name != None]
      names = list(set(names))
      results.append(f"names:{' , '.join(names)}")
    elif attribute == "synonyms":
      synonyms = compounds[0].synonyms
      synonyms = list(set(synonyms))
      results.append(f"synonyms:{' , '.join(synonyms)}")
    else:
      results.append(f"{attribute}:{compound[attribute]}")

  final_results = "\n ".join(results)
  molecule = Chem.MolFromSmiles(smiles)

  Draw.MolToFile(molecule, 'output_draw.png')
  return final_results

def generate_answer_with_langchain(user_input: str, retrived_info: str):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    prompt = f"""You are a chemistry expert chatbot, and you will get retrived information to answer the user questions
    generate an expert answer for the following user question, if the user asked for a drawing of generating a structure just tell them that 'it is shown below':
    '{user_input}'
    using tne following extracted information:
    '{retrived_info}'

   after you finish the answer, add to it that the user can check the structure drawing, because it will be shown
    """

    response = llm.predict(prompt)

    return response


In [64]:
def ChemLLM_pipline(user_q):
  extracted_info = extract_information_with_langchain(user_q)

  retrived_info = retrive_info(extracted_info.search_key, extracted_info.search_key_type, extracted_info.requested_attributes)

  response = generate_answer_with_langchain(user_q, retrived_info)

  return response


In [65]:
user_q = "What is the molecular formula of Aspirin?"
user_q

'What is the molecular formula of Aspirin?'

In [66]:
len(pcp.get_substances("Aspirin", namespace='name'))

149

In [67]:
ChemLLM_pipline(user_q)

'The molecular formula of Aspirin is C9H8O4. This means it is composed of 9 carbon atoms (C), 8 hydrogen atoms (H), and 4 oxygen atoms (O). For a better understanding, you can check the structure drawing of Aspirin, as it is shown below.'

In [68]:
quest = "what is the SMILES formula for Propanolol? "
ChemLLM_pipline(quest)

"The Simplified Molecular Input Line Entry System (SMILES) formula for Propranolol is 'CC(C)NCC(COC1=CC=CC2=CC=CC=C21)O'. This formula represents the structure of the molecule. You can check the structure drawing for a more visual representation, as it will be shown below."

In [70]:
quest = "what is citral SMILES formula?"
ChemLLM_pipline(quest)

'The SMILES formula for Citral is CC(=CCCC(=CC=O)C)C. This formula represents the structure of the Citral molecule. You can also check the structure drawing, as it is shown below.'

In [None]:
test_questions = {
    "What is the molecular formula of Aspirin?": {
        "search_key": "name",
        "search_value": "Aspirin",
        "information_requested": ["molecular_formula"]
    },
    "Provide the canonical SMILES for Ibuprofen.": {
        "search_key": "name",
        "search_value": "Ibuprofen",
        "information_requested": ["canonical_smiles"]
    },
    "What is the molecular weight of caffeine?": {
        "search_key": "name",
        "search_value": "caffeine",
        "information_requested": ["molecular_weight"]
    },
    "Give me the InChIKey for Paracetamol.": {
        "search_key": "name",
        "search_value": "Paracetamol",
        "information_requested": ["inchikey"]
    },
    "What is the exact mass of water?": {
        "search_key": "name",
        "search_value": "water",
        "information_requested": ["exact_mass"]
    },
    "How many rotatable bonds does Ethanol have?": {
        "search_key": "name",
        "search_value": "Ethanol",
        "information_requested": ["rotatable_bond_count"]
    },
    "List all synonyms for Methane.": {
        "search_key": "name",
        "search_value": "Methane",
        "information_requested": ["synonyms"]
    },
    "Provide the number of heavy atoms in Benzene.": {
        "search_key": "name",
        "search_value": "Benzene",
        "information_requested": ["heavy_atom_count"]
    },
    "Generate the chemical structure drawing for Acetone.": {
        "search_key": "name",
        "search_value": "Acetone",
        "information_requested": ["canonical_smiles"],
        "action": "draw_structure"
    },
    "What is the atom stereo count for Threonine?": {
        "search_key": "name",
        "search_value": "Threonine",
        "information_requested": ["atom_stereo_count"]
    },
    "Provide the molecular formula of Glucose.": {
        "search_key": "name",
        "search_value": "Glucose",
        "information_requested": ["molecular_formula"]
    },
    "Give me the canonical SMILES for Sulfuric acid.": {
        "search_key": "name",
        "search_value": "Sulfuric acid",
        "information_requested": ["canonical_smiles"]
    },
    "What is the molecular formula of NaCl?": {
        "search_key": "formula",
        "search_value": "NaCl",
        "information_requested": ["molecular_formula"]
    },
    "Provide the InChIKey for CH4.": {
        "search_key": "formula",
        "search_value": "CH4",
        "information_requested": ["inchikey"]
    },
    "What is the molecular weight of H2SO4?": {
        "search_key": "formula",
        "search_value": "H2SO4",
        "information_requested": ["molecular_weight"]
    }
}


In [None]:
for q in list(test_questions.keys()):
  response = ChemLLM_pipline(q)
  print(f"question:{q}")
  print(f"Answer:{response}")
  print("-" * 20)

question:What is the molecular formula of Aspirin?
Answer:The molecular formula of Aspirin is C9H8O4. This formula indicates that each molecule of Aspirin consists of 9 carbon atoms, 8 hydrogen atoms, and 4 oxygen atoms. For a more detailed understanding, you can check the structure drawing of Aspirin, as it will be shown below.
--------------------
question:Provide the canonical SMILES for Ibuprofen.
Answer:The canonical SMILES for Ibuprofen is 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'. This notation represents the structure of the Ibuprofen molecule. You can also check the structure drawing, as it will be shown below.
--------------------
question:What is the molecular weight of caffeine?
Answer:The molecular weight of caffeine is 194.19 g/mol. This value is calculated based on the atomic weights of the individual atoms in the caffeine molecule. If you're interested in the structure of caffeine, it is shown below.
--------------------
question:Give me the InChIKey for Paracetamol.
Answer:The I

In [74]:
import gradio as gr
from PIL import Image

def ChemLLM_pipeline_with_image(user_q):
    answer = ChemLLM_pipline(user_q)
    try:
        image = Image.open("output_draw.png")
    except Exception as e:
        image = None
    return answer, image

interface = gr.Interface(
    fn=ChemLLM_pipeline_with_image,
    inputs=gr.Textbox(lines=2, placeholder="Type your chemistry question here...", label="Your Question"),
    outputs=[
        gr.Textbox(label="Expert Answer"),
        gr.Image(label="Generated Chemical Structure")
    ],
    title="Chemistry Expert Chatbot",
    description="Ask your chemistry questions and view the generated molecular structure alongside the answer!"
)



In [75]:
interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://41c52438e7433adb40.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


