In [None]:
%load_ext autoreload
%autoreload 2
%cd ..

In [None]:
from mol_gen_docking.data.prompt_generator import PromptGenerator
from mol_gen_docking.data.pydantic_dataset import read_jsonl, write_jsonl
from tqdm.auto import tqdm
from pathlib import Path

## Creating a system prompt

In [None]:
# System prompt text: states the assistant's role and requires the final answer
# to be wrapped in <answer> </answer> tags.
SYSTEM_PROMPT = """You are a molecular modeling assistant. You answer questions asked by a user in English.
You can first draft your thinking process or inner monologue before arriving at an answer.
You must provide your final answer enclosed in <answer> </answer> tags, e.g. <answer> answer here </answer>.
"""

# Backward-compatible alias: the constant was first introduced under this misspelled name.
SYSTEM_PROMT = SYSTEM_PROMPT

SYSTEM_PROMT

# Molecular Property - Prompt Generation

## Property short names and Jinja template

In [None]:
from jinja2 import Template
# Root directory that is searched for the per-dataset `{split}.jsonl` files.
MOLPROP_DATA_PATH = Path("data/polaris")

# Maps a dataset directory (relative to MOLPROP_DATA_PATH, "/"-separated) to the
# human-readable property description that is substituted into the prompt template.
# The values end up verbatim in the generated prompts, so spelling matters here.
TO_SHORTNAME = {
    ### ASAP Discovery antiviral potency
    "asap-discovery/antiviral-potency-2025-unblinded/pIC50 (SARS-CoV-2 Mpro)": "SARS-CoV-2 main protease inhibition potency (pIC50)",
    "asap-discovery/antiviral-potency-2025-unblinded/pIC50 (MERS-CoV Mpro)": "MERS-CoV main protease inhibition potency (pIC50)",
    ### Polaris / Novartis ADME
    "polaris/az-logd-74-v1/LOGD_74": "octan-1-ol/water (pH7.4) distribution coefficient",
    "polaris/az-ppb-clearance-v1/log_unbound_PPB": "plasma protein binding (log-percent unbound to plasma protein)",
    "novartis/novartis-cyp3a4-v1/log_kobs": "log-inactivation rate constant of CYP3A4",
    ### PKIS2 kinase subset
    "polaris/drewry2017-pkis2-subset-v2/CLS_EGFR": "inhibition of EGFR kinase",
    "polaris/drewry2017-pkis2-subset-v2/CLS_KIT": "inhibition of KIT kinase",
    "polaris/drewry2017-pkis2-subset-v2/CLS_RET": "inhibition of RET kinase",
    "polaris/drewry2017-pkis2-subset-v2/CLS_LOK": "inhibition of LOK kinase",
    "polaris/drewry2017-pkis2-subset-v2/CLS_SLK": "inhibition of SLK kinase",
    ### Biogen ADME-FANG
    "biogen/adme-fang-solu-reg-v1": "log-solubility at pH6.8 (log \u03bcg/mL)",
    "biogen/adme-fang-rppb-reg-v1": "rat plasma protein binding (log-percent unbound to plasma protein)",
    "biogen/adme-fang-hppb-reg-v1": "human plasma protein binding (log-percent unbound to plasma protein)",
    "biogen/adme-fang-perm-reg-v1": "log-MDR1 MDCK efflux ratio",
    "biogen/adme-fang-hclint-reg-v1": "human log-liver microsomal stability (log mL/min/kg)",
    "biogen/adme-fang-rclint-reg-v1": "rat log-liver microsomal stability (log mL/min/kg)",
    ### TDCommons
    "tdcommons/pgp-broccatelli": "inhibition of the P-glycoprotein",
    "tdcommons/vdss-lombardo": "log-volume of distribution at steady state (log L/kg)",
    "tdcommons/bbb-martins": "ability to penetrate the blood-brain barrier",
    "tdcommons/caco2-wang": "Caco-2 cell effective log-permeability (log cm/s)",
    "tdcommons/dili": "ability to induce liver injuries",
    "tdcommons/herg": "blocker of hERG",
    "tdcommons/ames": "mutagenicity",
    "tdcommons/half-life-obach": "log-half-life (log hr) in human body",
    "tdcommons/lipophilicity-astrazeneca": "lipophilicity (log-ratio)",
    "tdcommons/clearance-hepatocyte-az": "log-drug clearance, hepatocyte assay (log \u03bcL/min/10^6 cells)",
    "tdcommons/clearance-microsome-az": "log-drug clearance, microsome assay (log mL/min/g)",
    "tdcommons/ld50-zhu": "LD50 toxicity (log kg/mol)",
    "tdcommons/cyp2c9-substrate-carbonmangels": "being a substrate of CYP2C9",
    "tdcommons/cyp2d6-substrate-carbonmangels": "being a substrate of CYP2D6",
    "tdcommons/cyp3a4-substrate-carbonmangels": "being a substrate of CYP3A4",
    "tdcommons/solubility-aqsoldb": "log-solubility (log mol/L)",
}

In [None]:
# Prompt template for property prediction. It is rendered with the sample
# metadata: `short_prop` (property description), `objectives` (its first entry
# selects the hint shown inside \boxed{...}) and `smiles` (its first entry is
# the molecule to analyze).
# The "{%-" / "-%}" markers strip the whitespace next to the tags so the hint
# sits directly between the braces of \boxed{...}.
jinja_template = (
    "Given a molecular SMILES, you are asked to predict its following property: {{ short_prop }}.\n"
    "Provide your predicted value in a box (i.e \\boxed{ "
    '{%- if objectives[0] == "classification" %}yes or no'
    '{%- elif objectives[0] == "regression" %}property value'
    "{% endif -%}"
    "}) in your final answer.\n"
    "\n"
    "Molecule to analyze: {{ smiles[0] }}.\n"
)

# Generator used by the property-prediction loop to turn metadata into prompt text.
pg = PromptGenerator(jinja_template, "data/molgendata")

In [None]:
from copy import copy

# Build the property-prediction prompt dataset and write one file per split.
for split in ["train", "eval"]:
    # Resolve the per-dataset files once (reused for counting and generation).
    # Sorting makes the order of the written dataset reproducible across runs.
    split_paths = sorted(MOLPROP_DATA_PATH.rglob(f"*/{split}.jsonl"))

    # Total number of lines over all files, used to size the progress bar
    # (presumably one sample per JSONL line).
    n_lines = 0
    for path in split_paths:
        with open(path) as f:
            n_lines += sum(1 for _ in f)

    prompt_dataset = []
    # The context manager closes the bar even when a file fails midway.
    with tqdm(total=n_lines, desc=f"Generating {split} prompts") as pbar:
        for path in split_paths:
            data = read_jsonl(path)
            # TO_SHORTNAME is keyed by the dataset directory relative to
            # MOLPROP_DATA_PATH with "/" separators; as_posix() keeps the lookup
            # independent of the platform's path separator.
            dataset_key = path.parent.relative_to(MOLPROP_DATA_PATH).as_posix()
            short_prop = TO_SHORTNAME[dataset_key]
            pbar.set_description(f"Generating prompts for {short_prop}")

            last_item = None
            for item in data:
                conversation = item.conversations[0]
                # Shallow copy so that adding "short_prop" does not modify the stored meta.
                metadata = copy(conversation.meta)
                assert conversation.messages[0].role == "system"
                assert conversation.messages[1].role == "user"
                # Drop the system message; the user message becomes messages[0].
                del conversation.messages[0]

                metadata["short_prop"] = short_prop
                new_content = pg(metadata)

                conversation.messages[0].content = new_content

                # No-op for the train split; for other splits every "train"
                # occurrence in the identifiers is replaced by the split name.
                item.identifier = item.identifier.replace("train", split)
                conversation.identifier = conversation.identifier.replace("train", split)

                prompt_dataset.append(item)
                last_item = item
                pbar.update(1)

            # Show one example prompt per dataset. Skipped when the file held no
            # sample, instead of failing or re-printing the previous dataset's prompt.
            if last_item is not None:
                print(last_item.conversations[0].messages[0].content)
                print("====" * 30)
                print("====" * 30)

    write_path = Path(f"data/property_prediction/{split}_prompts_boxed.jsonl")
    write_path.parent.mkdir(parents=True, exist_ok=True)
    write_jsonl(write_path, prompt_dataset)

# Molecular Reaction - Prompt Generation

In [None]:
from mol_gen_docking.dataset.scripts.reaction_task.utils import full_jinja

# Regenerate the prompts of every reaction training file and merge them into a
# single dataset written to `out_dir`.
pg = PromptGenerator(full_jinja, "data/molgendata")
path_dir = Path("data/synthesis")
out_dir = Path("data/synthesis_tasks")
out_dir.mkdir(parents=True, exist_ok=True)
full_dataset = []
# First generated prompt seen for each objective, accumulated over ALL files.
# Initialised once, before the loop, so that it is always defined afterwards and
# is not limited to the objectives of the last file read.
obj_text = {}

# Index of the current successfully read file; appended to the identifiers.
i = 0

# Sorted so that the file -> index assignment (and thus the identifiers) is
# reproducible across runs.
for path in sorted(path_dir.rglob("train_prompts_*.jsonl")):
    try:
        data = read_jsonl(path)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others.
        print(f"Error reading {path}: {e}")
        continue
    for item in tqdm(data, desc=path.name):
        conversation = item.conversations[0]
        metadata = conversation.meta
        new_content = pg(metadata)

        item.identifier = item.identifier + f"_{i}"
        conversation.identifier = conversation.identifier + f"_{i}"

        conversation.messages[0].content = new_content
        # Re-assigns the same object (kept from the original code; presumably
        # harmless -- TODO confirm whether the model validates on assignment).
        conversation.meta = metadata
        # Keep only the first prompt generated for each objective.
        obj_text.setdefault(metadata["objectives"][0], new_content)
    i += 1
    full_dataset += data

write_path = out_dir / "train_prompts_unclean_json.jsonl"
write_jsonl(write_path, full_dataset)

In [None]:
from mol_gen_docking.data.reactions.utils import PROMPT_TASKS

# Preview the recorded example prompt for the second and third prompt tasks.
for objective in PROMPT_TASKS[1:3]:
    # Fall back to a placeholder when no prompt was recorded for this objective.
    example_prompt = obj_text.get(objective, "No prompt generated for this objective")
    print("Objective:", objective)
    print(example_prompt)
    print("=====" * 20)

In [None]:
# Regenerate the prompts of every reaction evaluation file; each input file is
# written to `out_dir` under its own file name.
pg = PromptGenerator(full_jinja, "data/molgendata")
in_dir = Path("data/synthesis")
out_dir = Path("data/synthesis_tasks/eval_prompts")
out_dir.mkdir(parents=True, exist_ok=True)

# Output names already used: the search is recursive but the output directory is
# flat, so two inputs sharing a file name would overwrite each other.
written_names = set()

# Sorted so that files are processed (and possibly overwritten) in a stable order.
for path in sorted(in_dir.rglob("*eval*.jsonl")):
    data = read_jsonl(path)
    # First generated prompt per objective for this file (not read again in this cell).
    obj_text = {}
    for item in tqdm(data, desc=path.name):
        conversation = item.conversations[0]
        metadata = conversation.meta
        try:
            new_content = pg(metadata)
        except Exception:
            # Show the offending metadata, then re-raise with the original traceback.
            print(metadata)
            raise

        conversation.messages[0].content = new_content
        conversation.meta = metadata
        obj_text.setdefault(metadata["objectives"][0], new_content)

    write_path = out_dir / path.name
    if path.name in written_names:
        print(f"Warning: {write_path} was already written from another directory; overwriting it with {path}")
    written_names.add(path.name)
    write_jsonl(write_path, data)