In [None]:
# Move the working directory one level up so that the relative paths used
# later in this notebook (e.g. "data/...") resolve.
# NOTE(review): presumably this lands on the repository root — confirm.
# NOTE(review): not idempotent — re-running this cell moves up one more level.
%cd ..
# Load the autoreload extension and reload all modules before each cell
# execution, so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Register tqdm's progress-bar variants (`progress_apply`, ...) on pandas objects.
tqdm.pandas()

# Prompts

In [None]:
from mol_gen_docking.data.pydantic_dataset import read_jsonl
from pathlib import Path

def get_df(data_d):
    """Flatten prompt metadata into one row per property entry.

    Args:
        data_d: Records accepted by ``pd.DataFrame``. The ``properties``,
            ``objectives``, ``target`` and ``smiles`` fields are exploded
            together, so within a record they must be list-likes of the
            same length.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index, one row per exploded
        entry, and two extra columns derived from ``properties`` by
        splitting on ``"/"``: ``origin`` (first component) and ``task``
        (second component).

    Raises:
        IndexError: If a ``properties`` value contains no ``"/"``.
    """
    list_columns = ["properties", "objectives", "target", "smiles"]
    flat = pd.DataFrame(data_d).explode(list_columns).reset_index(drop=True)

    # Split each property path once and reuse the pieces for both columns.
    path_parts = flat["properties"].apply(lambda prop: prop.split("/"))
    flat["origin"] = path_parts.apply(lambda parts: parts[0])
    flat["task"] = path_parts.apply(lambda parts: parts[1])
    return flat

# Evaluation prompt file, given relative to the current working directory.
file_path = "data/property_prediction/eval_prompts_boxed.jsonl"

# Read every sample, then flatten the metadata of each sample's first
# conversation into one row per property entry.
dataset = read_jsonl(Path(file_path))
df = get_df([sample.conversations[0].meta for sample in dataset])

# Leading component of the property path (text before the first "/").
df["dataset"] = [prop.split("/")[0] for prop in df["properties"]]
df

In [None]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

# Parse every distinct SMILES once. `unique_smiles`, `mols` and
# `scaffolds_list` are index-aligned with `df.smiles.unique()`.
unique_smiles = df.smiles.unique()
mols = [Chem.MolFromSmiles(smi) for smi in tqdm(unique_smiles)]

# RDKit returns None (instead of raising) for SMILES it cannot parse. Fail
# here with the offending strings rather than with an opaque argument error
# inside GetScaffoldForMol.
invalid_smiles = [smi for smi, mol in zip(unique_smiles, mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        f"{len(invalid_smiles)} SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_smiles[:5]}"
    )

# Murcko scaffold of each parsed molecule.
scaffolds_list = [MurckoScaffold.GetScaffoldForMol(mol) for mol in tqdm(mols)]


In [None]:

# Map each distinct SMILES to its scaffold SMILES. Scaffolds with at most
# 6 atoms are replaced by the placeholder "n/a".
scaffolds = {
    smi: "n/a" if scaff.GetNumAtoms() <= 6 else Chem.MolToSmiles(scaff)
    for smi, scaff in zip(df.smiles.unique(), tqdm(scaffolds_list))
}
df["scaffold"] = df.smiles.map(scaffolds)

In [None]:
# Number of rows per task, attached to every row so each panel can be
# ordered by task size.
size = df.groupby("task").size().sort_values(ascending=False)
df["task size"] = df.task.map(size.to_dict())

# One fixed colour per origin, shared by both panels. The palette is built
# once and sized to the number of origins (seaborn cycles the named palette
# when more colours are requested than it defines), so more than 10 origins
# no longer raises an IndexError.
origins = df.origin.unique()
palette = dict(zip(origins, sns.color_palette("colorblind", n_colors=len(origins))))

# (value of `objectives`, panel title) for the left and right panels.
panels = [("regression", "Regression"), ("classification", "Classification")]

# Panel widths are proportional to the number of distinct tasks they show.
n_tasks_total = df.task.nunique()
width_ratios = [
    df[df.objectives == objective].task.nunique() / n_tasks_total
    for objective, _ in panels
]

fig, axes = plt.subplots(
    1, 2, figsize=(12, 3), sharey=True, gridspec_kw={"width_ratios": width_ratios}
)
fig.subplots_adjust(wspace=0.01)

for panel_idx, (ax, (objective, title)) in enumerate(zip(axes, panels)):
    sns.histplot(
        df[df.objectives == objective].sort_values("task size"),
        x="task",
        hue="origin",
        ax=ax,
        palette=palette,
        legend=(panel_idx == 0),  # a single legend, on the first panel only
    )
    ax.set_title(title)
    # Rotate via tick_params: re-setting tick labels without fixing the tick
    # positions first is discouraged by Matplotlib.
    ax.tick_params(axis="x", labelrotation=90)

fig.savefig("mol_prop_task_size.pdf", bbox_inches="tight")

In [None]:
from rdkit.Chem import Draw
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
# Number of most frequent scaffolds shown in the figure.
N_TOP_SCAFFOLDS = 30

fig, ax = plt.subplots(figsize=(14, 3))

# Row count per scaffold, most frequent first. The length filter drops the
# 3-character "n/a" placeholder (and any other scaffold string of <= 3 chars).
count = df[df.scaffold.apply(len) > 3].groupby("scaffold").size().sort_values(ascending=False)
scaffs_to_keep = count.index.tolist()[:N_TOP_SCAFFOLDS]

df["n_scaff"] = df.scaffold.map(count.to_dict())

# Order rows by the rank of their scaffold in `scaffs_to_keep`. Bars are laid
# out in order of first appearance, so this guarantees that bar i belongs to
# scaffs_to_keep[i] even when several scaffolds have the same count (sorting
# on the raw count leaves the order of tied scaffolds unspecified, which
# would put the wrong structure image under a bar).
scaff_rank = {scaff: rank for rank, scaff in enumerate(scaffs_to_keep)}
df_top = df[df.scaffold.isin(scaffs_to_keep)].sort_values(
    "scaffold", key=lambda col: col.map(scaff_rank), kind="stable"
)
sns.histplot(df_top, x="scaffold", hue="origin", multiple="stack", log_scale=False, ax=ax)
ax.set_xlabel("")
ax.set_xlim(-0.5, len(scaffs_to_keep) - 0.5)
ax.set_xticks([])  # remove default labels; structure images are drawn instead

# Render each scaffold and place its image below the matching bar.
scaff_to_img = {}
for i, scaff in enumerate(scaffs_to_keep):
    mol = Chem.MolFromSmiles(scaff)
    if mol is None:  # unparsable scaffold SMILES: leave the slot without an image
        continue
    scaff_to_img[scaff] = Draw.MolToImage(mol, size=(80, 80))
    imagebox = OffsetImage(scaff_to_img[scaff], zoom=0.2)
    # Anchored at (bar index, 0) in data coordinates; box_alignment shifts the
    # image below the axis.
    ab = AnnotationBbox(imagebox, (i, 0), frameon=True, box_alignment=(0.5, 1.4))
    ax.add_artist(ab)

fig.savefig("mol_prop_scaff.pdf", bbox_inches="tight")