In [None]:
# Move the working directory one level up so that the relative paths used in this
# notebook (e.g. "data/synthesis/train.jsonl") resolve -- presumably to the
# repository root; TODO confirm.
# NOTE(review): `%cd ..` is not idempotent -- re-running this cell moves up one
# more directory each time.
%cd ..
# Enable autoreload so edits to imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import pandas as pd
from mol_gen_docking.data.pdb_uniprot.target_naming import fetch_uniprot_id_from_pdbid
from tqdm import tqdm
from multiprocessing import Pool
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Directory that receives every figure saved by this notebook. The path is relative,
# so it is resolved against the current working directory at the time of each call.
FIG_PATH = "../-Philippe-MolGenDocking/Figures/reaction_data"
# Create the directory up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(FIG_PATH, exist_ok=True)
# Register tqdm's pandas integration (the `progress_*` variants of apply/map).
tqdm.pandas()

# Prompts

In [None]:
from mol_gen_docking.data.pydantic_dataset import read_jsonl
from pathlib import Path

def load(path: str):
    """Read a JSONL dataset and collect the metadata of each entry's first conversation.

    Args:
        path: Location of the JSONL file; it is wrapped in ``Path`` before being
            handed to ``read_jsonl``.

    Returns:
        A list with one item per entry returned by ``read_jsonl``: the ``meta``
        attribute of that entry's first conversation.
    """
    entries = read_jsonl(Path(path))
    metas = []
    for entry in entries:
        first_conversation = entry.conversations[0]
        metas.append(first_conversation.meta)
    return metas

def get_n_reactions_steps(row: pd.Series):
    """Return the number of reaction steps associated with one row.

    Args:
        row: A row exposing "type of objective" plus either "full_reaction"
            or "idx_chosen", depending on the objective type.

    Returns:
        For "full synthesis" and "product prediction" rows, the number of
        newline-separated entries in ``row["full_reaction"]``. For every other
        objective type, ``row["idx_chosen"] + 1`` (presumably ``idx_chosen`` is a
        zero-based step index -- TODO confirm).
    """
    objective_type = row["type of objective"]
    if objective_type in ("full synthesis", "product prediction"):
        reaction_lines = row["full_reaction"].split("\n")
        return len(reaction_lines)
    return row["idx_chosen"] + 1

def get_df(data_d):
    """Turn the loaded metadata records into the analysis DataFrame.

    The "properties" and "objectives" columns are exploded together, so each
    (property, objective) pair gets its own row, and the index is renumbered.
    Four derived columns are then added:

    * "last molecule passes filter": last element of ``pass_filters``.
    * "prop. of molecules passing filter": mean of ``pass_filters``.
    * "type of objective": coarse label derived from ``objectives`` by
      membership (``in``) tests, checked in a fixed order; values matching none
      of the tests are kept unchanged.
    * "reaction_steps": result of ``get_n_reactions_steps`` for each row, stored
      as an ordered categorical.

    Args:
        data_d: Anything ``pd.DataFrame`` accepts that yields the columns used
            here (``properties``, ``objectives``, ``pass_filters`` and the
            columns read by ``get_n_reactions_steps``).

    Returns:
        The exploded DataFrame with the derived columns added.
    """

    def _objective_type(objective):
        # The order of the checks matters: the first matching test wins.
        if "full_path" in objective:
            return "full synthesis"
        if "reactant" in objective:
            return "reactant prediction"
        if "products" in objective or "product" in objective:
            return "product prediction"
        return objective

    frame = pd.DataFrame(data_d)
    frame = frame.explode(["properties", "objectives"]).reset_index(drop=True)

    filter_flags = frame["pass_filters"]
    frame["last molecule passes filter"] = filter_flags.apply(lambda flags: flags[-1])
    frame["prop. of molecules passing filter"] = filter_flags.apply(lambda flags: np.mean(flags))

    frame["type of objective"] = frame["objectives"].apply(_objective_type)

    steps_per_row = frame.apply(get_n_reactions_steps, axis=1)
    frame["reaction_steps"] = pd.Categorical(steps_per_row, ordered=True)
    return frame

# Load the metadata from train.jsonl and flatten it into the frame used by the
# cells below. The path is relative to the current working directory.
df = get_df(load("data/synthesis/train.jsonl"))
df  # last expression of the cell: rendered as the cell output

In [None]:
# Stacked histogram of the number of reaction steps, coloured by objective type.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(
    data=df,
    x="reaction_steps",
    hue="type of objective",
    multiple="stack",
    palette="deep",
    ax=ax,
)
ax.set(xlabel="Number of reaction steps", ylabel="Count")
fig.savefig(os.path.join(FIG_PATH, "reaction_steps_histogram.pdf"), bbox_inches="tight")

In [None]:
# Long-format frame with one row per reaction step, used by the per-step analyses
# below (SMARTS frequencies, evolution of the product Tanimoto similarity).
# "products", "or_smarts" and "pass_filters" hold one list per row of `df`; they are
# exploded together with a generated step index (0, 1, ...), which requires the
# lists of a given row to have the same length.
df_per_step = (
    df[["products", "or_smarts", "pass_filters"]]
    # .assign builds a new frame instead of writing a column onto a column-subset of
    # `df`, which avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
    .assign(n_step=lambda frame: frame["products"].apply(lambda x: list(range(len(x)))))
    .explode(["products", "or_smarts", "pass_filters", "n_step"])
)

In [None]:
# Rank every reaction SMARTS by how often it occurs: the most frequent one gets id 0.
map_smarts_to_id = {
    smarts: rank
    for rank, smarts in enumerate(
        df_per_step.groupby("or_smarts")
        .size()
        .sort_values(ascending=False)
        .index
        .tolist()
    )
}
# Store the rank as an ordered categorical so it can serve as a discrete plot axis.
df_per_step["smarts_id"] = pd.Categorical(
    df_per_step["or_smarts"].map(map_smarts_to_id),
    ordered=True,
)

In [None]:
def smarts_to_image(smarts):
    """Render a reaction SMARTS as an image.

    Args:
        smarts: Reaction SMARTS string, parsed with ``AllChem.ReactionFromSmarts``.

    Returns:
        The image produced by RDKit's ``Draw.ReactionToImage`` for the parsed
        reaction, drawn with 300x300 sub-images.
    """
    # Imported locally so the function works on a fresh kernel: the notebook's RDKit
    # import cell sits further down, after the first call to this function. `Draw` is
    # imported explicitly instead of being reached through `AllChem.Draw`, which only
    # resolves if the submodule happens to have been loaded already.
    from rdkit.Chem import AllChem, Draw

    rxn = AllChem.ReactionFromSmarts(smarts)
    return Draw.ReactionToImage(rxn, subImgSize=(300, 300))

In [None]:
# Stacked histogram of reaction SMARTS usage (x = frequency rank of the SMARTS,
# colour = reaction step), annotated with drawings of a few selected reactions.
fig, ax = plt.subplots(figsize=(10,3))

sns.histplot(df_per_step, x="smarts_id", hue="n_step", multiple="stack", palette="coolwarm_r", ax=ax)
ax.grid(False)
ax.set_xlabel("Reaction SMARTS (sorted by frequency)")
ax.set_ylabel("Count")
ax.set_xticks([])  # Hide x-axis ticks for clarity

# Draw the reaction of a few hand-picked SMARTS ids. Each id is paired with the
# (x, y) position of its inset image, which is passed to fig.add_axes below and used
# as the arrow start in figure-fraction coordinates.
# NOTE(review): assumes at least 81 distinct SMARTS exist, otherwise the lookup by
# position below raises IndexError -- confirm for other datasets.
for smarts_id, (image_xaxis, image_yaxis) in zip([0,22,80], [(0.17,0.6), (0.3,0.3), (0.65,0.2)]):
    # map_smarts_to_id was filled in frequency order, so the key at position
    # `smarts_id` is the SMARTS that was assigned that id.
    smarts = list(map_smarts_to_id.keys())[smarts_id]
    img = smarts_to_image(smarts)
    # Size of the inset axes that will hold the reaction image
    image_width = 0.2
    image_height = 0.2

    # Add the new axes for the image: [left, bottom, width, height]
    ax_image = fig.add_axes([image_xaxis, image_yaxis, image_width, image_height])
    # Scale factor for the arrow tip height: 0.4 of the row count for id 0 and the
    # full row count for the others (presumably so the arrow lands inside the
    # tallest bar -- TODO confirm).
    arr_mult = 1 if smarts_id!= 0 else 0.4
    # Display the image in the new axes
    ax_image.imshow(img)
    ax_image.axis('off') # Turn off axes for the image if not needed
    # Arrow from the inset position (xytext, figure fraction) to the bin of this
    # SMARTS (xy): x is the id, y is the scaled number of rows having that id.
    ax.annotate('', xy=(smarts_id, arr_mult*df_per_step[df_per_step.smarts_id == smarts_id].shape[0]), xytext=(image_xaxis, image_yaxis),
                arrowprops=dict(arrowstyle="->", color='black', lw=1),
                textcoords='figure fraction')
fig.savefig(os.path.join(FIG_PATH,"smarts_histogram_with_images.pdf"), bbox_inches='tight')

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def agg_tanimoto_sim(sub_df):
    """Nearest-neighbour Tanimoto similarity of every product in ``sub_df``.

    Each SMILES in ``sub_df.products`` is converted to a 1024-bit Morgan
    fingerprint (radius 2). For every product, the highest Tanimoto similarity to
    any *other* product of ``sub_df`` is reported; the self-similarity on the
    diagonal is left at 0 and therefore never selected.

    Args:
        sub_df: Object exposing ``products``, an iterable of SMILES strings.

    Returns:
        np.ndarray of shape ``(n,)`` where ``n`` is the number of products. With
        zero or one product there is nothing to compare against, and an array of
        ones of length ``n`` is returned. This keeps the return type consistent:
        a bare ``1.0`` float was returned in that case before, which broke
        callers that use array methods such as ``.tolist()``.

    Note:
        SMILES that RDKit cannot parse are not handled here; the fingerprint call
        is expected to fail on them.
    """
    fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=1024) for smi in sub_df.products]
    n = len(fps)
    if n <= 1:
        return np.ones(n)
    sims = np.zeros((n, n))
    for i in range(n - 1):
        # One bulk call per fingerprint fills row i of the upper triangle; the
        # matrix is symmetric, so the same values are mirrored into column i.
        row = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i + 1:])
        sims[i, i + 1:] = row
        sims[i + 1:, i] = row
    return sims.max(axis=1)


# Collect, for every reaction step, the similarity values computed by
# agg_tanimoto_sim on the products of that step, in long format
# (one row per value) so they can be plotted per step.
values = []
steps = []
for n_steps in df_per_step.n_step.unique():
    # np.atleast_1d guards against a scalar result (e.g. for a step holding a single
    # product), which has no .tolist(); array results pass through unchanged.
    sims = np.atleast_1d(agg_tanimoto_sim(df_per_step[df_per_step.n_step == n_steps])).tolist()
    # extend() appends in place instead of rebuilding the lists on every iteration.
    values.extend(sims)
    steps.extend([n_steps] * len(sims))

df_simi = pd.DataFrame({
    "tanimoto_similarity": values,
    "n_step": steps
})


In [None]:
# Point plot of the per-product similarity values, aggregated per reaction step.
fig, ax = plt.subplots(figsize=(5, 3))
sns.pointplot(
    data=df_simi,
    x="n_step",
    y="tanimoto_similarity",
    capsize=.2,
    ax=ax,
)
ax.set(
    xlabel="Reaction step",
    ylabel="Max Tanimoto similarity\nbetween products",
)
fig.savefig(
    os.path.join(FIG_PATH, "tanimoto_similarity_per_reaction_step.pdf"),
    bbox_inches="tight",
)