In [None]:
import copy

from IPython.core.pylabtools import figsize
# Change the working directory to the parent of the current one; the relative
# paths used later in this notebook are resolved against it (presumably the
# repository root -- confirm for your checkout).
# NOTE: this is not idempotent -- re-running the cell moves up one more level.
%cd ..
# Load IPython's autoreload extension and use mode 2, which reloads imported
# modules before executing code so that edits to project modules are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import pandas as pd
from mol_gen_docking.data.pdb_uniprot.target_naming import fetch_uniprot_id_from_pdbid
from tqdm import tqdm
from multiprocessing import Pool
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem

# Directory where figures are meant to be written. The path is relative, so it
# is resolved against the current working directory at the time the cell runs.
FIG_PATH = "../-Philippe-MolGenDocking/Figures/reaction_data"
# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(FIG_PATH, exist_ok=True)
# Register tqdm with pandas, which enables the `progress_apply` family of methods.
tqdm.pandas()

# Prompts

In [None]:
from mol_gen_docking.data.pydantic_dataset import read_jsonl
from pathlib import Path

def load(path: str):
    """Read a JSONL prompt file and collect the metadata of each entry.

    Args:
        path: Location of the JSONL file. It is wrapped in ``Path`` before
            being handed to ``read_jsonl``.

    Returns:
        A list with one element per entry returned by ``read_jsonl``: the
        ``meta`` attribute of that entry's first conversation.
    """
    entries = read_jsonl(Path(path))
    metas = []
    for entry in entries:
        first_conversation = entry.conversations[0]
        metas.append(first_conversation.meta)
    return metas

def get_n_reactions_steps(row: pd.Series):
    """Return the number of reaction steps associated with a prompt row.

    Args:
        row: A row exposing ``"type of objective"``, ``"full_reaction"`` and
            ``"idx_chosen"``.

    Returns:
        For the ``"full synthesis"`` and ``"product prediction"`` objectives,
        the number of newline-separated steps in ``row["full_reaction"]``.
        For every other objective, ``row["idx_chosen"] + 1``.
    """
    whole_route_objectives = ("full synthesis", "product prediction")
    if row["type of objective"] in whole_route_objectives:
        return len(row["full_reaction"].split("\n"))
    # Any other objective: derive the count from the chosen index
    # (presumably a zero-based step index -- confirm against the data).
    return row["idx_chosen"] + 1

def runtime_get_reactants(row: pd.Series):
    """Extract the reactants of every step of a row's reaction string.

    ``row["full_reaction"]`` is split on newlines into steps. For each step,
    the text before the first ``" -> "`` is split on ``" + "``.

    Args:
        row: A row exposing ``"full_reaction"``.

    Returns:
        A list with one entry per step, each entry being the list of reactant
        strings of that step.
    """
    per_step_reactants = []
    for step in row["full_reaction"].split("\n"):
        left_hand_side = step.split(" -> ")[0]
        per_step_reactants.append(left_hand_side.split(" + "))
    return per_step_reactants

def runtime_get_products(row: pd.Series):
    """Extract the first product of every step of a row's reaction string.

    ``row["full_reaction"]`` is split on newlines into steps. For each step,
    the text between the first and second ``" -> "`` (or up to the end of the
    step) is split on ``" + "`` and only its first item is kept.

    Args:
        row: A row exposing ``"full_reaction"``.

    Returns:
        A list with one product string per step.

    Raises:
        IndexError: If a step does not contain ``" -> "``.
    """
    first_products = []
    for step in row["full_reaction"].split("\n"):
        right_hand_side = step.split(" -> ")[1]
        first_products.append(right_hand_side.split(" + ")[0])
    return first_products

def smarts_to_image(smarts):
    """Draw a reaction, given as a reaction SMARTS string, as an image.

    Args:
        smarts: Reaction SMARTS passed to ``AllChem.ReactionFromSmarts``.

    Returns:
        Whatever ``AllChem.Draw.ReactionToImage`` returns for the initialized
        reaction, drawn with a 300x300 sub-image size.
    """
    reaction = AllChem.ReactionFromSmarts(smarts)
    # The reaction is initialized before it is handed to the drawing code.
    reaction.Initialize()
    return AllChem.Draw.ReactionToImage(reaction, subImgSize=(300, 300))


def get_df(data_d):
    """Build the analysis DataFrame from a list of prompt metadata records.

    The records are loaded into a DataFrame, the ``properties`` and
    ``objectives`` columns are exploded together, and the following columns are
    derived:

    * ``"last molecule passes filter"``: last item of ``pass_filters``.
    * ``"prop. of molecules passing filter"``: mean of ``pass_filters``.
    * ``"type of objective"``: coarse label derived from ``objectives``.
    * ``"reaction_steps"``: ordered categorical of ``get_n_reactions_steps``.
    * ``"all_reactants"`` / ``"all_products"``: per-step reactants and products
      parsed from ``full_reaction``.

    Args:
        data_d: Records accepted by ``pd.DataFrame`` (e.g. the output of
            ``load``).

    Returns:
        The exploded DataFrame with the derived columns added.
    """

    def objective_type(objective):
        # The checks run in the same priority order as the labels below; an
        # objective matching none of them is returned unchanged.
        if "full_path" in objective:
            return "full synthesis"
        if "reactant" in objective:
            return "reactant prediction"
        if "products" in objective or "product" in objective:
            return "product prediction"
        return objective

    frame = pd.DataFrame(data_d)
    frame = frame.explode(["properties", "objectives"]).reset_index(drop=True)

    filter_flags = frame.pass_filters
    frame["last molecule passes filter"] = filter_flags.apply(lambda flags: flags[-1])
    frame["prop. of molecules passing filter"] = filter_flags.apply(
        lambda flags: np.mean(flags)
    )

    frame["type of objective"] = frame.objectives.apply(objective_type)

    # The step count depends on "type of objective", so it is computed after it.
    step_counts = frame.apply(get_n_reactions_steps, axis=1)
    frame["reaction_steps"] = pd.Categorical(step_counts, ordered=True)

    frame["all_reactants"] = frame.apply(runtime_get_reactants, axis=1)
    frame["all_products"] = frame.apply(runtime_get_products, axis=1)
    return frame

# Build the training-prompt table. The path is relative to the current working
# directory.
df = get_df(load("data/synthesis/train_prompts.jsonl"))
# Bare expression so the frame is rendered as the cell output.
df

In [None]:
from rdkit import DataStructs

def agg_tanimoto_sim(fps, smi):
    """Compute the Tanimoto similarity of a reference molecule to each fingerprint.

    Args:
        fps: Sequence of RDKit bit-vector fingerprints. For the similarities to
            be meaningful they should be built with the same settings used here
            for the reference (Morgan, radius 2, 2048 bits).
        smi: SMILES string of the reference molecule.

    Returns:
        A 1-D float numpy array of length ``len(fps)``; entry ``i`` is the
        Tanimoto similarity between ``fps[i]`` and the reference fingerprint.

    Raises:
        ValueError: If ``smi`` cannot be parsed into a molecule.
    """
    # Use the ``smi`` argument. The previous implementation read a global named
    # ``ref_smi`` instead, so the result silently depended on whatever that
    # global happened to hold (or raised NameError if it did not exist).
    ref_mol = Chem.MolFromSmiles(smi)
    if ref_mol is None:
        # MolFromSmiles returns None on unparsable input; fail with a clear
        # message instead of an opaque error from the fingerprint call.
        raise ValueError(f"Could not parse reference SMILES: {smi!r}")
    fp_ref = AllChem.GetMorganFingerprintAsBitVect(ref_mol, 2, nBits=2048)
    # A single bulk call replaces the per-fingerprint Python loop.
    sims = DataStructs.BulkTanimotoSimilarity(fp_ref, list(fps))
    return np.asarray(sims, dtype=float)

In [None]:
# Keep only the rows whose `impossible` value is falsy (assumes a boolean
# column -- confirm against the data).
sub_df = df[~df.impossible]
# One fingerprint per row of sub_df, in row order, computed from the LAST entry
# of `all_products` (Morgan, radius 2, 2048 bits). Because of the shared order,
# fps[k] corresponds to sub_df.iloc[k].
fps = []
for smis in tqdm(sub_df["all_products"]):
    fps.append(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smis[-1]), 2, nBits=2048))

In [None]:
# Load the metadata of the analog test prompts and explode the `properties`,
# `objectives` and `target` columns together (presumably equal-length lists per
# row -- confirm), so that each resulting row holds one item of each.
all_analogs = pd.DataFrame(load(Path("test/data/analog_test.jsonl"))).explode(["properties", "objectives", "target"])

In [None]:
# For every reference target, one list of (full_reaction, similarity) pairs.
example_results : list[list[tuple[str, float]]] = []

# Reference SMILES, one per row of the exploded analog table.
ref_smis =  all_analogs["target"]
for i,ref_smi in enumerate(ref_smis):
    example_results.append([])
    # Similarity of this reference to every fingerprint in `fps`.
    sims = agg_tanimoto_sim(fps,ref_smi)
    # Positions into `fps`, sorted by decreasing similarity.
    idxs = np.argsort(sims)[::-1]
    print("#-"*5, ref_smi, "-#"*5)
    # Take the entries ranked 0, 100, 200, 300 and 400 (at most five), i.e.
    # examples spread over the top 500 rather than the five nearest ones.
    for idx in idxs[:500:100]:
        # `idx` is positional, hence `.iloc`; this assumes `fps` was built by
        # iterating `sub_df` in order.
        example_results[-1].append((sub_df.iloc[idx].full_reaction, sims[idx]))

In [None]:
# Persist the collected examples.
# NOTE(review): despite the `.jsonl` extension, `json.dump` writes the whole
# nested list as ONE JSON document (tuples become arrays), not one JSON object
# per line -- confirm that the consumer of this file expects that.
with open("test/data/analog_test_compl_example.jsonl", "w") as f:
    json.dump(example_results, f)