In [None]:
# Move the working directory to the parent folder so relative paths used in
# later cells (e.g. "data/uspto/train.jsonl") are resolved from there.
# NOTE(review): re-running this cell moves up one more level each time;
# restart the kernel before re-executing it.
%cd ..
# Enable the autoreload extension; mode 2 reloads all imported modules before
# each cell execution so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import pandas as pd
from mol_gen_docking.data.pdb_uniprot.target_naming import fetch_uniprot_id_from_pdbid
from tqdm import tqdm
from multiprocessing import Pool
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Register tqdm's progress-bar variants (`progress_apply`, `progress_map`, ...)
# on pandas objects.
tqdm.pandas()

# Prompts

In [None]:
from mol_gen_docking.data.pydantic_dataset import read_jsonl
from pathlib import Path

def load(path: str):
    """Read a JSONL dataset and return the metadata of each entry.

    Args:
        path: Location of the JSONL file, passed to ``read_jsonl`` as a
            ``pathlib.Path``.

    Returns:
        A list with one element per entry yielded by ``read_jsonl``: the
        ``meta`` attribute of that entry's first conversation.
    """
    entries = read_jsonl(Path(path))
    metas = []
    for entry in entries:
        # Only the first conversation of each entry is inspected.
        first_conversation = entry.conversations[0]
        metas.append(first_conversation.meta)
    return metas

def get_df(data_d):
    """Tabulate metadata records and count the species in each reaction.

    Args:
        data_d: Records accepted by ``pd.DataFrame``. Each record must provide
            the list-like columns ``properties``, ``objectives`` and ``target``
            (exploded together, so they must have matching lengths per record)
            and a ``full_reaction`` string written as
            ``"<reactants> -> <products>"`` with species separated by ``" + "``.

    Returns:
        A DataFrame with a fresh 0..n-1 index, one row per exploded
        (property, objective, target) entry, and three added columns:
        ``n_reactants``, ``n_products`` and their sum ``n_elem_reaction``.
    """

    def _n_species(reaction, side):
        # side 0 is the left-hand side of " -> ", side 1 the right-hand side.
        return len(reaction.split(" -> ")[side].split(" + "))

    exploded = pd.DataFrame(data_d).explode(["properties", "objectives", "target"])
    df = exploded.reset_index(drop=True)

    reactions = df["full_reaction"]
    df["n_reactants"] = reactions.apply(lambda rxn: _n_species(rxn, 0))
    df["n_products"] = reactions.apply(lambda rxn: _n_species(rxn, 1))
    df["n_elem_reaction"] = df["n_reactants"] + df["n_products"]
    return df

# Read the training-set metadata, then tabulate it (one row per exploded
# property/objective/target entry, with reactant and product counts added).
train_meta = load("data/uspto/train.jsonl")
df = get_df(train_meta)

In [None]:
# Show only the first rows: a bounded preview keeps the saved notebook output
# small while still exposing every column.
df.head()

In [None]:

# Categorical versions of the two counts, stored under human-readable names
# that double as the axis and legend labels of the figure.
df["number of reactants"] = pd.Categorical(df["n_reactants"])
df["number of products"] = pd.Categorical(df["n_products"])

# Stacked histogram: one bar per reactant count, split by product count.
fig_counts, ax_counts = plt.subplots()
sns.histplot(
    data=df,
    x="number of reactants",
    hue="number of products",
    multiple="stack",
    palette="viridis",
    thresh=None,
    ax=ax_counts,
)

fig_counts.savefig("N_prod_reac.pdf", bbox_inches="tight")

In [None]:
from rdkit.Chem import Draw
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from rdkit import Chem
N_SOLV = 15  # number of most frequent solvents shown on the x-axis

# Rank solvents by number of rows and keep the N_SOLV most frequent ones.
count = df.groupby("solvent").size().sort_values(ascending=False)
to_keep = count.index.tolist()[:N_SOLV]

# Take an explicit copy before assigning a column: writing into a boolean-mask
# selection of `df` raises pandas' SettingWithCopyWarning and leaves it unclear
# whether `df` itself is affected.
df_plot = df[df["solvent"].isin(to_keep)].copy()
# Ordered categorical whose category order is the frequency ranking above.
df_plot["solvent"] = pd.Categorical(df_plot["solvent"], categories=to_keep, ordered=True)

fig, ax = plt.subplots(1, 1, figsize=(7, 3))
sns.histplot(df_plot, hue="objectives", x="solvent", thresh=None, multiple="stack", palette="husl", ax=ax)
ax.set_xlabel("")
ax.set_xlim(-0.5, N_SOLV - 0.5)
ax.set_xticks([])  # drop text ticks; molecule drawings are added below instead

# Draw each solvent with RDKit, treating the solvent value as a SMILES string.
# Values that RDKit cannot parse are skipped and get no image.
to_img = {}
for solv in to_keep:
    mol = Chem.MolFromSmiles(solv)
    if mol is not None:  # MolFromSmiles returns None when parsing fails
        to_img[solv] = Draw.MolToImage(mol, size=(80, 80))

# Attach the drawings at x = bar position, y = 0, in place of tick labels.
for i, solv in enumerate(to_keep):
    if solv not in to_img:
        continue
    imagebox = OffsetImage(to_img[solv], zoom=0.2)
    # A vertical box_alignment above 1 places the whole box below the anchor.
    ab = AnnotationBbox(imagebox, (i, 0), frameon=True, box_alignment=(0.5, 1.4))
    ax.add_artist(ab)

fig.savefig("solvents.pdf", bbox_inches="tight")