In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install rdkit==2024.9.5
!pip install torch_geometric==2.5.3
!pip install umap-learn

In [None]:
# Modality Vector Generator
import os
import sys
import torch
sys.path.append("/content/drive/MyDrive/MMHRP-GCL-Code")
from utils.rxn import *
from utils.molecule import *
from torch_geometric.loader import DataLoader
from models.GNN_Models import *
import time
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
import umap

In [None]:
# RGB triples with channels scaled to 0-1. The plotting cells index this list as
# colors[0], colors[1], colors[2] for the three scatter series they draw.
colors = [[114/255, 188/255, 213/255], [170/255, 220/255, 224/255], [231/255, 98/255, 84/255]]

def get_embed_vec(model, x):
  """Run the three modality encoders of ``model`` on one batch and return their outputs.

  Args:
    model: object exposing ``ReaProEncoder``, ``CatSolEncoder`` and ``RxnSmiEncoder``
      callables.
    x: indexable of three encoder inputs, consumed as ``x[0]`` (first graph encoder),
      ``x[1]`` (second graph encoder) and ``x[2]`` (text encoder).

  Returns:
    Tuple ``(graph_emb1, graph_emb2, text_embed)`` in that order. The tensors are
    produced under ``torch.no_grad()`` and therefore carry no autograd history.
  """
  # This helper is only used to extract latent vectors for visualisation, so gradient
  # tracking is disabled to avoid building an autograd graph on every call.
  with torch.no_grad():
    # Graph Modality
    graph_emb1 = model.ReaProEncoder(x[0])
    graph_emb2 = model.CatSolEncoder(x[1])

    # Text Modality
    text_embed = model.RxnSmiEncoder(x[2])

  return graph_emb1, graph_emb2, text_embed

In [None]:
# BH
# 1. import data
data = pd.read_excel("/content/drive/MyDrive/MMHRP-GCL-Code/data/BH_HTE/BH_HTE_data.xlsx")
vocab_type = "BH"
vocab_path = "/content/drive/MyDrive/MMHRP-GCL-Code/utils/%s_vocab.txt" % vocab_type

# 2. build dataset & dataloader

# First pass: build the space-joined token string for every reaction and record the
# longest *untokenised* reaction SMILES (character count). That maximum is needed
# before any row can be converted to a tensor, hence the separate pass.
rxn_RxnSmi = list()
max_len = -1
for row_idx in range(data.shape[0]):
    raw_rxn_smi = get_Buchwald_RxnSmi(data.iloc[row_idx, :])
    max_len = max(max_len, len(raw_rxn_smi))
    rxn_RxnSmi.append(" ".join(smi_tokenizer(raw_rxn_smi)))

# Second pass: one sample per row, laid out as
# [smis_to_graph(reactant, product), smis_to_graph(base, ligand, additive), RxnSmi tensor, target].
rxn_dataset = list()
smi_inputsize = 128

for row_idx in tqdm(range(data.shape[0])):
    # Fetch the row once, by position (same indexing as the first pass), instead of
    # re-materialising it with data.loc[...] for every column that is read.
    row = data.iloc[row_idx]
    meta = list()
    # rea
    meta.append(smis_to_graph([row["aryl_halide_smiles"], row["product_smiles"]]))
    # add
    meta.append(smis_to_graph([row["base_smiles"], row["ligand_smiles"], row["additive_smiles"]]))
    # RxnSmi
    RxnSmi_vec = RxnSmi_to_tensor(RxnSmi=rxn_RxnSmi[row_idx], maxlen_=max_len, victor_size=smi_inputsize,
                                  file=vocab_path)
    meta.append(RxnSmi_vec)

    # yield (the column is divided by 100 before being stored)
    meta.append(row["yield"] / 100)

    rxn_dataset.append(meta)

# import model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is loaded as a whole pickled model object (it is used directly and
# .eval() is called on it), so weights_only=False is required: PyTorch >= 2.6 defaults
# to weights_only=True, which refuses to unpickle arbitrary classes.
# NOTE: unpickling executes code from the file - only load checkpoints you trust.
model = torch.load("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/BH_model.pth", map_location=device,
                   weights_only=False).eval()

emb_vecs = []
info = DataLoader(rxn_dataset, batch_size=1)
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for i in info:
        # The last element of each sample is the target value; the encoders only need the rest.
        x = [j.to(device) for j in i[:-1]]
        graph_emb1, graph_emb2, text_emb = get_embed_vec(model, x)
        # Appended in a fixed order (graph 1, graph 2, text) for every sample.
        emb_vecs.append(graph_emb1.detach().cpu().numpy())
        emb_vecs.append(graph_emb2.detach().cpu().numpy())
        emb_vecs.append(text_emb.detach().cpu().numpy())

# Flatten to rows of width 128 (assumes every encoder emits a 128-dim vector per
# sample - TODO confirm against the model definition).
emb_vecs = np.array(emb_vecs).reshape(-1, 128)
# DR
# Fixed seed so the 2-D layout, and therefore the saved figure, is reproducible across runs.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(emb_vecs)
plt.figure(figsize=(5, 5), dpi=200)
# emb_vecs is filled three vectors at a time (graph 1, graph 2, text), so - assuming each
# vector occupies one row after the reshape - every modality is a stride-3 slice.
# One scatter call per modality replaces one call (and one artist) per point.
a = plt.scatter(embedding[0::3, 0], embedding[0::3, 1], color=colors[0])
b = plt.scatter(embedding[1::3, 0], embedding[1::3, 1], color=colors[1])
c = plt.scatter(embedding[2::3, 0], embedding[2::3, 1], color=colors[2])
labels = [a, b, c]
plt.title('UMAP projection of the latent vectors in B-H dataset')
plt.legend(labels, ["Graph Modality\n(Reactants&Products)", "Graph Modality\n(Catalysts&Solvents)", "Text Modality"], loc="upper right", prop={'size': 6})
plt.xlabel("UMAP 1", fontsize=10)
plt.ylabel("UMAP 2", fontsize=10)
# UMAP axes have no interpretable scale, so tick marks are hidden.
plt.xticks([])
plt.yticks([])
plt.savefig("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/BH_umap.png")

In [None]:
# Suzuki
# 1. import data
data = pd.read_excel("/content/drive/MyDrive/MMHRP-GCL-Code/data/Suzuki_HTE/Suzuki_HTE_data.xlsx")
vocab_type = "Suzuki"
vocab_path = "/content/drive/MyDrive/MMHRP-GCL-Code/utils/%s_vocab.txt" % vocab_type

# Generate Rxnsmi
# First pass: build the space-joined token string for every reaction and record the
# longest *untokenised* reaction SMILES (character count). That maximum is needed
# before any row can be converted to a tensor, hence the separate pass.
rxn_RxnSmi = list()
max_len = -1
for row_idx in range(data.shape[0]):
    raw_rxn_smi = get_Suzuki_RxnSmi(data.iloc[row_idx, :])
    max_len = max(max_len, len(raw_rxn_smi))
    rxn_RxnSmi.append(" ".join(smi_tokenizer(raw_rxn_smi)))

# Second pass: one sample per row, laid out as
# [smis_to_graph(reactants), smis_to_graph(non-null additives), RxnSmi tensor, target].
rxn_dataset = list()
smi_inputsize = 128

# Optional condition columns; a row may leave any of them empty.
additive_columns = ("Reagent_1_Short_Hand", "Ligand_Short_Hand", "Solvent_1_Short_Hand")

for row_idx in tqdm(range(data.shape[0])):
    # Fetch the row once, by position (same indexing as the first pass), instead of
    # re-materialising it with data.loc[...] for every column that is read.
    row = data.iloc[row_idx]
    meta = list()
    # rea
    meta.append(smis_to_graph([row["Reactant_1_Name"], row["Reactant_2_Name"]]))
    # add: keep base, ligand, solvent (in that order), skipping missing entries
    add = [row[col] for col in additive_columns if not pd.isnull(row[col])]
    meta.append(smis_to_graph(add))

    # RxnSmi
    RxnSmi_vec = RxnSmi_to_tensor(RxnSmi=rxn_RxnSmi[row_idx], maxlen_=max_len, victor_size=smi_inputsize,
                                  file=vocab_path)
    meta.append(RxnSmi_vec)

    # yield (the column is divided by 100 before being stored)
    meta.append(row["Product_Yield_PCT_Area_UV"] / 100)

    rxn_dataset.append(meta)

# import model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is loaded as a whole pickled model object (it is used directly and
# .eval() is called on it), so weights_only=False is required: PyTorch >= 2.6 defaults
# to weights_only=True, which refuses to unpickle arbitrary classes.
# NOTE: unpickling executes code from the file - only load checkpoints you trust.
model = torch.load("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/Suzuki_model.pth", map_location=device,
                   weights_only=False).eval()

emb_vecs = []
info = DataLoader(rxn_dataset, batch_size=1)
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for i in info:
        # The last element of each sample is the target value; the encoders only need the rest.
        x = [j.to(device) for j in i[:-1]]
        graph_emb1, graph_emb2, text_emb = get_embed_vec(model, x)
        # Appended in a fixed order (graph 1, graph 2, text) for every sample.
        emb_vecs.append(graph_emb1.detach().cpu().numpy())
        emb_vecs.append(graph_emb2.detach().cpu().numpy())
        emb_vecs.append(text_emb.detach().cpu().numpy())

# Flatten to rows of width 128 (assumes every encoder emits a 128-dim vector per
# sample - TODO confirm against the model definition).
emb_vecs = np.array(emb_vecs).reshape(-1, 128)
# DR
# Fixed seed so the 2-D layout, and therefore the saved figure, is reproducible across runs.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(emb_vecs)
plt.figure(figsize=(5, 5), dpi=200)
# emb_vecs is filled three vectors at a time (graph 1, graph 2, text), so - assuming each
# vector occupies one row after the reshape - every modality is a stride-3 slice.
# One scatter call per modality replaces one call (and one artist) per point.
a = plt.scatter(embedding[0::3, 0], embedding[0::3, 1], color=colors[0])
b = plt.scatter(embedding[1::3, 0], embedding[1::3, 1], color=colors[1])
c = plt.scatter(embedding[2::3, 0], embedding[2::3, 1], color=colors[2])
labels = [a, b, c]
plt.title('UMAP projection of the latent vectors in S-M dataset')
plt.legend(labels, ["Graph Modality\n(Reactants&Products)", "Graph Modality\n(Catalysts&Solvents)", "Text Modality"], loc="upper right", prop={'size': 6})
plt.xlabel("UMAP 1", fontsize=10)
plt.ylabel("UMAP 2", fontsize=10)
# UMAP axes have no interpretable scale, so tick marks are hidden.
plt.xticks([])
plt.yticks([])
plt.savefig("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/SM_umap.png")

In [None]:
# AT
# 1. import data
data = pd.read_csv("/content/drive/MyDrive/MMHRP-GCL-Code/data/AT/Asymmetric_Thiol_Addition.csv")
vocab_type = "AT"
vocab_path = "/content/drive/MyDrive/MMHRP-GCL-Code/utils/%s_vocab.txt" % vocab_type

# Generate Rxnsmi
# First pass: build the space-joined token string for every reaction and record the
# longest *untokenised* reaction SMILES (character count). That maximum is needed
# before any row can be converted to a tensor, hence the separate pass.
rxn_RxnSmi = list()
max_len = -1
for row_idx in range(data.shape[0]):
    raw_rxn_smi = get_AT_RxnSmi(data.iloc[row_idx, :])
    max_len = max(max_len, len(raw_rxn_smi))
    rxn_RxnSmi.append(" ".join(smi_tokenizer(raw_rxn_smi)))

# Second pass: one sample per row, laid out as
# [smis_to_graph(imine, thiol, product), smis_to_graph(catalyst), RxnSmi tensor, target].
rxn_dataset = list()
smi_inputsize = 128

for row_idx in tqdm(range(data.shape[0])):
    # Fetch the row once, by position (same indexing as the first pass), instead of
    # re-materialising it with data.loc[...] for every column that is read.
    row = data.iloc[row_idx]
    meta = list()
    # rea
    meta.append(smis_to_graph([row["Imine"], row["Thiol"], row["product"]]))
    # add: the catalyst is the only entry of the second graph input
    add = [row["Catalyst"]]
    meta.append(smis_to_graph(add))

    # RxnSmi
    RxnSmi_vec = RxnSmi_to_tensor(RxnSmi=rxn_RxnSmi[row_idx], maxlen_=max_len, victor_size=smi_inputsize,
                                  file=vocab_path)
    meta.append(RxnSmi_vec)

    # target: the "Output" column is stored as-is (no rescaling)
    meta.append(row["Output"])

    rxn_dataset.append(meta)

# import model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is loaded as a whole pickled model object (it is used directly and
# .eval() is called on it), so weights_only=False is required: PyTorch >= 2.6 defaults
# to weights_only=True, which refuses to unpickle arbitrary classes.
# NOTE: unpickling executes code from the file - only load checkpoints you trust.
model = torch.load("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/AT_model.pth", map_location=device,
                   weights_only=False).eval()

emb_vecs = []
info = DataLoader(rxn_dataset, batch_size=1)
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for i in info:
        # The last element of each sample is the target value; the encoders only need the rest.
        x = [j.to(device) for j in i[:-1]]
        graph_emb1, graph_emb2, text_emb = get_embed_vec(model, x)
        # Appended in a fixed order (graph 1, graph 2, text) for every sample.
        emb_vecs.append(graph_emb1.detach().cpu().numpy())
        emb_vecs.append(graph_emb2.detach().cpu().numpy())
        emb_vecs.append(text_emb.detach().cpu().numpy())

# Flatten to rows of width 128 (assumes every encoder emits a 128-dim vector per
# sample - TODO confirm against the model definition).
emb_vecs = np.array(emb_vecs).reshape(-1, 128)
# DR
# Fixed seed so the 2-D layout, and therefore the saved figure, is reproducible across runs.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(emb_vecs)
plt.figure(figsize=(5, 5), dpi=200)
# emb_vecs is filled three vectors at a time (graph 1, graph 2, text), so - assuming each
# vector occupies one row after the reshape - every modality is a stride-3 slice.
# One scatter call per modality replaces one call (and one artist) per point.
a = plt.scatter(embedding[0::3, 0], embedding[0::3, 1], color=colors[0])
b = plt.scatter(embedding[1::3, 0], embedding[1::3, 1], color=colors[1])
c = plt.scatter(embedding[2::3, 0], embedding[2::3, 1], color=colors[2])
labels = [a, b, c]
plt.title('UMAP projection of the latent vectors in A-T dataset')
plt.legend(labels, ["Graph Modality\n(Reactants&Products)", "Graph Modality\n(Catalysts&Solvents)", "Text Modality"], loc="upper right", prop={'size': 6})
plt.xlabel("UMAP 1", fontsize=10)
plt.ylabel("UMAP 2", fontsize=10)
# UMAP axes have no interpretable scale, so tick marks are hidden.
plt.xticks([])
plt.yticks([])
plt.savefig("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/AT_umap.png")

In [None]:
# SNAr
# 1. import data
data = pd.read_excel("/content/drive/MyDrive/MMHRP-GCL-Code/data/SNAR/SNAR_data.xlsx")
vocab_type = "SNAR"
vocab_path = "/content/drive/MyDrive/MMHRP-GCL-Code/utils/%s_vocab.txt" % vocab_type

# Generate Rxnsmi
# First pass: build the space-joined token string for every reaction and record the
# longest *untokenised* reaction SMILES (character count). That maximum is needed
# before any row can be converted to a tensor, hence the separate pass.
rxn_RxnSmi = list()
max_len = -1
for row_idx in range(data.shape[0]):
    raw_rxn_smi = get_SNAR_RxnSmi(data.iloc[row_idx, :])
    max_len = max(max_len, len(raw_rxn_smi))
    rxn_RxnSmi.append(" ".join(smi_tokenizer(raw_rxn_smi)))

# Second pass: one sample per row, laid out as
# [smis_to_graph(substrate, nucleophile, product), smis_to_graph(solvent parts), RxnSmi tensor, target].
rxn_dataset = list()
smi_inputsize = 128

for row_idx in tqdm(range(data.shape[0])):
    # Fetch the row once, by position (same indexing as the first pass), instead of
    # re-materialising it with data.loc[...] for every column that is read.
    row = data.iloc[row_idx]
    meta = list()
    # rea
    meta.append(smis_to_graph([row["Substrate SMILES"], row["Nucleophile SMILES"], row["Product SMILES"]]))
    # sol: the "Solvent" cell is split on "." into its individual components
    sol = row["Solvent"].split(".")
    meta.append(smis_to_graph(sol))

    # RxnSmi
    RxnSmi_vec = RxnSmi_to_tensor(RxnSmi=rxn_RxnSmi[row_idx], maxlen_=max_len, victor_size=smi_inputsize,
                                  file=vocab_path)
    meta.append(RxnSmi_vec)

    # activation energy (stored as-is, no rescaling)
    meta.append(row["exp_activation_energy"])

    rxn_dataset.append(meta)

# import model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is loaded as a whole pickled model object (it is used directly and
# .eval() is called on it), so weights_only=False is required: PyTorch >= 2.6 defaults
# to weights_only=True, which refuses to unpickle arbitrary classes.
# NOTE: unpickling executes code from the file - only load checkpoints you trust.
model = torch.load("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/SNAR_model.pth", map_location=device,
                   weights_only=False).eval()

emb_vecs = []
info = DataLoader(rxn_dataset, batch_size=1)
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for i in info:
        # The last element of each sample is the target value; the encoders only need the rest.
        x = [j.to(device) for j in i[:-1]]
        graph_emb1, graph_emb2, text_emb = get_embed_vec(model, x)
        # Appended in a fixed order (graph 1, graph 2, text) for every sample.
        emb_vecs.append(graph_emb1.detach().cpu().numpy())
        emb_vecs.append(graph_emb2.detach().cpu().numpy())
        emb_vecs.append(text_emb.detach().cpu().numpy())

# Flatten to rows of width 128 (assumes every encoder emits a 128-dim vector per
# sample - TODO confirm against the model definition).
emb_vecs = np.array(emb_vecs).reshape(-1, 128)
# DR
# Fixed seed so the 2-D layout, and therefore the saved figure, is reproducible across runs.
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(emb_vecs)
plt.figure(figsize=(5, 5), dpi=200)
# emb_vecs is filled three vectors at a time (graph 1, graph 2, text), so - assuming each
# vector occupies one row after the reshape - every modality is a stride-3 slice.
# One scatter call per modality replaces one call (and one artist) per point.
a = plt.scatter(embedding[0::3, 0], embedding[0::3, 1], color=colors[0])
b = plt.scatter(embedding[1::3, 0], embedding[1::3, 1], color=colors[1])
c = plt.scatter(embedding[2::3, 0], embedding[2::3, 1], color=colors[2])
labels = [a, b, c]
plt.title('UMAP projection of the latent vectors in S$_N$Ar dataset')
plt.legend(labels, ["Graph Modality\n(Reactants&Products)", "Graph Modality\n(Catalysts&Solvents)", "Text Modality"], loc="upper right", prop={'size': 6})
plt.xlabel("UMAP 1", fontsize=10)
plt.ylabel("UMAP 2", fontsize=10)
# UMAP axes have no interpretable scale, so tick marks are hidden.
plt.xticks([])
plt.yticks([])
plt.savefig("/content/drive/MyDrive/MMHRP-GCL-Code/exp/LVA/SNAR_umap.png")