In [None]:
# Mount Google Drive at /content/drive; the project path added to sys.path and the
# data / output paths used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install rdkit==2024.9.5
!pip install torch_geometric==2.5.3

In [None]:
import os
import sys
import torch
sys.path.append("/content/drive/MyDrive/MMHRP-GCL-Code")
from utils.rxn import *
from utils.molecule import *
from torch_geometric.loader import DataLoader
from models.MMHRP_GCL import *
import time
from tqdm import tqdm
import datetime
import warnings
warnings.simplefilter('ignore')

In [None]:
rs_list = [1,2,3,4,5]

# The spreadsheet does not depend on the seed, so it is parsed once here instead of
# once per seed inside the loop. Each run below works on its own shuffled copy.
raw_data = pd.read_excel("/content/drive/MyDrive/MMHRP-GCL-Code/data/Suzuki_HTE/Suzuki_HTE_data.xlsx")

for rs in rs_list:
    # 1. import data
    # Shuffle all rows reproducibly with this run's seed. `sample` returns a new
    # frame, so `raw_data` stays untouched for the next seed.
    random_state = rs
    data = raw_data.sample(random_state=random_state, frac=1).reset_index(drop=True)
    vocab_type = "Suzuki"
    vocab_path = "/content/drive/MyDrive/MMHRP-GCL-Code/utils/%s_vocab.txt" % vocab_type

    # Generate Rxnsmi
    # One space-joined token string per row. `max_len` tracks the largest len() of
    # the value returned by get_Suzuki_RxnSmi (i.e. before tokenization); it is
    # handed to RxnSmi_to_tensor as `maxlen_` when the dataset is built.
    rxn_RxnSmi = list()
    max_len = -1
    for row_idx in range(data.shape[0]):
        raw_rxn_smi = get_Suzuki_RxnSmi(data.iloc[row_idx, :])
        max_len = max(max_len, len(raw_rxn_smi))
        rxn_RxnSmi.append(" ".join(smi_tokenizer(raw_rxn_smi)))

    # Build one sample per row: [reactant graph, additive graph, RxnSmi tensor, yield / 100].
    rxn_dataset = list()
    smi_inputsize = 128
    # Optional condition columns, in the order they are appended; a null cell is skipped.
    additive_columns = ("Reagent_1_Short_Hand", "Ligand_Short_Hand", "Solvent_1_Short_Hand")

    for row_idx in tqdm(range(data.shape[0])):
        # The index was reset after shuffling, so label row_idx is the row_idx-th row.
        row = data.loc[row_idx]

        # rea
        reactant_graph = smis_to_graph([row["Reactant_1_Name"], row["Reactant_2_Name"]])

        # add
        additives = [row[col] for col in additive_columns if not pd.isnull(row[col])]
        additive_graph = smis_to_graph(additives)

        # RxnSmi
        RxnSmi_vec = RxnSmi_to_tensor(
            RxnSmi=rxn_RxnSmi[row_idx],
            maxlen_=max_len,
            victor_size=smi_inputsize,
            file=vocab_path,
        )

        # yield
        # The column is divided by 100 here and multiplied back by 100 for plotting.
        yield_fraction = row["Product_Yield_PCT_Area_UV"] / 100

        rxn_dataset.append([reactant_graph, additive_graph, RxnSmi_vec, yield_fraction])

    # report
    # One timestamped output directory per seed; the training report, the
    # performance figure and the saved model of this run are written into it.
    dir_path = "/content/drive/MyDrive/MMHRP-GCL-Code/exp/Suzuki/Suzuki_MMHRP_rs=%s_%s" % (
        random_state, datetime.datetime.now())
    # makedirs also creates missing parent folders (e.g. exp/Suzuki on a fresh
    # checkout), where mkdir would raise FileNotFoundError. Like mkdir, it still
    # raises if the run directory itself already exists.
    os.makedirs(dir_path)
    # The handle stays open across the whole run: the training loop and the final
    # evaluation write to `f`, and the evaluation step closes it.
    f = open("%s/Model_Training_Report.txt" % dir_path, mode="w")

    # split of train & test set
    ratio = 0.7
    batch_size = 72
    # First `ratio` of the (already shuffled) samples train, the rest test.
    split_idx = int(ratio * len(rxn_dataset))
    train_set = rxn_dataset[:split_idx]
    # The test slice starts exactly where the train slice ends. Starting at
    # split_idx + 1 left the sample at split_idx in neither set.
    test_set = rxn_dataset[split_idx:]
    # data_loader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

    # Per-epoch metric histories for this seed; one value is appended per epoch.
    test_RMSE = list()
    test_R2 = list()
    test_MAE = list()
    train_R2 = list()
    train_RMSE = list()
    train_MAE = list()
    # Prediction / target buffers; they are rebuilt on every evaluation pass.
    pred = list()
    true = list()

    # 3. training of the model
    # params
    t = 700            # number of epochs
    lr = 1e-3          # initial Adam learning rate
    num_feature = 8    # passed to the graph encoder as "NodeFeatNum"
    smi_inputsize = 128  # passed to the text encoder as "SmiFeatNum"

    # use gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # `random_state` above only fixes the pandas row shuffle. Seeding torch as well
    # makes weight initialisation and DataLoader shuffling repeatable for a given
    # seed, so a run can be reproduced from its reported random_state.
    torch.manual_seed(random_state)

    # model
    model = MMHRP_GCL(
      GraphEncoder={
        "NodeFeatNum":num_feature,
        "Channels":[32, 64],
        "Heads":4
      },
      TextEncoder={
        "SmiFeatNum":smi_inputsize,
        "Heads":4,
        "BigruChannels":[128, 128, 128],
        "BigruNumlayer":1
      },
      Decoder = {
        "Heads":4,
        "Channels":[1000, 500, 100]
      },
      device=device
    )

    opti = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    model = model.to(device)
    criterion = nn.MSELoss()

    # writedown params
    # Header of the report: the hyper-parameters this run was started with.
    f.writelines([
        "params:\n",
        "random_state=%s\n" % random_state,
        "ratio=%s\n" % ratio,
        "batch_size=%s\n" % batch_size,
        "t=%s\n" % t,
        "lr=%s\n" % lr,
        "vocab_type=%s\n" % vocab_type,
    ])

    # Training

    # best performance
    # Slot layout: train_R2, train_RMSE, train_MAE, test_R2, test_RMSE, test_MAE,
    #              test_predict, test_true, model
    best = [0] * 6 + [[], [], []]

    f.write("\nStart training\n")

    import copy  # local import: only needed to snapshot the best model below

    for epoch in tqdm(range(t)):
        if epoch == 200:
            # Continue with a tenfold smaller learning rate from epoch 200 on.
            # A fresh Adam instance is created, so its internal state restarts too.
            opti = torch.optim.Adam(model.parameters(), lr=lr/10, weight_decay=1e-5)

        # Training
        # train() re-enables dropout / batch-norm updates that eval() below switches off.
        model.train()
        epoch_loss = 0.0
        # Each batch is a list whose last item is the target; everything before it is model input.
        for batch_data in train_loader:
            x = [part.to(device) for part in batch_data[:-1]]
            y = torch.unsqueeze(batch_data[-1], dim=1).to(device)
            loss = criterion(model(x), y)
            opti.zero_grad()
            loss.backward()
            opti.step()
            epoch_loss += loss.item()
        # MSELoss already averages within a batch, so the epoch loss is the mean
        # over batches, not the sum divided by the batch size.
        mean_loss = epoch_loss / max(len(train_loader), 1)

        # record of loss during training
        # performance in train set, then in test set
        model.eval()
        with torch.no_grad():
            for loader, rmse_log, r2_log, mae_log in (
                (train_loader, train_RMSE, train_R2, train_MAE),
                (test_loader, test_RMSE, test_R2, test_MAE),
            ):
                pred = list()
                true = list()
                for batch_data in loader:
                    x = [part.to(device) for part in batch_data[:-1]]
                    tr = list(torch.unsqueeze(batch_data[-1], dim=1).detach().numpy())
                    pr = list(model(x).cpu().detach().numpy())
                    pred += pr
                    true += tr
                pred_arr = np.array(pred)
                true_arr = np.array(true)
                # Argument order per metric is kept exactly as the helpers were called before.
                rmse_log.append(RMSE(pred_arr, true_arr))
                r2_log.append(R2(pred_arr, true_arr))
                mae_log.append(MAE(true_arr, pred_arr))

            # After the loop above `pred` / `true` hold the test-set values.
            if epoch == 0 or test_R2[-1] > best[3]:
                # Store a deep copy: keeping a reference to `model` itself would make
                # the "best" entry follow every later weight update, so the model
                # saved at the end would be the last epoch, not the best one.
                best = [train_R2[-1], train_RMSE[-1], train_MAE[-1], test_R2[-1], test_RMSE[-1], test_MAE[-1], pred,
                        true, copy.deepcopy(model)]

        # write report
        f.write(
            "Epoch:%d loss: %f, R2:train set %.3f\ttest set %.3f\tRMSE:train set %.3f\ttest set %.3f\tMAE:train set %.3f\ttest set %.3f\n" % (
            epoch + 1, mean_loss, train_R2[-1], test_R2[-1], train_RMSE[-1], test_RMSE[-1],
            train_MAE[-1], test_MAE[-1]))

    # 4.Evaluation
    # Each summary line reports mean +- std over the last 10 epochs of a metric
    # history, plus the value recorded in `best` at the epoch with the highest test R2.
    # Entries: (format string, metric history, index of that metric inside `best`).
    summary_specs = (
        # Performance in train set
        ("R2 of train set is:%.3f+-%f\tbest:%f\n", train_R2, 0),
        ("RMSE of train set is: %.3f+-%f\tbest:%f\n", train_RMSE, 1),
        ("MAE of train set is: %.3f+-%f\tbest:%f\n", train_MAE, 2),
        # Performance in test set
        ("R2 of test set is:%.3f+-%.3f\tbest:%f\n", test_R2, 3),
        ("RMSE of test set is: %.3f+-%f\tbest:%f\n", test_RMSE, 4),
        ("MAE of test set is: %.3f+-%f\tbest:%f\n", test_MAE, 5),
    )

    summary_lines = []
    for line_format, history, best_slot in summary_specs:
        last_epochs = np.array(history[-10:])
        summary_lines.append(line_format % (last_epochs.mean(), last_epochs.std(), best[best_slot]))

    # Append the summary to the report and release the file handle.
    f.write("\n")
    f.writelines(summary_lines)
    f.close()

    # Echo the same lines to the notebook output (print adds its own newline,
    # so every line is followed by a blank one).
    for summary_line in summary_lines:
        print(summary_line)

    # 5.Figure
    import matplotlib.pyplot as plt
    import seaborn as sns  # kept from the original cell; not referenced in this block

    # Two panels side by side: R2 history (left) and predicted-vs-observed scatter (right).
    fig, (ax_curve, ax_scatter) = plt.subplots(1, 2, dpi=500, figsize=(10, 5))

    # Training Fig
    steps = np.linspace(1, t, t)  # epoch numbers 1..t
    ax_curve.plot(steps, train_R2, color=[236 / 255, 164 / 255, 124 / 255])
    ax_curve.plot(steps, test_R2, color=[117 / 255, 157 / 255, 219 / 255])
    # Beautify
    ax_curve.legend(["train set", "test set"], loc="upper left", prop={'size': 8})
    ax_curve.set_xlabel("Epoch", fontsize=10)
    ax_curve.set_ylabel("R$^2$", fontsize=10)
    ax_curve.set_title("The R$^2$ of train & test set during training", fontsize=13)

    # Test set performance
    # best[6] / best[7] are the test predictions / targets stored with the best
    # test R2; targets were divided by 100 when the dataset was built, so scale back.
    observed = np.array(best[7]).flatten() * 100
    predicted = np.array(best[6]).flatten() * 100
    ax_scatter.scatter(predicted, observed, alpha=0.7, marker=".")
    ax_scatter.set_xlabel("Predicted Yield", fontsize=10)
    ax_scatter.set_ylabel("Observed Yield", fontsize=10)
    # Dashed y = x reference line over the 0-100 range.
    diagonal = np.linspace(0, 100, 100)
    ax_scatter.plot(diagonal, diagonal, linestyle="--", color="r")
    ax_scatter.set_title("Test set performance", fontsize=15)
    fig.suptitle("MMHRP for Suzuki dataset", fontsize=16)
    fig.tight_layout()
    fig.savefig("%s/Performance_Figure.png" % dir_path)
    plt.show()
    # One 500-dpi figure is created per seed; close it so the figures do not
    # accumulate in memory across iterations of the seed loop.
    plt.close(fig)

    # Persist the model object stored in the last slot of `best`.
    torch.save(best[-1], "%s/model.pth" % dir_path)