In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install rdkit

In [5]:
import os
import sys
import torch
import time
import datetime
from tqdm import tqdm
sys.path.append("/content/drive/MyDrive/HeckLit")
from utils.rxn import *
from utils.molecule import *
from models.ANN import *
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error as MAE

In [None]:
rs_list = [1, 2, 3, 4, 5]
# The fingerprint spreadsheet is the same for every seed, so it is parsed once here
# instead of once per run; each run below only works on a reshuffled copy of it.
heck_raw = pd.read_excel("/content/drive/MyDrive/HeckLit/data/Heck/Heck_fp.xlsx")
for rs in rs_list:
    # 1. import data
    random_state = rs
    # Seed torch as well, so that weight initialisation and DataLoader shuffling are
    # tied to the same random_state that is written to the training report.
    torch.manual_seed(random_state)
    # frac=1 draws every row, i.e. a seeded shuffle; sample() returns a new frame,
    # so heck_raw itself is left untouched for the next seed.
    data = heck_raw.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset & dataloader
    inter_dataset = list()  # [fingerprint, label] pairs of reactions with two reactants
    intra_dataset = list()  # [fingerprint, label] pairs of reactions with one reactant
    # NOTE(review): assumes df_to_rxn_list returns one entry per row, in row order -- confirm.
    rxn_list = df_to_rxn_list(data)
    len_drfp = 0
    split_num = 100  # not referenced in this cell; kept in case another cell reads it

    for batch in tqdm(range(data.shape[0])):
        rxn = rxn_list[batch]

        # features
        # .at reads the single cell directly instead of materialising the whole row first.
        drfp = torch.tensor(read_drfp(data.at[batch, "drfp"]), dtype=torch.float32)
        # Length of the fingerprint vector; used later as the input size of the models.
        len_drfp = drfp.shape[0]
        # label
        # Scaled by 1/100 (the figures multiply by 100 again and label the axis "Yield").
        y = rxn.rxn_yield / 100

        n_reactants = len(rxn.reactants)
        if n_reactants == 2:
            # Inter
            inter_dataset.append([drfp, y])
        elif n_reactants == 1:
            # Intra
            intra_dataset.append([drfp, y])

    # report
    # Every run writes into its own directory, named after the current timestamp.
    dir_path = "/content/drive/MyDrive/HeckLit/exp/Heck_split/Heck_drfp_split_%s" % datetime.datetime.now()
    # makedirs (unlike mkdir) also creates missing parent folders, so the very first
    # run does not fail when "exp/Heck_split" has not been created yet. It still
    # raises if the run directory itself already exists.
    os.makedirs(dir_path)
    # The report handle stays open for the rest of the loop body and is closed by
    # the f.close() call at the end of the iteration.
    f = open("%s/Model_Training_Report.txt" % dir_path, mode="w")

    # split of train & test set
    ratio = 0.8  # fraction of each dataset that goes into the training split
    f.write("random_state=%s\n" % random_state)
    f.write("ratio=%s\n" % ratio)

    # inter
    inter_batch_size = 1024
    inter_batch = len(inter_dataset)
    # Single cut index shared by both slices: the test set starts exactly where the
    # training set ends, so no sample is dropped between the two splits.
    inter_split = int(ratio * inter_batch)
    inter_train_set = inter_dataset[:inter_split]
    inter_test_set = inter_dataset[inter_split:]

    # data_loader
    inter_train_loader = DataLoader(inter_train_set, batch_size=inter_batch_size, shuffle=True)
    inter_test_loader = DataLoader(inter_test_set, batch_size=inter_batch_size, shuffle=True)

    # Per-epoch metric curves for the intermolecular model.
    inter_test_RMSE = list()
    inter_test_R2 = list()
    inter_train_R2 = list()
    inter_train_RMSE = list()

    # intra
    intra_batch_size = 512
    intra_batch = len(intra_dataset)
    # Same contiguous split as above, applied to the intramolecular samples.
    intra_split = int(ratio * intra_batch)
    intra_train_set = intra_dataset[:intra_split]
    intra_test_set = intra_dataset[intra_split:]

    # data_loader
    intra_train_loader = DataLoader(intra_train_set, batch_size=intra_batch_size, shuffle=True)
    intra_test_loader = DataLoader(intra_test_set, batch_size=intra_batch_size, shuffle=True)

    # Per-epoch metric curves for the intramolecular model.
    intra_test_RMSE = list()
    intra_test_R2 = list()
    intra_train_R2 = list()
    intra_train_RMSE = list()

    # 3. training of the model

    # Pick the GPU when torch can see one; otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hyper-parameters: number of epochs (*_t) and Adam learning rate (*_lr).
    # The original cell noted 1500 (inter) and 2000 (intra) next to the epoch counts,
    # presumably the values used for a full-length run -- TODO confirm.
    inter_t, inter_lr = 1, 1e-4
    intra_t, intra_lr = 1, 1e-4

    # Intermolecular model, with its own optimiser and mean-squared-error loss.
    inter_model = ANN(input_size=len_drfp).to(device)
    inter_opti = torch.optim.Adam(inter_model.parameters(), lr=inter_lr, weight_decay=1e-5)
    inter_criterion = nn.MSELoss()

    # Record the intermolecular settings in the report.
    for report_line in (
        "\nInter params:\n",
        "inter_batch_size=%s\n" % inter_batch_size,
        "inter_t=%s\n" % inter_t,
        "inter_lr=%s\n" % inter_lr,
    ):
        f.write(report_line)

    # Intramolecular model: same architecture and optimiser settings, separate weights.
    intra_model = ANN(input_size=len_drfp).to(device)
    intra_opti = torch.optim.Adam(intra_model.parameters(), lr=intra_lr, weight_decay=1e-5)
    intra_criterion = nn.MSELoss()

    # Record the intramolecular settings in the report.
    for report_line in (
        "\nIntra params:\n",
        "intra_batch_size=%s\n" % intra_batch_size,
        "intra_t=%s\n" % intra_t,
        "intra_lr=%s\n" % intra_lr,
    ):
        f.write(report_line)

    # Training and Evaluation

    # Inter
    # Inter best performance
    inter_best = [0, 0, 0, 0, [], []]  # train_R2, train_RMSE, test_R2, test_RMSE, test_predict, test_true

    f.write("\nInter Start training\n")

    for epoch in tqdm(range(inter_t)):
        # Training
        # train()/eval() only matter if ANN contains mode-dependent layers such as
        # dropout or batch-norm (not visible from this cell); otherwise they are no-ops.
        inter_model.train()
        global_loss = torch.tensor([0.])  # summed batch losses of this epoch (not reported)

        for features, labels in inter_train_loader:
            # Move tensors to `device` instead of calling .cuda(), so the CPU fallback
            # chosen when `device` was created actually works.
            x = features.to(device)
            # Labels arrive as a 1-D batch; add a trailing axis before comparing with the output.
            y = torch.unsqueeze(labels, dim=1).to(device)
            loss = inter_criterion(inter_model(x).float(), y.float())
            inter_opti.zero_grad()
            loss.backward()
            inter_opti.step()
            global_loss += loss.item()

        # record of loss during training
        inter_model.eval()

        # performance in train set
        with torch.no_grad():
            pred = list()
            true = list()
            for features, labels in inter_train_loader:
                x = features.to(device)
                # Per-sample arrays are collected in plain lists; later cells concatenate
                # these lists, so the container type is kept.
                true += list(torch.unsqueeze(labels, dim=1).numpy())
                pred += list(inter_model(x).cpu().detach().numpy())
            inter_train_RMSE.append(RMSE(np.array(pred), np.array(true)))
            inter_train_R2.append(R2(np.array(pred), np.array(true)))

        # performance in test set
        with torch.no_grad():
            inter_pred = list()
            inter_true = list()
            for features, labels in inter_test_loader:
                x = features.to(device)
                inter_true += list(torch.unsqueeze(labels, dim=1).numpy())
                inter_pred += list(inter_model(x).cpu().detach().numpy())
            inter_test_RMSE.append(RMSE(np.array(inter_pred), np.array(inter_true)))
            inter_test_R2.append(R2(np.array(inter_pred), np.array(inter_true)))

            # Keep the epoch with the highest test R2. The first epoch is always taken,
            # so a negative R2 can still replace the zero placeholder.
            if epoch == 0 or inter_test_R2[-1] > inter_best[2]:
                inter_best = [inter_train_R2[-1], inter_train_RMSE[-1], inter_test_R2[-1], inter_test_RMSE[-1],
                              inter_pred, inter_true]

    # Evaluation
    f.write("Intermolecular:\n")

    # Each summary line reports mean and standard deviation over (at most) the last
    # ten recorded epochs, followed by the value stored for the best epoch.
    inter_train_R2_tail = np.array(inter_train_R2[-10:])
    inter_train_RMSE_tail = np.array(inter_train_RMSE[-10:])
    inter_test_R2_tail = np.array(inter_test_R2[-10:])
    inter_test_RMSE_tail = np.array(inter_test_RMSE[-10:])

    # Performance in train set
    f.write("R2 of train set is:%.3f+-%f\tbest:%f\n"
            % (inter_train_R2_tail.mean(), inter_train_R2_tail.std(), inter_best[0]))
    f.write("RMSE of train set is: %.3f+-%f\tbest:%f\n"
            % (inter_train_RMSE_tail.mean(), inter_train_RMSE_tail.std(), inter_best[1]))

    # Performance in test set
    f.write("R2 of test set is:%.3f+-%.3f\tbest:%f\n"
            % (inter_test_R2_tail.mean(), inter_test_R2_tail.std(), inter_best[2]))
    f.write("RMSE of test set is: %.3f+-%f\tbest:%f\n"
            % (inter_test_RMSE_tail.mean(), inter_test_RMSE_tail.std(), inter_best[3]))

    # MAE between the true values and predictions stored for the best epoch.
    inter_best_true = np.array(inter_best[5])
    inter_best_pred = np.array(inter_best[4])
    f.write("MAE of test set is: best:%f\n" % MAE(inter_best_true, inter_best_pred))
    f.write("\n")

    fig = plt.figure(dpi=500, figsize=(24, 7))

    # Training Fig
    # Left panel of a 1x3 grid: R2 on the train and test splits for every epoch.
    ax_curve = fig.add_subplot(1, 3, 1)
    steps = np.linspace(1, inter_t, inter_t)
    ax_curve.plot(steps, inter_train_R2, color=[236 / 255, 164 / 255, 124 / 255])
    ax_curve.plot(steps, inter_test_R2, color=[117 / 255, 157 / 255, 219 / 255])
    # Beautify
    ax_curve.legend(["train set R$^2$", "test set R$^2$"], loc="upper left", prop={'size': 10})
    ax_curve.set_xlabel("Epoch", fontsize=10)
    ax_curve.set_ylabel("R$^2$ value", fontsize=10)
    ax_curve.set_title("The R$^2$ value during training", fontsize=13)

    # Test set performance
    # Middle panel: predicted against observed values of the best epoch, scaled by 100.
    ax_parity = fig.add_subplot(1, 3, 2)
    observed = np.array(inter_best[5]).flatten() * 100
    predicted = np.array(inter_best[4]).flatten() * 100
    ax_parity.scatter(predicted, observed, alpha=0.7, marker=".")
    ax_parity.set_xlabel("Predicted Yield", fontsize=10)
    ax_parity.set_ylabel("Observed Yield", fontsize=10)
    # Dashed red diagonal from 0 to 100 marks a perfect prediction.
    diagonal = np.linspace(0, 100, 100)
    ax_parity.plot(diagonal, diagonal, linestyle="--", color="r")
    ax_parity.set_title("Test set performance", fontsize=15)

    fig.suptitle("DRFP for Heck dataset", fontsize=16)
    fig.tight_layout()
    fig.savefig("%s/Performance_Figure.png" % dir_path)
    plt.show()

    # Training and Evaluation
    # Intra

    # Intra best performance
    intra_best = [0, 0, 0, 0, [], []]  # train_R2, train_RMSE, test_R2, test_RMSE, test_predict, test_true

    f.write("\nIntra Start training\n")

    for epoch in tqdm(range(intra_t)):
        # Training
        # train()/eval() only matter if ANN contains mode-dependent layers such as
        # dropout or batch-norm (not visible from this cell); otherwise they are no-ops.
        intra_model.train()
        global_loss = torch.tensor([0.])  # summed batch losses of this epoch (not reported)

        for features, labels in intra_train_loader:
            # Move tensors to `device` instead of calling .cuda(), so the CPU fallback
            # chosen when `device` was created actually works.
            x = features.to(device)
            # Labels arrive as a 1-D batch; add a trailing axis before comparing with the output.
            y = torch.unsqueeze(labels, dim=1).to(device)
            loss = intra_criterion(intra_model(x).float(), y.float())
            intra_opti.zero_grad()
            loss.backward()
            intra_opti.step()
            global_loss += loss.item()

        # record of loss during training
        intra_model.eval()

        # performance in train set
        with torch.no_grad():
            pred = list()
            true = list()
            for features, labels in intra_train_loader:
                x = features.to(device)
                # Per-sample arrays are collected in plain lists; later cells concatenate
                # these lists, so the container type is kept.
                true += list(torch.unsqueeze(labels, dim=1).numpy())
                pred += list(intra_model(x).cpu().detach().numpy())
            intra_train_RMSE.append(RMSE(np.array(pred), np.array(true)))
            intra_train_R2.append(R2(np.array(pred), np.array(true)))

        # performance in test set
        with torch.no_grad():
            intra_pred = list()
            intra_true = list()
            for features, labels in intra_test_loader:
                x = features.to(device)
                intra_true += list(torch.unsqueeze(labels, dim=1).numpy())
                intra_pred += list(intra_model(x).cpu().detach().numpy())
            intra_test_RMSE.append(RMSE(np.array(intra_pred), np.array(intra_true)))
            intra_test_R2.append(R2(np.array(intra_pred), np.array(intra_true)))

            # Keep the epoch with the highest test R2. The first epoch is always taken,
            # so a negative R2 can still replace the zero placeholder.
            if epoch == 0 or intra_test_R2[-1] > intra_best[2]:
                intra_best = [intra_train_R2[-1], intra_train_RMSE[-1], intra_test_R2[-1], intra_test_RMSE[-1],
                              intra_pred, intra_true]

    f.write("\n")
    f.write("Intramolecular:\n")

    # Each summary line reports mean and standard deviation over (at most) the last
    # ten recorded epochs, followed by the value stored for the best epoch.
    intra_train_R2_tail = np.array(intra_train_R2[-10:])
    intra_train_RMSE_tail = np.array(intra_train_RMSE[-10:])
    intra_test_R2_tail = np.array(intra_test_R2[-10:])
    intra_test_RMSE_tail = np.array(intra_test_RMSE[-10:])

    # Performance in train set
    f.write("R2 of train set is:%.3f+-%f\tbest:%f\n"
            % (intra_train_R2_tail.mean(), intra_train_R2_tail.std(), intra_best[0]))
    f.write("RMSE of train set is: %.3f+-%f\tbest:%f\n"
            % (intra_train_RMSE_tail.mean(), intra_train_RMSE_tail.std(), intra_best[1]))

    # Performance in test set
    f.write("R2 of test set is:%.3f+-%.3f\tbest:%f\n"
            % (intra_test_R2_tail.mean(), intra_test_R2_tail.std(), intra_best[2]))
    f.write("RMSE of test set is: %.3f+-%f\tbest:%f\n"
            % (intra_test_RMSE_tail.mean(), intra_test_RMSE_tail.std(), intra_best[3]))

    # MAE between the true values and predictions stored for the best epoch.
    intra_best_true = np.array(intra_best[5])
    intra_best_pred = np.array(intra_best[4])
    f.write("MAE of test set is: best:%f\n" % MAE(intra_best_true, intra_best_pred))
    f.write("\n")

    # Intra
    fig = plt.figure(dpi=120, figsize=(20, 7))

    # Training Fig
    # Left panel of a 1x3 grid: R2 on the train and test splits for every epoch.
    ax_curve = fig.add_subplot(1, 3, 1)
    steps = np.linspace(1, intra_t, intra_t)
    ax_curve.plot(steps, intra_train_R2, color=[236 / 255, 164 / 255, 124 / 255])
    ax_curve.plot(steps, intra_test_R2, color=[117 / 255, 157 / 255, 219 / 255])
    # Beautify
    ax_curve.legend(["train set R$^2$", "test set R$^2$"], loc="upper left", prop={'size': 10})
    ax_curve.set_xlabel("Epoch", fontsize=10)
    ax_curve.set_ylabel("R$^2$ value", fontsize=10)
    ax_curve.set_title("The R$^2$ value during training", fontsize=13)

    # Test set performance
    # Middle panel: predicted against observed values of the best epoch, scaled by 100.
    ax_parity = fig.add_subplot(1, 3, 2)
    intra_tr = np.array(intra_best[5]).flatten() * 100
    intra_pr = np.array(intra_best[4]).flatten() * 100
    ax_parity.scatter(intra_pr, intra_tr, alpha=0.7, marker=".")
    ax_parity.set_xlabel("Predicted Yield", fontsize=10)
    ax_parity.set_ylabel("Observed Yield", fontsize=10)
    # Dashed red diagonal from 0 to 100 marks a perfect prediction.
    diagonal = np.linspace(0, 100, 100)
    ax_parity.plot(diagonal, diagonal, linestyle="--", color="r")
    ax_parity.set_title("Test set performance", fontsize=15)

    fig.suptitle("DRFP for Heck Intramolecular dataset", fontsize=16)
    fig.tight_layout()
    fig.savefig("%s/Intramolecular Performance Figure.png" % dir_path)
    plt.show()

    # Performance in both in test set
    # Pool the best-epoch test predictions / true values of the two models by
    # concatenating the intramolecular lists with the intermolecular ones.
    pooled_pred = intra_best[4] + inter_best[4]
    pooled_true = intra_best[5] + inter_best[5]
    r2 = R2(np.array(pooled_pred), np.array(pooled_true))
    rmse = RMSE(np.array(pooled_pred), np.array(pooled_true))

    # Append the pooled scores and release the report file of this run.
    for report_line in (
        "\nThe performance in both datasets\n",
        "R2 for both testset is:%f\n" % r2,
        "RMSE for both testset is: %f\n" % rmse,
    ):
        f.write(report_line)
    f.close()