In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install rdkit

In [4]:
import os
import sys
import torch
from torch.utils.data import DataLoader
sys.path.append("/content/drive/MyDrive/HeckLit")
from utils.rxn import *
from utils.molecule import *
from utils.dataset_analysis import *
from models.ANN import *
import time
from tqdm import tqdm
import datetime

In [None]:
# Random seeds used for the row shuffle; one full train/evaluate run is done per seed.
rs_list = [1, 2, 3, 4, 5]

# 1. import data
# The spreadsheet does not depend on the seed, so it is read once here instead of
# once per run. It is kept under its own name because `data` is re-bound further
# down in the loop body (the training loop iterates `for data in train_loader`).
raw_data = pd.read_excel("/content/drive/MyDrive/HeckLit/data/Heck/Heck_fp.xlsx")

for rs in rs_list:
    # Shuffle all rows (frac=1) with this run's seed and renumber them 0..n-1 so
    # that positional indices and index labels coincide below.
    random_state = rs
    data = raw_data.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset & dataloader
    rxn_dataset = list()
    # df_to_rxn_list is a project helper; it is indexed row-by-row below, so it is
    # presumably one reaction object per DataFrame row -- TODO confirm.
    rxn_list = df_to_rxn_list(data)
    len_drfp = 0

    for idx in tqdm(range(data.shape[0])):
        rxn = rxn_list[idx]

        # features
        # `.at` fetches the single cell directly; `data.loc[idx]["drfp"]` would
        # first materialise the whole row as a Series for every sample.
        drfp = torch.tensor(read_drfp(data.at[idx, "drfp"]), dtype=torch.float32)
        # Remember the fingerprint length; it is used as the model's input size.
        len_drfp = drfp.shape[0]
        # label
        # Yield divided by 100 (the plotting code multiplies by 100 again).
        y = rxn.rxn_yield / 100

        rxn_dataset.append([drfp, y])

    # report
    # One output directory per run. The timestamp is formatted explicitly so the
    # directory name contains no spaces or colons (the default str() of a datetime
    # has both); microseconds are kept so back-to-back runs cannot collide.
    run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    dir_path = "/content/drive/MyDrive/HeckLit/exp/Heck/Heck_drfp_%s" % run_stamp
    # makedirs also creates missing parent folders (e.g. a fresh "exp/Heck"),
    # where os.mkdir would raise FileNotFoundError.
    os.makedirs(dir_path, exist_ok=True)
    # The report file stays open for the whole run; it is closed after the final
    # evaluation summary has been written.
    f = open("%s/Model_Training_Report.txt" % dir_path, mode="w")

    # split of train & test set
    ratio = 0.7        # fraction of the (already shuffled) samples used for training
    batch_size = 968
    split_idx = int(ratio * len(rxn_dataset))
    train_set = rxn_dataset[:split_idx]
    # The test slice starts exactly at split_idx. Starting at split_idx + 1 would
    # silently discard the sample at split_idx from both sets.
    test_set = rxn_dataset[split_idx:]

    # data_loader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Evaluation only compares paired predictions/targets, so the test set does not
    # need shuffling; a fixed order keeps stored predictions in dataset order.
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Per-epoch metric histories (one entry appended per epoch).
    test_RMSE = list()
    test_R2 = list()
    train_R2 = list()
    train_RMSE = list()
    # Scratch buffers for predictions / targets; re-created during evaluation.
    pred = list()
    true = list()

    # 3. training of the model
    # Hyper-parameters: number of epochs and Adam learning rate.
    t = 1
    lr = 1e-4

    # Pick the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    # Network (input width = fingerprint length), optimiser and loss.
    model = ANN(input_size=len_drfp)
    opti = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    model = model.to(device)
    criterion = nn.MSELoss()

    # Record the run configuration at the top of the report.
    f.writelines([
        "params:\n",
        "random_state=%s\n" % random_state,
        "ratio=%s\n" % ratio,
        "batch_size=%s\n" % batch_size,
        "t=%s\n" % t,
        "lr=%s\n" % lr,
    ])

    # Snapshot of the best epoch so far, laid out as:
    # [train_R2, train_RMSE, test_R2, test_RMSE, test_predict, test_true]
    best = [0, 0, 0, 0, [], []]

    f.write("\nStart training\n")

    # Train for `t` epochs. After every epoch the model is scored on the train and
    # test sets, the histories are extended, the best epoch (by test R2) is
    # remembered in `best`, and one line is appended to the report.
    for epoch in tqdm(range(t)):
        # Training
        # Put layers such as dropout / batch-norm (if the model has any) into
        # training behaviour; evaluation below switches them off again.
        model.train()
        global_loss = 0.0

        for features, target in train_loader:
            x = features.to(device)
            # Targets are collated as a 1-D batch; add a trailing axis so they are
            # passed to the loss as a column, one value per sample.
            y = target.unsqueeze(1).to(device)
            # Call the module itself (not .forward) so registered hooks also run.
            loss = criterion(model(x).float(), y.float())
            opti.zero_grad()
            loss.backward()
            opti.step()
            global_loss += loss.item()

        # Each `loss` is already the mean over its batch, so the epoch loss is the
        # average over the number of batches (not over `batch_size`).
        mean_loss = global_loss / max(len(train_loader), 1)

        # record of loss during training
        # performance in train set, then in test set
        model.eval()
        with torch.no_grad():
            for loader, rmse_history, r2_history in (
                (train_loader, train_RMSE, train_R2),
                (test_loader, test_RMSE, test_R2),
            ):
                pred = list()
                true = list()
                for features, target in loader:
                    # extend() adds one (1,)-shaped array per sample, matching the
                    # layout the `best` snapshot and the plotting code expect.
                    pred.extend(model(features.to(device)).cpu().numpy())
                    true.extend(target.unsqueeze(1).numpy())
                rmse_history.append(RMSE(np.array(pred), np.array(true)))
                r2_history.append(R2(np.array(pred), np.array(true)))

        # At this point `pred` / `true` hold the test-set values (last loader above).
        if epoch == 0 or test_R2[-1] >= best[2]:
            best = [train_R2[-1], train_RMSE[-1], test_R2[-1], test_RMSE[-1], pred, true]

        # write report
        f.write("Epoch:%d loss: %f, R2:train set %.3f\ttest set %.3f\n" % (
            epoch + 1, mean_loss, train_R2[-1], test_R2[-1]))

    # 4.Evaluation
    # Each entry: (line template, per-epoch metric history, value at the best epoch).
    # Mean and standard deviation are taken over the last 10 recorded epochs.
    summary_specs = [
        ("R2 of train set is:%.3f+-%f\tbest:%f\n", train_R2, best[0]),
        ("RMSE of train set is: %.3f+-%f\tbest:%f\n", train_RMSE, best[1]),
        ("R2 of test set is:%.3f+-%.3f\tbest:%f\n", test_R2, best[2]),
        ("RMSE of test set is: %.3f+-%f\tbest:%f\n", test_RMSE, best[3]),
    ]
    summary_lines = []
    for template, history, best_value in summary_specs:
        recent = np.array(history[-10:])
        summary_lines.append(template % (recent.mean(), recent.std(), best_value))

    # Append the summary to the report, then release the file handle.
    f.write("\n")
    for line in summary_lines:
        f.write(line)
    f.close()

    # Echo the same summary to the notebook output.
    for line in summary_lines:
        print(line)

    # 5.Figure
    # Two-panel summary of the run: R2 history (left) and a predicted-vs-observed
    # plot for the best epoch on the test set (right). Saved next to the report.
    import matplotlib.pyplot as plt

    # Exactly two panels are drawn, so use a 1x2 grid; the figure width is scaled
    # to keep each panel the size it had in a three-column layout.
    fig, (ax_history, ax_parity) = plt.subplots(1, 2, dpi=500, figsize=(16, 7))

    # Training Fig
    steps = np.arange(1, t + 1)  # epoch numbers 1..t
    ax_history.plot(steps, train_R2, color=[236 / 255, 164 / 255, 124 / 255],
                    label="train set R$^2$")
    ax_history.plot(steps, test_R2, color=[117 / 255, 157 / 255, 219 / 255],
                    label="test set R$^2$")
    # Beautify
    ax_history.legend(loc="upper left", prop={'size': 10})
    ax_history.set_xlabel("Epoch", fontsize=10)
    ax_history.set_ylabel("R$^2$ value", fontsize=10)
    ax_history.set_title("The R$^2$ value during training", fontsize=13)

    # Test set performance
    # best[5] / best[4] are the stored test targets / predictions of the best
    # epoch; multiply by 100 to undo the /100 scaling applied to the labels.
    tr = np.array(best[5]).flatten() * 100
    pr = np.array(best[4]).flatten() * 100
    ax_parity.scatter(pr, tr, alpha=0.7, marker=".")
    ax_parity.set_xlabel("Predicted Yield", fontsize=10)
    ax_parity.set_ylabel("Observed Yield", fontsize=10)
    # Identity line: points on it are perfect predictions.
    ax_parity.plot([0, 100], [0, 100], linestyle="--", color="r")
    ax_parity.set_title("Test set performance", fontsize=15)

    fig.suptitle("DRFP for Heck dataset", fontsize=16)
    fig.tight_layout()
    fig.savefig("%s/Performance_Figure.png" % dir_path)
    plt.show()
    # One large figure is created per run; close it explicitly so figures do not
    # accumulate in memory across the runs of the outer loop.
    plt.close(fig)