In [None]:
# Mount Google Drive so that the project files referenced below
# (everything under /content/drive/MyDrive/MMHRP) are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install rdkit
!pip install torch_geometric

In [None]:
import os
import sys
import torch
sys.path.append("/content/drive/MyDrive/MMHRP")
from utils.rxn import *
from utils.molecule import *
from torch_geometric.loader import DataLoader
from models.GNN_Models import *
import time
from tqdm import tqdm
import datetime
from sklearn.metrics import mean_absolute_error as MAE
import warnings
warnings.simplefilter('ignore')
import matplotlib.pyplot as plt

In [None]:
# GCN
# Trains a GCN regressor on the SNAR dataset for every seed in rs_list, writes a
# text report, a performance figure and the best model into a fresh run directory.
import copy

import matplotlib.pyplot as plt
import seaborn as sns

rs_list = [1]

for rs in rs_list:
    # 1. import data
    random_state = rs
    # Seed torch as well, so weight init and loader shuffling follow the recorded seed.
    torch.manual_seed(random_state)

    raw_df = pd.read_excel("/content/drive/MyDrive/MMHRP/data/SNAR/SNAR_data.xlsx")
    # Shuffle reproducibly; the train/test split below is purely positional.
    shuffled_df = raw_df.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset & dataloader
    # Each entry is [graph, activation_energy].
    # NOTE(review): smis_to_graph presumably returns a torch_geometric graph object - confirm.
    rxn_dataset = list()
    for idx in tqdm(range(shuffled_df.shape[0])):
        row = shuffled_df.loc[idx]
        # reactants and product
        smiles = [row["Substrate SMILES"], row["Nucleophile SMILES"], row["Product SMILES"]]
        # the Solvent entry is split on "." into individual SMILES strings
        smiles = smiles + row["Solvent"].split(".")
        rxn_dataset.append([smis_to_graph(smiles), row["exp_activation_energy"]])

    # report directory (the timestamp keeps repeated runs apart)
    dir_path = "/content/drive/MyDrive/MMHRP/exp/SNAR/SNAR_GCN_rs=%s_%s" % (random_state, datetime.datetime.now())
    # makedirs also creates missing parent folders; it still raises if dir_path exists.
    os.makedirs(dir_path)

    # split of train & test set
    ratio = 0.7
    batch_size = 64
    n_samples = len(rxn_dataset)
    split_at = int(ratio * n_samples)
    train_set = rxn_dataset[:split_at]
    # The test slice starts exactly at split_at: the former "+ 1" dropped one sample.
    test_set = rxn_dataset[split_at:]
    # data_loader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

    # per-epoch metric histories
    test_RMSE = list()
    test_R2 = list()
    test_MAE = list()
    train_R2 = list()
    train_RMSE = list()
    train_MAE = list()
    pred = list()
    true = list()

    # 3. training of the model
    # params
    t = 1  # number of epochs
    lr = 1e-3
    num_feature = 8

    # use gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # model
    model = GCN(node_feature_num=num_feature,
                channels=[32, 64])
    model = model.to(device)
    opti = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    criterion = nn.MSELoss()

    # best performance so far:
    # train_R2, train_RMSE, train_MAE, test_R2, test_RMSE, test_MAE, test_predict, test_true, model
    best = [0, 0, 0, 0, 0, 0, [], [], []]

    # The context manager closes the report even if training raises.
    with open("%s/Model_Training_Report.txt" % dir_path, mode="w") as f:
        # writedown params
        f.write("params:\n")
        f.write("random_state=%s\n" % random_state)
        f.write("ratio=%s\n" % ratio)
        f.write("batch_size=%s\n" % batch_size)
        f.write("t=%s\n" % t)
        f.write("lr=%s\n" % lr)

        f.write("\nStart training\n")

        for epoch in tqdm(range(t)):
            # Training
            model.train()
            epoch_loss = 0.0
            for batch_data in train_loader:
                graphs = batch_data[0].to(device)
                y = torch.unsqueeze(batch_data[-1], dim=1).to(device)
                loss = criterion(model(graphs), y)
                opti.zero_grad()
                loss.backward()
                opti.step()
                epoch_loss += loss.item()

            # Evaluation mode, so that layers such as dropout / batch-norm (if the model
            # has any) behave deterministically while the metrics are computed.
            model.eval()
            with torch.no_grad():
                # Train set first, test set last: afterwards pred / true hold the test values.
                for loader, r2_hist, rmse_hist, mae_hist in (
                        (train_loader, train_R2, train_RMSE, train_MAE),
                        (test_loader, test_R2, test_RMSE, test_MAE)):
                    pred = list()
                    true = list()
                    for batch_data in loader:
                        graphs = batch_data[0].to(device)
                        true += list(torch.unsqueeze(batch_data[-1], dim=1).numpy())
                        pred += list(model(graphs).cpu().numpy())
                    rmse_hist.append(RMSE(np.array(pred), np.array(true)))
                    r2_hist.append(R2(np.array(pred), np.array(true)))
                    mae_hist.append(MAE(np.array(true), np.array(pred)))

            if epoch == 0 or test_R2[-1] > best[3]:
                # Snapshot the model: storing the live object would make "best" track
                # every later update and end up equal to the last epoch.
                best = [train_R2[-1], train_RMSE[-1], train_MAE[-1], test_R2[-1], test_RMSE[-1], test_MAE[-1], pred,
                        true, copy.deepcopy(model)]

            # write report (loss = mean batch loss of this epoch)
            f.write(
                "Epoch:%d loss: %f, R2:train set %.3f\ttest set %.3f\tRMSE:train set %.3f\ttest set %.3f\tMAE:train set %.3f\ttest set %.3f\n" % (
                    epoch + 1, epoch_loss / max(len(train_loader), 1), train_R2[-1], test_R2[-1], train_RMSE[-1],
                    test_RMSE[-1], train_MAE[-1], test_MAE[-1]))

        # 4.Evaluation
        # mean +- std over the last 10 epochs, plus the values of the best epoch
        summary_spec = [
            ("R2 of train set is:%.3f+-%f\tbest:%f\n", train_R2, best[0]),
            ("RMSE of train set is: %.3f+-%f\tbest:%f\n", train_RMSE, best[1]),
            ("MAE of train set is: %.3f+-%f\tbest:%f\n", train_MAE, best[2]),
            ("R2 of test set is:%.3f+-%.3f\tbest:%f\n", test_R2, best[3]),
            ("RMSE of test set is: %.3f+-%f\tbest:%f\n", test_RMSE, best[4]),
            ("MAE of test set is: %.3f+-%f\tbest:%f\n", test_MAE, best[5]),
        ]
        summary_lines = [
            fmt % (np.array(history[-10:]).mean(), np.array(history[-10:]).std(), best_value)
            for fmt, history, best_value in summary_spec
        ]
        f.write("\n")
        f.writelines(summary_lines)

    # Echo the same summary in the notebook output.
    for line in summary_lines:
        print(line)

    # 5.Figure
    fig = plt.figure(dpi=500, figsize=(10, 5))

    # Training Fig
    plt.subplot(1, 2, 1)
    steps = np.linspace(1, t, t)
    plt.plot(steps, train_R2, color=[236 / 255, 164 / 255, 124 / 255])
    plt.plot(steps, test_R2, color=[117 / 255, 157 / 255, 219 / 255])
    # Beautify
    plt.legend(["train set", "test set"], loc="upper left", prop={'size': 8})
    plt.xlabel("Epoch", fontsize=10)
    plt.ylabel("R$^2$", fontsize=10)
    plt.title("The R$^2$ of train & test set during training", fontsize=13)

    # Test set performance (predictions of the best epoch)
    plt.subplot(1, 2, 2)
    tr = np.array(best[7]).flatten()
    pr = np.array(best[6]).flatten()
    plt.scatter(pr, tr, alpha=0.7, marker=".")
    plt.xlabel("Predicted E$_a$ (kcal mol$^-$$^1$)", fontsize=10)
    plt.ylabel("Observed E$_a$ (kcal mol$^-$$^1$)", fontsize=10)
    # y = x reference line
    x = np.linspace(10, 45, 8)
    y = np.linspace(10, 45, 8)
    plt.plot(x, y, linestyle="--", color="r")
    plt.title("Test set performance", fontsize=15)
    fig.suptitle("GCN for SNAR dataset", fontsize=16)
    plt.tight_layout()
    plt.savefig("%s/Performance_Figure.png" % dir_path)
    plt.show()
    # Release the figure; otherwise one figure per seed stays alive in memory.
    plt.close(fig)

    # Persist the snapshot of the best epoch.
    torch.save(best[-1], "%s/model.pth" % dir_path)

In [None]:
# GAT
# Trains a GAT regressor on the SNAR dataset for every seed in rs_list, writes a
# text report, a performance figure and the best model into a fresh run directory.
import copy

import matplotlib.pyplot as plt
import seaborn as sns

rs_list = [1]

for rs in rs_list:
    # 1. import data
    random_state = rs
    # Seed torch as well, so weight init and loader shuffling follow the recorded seed.
    torch.manual_seed(random_state)

    raw_df = pd.read_excel("/content/drive/MyDrive/MMHRP/data/SNAR/SNAR_data.xlsx")
    # Shuffle reproducibly; the train/test split below is purely positional.
    shuffled_df = raw_df.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset & dataloader
    # Each entry is [graph, activation_energy].
    # NOTE(review): smis_to_graph presumably returns a torch_geometric graph object - confirm.
    rxn_dataset = list()
    for idx in tqdm(range(shuffled_df.shape[0])):
        row = shuffled_df.loc[idx]
        # reactants and product
        smiles = [row["Substrate SMILES"], row["Nucleophile SMILES"], row["Product SMILES"]]
        # the Solvent entry is split on "." into individual SMILES strings
        smiles = smiles + row["Solvent"].split(".")
        rxn_dataset.append([smis_to_graph(smiles), row["exp_activation_energy"]])

    # report directory (the timestamp keeps repeated runs apart)
    dir_path = "/content/drive/MyDrive/MMHRP/exp/SNAR/SNAR_GAT_rs=%s_%s" % (random_state, datetime.datetime.now())
    # makedirs also creates missing parent folders; it still raises if dir_path exists.
    os.makedirs(dir_path)

    # split of train & test set
    ratio = 0.8
    batch_size = 32
    n_samples = len(rxn_dataset)
    split_at = int(ratio * n_samples)
    train_set = rxn_dataset[:split_at]
    # The test slice starts exactly at split_at: the former "+ 1" dropped one sample.
    test_set = rxn_dataset[split_at:]
    # data_loader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

    # per-epoch metric histories
    test_RMSE = list()
    test_R2 = list()
    test_MAE = list()
    train_R2 = list()
    train_RMSE = list()
    train_MAE = list()
    pred = list()
    true = list()

    # 3. training of the model
    # params
    t = 1000  # number of epochs
    lr = 1e-5
    num_feature = 8

    # use gpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # model
    model = GAT(node_feature_num=num_feature,
                channels=[32, 64],
                heads=4)
    model = model.to(device)
    opti = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    criterion = nn.MSELoss()

    # best performance so far:
    # train_R2, train_RMSE, train_MAE, test_R2, test_RMSE, test_MAE, test_predict, test_true, model
    best = [0, 0, 0, 0, 0, 0, [], [], []]

    # The context manager closes the report even if training raises.
    with open("%s/Model_Training_Report.txt" % dir_path, mode="w") as f:
        # writedown params
        f.write("params:\n")
        f.write("random_state=%s\n" % random_state)
        f.write("ratio=%s\n" % ratio)
        f.write("batch_size=%s\n" % batch_size)
        f.write("t=%s\n" % t)
        f.write("lr=%s\n" % lr)

        f.write("\nStart training\n")

        for epoch in tqdm(range(t)):
            # Training
            model.train()
            epoch_loss = 0.0
            for batch_data in train_loader:
                graphs = batch_data[0].to(device)
                y = torch.unsqueeze(batch_data[-1], dim=1).to(device)
                loss = criterion(model(graphs), y)
                opti.zero_grad()
                loss.backward()
                opti.step()
                epoch_loss += loss.item()

            # Evaluation mode, so that layers such as dropout / batch-norm (if the model
            # has any) behave deterministically while the metrics are computed.
            model.eval()
            with torch.no_grad():
                # Train set first, test set last: afterwards pred / true hold the test values.
                for loader, r2_hist, rmse_hist, mae_hist in (
                        (train_loader, train_R2, train_RMSE, train_MAE),
                        (test_loader, test_R2, test_RMSE, test_MAE)):
                    pred = list()
                    true = list()
                    for batch_data in loader:
                        graphs = batch_data[0].to(device)
                        true += list(torch.unsqueeze(batch_data[-1], dim=1).numpy())
                        pred += list(model(graphs).cpu().numpy())
                    rmse_hist.append(RMSE(np.array(pred), np.array(true)))
                    r2_hist.append(R2(np.array(pred), np.array(true)))
                    mae_hist.append(MAE(np.array(true), np.array(pred)))

            if epoch == 0 or test_R2[-1] > best[3]:
                # Snapshot the model: storing the live object would make "best" track
                # every later update and end up equal to the last epoch.
                best = [train_R2[-1], train_RMSE[-1], train_MAE[-1], test_R2[-1], test_RMSE[-1], test_MAE[-1], pred,
                        true, copy.deepcopy(model)]

            # write report (loss = mean batch loss of this epoch)
            f.write(
                "Epoch:%d loss: %f, R2:train set %.3f\ttest set %.3f\tRMSE:train set %.3f\ttest set %.3f\tMAE:train set %.3f\ttest set %.3f\n" % (
                    epoch + 1, epoch_loss / max(len(train_loader), 1), train_R2[-1], test_R2[-1], train_RMSE[-1],
                    test_RMSE[-1], train_MAE[-1], test_MAE[-1]))

        # 4.Evaluation
        # mean +- std over the last 10 epochs, plus the values of the best epoch
        summary_spec = [
            ("R2 of train set is:%.3f+-%f\tbest:%f\n", train_R2, best[0]),
            ("RMSE of train set is: %.3f+-%f\tbest:%f\n", train_RMSE, best[1]),
            ("MAE of train set is: %.3f+-%f\tbest:%f\n", train_MAE, best[2]),
            ("R2 of test set is:%.3f+-%.3f\tbest:%f\n", test_R2, best[3]),
            ("RMSE of test set is: %.3f+-%f\tbest:%f\n", test_RMSE, best[4]),
            ("MAE of test set is: %.3f+-%f\tbest:%f\n", test_MAE, best[5]),
        ]
        summary_lines = [
            fmt % (np.array(history[-10:]).mean(), np.array(history[-10:]).std(), best_value)
            for fmt, history, best_value in summary_spec
        ]
        f.write("\n")
        f.writelines(summary_lines)

    # Echo the same summary in the notebook output.
    for line in summary_lines:
        print(line)

    # 5.Figure
    fig = plt.figure(dpi=500, figsize=(10, 5))

    # Training Fig
    plt.subplot(1, 2, 1)
    steps = np.linspace(1, t, t)
    plt.plot(steps, train_R2, color=[236 / 255, 164 / 255, 124 / 255])
    plt.plot(steps, test_R2, color=[117 / 255, 157 / 255, 219 / 255])
    # Beautify
    plt.legend(["train set", "test set"], loc="upper left", prop={'size': 8})
    plt.xlabel("Epoch", fontsize=10)
    plt.ylabel("R$^2$", fontsize=10)
    plt.title("The R$^2$ of train & test set during training", fontsize=13)

    # Test set performance (predictions of the best epoch)
    plt.subplot(1, 2, 2)
    tr = np.array(best[7]).flatten()
    pr = np.array(best[6]).flatten()
    plt.scatter(pr, tr, alpha=0.7, marker=".")
    plt.xlabel("Predicted E$_a$ (kcal mol$^-$$^1$)", fontsize=10)
    plt.ylabel("Observed E$_a$ (kcal mol$^-$$^1$)", fontsize=10)
    # y = x reference line
    x = np.linspace(10, 45, 8)
    y = np.linspace(10, 45, 8)
    plt.plot(x, y, linestyle="--", color="r")
    plt.title("Test set performance", fontsize=15)
    fig.suptitle("GAT for SNAR dataset", fontsize=16)
    plt.tight_layout()
    plt.savefig("%s/Performance_Figure.png" % dir_path)
    plt.show()
    # Release the figure; otherwise one figure per seed stays alive in memory.
    plt.close(fig)

    # Persist the snapshot of the best epoch.
    torch.save(best[-1], "%s/model.pth" % dir_path)

In [None]:
# Random forest and XGBoost baselines on the RXNFP / DRFP reaction fingerprints.
# One report directory is written per seed in rs_list.
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


def _fp_scores(model, features, targets):
    """Return (R2, RMSE, MAE) of a fitted ``model`` on one split, predicting only once."""
    predicted = model.predict(features)
    observed = np.array(targets)
    return (R2(predicted, observed),
            RMSE(predicted, observed),
            MAE(y_true=observed, y_pred=predicted))


def _write_fp_scores(report, prefix, train_scores, test_scores):
    """Write the '<prefix>:' header followed by the train and test R2 / RMSE / MAE lines."""
    report.write("%s:\n" % prefix)
    for split, scores in (("train", train_scores), ("test", test_scores)):
        for metric, value in zip(("R2", "RMSE", "MAE"), scores):
            report.write("%s_%s_%s=%s\n" % (prefix, split, metric, value))


rs_list = [1, 2, 3, 4, 5]
for rs in rs_list:
    # 1. import data
    random_state = rs
    fp_df = pd.read_excel("/content/drive/MyDrive/MMHRP/data/SNAR/SNAR_fp.xlsx")
    # Shuffle reproducibly; the train/test split below is purely positional.
    fp_df = fp_df.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset
    rxnfp_set = list()
    drfp_set = list()
    yield_set = list()  # regression targets (exp_activation_energy)

    for idx in tqdm(range(fp_df.shape[0])):
        row = fp_df.loc[idx]
        # features
        rxnfp_set.append(read_rxnfp(row["rxnfp"]))
        drfp_set.append(read_drfp(row["drfp"]))
        # label
        yield_set.append(float(row["exp_activation_energy"]))

    # Fingerprint lengths, taken from the last row (0 when the sheet is empty).
    len_rxnfp = rxnfp_set[-1].shape[0] if rxnfp_set else 0
    len_drfp = drfp_set[-1].shape[0] if drfp_set else 0

    # split of train & test set
    ratio = 0.8
    split_at = int(ratio * len(yield_set))
    # The test slices start exactly at split_at: the former "+ 1" dropped one sample.
    # rxnfp
    rxnfp_trainset = [rxnfp_set[:split_at], yield_set[:split_at]]
    rxnfp_testset = [rxnfp_set[split_at:], yield_set[split_at:]]
    # drfp
    drfp_trainset = [drfp_set[:split_at], yield_set[:split_at]]
    drfp_testset = [drfp_set[split_at:], yield_set[split_at:]]

    # report directory (the timestamp keeps repeated runs apart)
    dir_path = "/content/drive/MyDrive/MMHRP/exp/SNAR/SNAR_RXNFP&DRFP_rs=%s_%s" % (random_state, datetime.datetime.now())
    # makedirs also creates missing parent folders; it still raises if dir_path exists.
    os.makedirs(dir_path)

    # The context manager closes the report even if fitting raises.
    with open("%s/Model_Training_Report.txt" % dir_path, mode="w") as f:
        # record
        f.write("params:\n")
        f.write("random_state=%s\n" % random_state)
        f.write("ratio=%s\n" % ratio)
        f.write("length of RXNFP=%s\n" % len_rxnfp)
        f.write("length of DRFP=%s\n" % len_drfp)
        f.write("\n")

        # Machine Learning methods
        # RandomForest
        print("RandomForest Start Training")
        start = time.time()

        rxnfp_rf = RandomForestRegressor(n_estimators=150, random_state=0)
        drfp_rf = RandomForestRegressor(n_estimators=150, random_state=0)
        rxnfp_rf.fit(rxnfp_trainset[0], rxnfp_trainset[1])
        drfp_rf.fit(drfp_trainset[0], drfp_trainset[1])

        print("Finish Training")
        print("Training Time: %.2f s" % (time.time() - start))  # elapsed time

        # Record
        f.write("RandomForest:\n")
        f.write("params:\n")
        f.write("rxnfp RF:%s\n" % rxnfp_rf.n_estimators)
        f.write("drfp RF:%s\n" % drfp_rf.n_estimators)

        _write_fp_scores(f, "rxnfp", _fp_scores(rxnfp_rf, *rxnfp_trainset), _fp_scores(rxnfp_rf, *rxnfp_testset))
        _write_fp_scores(f, "drfp", _fp_scores(drfp_rf, *drfp_trainset), _fp_scores(drfp_rf, *drfp_testset))
        f.write("\n")

        # xgboost
        print("XGBoost Start Training")
        start = time.time()

        rxnfp_xgb = XGBRegressor(n_estimators=300)
        drfp_xgb = XGBRegressor(n_estimators=300)
        rxnfp_xgb.fit(rxnfp_trainset[0], rxnfp_trainset[1])
        drfp_xgb.fit(drfp_trainset[0], drfp_trainset[1])

        print("Finish Training")
        print("Training Time: %.2f s" % (time.time() - start))  # elapsed time

        # Record
        f.write("XGBoost:\n")
        f.write("params:\n")
        f.write("rxnfp XGB:%s\n" % rxnfp_xgb.n_estimators)
        f.write("drfp XGB:%s\n" % drfp_xgb.n_estimators)

        _write_fp_scores(f, "rxnfp", _fp_scores(rxnfp_xgb, *rxnfp_trainset), _fp_scores(rxnfp_xgb, *rxnfp_testset))
        _write_fp_scores(f, "drfp", _fp_scores(drfp_xgb, *drfp_trainset), _fp_scores(drfp_xgb, *drfp_testset))

In [None]:
# Random forest and XGBoost baselines on the DFT descriptor table.
# One report directory is written per seed in rs_list.
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


def _dft_scores(model, features, targets):
    """Return (R2, RMSE, MAE) of a fitted ``model`` on one split, predicting only once."""
    predicted = model.predict(features)
    observed = np.array(targets)
    return (R2(predicted, observed),
            RMSE(predicted, observed),
            MAE(y_true=observed, y_pred=predicted))


def _write_dft_scores(report, train_scores, test_scores):
    """Write the train and test R2 / RMSE / MAE lines of one model to the report."""
    for split, scores in (("train", train_scores), ("test", test_scores)):
        for metric, value in zip(("R2", "RMSE", "MAE"), scores):
            report.write("%s_%s=%s\n" % (split, metric, value))


rs_list = [1,2,3,4,5]
for rs in rs_list:
    # 1. import data
    random_state = rs
    dft_df = pd.read_excel("/content/drive/MyDrive/MMHRP/data/SNAR/SNAR_DFT.xlsx")
    # Shuffle reproducibly; the train/test split below is purely positional.
    dft_df = dft_df.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset
    # features: every column except the last one
    # NOTE(review): this presumes "exp_activation_energy" is the last column of the
    # sheet; if it is not, the label leaks into the features - confirm against the file.
    dft_set = dft_df.iloc[:, :-1].to_numpy()
    # label
    yield_set = dft_df["exp_activation_energy"].astype(float).tolist()

    # split of train & test set
    ratio = 0.8
    split_at = int(ratio * len(dft_set))
    trainset = [dft_set[:split_at], yield_set[:split_at]]
    # The test slice starts exactly at split_at: the former "+ 1" dropped one sample.
    testset = [dft_set[split_at:], yield_set[split_at:]]

    # report directory (the timestamp keeps repeated runs apart)
    dir_path = "/content/drive/MyDrive/MMHRP/exp/SNAR/SNAR_DFT_rs=%s_%s" % (random_state, datetime.datetime.now())
    # makedirs also creates missing parent folders; it still raises if dir_path exists.
    os.makedirs(dir_path)

    # The context manager closes the report even if fitting raises.
    with open("%s/Model_Training_Report.txt" % dir_path, mode="w") as f:
        # record
        f.write("params:\n")
        f.write("random_state=%s\n" % random_state)
        f.write("ratio=%s\n" % ratio)
        f.write("\n")

        # Machine Learning methods
        # RandomForest
        print("RandomForest Start Training")
        start = time.time()

        rf = RandomForestRegressor(n_estimators=150, random_state=0)
        rf.fit(trainset[0], trainset[1])

        print("Finish Training")
        print("Training Time: %.2f s" % (time.time() - start))  # elapsed time

        # Record
        f.write("RandomForest:\n")
        f.write("params:\n")
        f.write("RF:%s\n" % rf.n_estimators)

        _write_dft_scores(f, _dft_scores(rf, *trainset), _dft_scores(rf, *testset))
        f.write("\n")

        # xgboost
        print("XGBoost Start Training")
        start = time.time()

        xgb = XGBRegressor(n_estimators=300)
        xgb.fit(trainset[0], trainset[1])

        print("Finish Training")
        print("Training Time: %.2f s" % (time.time() - start))  # elapsed time

        # Record
        f.write("XGBoost:\n")
        f.write("params:\n")
        f.write("XGB:%s\n" % xgb.n_estimators)
        _write_dft_scores(f, _dft_scores(xgb, *trainset), _dft_scores(xgb, *testset))