In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install rdkit

In [None]:
import os
import sys
import torch
import time
from tqdm import tqdm
import datetime
sys.path.append("/content/drive/MyDrive/HeckLit")
from utils.rxn import *
from utils.molecule import *
from models.ANN import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Train and evaluate a random-forest yield regressor on DRFP features for every
# spreadsheet found in `subset_path`, repeating each one with several shuffle seeds.
# For each (file, seed) pair a timestamped directory is created that receives a
# text report (params, train/test R2 and RMSE) and a predicted-vs-observed plot.
from sklearn.ensemble import RandomForestRegressor

subset_path = "/content/drive/MyDrive/HeckLit/data/Heck/Subsets"
rs_list = [1, 2, 3, 4, 5]
# Sorted so that the processing order is the same on every run.
for file in sorted(os.listdir(subset_path)):
  # Name without extension, used in directory names, log lines and figure names.
  subset_name = os.path.splitext(file)[0]

  # 1. import data
  # Read once per file: only the shuffle below depends on the seed.
  raw_data = pd.read_excel(os.path.join(subset_path, file))
  if raw_data.shape[0] == 0:
    continue

  for rs in rs_list:
    random_state = rs
    # Shuffle all rows reproducibly; the train/test split below is positional.
    data = raw_data.sample(random_state=random_state, frac=1).reset_index(drop=True)

    # 2. build dataset
    drfp_set = list()
    yield_set = list()
    rxn_list = df_to_rxn_list(data)

    for idx in tqdm(range(data.shape[0])):
      rxn = rxn_list[idx]

      # features
      drfp = read_drfp(data.loc[idx, "drfp"])
      # label: yield scaled by 1/100 (scaled back by 100 for plotting below)
      y = np.array(rxn.rxn_yield / 100)

      drfp_set.append(drfp)
      yield_set.append(y)

    # split of train & test set
    ratio = 0.8

    # drfp
    batch = len(drfp_set)
    split_idx = int(ratio * batch)
    # The test slice starts exactly at split_idx so that no sample is dropped
    # between the two sets (the previous `+ 1` discarded one sample).
    drfp_trainset = [drfp_set[:split_idx], yield_set[:split_idx]]
    drfp_testset = [drfp_set[split_idx:], yield_set[split_idx:]]

    # report
    dir_path = "/content/drive/MyDrive/HeckLit/exp/Heck_split/Heck_%s（rs=%s）_%s" % (subset_name, rs, datetime.datetime.now())
    # makedirs also creates the parent experiment directory if it is missing.
    os.makedirs(dir_path, exist_ok=True)

    # The context manager closes the report even if training or evaluation raises.
    with open("%s/Model_Training_Report.txt" % dir_path, mode="w") as f:
      # record
      f.write("params:\n")
      f.write("random_state=%s\n" % random_state)
      f.write("ratio=%s\n" % ratio)
      f.write("\n")

      # 3.Machine Learning methods
      # RandomForest
      # Train
      print("RandomForest Start Training on %s（rs=%s）" % (subset_name, rs))
      start = time.time()

      drfp_rf = RandomForestRegressor(n_estimators=500, random_state=0)
      drfp_rf.fit(drfp_trainset[0], drfp_trainset[1])

      print("Finish Training")
      print("Training Time: %.2f s" % (time.time() - start))  # elapsed training time

      # Eval
      # Predict once per set and reuse the result for both metrics and the plot.
      train_true = np.array(drfp_trainset[1])
      test_true = np.array(drfp_testset[1])
      train_pred = drfp_rf.predict(drfp_trainset[0])
      test_pred = drfp_rf.predict(drfp_testset[0])

      # trainset
      drfp_train_R2 = R2(train_pred, train_true)
      drfp_train_RMSE = RMSE(train_pred, train_true)

      # testset
      drfp_test_R2 = R2(test_pred, test_true)
      drfp_test_RMSE = RMSE(test_pred, test_true)

      # Record
      f.write("RandomForest:\n")
      f.write("params:\n")
      f.write("RF:%s\n" % drfp_rf.n_estimators)

      f.write("train_R2=%s\n" % drfp_train_R2)
      f.write("train_RMSE=%s\n" % drfp_train_RMSE)
      f.write("test_R2=%s\n" % drfp_test_R2)
      f.write("test_RMSE=%s\n" % drfp_test_RMSE)
      f.write("\n")

    print("RandomForest:")
    print("params:")
    print("RF:%s" % drfp_rf.n_estimators)

    print("train_R2=%s" % drfp_train_R2)
    print("train_RMSE=%s" % drfp_train_RMSE)
    print("test_R2=%s" % drfp_test_R2)
    print("test_RMSE=%s" % drfp_test_RMSE)

    # RF Figure
    # DRFP: predicted vs. observed values on the test set, scaled by 100.
    fig = plt.figure(dpi=120, figsize=(8, 8))
    # Test set performance
    tr = test_true.flatten() * 100
    pr = test_pred.flatten() * 100
    plt.scatter(pr, tr, alpha=0.7, marker=".")
    plt.xlabel("Predicted Yield", fontsize=10)
    plt.ylabel("Observed Yield", fontsize=10)
    # Dashed red diagonal marks perfect prediction (predicted == observed).
    diagonal = np.linspace(0, 100, 100)
    plt.plot(diagonal, diagonal, linestyle="--", color="r")
    plt.title("Test set performance", fontsize=15)

    plt.tight_layout()
    plt.savefig("%s/RF on %s(rs=%s).png" % (dir_path, subset_name, rs))
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)
