In [None]:
import descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import KFold, ShuffleSplit, GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.kernel_ridge import KernelRidge
import joblib
import warnings

import os
import glob
from scipy.stats import norm
import math

In [None]:
# Load the dataset and build RDKit molecules from the SMILES column.
data = pd.read_excel('initial dataset.xlsx')
data['Mols'] = data['SMILES'].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for a SMILES string it cannot parse; passing None
# on to AddHs fails with an opaque error, so report the offending rows here.
invalid_smiles = data.loc[data['Mols'].isna(), 'SMILES']
if not invalid_smiles.empty:
    raise ValueError(
        "Could not parse %d SMILES string(s) (row index -> SMILES): %s"
        % (len(invalid_smiles), invalid_smiles.to_dict())
    )

# Add explicit hydrogens before the descriptors are computed.
data['Mols'] = data['Mols'].apply(Chem.AddHs)

# Build the molecule list once and share it between the descriptor calls.
mols = list(data['Mols'])
bond_types,x_SOB = descriptors.literal_bag_of_bonds(mols)
x_ecfp = descriptors.ECFP_fingerprints(mols)
x_Estate = descriptors.truncated_Estate_fingerprints(mols)
# Combined representation: E-state columns followed by the bag-of-bonds columns.
x_combined = np.concatenate((x_Estate, x_SOB), axis=1)

# Descriptor sets evaluated downstream, in this fixed order.
x_all_list = [x_SOB, x_ecfp, x_Estate, x_combined]
# Regression target, read from the 'HOF(kJ/mol)' column.
y = data['HOF(kJ/mol)'].values

In [None]:
# Benchmark a linear-kernel SVR on every descriptor set in x_all_list.
# For each descriptor set the model is tuned and scored on 20 random 80/20
# splits; the mean and standard deviation of MAE, RMSE and R^2 over those
# splits are appended to the summary lists below (one entry per descriptor set,
# in the order of x_all_list).
MAError_train_aver_list = []
MAError_test_aver_list = []
RMSEerror_train_aver_list =[]
RMSEerror_test_aver_list =[]
r2_train_aver_list =[]
r2_test_aver_list =[]
sd1_list = []  # std of train MAE
sd2_list = []  # std of test MAE
sd3_list = []  # std of train RMSE
sd4_list = []  # std of test RMSE
sd5_list = []  # std of train R^2
sd6_list = []  # std of test R^2
# NOTE: this installs a global filter that hides all warnings from here on.
warnings.filterwarnings("ignore")

# Hyper-parameter grid searched for every split; it does not depend on the
# loop variables, so it is built once.
param_grid = {"C": np.logspace(-1, 4, 20), "epsilon": np.logspace(-2, 2, 20)}

for index_x,x in enumerate(x_all_list):
    # Per-split scores must start empty for every descriptor set. Previously
    # these lists were created once before the loop, so the statistics of each
    # later descriptor set also included the splits of all earlier ones.
    MAE_list_train = []
    MAE_list_test = []
    RMSE_list_train =[]
    RMSE_list_test =[]
    r2_list_train = []
    r2_list_test = []

    # Round-trip through the .npz archive so the file is still written to disk
    # as before. NOTE(review): confirm whether anything else reads this file;
    # if not, X = x would be enough.
    np.savez('initial dataset.npz', x_data=x ,y_data=y)
    # Context manager closes the archive handle; a separate name keeps the
    # `data` DataFrame from being overwritten by the archive object.
    with np.load('initial dataset.npz') as archive:
        X = archive['x_data']
        y = archive['y_data']

    rs = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
    for train_index, test_index in rs.split(X):
        x_train = X[train_index]
        y_train = y[train_index]
        x_test = X[test_index]
        y_test = y[test_index]

        GSmodel = GridSearchCV(SVR(kernel='linear'), cv=5, param_grid=param_grid)
        GSmodel = GSmodel.fit(x_train, y_train)
        # With the default refit=True, best_estimator_ has already been refit
        # on the whole training split, so it is used directly.
        model = GSmodel.best_estimator_
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        MAError_train = mean_absolute_error(y_train,y_train_pred)
        MAE_list_train.append(MAError_train)
        MAError_test = mean_absolute_error(y_test,y_test_pred)
        MAE_list_test.append(MAError_test)

        RMSEerror_train = math.sqrt(mean_squared_error(y_train,y_train_pred))
        RMSE_list_train.append(RMSEerror_train)
        RMSEerror_test = math.sqrt(mean_squared_error(y_test,y_test_pred))
        RMSE_list_test.append(RMSEerror_test)

        r2_train = r2_score(y_train,y_train_pred)
        r2_list_train.append(r2_train)
        r2_test = r2_score(y_test,y_test_pred)
        r2_list_test.append(r2_test)
        print(r2_test)

    # Aggregate the 20 splits of this descriptor set: mean and std per metric.
    MAError_train_aver = np.mean(MAE_list_train)
    MAError_train_aver_list.append(MAError_train_aver)
    sd1 = np.std(MAE_list_train,axis=0)
    sd1_list.append(sd1)
    MAError_test_aver = np.mean(MAE_list_test)
    MAError_test_aver_list.append(MAError_test_aver)
    sd2 = np.std(MAE_list_test,axis=0)
    sd2_list.append(sd2)

    RMSEerror_train_aver = np.mean(RMSE_list_train)
    RMSEerror_train_aver_list.append(RMSEerror_train_aver)
    sd3 = np.std(RMSE_list_train,axis=0)
    sd3_list.append(sd3)
    RMSEerror_test_aver = np.mean(RMSE_list_test)
    RMSEerror_test_aver_list.append(RMSEerror_test_aver)
    sd4 = np.std(RMSE_list_test,axis=0)
    sd4_list.append(sd4)

    r2_train_aver = np.mean(r2_list_train)
    r2_train_aver_list.append(r2_train_aver)
    sd5 = np.std(r2_list_train,axis=0)
    sd5_list.append(sd5)
    r2_test_aver = np.mean(r2_list_test)
    r2_test_aver_list.append(r2_test_aver)
    sd6 = np.std(r2_list_test,axis=0)
    sd6_list.append(sd6)

In [None]:
# Print each summary list on its own line: for every metric (MAE, RMSE, R^2)
# the train means, train std, test means and test std, in that order.
for summary in (
    MAError_train_aver_list,
    sd1_list,
    MAError_test_aver_list,
    sd2_list,
    RMSEerror_train_aver_list,
    sd3_list,
    RMSEerror_test_aver_list,
    sd4_list,
    r2_train_aver_list,
    sd5_list,
    r2_test_aver_list,
    sd6_list,
):
    print(summary)