In [2]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local, KRR_global
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local, evaluate_performance, evaluate_performance_global

## Get data ##

In [48]:
# Read the descriptor table from CSV as a NumPy array, scaled down by a factor of 10.
CSE = pd.read_csv(
    "../data/benzene_training_data/[Benz] ANM_X_lexi_square_eig.csv"
).to_numpy() / 10
# Basis matrix stored as plain text.
Q = np.loadtxt("CCS_basis/ANM_basis.txt")
# Report both array shapes, one per line.
print(CSE.shape, Q.shape, sep="\n")


(17, 6)
(6, 6)


In [49]:
# Build the per-row local features by broadcasting each CSE row against the
# rows of Q:  CSE_local[i, j, k] = CSE[i, 0, k] * Q[j, k].
#
# The broadcast axis is inserted only while CSE is still 2-D. Without this
# guard the cell is not idempotent: every re-run would add another axis to CSE
# and silently change the shape of CSE_local.
if CSE.ndim == 2:
    CSE = CSE[:, np.newaxis, :]
CSE_local = CSE * Q
print(CSE_local.shape)

(17, 6, 6)


In [62]:
# Raw archive for the benzene data set.
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary objects stored
# in the archive -- only load files from a trusted source.
raw_data = np.load(
    "../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_opt.npz",
    allow_pickle=True,
)
# Keep only the first six columns of the stored 'charges' array.
charges = raw_data['charges'][:, :6]

# Target values, read from CSV into a NumPy array.
energy = pd.read_csv(
    "../data/benzene_training_data/[Benz] y_delta_delta_ANM.csv"
).values

In [63]:
# Inspect the sliced `charges` array (one row per sample, six columns).
print(charges)

[[7 5 6 6 6 6]
 [7 6 5 6 6 6]
 [7 6 6 5 6 6]
 [7 7 5 5 6 6]
 [7 7 5 6 5 6]
 [7 7 5 6 6 5]
 [7 7 6 5 5 6]
 [7 5 7 6 6 5]
 [7 5 7 6 5 6]
 [7 6 7 5 5 6]
 [7 6 7 5 6 5]
 [5 7 5 6 7 6]
 [7 6 6 7 5 5]
 [5 7 6 5 7 6]
 [7 7 7 5 5 5]
 [7 7 5 7 5 5]
 [7 5 7 5 7 5]]


In [66]:
# Sanity check: confirm the shape of CSE_local before it is used as the model input.
print(CSE_local.shape)

(17, 6, 6)


## Train Model ##

In [81]:
# Evaluate KRR_local with a fixed hyperparameter set using 2-fold cross-validation.
#   params['lambda']: regularization
#   params['length']: kernel length

X = CSE_local
y = energy
# `charges` is indexed directly below instead of being bound to `Q`: `Q` already
# holds the basis matrix used to build CSE_local, and overwriting it makes a
# re-run of that earlier cell silently produce a wrong feature tensor.

params = {'lambda': 0.9572029983630527, 'length': 8.549964763049411, 'kernel': 'rbf'}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=10)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='gaussian')
    if isinstance(preds, str):
        # KRR_local appears to signal failure by returning a string rather than
        # an array; surface that message instead of failing later on `.reshape`.
        raise RuntimeError(f"KRR_local failed on fold {fold}: {preds}")
    # Documented argument order is (y_true, y_pred); MAE itself is symmetric.
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.07036990606332455
fold 2: mae = 0.061044370849836826
Average mae: 0.06570713845658069


## Tuning ##

In [74]:
X = CSE_local
y = energy
# `charges` is used directly inside `objective` instead of being bound to `Q`:
# `Q` already holds the basis matrix used to build CSE_local, and overwriting it
# makes a re-run of that earlier cell silently produce a wrong feature tensor.


def objective(params):
    """Return the mean 2-fold cross-validated MAE of KRR_local for one hyperparameter draw.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled from `space` (keys 'lambda' and 'length'),
        forwarded unchanged to KRR_local as `best_params`.

    Returns
    -------
    float
        Mean absolute error averaged over the folds, or ``np.inf`` when
        KRR_local returns a string instead of predictions, so that such a
        draw is never selected as the best trial.
    """
    fold_maes = []
    # Fixed random_state: every trial is scored on the same two splits.
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='gaussian')
        if isinstance(preds, str):
            return np.inf
        # Documented argument order is (y_true, y_pred); MAE itself is symmetric.
        fold_maes.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.mean(fold_maes)


# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
# 'lambda' ranges over [e^-30, 1] and 'length' over [e^-2, e^2] (about 0.135 to 7.39).
# NOTE(review): a best 'length' close to 7.39 sits on the upper edge of this
# range -- consider widening it.
space = {
    'lambda': hp.loguniform('lambda', -30, 0),
    'length': hp.loguniform('length', -2, 2)
}

trials = Trials()
# NOTE(review): no `rstate` is passed to fmin, so the search is not
# reproducible from run to run.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=500,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  2%|▏         | 9/500 [00:00<00:05, 86.07trial/s, best loss: 0.09885241818343252]

100%|██████████| 500/500 [00:04<00:00, 124.23trial/s, best loss: 0.08685913833692871]
Best hyperparameters: {'lambda': 0.6764886969726669, 'length': 7.3844636120610465}
Loss: 0.08685913833692871
