In [1]:
import qml
from qml.kernels import get_local_kernel_mbdf

In [3]:
from qml.kernels import extended_gaussian_kernel

In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local, KRR_global
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local, evaluate_performance, evaluate_performance_global

In [4]:
mbdf = np.genfromtxt("../data/benzene_training_data/MBDF.csv", delimiter=',').reshape((17, 12, 6))
fchl = np.genfromtxt("../data/benzene_training_data/FCHL.csv", delimiter=',').reshape((17, 12, 496))

delta_delta_total_energy = pd.read_csv('../data/benzene_training_data/DD_e_tot (kcal).csv')

In [6]:
coronene_energy_raw_data = np.load("../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_correct.npz", allow_pickle=True)
charges = coronene_energy_raw_data['charges']

## MBDF ##

In [14]:
X = mbdf
y = delta_delta_total_energy.to_numpy()

params = {'lambda': 0.0049859234427626995, 'length': 3.911298728345607}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='mbdf')
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 11.299720035110177
fold 2: mae = 12.217661858075168
Average mae: 11.758690946592672


In [13]:
X = mbdf
y = delta_delta_total_energy.to_numpy()

def objective(params):
    mae_scores = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for fold, (train_index, test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        if type(preds) is str:
            return np.inf
        score = mean_absolute_error(preds.reshape(-1, 1), y_test)
        mae_scores.append(score)
    return np.array(mae_scores).mean()

space = {
    'lambda': hp.loguniform('lambda', -30, 0), 
    'length': hp.loguniform('length', -2, 2)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [00:02<00:00, 126.56trial/s, best loss: 9.032140056099664]
MBDF
Best hyperparameters: {'lambda': 0.0049859234427626995, 'length': 3.911298728345607}
Loss: 9.032140056099664


## FCHL ##

In [17]:
X = fchl
y = delta_delta_total_energy.to_numpy()

params = {'lambda': 0.9945183493992734, 'length': 0.17323407732710847}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='mbdf')
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 9.567979314359945
fold 2: mae = 11.519040358107839
Average mae: 10.543509836233891


In [16]:
X = fchl
y = delta_delta_total_energy.to_numpy()

def objective(params):
    mae_scores = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for fold, (train_index, test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        if type(preds) is str:
            return np.inf
        score = mean_absolute_error(preds.reshape(-1, 1), y_test)
        mae_scores.append(score)
    return np.array(mae_scores).mean()

space = {
    'lambda': hp.loguniform('lambda', -30, 0), 
    'length': hp.loguniform('length', -2, 2)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [00:03<00:00, 79.13trial/s, best loss: 10.627605788123503]
MBDF
Best hyperparameters: {'lambda': 0.9945183493992734, 'length': 0.17323407732710847}
Loss: 10.627605788123503
