In [2]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from helper_code.QML_KernelRidge import KRR_local
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local, evaluate_performance, evaluate_performance_global

In [3]:
# Load the flat CSE_local table, then reshape it into 2400 blocks of 24 x 24.
cse_local_flat = np.genfromtxt(
    "../data/coronene_training_data/CSE_local.csv",
    delimiter=',',
)
CSE_local = cse_local_flat.reshape((2400, 24, 24))

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
coronene_energy_raw_data = np.load(
    "../data/coronene_raw_data/coronene_mutants_pbe0_pcx2.npz",
    allow_pickle=True,
)
# Keep only the first 24 columns of the stored 'charges' array.
charges = coronene_energy_raw_data['charges'][:, :24]

# Regression targets, converted from a DataFrame to a NumPy array.
delta_delta_total_energy = pd.read_csv(
    '../data/coronene_training_data/delta_delta_total_energy.csv'
).to_numpy()

In [4]:
# Sanity check: show the shapes of the representation and charge arrays, one per line.
print(CSE_local.shape, charges.shape, sep="\n")

(2400, 24, 24)
(2400, 24)


## Model Prototype ##

In [5]:
# Quick 2-fold cross-validated check of KRR_local with one fixed hyperparameter set.
X = CSE_local
y = delta_delta_total_energy
Q = charges

# NOTE(review): params names kernel 'rbf' while the call below passes
# kernel='Gaussian'; which of the two KRR_local honours is not visible here.
# Confirm against helper_code.QML_KernelRidge and drop the redundant one.
params = {'lambda': 0.9572029983630527, 'length': 8.549964763049411, 'kernel': 'rbf'}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=10)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = Q[train_index], Q[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
    # The tuning cell treats a str return from KRR_local as a failed fit.
    # Surface it here with its text instead of an AttributeError on .reshape.
    if isinstance(preds, str):
        raise RuntimeError(f"KRR_local failed on fold {fold+1}: {preds}")
    # Documented argument order is (y_true, y_pred); the MAE value is symmetric.
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.3525390874230262
fold 2: mae = 0.3486069640174276
Average mae: 0.35057302572022686


## Tuning ##

In [7]:
# Short aliases read as module-level globals by objective() below.
X, y, Q = CSE_local, delta_delta_total_energy, charges

def objective(params):
    """Return the mean 2-fold cross-validated MAE of KRR_local for ``params``.

    Reads the module-level arrays ``X``, ``Q`` and ``y``; they must be
    defined before this function is called.

    Parameters
    ----------
    params : dict
        Hyperparameters forwarded unchanged to ``KRR_local`` as ``best_params``.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors, or ``np.inf`` as soon as
        ``KRR_local`` returns a string for any fold (treated as a failed fit).
    """
    # Fixed seed so every hyperparameter candidate is scored on the same splits.
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    mae_scores = []
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = Q[train_index], Q[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        # isinstance also catches str subclasses (e.g. numpy.str_), which an
        # exact type comparison would let through to the metric call.
        if isinstance(preds, str):
            return np.inf
        # Documented argument order is (y_true, y_pred); the MAE value is symmetric.
        mae_scores.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.array(mae_scores).mean()

# Both hyperparameters are drawn log-uniformly; the bounds are given in log space.
lambda_prior = hp.loguniform('lambda', -30, 0)
length_prior = hp.loguniform('length', -2, 2)
space = {'lambda': lambda_prior, 'length': length_prior}

# Trials keeps the full history of evaluated points and their losses.
trials = Trials()

# NOTE(review): fmin is called without rstate, so the search itself is not
# seeded and a re-run may land on different best values.
with warnings.catch_warnings():
    # Silence every warning raised while the search is running.
    warnings.simplefilter('ignore')
    best = fmin(
        objective,
        space,
        algo=tpe.suggest,  # tree parzen estimator
        max_evals=70,
        trials=trials,
    )

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 70/70 [20:31<00:00, 17.59s/trial, best loss: 0.18836733305467301]
Best hyperparameters: {'lambda': 0.20189699004012868, 'length': 1.9495988615657756}
Loss: 0.18836733305467301
