In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local, evaluate_performance

In [2]:
cm = np.genfromtxt("../data/coronene_training_data/CM.csv", delimiter=',')
bob = np.genfromtxt("../data/coronene_training_data/BOB.csv", delimiter=',')
# fchl = np.genfromtxt("../data/coronene_training_data/FCHL.csv", delimiter=',').reshape((2400, 36, 496))
mbdf = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',').reshape((2400, 36, 6))

delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_total_energy.csv').to_numpy()
delta_delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_delta_total_energy.csv').to_numpy()

In [3]:
coronene_energy_raw_data = np.load("../data/coronene_raw_data/coronene_mutants_pbe0_pcx2.npz", allow_pickle=True)
charges = coronene_energy_raw_data['charges']

# Model #

## CM and BOB ##

In [3]:
X_train = cm
y_train = delta_delta_total_energy

params = {'alpha': 0.00011806857533747571, 'gamma': 0.00010582969605609149, 'kernel': 'rbf'}
model = KernelRidge(**params)

neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
mae_scores = -neg_mae_scores
mean_mae_score = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {mean_mae_score}")

fold 0: mae = 0.34258307008460215
fold 1: mae = 0.35536693507702666
Average mae: 0.3489750025808144


In [15]:
X_train = bob
y_train = delta_delta_total_energy

params = {'alpha': 0.00011806857533747571, 'gamma': 0.00010582969605609149, 'kernel': 'rbf'}
model = KernelRidge(**params)

neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
mae_scores = -neg_mae_scores
mean_mae_score = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {mean_mae_score}")

fold 0: mae = 0.12449099767892426
fold 1: mae = 0.13019453367148603
Average mae: 0.12734276567520514


## MBDF ##

In [13]:
# lambda: regularization
# length: kernel length

X = mbdf
y = delta_delta_total_energy

params = {'lambda': 0.00011806857533747571, 'length': 0.01}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.1105460352223525
fold 2: mae = 0.10640692068681594
Average mae: 0.10847647795458422


# Optimization #

## CM ##

In [12]:
X_train = cm
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("CM")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 50/50 [00:25<00:00,  1.94trial/s, best loss: 0.27463351776855316]
CM
Best hyperparameters: {'alpha': 1.170259798512603e-13, 'gamma': 1.3945814234136684e-05}
Loss: 0.27463351776855316


## BOB ##

In [8]:
X_train = bob
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("BOB")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 50/50 [00:37<00:00,  1.32trial/s, best loss: 0.11399516531218089]
BOB
Best hyperparameters: {'alpha': 0.000536679384502507, 'gamma': 7.91857911110305e-06}
Loss: 0.11399516531218089


## MBDF ##

In [18]:
X = mbdf
y = delta_delta_total_energy

def objective(params):
    mae_scores = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for fold, (train_index, test_index) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        if type(preds) is str:
            return np.inf
        score = mean_absolute_error(preds.reshape(-1, 1), y_test)
        mae_scores.append(score)
    return np.array(mae_scores).mean()

space = {
    'lambda': hp.loguniform('lambda', -30, 0), 
    'length': hp.loguniform('length', -5, 5)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 50/50 [08:16<00:00,  9.93s/trial, best loss: 0.017500326262509996]
MBDF
Best hyperparameters: {'lambda': 6.032339645975787e-11, 'length': 1.0912415337720986}
Loss: 0.017500326262509996


# Learning #

In [4]:
performance_summary = {}

## CM ##

In [17]:
X = cm
y = delta_delta_total_energy

best_params = {'alpha': 1.170259798512603e-5, 'gamma': 1.3945814234136684e-05}
model = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [2**i for i in range(6, 12)]
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size):
        warnings.filterwarnings("ignore")
        index = index + 1
        average_error, std_dev_error = evaluate_performance(model, X, y, num_training_sample, num_trials)
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['CM'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,641.396006,17.292587
2,128,573.805405,10.156346
3,256,668.469553,22.196737
4,512,965.339032,26.76441
5,1024,346.484712,5.094993
6,2048,275.474588,20.258251


## BOB ##

In [10]:
X = bob
y = delta_delta_total_energy

best_params = {'alpha': 0.000536679384502507, 'gamma': 7.91857911110305e-06, 'kernel': 'rbf'}
model = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [2**i for i in range(6, 12)]
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size):
        warnings.filterwarnings("ignore")
        index = index + 1
        average_error, std_dev_error = evaluate_performance(model, X, y, num_training_sample, num_trials)
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['BOB'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,237.41251,6.115812
2,128,175.25626,3.951001
3,256,151.710951,2.826987
4,512,138.198604,2.084982
5,1024,119.947453,0.880823
6,2048,104.376199,2.593178


## MBDF ##

In [5]:
X = mbdf
y = delta_delta_total_energy
best_params = {'lambda': 6.032339645975787e-11, 'length': 1.0912415337720986}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [2**i for i in range(6, 12)]
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size):
        warnings.filterwarnings("ignore")
        index = index + 1
        average_error, std_dev_error = evaluate_performance_local(best_params, X, y, charges, num_training_sample, num_trials)
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['CM'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,91.459782,5.688672
2,128,61.880635,3.044494
3,256,45.05742,0.727712
4,512,27.131326,1.126055
5,1024,18.965722,0.476984
6,2048,15.459664,0.314807
