In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local, KRR_global
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local, evaluate_performance, evaluate_performance_global

In [2]:
# Directory holding the precomputed representations and regression targets.
TRAINING_DATA_DIR = "../data/coronene_training_data"

# Representation arrays, parsed as plain numeric CSV (genfromtxt reads every
# line as data; no header row is skipped).
cm = np.genfromtxt(f"{TRAINING_DATA_DIR}/CM.csv", delimiter=',')
bob = np.genfromtxt(f"{TRAINING_DATA_DIR}/BOB.csv", delimiter=',')

# The MBDF file is reshaped to (n_samples, 36, 6). The sample count is inferred
# with -1 instead of being hard-coded, so a dataset of another size still loads.
# presumably the axes are (molecule, atom, feature) — TODO confirm.
mbdf = np.genfromtxt(f"{TRAINING_DATA_DIR}/MBDF.csv", delimiter=',').reshape((-1, 36, 6))
mbdf_global = np.genfromtxt(f"{TRAINING_DATA_DIR}/MBDF_global.csv", delimiter=',')

# Regression targets as 2-D arrays (DataFrame.to_numpy()).
# NOTE(review): read_csv consumes the first line as a header while genfromtxt
# above does not — confirm the target files carry a header row so that rows
# stay aligned with the representations.
delta_total_energy = pd.read_csv(f"{TRAINING_DATA_DIR}/delta_total_energy.csv").to_numpy()
delta_delta_total_energy = pd.read_csv(f"{TRAINING_DATA_DIR}/delta_delta_total_energy.csv").to_numpy()

In [3]:
# Raw archive of the coronene mutants; only the 'charges' entry is extracted
# here (it is later indexed per fold and handed to the local KRR helpers).
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code — only load archives from a trusted source.
raw_archive_path = "../data/coronene_raw_data/coronene_mutants_pbe0_pcx2.npz"
coronene_energy_raw_data = np.load(raw_archive_path, allow_pickle=True)
charges = coronene_energy_raw_data['charges']

# Model #

## CM and BOB ##

In [4]:
# 2-fold cross-validated MAE of an RBF kernel ridge model on the `cm` features.
X_train, y_train = cm, delta_delta_total_energy

# NOTE(review): the hyperopt run recorded in the "Optimization" section reports
# alpha = 1.170259798512603e-13 for CM, whereas the exponent used here is e-5 —
# confirm which value is intended.
params = dict(alpha=1.170259798512603e-5, gamma=1.3945814234136684e-05, kernel='rbf')
model = KernelRidge(**params)

# The scorer returns negated MAE (higher is better), so flip the sign back.
neg_mae_scores = cross_val_score(model, X_train, y_train, cv=2, scoring='neg_mean_absolute_error')
mae_scores = np.negative(neg_mae_scores)
mean_mae_score = np.mean(mae_scores)

print("\n".join(f"fold {fold}: mae = {score}" for fold, score in enumerate(mae_scores)))

print(f"Average mae: {mean_mae_score}")

fold 0: mae = 0.2808084523387788
fold 1: mae = 0.2702182586139559
Average mae: 0.27551335547636735


In [18]:
# 2-fold cross-validated MAE of an RBF kernel ridge model on the `bob` features,
# using the hyperparameters reported by the BOB hyperopt run.
X_train = bob
y_train = delta_delta_total_energy

params = {
    'alpha': 0.000536679384502507,
    'gamma': 7.91857911110305e-06,
    'kernel': 'rbf',
}
model = KernelRidge(**params)

# sklearn reports the score as negative MAE; negate to get the error itself.
neg_mae_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=2,
)
mae_scores = np.negative(neg_mae_scores)
mean_mae_score = np.mean(mae_scores)

print("\n".join(f"fold {fold}: mae = {score}" for fold, score in enumerate(mae_scores)))
print(f"Average mae: {mean_mae_score}")

fold 0: mae = 0.11510892550103852
fold 1: mae = 0.11288140512332326
Average mae: 0.11399516531218089


## MBDF Global ##

In [13]:
# Shuffled 2-fold cross-validation of the global-MBDF KRR helper.
X = mbdf_global
y = delta_delta_total_energy

# lambda: regularization, length: kernel length
# NOTE(review): these values are identical to the ones used for the *local*
# MBDF model; the global hyperopt run recorded in the "Optimization" section
# reports lambda = 2.3061561318708763e-05, length = 7.318069540459371 —
# confirm which set is intended here.
params = {'lambda': 6.032339645975787e-11, 'length': 1.0912415337720986}
mae_scores = []

# Fixed random_state keeps the fold assignment reproducible between runs.
kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # The global model does not take per-atom charges, so `charges` is not split here.

    preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='gaussian', norm=2) # norm = L2
    # sklearn's signature is (y_true, y_pred); predictions are reshaped to a
    # column so they line up with the 2-D target array.
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.18957617377857966
fold 2: mae = 0.18537690086523664
Average mae: 0.18747653732190817


## MBDF Local ##

In [7]:
# Shuffled 2-fold cross-validation of the local-MBDF KRR helper.
# lambda: regularization
# length: kernel length

X = mbdf
y = delta_delta_total_energy

params = {'lambda': 6.032339645975787e-11, 'length': 1.0912415337720986}
mae_scores = []

# Fixed random_state keeps the fold assignment reproducible between runs.
kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # The local kernel additionally receives the charges of each sample.
    Q_train, Q_test = charges[train_index], charges[test_index]
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='mbdf')
    # sklearn's signature is (y_true, y_pred); predictions are reshaped to a
    # column so they line up with the 2-D target array.
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.04134639014627312
fold 2: mae = 0.0423741454675655
Average mae: 0.041860267806919316


# Optimization #

## CM ##

In [12]:
X_train = cm
y_train = delta_delta_total_energy

def objective(params):
    """Return the hyperopt result for one (alpha, gamma) draw.

    Fits an RBF KernelRidge with the sampled hyperparameters and scores it by
    2-fold cross-validated mean absolute error on (X_train, y_train).

    params: dict sampled from `space`; it is left unmodified.
    Returns a dict with 'loss' (mean MAE over the folds) and 'status'.
    """
    # Build a new dict instead of writing 'kernel' into the caller's dict.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer is negated MAE; flip the sign so that lower loss is better.
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# hp.loguniform bounds are given on the log scale: values lie in [exp(-30), 1].
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

# NOTE(review): fmin is called without a random state, so the best
# hyperparameters can differ from run to run.
trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("CM")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 50/50 [00:25<00:00,  1.94trial/s, best loss: 0.27463351776855316]
CM
Best hyperparameters: {'alpha': 1.170259798512603e-13, 'gamma': 1.3945814234136684e-05}
Loss: 0.27463351776855316


## BOB ##

In [8]:
X_train = bob
y_train = delta_delta_total_energy

def objective(params):
    """Return the hyperopt result for one (alpha, gamma) draw.

    Fits an RBF KernelRidge with the sampled hyperparameters and scores it by
    2-fold cross-validated mean absolute error on (X_train, y_train).

    params: dict sampled from `space`; it is left unmodified.
    Returns a dict with 'loss' (mean MAE over the folds) and 'status'.
    """
    # Build a new dict instead of writing 'kernel' into the caller's dict.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer is negated MAE; flip the sign so that lower loss is better.
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# hp.loguniform bounds are given on the log scale: values lie in [exp(-30), 1].
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

# NOTE(review): fmin is called without a random state, so the best
# hyperparameters can differ from run to run.
trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("BOB")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 50/50 [00:37<00:00,  1.32trial/s, best loss: 0.11399516531218089]
BOB
Best hyperparameters: {'alpha': 0.000536679384502507, 'gamma': 7.91857911110305e-06}
Loss: 0.11399516531218089


## MBDF Global ##

In [23]:
X = mbdf_global
y = delta_delta_total_energy

def objective(params):
    """Mean shuffled 2-fold CV MAE of KRR_global for one hyperparameter draw.

    params: dict with 'lambda' and 'length', sampled from `space`.
    Returns the mean MAE over the folds, or np.inf as soon as KRR_global
    returns a string instead of predictions (treated here as a failed fit).
    """
    mae_scores = []
    # Same seeded split for every draw, so losses are comparable across trials.
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # The global model takes no charges, so `charges` is not split here.
        preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='gaussian', norm=2)
        if isinstance(preds, str):
            return np.inf
        # sklearn's signature is (y_true, y_pred).
        mae_scores.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.mean(mae_scores)

# hp.loguniform bounds are given on the log scale:
# lambda in [exp(-30), 1], length in [exp(-5), exp(2)].
space = {
    'lambda': hp.loguniform('lambda', -30, 0), 
    'length': hp.loguniform('length', -5, 2)
}

# NOTE(review): fmin is called without a random state, so the best
# hyperparameters can differ from run to run.
trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [01:45<00:00,  1.06s/trial, best loss: 0.11815085708492293]
MBDF
Best hyperparameters: {'lambda': 2.3061561318708763e-05, 'length': 7.318069540459371}
Loss: 0.11815085708492293


## MBDF Local ##

In [21]:
X = mbdf
y = delta_delta_total_energy

def objective(params):
    """Mean shuffled 2-fold CV MAE of KRR_local for one hyperparameter draw.

    params: dict with 'lambda' and 'length', sampled from `space`.
    Returns the mean MAE over the folds, or np.inf as soon as KRR_local
    returns a string instead of predictions (treated here as a failed fit).
    """
    mae_scores = []
    # Same seeded split for every draw, so losses are comparable across trials.
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        # NOTE(review): the evaluation cell in the "Model" section calls
        # KRR_local with kernel='mbdf', while this search uses 'Gaussian' —
        # confirm both refer to the kernel that is meant to be tuned.
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        if isinstance(preds, str):
            return np.inf
        # sklearn's signature is (y_true, y_pred).
        mae_scores.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.mean(mae_scores)

# hp.loguniform bounds are given on the log scale:
# lambda in [exp(-30), 1], length in [exp(-2), exp(2)].
space = {
    'lambda': hp.loguniform('lambda', -30, 0), 
    'length': hp.loguniform('length', -2, 2)
}

# NOTE(review): fmin is called without a random state, so the best
# hyperparameters can differ from run to run.
trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=50,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [07:15<00:00,  8.70s/trial, best loss: 0.017366191436974845]
MBDF
Best hyperparameters: {'lambda': 1.4686180686107973e-10, 'length': 1.1322665811269628}
Loss: 0.017366191436974845


# Learning #

In [5]:
# Registry of learning-curve tables; the cells below store one DataFrame per
# representation under keys such as 'CM' and 'BOB'.
performance_summary = dict()

## CM ##

In [18]:
# Learning curve for kernel ridge regression on the `cm` features.
X = cm
y = delta_delta_total_energy

# NOTE(review): the hyperopt run recorded in the "Optimization" section reports
# alpha = 1.170259798512603e-13 for CM, whereas the exponent used here is e-5 —
# confirm which value is intended.
best_params = {'alpha': 1.170259798512603e-5, 'gamma': 1.3945814234136684e-05, 'kernel': 'rbf'}
model = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 12)]  # 64, 128, ..., 2048
num_trials = 5

# Collect one record per training size and build the table once at the end,
# rather than enlarging an empty DataFrame cell by cell (which leaves the
# columns with object dtype).
records = []
with warnings.catch_warnings():
    # One filter registration is enough for the whole block.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance(model, X, y, num_training_sample, num_trials)
        # Scaled by 1000 to match the mHa column headers
        # (presumably the helper reports Ha — TODO confirm).
        records.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Rows are labelled 1..n, as before.
model_performance = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))

performance_summary['CM'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,648.552735,21.061938
2,128,537.698569,5.162775
3,256,461.018088,6.458496
4,512,379.985854,4.161769
5,1024,293.843897,2.392848
6,2048,241.825781,6.267924


## BOB ##

In [7]:
# Learning curve for kernel ridge regression on the `bob` features.
X = bob
y = delta_delta_total_energy

best_params = {'alpha': 0.000536679384502507, 'gamma': 7.91857911110305e-06, 'kernel': 'rbf'}
model = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 12)]  # 64, 128, ..., 2048
num_trials = 5

# Collect one record per training size and build the table once at the end,
# rather than enlarging an empty DataFrame cell by cell (which leaves the
# columns with object dtype).
records = []
with warnings.catch_warnings():
    # One filter registration is enough for the whole block.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance(model, X, y, num_training_sample, num_trials)
        # Scaled by 1000 to match the mHa column headers
        # (presumably the helper reports Ha — TODO confirm).
        records.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Rows are labelled 1..n, as before.
model_performance = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))

performance_summary['BOB'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,237.41251,6.115812
2,128,175.25626,3.951001
3,256,151.710951,2.826987
4,512,138.198604,2.084982
5,1024,119.947453,0.880823
6,2048,104.376199,2.593178


## MBDF Global ##

In [9]:
# Learning curve for the global-MBDF KRR model, using the hyperparameters
# reported by the MBDF-global hyperopt run.
X = mbdf_global
y = delta_delta_total_energy
best_params = {'lambda': 2.3061561318708763e-05, 'length': 7.318069540459371}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 12)]  # 64, 128, ..., 2048
num_trials = 5

# Collect one record per training size and build the table once at the end,
# rather than enlarging an empty DataFrame cell by cell (which leaves the
# columns with object dtype).
records = []
with warnings.catch_warnings():
    # One filter registration is enough for the whole block.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance_global(best_params, X, y, num_training_sample, num_trials)
        # Scaled by 1000 to match the mHa column headers
        # (presumably the helper reports Ha — TODO confirm).
        records.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Rows are labelled 1..n, as before.
model_performance = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))

performance_summary['MBDF_global'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,274.402932,14.142826
2,128,198.342542,3.855854
3,256,183.233907,2.906295
4,512,164.291803,1.082699
5,1024,126.449388,1.504096
6,2048,93.926692,2.538377


## MBDF Local ##

In [10]:
# Learning curve for the local-MBDF KRR model.
X = mbdf
y = delta_delta_total_energy
# NOTE(review): the MBDF-local hyperopt output recorded in the "Optimization"
# section lists different values (lambda = 1.4686180686107973e-10,
# length = 1.1322665811269628) — confirm these are the intended ones.
best_params = {'lambda': 6.032339645975787e-11, 'length': 1.0912415337720986}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 12)]  # 64, 128, ..., 2048
num_trials = 5

# Collect one record per training size and build the table once at the end,
# rather than enlarging an empty DataFrame cell by cell (which leaves the
# columns with object dtype).
records = []
with warnings.catch_warnings():
    # One filter registration is enough for the whole block.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance_local(best_params, X, y, charges, num_training_sample, num_trials)
        # Scaled by 1000 to match the mHa column headers
        # (presumably the helper reports Ha — TODO confirm).
        records.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Rows are labelled 1..n, as before.
model_performance = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))

performance_summary['MBDF_local'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,91.459782,5.688672
2,128,61.880635,3.044494
3,256,45.05742,0.727712
4,512,27.131326,1.126055
5,1024,18.965722,0.476984
6,2048,15.459664,0.314807


In [19]:
# Reduce every stored table to its 'average MAE (mHa)' column as a plain list,
# keyed by the same name it has in performance_summary.
learning_curve_data = {
    key: value['average MAE (mHa)'].tolist()
    for key, value in performance_summary.items()
}

print(learning_curve_data)

{'CM': [648.5527354673592, 537.6985692537448, 461.0180882987335, 379.9858538857778, 293.8438973546341, 241.82578139489823], 'BOB': [237.41250993141233, 175.25626039916506, 151.71095095761646, 138.19860398430035, 119.94745348162638, 104.37619882451854], 'MBDF_local': [91.45978157923743, 61.880635092542924, 45.05741969642941, 27.131326474704487, 18.96572152685743, 15.459664235069118], 'MBDF_global': [274.40293174604693, 198.34254220451777, 183.23390683681885, 164.29180299704285, 126.44938842117904, 93.92669161271573]}
