In [20]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local, KRR_global, GridSearchCV, GridSearchCV_local
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from helper_code.util import evaluate_performance_local, evaluate_performance, evaluate_performance_global

In [2]:
# CM / BOB inputs are currently disabled.
# cm = np.genfromtxt("../data/coronene_training_data/CM.csv", delimiter=',')
# bob = np.genfromtxt("../data/coronene_training_data/BOB.csv", delimiter=',')

# MBDF.csv is read as a flat 2-D table and then reshaped to (n_rows, 36, 6);
# n_rows is inferred from the file length.
mbdf_flat = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',')
mbdf = mbdf_flat.reshape((-1, 36, 6))
df = np.genfromtxt("../data/coronene_training_data/DF.csv", delimiter=',')

# Regression targets.
# NOTE(review): pd.read_csv treats the first line as a header by default while
# np.genfromtxt above keeps every line as data -- confirm the target files really
# have a header row, otherwise features and targets are shifted by one row.
D_etot = pd.read_csv('../data/coronene_training_data/delta_total_energy.csv').to_numpy()
DD_etot = pd.read_csv('../data/coronene_training_data/DD_e_tot.csv').to_numpy()

In [3]:
# Open the raw .npz archive and pull out its 'charges' entry; the local-model cells
# below index `charges` with the same row indices as the features and targets.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only load archives from a trusted source.
coronene_energy_raw_data = np.load("../data/coronene_raw_data/coronene_mutants_pbe_pcx2_corrected3.npz", allow_pickle=True)
charges = coronene_energy_raw_data['charges']

# Model #

## MBDF Global ##

### DD_etot ###

In [23]:
# Nested cross-validation for the global model on the DF features:
# the outer 2-fold split measures the test MAE, and for each outer fold the
# project's GridSearchCV helper (cv=4) picks hyperparameters using the outer
# training half only.
X = df
y = DD_etot
mae_scores = []

# Candidate values handed to the grid search.
param_grid = {
    'lambda': [1e-3, 1e-6, 1e-9, 1e-12],
    'length': [10**i for i in range(-2, 4)],
}

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # The atomic charges are not split here: KRR_global does not take them.

    # best_params is overwritten on every fold; after the loop it holds the
    # selection made on the LAST outer fold only.
    best_params = GridSearchCV(X_train, y_train, param_grid, cv=4)
    preds = KRR_global(X_train, y_train, X_test, best_params=best_params, kernel='gaussian', norm=2)  # norm = L2

    # mean_absolute_error is documented as (y_true, y_pred).
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.09511458513863061
fold 2: mae = 0.11314163461048717
Average mae: 0.1041281098745589


In [24]:
# best_params is reassigned on every outer fold of the grid-search cell, so this
# shows the hyperparameters selected on the last fold only.
print(best_params)

{'mae': 0.11600178809628667, 'lambda': 0.001, 'length': 1000}


In [9]:
# Re-evaluate the global model with the hyperparameters reported by the hyperopt
# tuning cell, on the same 2-fold split (random_state=42) that the tuning objective uses.
X = df
y = DD_etot
params = {'lambda': 0.9971429677217133, 'length': 7.1730768859490714}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # The atomic charges are not split here: KRR_global does not take them.

    preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='gaussian', norm=2)  # norm = L2

    # mean_absolute_error is documented as (y_true, y_pred).
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.10955891518598214
fold 2: mae = 0.10749169273507639
Average mae: 0.10852530396052926


In [8]:
# Tuning

X = df
y = DD_etot

def objective(params):
    """Mean 2-fold cross-validated MAE of KRR_global for one hyperparameter sample.

    `params` is the dict drawn by hyperopt from `space` ('lambda', 'length') and is
    forwarded unchanged as `best_params`. The features/targets are the notebook
    globals `X` and `y` at call time. Returns np.inf as soon as any fold yields a
    string instead of predictions, otherwise the mean of the per-fold MAEs.
    """
    fold_maes = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='gaussian', norm=2)
        # Presumably KRR_global reports a failed fit by returning a message string;
        # score such a sample as the worst possible loss.
        if isinstance(preds, str):
            return np.inf
        # mean_absolute_error is documented as (y_true, y_pred).
        fold_maes.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.array(fold_maes).mean()

# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
# 'lambda' ranges over [e^-30, 1] and 'length' over [e^-5, e^2].
# NOTE(review): the saved best point of this cell lies at the upper edge of both
# ranges -- consider widening the bounds.
space = {
    'lambda': hp.loguniform('lambda', -30, 0),
    'length': hp.loguniform('length', -5, 2)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed, so the search is not reproducible run to run.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [02:29<00:00,  1.50s/trial, best loss: 0.10852530396052926]
MBDF
Best hyperparameters: {'lambda': 0.9971429677217133, 'length': 7.1730768859490714}
Loss: 0.10852530396052926


## MBDF Local ##

### DD_etot ###

In [6]:
# lambda: regularization
# length: kernel length

# Evaluate the local model on the MBDF features with the hyperparameters reported
# by the local tuning cell, using a 2-fold split with random_state=42.
X = mbdf
y = DD_etot

params = {'lambda': 0.9700293111914039, 'length': 6.720858507072239}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]

    # NOTE(review): `params` were tuned with kernel='Gaussian' (see the tuning cell)
    # but this call uses kernel='mbdf' -- confirm which kernel is intended.
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='mbdf')

    # mean_absolute_error is documented as (y_true, y_pred).
    score = mean_absolute_error(y_test, preds.reshape(-1, 1))
    mae_scores.append(score)
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.09305628346319726
fold 2: mae = 0.1115939887598628
Average mae: 0.10232513611153003


In [5]:
# Tuning

X = mbdf
y = DD_etot

def objective(params):
    """Mean 2-fold cross-validated MAE of KRR_local for one hyperparameter sample.

    `params` is the dict drawn by hyperopt from `space` ('lambda', 'length') and is
    forwarded unchanged as `best_params`. Features, targets and charges are the
    notebook globals `X`, `y` and `charges` at call time. Returns np.inf as soon as
    any fold yields a string instead of predictions, otherwise the mean of the
    per-fold MAEs.
    """
    fold_maes = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        Q_train, Q_test = charges[train_index], charges[test_index]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        # Presumably KRR_local reports a failed fit by returning a message string;
        # score such a sample as the worst possible loss.
        if isinstance(preds, str):
            return np.inf
        # mean_absolute_error is documented as (y_true, y_pred).
        fold_maes.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))
    return np.array(fold_maes).mean()

# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
# 'lambda' ranges over [e^-30, 1] and 'length' over [e^-2, e^2].
# NOTE(review): the saved best point of this cell lies close to the upper edge of
# both ranges -- consider widening the bounds.
space = {
    'lambda': hp.loguniform('lambda', -30, 0),
    'length': hp.loguniform('length', -2, 2)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed, so the search is not reproducible run to run.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [01:49<00:00,  1.10s/trial, best loss: 0.10018734438910065]
Best hyperparameters: {'lambda': 0.9700293111914039, 'length': 6.720858507072239}
Loss: 0.10018734438910065


# Learning #

In [10]:
# Collects one learning-curve table per model, keyed by model name.
# Re-running this cell discards every table stored so far.
performance_summary = {}

In [7]:
def evaluate_performance_global(params, X, y, num_training_sample, num_trials):
    """Estimate the test MAE of the global KRR model at a fixed training-set size.

    For every trial ``i`` in ``range(num_trials)`` the row indices of ``X`` are
    shuffled with ``random_state=i``; ``num_training_sample`` rows are used for
    training and all remaining rows for testing.

    Parameters
    ----------
    params : dict
        Hyperparameters forwarded to ``KRR_global`` as ``best_params``.
    X, y : arrays indexable with a list of row indices; ``X.shape[0]`` is the
        number of samples.
    num_training_sample : int
        Number of rows used for training in each trial.
    num_trials : int
        Number of random splits to average over.

    Returns
    -------
    (average_error, std_dev_error)
        The mean of the per-trial MAEs, and ``np.std(errors) / sqrt(num_trials)``.
        Despite its name the second value is the standard error of the mean
        (computed from the population standard deviation), not the standard deviation.
    """
    errors = []
    n_samples = X.shape[0]

    for seed in range(num_trials):
        # An integer train_size gives exactly the requested number of training rows.
        # The previous float test_size (1 - n/N) is rounded up inside sklearn and
        # could leave the training set one row short after floating-point error.
        train_indices, test_indices = train_test_split(
            range(n_samples),
            train_size=int(num_training_sample),
            shuffle=True,
            random_state=seed,
        )
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        # NOTE(review): the tuning cells call KRR_global with kernel='gaussian', norm=2;
        # here the kernel name is capitalised and norm is left at its default -- confirm
        # both calls select the same kernel.
        preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='Gaussian')
        # mean_absolute_error is documented as (y_true, y_pred).
        errors.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))

    average_error = np.mean(errors)
    std_dev_error = np.std(errors) / np.sqrt(num_trials)
    return average_error, std_dev_error



def evaluate_performance_local(params, X, y, Q, num_training_sample, num_trials):
    """Estimate the test MAE of the local KRR model at a fixed training-set size.

    For every trial ``i`` in ``range(num_trials)`` the row indices of ``X`` are
    shuffled with ``random_state=i``; ``num_training_sample`` rows are used for
    training and all remaining rows for testing. ``Q`` is split with the same indices.

    Parameters
    ----------
    params : dict
        Hyperparameters forwarded to ``KRR_local`` as ``best_params``.
    X, y, Q : arrays indexable with a list of row indices; ``X.shape[0]`` is the
        number of samples. ``Q`` is passed to ``KRR_local`` next to the features.
    num_training_sample : int
        Number of rows used for training in each trial.
    num_trials : int
        Number of random splits to average over.

    Returns
    -------
    (average_error, std_dev_error)
        The mean of the per-trial MAEs, and ``np.std(errors) / sqrt(num_trials)``.
        Despite its name the second value is the standard error of the mean
        (computed from the population standard deviation), not the standard deviation.
    """
    errors = []
    n_samples = X.shape[0]

    for seed in range(num_trials):
        # An integer train_size gives exactly the requested number of training rows.
        # The previous float test_size (1 - n/N) is rounded up inside sklearn and
        # could leave the training set one row short after floating-point error.
        train_indices, test_indices = train_test_split(
            range(n_samples),
            train_size=int(num_training_sample),
            shuffle=True,
            random_state=seed,
        )
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        Q_train, Q_test = Q[train_indices], Q[test_indices]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        # mean_absolute_error is documented as (y_true, y_pred).
        errors.append(mean_absolute_error(y_test, preds.reshape(-1, 1)))

    average_error = np.mean(errors)
    std_dev_error = np.std(errors) / np.sqrt(num_trials)
    return average_error, std_dev_error

## MBDF Global ##

In [25]:
# Learning curve for the global model: test MAE versus training-set size.
X = df
y = DD_etot

# Same 'lambda' / 'length' values as printed after the grid-search cell.
best_params = {'lambda': 0.001, 'length': 1000}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 11)]  # 64, 128, ..., 1024
num_trials = 5

rows = []
with warnings.catch_warnings():
    # Install the filter once for the whole sweep rather than on every iteration.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance_global(best_params, X, y, num_training_sample, num_trials)
        # x1000 for the "mHa" columns -- assumes the targets are in Hartree; TODO confirm.
        # NOTE(review): the third column holds std/sqrt(num_trials), i.e. a standard
        # error, although the label says "standard deviation".
        rows.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Build the table in one go (1-based row labels, as before) instead of growing an
# empty DataFrame cell by cell.
model_performance = pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))

performance_summary['MBDF_global'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,94.87069,1.95914
2,128,97.726285,5.403965
3,256,108.099702,10.861729
4,512,110.319929,6.442153
5,1024,103.079877,1.698074


## MBDF Local ##

In [14]:
# Learning curve for the local model: test MAE versus training-set size.
X = mbdf
y = DD_etot
# Hyperparameters reported by the local hyperopt tuning cell.
best_params = {'lambda': 0.9700293111914039, 'length': 6.720858507072239}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']

training_size = [2**i for i in range(6, 11)]  # 64, 128, ..., 1024
num_trials = 5

rows = []
with warnings.catch_warnings():
    # Install the filter once for the whole sweep rather than on every iteration.
    warnings.filterwarnings("ignore")
    for num_training_sample in training_size:
        average_error, std_dev_error = evaluate_performance_local(best_params, X, y, charges, num_training_sample, num_trials)
        # x1000 for the "mHa" columns -- assumes the targets are in Hartree; TODO confirm.
        # NOTE(review): the third column holds std/sqrt(num_trials), i.e. a standard
        # error, although the label says "standard deviation".
        rows.append((num_training_sample, average_error * 1000, std_dev_error * 1000))

# Build the table in one go (1-based row labels, as before) instead of growing an
# empty DataFrame cell by cell.
model_performance = pd.DataFrame(rows, columns=columns, index=range(1, len(rows) + 1))

performance_summary['MBDF_local'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,96.380101,1.443536
2,128,99.391411,4.045516
3,256,106.968097,9.488595
4,512,104.948378,4.274618
5,1024,101.642879,1.426623


In [16]:
# Reduce every stored performance table to its list of average MAEs (in mHa),
# keyed by model name.
# NOTE(review): this cell's saved output predates the last run of the MBDF Global
# learning-curve cell (execution count 16 vs 25), so the printed "MBDF Global"
# values do not match the table shown above -- re-run after updating the tables.
learning_curve_data = {
    model_name: table['average MAE (mHa)'].tolist()
    for model_name, table in performance_summary.items()
}

print(f"MBDF Global: {learning_curve_data['MBDF_global']}")
print(f"MBDF Local: {learning_curve_data['MBDF_local']}")

MBDF Global: [116.96369128659252, 112.42365228241572, 112.54209316496006, 109.97006810335796, 109.31274397167526]
MBDF Local: [96.38010111496062, 99.39141128888097, 106.96809661973175, 104.9483784734266, 101.64287891483133]
