In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from helper_code.QML_KernelRidge import KRR_local, KRR_global, GridSearchCV, GridSearchCV_local
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from helper_code.util import evaluate_performance_local_v2, evaluate_performance_global_v2

In [2]:
# Representations of the coronene training set.
# The CM and BOB loaders are kept for reference but are not used in the cells below.
# cm = np.genfromtxt("../data/coronene_training_data/CM.csv", delimiter=',')
# bob = np.genfromtxt("../data/coronene_training_data/BOB.csv", delimiter=',')
mbdf = np.genfromtxt(
    "../data/coronene_training_data/MBDF.csv",
    delimiter=',',
).reshape((-1, 36, 6))  # one (36, 6) block per sample; fed to the local KRR helper
df = np.genfromtxt(
    "../data/coronene_training_data/DF.csv",
    delimiter=',',
)  # fed to the global KRR helper

# Regression targets. pandas consumes the first CSV row as the header, and
# to_numpy() on a DataFrame yields a 2-D array.
D_e_tot = pd.read_csv('../data/coronene_training_data/D_e_tot.csv').to_numpy()
D_e_elec = pd.read_csv('../data/coronene_training_data/D_e_elec.csv').to_numpy()
D_e_atom = pd.read_csv('../data/coronene_training_data/D_e_atom.csv').to_numpy()

In [3]:
# Raw coronene-mutant archive; the cells shown here only read its 'charges' entry.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load this archive from a trusted source.
coronene_energy_raw_data = np.load("../data/coronene_raw_data/coronene_mutants_pbe_pcx2_corrected3.npz", allow_pickle=True)
# Passed as the Q argument of the local KRR helpers further down.
charges = coronene_energy_raw_data['charges']

# Model #

## MBDF Global ##

### D_etot ###

In [8]:
# Global model, nested validation: two shuffled outer folds score the model, and
# on each outer training split the project GridSearchCV helper (called with cv=4)
# picks (lambda, length) from param_grid.
X = df
# `D_etot` is not assigned anywhere in this notebook, so the cell failed with a
# NameError on a fresh kernel. The target loaded from D_e_tot.csv is used instead.
# NOTE(review): confirm D_e_tot is the quantity the "D_etot" heading refers to.
y = D_e_tot
Q = charges
mae_scores = []

param_grid = {'lambda': [1e-3, 1e-6, 1e-9, 1e-12],
              'length': [10**i for i in range(-2, 4)]}

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # The charge slices are not consumed by the global model; they are kept so the
    # notebook namespace matches what the original cell left behind.
    Q_train, Q_test = Q[train_index], Q[test_index]
    best_params = GridSearchCV(X_train, y_train, param_grid, cv=4)
    preds = KRR_global(X_train, y_train, X_test, best_params=best_params, kernel='gaussian', norm=2)  # norm = L2
    # preds is reshaped to a column so it lines up with the 2-D target array.
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.10963196047060059
fold 2: mae = 0.13274455639545224
Average mae: 0.12118825843302641


In [9]:
# best_params is reassigned on every outer fold of the previous cell, so this
# shows only the selection made on the last fold.
print(best_params)

{'mae': 0.14329455888445342, 'lambda': 1e-06, 'length': 1000}


### DD_etot ###

In [23]:
# Global model on the DD_etot target: 2 shuffled outer folds, with the project
# GridSearchCV helper (cv=4) choosing (lambda, length) on each training split.
# NOTE(review): DD_etot is not assigned in any cell shown here - it must come from
# kernel state or a cell outside this view; confirm before Restart & Run All.
X = df
y = DD_etot
Q = charges
mae_scores = []

param_grid = {
    'lambda': [1e-3, 1e-6, 1e-9, 1e-12],
    'length': [10**i for i in range(-2, 4)],
}

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    Q_train = Q[train_index]
    Q_test = Q[test_index]
    best_params = GridSearchCV(X_train, y_train, param_grid, cv=4)
    preds = KRR_global(
        X_train, y_train, X_test,
        best_params=best_params, kernel='gaussian', norm=2,  # norm = L2
    )
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.09511458513863061
fold 2: mae = 0.11314163461048717
Average mae: 0.1041281098745589


In [24]:
# Shows the grid-search result of the last outer fold only; best_params is
# overwritten on each fold of the preceding cell.
print(best_params)

{'mae': 0.11600178809628667, 'lambda': 0.001, 'length': 1000}


In [9]:
# Re-scores the global model on DD_etot with one fixed hyperparameter pair
# (no per-fold grid search), using the same 2-fold split as the cells above.
# NOTE(review): DD_etot is not assigned in any cell shown here; confirm its origin.
X = df
y = DD_etot
Q = charges
params = {'lambda': 0.9971429677217133, 'length': 7.1730768859490714}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    Q_train = Q[train_index]
    Q_test = Q[test_index]

    preds = KRR_global(
        X_train, y_train, X_test,
        best_params=params, kernel='gaussian', norm=2,  # norm = L2
    )
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.10955891518598214
fold 2: mae = 0.10749169273507639
Average mae: 0.10852530396052926


### Bayesian Tuning ###

In [7]:
# Global

X = df
y = D_e_tot

def objective(params):
    """Score one hyperparameter draw for the global KRR model.

    Reads the notebook-level ``X`` and ``y`` and evaluates ``params`` on a fixed
    2-fold split (shuffle=True, random_state=42), so every trial sees the same folds.

    Args:
        params: dict sampled from ``space`` with keys 'lambda' and 'length'.

    Returns:
        Mean MAE over the two folds, or ``np.inf`` as soon as ``KRR_global``
        returns a string instead of predictions.
    """
    fold_maes = []
    kfold = KFold(n_splits=2, shuffle=True, random_state=42)
    # The global model takes no charges, so the unused per-fold charge slices
    # were dropped from this loop.
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='gaussian', norm=2)
        # A string result is treated as a failed fit; isinstance also catches
        # str subclasses such as numpy string scalars.
        if isinstance(preds, str):
            return np.inf
        fold_maes.append(mean_absolute_error(preds.reshape(-1, 1), y_test))
    return np.array(fold_maes).mean()

# hp.loguniform bounds are natural-log exponents:
# lambda in [e^-7, e^3] (about 9e-4 to 20), length in [e^0, e^4] (about 1 to 55).
space = {
    'lambda': hp.loguniform('lambda', -7, 3),
    'length': hp.loguniform('length', 0, 4)
}

# NOTE(review): fmin is not given a random state, so the search path is not
# reproducible between runs - consider passing one supported by the installed hyperopt.
trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,  # tree parzen estimator
                max_evals=100,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [02:27<00:00,  1.47s/trial, best loss: 0.14355348556589315]
Best hyperparameters: {'lambda': 0.0031724934215629635, 'length': 49.705747731572686}
Loss: 0.14355348556589315


In [None]:
# Tuning

X = mbdf
y = D_e_tot

def objective(params):
    """Mean 2-fold MAE of the local KRR model for one hyperparameter draw.

    Uses the notebook-level ``X``, ``y`` and ``charges``. Returns ``np.inf`` as
    soon as ``KRR_local`` hands back a string instead of predictions.
    """
    splitter = KFold(n_splits=2, shuffle=True, random_state=42)
    fold_maes = []
    for train_idx, test_idx in splitter.split(X):
        preds = KRR_local(
            X[train_idx], charges[train_idx], y[train_idx],
            X[test_idx], charges[test_idx],
            best_params=params, kernel='Gaussian',
        )
        if type(preds) is str:
            return np.inf
        fold_maes.append(mean_absolute_error(preds.reshape(-1, 1), y[test_idx]))
    return np.array(fold_maes).mean()

# Bounds are natural-log exponents: lambda in [e^-30, 1], length in [e^-2, e^2].
space = {
    'lambda': hp.loguniform('lambda', -30, 0),
    'length': hp.loguniform('length', -2, 2),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # tpe.suggest = tree parzen estimator
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

### Evaluation ###

In [11]:
# Learning curve of the global model on D_e_atom: for each training-set size the
# project helper evaluate_performance_global_v2 is called with num_trials, and its
# two return values are stored (scaled by 1000) in a table indexed from 1.
X = df
y = D_e_atom

best_params = {'lambda': 1e-06, 'length': 1000}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [80, 160, 320, 640, 1280]
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_global_v2(
            best_params, X, y, num_training_sample, num_trials
        )
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

print(model_performance['average MAE (mHa)'].tolist())
print(model_performance['standard deviation (mHa)'].tolist())
display(model_performance)

[254.11500261976727, 238.1328641186009, 186.90563407277568, 145.4709306551047, 128.47104308013488]
[206.2575899882402, 161.13352877773576, 79.08474648056563, 34.55066444458538, 14.854541057841024]


Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,80,254.115003,206.25759
2,160,238.132864,161.133529
3,320,186.905634,79.084746
4,640,145.470931,34.550664
5,1280,128.471043,14.854541


In [12]:
# Learning curve of the local model on D_e_atom: same table layout as the global
# evaluation, but scored by evaluate_performance_local_v2, which also takes charges.
X = mbdf
y = D_e_atom
best_params = {'lambda': 0.001, 'length': 100}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [80, 160, 320, 640, 1280]
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_local_v2(
            best_params, X, y, charges, num_training_sample, num_trials
        )
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

print(model_performance['average MAE (mHa)'].tolist())
print(model_performance['standard deviation (mHa)'].tolist())
display(model_performance)

[158.77594153128908, 152.8608612484898, 132.2585742647864, 112.99252974548612, 108.77644891667948]
[21.751947186430073, 27.32525593239945, 17.940957537298825, 9.27640555356414, 5.661382258208243]


Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,80,158.775942,21.751947
2,160,152.860861,27.325256
3,320,132.258574,17.940958
4,640,112.99253,9.276406
5,1280,108.776449,5.661382


## MBDF Local ##

### D_etot ###

In [6]:
# Local model, nested validation: two shuffled outer folds score the model, and on
# each outer training split the project GridSearchCV_local helper (called with
# cv=4) picks (lambda, length) from param_grid.
X = mbdf
# `D_etot` is not assigned anywhere in this notebook, so the cell failed with a
# NameError on a fresh kernel. The target loaded from D_e_tot.csv is used instead.
# NOTE(review): confirm D_e_tot is the quantity the "D_etot" heading refers to.
y = D_e_tot

param_grid = {'lambda': [1e-3, 1e-6, 1e-9, 1e-12],
              'length': [10**i for i in range(-2, 4)]}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    Q_train, Q_test = charges[train_index], charges[test_index]
    best_params = GridSearchCV_local(X_train, Q_train, y_train, param_grid, cv=4)
    # NOTE(review): this cell passes kernel='mbdf' while the tuning cells pass
    # kernel='Gaussian' to the same helper - confirm which one is intended.
    preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=best_params, kernel='mbdf')
    # preds is reshaped to a column so it lines up with the 2-D target array.
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.array(mae_scores).mean()}")

fold 1: mae = 0.09786804547787652
fold 2: mae = 0.11421316526373111
Average mae: 0.10604060537080381


In [7]:
# best_params holds the grid-search result of the last outer fold only; it is
# overwritten on each fold of the preceding cell.
print(best_params)

{'mae': 0.11420059134960384, 'lambda': 0.001, 'length': 100}


### DD_etot ###

In [6]:
# Scores the local model on DD_etot with a fixed hyperparameter pair on 2 folds.
#   lambda: regularization
#   length: kernel length
# NOTE(review): DD_etot is not assigned in any cell shown here; confirm its origin.

X = mbdf
y = DD_etot

params = {'lambda': 0.9700293111914039, 'length': 6.720858507072239}
mae_scores = []

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
for fold, (train_index, test_index) in enumerate(kfold.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    Q_train = charges[train_index]
    Q_test = charges[test_index]
    preds = KRR_local(
        X_train, Q_train, y_train, X_test, Q_test,
        best_params=params, kernel='mbdf',
    )
    score = mean_absolute_error(preds.reshape(-1, 1), y_test)
    mae_scores.append(score)
    print(f"fold {fold+1}: mae = {score}")

print(f"Average mae: {np.mean(mae_scores)}")

fold 1: mae = 0.09305628346319726
fold 2: mae = 0.1115939887598628
Average mae: 0.10232513611153003


In [5]:
# Tuning

X = mbdf
y = D_e_tot

def objective(params):
    """Mean 2-fold MAE of the local KRR model for one hyperparameter draw.

    Uses the notebook-level ``X``, ``y`` and ``charges`` with a fixed split
    (shuffle=True, random_state=42). Returns ``np.inf`` as soon as ``KRR_local``
    hands back a string instead of predictions.
    """
    splitter = KFold(n_splits=2, shuffle=True, random_state=42)
    fold_maes = []
    for train_idx, test_idx in splitter.split(X):
        preds = KRR_local(
            X[train_idx], charges[train_idx], y[train_idx],
            X[test_idx], charges[test_idx],
            best_params=params, kernel='Gaussian',
        )
        if type(preds) is str:
            return np.inf
        fold_maes.append(mean_absolute_error(preds.reshape(-1, 1), y[test_idx]))
    return np.array(fold_maes).mean()

# Bounds are natural-log exponents: lambda in [e^-30, 1], length in [e^-2, e^2].
# NOTE(review): the optimum recorded below (lambda ~0.97, length ~6.7) sits close
# to the upper end of both ranges, so a wider search space may be worth trying.
space = {
    'lambda': hp.loguniform('lambda', -30, 0),
    'length': hp.loguniform('length', -2, 2),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # tpe.suggest = tree parzen estimator
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [01:49<00:00,  1.10s/trial, best loss: 0.10018734438910065]
Best hyperparameters: {'lambda': 0.9700293111914039, 'length': 6.720858507072239}
Loss: 0.10018734438910065


# Learning #

In [4]:
# Maps a model label (e.g. 'MBDF_global') to its learning-curve DataFrame; filled
# by the cells below. Re-running this cell discards every table collected so far.
performance_summary = {}

In [12]:
def evaluate_performance_global(params, X, y, num_training_sample, num_trials):
    """Estimate the test MAE of the global KRR model at a given training-set size.

    For each trial ``i`` the samples are shuffled with ``random_state=i``,
    ``num_training_sample`` of them train the model and all remaining samples
    are used for testing.

    Args:
        params: hyperparameter dict forwarded to ``KRR_global`` as ``best_params``.
        X: representation array, indexed along its first axis.
        y: target array, indexed along its first axis.
        num_training_sample: number of training samples per trial.
        num_trials: number of random splits to average over.

    Returns:
        ``(average_error, std_dev_error)``: the mean MAE over the trials and
        ``np.std(errors) / sqrt(num_trials)``, i.e. the standard error of that
        mean rather than the plain standard deviation.
    """
    errors = []
    sample_indices = range(X.shape[0])
    # Passing the size as an integer guarantees exactly that many training rows.
    # The earlier float test_size (1 - k/n) is rounded up by scikit-learn, so
    # floating-point error could silently move one training sample to the test set.
    train_size = int(num_training_sample)

    for i in range(num_trials):
        train_indices, test_indices = train_test_split(
            sample_indices, train_size=train_size, shuffle=True, random_state=i
        )
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        # NOTE(review): other cells call KRR_global with kernel='gaussian', norm=2;
        # confirm the helper treats 'Gaussian' with its default norm the same way.
        preds = KRR_global(X_train, y_train, X_test, best_params=params, kernel='Gaussian')
        error = mean_absolute_error(preds.reshape(-1, 1), y_test)
        errors.append(error)

    average_error = np.mean(errors)
    std_dev_error = np.std(errors)/np.sqrt(num_trials)
    return average_error, std_dev_error



def evaluate_performance_local(params, X, y, Q, num_training_sample, num_trials):
    """Estimate the test MAE of the local KRR model at a given training-set size.

    For each trial ``i`` the samples are shuffled with ``random_state=i``,
    ``num_training_sample`` of them train the model and all remaining samples
    are used for testing.

    Args:
        params: hyperparameter dict forwarded to ``KRR_local`` as ``best_params``.
        X: representation array, indexed along its first axis.
        y: target array, indexed along its first axis.
        Q: per-sample charges, indexed along its first axis like ``X``.
        num_training_sample: number of training samples per trial.
        num_trials: number of random splits to average over.

    Returns:
        ``(average_error, std_dev_error)``: the mean MAE over the trials and
        ``np.std(errors) / sqrt(num_trials)``, i.e. the standard error of that
        mean rather than the plain standard deviation.
    """
    errors = []
    sample_indices = range(X.shape[0])
    # Passing the size as an integer guarantees exactly that many training rows.
    # The earlier float test_size (1 - k/n) is rounded up by scikit-learn, so
    # floating-point error could silently move one training sample to the test set.
    train_size = int(num_training_sample)

    for i in range(num_trials):
        train_indices, test_indices = train_test_split(
            sample_indices, train_size=train_size, shuffle=True, random_state=i
        )
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        Q_train, Q_test = Q[train_indices], Q[test_indices]
        preds = KRR_local(X_train, Q_train, y_train, X_test, Q_test, best_params=params, kernel='Gaussian')
        error = mean_absolute_error(preds.reshape(-1, 1), y_test)
        errors.append(error)

    average_error = np.mean(errors)
    std_dev_error = np.std(errors)/np.sqrt(num_trials)
    return average_error, std_dev_error

## MBDF Global ##

In [5]:
# Learning curve of the global model; the table is stored under 'MBDF_global'.
X = df
# `D_etot` is not assigned anywhere in this notebook, so the cell failed with a
# NameError on a fresh kernel. The target loaded from D_e_tot.csv is used instead.
# NOTE(review): confirm D_e_tot is the quantity the recorded results were run on.
y = D_e_tot

best_params = {'lambda': 1e-06, 'length': 1000}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [10, 20, 40, 80, 160, 320, 640, 1280]
num_trials = 5

with warnings.catch_warnings():
    # Rows are numbered from 1, matching the displayed table.
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_global_v2(best_params, X, y, num_training_sample, num_trials)
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['MBDF_global'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,10,267.348668,48.281552
2,20,173.770549,29.208821
3,40,132.61813,29.620547
4,80,255.81756,201.60466
5,160,238.311073,160.749926
6,320,187.045221,79.03926
7,640,145.877099,34.74895
8,1280,128.871766,15.174768


In [14]:
# Learning curve of the global model on DD_etot, scored with the notebook-level
# evaluate_performance_global and stored under 'MBDF_global (delta)'.
# NOTE(review): DD_etot is not assigned in any cell shown here; confirm its origin.
X = df
y = DD_etot

best_params = {'lambda': 0.001, 'length': 1000}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [2**i for i in range(6, 11)]  # 64, 128, 256, 512, 1024
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_global(
            best_params, X, y, num_training_sample, num_trials
        )
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['MBDF_global (delta)'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,94.87069,1.95914
2,128,97.726285,5.403965
3,256,108.099702,10.861729
4,512,110.319929,6.442153
5,1024,103.079877,1.698074


## MBDF Local ##

In [7]:
# Learning curve of the local model; the table is stored under 'MBDF_local'.
X = mbdf
# `D_etot` is not assigned anywhere in this notebook, so the cell failed with a
# NameError on a fresh kernel. The target loaded from D_e_tot.csv is used instead.
# NOTE(review): confirm D_e_tot is the quantity the recorded results were run on.
y = D_e_tot
best_params = {'lambda': 0.001, 'length': 100}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [10, 20, 40, 80, 160, 320, 640, 1280]
num_trials = 5

with warnings.catch_warnings():
    # Rows are numbered from 1, matching the displayed table.
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_local_v2(best_params, X, y, charges, num_training_sample, num_trials)
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['MBDF_local'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,10,184.597845,12.380999
2,20,164.222805,8.040528
3,40,146.571214,10.754663
4,80,158.776434,21.75193
5,160,152.861298,27.325309
6,320,132.259076,17.940817
7,640,112.993119,9.276311
8,1280,108.777003,5.661415


In [18]:
# Learning curve of the local model on DD_etot, scored with the notebook-level
# evaluate_performance_local and stored under 'MBDF_local (delta)'.
# NOTE(review): DD_etot is not assigned in any cell shown here; confirm its origin.
X = mbdf
y = DD_etot
best_params = {'lambda': 0.9700293111914039, 'length': 6.720858507072239}

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance = pd.DataFrame(columns=columns)

training_size = [2**i for i in range(6, 11)]  # 64, 128, 256, 512, 1024
num_trials = 5

with warnings.catch_warnings():
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance_local(
            best_params, X, y, charges, num_training_sample, num_trials
        )
        # x1000 matches the mHa unit in the column names (targets presumably in Ha).
        model_performance.at[index, 'training size'] = num_training_sample
        model_performance.at[index, 'average MAE (mHa)'] = average_error * 1000
        model_performance.at[index, 'standard deviation (mHa)'] = std_dev_error * 1000

performance_summary['MBDF_local (delta)'] = model_performance

display(model_performance)

Unnamed: 0,training size,average MAE (mHa),standard deviation (mHa)
1,64,96.380101,1.443536
2,128,99.391411,4.045516
3,256,106.968097,9.488595
4,512,104.948378,4.274618
5,1024,101.642879,1.426623


In [8]:
# Reduce every stored learning-curve table to its list of average MAEs (mHa),
# keyed by the same label used in performance_summary.
learning_curve_data = {}
for key, value in performance_summary.items():
    # display(value)
    learning_curve_data[key] = value['average MAE (mHa)'].tolist()

# These lookups raise KeyError unless the cells that fill 'MBDF_global' and
# 'MBDF_local' have been run in the current kernel session.
print(f"MBDF Global: {learning_curve_data['MBDF_global']}")
print(f"MBDF Local: {learning_curve_data['MBDF_local']}")
# The delta-model curves are collected above but not printed.
# print(f"MBDF Global (delta): {learning_curve_data['MBDF_global (delta)']}")
# print(f"MBDF Local (delta): {learning_curve_data['MBDF_local (delta)']}")

MBDF Global: [267.34866751435857, 173.77054851163828, 132.61813037056365, 255.81756044130967, 238.3110725198692, 187.04522095157284, 145.8770986733163, 128.87176623964308]
MBDF Local: [184.59784535648154, 164.22280452583243, 146.57121401836062, 158.77643399968196, 152.8612979610525, 132.25907572913226, 112.99311945723645, 108.77700270516728]
