In [5]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
import copy
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from helper_code.custom_kernel import extended_gaussian_kernel_nb, create_similarity_matrix_nb

In [2]:
from qml.kernels import extended_gaussian_kernel

In [3]:
# Load Data

input_dataset = ['c', 'c_lexi', 'CE', 'CE_lexi', 'CSE', 'CSE_lexi']
dataset_dict = {}

for data in input_dataset:
    dataset_dict[data] = pd.read_csv(f'../data/benzene_training_data/[Benz]_{data}.csv')
delta_delta_total_energy = pd.read_csv('../data/benzene_training_data/DD_e_tot (kcal).csv')

## CE with Gaussian Kernel ##

In [32]:
X_train = dataset_dict['CE'].to_numpy()
y_train = delta_delta_total_energy

params = {'alpha': 0.917371219009574, 'gamma': 0.01125777771448475}
model = KernelRidge(**params)

k_fold = KFold(n_splits=2, shuffle=True, random_state=42)
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=k_fold)
mae_scores = -neg_mae_scores
mean_mae = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {mean_mae}")

fold 0: mae = 13.569546257790307
fold 1: mae = 11.808140145717271
Average mae: 12.688843201753789


In [31]:
X_train = dataset_dict['CE'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=1000,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 1000/1000 [00:14<00:00, 67.39trial/s, best loss: 8.284366294970386]
Best hyperparameters: {'alpha': 0.917371219009574, 'gamma': 0.01125777771448475}
Loss: 8.284366294970386


## CSE with Gaussian Kernel ##

### No Lexi ###

In [29]:
X_train = dataset_dict['CSE'].to_numpy()
y_train = delta_delta_total_energy

params = {'alpha': 1.3071058074112173e-10, 'gamma': 1.901449554067671e-12}
model = KernelRidge(**params)

k_fold = KFold(n_splits=2, shuffle=True, random_state=42)
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=k_fold)
mae_scores = -neg_mae_scores
mean_mae = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {mean_mae}")

fold 0: mae = 6.297467628133976
fold 1: mae = 9.962673234588138
Average mae: 8.130070431361057


In [16]:
X_train = dataset_dict['CSE'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=1000,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  0%|          | 0/1000 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 1000/1000 [00:14<00:00, 67.72trial/s, best loss: 6.024975570839132]
Best hyperparameters: {'alpha': 1.3071058074112173e-10, 'gamma': 1.901449554067671e-12}
Loss: 6.024975570839132


### With Lexi ###

In [28]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

params = {'alpha': 1.383001658053961e-06, 'gamma': 1.2488389145686083e-08}
model = KernelRidge(**params)

k_fold = KFold(n_splits=2, shuffle=True, random_state=42)
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=k_fold)
mae_scores = -neg_mae_scores
mean_mae = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")

print(f"Average mae: {mean_mae}")

fold 0: mae = 8.140659412143084
fold 1: mae = 7.8082439788997124
Average mae: 7.974451695521398


In [21]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=1000,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 1000/1000 [00:15<00:00, 65.95trial/s, best loss: 6.613715481690605]
Best hyperparameters: {'alpha': 1.383001658053961e-06, 'gamma': 1.2488389145686083e-08}
Loss: 6.613715481690605


## Extended Gaussian ##

### QML Version ###

In [16]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy.to_numpy()

params = {'alpha': 0.01829321042152034, 'beta': 2.001536698758208, 'epsilon': 8.078367543282445e-10, 'gamma': 0.0010846659947661222}
beta = params['beta']
epsilon = params['epsilon']
gamma = params['gamma']
alpha = params['alpha']

similarity_matrix = extended_gaussian_kernel(X_train, X_train, gamma, epsilon, beta)
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=2)
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 5.612419153425771
Fold 2: MAE = 12.685293747081648
Average MAE across all folds: 9.14885645025371


In [12]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy.to_numpy()

def objective(params):
    beta = params['beta']
    epsilon = params['epsilon']
    gamma = params['gamma']
    alpha = params['alpha']
    similarity_matrix = extended_gaussian_kernel(X_train, X_train, gamma, epsilon, beta)
    krr_model = KernelRidge(kernel='precomputed', alpha=alpha)
    neg_mae_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

param_space = {
    'gamma': hp.loguniform('gamma', -30, 2),
    'epsilon': hp.loguniform('epsilon', -30, 2), 
    'beta': hp.loguniform('beta', -30, 2), 
    'alpha': hp.loguniform('alpha', -30, 2) 
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=param_space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)
    
print("CSE_lexi, Extended Gaussian")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  3%|▎         | 9/300 [00:00<00:08, 34.53trial/s, best loss: 9.93077858509165]  

100%|██████████| 300/300 [00:16<00:00, 18.49trial/s, best loss: 9.14885645025371] 
CSE_lexi, Extended Gaussian
Best hyperparameters: {'alpha': 0.01829321042152034, 'beta': 2.001536698758208, 'epsilon': 8.078367543282445e-10, 'gamma': 0.0010846659947661222}
Loss: 9.14885645025371


### Original ###

In [19]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy.to_numpy()

params = {'alpha': 0.00015236682100382314, 'beta': 2.908625756878312e-06, 'epsilon': 1.7774464170187634e-09, 'gamma': 1.3300690162853116e-12}
beta = params['beta']
epsilon = params['epsilon']
gamma = params['gamma']
alpha = params['alpha']

similarity_matrix = create_similarity_matrix_nb(X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma)
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=2)
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 4.425521169940794
Fold 2: MAE = 8.890728097994797
Average MAE across all folds: 6.6581246339677955


In [18]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy.to_numpy()

def objective(params):
    beta = params['beta']
    epsilon = params['epsilon']
    gamma = params['gamma']
    alpha = params['alpha']
    similarity_matrix = create_similarity_matrix_nb(X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma)
    krr_model = KernelRidge(kernel='precomputed', alpha=alpha)
    neg_mae_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=5)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

param_space = {
    'gamma': hp.loguniform('gamma', -30, 2),
    'epsilon': hp.loguniform('epsilon', -30, 2), 
    'beta': hp.loguniform('beta', -30, 2), 
    'alpha': hp.loguniform('alpha', -30, 2) 
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=param_space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("CSE_lexi, Extended Gaussian")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  2%|▏         | 5/300 [00:00<00:07, 40.97trial/s, best loss: 10.82017272214434]

100%|██████████| 300/300 [00:22<00:00, 13.62trial/s, best loss: 8.592090444906319]
CSE_lexi, Extended Gaussian
Best hyperparameters: {'alpha': 0.05019958885046655, 'beta': 0.00024549575430792583, 'epsilon': 1.975582471883263e-12, 'gamma': 1.9493584646253573e-13}
Loss: 8.592090444906319
