In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error
import warnings
import numpy as np
import numba
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


from helper_code.custom_kernel import create_similarity_matrix_nb, extended_gaussian_kernel_nb


In [3]:
# Directory that holds every coronene training CSV read in this cell.
data_dir = '../data/coronene_training_data'

# Representation names; each one has a plain CSV and an "[IDM]"-prefixed CSV.
input_dataset = ['c', 'c_lexi', 'CE', 'CE_lexi', 'CSE', 'CSE_lexi']

# Maps "<name>" and "IDM_<name>" to the DataFrame read from the matching file.
dataset_dict = {}
for rep_name in input_dataset:
    plain_csv = f'{data_dir}/{rep_name}.csv'
    idm_csv = f'{data_dir}/[IDM]{rep_name}.csv'
    dataset_dict[rep_name] = pd.read_csv(plain_csv)
    dataset_dict[f"IDM_{rep_name}"] = pd.read_csv(idm_csv)

# Regression targets, kept as DataFrames; later cells convert them to arrays.
D_etot = pd.read_csv(f'{data_dir}/delta_total_energy.csv')
DD_etot = pd.read_csv(f'{data_dir}/DD_e_tot.csv')

## CSE with Gaussian Kernel ##

### No Lexi, D_etot ###

In [18]:
# Evaluate the RBF kernel ridge model on the CSE representation (no lexi)
# against the D_etot target with 2-fold cross-validation.
X_train = dataset_dict['CSE'].to_numpy()
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 2.2425013674157485e-08, 'gamma': 8.136151536502313e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

# The scorer returns negated MAE (larger is better); flip the sign back.
neg_mae_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=2,
)
mae_scores = -neg_mae_scores
avg_mae = mae_scores.mean()

# Report folds with 1-based numbering, then the mean over folds.
for fold_no, fold_mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold_no}: MAE = {fold_mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.10517346003262494
Fold 2: MAE = 0.10475856493434792
Average MAE across all folds: 0.10496601248348643


In [17]:
# Tuning: TPE search over the RBF kernel ridge hyperparameters for the CSE
# representation (no lexi) against D_etot.

X_train = dataset_dict['CSE'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict of KernelRidge keyword arguments sampled from `space`
            ('alpha' and 'gamma'). It is not modified.

    Returns:
        dict with 'loss' (mean MAE across the folds, lower is better) and
        'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    # Build the keyword arguments in a fresh dict instead of writing 'kernel'
    # into the dict handed in by the caller.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both hyperparameters are sampled log-uniformly with log-space bounds -30 and 0.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed to fmin, so the search is unseeded;
    # presumably a re-run can land on different hyperparameters.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [00:29<00:00, 10.14trial/s, best loss: 0.10496601248348643]
Best hyperparameters: {'alpha': 2.2425013674157485e-08, 'gamma': 8.136151536502313e-08}
Loss: 0.10496601248348643


### With Lexi, D_etot ###

In [21]:
# Evaluate the RBF kernel ridge model on the CSE_lexi representation against
# the D_etot target with 2-fold cross-validation.
X_train = dataset_dict['CSE_lexi'].to_numpy()
# Convert the target to an ndarray so this cell feeds cross_val_score the same
# kind of object as the tuning cell that produced the hyperparameters below
# (previously the raw DataFrame was passed here).
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 1.5182042161569614e-12, 'gamma': 6.03689365016129e-10, 'kernel': 'rbf'}
model = KernelRidge(**params)

# The scorer returns negated MAE (larger is better); flip the sign back.
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
mae_scores = -neg_mae_scores
avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.10401053546423987
Fold 2: MAE = 0.10143435614903583
Average MAE across all folds: 0.10272244580663785


In [20]:
# Tuning: TPE search over the RBF kernel ridge hyperparameters for the
# CSE_lexi representation against D_etot.

X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict of KernelRidge keyword arguments sampled from `space`
            ('alpha' and 'gamma'). It is not modified.

    Returns:
        dict with 'loss' (mean MAE across the folds, lower is better) and
        'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    # Build the keyword arguments in a fresh dict instead of writing 'kernel'
    # into the dict handed in by the caller.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both hyperparameters are sampled log-uniformly with log-space bounds -30 and 0.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed to fmin, so the search is unseeded;
    # presumably a re-run can land on different hyperparameters.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [00:30<00:00,  9.98trial/s, best loss: 0.10272244580663785]
Best hyperparameters: {'alpha': 1.5182042161569614e-12, 'gamma': 6.03689365016129e-10}
Loss: 0.10272244580663785


### With Lexi, DD_etot ###

In [25]:
# Evaluate the RBF kernel ridge model on the CSE_lexi representation against
# the DD_etot target with 2-fold cross-validation.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = DD_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 0.0025629074500422688, 'gamma': 1.8104490522288843e-07, 'kernel': 'rbf'}
model = KernelRidge(**params)

# cv=2 is the same split specification the tuning objective scores with.
# A shuffled KFold object used to be constructed here but was never passed to
# cross_val_score; it is removed so the cell does not suggest a shuffled split
# that is not actually in effect.
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
# The scorer returns negated MAE (larger is better); flip the sign back.
mae_scores = -neg_mae_scores
avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.1001863077773452
Fold 2: MAE = 0.09583078624793348
Average MAE across all folds: 0.09800854701263934


In [24]:
# Tuning: TPE search over the RBF kernel ridge hyperparameters for the
# CSE_lexi representation against DD_etot.

X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = DD_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict of KernelRidge keyword arguments sampled from `space`
            ('alpha' and 'gamma'). It is not modified.

    Returns:
        dict with 'loss' (mean MAE across the folds, lower is better) and
        'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    # Build the keyword arguments in a fresh dict instead of writing 'kernel'
    # into the dict handed in by the caller.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both hyperparameters are sampled log-uniformly with log-space bounds -30 and 0.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed to fmin, so the search is unseeded;
    # presumably a re-run can land on different hyperparameters.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [00:31<00:00,  9.46trial/s, best loss: 0.09800854701263934]
Best hyperparameters: {'alpha': 0.0025629074500422688, 'gamma': 1.8104490522288843e-07}
Loss: 0.09800854701263934


## Extended Gaussian ##

### No Lexi, D_etot ###

In [27]:
# Score kernel ridge with the extended Gaussian kernel on the CSE
# representation (no lexi) against D_etot, using 2-fold cross-validation on a
# precomputed similarity matrix.
X_train = dataset_dict['CSE'].to_numpy()
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 4.078182747775911e-06, 'beta': 1.9601587086085153e-06, 'epsilon': 3.757467077768846e-11, 'gamma': 3.592110159724421e-13}
alpha, beta, epsilon, gamma = (params[key] for key in ('alpha', 'beta', 'epsilon', 'gamma'))

# X_train is compared against itself; the result is handed to KernelRidge as a
# precomputed kernel (presumably an n_samples x n_samples matrix — confirm in
# helper_code.custom_kernel).
similarity_matrix = create_similarity_matrix_nb(
    X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
)
krr_model = KernelRidge(alpha=alpha, kernel='precomputed')

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(
        krr_model,
        similarity_matrix,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=2,
    )
    # Scores come back negated; undo that to get plain MAE per fold.
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold_no, fold_mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold_no}: MAE = {fold_mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.10543877330951051
Fold 2: MAE = 0.10505898667621821
Average MAE across all folds: 0.10524887999286436


In [26]:
# Tuning: TPE search over the extended Gaussian kernel hyperparameters and the
# ridge penalty for the CSE representation (no lexi) against D_etot.

X_train = dataset_dict['CSE'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict with keys 'beta', 'epsilon', 'gamma' (forwarded to the
            similarity-matrix builder) and 'alpha' (KernelRidge penalty).

    Returns:
        dict with 'loss' (mean MAE across the folds) and 'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    beta, epsilon, gamma, alpha = (params[key] for key in ('beta', 'epsilon', 'gamma', 'alpha'))
    # The similarity matrix depends on the sampled values, so it is rebuilt on every trial.
    gram = create_similarity_matrix_nb(
        X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
    )
    estimator = KernelRidge(alpha=alpha, kernel='precomputed')
    fold_scores = cross_val_score(estimator, gram, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Every hyperparameter is sampled log-uniformly with log-space bounds -30 and 2.
param_space = {
    label: hp.loguniform(label, -30, 2)
    for label in ('gamma', 'epsilon', 'beta', 'alpha')
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    best = fmin(
        fn=objective,
        space=param_space,
        algo=tpe.suggest,  # tree parzen estimator
        max_evals=300,
        trials=trials,
    )

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [01:19<00:00,  3.77trial/s, best loss: 0.10524887999286436]
Best hyperparameters: {'alpha': 4.078182747775911e-06, 'beta': 1.9601587086085153e-06, 'epsilon': 3.757467077768846e-11, 'gamma': 3.592110159724421e-13}
Loss: 0.10524887999286436


### With Lexi, D_etot ###

In [29]:
# Score kernel ridge with the extended Gaussian kernel on the CSE_lexi
# representation against D_etot, using 2-fold cross-validation on a
# precomputed similarity matrix.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 4.082588287978896e-08, 'beta': 2.0167466950900717e-07, 'epsilon': 2.142512351831609e-08, 'gamma': 4.181167988054245e-09}
alpha, beta, epsilon, gamma = (params[key] for key in ('alpha', 'beta', 'epsilon', 'gamma'))

# X_train is compared against itself; the result is handed to KernelRidge as a
# precomputed kernel.
similarity_matrix = create_similarity_matrix_nb(
    X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
)
krr_model = KernelRidge(alpha=alpha, kernel='precomputed')

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(
        krr_model,
        similarity_matrix,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=2,
    )
    # Scores come back negated; undo that to get plain MAE per fold.
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold_no, fold_mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold_no}: MAE = {fold_mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.1041707581587638
Fold 2: MAE = 0.10341720396456694
Average MAE across all folds: 0.10379398106166537


In [28]:
# Tuning: TPE search over the extended Gaussian kernel hyperparameters and the
# ridge penalty for the CSE_lexi representation against D_etot.

X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict with keys 'beta', 'epsilon', 'gamma' (forwarded to the
            similarity-matrix builder) and 'alpha' (KernelRidge penalty).

    Returns:
        dict with 'loss' (mean MAE across the folds) and 'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    beta, epsilon, gamma, alpha = (params[key] for key in ('beta', 'epsilon', 'gamma', 'alpha'))
    # The similarity matrix depends on the sampled values, so it is rebuilt on every trial.
    gram = create_similarity_matrix_nb(
        X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
    )
    estimator = KernelRidge(alpha=alpha, kernel='precomputed')
    fold_scores = cross_val_score(estimator, gram, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Every hyperparameter is sampled log-uniformly with log-space bounds -30 and 2.
param_space = {
    label: hp.loguniform(label, -30, 2)
    for label in ('gamma', 'epsilon', 'beta', 'alpha')
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    best = fmin(
        fn=objective,
        space=param_space,
        algo=tpe.suggest,  # tree parzen estimator
        max_evals=300,
        trials=trials,
    )

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

  0%|          | 0/300 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 300/300 [01:17<00:00,  3.89trial/s, best loss: 0.10379398106166537]
Best hyperparameters: {'alpha': 4.082588287978896e-08, 'beta': 2.0167466950900717e-07, 'epsilon': 2.142512351831609e-08, 'gamma': 4.181167988054245e-09}
Loss: 0.10379398106166537


### With Lexi, DD_etot ###

In [34]:
# Score kernel ridge with the extended Gaussian kernel on the CSE_lexi
# representation against DD_etot, using 2-fold cross-validation on a
# precomputed similarity matrix.
# The features are explicitly cast to float64 in this cell.
X_train = dataset_dict['CSE_lexi'].to_numpy(dtype=np.float64)
y_train = DD_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the tuning cell for
# this representation/target.
params = {'alpha': 0.00028513857657025936, 'beta': 1.1753499166909471e-08, 'epsilon': 3.0350353283034606e-06, 'gamma': 1.1611337892690983e-07}
alpha, beta, epsilon, gamma = (params[key] for key in ('alpha', 'beta', 'epsilon', 'gamma'))

# X_train is compared against itself; the result is handed to KernelRidge as a
# precomputed kernel.
similarity_matrix = create_similarity_matrix_nb(
    X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
)
krr_model = KernelRidge(alpha=alpha, kernel='precomputed')

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(
        krr_model,
        similarity_matrix,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=2,
    )
    # Scores come back negated; undo that to get plain MAE per fold.
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold_no, fold_mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold_no}: MAE = {fold_mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.09553401338682989
Fold 2: MAE = 0.09599543075525488
Average MAE across all folds: 0.09576472207104239


In [33]:
# Tuning: TPE search over the extended Gaussian kernel hyperparameters and the
# ridge penalty for the CSE_lexi representation against DD_etot.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = DD_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict with keys 'beta', 'epsilon', 'gamma' (forwarded to the
            similarity-matrix builder) and 'alpha' (KernelRidge penalty).

    Returns:
        dict with 'loss' (mean MAE across the folds) and 'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    beta, epsilon, gamma, alpha = (params[key] for key in ('beta', 'epsilon', 'gamma', 'alpha'))
    # The similarity matrix depends on the sampled values, so it is rebuilt on every trial.
    gram = create_similarity_matrix_nb(
        X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
    )
    estimator = KernelRidge(alpha=alpha, kernel='precomputed')
    fold_scores = cross_val_score(estimator, gram, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Every hyperparameter is sampled log-uniformly with log-space bounds -30 and 2.
param_space = {
    label: hp.loguniform(label, -30, 2)
    for label in ('gamma', 'epsilon', 'beta', 'alpha')
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    best = fmin(
        fn=objective,
        space=param_space,
        algo=tpe.suggest,  # tree parzen estimator
        max_evals=300,
        trials=trials,
    )

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [01:17<00:00,  3.87trial/s, best loss: 0.09576472207104239]
Best hyperparameters: {'alpha': 0.00028513857657025936, 'beta': 1.1753499166909471e-08, 'epsilon': 3.0350353283034606e-06, 'gamma': 1.1611337892690983e-07}
Loss: 0.09576472207104239


## IDM ##

In [9]:
# Evaluate the RBF kernel ridge model on the IDM_CSE_lexi representation
# against the D_etot target with 2-fold cross-validation.
X_train = dataset_dict['IDM_CSE_lexi'].to_numpy()
# Convert the target to an ndarray so this cell feeds cross_val_score the same
# kind of object as the tuning cell that produced the hyperparameters below
# (previously the raw DataFrame was passed here).
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the RBF tuning cell for
# this representation/target.
params = {'alpha': 2.9143654132589253e-13, 'gamma': 9.452129457528114e-10, 'kernel': 'rbf'}
model = KernelRidge(**params)

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer returns negated MAE (larger is better); flip the sign back.
    mae_scores = -neg_mae_scores
    avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.10353586198509138
Fold 2: MAE = 0.1006480394548003
Average MAE across all folds: 0.10209195071994584


In [6]:
# Tuning: TPE search over the RBF kernel ridge hyperparameters for the
# IDM_CSE_lexi representation against D_etot.

X_train = dataset_dict['IDM_CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict of KernelRidge keyword arguments sampled from `space`
            ('alpha' and 'gamma'). It is not modified.

    Returns:
        dict with 'loss' (mean MAE across the folds, lower is better) and
        'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    # Build the keyword arguments in a fresh dict instead of writing 'kernel'
    # into the dict handed in by the caller.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both hyperparameters are sampled log-uniformly with log-space bounds -30 and 0.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    # NOTE(review): no `rstate` is passed to fmin, so the search is unseeded;
    # presumably a re-run can land on different hyperparameters.
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [02:07<00:00,  2.35trial/s, best loss: 0.10209195071994584]
Best hyperparameters: {'alpha': 2.9143654132589253e-13, 'gamma': 9.452129457528114e-10}
Loss: 0.10209195071994584


In [11]:
# Score kernel ridge with the extended Gaussian kernel on the IDM_CSE_lexi
# representation against D_etot, using 2-fold cross-validation on a
# precomputed similarity matrix.
X_train = dataset_dict['IDM_CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

# Same values as the "Best hyperparameters" printed by the extended-Gaussian
# tuning cell for this representation/target.
params = {'alpha': 3.675729391358022e-12, 'beta': 7.50981822257169e-09, 'epsilon': 1.0596237543876683e-08, 'gamma': 9.718114533626386e-14}
alpha, beta, epsilon, gamma = (params[key] for key in ('alpha', 'beta', 'epsilon', 'gamma'))

# X_train is compared against itself; the result is handed to KernelRidge as a
# precomputed kernel.
similarity_matrix = create_similarity_matrix_nb(
    X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
)
krr_model = KernelRidge(alpha=alpha, kernel='precomputed')

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(
        krr_model,
        similarity_matrix,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=2,
    )
    # Scores come back negated; undo that to get plain MAE per fold.
    mae_scores = -neg_mae
    avg_mae = mae_scores.mean()

for fold_no, fold_mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold_no}: MAE = {fold_mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.10372669507359933
Fold 2: MAE = 0.10435532744275194
Average MAE across all folds: 0.10404101125817564


In [10]:
# Tuning: TPE search over the extended Gaussian kernel hyperparameters and the
# ridge penalty for the IDM_CSE_lexi representation against D_etot.

X_train = dataset_dict['IDM_CSE_lexi'].to_numpy()
y_train = D_etot.to_numpy()

def objective(params):
    """Score one sampled hyperparameter set with 2-fold cross-validated MAE.

    Args:
        params: dict with keys 'beta', 'epsilon', 'gamma' (forwarded to the
            similarity-matrix builder) and 'alpha' (KernelRidge penalty).

    Returns:
        dict with 'loss' (mean MAE across the folds) and 'status' (STATUS_OK).

    Uses the module-level X_train and y_train as they are when called.
    """
    beta, epsilon, gamma, alpha = (params[key] for key in ('beta', 'epsilon', 'gamma', 'alpha'))
    # The similarity matrix depends on the sampled values, so it is rebuilt on every trial.
    gram = create_similarity_matrix_nb(
        X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
    )
    estimator = KernelRidge(alpha=alpha, kernel='precomputed')
    fold_scores = cross_val_score(estimator, gram, y_train, scoring='neg_mean_absolute_error', cv=2)
    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}

# Every hyperparameter is sampled log-uniformly with log-space bounds -30 and 2.
param_space = {
    label: hp.loguniform(label, -30, 2)
    for label in ('gamma', 'epsilon', 'beta', 'alpha')
}

trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials run.
    warnings.filterwarnings('ignore')
    best = fmin(
        fn=objective,
        space=param_space,
        algo=tpe.suggest,  # tree parzen estimator
        max_evals=300,
        trials=trials,
    )

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [07:27<00:00,  1.49s/trial, best loss: 0.10404101125817564]
Best hyperparameters: {'alpha': 3.675729391358022e-12, 'beta': 7.50981822257169e-09, 'epsilon': 1.0596237543876683e-08, 'gamma': 9.718114533626386e-14}
Loss: 0.10404101125817564


## Coulomb Matrix ##

In [10]:
# Load the CM_rep.csv feature table and preview its first three rows.
cm_rep_csv = "../data/coronene_training_data/CM_rep.csv"
CM_rep = pd.read_csv(cm_rep_csv)
cm_preview = CM_rep.head(3)
display(cm_preview)

Unnamed: 0,coord0,coord1,coord2,coord3,coord4,coord5,coord6,coord7,coord8,coord9,...,coord656,coord657,coord658,coord659,coord660,coord661,coord662,coord663,coord664,coord665
0,53.358707,34.053598,53.358707,34.368323,19.751342,53.358707,12.913381,19.751462,9.875736,53.358707,...,0.402831,0.119825,0.399055,0.14694,0.103771,0.20754,0.107468,0.207544,0.119824,0.5
1,53.358707,19.751342,53.358707,12.913317,8.552605,53.358707,19.885231,11.390971,9.539765,53.358707,...,0.399079,0.107396,0.119824,0.402834,0.107468,0.146938,0.146572,0.20754,0.103772,0.5
2,53.358707,34.368245,53.358707,19.751462,17.105333,53.358707,19.751342,17.105179,9.875736,53.358707,...,0.207545,0.119824,0.14657,0.402824,0.107395,0.107468,0.207542,0.14694,0.399078,0.5


In [11]:
# Evaluate an RBF kernel ridge model on the CM_rep features with 2-fold
# cross-validation, reporting per-fold and mean RMSE.
X_train = CM_rep.to_numpy()
# Use the DD_etot frame loaded in the data-loading cell. The name
# `delta_delta_total_energy` used here before is not defined in any visible
# cell, so this cell raised NameError on Restart & Run All.
# NOTE(review): assumes DD_etot (DD_e_tot.csv) is the target the old name
# referred to — confirm, and re-check the recorded RMSE after re-running.
y_train = DD_etot.to_numpy()

params = {'alpha': 4.6e-11, 'gamma': 2.8e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

# The scorer returns negated MSE per fold; negate and take the square root to
# obtain RMSE per fold.
neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=2)
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

# Folds are numbered from 0 in this cell's output.
for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 1.0627929185395568
fold 1: rmse = 1.0286874440862068
Average rmse: 1.045740181312882


## MBDF ##

In [6]:
# Evaluate an RBF kernel ridge model on the MBDF features with 2-fold
# cross-validation, reporting per-fold and mean RMSE.
X_train = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',')
# Use the D_etot frame loaded in the data-loading cell. The name
# `delta_total_energy` used here before is not defined in any visible cell, so
# this cell raised NameError on Restart & Run All.
# NOTE(review): assumes D_etot (delta_total_energy.csv) is the target the old
# name referred to — confirm, and re-check the recorded RMSE after re-running.
y_train = D_etot.to_numpy()

params = {'alpha': 5.300500381866468e-13, 'gamma': 9.287160666894478e-05, 'kernel': 'rbf'}
model = KernelRidge(**params)

with warnings.catch_warnings():
    # Blanket-suppress warnings emitted while fitting and scoring.
    warnings.filterwarnings('ignore')
    # The scorer returns negated MSE per fold; negate and take the square root
    # to obtain RMSE per fold.
    neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=2)
    rmse_scores = np.sqrt(-neg_mse_scores)
    mean_rmse_score = rmse_scores.mean()

# Folds are numbered from 0 in this cell's output.
for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 0.22988854923839755
fold 1: rmse = 0.22332026844768935
Average rmse: 0.22660440884304345


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
