In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error
import warnings
import numpy as np
import numba
import pandas as pd
from IPython.display import display
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


from helper_code.custom_kernel import create_similarity_matrix_nb, extended_gaussian_kernel_nb


In [2]:
# Read every input table from the coronene training-data folder into a dict
# keyed by the table's file stem.
input_dataset = ['c', 'c_lexi', 'CE', 'CE_lexi', 'CSE', 'CSE_lexi']
dataset_dict = {
    name: pd.read_csv(f'../data/coronene_training_data/{name}.csv')
    for name in input_dataset
}

# Tables that the model cells in this notebook use as regression targets.
delta_total_energy = pd.read_csv('../data/coronene_training_data/delta_total_energy.csv')
DD_etot = pd.read_csv('../data/coronene_training_data/DD e_tot.csv')

## CSE with Gaussian Kernel ##

In [14]:
# 2-fold cross-validated MAE of an RBF kernel ridge model:
# CSE_lexi features -> delta_total_energy.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_total_energy

# Fixed hyperparameters for this evaluation.
params = {
    'alpha': 1.52139892542621e-12,
    'gamma': 1.0931156160148402e-10,
    'kernel': 'rbf',
}
model = KernelRidge(**params)

# The scorer returns negated MAE values, so flip the sign to recover the MAE.
neg_mae_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=2,
)
mae_scores = -neg_mae_scores
avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.1046758425246451
Fold 2: MAE = 0.10007086730725562
Average MAE across all folds: 0.10237335491595036


In [11]:
# Hyperparameter search for an RBF kernel ridge model:
# CSE_lexi features -> delta_total_energy.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_total_energy.to_numpy()

def objective(params):
    """Return the hyperopt result for one sampled hyperparameter set.

    Parameters
    ----------
    params : dict
        Sampled ``alpha`` and ``gamma`` values (see ``space``).

    Returns
    -------
    dict
        ``loss`` is the mean MAE over a 2-fold cross-validation on
        ``X_train`` / ``y_train``; ``status`` is ``STATUS_OK``.
    """
    # Build a merged copy rather than writing 'kernel' into ``params``, so the
    # dict supplied by the caller is left unmodified.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer is negated MAE; hyperopt minimises, so return the positive mean.
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both parameters are sampled log-uniformly; the bounds are given in log space.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

# NOTE(review): fmin is called without a seed, so repeated runs may report
# different best values.
trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials are evaluated.
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=500,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 500/500 [00:34<00:00, 14.62trial/s, best loss: 0.10237335491595036]
Best hyperparameters: {'alpha': 1.52139892542621e-12, 'gamma': 1.0931156160148402e-10}
Loss: 0.10237335491595036


In [18]:
# 2-fold cross-validated MAE of an RBF kernel ridge model:
# CSE_lexi features -> DD_etot.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = DD_etot.to_numpy()

# Fixed hyperparameters for this evaluation.
params = {'alpha': 0.0023886633872422748, 'gamma': 2.473364869876525e-07, 'kernel': 'rbf'}
model = KernelRidge(**params)

# cv=2 is the same splitting argument the hyperopt objective for this target
# passes, so the scores here are directly comparable with the search loss.
# (A shuffled KFold object used to be constructed here but was never handed to
# cross_val_score; it has been removed so the cell no longer suggests that a
# shuffled split is being evaluated.)
neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
# The scorer returns negated MAE values; flip the sign to recover the MAE.
mae_scores = -neg_mae_scores
avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores):
    print(f"Fold {fold+1}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.09883994208937469
Fold 2: MAE = 0.09401657630921811
Average MAE across all folds: 0.0964282591992964


In [16]:
# Hyperparameter search for an RBF kernel ridge model:
# CSE_lexi features -> DD_etot.
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = DD_etot.to_numpy()

def objective(params):
    """Return the hyperopt result for one sampled hyperparameter set.

    Parameters
    ----------
    params : dict
        Sampled ``alpha`` and ``gamma`` values (see ``space``).

    Returns
    -------
    dict
        ``loss`` is the mean MAE over a 2-fold cross-validation on
        ``X_train`` / ``y_train``; ``status`` is ``STATUS_OK``.
    """
    # Build a merged copy rather than writing 'kernel' into ``params``, so the
    # dict supplied by the caller is left unmodified.
    model = KernelRidge(**{**params, 'kernel': 'rbf'})
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer is negated MAE; hyperopt minimises, so return the positive mean.
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# Both parameters are sampled log-uniformly; the bounds are given in log space.
space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

# NOTE(review): fmin is called without a seed, so repeated runs may report
# different best values.
trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials are evaluated.
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=500,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 500/500 [00:35<00:00, 14.21trial/s, best loss: 0.0964282591992964] 
Best hyperparameters: {'alpha': 0.0023886633872422748, 'gamma': 2.473364869876525e-07}
Loss: 0.0964282591992964


## CSE with Extended Gaussian Kernel ##

In [8]:
# 2-fold cross-validated MAE of kernel ridge regression with the extended
# Gaussian kernel: CSE_lexi features -> DD_etot.
X_train = dataset_dict['CSE_lexi'].to_numpy(dtype=np.float64)
y_train = DD_etot.to_numpy()

# Fixed hyperparameters for this evaluation.
params = {
    'alpha': 0.0003077632285837624,
    'beta': 1.7201150692798052e-08,
    'epsilon': 9.489197381921544e-09,
    'gamma': 1.0566043571980399e-07,
}
alpha, beta, epsilon, gamma = (params[key] for key in ('alpha', 'beta', 'epsilon', 'gamma'))

# Matrix handed to the estimator as a precomputed kernel; alpha is the only
# hyperparameter the estimator itself receives.
similarity_matrix = create_similarity_matrix_nb(
    X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma
)
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

# Warnings are silenced only around the cross-validation call.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    neg_mae = cross_val_score(
        krr_model,
        similarity_matrix,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=2,
    )

# The scorer returns negated MAE values; flip the sign to recover the MAE.
mae_scores = -neg_mae
avg_mae = mae_scores.mean()

for fold, mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold}: MAE = {mae}")
print(f"Average MAE across all folds: {avg_mae}")

Fold 1: MAE = 0.09527504567093169
Fold 2: MAE = 0.09346075285776559
Average MAE across all folds: 0.09436789926434863


In [7]:
# Hyperparameter search for kernel ridge regression with the extended Gaussian
# kernel: CSE_lexi features -> DD_etot.
# Cast to float64 explicitly, as the evaluation cell for this kernel does, so
# both cells pass the same dtype to the numba-based helper.
X_train = dataset_dict['CSE_lexi'].to_numpy(dtype=np.float64)
y_train = DD_etot.to_numpy()

def objective(params):
    """Return the hyperopt result for one sampled hyperparameter set.

    Parameters
    ----------
    params : dict
        Sampled ``alpha``, ``beta``, ``epsilon`` and ``gamma`` values
        (see ``param_space``).

    Returns
    -------
    dict
        ``loss`` is the mean MAE over a 2-fold cross-validation of a
        precomputed-kernel ``KernelRidge``; ``status`` is ``STATUS_OK``.
    """
    beta = params['beta']
    epsilon = params['epsilon']
    gamma = params['gamma']
    alpha = params['alpha']
    # The matrix depends on beta/epsilon/gamma, so it is rebuilt for every trial.
    similarity_matrix = create_similarity_matrix_nb(X_train, X_train, extended_gaussian_kernel_nb, beta, epsilon, gamma)
    krr_model = KernelRidge(kernel='precomputed', alpha=alpha)
    neg_mae_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=2)
    # The scorer is negated MAE; hyperopt minimises, so return the positive mean.
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

# All four parameters are sampled log-uniformly; the bounds are given in log space.
param_space = {
    'gamma': hp.loguniform('gamma', -30, 2),
    'epsilon': hp.loguniform('epsilon', -30, 2),
    'beta': hp.loguniform('beta', -30, 2),
    'alpha': hp.loguniform('alpha', -30, 2)
}

# NOTE(review): fmin is called without a seed, so repeated runs may report
# different best values.
trials = Trials()
with warnings.catch_warnings():
    # Silence every warning raised while the trials are evaluated.
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=param_space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=500,
                trials=trials)

print("CSE_lexi, Extended Gaussian")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 500/500 [01:37<00:00,  5.15trial/s, best loss: 0.09436789926434863]
CSE_lexi, Extended Gaussian
Best hyperparameters: {'alpha': 0.0003077632285837624, 'beta': 1.7201150692798052e-08, 'epsilon': 9.489197381921544e-09, 'gamma': 1.0566043571980399e-07}
Loss: 0.09436789926434863


## Coulomb Matrix ##

In [10]:
# Read the CM_rep table from the coronene training-data folder and show its
# first three rows.
CM_rep = pd.read_csv("../data/coronene_training_data/CM_rep.csv")
display(CM_rep.head(n=3))

Unnamed: 0,coord0,coord1,coord2,coord3,coord4,coord5,coord6,coord7,coord8,coord9,...,coord656,coord657,coord658,coord659,coord660,coord661,coord662,coord663,coord664,coord665
0,53.358707,34.053598,53.358707,34.368323,19.751342,53.358707,12.913381,19.751462,9.875736,53.358707,...,0.402831,0.119825,0.399055,0.14694,0.103771,0.20754,0.107468,0.207544,0.119824,0.5
1,53.358707,19.751342,53.358707,12.913317,8.552605,53.358707,19.885231,11.390971,9.539765,53.358707,...,0.399079,0.107396,0.119824,0.402834,0.107468,0.146938,0.146572,0.20754,0.103772,0.5
2,53.358707,34.368245,53.358707,19.751462,17.105333,53.358707,19.751342,17.105179,9.875736,53.358707,...,0.207545,0.119824,0.14657,0.402824,0.107395,0.107468,0.207542,0.14694,0.399078,0.5


In [11]:
# 2-fold cross-validated RMSE of an RBF kernel ridge model on the CM_rep features.
X_train = CM_rep.to_numpy()
# The name previously used here (delta_delta_total_energy) is not defined by
# any cell of this notebook, so the cell failed on a fresh kernel. DD_etot is
# the delta-delta target table loaded at the top of the notebook.
# NOTE(review): confirm DD_etot is the intended target for this representation.
y_train = DD_etot.to_numpy()

# Fixed hyperparameters for this evaluation.
params = {'alpha': 4.6e-11, 'gamma': 2.8e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

# The scorer returns negated MSE per fold; negate and take the square root to
# get the per-fold RMSE.
neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=2)
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 1.0627929185395568
fold 1: rmse = 1.0286874440862068
Average rmse: 1.045740181312882


## MBDF ##

In [6]:
# 2-fold cross-validated RMSE of an RBF kernel ridge model:
# MBDF features (read from CSV) -> delta_total_energy.
X_train = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',')
y_train = delta_total_energy

# Fixed hyperparameters for this evaluation.
params = {
    'alpha': 5.300500381866468e-13,
    'gamma': 9.287160666894478e-05,
    'kernel': 'rbf',
}
model = KernelRidge(**params)

# Warnings are silenced only around the cross-validation call.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    neg_mse_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=2,
    )

# The scorer returns negated MSE per fold; negate and take the square root to
# get the per-fold RMSE.
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 0.22988854923839755
fold 1: rmse = 0.22332026844768935
Average rmse: 0.22660440884304345


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
