In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
import copy
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from helper_code.custom_kernel import *

In [2]:
input_dataset = ['c', 'c_lexi', 'c_lexi_nd', 'CE', 'CE_lexi', 'CE_lexi_nd', 'CSE', 'CSE_lexi', 'CSE_lexi_nd']
dataset_dict = {}

for data in input_dataset:
    dataset_dict[data] = pd.read_csv(f'../data/coronene_training_data/{data}.csv')

delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_total_energy.csv')
delta_delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_delta_total_energy.csv')

In [8]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

param_grid = {
    'alpha': np.logspace(np.log10(1e-13), np.log10(1e-3), num=101),  
    'gamma': np.logspace(np.log10(1e-13), np.log10(1e-3), num=101),  
    'kernel': ['rbf']
}

model = KernelRidge()
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    grid_search = RandomizedSearchCV(model, param_grid, n_iter=500, scoring='neg_mean_squared_error', cv=kfold)
    grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'kernel': 'rbf', 'gamma': 3.9810717055349776e-08, 'alpha': 1e-10}
Best Root Mean Squared Error: 0.1489627308333547


In [9]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

param_grid = {
    'alpha': np.logspace(np.log10(1e-9), np.log10(1e-7), num=21),  
    'gamma': np.logspace(np.log10(1e-11), np.log10(1e-9), num=21),  
    'kernel': ['rbf']
}

model = KernelRidge()
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kfold)
    grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 1e-09, 'gamma': 1e-09, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.15033871067431506


In [10]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

param_grid = {
    'alpha': np.logspace(np.log10(1e-11), np.log10(1e-9), num=21),  
    'gamma': np.logspace(np.log10(1e-9), np.log10(1e-7), num=21),  
    'kernel': ['rbf']
}

model = KernelRidge()
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kfold)
    grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 2.5118864315095823e-11, 'gamma': 1.9952623149688786e-08, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.14895567868685927


In [11]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

param_grid = {
    'alpha': np.linspace(1e-11, 5e-11, num=21),  
    'gamma': np.linspace(1e-8, 5e-8, num=21),  
    'kernel': ['rbf']
}

model = KernelRidge()
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=kfold)
    grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)

print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 4.6e-11, 'gamma': 2.8000000000000003e-08, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.14895286776797315


## Bayesian Opt ##

In [18]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=500,
                trials=trials)

print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 500/500 [06:51<00:00,  1.22trial/s, best loss: 0.10696031336577831]
Best hyperparameters: {'alpha': 1.6596764933651303e-07, 'gamma': 2.1567299387422978e-06}
Loss: 0.10696031336577831


In [19]:
X_train = dataset_dict['CSE_lexi_nd'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("CSE_lexi_nd")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [01:20<00:00,  1.25trial/s, best loss: 0.10695381401867467]
CSE_lexi_nd
Best hyperparameters: {'alpha': 1.5563301230102018e-07, 'gamma': 2.545548038867767e-06}
Loss: 0.10695381401867467


## Bayesian Opt -- Extended Gaussian ##

In [8]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):    
    alpha = params.pop('alpha') # the params passed into the kernel doesn't include regularzation
    similarity_matrix = vectorized_similarity_matrix(X_train, X_train, vectorized_extended_gaussian_kernel, params)
    krr_model = KernelRidge(kernel='precomputed', alpha=alpha)
    neg_mae_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

param_space = {
    'gamma': hp.loguniform('gamma', -30, 0),
    'epsilon': hp.loguniform('epsilon', -30, 0), 
    'beta': hp.loguniform('beta', -30, 0), 
    'alpha': hp.loguniform('alpha', -30, 0) 
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=param_space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)
    
print("CSE_lexi, Extended Gaussian")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [02:38<00:00,  1.59s/trial, best loss: 0.10622358626175384]
CSE_lexi, Extended Gaussian
Best hyperparameters: {'alpha': 5.948688011403127e-11, 'beta': 2.8505311767038538e-12, 'epsilon': 4.727331943165293e-07, 'gamma': 5.298346661430054e-10}
Loss: 0.10622358626175384


## CSE - XGBRegressor ##

In [6]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['objective'] = 'reg:squarederror'
    params['n_estimators'] = 100
    params['learning_rate'] = 0.05
    params['subsample'] = 0.8
    params['colsample_bytree'] = 0.8
    model = XGBRegressor(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'max_depth': hp.choice('max_depth', range(5, 25)),
    'gamma': hp.uniform('gamma', 0, 10),
    'alpha': hp.loguniform('alpha', -30, 0),
    'lambda': hp.loguniform('lambda', -30, 0)
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("CSE_xgboost")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [04:59<00:00,  3.00s/trial, best loss: 0.4910823526365557]
CSE_xgboost
Best hyperparameters: {'alpha': 5.573350082885807e-06, 'gamma': 0.49016228208008483, 'lambda': 0.03579084203303872, 'max_depth': 0}
Loss: 0.4910823526365557


## Bayesian Opt -- Coulomb Matrix ##

In [3]:
CM_rep = pd.read_csv("../data/coronene_training_data/CM_rep.csv")

In [7]:
X_train = CM_rep.to_numpy()
y_train = delta_delta_total_energy

def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=300,
                trials=trials)

print("CSE_lexi_nd")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 300/300 [05:21<00:00,  1.07s/trial, best loss: 0.581003222550443] 
CSE_lexi_nd
Best hyperparameters: {'alpha': 0.00011806857533747571, 'gamma': 0.00010582969605609149}
Loss: 0.581003222550443


## MBDF ##

In [3]:
X_train = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',')
y_train = delta_total_energy

In [5]:
def objective(params):
    params['kernel'] = 'rbf'
    model = KernelRidge(**params)
    neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3)
    return {'loss': -neg_mae_scores.mean(), 'status': STATUS_OK}

space = {
    'alpha': hp.loguniform('alpha', -30, 0),
    'gamma': hp.loguniform('gamma', -30, 0),
}

trials = Trials()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest, # tree parzen estimator
                max_evals=100,
                trials=trials)

print("MBDF")
print("Best hyperparameters:", best)
print("Loss:", trials.best_trial['result']['loss'])

100%|██████████| 100/100 [01:26<00:00,  1.16trial/s, best loss: 0.12194094530689155]
MBDF
Best hyperparameters: {'alpha': 5.300500381866468e-13, 'gamma': 9.287160666894478e-05}
Loss: 0.12194094530689155
