In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
import copy

from helper_code.custom_kernel import *

In [2]:
input_dataset = ['c', 'c_lexi', 'c_lexi_nd', 'CE', 'CE_lexi', 'CE_lexi_nd', 
            'CSE', 'CSE_lexi', 'CSE_lexi_nd']
dataset_dict = {}

for data in input_dataset:
    dataset_dict[data] = pd.read_csv(f'../data/coronene_training_data/{data}.csv')

delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_total_energy.csv')
delta_delta_total_energy = pd.read_csv(f'../data/coronene_training_data/delta_delta_total_energy.csv')

## CSE with Gaussian Kernel ##

In [3]:
X_train = dataset_dict['CSE'].to_numpy()
y_train = delta_total_energy

params = {'alpha': 4.6e-11, 'gamma': 2.8e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kfold)
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 0.0474685996451559
fold 1: rmse = 0.047235484854125674
Average rmse: 0.04735204224964079


In [5]:
X_train = dataset_dict['CSE'].to_numpy()
y_train = delta_delta_total_energy

params = {'alpha': 4.6e-11, 'gamma': 2.8e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

kfold = KFold(n_splits=2, shuffle=True, random_state=42)
neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kfold)
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 0.7333172896586568
fold 1: rmse = 0.70772046532868
Average rmse: 0.7205188774936684


## CSE with XGB Regressor ##

In [3]:
X_train = dataset_dict['CSE'].to_numpy()
y_train = delta_delta_total_energy

params = {'objective': 'reg:squarederror', 
          'n_estimators': 1000, 
          'learning_rate': 0.01, 
          'max_depth': 10,
          'gamma': 0.1,
          'alpha': 0.1}
model = xgb.XGBRegressor(**params)

neg_mae_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=2)
mae_scores = -neg_mae_scores
mean_mae_score = mae_scores.mean()

for fold, score in enumerate(mae_scores):
    print(f"fold {fold}: mae = {score}")
print(f"Average mae: {mean_mae_score}")

fold 0: mae = 0.6651837469360321
fold 1: mae = 0.6597344131515078
Average mae: -0.66245908004377


## Pure Extended Gaussian ##

In [9]:
X_train = dataset_dict['CSE_lexi'].to_numpy()
y_train = delta_delta_total_energy.to_numpy()

params = {'alpha': 5.948688011403127e-11, 'beta': 2.8505311767038538e-12, 'epsilon': 4.727331943165293e-07, 'gamma': 5.298346661430054e-10}
alpha = params['alpha']

similarity_matrix = vectorized_similarity_matrix(X_train, X_train, vectorized_extended_gaussian_kernel, params)
# print(similarity_matrix.shape)
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    mse_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_squared_error', cv=kfold)
    rmse_scores = np.sqrt(-mse_scores)
    avg_rmse = rmse_scores.mean()

for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")
print(f"Average RMSE across all folds: {avg_rmse}")

Fold 1: RMSE = 0.14857817086135547
Fold 2: RMSE = 0.14786487793440367
Average RMSE across all folds: 0.14822152439787956


## Coulombic Matrix ##

In [10]:
CM_rep = pd.read_csv("../data/coronene_training_data/CM_rep.csv")
display(CM_rep.head(3))

Unnamed: 0,coord0,coord1,coord2,coord3,coord4,coord5,coord6,coord7,coord8,coord9,...,coord656,coord657,coord658,coord659,coord660,coord661,coord662,coord663,coord664,coord665
0,53.358707,34.053598,53.358707,34.368323,19.751342,53.358707,12.913381,19.751462,9.875736,53.358707,...,0.402831,0.119825,0.399055,0.14694,0.103771,0.20754,0.107468,0.207544,0.119824,0.5
1,53.358707,19.751342,53.358707,12.913317,8.552605,53.358707,19.885231,11.390971,9.539765,53.358707,...,0.399079,0.107396,0.119824,0.402834,0.107468,0.146938,0.146572,0.20754,0.103772,0.5
2,53.358707,34.368245,53.358707,19.751462,17.105333,53.358707,19.751342,17.105179,9.875736,53.358707,...,0.207545,0.119824,0.14657,0.402824,0.107395,0.107468,0.207542,0.14694,0.399078,0.5


In [11]:
X_train = CM_rep.to_numpy()
y_train = delta_delta_total_energy

params = {'alpha': 4.6e-11, 'gamma': 2.8e-08, 'kernel': 'rbf'}
model = KernelRidge(**params)

neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=2)
rmse_scores = np.sqrt(-neg_mse_scores)
mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 1.0627929185395568
fold 1: rmse = 1.0286874440862068
Average rmse: 1.045740181312882


## MBDF ##

In [6]:
X_train = np.genfromtxt("../data/coronene_training_data/MBDF.csv", delimiter=',')
y_train = delta_total_energy

params = {'alpha': 5.300500381866468e-13, 'gamma': 9.287160666894478e-05, 'kernel': 'rbf'}
model = KernelRidge(**params)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    neg_mse_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=2)
    rmse_scores = np.sqrt(-neg_mse_scores)
    mean_rmse_score = rmse_scores.mean()

for fold, score in enumerate(rmse_scores):
    print(f"fold {fold}: rmse = {score}")

print(f"Average rmse: {mean_rmse_score}")

fold 0: rmse = 0.22988854923839755
fold 1: rmse = 0.22332026844768935
Average rmse: 0.22660440884304345


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
