In [1]:
import sys
sys.path.append('../..')
sys.path.append('../data')
sys.path.append('../../helper_code')

from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
import copy

from helper_code.custom_kernel import *

In [27]:
# Load Data
#
# Every table sits in one directory and carries the same "[Benz] " filename
# prefix, so the location is spelled out once and each CSV is addressed by
# its short stem. The resolved paths are identical to the literal ones
# previously repeated on every line.

BENZENE_DATA_DIR = '../data/benzene_training_data'


def load_benzene_csv(stem):
    """Read ``<BENZENE_DATA_DIR>/[Benz] <stem>.csv`` into a DataFrame.

    Parameters
    ----------
    stem : str
        File name without the ``[Benz] `` prefix and the ``.csv`` suffix,
        for example ``'ANM_X_lexi'``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with ``pd.read_csv`` default options.
    """
    return pd.read_csv(f'{BENZENE_DATA_DIR}/[Benz] {stem}.csv')


# "ANM_X*" feature tables
X = load_benzene_csv('ANM_X')
X_inv = load_benzene_csv('ANM_X_inv')
X_lexi = load_benzene_csv('ANM_X_lexi')
X_lexi_nd = load_benzene_csv('ANM_X_lexi_nd')
X_sorted = load_benzene_csv('ANM_X_sorted')
X_coulomb = load_benzene_csv('ANM_X_coulomb')

# "*_square_eig" variants of the ANM tables
X_square_eig = load_benzene_csv('ANM_X_square_eig')
X_inv_square_eig = load_benzene_csv('ANM_X_inv_square_eig')
X_lexi_square_eig = load_benzene_csv('ANM_X_lexi_square_eig')
X_lexi_inv_square_eig = load_benzene_csv('ANM_X_lexi_inv_square_eig')

# "inv_dist_X*" feature tables
inv_dist_X = load_benzene_csv('inv_dist_X')
inv_dist_X_lexi = load_benzene_csv('inv_dist_X_lexi')
inv_dist_X_lexi_nd = load_benzene_csv('inv_dist_X_lexi_nd')
inv_dist_X_sorted = load_benzene_csv('inv_dist_X_sorted')
inv_dist_X_coulomb = load_benzene_csv('inv_dist_X_coulomb')

# "rand_X*" feature tables
rand_X = load_benzene_csv('rand_X')
rand_X_lexi = load_benzene_csv('rand_X_lexi')
rand_X_lexi_nd = load_benzene_csv('rand_X_lexi_nd')
rand_X_sorted = load_benzene_csv('rand_X_sorted')
rand_X_coulomb = load_benzene_csv('rand_X_coulomb')

# "y_*" tables
y_energy = load_benzene_csv('y_energy')
y_elec = load_benzene_csv('y_elec')
y_delta_elec = load_benzene_csv('y_delta_elec')
y_delta_energy = load_benzene_csv('y_delta_energy')

# NOTE: the file on disk has an "_ANM" suffix that the variable name omits.
y_delta_delta = load_benzene_csv('y_delta_delta_ANM')

## Build Model ##

In [37]:
# 5-fold cross-validation of an RBF kernel ridge model, fitting the
# X_lexi_inv_square_eig features to the y_delta_delta table.
# The alpha/gamma values match the "Best Hyperparameters" printed by the
# finest grid search in the Parameter Tuning section of this notebook.
params = dict(
    alpha=2.612244897959184e-05,
    gamma=2.9693877551020406e-07,
    kernel='rbf',
)
KRR_model = KernelRidge(**params)

# Shuffled folds with a fixed seed so the split is repeatable.
k_fold = KFold(n_splits=5, shuffle=True, random_state=10)

# Every warning raised while scoring is silenced for the duration of the block.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mse_scores = cross_val_score(
        KRR_model,
        X_lexi_inv_square_eig,
        y_delta_delta,
        scoring='neg_mean_squared_error',
        cv=k_fold,
    )
    # The scorer yields negated MSE per fold; flip the sign before the root.
    rmse_scores = np.sqrt(-mse_scores)
    avg_rmse = rmse_scores.mean()

# One line per fold (numbered from 1), then the mean of the per-fold RMSEs.
for fold_no, fold_rmse in enumerate(rmse_scores, start=1):
    print(f"Fold {fold_no}: RMSE = {fold_rmse}")
print(f"Average RMSE across all folds: {avg_rmse}")

Fold 1: RMSE = 0.013659913511825068
Fold 2: RMSE = 0.019794693132416878
Fold 3: RMSE = 0.02305659778442661
Fold 4: RMSE = 0.032870120712496925
Fold 5: RMSE = 0.0194987129365835
Average RMSE across all folds: 0.0217760076155498


In [38]:
# 3-fold cross-validation of kernel ridge regression on a precomputed
# similarity matrix produced by the project's extended_gaussian_kernel.
X_train = X_lexi_inv_square_eig.to_numpy()
y_train = y_delta_delta.to_numpy()

# Settings handed to the custom kernel; the 'alpha' entry is also reused
# below as the ridge regularisation strength.
params = dict(gamma=2.3e-08, epsilon=5e-13, beta=9.25e-07, alpha=3.85e-08)
alpha = params['alpha']

kf = KFold(n_splits=3, shuffle=True, random_state=10)
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

# Training set compared against itself.
# NOTE(review): create_similarity_matrix comes from helper_code.custom_kernel
# and is not shown here — presumably it returns an (n_samples, n_samples)
# matrix, which kernel='precomputed' requires; confirm against the helper.
similarity_matrix = create_similarity_matrix(
    X_train, X_train, extended_gaussian_kernel, params
)

mse_scores = cross_val_score(
    krr_model,
    similarity_matrix,
    y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Scores come back as negated MSE; negate again before taking the root.
rmse_scores = np.sqrt(-mse_scores)
avg_rmse = rmse_scores.mean()

for fold_no, fold_rmse in enumerate(rmse_scores, start=1):
    print(f"Fold {fold_no}: RMSE = {fold_rmse}")
print(f"Average RMSE across all folds: {avg_rmse}")

Fold 1: RMSE = 0.011576159345341187
Fold 2: RMSE = 0.027770892920602187
Fold 3: RMSE = 0.02140169570467881
Average RMSE across all folds: 0.02024958265687406


## Parameter Tuning ##

In [33]:
# Coarse search: 50 log-spaced candidates per axis from 1e-8 to 1e-3,
# i.e. 50 x 50 (alpha, gamma) pairs, each scored with 5-fold CV.
alpha_candidates = np.logspace(np.log10(1e-8), np.log10(1e-3), num=50)
gamma_candidates = np.logspace(np.log10(1e-8), np.log10(1e-3), num=50)

param_grid = dict(
    alpha=alpha_candidates,  # ridge (L2) regularisation strength
    gamma=gamma_candidates,  # coefficient of the Gaussian (RBF) kernel
    kernel=['rbf'],          # only the RBF kernel is searched
)

gaussian_KRR = KernelRidge()
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=gaussian_KRR,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)

# All warnings raised during fitting are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_lexi_inv_square_eig, y_delta_delta)

best_params = grid_search.best_params_
# best_score_ is the negated mean MSE over the folds; report it as a root.
best_score = np.sqrt(-grid_search.best_score_)

# Show the winning combination and its error
print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 2.3299518105153718e-05, 'gamma': 2.6826957952797275e-07, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.023302100364139308


In [34]:
# Narrowed log-spaced search: alpha in [1e-6, 1e-4], gamma in [1e-8, 1e-6],
# 50 candidates per axis, each pair scored with 5-fold CV.
alpha_candidates = np.logspace(np.log10(1e-6), np.log10(1e-4), num=50)
gamma_candidates = np.logspace(np.log10(1e-8), np.log10(1e-6), num=50)

param_grid = dict(
    alpha=alpha_candidates,  # ridge (L2) regularisation strength
    gamma=gamma_candidates,  # coefficient of the Gaussian (RBF) kernel
    kernel=['rbf'],          # only the RBF kernel is searched
)

gaussian_KRR = KernelRidge()
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=gaussian_KRR,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)

# All warnings raised during fitting are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_lexi_inv_square_eig, y_delta_delta)

best_params = grid_search.best_params_
# best_score_ is the negated mean MSE over the folds; report it as a root.
best_score = np.sqrt(-grid_search.best_score_)

# Show the winning combination and its error
print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 2.6826957952797274e-05, 'gamma': 2.94705170255181e-07, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.023299825360907887


In [35]:
# Linearly spaced search: alpha in [1e-5, 5e-5], gamma in [1e-7, 5e-7],
# 50 candidates per axis, each pair scored with 5-fold CV.
alpha_candidates = np.linspace(1e-5, 5e-5, num=50)
gamma_candidates = np.linspace(1e-7, 5e-7, num=50)

param_grid = dict(
    alpha=alpha_candidates,  # ridge (L2) regularisation strength
    gamma=gamma_candidates,  # coefficient of the Gaussian (RBF) kernel
    kernel=['rbf'],          # only the RBF kernel is searched
)

gaussian_KRR = KernelRidge()
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=gaussian_KRR,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)

# All warnings raised during fitting are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_lexi_inv_square_eig, y_delta_delta)

best_params = grid_search.best_params_
# best_score_ is the negated mean MSE over the folds; report it as a root.
best_score = np.sqrt(-grid_search.best_score_)

# Show the winning combination and its error
print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 2.6326530612244903e-05, 'gamma': 2.959183673469388e-07, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.023299202753581882


In [36]:
# Finest linear search: alpha in [2e-5, 3e-5], gamma in [2.5e-7, 3.5e-7],
# 50 candidates per axis, each pair scored with 5-fold CV.
alpha_candidates = np.linspace(2e-5, 3e-5, num=50)
gamma_candidates = np.linspace(2.5e-7, 3.5e-7, num=50)

param_grid = dict(
    alpha=alpha_candidates,  # ridge (L2) regularisation strength
    gamma=gamma_candidates,  # coefficient of the Gaussian (RBF) kernel
    kernel=['rbf'],          # only the RBF kernel is searched
)

gaussian_KRR = KernelRidge()
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=gaussian_KRR,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)

# All warnings raised during fitting are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_lexi_inv_square_eig, y_delta_delta)

best_params = grid_search.best_params_
# best_score_ is the negated mean MSE over the folds; report it as a root.
best_score = np.sqrt(-grid_search.best_score_)

# Show the winning combination and its error
print("Best Hyperparameters:", best_params)
print("Best Root Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 2.612244897959184e-05, 'gamma': 2.9693877551020406e-07, 'kernel': 'rbf'}
Best Root Mean Squared Error: 0.023299125797369036
