In [1]:
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
import numpy as np
import pandas as pd
from IPython.display import display
import copy

import sys
sys.path.append('../Data')

## Load Dataset ##

In [3]:
# Load Data
#
# Every CSV lives in the same directory and shares the "[Benz] " filename
# prefix, so the location is spelled out once instead of in every read call.
BENZENE_DATA_DIR = '../Data/Benzene Training Data'


def _read_benzene_csv(name):
    """Read '<BENZENE_DATA_DIR>/[Benz] <name>.csv' into a pandas DataFrame."""
    return pd.read_csv(f'{BENZENE_DATA_DIR}/[Benz] {name}.csv')


# "ANM_X" feature table and its lexi / lexi_nd / sorted / coulomb variants
X = _read_benzene_csv('ANM_X')
X_lexi = _read_benzene_csv('ANM_X_lexi')
X_lexi_nd = _read_benzene_csv('ANM_X_lexi_nd')
X_sorted = _read_benzene_csv('ANM_X_sorted')
X_coulomb = _read_benzene_csv('ANM_X_coulomb')

# "inv_dist_X" feature table and the same five variants
inv_dist_X = _read_benzene_csv('inv_dist_X')
inv_dist_X_lexi = _read_benzene_csv('inv_dist_X_lexi')
inv_dist_X_lexi_nd = _read_benzene_csv('inv_dist_X_lexi_nd')
inv_dist_X_sorted = _read_benzene_csv('inv_dist_X_sorted')
inv_dist_X_coulomb = _read_benzene_csv('inv_dist_X_coulomb')

# "rand_X" feature table and the same five variants
rand_X = _read_benzene_csv('rand_X')
rand_X_lexi = _read_benzene_csv('rand_X_lexi')
rand_X_lexi_nd = _read_benzene_csv('rand_X_lexi_nd')
rand_X_sorted = _read_benzene_csv('rand_X_sorted')
rand_X_coulomb = _read_benzene_csv('rand_X_coulomb')

# y_* tables (y_delta_energy is the one used as the regression target below)
y_energy = _read_benzene_csv('y_energy')
y_elec = _read_benzene_csv('y_elec')
y_delta_elec = _read_benzene_csv('y_delta_elec')
y_delta_energy = _read_benzene_csv('y_delta_energy')

## Define Kernel ##

In [4]:
def extended_gaussian_kernel(x, y, params):
    """
    Calculates the similarity between two vectors using an extended gaussian kernel.
    The kernel takes into account distance between vectors, norm difference, and angular difference:

        phi = exp(-gamma * ||x - y||^2 / 2
                  - epsilon * (||x|| - ||y||)^2
                  - beta * (1 - cos(theta)^2))

    where theta is the angle between x and y.

    Args:
        x (array-like): Input vector x.
        y (array-like): Input vector y (same length as x).
        params (dict): Dictionary of hyperparameters:
        - gamma (float): Hyperparameter for the distance term.
        - epsilon (float): Hyperparameter for the norm difference term.
        - beta (float): Hyperparameter for the angular difference term.
        Any other keys in the dictionary are ignored.

    Returns:
        float: Similarity value between the input vectors.

    Notes:
        If either vector has zero norm the angle between them is undefined.
        In that case the angular penalty is skipped (instead of dividing by
        zero and returning NaN), so two zero vectors have similarity 1 and
        the distance and norm-difference terms still apply otherwise.
    """

    gamma = params['gamma']
    epsilon = params['epsilon']
    beta = params['beta']

    # Accept lists/tuples as well as arrays; `x - y` below needs array semantics.
    x = np.asarray(x)
    y = np.asarray(y)

    x_norm = np.linalg.norm(x)
    y_norm = np.linalg.norm(y)
    distance = np.linalg.norm(x - y)

    norm_product = x_norm * y_norm
    if norm_product == 0:
        # Angle undefined for a zero vector: contribute no angular penalty.
        angular_term = 0.0
    else:
        # Rounding can push |cos_theta| marginally above 1, which would make
        # the penalty slightly negative; clamp it to the valid cosine range.
        cos_theta = np.clip(np.dot(x, y) / norm_product, -1.0, 1.0)
        angular_term = 1 - cos_theta**2

    phi = np.exp(-gamma * (distance**2)/2 - epsilon * (x_norm - y_norm)**2 - beta * angular_term)
    return phi


## Build Model ##

### Extended Gaussian Kernel ###

In [5]:
def create_similarity_matrix(X_ref, X_query, similarity_kernel, params):
    """
    Create a similarity matrix using a specified similarity kernel.

    Args:
        X_ref (array-like): Reference training examples, one example per row.
        X_query (array-like): Query input data to be compared with X_ref, one example per row.
        similarity_kernel (function): Function called as
            similarity_kernel(query_row, ref_row, params) that returns the
            similarity between two vectors.
        params (dict): Dictionary of hyperparameters for the similarity kernel.

    Returns:
        numpy.ndarray: Similarity matrix of shape (len(X_query), len(X_ref));
        entry [i, j] is the similarity between X_query[i] and X_ref[j].
    """

    # Coerce once so lists and DataFrames are indexed by row like ndarrays
    # (a DataFrame's own [] operator would look up a column label instead).
    X_ref = np.asarray(X_ref)
    X_query = np.asarray(X_query)

    similarity_matrix = np.zeros((X_query.shape[0], X_ref.shape[0]))
    for i, query_row in enumerate(X_query):
        for j, ref_row in enumerate(X_ref):
            similarity_matrix[i, j] = similarity_kernel(query_row, ref_row, params)

    return similarity_matrix

In [10]:
# Convert the rand_X features and the y_delta_energy target to numpy arrays
X_train = rand_X.to_numpy()
y_train = y_delta_energy.to_numpy()

# Hyperparameters: gamma/epsilon/beta are read by extended_gaussian_kernel;
# alpha is the regularization strength passed to KernelRidge below
params = {'gamma': 4e-10, 'epsilon': 0.0004, 'beta': 2.333333333333333e-07, 'alpha': 1e-15}
alpha = params['alpha']

# Pairwise similarities of the training set with itself (square matrix)
similarity_matrix = create_similarity_matrix(X_train, X_train, extended_gaussian_kernel, params)

# KernelRidge fed with the precomputed similarity matrix instead of raw features
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

# Create a KFold object for 3-fold cross-validation (shuffled, fixed seed)
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Perform cross-validation; the scorer returns the negated MSE per fold,
# so flip the sign and take the square root to get the RMSE per fold
mse_scores = cross_val_score(krr_model, similarity_matrix, y_train, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-mse_scores)

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the RMSE for each fold
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average RMSE
print(f"Average RMSE across all folds: {avg_rmse}")

Fold 1: RMSE = 0.09636527369718939
Fold 2: RMSE = 0.05760249223454532
Fold 3: RMSE = 0.03285866912213284
Average RMSE across all folds: 0.06227547835128919


In [45]:
# Hold out 20% of the samples for validation (shuffled, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y_delta_energy, test_size=0.2, shuffle=True, random_state=42)
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_val = X_val.to_numpy()
y_val = y_val.to_numpy()

# gamma/epsilon/beta are read by extended_gaussian_kernel; alpha is the
# regularization strength passed to KernelRidge
params = {'gamma': 4e-10, 'epsilon': 0.0004, 'beta': 2.333333333333333e-07, 'alpha': 1e-15}
alpha = params['alpha']
krr_model = KernelRidge(kernel='precomputed', alpha=alpha)

# Train-vs-train similarities: square matrix the model is fitted on
similarity_matrix = create_similarity_matrix(X_train, X_train, extended_gaussian_kernel, params)

print(similarity_matrix.shape)
krr_model.fit(similarity_matrix, y_train)

# Validation-vs-train similarities: one row per validation sample,
# one column per training sample
prediction_matrix = create_similarity_matrix(X_train, X_val, extended_gaussian_kernel, params)
print(prediction_matrix.shape)
# NOTE(review): y_pred is not compared against y_val anywhere in this cell
y_pred = krr_model.predict(prediction_matrix)

(13, 13)
(13, 13)
(4, 13)


### Normal Gaussian Kernel ###

In [47]:
# Use the X features and the y_delta_energy target as numpy arrays
X_train = X.to_numpy()
y_train = y_delta_energy.to_numpy()

# Kernel ridge regression with scikit-learn's built-in Gaussian (RBF) kernel
params = {'alpha': 1.5306122448979593e-09, 'gamma': 2.2857142857142856e-06, 'kernel': 'rbf'}
KRR_model = KernelRidge(**params)

# 5-fold cross-validation; the scorer returns the negated MSE per fold,
# so flip the sign and take the square root to get the RMSE per fold
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = cross_val_score(KRR_model, X_train, y_train, scoring='neg_mean_squared_error', cv=k_fold)
rmse_scores = np.sqrt(-mse_scores)

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the RMSE for each fold
print("Gaussian (RBF) KRR:")
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average RMSE (the value is an average of per-fold RMSEs, not an MSE)
print(f"Average RMSE across all folds: {avg_rmse}")

Polynomial KRR:
Fold 1: RMSE = 1.1309596512752718
Fold 2: RMSE = 0.27686803695643736
Fold 3: RMSE = 2.3547244961042795
Fold 4: RMSE = 1.7673938591789542
Fold 5: RMSE = 1.4629843706649392
Average MSE across all folds: 1.3985860828359764
