In [1]:
import sys
sys.path.append('../APDFT')
sys.path.append('../Data')

In [17]:
from pyscf import gto, scf, dft, cc
import numpy as np
import pandas as pd
import pyscf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import basis_set_exchange as bse
from FcMole import *
import os
import ast
from IPython.display import display

%load_ext autoreload
%autoreload 2
from AP_class import APDFT_perturbator as AP

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Dataset ##

In [3]:
# Open the two .npz archives: one with total energies, one with electronic energies.
# allow_pickle=True lets NumPy unpickle object arrays, so only point this at trusted files.
total_energy_data = np.load(
    '../Data/Benzene_BNdoping_PBE0_pcX2_opt.npz',
    allow_pickle=True,
)
electronic_energy_data = np.load(
    '../Data/Benzene_BNdoping_PBE0_pcX2_electronic_opt.npz',
    allow_pickle=True,
)

# Charges, coordinates and element labels are read from the total-energy archive.
charges = total_energy_data['charges']
coords = total_energy_data['coords']
elements = total_energy_data['elements']

# Each archive supplies its own 'energies' array.
total_energy = total_energy_data['energies']
electronic_energy = electronic_energy_data['energies']


In [6]:
# Report the shape of every loaded array, one per line.
# Shapes recorded from a previous run, in print order:
#   charges (17, 12), coords (17, 12, 3), elements (17, 12),
#   total_energy (17,), electronic_energy (17,)
print(
    charges.shape,
    coords.shape,
    elements.shape,
    total_energy.shape,
    electronic_energy.shape,
    sep="\n",
)

(17, 12)
(17, 12, 3)
(17, 12)
(17,)
(17,)


In [9]:
# Collect the loaded arrays into one dataframe with a row per molecule.
# The per-molecule charge and element arrays are stored as Python lists inside single cells.
columns = ['charges', 'elements', 'total energy', 'electronic energy']
benzene_data = pd.DataFrame(
    {
        'charges': charges.tolist(),
        'elements': elements.tolist(),
        'total energy': total_energy.tolist(),
        'electronic energy': electronic_energy.tolist(),
    },
    columns=columns,
)
display(benzene_data)

Unnamed: 0,charges,elements,total energy,electronic energy
0,"[7, 5, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, B, C, C, C, C, H, H, H, H, H, H]",-230.034644,-336.906006
1,"[7, 6, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, B, C, C, C, H, H, H, H, H, H]",-230.040119,-336.995987
2,"[7, 6, 6, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, C, B, C, C, H, H, H, H, H, H]",-230.03286,-337.004116
3,"[7, 7, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, B, C, C, H, H, H, H, H, H]",-233.459886,-340.400298
4,"[7, 7, 5, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, C, B, C, H, H, H, H, H, H]",-233.463021,-340.318992
5,"[7, 7, 5, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, N, B, C, C, B, H, H, H, H, H, H]",-233.453088,-340.193778
6,"[7, 7, 6, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, C, B, B, C, H, H, H, H, H, H]",-233.470031,-340.510269
7,"[7, 5, 7, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, B, N, C, C, B, H, H, H, H, H, H]",-233.464654,-340.067245
8,"[7, 5, 7, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, B, N, C, B, C, H, H, H, H, H, H]",-233.470116,-340.126176
9,"[7, 6, 7, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, C, N, B, B, C, H, H, H, H, H, H]",-233.469772,-340.325617


## ANM Calculation ##

### DFT Calculation ###

In [10]:
# Specify the atomic coordinates of benzene molecule (the reference molecule for ANM calculations)
# Each line gives an element symbol followed by its x, y, z coordinates. The six carbons are
# listed first and the six hydrogens after them, so atoms 0-5 are the carbon sites.

benz_atom="""
C        3.22272669       0.22711285       0.00013582
C        5.87141753       0.22698034       0.00094988
C        7.19597908       2.52071412      -0.00011471
C        5.87164800       4.81458054      -0.00200817
C        3.22295713       4.81471307      -0.00280461
C        1.89839559       2.52097926      -0.00174231
H        2.18773340      -1.56549239       0.00096741
H        6.90623079      -1.56572844       0.00241360
H        9.26591446       2.52061061       0.00051784
H        6.90664130       6.60718579      -0.00284841
H        2.18814386       6.60742187      -0.00426425
H       -0.17153979       2.52108280      -0.00237226
"""

In [14]:
# Per-element basis specification: hydrogen is given by the name "pc-2", while C, N and O
# each take the "pcX-2" definition returned by bse.get_basis in NWChem format.
basis_pcx2 = {"H": "pc-2"}
basis_pcx2.update(
    (symbol, bse.get_basis("pcX-2", fmt="nwchem", elements=[atomic_number]))
    for symbol, atomic_number in (("C", 6), ("N", 7), ("O", 8))
)

In [20]:
# create molecule
# NOTE(review): neighbouring carbons in benz_atom are ~2.65 apart, which matches the benzene
# C-C bond length expressed in Bohr (~1.40 Angstrom), not in Angstrom. Confirm that
# unit='Angstrom' is intended (and consistent with how the reference dataset was generated)
# before relying on these energies.
mol_benz=gto.M(atom=benz_atom, basis=basis_pcx2, unit='Angstrom')

# run DFT calculation
benz_DFT = scf.RKS(mol_benz)
benz_DFT.xc = "PBE0" # specify the exchange-correlation functional used for DFT
benz_DFT.kernel() # run self-consistent field calculation

converged SCF energy = -229.912170593056
Total energy: <bound method energy_tot of RKS object of <class 'pyscf.dft.rks.RKS'>>
Electronic energy: <bound method energy_elec of RKS object of <class 'pyscf.dft.rks.RKS'>>


In [21]:
# Calculate the total and electronic energy

benz_total_energy = benz_DFT.energy_tot()
# energy_elec() returns a 2-tuple, so benz_electronic_energy is a tuple, not a scalar.
benz_electronic_energy = benz_DFT.energy_elec()

print("Total energy:", benz_total_energy)
# Per the PySCF documentation the tuple is (electronic energy, two-electron contribution).
# The second value is NOT the nuclear repulsion: in the run recorded below,
# total - electronic = 107.07, whereas the second tuple entry is 181.98.
print("Electronic energy:", benz_electronic_energy) # (electronic energy, two-electron contribution)

# Total energy: -229.91217059305342
# Electronic energy: (-336.9833214880193, 181.97988746354838)

Total energy: -229.91217059305342
Electronic energy: (-336.9833214880193, 181.97988746354838)


In [24]:
# Sanity check: show which solver class scf.RKS(...) produced
print(type(benz_DFT))

<class 'pyscf.dft.rks.RKS'>


### Hessian and ANM ###

In [25]:
# Load the Hessian

def get_hessian(DFT, sites=(0, 1, 2, 3, 4, 5), cache_path='hessian_PBE0.txt'):
    """
    Return the Hessian of the energy with respect to the nuclear charges.

    If ``cache_path`` exists, the matrix is read from that file and ``DFT`` and
    ``sites`` are not used at all. Otherwise the matrix is built with
    ``APDFT_perturbator`` for the requested sites. A freshly built matrix is
    NOT written back to ``cache_path``.

    Args:
        DFT (pyscf.dft.rks.RKS object): the DFT RKS object of the molecule in question
        sites (sequence of int): indices of the atoms whose nuclear charges are
            perturbed. Defaults to the first six atoms.
        cache_path (str): path of a text file holding a previously computed Hessian.
            Defaults to 'hessian_PBE0.txt' in the current working directory.

    Returns:
        H (ndarray): The hessian matrix of the molecule
    """

    # A cached file is trusted as-is: nothing checks that it was produced for the same
    # molecule, functional or site list as the arguments passed in.
    if os.path.isfile(cache_path):
        return np.loadtxt(cache_path)

    perturbator = AP(DFT, sites=list(sites))
    return perturbator.build_hessian()

In [26]:
# Get the Hessian
# get_hessian reads 'hessian_PBE0.txt' when that file exists and only otherwise builds the
# matrix from benz_DFT, so the printed values may come from the cache file.

H = get_hessian(benz_DFT)
print(H)


[[-3.37583818  0.16302604  0.15061107  0.13936407  0.16491505  0.14365052]
 [ 0.16302604 -3.37583589  0.14365083  0.16491407  0.13936633  0.15060779]
 [ 0.15061107  0.14365083 -3.40105126  0.14365107  0.1506074   0.19166145]
 [ 0.13936407  0.16491407  0.14365107 -3.37583742  0.16302608  0.15061038]
 [ 0.16491505  0.13936633  0.1506074   0.16302608 -3.37583547  0.14365075]
 [ 0.14365052  0.15060779  0.19166145  0.15061038  0.14365075 -3.40104752]]


In [27]:
# Compute the diagonalization of H: epsilon holds the eigenvalues and the columns of Q
# hold the corresponding eigenvectors.
# NOTE(review): the printed H looks symmetric; if that is guaranteed, np.linalg.eigh would be
# the usual choice, but it may order the eigenpairs differently - confirm before switching.

epsilon, Q = np.linalg.eig(H)
# Inverse of the eigenvector matrix; later applied as Q_inv @ dx to express dx in the eigenbasis
Q_inv = np.linalg.inv(Q)
print(Q)
print(Q_inv)
print(epsilon)

[[-4.09259973e-01 -6.06256052e-02  5.00021152e-01  2.87226831e-01
   4.99965083e-01  4.96331866e-01]
 [-4.09260235e-01  6.05875481e-02 -4.99980480e-01  2.87228132e-01
   5.00047997e-01 -4.96292987e-01]
 [-4.06216373e-01  7.01905379e-01 -1.40384008e-05 -5.78764360e-01
   1.12864525e-06  8.57257964e-02]
 [-4.09259834e-01  6.05964964e-02  5.00007350e-01  2.87201859e-01
  -4.99964170e-01 -4.96364809e-01]
 [-4.09260672e-01 -6.05777450e-02 -4.99991015e-01  2.87298223e-01
  -5.00022744e-01  4.96268084e-01]
 [-4.06217532e-01 -7.01883841e-01 -4.20414255e-05 -5.78798226e-01
  -2.71244329e-05 -8.56679749e-02]]
[[-4.09259973e-01 -4.09260235e-01 -4.06216373e-01 -4.09259834e-01
  -4.09260672e-01 -4.06217532e-01]
 [-6.06256052e-02  6.05875481e-02  7.01905379e-01  6.05964964e-02
  -6.05777450e-02 -7.01883841e-01]
 [ 5.00021152e-01 -4.99980480e-01 -1.40384008e-05  5.00007350e-01
  -4.99991015e-01 -4.20414255e-05]
 [ 2.87226831e-01  2.87228132e-01 -5.78764360e-01  2.87201859e-01
   2.87298223e-01 -5.787

## Data Transformation ##

### Feature Transformation ###

In [108]:
def lexi_transformation(arr):
    """
    Canonicalize a cyclic array so that rotated or mirrored copies give the same vector.

    Every rotation of the array and the reversed form of each rotation is generated,
    and the lexicographically smallest of these candidates (compared left to right)
    is returned as the representative.

    Args:
        arr (ndarray): a numpy array to be transformed

    Return:
        transformed_arr (ndarray): transformed array
    """

    # One row per cyclic shift 0 .. len(arr) - 1
    rotations = np.vstack([np.roll(arr, shift=offset) for offset in range(len(arr))])

    # Reading each rotation right-to-left gives the reflected traversals
    mirrored = rotations[:, ::-1]

    candidates = np.concatenate((rotations, mirrored), axis=0)

    # np.lexsort uses its LAST key as the primary one, so the column order is reversed
    # to make the first element the most significant when ranking the rows.
    ranking = np.lexsort(candidates.T[::-1])
    return candidates[ranking[0]]

In [109]:
def lexi_transformation_2d(arr):
    """
    Apply lexi_transformation to each row of a 2-D array.

    Args:
        arr (ndarray): array whose rows are transformed independently

    Return:
        ndarray: the transformed rows, stacked in their original order
    """
    return np.array([lexi_transformation(row) for row in arr])

In [110]:
def lexi_and_opposite_transformation(arr):
    """
    Canonicalize a cyclic array up to rotation, reflection and overall sign.

    Every rotation of the array, the reversed form of each rotation, and the negation
    of all of those are generated. The lexicographically smallest candidate
    (compared left to right) is returned as the representative.

    Args:
        arr (ndarray): a numpy array to be transformed

    Return:
        transformed_arr (ndarray): transformed array
    """

    # One row per cyclic shift 0 .. len(arr) - 1
    rotations = np.vstack([np.roll(arr, shift=offset) for offset in range(len(arr))])

    # Reading each rotation right-to-left gives the reflected traversals
    mirrored = rotations[:, ::-1]
    symmetric_images = np.concatenate((rotations, mirrored), axis=0)

    # Add the sign-flipped copy of every rotation/reflection
    candidates = np.concatenate((symmetric_images, -symmetric_images), axis=0)

    # np.lexsort uses its LAST key as the primary one, so the column order is reversed
    # to make the first element the most significant when ranking the rows.
    ranking = np.lexsort(candidates.T[::-1])
    return candidates[ranking[0]]

In [111]:
def lexi_and_opposite_transformation_2d(arr):
    """
    Apply lexi_and_opposite_transformation to each row of a 2-D array.

    Args:
        arr (ndarray): array whose rows are transformed independently

    Return:
        ndarray: the transformed rows, stacked in their original order
    """
    return np.array([lexi_and_opposite_transformation(row) for row in arr])

In [107]:
# Small demonstration of how np.lexsort is used to pick the lexicographically smallest row.
# np.lexsort treats its LAST key as the primary sort key, so the columns (mylist.T) are
# reversed to make column 0 the most significant one.
mylist = np.array([[0, 1, 0, 0, 0, -1], 
                   [0, 0, 0, -1, 0, 1],
                   [1, 0, 0, 0, -1, 0]])

index = np.lexsort(mylist.T[::-1])
print(mylist[index[0]])

[ 0  0  0 -1  0  1]


In [112]:
# Try lexi_transformation on three sample vectors.
# arr2 is the element-wise negation of arr1; lexi_transformation does not treat negated
# vectors as equivalent, so the two may map to different representatives.
arr1 = np.array([1, -1, -1, 1, 0, 0])
arr2 = np.array([-1, 1, 1, -1, 0, 0])
arr3 = np.array([1, 0, -1, 0, 0, 0])
transformed_arr1 = lexi_transformation(arr1)
transformed_arr2 = lexi_transformation(arr2)
transformed_arr3 = lexi_transformation(arr3)
print(transformed_arr1)
print(transformed_arr2)
print(transformed_arr3)

[-1 -1  1  0  0  1]
[-1  0  0 -1  1  1]
[-1  0  0  0  1  0]


### Calculating dx and c ###

In [113]:
# Compute the dx value as the difference between target charge and reference charge
ref_charge = [6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]
target_charge_array = np.array(benzene_data['charges'].tolist())
# Repeat the reference once per molecule in the dataframe rather than hard-coding the row count
ref_charge_array = np.tile(ref_charge, (len(target_charge_array), 1))
# The first six entries of ref_charge are the carbon atoms
dx_array = (target_charge_array - ref_charge_array)[:, :6] # only take the dx for C atoms

# try different ways to sort the dx vector
sorted_dx_array = np.sort(dx_array, axis=1) # sorted dx elements
lexi_dx_array = lexi_transformation_2d(dx_array) # rotational and reflectional invariant
lexi_opp_dx_array = lexi_and_opposite_transformation_2d(dx_array) # rotational, reflectional, and negation invariant

print(sorted_dx_array.shape)
print(lexi_dx_array.shape)
print(lexi_opp_dx_array.shape)

# Compute the c array, which represents the ANM coordinates
# Each row of a dx array is multiplied by Q_inv; the transposes keep one molecule per row
sorted_c_array = (Q_inv @ sorted_dx_array.T).T
lexi_c_array = (Q_inv @ lexi_dx_array.T).T
lexi_opp_c_array = (Q_inv @ lexi_opp_dx_array.T).T

# Append the data onto the dataframe (assigning by column name keeps this cell safe to re-run)
benzene_data['sorted_dx'] = sorted_dx_array.tolist()
benzene_data['lexi_dx'] = lexi_dx_array.tolist()
benzene_data['lexi_opp_dx'] = lexi_opp_dx_array.tolist()
benzene_data['sorted_c'] = sorted_c_array.tolist()
benzene_data['lexi_c'] = lexi_c_array.tolist()
benzene_data['lexi_opp_c'] = lexi_opp_c_array.tolist()

display(benzene_data.head())

(17, 6)
(17, 6)
(17, 6)


Unnamed: 0,charges,elements,total energy,electronic energy,sorted_dx,lexi_dx,lexi_opp_dx,sorted_c,lexi_c,lexi_opp_c
0,"[7, 5, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, B, C, C, C, C, H, H, H, H, H, H]",-230.034644,-336.906006,"[-1, 0, 0, 0, 0, 1]","[-1, 0, 0, 0, 0, 1]","[-1, 0, 0, 0, 0, 1]","[0.003042440870520857, -0.6412582352904221, -0...","[0.003042440870520857, -0.6412582352904221, -0...","[0.003042440870520857, -0.6412582352904221, -0..."
1,"[7, 6, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, B, C, C, C, H, H, H, H, H, H]",-230.040119,-336.995987,"[-1, 0, 0, 0, 0, 1]","[-1, 0, 0, 0, 1, 0]","[-1, 0, 0, 0, 1, 0]","[0.003042440870520857, -0.6412582352904221, -0...","[-6.987121025092691e-07, 4.78602190818328e-05,...","[-6.987121025092691e-07, 4.78602190818328e-05,..."
2,"[7, 6, 6, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, C, B, C, C, H, H, H, H, H, H]",-230.03286,-337.004116,"[-1, 0, 0, 0, 0, 1]","[-1, 0, 0, 1, 0, 0]","[-1, 0, 0, 1, 0, 0]","[0.003042440870520857, -0.6412582352904221, -0...","[1.3856579200721697e-07, 0.12122210162011998, ...","[1.3856579200721697e-07, 0.12122210162011998, ..."
3,"[7, 7, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, B, C, C, H, H, H, H, H, H]",-233.459886,-340.400298,"[-1, -1, 0, 0, 1, 1]","[-1, -1, 0, 0, 1, 1]","[-1, -1, 0, 0, 1, 1]","[0.0030420040635958934, -0.7624235284472971, -...","[0.0030420040635958934, -0.7624235284472971, -...","[0.0030420040635958934, -0.7624235284472971, -..."
4,"[7, 7, 5, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, C, B, C, H, H, H, H, H, H]",-233.463021,-340.318992,"[-1, -1, 0, 0, 1, 1]","[-1, 0, -1, 0, 1, 1]","[-1, -1, 0, 1, 0, 1]","[0.0030420040635958934, -0.7624235284472971, -...","[-1.8574247866443017e-06, -1.4037413591460208,...","[0.00304284134149041, -0.6412492870462589, 0.4..."


## Regression ##

In [114]:
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import warnings

### Prepare training data ###

In [118]:
# Factor applied to every coordinate before it is used as a regression feature
COORD_SCALE = 100

# Extract the coordinates as six separate features
columns = [f"coord{i}" for i in range(6)]


def _coords_to_features(coord_column):
    """
    Expand one benzene_data column of per-molecule coordinate lists into a feature table.

    Args:
        coord_column (str): name of a benzene_data column whose entries are coordinate lists

    Returns:
        pandas.DataFrame: one row per molecule and one scaled column per name in `columns`
    """
    coords_matrix = np.array(benzene_data[coord_column].tolist())
    return pd.DataFrame(
        coords_matrix[:, :len(columns)] * COORD_SCALE,
        columns=columns,
        index=benzene_data.index,
    )


X_sorted = _coords_to_features('sorted_c')
X_lexi = _coords_to_features('lexi_c')
X_lexi_opp = _coords_to_features('lexi_opp_c')

# Extract the targets

y_energy = benzene_data['total energy']
y_elec = benzene_data['electronic energy']
    

### Model building ###

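Model names are built from three codes, e.g. `KRR_PLoT` = polynomial kernel, lexi_opp_c features, total energy:

P/G - kernel: polynomial / Gaussian (RBF)

S/L/Lo - feature set used: sorted_c / lexi_c / lexi_opp_c

T/E - prediction target: total energy / electronic energy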

### Polynomial KRR ###

#### KRR: polynomial, lexi_opp_c, total energy ####

In [121]:
# Polynomial-kernel ridge regression on the lexi_opp_c features, predicting total energy
params = {'alpha': 1e-06, 'coef0': 10, 'degree': 4, 'kernel': 'poly'}
KRR_PLoT = KernelRidge(**params)

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Every warning raised while the folds are fitted and scored is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    mse_scores = cross_val_score(KRR_PLoT, X_lexi_opp, y_energy, scoring='neg_mean_squared_error', cv=k_fold)
    # The scorer yields negated MSE values; flip the sign and take the root to get RMSE per fold
    rmse_scores = np.sqrt(-mse_scores)

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the root mean squared error for each fold
print("Polynomial KRR:")
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average root mean squared error (the value is an RMSE, so label it as such)
print(f"Average RMSE across all folds: {avg_rmse}")

Polynomial KRR:
Fold 1: RMSE = 243.46004123242983
Fold 2: RMSE = 58.24701372079402
Fold 3: RMSE = 113.2140293124711
Fold 4: RMSE = 95.22485869356323
Fold 5: RMSE = 93.41278426201792
Average MSE across all folds: 120.71174544425523


### Gaussian KRR ###

#### KRR: Gaussian, lexi_opp_c, total energy ####

In [122]:
# Gaussian (RBF) kernel ridge regression on the lexi_opp_c features, predicting total energy
params = {'alpha': 5e-05, 'gamma': 1.6e-07, 'kernel': 'rbf'}
KRR_GLoT = KernelRidge(**params)

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = cross_val_score(KRR_GLoT, X_lexi_opp, y_energy, scoring='neg_mean_squared_error', cv=k_fold)
# The scorer yields negated MSE values; flip the sign and take the root to get RMSE per fold
rmse_scores = np.sqrt(-mse_scores)

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the root mean squared error for each fold
print("Gaussian KRR:")
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average root mean squared error (the value is an RMSE, so label it as such)
print(f"Average RMSE across all folds: {avg_rmse}")

Polynomial KRR:
Fold 1: RMSE = 3.1633316474058017
Fold 2: RMSE = 2.8753259771718223
Fold 3: RMSE = 3.731471384556762
Fold 4: RMSE = 0.9744193165601209
Fold 5: RMSE = 1.3587346428398883
Average MSE across all folds: 2.4206565937068794


### Ridge Regression (non kernelized) ###

In [None]:
# Plain (non-kernelized) ridge regression baseline.
# Uses the lexi_opp_c features and the total-energy target, matching the kernel ridge cells
# in this notebook; swap in X_sorted / X_lexi or y_elec to compare other combinations.
ridge_regression = Ridge(alpha=0.01)

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = cross_val_score(ridge_regression, X_lexi_opp, y_energy, scoring='neg_mean_squared_error', cv=k_fold)
mse_scores = -mse_scores  # Convert negative MSE scores to positive

# Calculate the average error across all folds
avg_mse = mse_scores.mean()

# Print the mean squared error for each fold
for fold, mse in enumerate(mse_scores):
    print(f"Fold {fold+1}: MSE = {mse}")

# Print the average mean squared error
print(f"Average MSE across all folds: {avg_mse}")

## Hyperparameter Tuning ##

### Polynomial KRR ###

In [None]:
# # Hyperparam tuning
# # Grid search round 1

# param_grid = {
#     'alpha': np.logspace(np.log10(1e-5), np.log10(10), num=50),
#     'kernel': ['poly'],  
#     'degree': [2, 3, 4], 
#     'coef0': np.logspace(np.log10(1e-5), np.log10(100), num=50), 
# }

# poly_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)

# with warnings.catch_warnings():
#     warnings.filterwarnings("ignore")
#     grid_search = GridSearchCV(poly_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
#     grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Hyperparam tuning
# # Grid search round 2

# param_grid = {
#     'alpha': np.logspace(np.log10(1e-7), np.log10(1e-5), num=20),
#     'kernel': ['poly'],  
#     'degree': [1, 2, 3], 
#     'coef0': np.logspace(np.log10(100), np.log10(1000), num=20), 
# }

# poly_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)

# with warnings.catch_warnings():
#     warnings.filterwarnings("ignore")
#     grid_search = GridSearchCV(poly_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
#     grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Hyperparam tuning
# # Grid search round 3

# param_grid = {
#     'alpha': np.logspace(np.log10(1e-9), np.log10(1e-6), num=30),
#     'kernel': ['poly'],  
#     'degree': [1, 2, 3], 
#     'coef0': np.linspace(300, 400, num=21), 
# }

# poly_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)

# with warnings.catch_warnings():
#     warnings.filterwarnings("ignore")
#     grid_search = GridSearchCV(poly_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
#     grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

### Gaussian KRR ###

In [125]:
# small alpha, small gamma
# Note that 10e-7 == 1e-6 and 10e-3 == 1e-2, so both alpha and gamma are searched over
# 50 log-spaced values in [1e-6, 1e-2].

param_grid = {
    'alpha': np.logspace(np.log10(10e-7), np.log10(10e-3), num=50),  # Regularization parameter controlling the L2 regularization term
    'gamma': np.logspace(np.log10(10e-7), np.log10(10e-3), num=50),  # Parameter for the Gaussian kernel, controlling the width of the kernel
    'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
}

KRR_GLoT = KernelRidge()

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# Every warning raised during the grid-search fits is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search = GridSearchCV(KRR_GLoT, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
    grid_search.fit(X_lexi_opp, y_energy)

best_params = grid_search.best_params_
# The scoring is the negated MSE, so flip the sign to report a positive MSE
best_score = -grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 7.906043210907702e-06, 'gamma': 1e-06, 'kernel': 'rbf'}
Best Mean Squared Error: 6.886800637717728


In [126]:
# small alpha, large gamma
# Note that 10e-7 == 1e-6 and 10e-3 == 1e-2, so alpha is searched over 50 log-spaced values
# in [1e-6, 1e-2] and gamma over 50 log-spaced values in [1e-2, 1].

param_grid = {
    'alpha': np.logspace(np.log10(10e-7), np.log10(10e-3), num=50),  # Regularization parameter controlling the L2 regularization term
    'gamma': np.logspace(np.log10(10e-3), np.log10(1), num=50),  # Parameter for the Gaussian kernel, controlling the width of the kernel
    'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
}

KRR_GLoT = KernelRidge()

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# Every warning raised during the grid-search fits is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search = GridSearchCV(KRR_GLoT, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
    grid_search.fit(X_lexi_opp, y_energy)

best_params = grid_search.best_params_
# The scoring is the negated MSE, so flip the sign to report a positive MSE
best_score = -grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 1e-06, 'gamma': 0.01, 'kernel': 'rbf'}
Best Mean Squared Error: 41789.54380108347


In [127]:
# large alpha, small gamma
# Note that 10e-7 == 1e-6 and 10e-3 == 1e-2, so alpha is searched over 50 log-spaced values
# in [1e-2, 1] and gamma over 50 log-spaced values in [1e-6, 1e-2].

param_grid = {
    'alpha': np.logspace(np.log10(10e-3), np.log10(1), num=50),  # Regularization parameter controlling the L2 regularization term
    'gamma': np.logspace(np.log10(10e-7), np.log10(10e-3), num=50),  # Parameter for the Gaussian kernel, controlling the width of the kernel
    'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
}

KRR_GLoT = KernelRidge()

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# Every warning raised during the grid-search fits is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search = GridSearchCV(KRR_GLoT, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
    grid_search.fit(X_lexi_opp, y_energy)

best_params = grid_search.best_params_
# The scoring is the negated MSE, so flip the sign to report a positive MSE
best_score = -grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 0.01, 'gamma': 1e-06, 'kernel': 'rbf'}
Best Mean Squared Error: 29.514712738253742


In [128]:
# large alpha, large gamma
# Note that 10e-3 == 1e-2, so both alpha and gamma are searched over 50 log-spaced values
# in [1e-2, 1].

param_grid = {
    'alpha': np.logspace(np.log10(10e-3), np.log10(1), num=50),  # Regularization parameter controlling the L2 regularization term
    'gamma': np.logspace(np.log10(10e-3), np.log10(1), num=50),  # Parameter for the Gaussian kernel, controlling the width of the kernel
    'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
}

KRR_GLoT = KernelRidge()

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# Every warning raised during the grid-search fits is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search = GridSearchCV(KRR_GLoT, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
    grid_search.fit(X_lexi_opp, y_energy)

best_params = grid_search.best_params_
# The scoring is the negated MSE, so flip the sign to report a positive MSE
best_score = -grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
Best Mean Squared Error: 41790.79139470468


In [134]:
# Fine linear grid: 10e-7 == 1e-6 and 10e-5 == 1e-4, so alpha and gamma each take
# 20 evenly spaced values in [1e-6, 1e-4].
param_grid = {
    'alpha': np.linspace(10e-7, 10e-5, num=20),  # Regularization parameter controlling the L2 regularization term
    'gamma': np.linspace(10e-7, 10e-5, num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
    'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
}

# NOTE(review): the estimator is named KRR_GLoT ("Lo" = lexi_opp_c in the naming legend) but
# is fitted on X_lexi below - confirm which feature set is intended.
KRR_GLoT = KernelRidge()

# One fold per sample (leave-one-out); derived from the data so it follows the dataset size
k_fold = KFold(n_splits=len(X_lexi), shuffle=True, random_state=10)
# Every warning raised during the grid-search fits is silenced
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search = GridSearchCV(KRR_GLoT, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
    grid_search.fit(X_lexi, y_energy)

best_params = grid_search.best_params_
# The scoring is the negated MSE, so flip the sign to report a positive MSE
best_score = -grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Hyperparameters: {'alpha': 1e-06, 'gamma': 1e-06, 'kernel': 'rbf'}
Best Mean Squared Error: 5.301730705166797


In [None]:
# # Grid Search round 2

# param_grid = {
#     'alpha': np.logspace(np.log10(0.001), np.log10(0.1), num=10),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.logspace(np.log10(0.0001), np.log10(0.01), num=10),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 3

# param_grid = {
#     'alpha': np.linspace(0.01, 0.02, num = 20),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.logspace(np.log10(0.00001), np.log10(0.0001), num=10),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 4

# param_grid = {
#     'alpha': np.linspace(0.005, 0.01, num = 21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.logspace(np.log10(1e-06), np.log10(1e-05), num=10),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 5

# param_grid = {
#     'alpha': np.linspace(0.001, 0.005, num=21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.logspace(np.log10(3e-06), np.log10(6e-05), num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 6

# param_grid = {
#     'alpha': np.linspace(0.0005, 0.001, num=21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.linspace(2e-06, 4e-06, num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 7

# param_grid = {
#     'alpha': np.linspace(5e-4, 7e-4, num=21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.linspace(1e-06, 2e-06, num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 8

# param_grid = {
#     'alpha': np.linspace(3e-4, 5e-4, num=21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.linspace(1e-06, 2e-06, num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 9

# param_grid = {
#     'alpha': np.linspace(1e-4, 3e-4, num=21),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.linspace(5e-07, 1e-06, num=20),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

In [None]:
# # Grid Search round 10

# param_grid = {
#     'alpha': np.linspace(5e-5, 1e-4, num=11),  # Regularization parameter controlling the L2 regularization term
#     'gamma': np.linspace(1e-07, 5e-07, num=21),  # Parameter for the Gaussian kernel, controlling the width of the kernel
#     'kernel': ['rbf'],  # Specifies the kernel function to be used, in this case, the Gaussian (RBF) kernel
# }


# gaussian_KRR = KernelRidge()

# k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
# grid_search = GridSearchCV(gaussian_KRR, param_grid, scoring='neg_mean_squared_error', cv=k_fold)
# grid_search.fit(X, y)

# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# # Print the best hyperparameters and score
# print("Best Hyperparameters:", best_params)
# print("Best Mean Squared Error:", best_score)

### Ridge Regression ###

In [None]:
# # Create the Ridge regression model
# # Round 1

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': [0.1, 0.5, 1.0, 2.0],  # Regularization strength
#     'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 2

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(1.5, 2.5, num=10),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 3

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(2.5, 3.5, num=10),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 4

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(3, 10, num=71, endpoint=True),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 5

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(10, 20, num=101, endpoint=True),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 6

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(20, 100, num=41, endpoint=True),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 7

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(100, 1000, num=91, endpoint=True),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

In [None]:
# # Create the Ridge regression model
# # Round 8

# ridge_model = Ridge()

# # Define the hyperparameters to tune and their respective values
# param_grid = {
#     'alpha': np.linspace(210, 230, num=21, endpoint=True),  # Regularization strength
#     'solver': ['auto'],  # Solver algorithm
# }

# # Perform grid search using cross-validation
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X, y)

# # Retrieve the best hyperparameters and the corresponding mean squared error
# best_params = grid_search.best_params_
# best_mse = -grid_search.best_score_

# # Print the best hyperparameters and the corresponding mean squared error
# print("Best Hyperparameters:")
# print(best_params)
# print("Best Mean Squared Error:", best_mse)

## Model Prediction and Learning Curve ##

In [None]:
def evaluate_performance(model, X, y, num_training_sample, num_trials):
    """
    Estimate the validation error of `model` when it is trained on a fixed number of samples.

    For each trial, exactly `num_training_sample` randomly selected rows are used to fit
    the model, which is then evaluated on all of the remaining rows. Trial `i` is split
    with `random_state=i`, so repeated calls produce the same splits.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is refit in place on every trial
        X: input data with N rows (one per sample); must expose `.shape[0]`
        y: target data with one entry per row of X
        num_training_sample (int): the number of samples used for training; must satisfy
            0 < num_training_sample < N so that at least one sample is left for validation
        num_trials (int): the number of random train/validation splits to average over

    Returns:
        average_error: the average RMSE across all trials
        std_dev_error: standard deviation (np.std default, i.e. ddof=0) of the RMSE across all trials

    Raises:
        ValueError: if `num_training_sample` leaves no sample for training or for validation
    """

    n_samples = X.shape[0]
    n_train = int(num_training_sample)
    if not 0 < n_train < n_samples:
        raise ValueError(
            f"num_training_sample must be between 1 and {n_samples - 1}, got {num_training_sample}"
        )

    errors = []
    for i in range(num_trials):
        # An integer train_size requests an exact sample count. The previous fractional
        # test_size (1.0 - k/N) was exposed to floating-point round-off when it was
        # converted back to a number of samples.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, train_size=n_train, shuffle=True, random_state=i
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        error = np.sqrt(mean_squared_error(y_val, y_pred))  # Root mean squared error
        errors.append(error)

    average_error = np.mean(errors)
    std_dev_error = np.std(errors)
    return average_error, std_dev_error

### Polynomial KRR ###

In [None]:
# Polynomial kernel: learning curve of kernel ridge regression with a degree-2 polynomial kernel

# Hyperparameters for the model (presumably the result of an earlier grid search; verify)
best_params_poly_KRR = {'alpha': 1e-06, 'coef0': 345.0, 'degree': 2, 'kernel': 'poly'}
poly_KRR_model = KernelRidge(**best_params_poly_KRR)

columns = ['training size', 'average RMSE', 'standard deviation']

training_size = list(range(1, 17))
num_trials = 20

# Collect one record per training size and build the table in a single step,
# so the columns get numeric dtypes instead of the object dtype produced by
# filling an empty frame cell by cell.
records = []
for num_training_sample in training_size:
    average_error, std_dev_error = evaluate_performance(poly_KRR_model, X, y, num_training_sample, num_trials)
    records.append({
        'training size': num_training_sample,
        'average RMSE': average_error,
        # NOTE: despite the column name, this is the standard error of the mean
        # (std / sqrt(num_trials)); the name is kept because the plotting cells read it.
        'standard deviation': std_dev_error/np.sqrt(num_trials),
    })

model_performance_poly_KRR = pd.DataFrame(records, columns=columns)

display(model_performance_poly_KRR)

### Gaussian KRR ###

In [None]:
# Gaussian (RBF) kernel: learning curve of kernel ridge regression

# Hyperparameters for the model (presumably taken from the grid-search rounds above; verify)
best_params_gaussian_KRR = {'alpha': 5e-05, 'gamma': 1.6e-07, 'kernel': 'rbf'}
gaussian_KRR_model = KernelRidge(**best_params_gaussian_KRR)

columns = ['training size', 'average RMSE', 'standard deviation']

training_size = list(range(1, 17))
num_trials = 20

# Collect one record per training size and build the table in a single step,
# so the columns get numeric dtypes instead of the object dtype produced by
# filling an empty frame cell by cell.
records = []
for num_training_sample in training_size:
    average_error, std_dev_error = evaluate_performance(gaussian_KRR_model, X, y, num_training_sample, num_trials)
    records.append({
        'training size': num_training_sample,
        'average RMSE': average_error,
        # NOTE: despite the column name, this is the standard error of the mean
        # (std / sqrt(num_trials)); the name is kept because the plotting cells read it.
        'standard deviation': std_dev_error/np.sqrt(num_trials),
    })

model_performance_gaussian_KRR = pd.DataFrame(records, columns=columns)

display(model_performance_gaussian_KRR)

### Ridge Regression ###

In [None]:
# Ridge regression: learning curve of the linear model

# Hyperparameters for the model (presumably taken from the grid-search rounds above; verify)
best_params_ridge_regression = {'alpha': 219, 'solver': 'auto'}

ridge_model = Ridge(**best_params_ridge_regression)

columns = ['training size', 'average RMSE', 'standard deviation']

training_size = list(range(1, 17))
num_trials = 20

# Collect one record per training size and build the table in a single step,
# so the columns get numeric dtypes instead of the object dtype produced by
# filling an empty frame cell by cell.
records = []
for num_training_sample in training_size:
    average_error, std_dev_error = evaluate_performance(ridge_model, X, y, num_training_sample, num_trials)
    records.append({
        'training size': num_training_sample,
        'average RMSE': average_error,
        # NOTE: despite the column name, this is the standard error of the mean
        # (std / sqrt(num_trials)); the name is kept because the plotting cells read it.
        'standard deviation': std_dev_error/np.sqrt(num_trials),
    })

model_performance_ridge_regression = pd.DataFrame(records, columns=columns)

display(model_performance_ridge_regression)

### Graphing ###

In [None]:
# # Graph with error bar

# graph_x = model_performance['training size']
# graph_y = model_performance['average RMSE']
# graph_error = model_performance['standard deviation']

# # Set figure size
# plt.figure(figsize=(10, 6))

# # Create line plot with error bars
# plt.errorbar(graph_x, graph_y, yerr=graph_error, marker='o', linestyle='-', capsize=4)

# # Set axis labels and title
# plt.xlabel('Training Size')
# plt.ylabel('Average RMSE')
# plt.title('Learning curve for BN-doped benzene molecule energy prediction using polynomial KRR')

# plt.xscale('log')
# plt.yscale('log')


# # Save the figure as a PNG image
# plt.savefig('[Benz] learning_curve_16_points.png', dpi=300)
# plt.show()

In [None]:
# Learning curves of the three models, drawn without error bars
plt.figure(figsize=(10, 6))

# Training sizes (x-axis for every curve) and the average RMSE of each model
x = model_performance_poly_KRR['training size']
y1 = model_performance_poly_KRR['average RMSE']
y2 = model_performance_gaussian_KRR['average RMSE']
y3 = model_performance_ridge_regression['average RMSE']

# One line per model, all drawn with the same marker and line styling
line_style = dict(marker='o', linestyle='-', linewidth=2.5)
for series, label in ((y1, 'Polynomial KRR'), (y2, 'Gaussian KRR'), (y3, 'Ridge Regression')):
    plt.plot(x, series, label=label, **line_style)

# Both axes use a base-2 logarithmic scale
plt.xscale('log', base=2)
plt.yscale('log', base=2)

# Put y ticks at the powers of two from 2**0 to 2**6, labelled with their plain values
yticks = [2**exponent for exponent in range(7)]
plt.yticks(yticks, labels = yticks)

# Title, axis labels and legend
plt.title('Learning curve for BN-Doped Benzene molecule energy prediction')
plt.xlabel('Training Size (log)')
plt.ylabel('Average RMSE [Ha] (log)')
plt.legend()

# Save the figure to disk, then display it
plt.savefig('../Graph/[Benz] learning_curve_16_points_no_err_bar.png', dpi=300)
plt.show()


In [None]:
# Learning curves of the three models, drawn with error bars
plt.figure(figsize=(10, 6))

# Training sizes (x-axis for every curve)
x = model_performance_poly_KRR['training size']

# Average RMSE of each model
y1 = model_performance_poly_KRR['average RMSE']
y2 = model_performance_gaussian_KRR['average RMSE']
y3 = model_performance_ridge_regression['average RMSE']

# Error-bar half-widths, read from each table's 'standard deviation' column
y1_error = model_performance_poly_KRR['standard deviation']
y2_error = model_performance_gaussian_KRR['standard deviation']
y3_error = model_performance_ridge_regression['standard deviation']

# One error-bar curve per model, all drawn with the same styling
curves = (
    ('Polynomial KRR', y1, y1_error),
    ('Gaussian KRR', y2, y2_error),
    ('Ridge Regression', y3, y3_error),
)
for label, series, series_error in curves:
    plt.errorbar(x, series, label=label, yerr=series_error, marker='o', linestyle='-', capsize=3)

# Both axes use a base-2 logarithmic scale
plt.xscale('log', base=2)
plt.yscale('log', base=2)

# Put y ticks at the powers of two from 2**0 to 2**6, labelled with their plain values
yticks = [2**exponent for exponent in range(7)]
plt.yticks(yticks, labels = yticks)

# Title, axis labels and legend
plt.title('Learning curve for BN-Doped Benzene molecule energy prediction')
plt.xlabel('Training Size (log)')
plt.ylabel('Average RMSE [Ha] (log)')
plt.legend()

# Save the figure to disk, then display it
plt.savefig('../Graph/[Benz] learning_curve_16_points_with_err_bar.png', dpi=300)
plt.show()


In [None]:
# Plot a version without polynomial KRR, which performed the worst

plt.figure(figsize=(10, 6))

# Only the two plotted models are read here, so this cell no longer depends on
# the polynomial KRR results. Every learning-curve table is built from the same
# training sizes, so the x values are unchanged.
x = model_performance_gaussian_KRR['training size']
y2 = model_performance_gaussian_KRR['average RMSE']
y3 = model_performance_ridge_regression['average RMSE']
y2_error = model_performance_gaussian_KRR['standard deviation']
y3_error = model_performance_ridge_regression['standard deviation']

plt.errorbar(x, y2, label='Gaussian KRR', yerr=y2_error, marker='o', linestyle='-', capsize=3)
plt.errorbar(x, y3, label='Ridge Regression', yerr=y3_error, marker='o', linestyle='-', capsize=3)

# Customize the plot
plt.title('Learning curve for BN-Doped Benzene molecule energy prediction')
plt.xlabel('Training Size (log)')
plt.ylabel('Average RMSE [Ha] (log)')
plt.legend()

# Create log scale
plt.xscale('log', base=2)
plt.yscale('log', base=2)

# Put y ticks at 1, 2 and 4, labelled with their plain values
yticks = [2**i for i in range(0, 3)]
plt.yticks(yticks, labels = yticks)

# Save the figure to disk, then display it
plt.savefig('../Graph/[Benz] learning_curve_without_poly_KRR.png', dpi=300)
plt.show()

### Comparing Predictions to Actual Energies ###

In [None]:
# Create a dataframe that contains the energies predicted by each model and the actual energy

# Specifies the columns of the dataframe and create an empty dataframe
columns = ['Elements', 'Poly KRR prediction', 'Gaussian KRR prediction', 'Ridge regression prediction', 'Actual energy']
model_prediction = pd.DataFrame(columns=columns)

# Fill in elements and the actual energy values from the benzene_data dataframe.
# NOTE(review): the benzene_data frame built at the top of the notebook has
# lowercase 'elements' / 'total energy' / 'electronic energy' columns; this cell
# assumes it has since been rebuilt with 'Elements' and 'Energy' - TODO confirm.
model_prediction['Elements'] = benzene_data['Elements']
model_prediction['Actual energy'] = benzene_data['Energy']

# Fit all three models on the full data set (X, y).
# The predictions below are made on the same rows the models were fitted on,
# so they are in-sample predictions, not validation results.
gaussian_KRR_model.fit(X, y)
ridge_model.fit(X, y)
poly_KRR_model.fit(X, y)

# Select the inputs by index label so that they line up row for row with model_prediction
X_pred = X.loc[model_prediction.index]

# Predict every row in one call per model instead of one row at a time.
# np.ravel flattens the output so a single-target (n, 1) result is stored as
# one value per row.
model_prediction['Poly KRR prediction'] = np.ravel(poly_KRR_model.predict(X_pred))
model_prediction['Gaussian KRR prediction'] = np.ravel(gaussian_KRR_model.predict(X_pred))
model_prediction['Ridge regression prediction'] = np.ravel(ridge_model.predict(X_pred))

# display the data
display(model_prediction)


In [None]:
# Graph the results: predicted vs. actual energy for every row of model_prediction

plt.figure(figsize=(10, 6))

# One x position per row of model_prediction (derived from the table instead of hard-coding 17).
# Dedicated names are used here so that the global `y` passed to the models in the
# cells above is not overwritten when this cell runs.
row_positions = list(range(len(model_prediction)))
y_pred_1 = model_prediction['Gaussian KRR prediction']
y_pred_2 = model_prediction['Ridge regression prediction']
y_pred_3 = model_prediction['Poly KRR prediction']
y_actual = model_prediction['Actual energy']

plt.plot(row_positions, y_pred_1, label='Gaussian KRR prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(row_positions, y_pred_2, label='Ridge regression prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(row_positions, y_pred_3, label='Poly KRR prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(row_positions, y_actual, label='Actual energy', marker='o', linestyle='-', linewidth=2.5)

plt.title('Comparing model prediction with actual energy')
plt.xlabel('Element index')
plt.ylabel('Energy [Ha]')
plt.legend()

# Put a y tick at every integer from -230 down to -240
yticks = np.linspace(-230, -240, num=11)
plt.yticks(yticks, labels = yticks)

# Save the figure to disk, then display it
plt.savefig('../Graph/[Benz] comparing_model_performance.png', dpi=300)
plt.show()
