In [1]:
import sys
sys.path.append('../..')
sys.path.append('../../APDFT')
sys.path.append('../../helper_code')
sys.path.append('../data')

import pickle
from pyscf import gto, scf, dft, cc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import basis_set_exchange as bse
from APDFT.FcMole import *
import os
import ast
from copy import deepcopy
from IPython.display import display
from helper_code.data_processing import *


%load_ext autoreload
%autoreload 2
from APDFT.AP_class import APDFT_perturbator as AP

## Load Dataset ##

In [2]:
# Atomic coordinates of the benzene molecule (the reference molecule for ANM calculations).
# One "symbol x y z" row per atom: the six ring carbons first, then the six hydrogens.
# NOTE(review): neighbouring carbons are ~2.65 apart, which looks like Bohr rather than
# Angstrom — confirm against the unit expected by the helpers that consume this string.

benz_atom="""
C        3.22272669       0.22711285       0.00013582
C        5.87141753       0.22698034       0.00094988
C        7.19597908       2.52071412      -0.00011471
C        5.87164800       4.81458054      -0.00200817
C        3.22295713       4.81471307      -0.00280461
C        1.89839559       2.52097926      -0.00174231
H        2.18773340      -1.56549239       0.00096741
H        6.90623079      -1.56572844       0.00241360
H        9.26591446       2.52061061       0.00051784
H        6.90664130       6.60718579      -0.00284841
H        2.18814386       6.60742187      -0.00426425
H       -0.17153979       2.52108280      -0.00237226
"""

In [3]:
# Per-element basis specification: hydrogen is referenced by name ("pc-2"), while C, N and O
# carry the pcX-2 definition fetched from Basis Set Exchange in NWChem format.
basis_pcx2 = {
    "H": "pc-2",
    **{
        symbol: bse.get_basis("pcX-2", fmt="nwchem", elements=[atomic_number])
        for symbol, atomic_number in (('C', 6), ('N', 7), ('O', 8))
    },
}

In [7]:
# Locations of the processed CSV and of the two raw energy archives (total / electronic).
dest_csv_path = "../data/benzene_processed_data/benzene_energy_data.csv"
raw_tot_energy_path = "../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_opt.npz"
raw_elec_energy_path = "../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_opt_electronic.npz"
# Keep the original (misspelled) name alive for any cell that still refers to it.
row_elec_energy_path = raw_elec_energy_path

# The total-energy archive used to be passed twice, leaving the electronic-energy path unused.
# NOTE(review): assumes the fifth argument of load_data is the electronic-energy archive —
# confirm against helper_code.data_processing.load_data.
benzene_energy_data = load_data(benz_atom, basis_pcx2, dest_csv_path, raw_tot_energy_path, raw_elec_energy_path)
display(benzene_energy_data)

Load data complete!


Unnamed: 0,charges,elements,total energy,electronic energy,delta total energy,delta electronic energy
0,"[7, 5, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, B, C, C, C, C, H, H, H, H, H, H]",-232.488488,-336.906006,2.576317,-0.077315
1,"[7, 6, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, B, C, C, C, H, H, H, H, H, H]",-232.427609,-336.995987,2.515439,0.012665
2,"[7, 6, 6, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, C, B, C, C, H, H, H, H, H, H]",-232.433092,-337.004116,2.520922,0.020794
3,"[7, 7, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, B, C, C, H, H, H, H, H, H]",-235.671427,-340.400298,5.759256,3.416976
4,"[7, 7, 5, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, C, B, C, H, H, H, H, H, H]",-235.708812,-340.318992,5.796641,3.33567
5,"[7, 7, 5, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, N, B, C, C, B, H, H, H, H, H, H]",-235.785018,-340.193778,5.872848,3.210456
6,"[7, 7, 6, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, C, B, B, C, H, H, H, H, H, H]",-235.595264,-340.510269,5.683093,3.526948
7,"[7, 5, 7, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, B, N, C, C, B, H, H, H, H, H, H]",-235.875126,-340.067245,5.962955,3.083924
8,"[7, 5, 7, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, B, N, C, B, C, H, H, H, H, H, H]",-235.813393,-340.126176,5.901222,3.142854
9,"[7, 6, 7, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, C, N, B, B, C, H, H, H, H, H, H]",-235.694052,-340.325617,5.781881,3.342296


## Molecular Representation ##

### Hessian-based ANM ###

In [13]:
# Build the ANM basis: eigen-decompose the matrix returned by get_hessian() and persist the
# eigenvector matrix Q (eigenvectors are the columns of Q, eigenvalues are in Q_eig_val).
H = get_hessian()
# np.linalg.eig returns the eigenvalues in no particular order.
# NOTE(review): the displayed H looks symmetric, for which np.linalg.eigh is the dedicated
# routine; switching would reorder the columns of Q and therefore change the saved basis.
Q_eig_val, Q = np.linalg.eig(H)
# The 'CCS_basis' directory must already exist relative to the notebook's working directory.
np.savetxt('CCS_basis/ANM_basis.txt', Q)

# Wrap both matrices in DataFrames purely for rich display.
H_df = pd.DataFrame(H)
Q_df = pd.DataFrame(Q)
print(Q_eig_val)
display(H_df)
display(Q_df)


[-2.61645906 -3.59391232 -3.56441216 -3.50145981 -3.51709044 -3.51211196]


Unnamed: 0,0,1,2,3,4,5
0,-3.375838,0.163026,0.150611,0.139364,0.164915,0.143651
1,0.163026,-3.375836,0.143651,0.164914,0.139366,0.150608
2,0.150611,0.143651,-3.401051,0.143651,0.150607,0.191661
3,0.139364,0.164914,0.143651,-3.375837,0.163026,0.15061
4,0.164915,0.139366,0.150607,0.163026,-3.375835,0.143651
5,0.143651,0.150608,0.191661,0.15061,0.143651,-3.401048


Unnamed: 0,0,1,2,3,4,5
0,-0.40926,-0.060626,0.500021,0.287227,0.499965,0.496332
1,-0.40926,0.060588,-0.49998,0.287228,0.500048,-0.496293
2,-0.406216,0.701905,-1.4e-05,-0.578764,1e-06,0.085726
3,-0.40926,0.060596,0.500007,0.287202,-0.499964,-0.496365
4,-0.409261,-0.060578,-0.499991,0.287298,-0.500023,0.496268
5,-0.406218,-0.701884,-4.2e-05,-0.578798,-2.7e-05,-0.085668


### Inverse Distance Matrix ###

In [43]:
# Build the inverse-distance basis: eigen-decompose the matrix returned by get_inv_dist_M and
# persist its eigenvector matrix (eigenvectors are the columns of Q_inv_dist).
# NOTE(review): 12 presumably is the number of atoms listed in benz_atom — confirm against
# create_coord_array.
coord = create_coord_array(benz_atom, 12)
M_inv_dist = get_inv_dist_M(coord)
# np.linalg.eig returns the eigenvalues in no particular order.
Q_inv_dist_eig_val, Q_inv_dist = np.linalg.eig(M_inv_dist)
# The 'CCS_basis' directory must already exist relative to the notebook's working directory.
np.savetxt('CCS_basis/Inverse_distance_basis.txt', Q_inv_dist)

print(Q_inv_dist_eig_val)
# DataFrames are used only for rich display of the matrix and its eigenvectors.
M_inv_dist_df = pd.DataFrame(M_inv_dist)
Q_inv_dist_df = pd.DataFrame(Q_inv_dist)
display(M_inv_dist_df)
display(Q_inv_dist_df)

[7.27710313 5.38938855 5.86808844 5.86809913 5.49054754 5.4905542 ]


Unnamed: 0,0,1,2,3,4,5
0,5.897297,0.377545,0.217972,0.188775,0.217979,0.377542
1,0.377545,5.897297,0.377542,0.217979,0.188775,0.217972
2,0.217972,0.377542,5.897297,0.377542,0.217972,0.188765
3,0.188775,0.217979,0.377542,5.897297,0.377545,0.217972
4,0.217979,0.188775,0.217972,0.377545,5.897297,0.377542
5,0.377542,0.217972,0.188765,0.217972,0.377542,5.897297


Unnamed: 0,0,1,2,3,4,5
0,0.40825,0.40825,-0.50017,-0.288379,0.499785,0.289045
1,0.40825,-0.40825,-0.49983,0.288967,-0.500215,0.288301
2,0.408245,0.408245,0.00034,0.577353,0.00043,-0.577352
3,0.40825,-0.40825,0.50017,0.288379,0.499785,0.289045
4,0.40825,0.40825,0.49983,-0.288967,-0.500215,0.288301
5,0.408245,-0.408245,-0.00034,-0.577353,0.00043,-0.577352


### Random Projection Matrix ###

In [41]:
# a random symmetric positive definite matrix
# The values are hard-coded (not drawn at run time), so the resulting basis is the same on
# every run. The literal is symmetric with a unit diagonal.

M_rand = np.array([[1., 0.06029536, 0.44295679, 0.27515277, 0.29444547, 0.45542014],
                   [0.06029536, 1., 0.04501127, 0.30063993, 0.44013144, 0.31636733],
                   [0.44295679, 0.04501127, 1., 0.26095881, 0.24953525, 0.32341057],
                   [0.27515277, 0.30063993, 0.26095881, 1., 0.89543119, 0.67206881],
                   [0.29444547, 0.44013144, 0.24953525, 0.89543119, 1., 0.90339149],
                   [0.45542014, 0.31636733, 0.32341057, 0.67206881, 0.90339149, 1.]])
# Eigenvectors (columns of Q_rand) serve as the "random" basis; eigenvalues are kept for the
# lambda * c^2 feature transformation.
Q_rand_eig_val, Q_rand = np.linalg.eig(M_rand)
# The 'CCS_basis' directory must already exist relative to the notebook's working directory.
np.savetxt('CCS_basis/Random_matrix_basis.txt', Q_rand)

Q_rand_df = pd.DataFrame(Q_rand)
display(Q_rand_df)

Unnamed: 0,0,1,2,3,4,5
0,0.299522,0.578496,-0.083805,-0.257781,0.687691,-0.170984
1,0.261689,-0.456619,0.085953,-0.061749,0.048034,-0.842324
2,0.260628,0.612509,-0.024358,0.066004,-0.680893,-0.29722
3,0.481236,-0.170599,0.396131,-0.666556,-0.187974,0.320556
4,0.532336,-0.228489,-0.783564,0.088042,-0.052447,0.199843
5,0.508831,-0.019035,0.462713,0.68799,0.151898,0.173844


## Data Transformation ##

In [30]:
def compute_lambda_c_square(c_arr, eig_val_arr):
    """
    Weight the square of every coefficient by its matching eigenvalue.

    Args:
        c_arr (list): coefficients, one per basis vector
        eig_val_arr (list): eigenvalues, paired positionally with ``c_arr``
    Returns:
        list: ``eig_val * coef**2`` for each pair; pairing stops at the shorter input
    """
    weighted_squares = []
    for eigenvalue, coefficient in zip(eig_val_arr, c_arr):
        weighted_squares.append(eigenvalue * coefficient**2)
    return weighted_squares

In [39]:
# Build the coefficient columns for the ANM basis Q and show one sample row of each.
benzene_data_ANM_basis, dx_col, c_col = generate_coef_with_specific_basis(benzene_energy_data, Q, coord, ref_charge=6)
print(dx_col)
print(c_col)
print()
for label, col in (
    ("Sample c vector:", 'c'),
    ("Sample lexi_c vector:", 'lexi_c'),
    ("Sample c_inv vector:", 'c_inv'),
    ("Sample lexi_c_inv vector:", 'lexi_c_inv'),
):
    print(label, [round(element, 2) for element in benzene_data_ANM_basis[col][0]])

# engineer some new features
# transformation: lambda * c^2

for source_col, target_col in (
    ('c', 'c_square_eig'),
    ('lexi_c', 'lexi_c_square_eig'),
    ('c_inv', 'c_inv_square_eig'),
    ('lexi_c_inv', 'lexi_c_inv_square_eig'),
):
    benzene_data_ANM_basis[target_col] = benzene_data_ANM_basis[source_col].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_eig_val)
    )
print()
for label, col in (
    ("Sample c_square_eig vector:", 'c_square_eig'),
    ("Sample lexi_c_square_eig vector:", 'lexi_c_square_eig'),
    ("Sample c_inv_square_eig vector:", 'c_inv_square_eig'),
    ("Sample lexi_c_inv_square_eig vector:", 'lexi_c_inv_square_eig'),
):
    print(label, [round(element, 2) for element in benzene_data_ANM_basis[col][0]])


['dx', 'sorted_dx', 'lexi_dx', 'num_dope']
['c', 'c_inv', 'sorted_c', 'lexi_c', 'lexi_c_inv', 'coulomb_sort_c']

Sample c vector: [-0.35, -0.47, -1.11, -0.47, -0.35, 0.3]
Sample lexi_c vector: [0.91, -0.09, 0.49, -0.09, 0.91, 0.32]
Sample c_inv vector: [0.0, -0.12, 1.0, -0.0, -0.0, 0.99]
Sample lexi_c_inv vector: [0.0, -0.64, -0.5, -0.87, -0.5, -0.58]

Sample c_square_eig vector: [-0.32, -0.79, -4.38, -0.77, -0.43, -0.31]
Sample lexi_c_square_eig vector: [-2.15, -0.03, -0.86, -0.03, -2.88, -0.36]
Sample c_inv_square_eig vector: [-0.0, -0.05, -3.56, -0.0, -0.0, -3.46]
Sample lexi_c_inv_square_eig vector: [-0.0, -1.48, -0.89, -2.63, -0.88, -1.19]


In [44]:
# Same lambda * c^2 feature engineering as for the ANM basis, repeated for the
# inverse-distance basis and then for the random-matrix basis.
# NOTE(review): if generate_coef_with_specific_basis modifies benzene_energy_data in place and
# returns it, the per-basis frames would alias each other — confirm it returns a fresh frame.
benzene_data_inv_dist_basis, dx_col, c_col = generate_coef_with_specific_basis(benzene_energy_data, Q_inv_dist, coord, ref_charge=6)
for source_col, target_col in (
    ('c', 'c_square_eig'),
    ('lexi_c', 'lexi_c_square_eig'),
    ('c_inv', 'c_inv_square_eig'),
    ('lexi_c_inv', 'lexi_c_inv_square_eig'),
):
    benzene_data_inv_dist_basis[target_col] = benzene_data_inv_dist_basis[source_col].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_inv_dist_eig_val)
    )

benzene_data_rand_basis, dx_col, c_col = generate_coef_with_specific_basis(benzene_energy_data, Q_rand, coord, ref_charge=6)
for source_col, target_col in (
    ('c', 'c_square_eig'),
    ('lexi_c', 'lexi_c_square_eig'),
    ('c_inv', 'c_inv_square_eig'),
    ('lexi_c_inv', 'lexi_c_inv_square_eig'),
):
    benzene_data_rand_basis[target_col] = benzene_data_rand_basis[source_col].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_rand_eig_val)
    )

## Export Training Data ##

In [50]:
# Generate the model-input datasets for each of the three bases.
ANM_basis_datasets = generate_input_training_data(benzene_data_ANM_basis)
inv_dist_basis_datasets = generate_input_training_data(benzene_data_inv_dist_basis)
rand_basis_datasets = generate_input_training_data(benzene_data_rand_basis)

# complete_dataset and prefixes are paired positionally by the zip below.
complete_dataset = [ANM_basis_datasets, inv_dist_basis_datasets, rand_basis_datasets]
prefixes = ['ANM', 'inv_dist', 'rand']
# NOTE(review): these names are presumably matched by position to the datasets returned by
# generate_input_training_data — confirm the order against that helper.
dataset_names = ['X', 'X_inv', 'X_sorted', 'X_lexi', 'X_lexi_inv', 'X_nd', 'X_lexi_nd', 'X_coulomb',
                     'X_square_eig', 'X_inv_square_eig', 'X_lexi_square_eig', 'X_lexi_inv_square_eig']

# One export call per basis; the prefix identifies the basis in the export.
dest_folder = "../data/benzene_training_data"
for datasets, prefix in zip(complete_dataset, prefixes):
    export_to_csv_custom(datasets, dataset_names, prefix, dest_folder)

In [47]:
# Save data to csv
# The four regression targets are written without their index column.

y_energy, y_elec, y_delta_energy, y_delta_elec = generate_target_training_data(benzene_energy_data)

for target_frame, csv_path in (
    (y_energy, '../data/benzene_training_data/[Benz] y_energy.csv'),
    (y_elec, '../data/benzene_training_data/[Benz] y_elec.csv'),
    (y_delta_energy, '../data/benzene_training_data/[Benz] y_delta_energy.csv'),
    (y_delta_elec, '../data/benzene_training_data/[Benz] y_delta_elec.csv'),
):
    target_frame.to_csv(csv_path, index=False)

## Regression ##

In [None]:
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import warnings

import torch
import torch.nn as nn
import torch.optim as optim

### Polynomial Kernel ###

In [None]:
# 5-fold cross-validated RMSE of a degree-2 polynomial kernel ridge model.
# NOTE(review): X_concat is not defined in any cell shown above — it must already exist in the
# kernel; confirm where it is built.
params = {'alpha': 3.4835443037974683e-13, 'coef0': 0.0024369823529411766, 'degree': 2, 'kernel': 'poly'}
KRR_model = KernelRidge(**params)

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)

# All warnings raised during cross-validation are silenced inside this context.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    mse_scores = cross_val_score(KRR_model, X_concat, y_delta_energy, scoring='neg_mean_squared_error', cv=k_fold)
    # The scorer returns negated MSE, so flip the sign before taking the root.
    rmse_scores = np.sqrt(-mse_scores)  

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the RMSE for each fold
print("Polynomial KRR:")
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average RMSE
print(f"Average RMSE across all folds: {avg_rmse}")

### Gaussian Kernel ###

In [None]:
# 5-fold cross-validated RMSE of a Gaussian (RBF) kernel ridge model on X_coulomb.
# NOTE(review): X_coulomb is not defined in any cell shown above — it must already exist in
# the kernel; confirm where it is built.
params = {'alpha': 1.5306122448979593e-09, 'gamma': 2.2857142857142856e-06, 'kernel': 'rbf'}
KRR_model = KernelRidge(**params)

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
mse_scores = cross_val_score(KRR_model, X_coulomb, y_delta_energy, scoring='neg_mean_squared_error', cv=k_fold)
# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-mse_scores)

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the RMSE for each fold (the header previously said "Polynomial KRR:" by copy-paste)
print("Gaussian KRR:")
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average RMSE (the label previously said MSE although the value is an RMSE)
print(f"Average RMSE across all folds: {avg_rmse}")

### Linear Ridge Regression ###

In [None]:
# 5-fold cross-validated RMSE of a linear ridge model on X_lexi_nd.
# NOTE(review): X_lexi_nd is not defined in any cell shown above — it must already exist in
# the kernel; confirm where it is built.
linear_ridge_model = Ridge(alpha=1)
# Original (misspelled) name kept so any other cell referring to it keeps working.
liear_model = linear_ridge_model

k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = cross_val_score(linear_ridge_model, X_lexi_nd, y_delta_energy, scoring='neg_mean_squared_error', cv=k_fold)
rmse_scores = np.sqrt(-mse_scores)  # The scorer returns negated MSE; flip the sign, then take the root

# Calculate the average RMSE across all folds
avg_rmse = rmse_scores.mean()

# Print the RMSE for each fold. The loop previously iterated mse_scores while printing `rmse`,
# a stale variable left over from an earlier cell, so every fold showed the same wrong number.
for fold, rmse in enumerate(rmse_scores):
    print(f"Fold {fold+1}: RMSE = {rmse}")

# Print the average RMSE (the label previously said MSE although the value is an RMSE)
print(f"Average RMSE across all folds: {avg_rmse}")

## Model Prediction and Learning Curve ##

In [None]:
def evaluate_performance(model, X, y, num_training_sample, num_trials):

    """
    Estimate the mean absolute error (MAE) of a model for a given training-set size.

    For each trial, ``num_training_sample`` examples are drawn at random (seeded with the
    trial number, so results are reproducible) to fit the model, which is then scored on
    all remaining examples.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted in every trial
        X (ndarray): input data; size (N, m) where N is the number of examples and m is the number of features
        y (ndarray): target data; size (N, 1)
        num_training_sample (int): the number of samples used for training; must leave at least
            one example for validation
        num_trials (int): the number of random train/validation splits

    Returns:
        average_error: the mean of the per-trial MAE values
        std_dev_error: the standard error of that mean, i.e. the standard deviation of the
            per-trial MAE values divided by sqrt(num_trials)
    """

    errors = []
    # An integer train_size gives exactly the requested number of training examples; the former
    # float test_size (1 - k/N) could shift the split by one sample through rounding.
    n_train = int(num_training_sample)

    for trial in range(num_trials):
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, train_size=n_train, shuffle=True, random_state=trial
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        errors.append(mean_absolute_error(y_val, y_pred))

    average_error = np.mean(errors)
    std_dev_error = np.std(errors)/np.sqrt(num_trials)
    return average_error, std_dev_error

### Polynomial Kernel ###

#### Poly_lexi ####

In [None]:
# Poly_lexi_delta_tot
# Learning curve: degree-2 polynomial KRR fitted on X_lexi to predict y_delta_energy.

best_params = {'alpha': 3.4835443037974683e-13, 'coef0': 0.0024369823529411766, 'degree': 2, 'kernel': 'poly'}
model_poly_lexi_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_lexi_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_lexi_delta_tot, X_lexi, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_lexi_delta_tot.at[index, column] = value

display(model_performance_poly_lexi_delta_tot)

In [None]:
# Poly_lexi_delta_elec
# Learning curve: degree-2 polynomial KRR fitted on X_lexi to predict y_delta_elec.

best_params = {'alpha': 3.4835443037974683e-13, 'coef0': 0.0024369823529411766, 'degree': 2, 'kernel': 'poly'}
model_poly_lexi_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_lexi_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_lexi_delta_elec, X_lexi, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_lexi_delta_elec.at[index, column] = value

display(model_performance_poly_lexi_delta_elec)

#### Poly_lexi_nd ####

In [None]:
# Poly_lexi_nd_delta_tot
# Learning curve: degree-2 polynomial KRR fitted on X_lexi_nd to predict y_delta_energy.

best_params = {'alpha': 1e-13, 'coef0': 6.04e-06, 'degree': 2, 'kernel': 'poly'}
model_poly_lexi_nd_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_lexi_nd_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_lexi_nd_delta_tot, X_lexi_nd, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_lexi_nd_delta_tot.at[index, column] = value

display(model_performance_poly_lexi_nd_delta_tot)

In [None]:
# Poly_lexi_nd_delta_elec
# Learning curve: degree-2 polynomial KRR fitted on X_lexi_nd to predict y_delta_elec.

best_params = {'alpha': 1e-13, 'coef0': 6.04e-06, 'degree': 2, 'kernel': 'poly'}
model_poly_lexi_nd_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_lexi_nd_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_lexi_nd_delta_elec, X_lexi_nd, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_lexi_nd_delta_elec.at[index, column] = value

display(model_performance_poly_lexi_nd_delta_elec)

#### Poly_sorted ####

In [None]:
# Poly_sorted_delta_tot
# Learning curve: degree-2 polynomial KRR fitted on X_sorted to predict y_delta_energy.

best_params = {'alpha': 7.969230769230769e-13, 'coef0': 3.9076923076923075e-05, 'degree': 2, 'kernel': 'poly'}
model_poly_sorted_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_sorted_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_sorted_delta_tot, X_sorted, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_sorted_delta_tot.at[index, column] = value

display(model_performance_poly_sorted_delta_tot)

In [None]:
# Poly_sorted_delta_elec
# Learning curve: degree-2 polynomial KRR fitted on X_sorted to predict y_delta_elec.

best_params = {'alpha': 7.969230769230769e-13, 'coef0': 3.9076923076923075e-05, 'degree': 2, 'kernel': 'poly'}
model_poly_sorted_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_sorted_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_sorted_delta_elec, X_sorted, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_sorted_delta_elec.at[index, column] = value

display(model_performance_poly_sorted_delta_elec)

#### Poly_coulomb ####

In [None]:
# Poly_coulomb_delta_tot
# Learning curve: degree-2 polynomial KRR fitted on X_coulomb to predict y_delta_energy.

best_params = {'alpha': 0.1489795918367347, 'coef0': 1.183673469387755, 'degree': 2, 'kernel': 'poly'}
model_poly_coulomb_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_coulomb_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_coulomb_delta_tot, X_coulomb, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_coulomb_delta_tot.at[index, column] = value

display(model_performance_poly_coulomb_delta_tot)

In [None]:
# Poly_coulomb_delta_elec
# Learning curve: degree-2 polynomial KRR fitted on X_coulomb to predict y_delta_elec.

best_params = {'alpha': 0.1489795918367347, 'coef0': 1.183673469387755, 'degree': 2, 'kernel': 'poly'}
model_poly_coulomb_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_poly_coulomb_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_poly_coulomb_delta_elec, X_coulomb, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_poly_coulomb_delta_elec.at[index, column] = value

display(model_performance_poly_coulomb_delta_elec)

### Gaussian Kernel ###

#### Gaussian_lexi ####

In [None]:
# Gaussian_lexi_delta_tot
# Learning curve: RBF-kernel KRR fitted on X_lexi to predict y_delta_energy.

best_params = {'alpha': 1e-17, 'gamma': 2e-07, 'kernel': 'rbf'}
model_gaussian_lexi_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_lexi_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_lexi_delta_tot, X_lexi, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_lexi_delta_tot.at[index, column] = value

display(model_performance_gaussian_lexi_delta_tot)

In [None]:
# Gaussian_lexi_delta_elec
# Learning curve: RBF-kernel KRR fitted on X_lexi to predict y_delta_elec.

best_params = {'alpha': 1e-17, 'gamma': 2e-07, 'kernel': 'rbf'}
model_gaussian_lexi_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_lexi_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_lexi_delta_elec, X_lexi, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_lexi_delta_elec.at[index, column] = value

display(model_performance_gaussian_lexi_delta_elec)

#### Gaussian_lexi_nd ####

In [None]:
# Gaussian_lexi_nd_delta_tot
# Learning curve: RBF-kernel KRR fitted on X_lexi_nd to predict y_delta_energy.

best_params = {'alpha': 3.755102040816326e-08, 'gamma': 1.7346938775510206e-06, 'kernel': 'rbf'}
model_gaussian_lexi_nd_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_lexi_nd_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_lexi_nd_delta_tot, X_lexi_nd, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_lexi_nd_delta_tot.at[index, column] = value

display(model_performance_gaussian_lexi_nd_delta_tot)

In [None]:
# Gaussian_lexi_nd_delta_elec
# Learning curve: RBF-kernel KRR fitted on X_lexi_nd to predict y_delta_elec.

best_params = {'alpha': 3.755102040816326e-08, 'gamma': 1.7346938775510206e-06, 'kernel': 'rbf'}
model_gaussian_lexi_nd_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_lexi_nd_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_lexi_nd_delta_elec, X_lexi_nd, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_lexi_nd_delta_elec.at[index, column] = value

display(model_performance_gaussian_lexi_nd_delta_elec)

#### Gaussian_sorted ####

In [None]:
# Gaussian_sorted_delta_tot
# Learning curve: RBF-kernel KRR fitted on X_sorted to predict y_delta_energy.

best_params = {'alpha': 1.5306122448979593e-9, 'gamma': 2.2857142857142856e-06, 'kernel': 'rbf'}
model_gaussian_sorted_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_sorted_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_sorted_delta_tot, X_sorted, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_sorted_delta_tot.at[index, column] = value

display(model_performance_gaussian_sorted_delta_tot)

In [None]:
# Gaussian_sorted_delta_elec
# Learning curve: RBF-kernel KRR fitted on X_sorted to predict y_delta_elec.

best_params = {'alpha': 1.5306122448979593e-9, 'gamma': 2.2857142857142856e-06, 'kernel': 'rbf'}
model_gaussian_sorted_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_sorted_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_sorted_delta_elec, X_sorted, y_delta_elec, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_sorted_delta_elec.at[index, column] = value

display(model_performance_gaussian_sorted_delta_elec)

#### Gaussian_coulomb ####

In [None]:
# Gaussian_coulomb_delta_tot
# Learning curve: RBF-kernel KRR fitted on X_coulomb to predict y_delta_energy.

best_params = {'alpha': 1e-9, 'gamma': 1.5998587196060572e-05, 'kernel': 'rbf'}
model_gaussian_coulomb_delta_tot = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_coulomb_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8 and 16, each scored over num_trials random splits.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Result rows are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_coulomb_delta_tot, X_coulomb, y_delta_energy, num_training_sample, num_trials
        )
        # Scaled by 1000 to match the mHa column labels (assumes the targets are in Hartree).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column, value in zip(columns, row_values):
            model_performance_gaussian_coulomb_delta_tot.at[index, column] = value

display(model_performance_gaussian_coulomb_delta_tot)

In [None]:
# Gaussian_coulomb_delta_elec
# Learning-curve table for an RBF-kernel KernelRidge model evaluated on
# (X_coulomb, y_delta_elec).

# Hyperparameters are fixed literals in this cell
# (presumably chosen by an earlier search — TODO confirm).
best_params = {'alpha': 1e-05, 'gamma': 1.5998587196060572e-05, 'kernel': 'rbf'}
model_gaussian_coulomb_delta_elec = KernelRidge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_gaussian_coulomb_delta_elec = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8, 16; num_trials is passed through to evaluate_performance.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Rows of the result table are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_gaussian_coulomb_delta_elec, X_coulomb, y_delta_elec, num_training_sample, num_trials
        )
        # Both error figures are multiplied by 1000 before being stored in the
        # "(mHa)" columns (assumes evaluate_performance reports Ha — TODO confirm).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column_name, cell_value in zip(columns, row_values):
            model_performance_gaussian_coulomb_delta_elec.at[index, column_name] = cell_value

display(model_performance_gaussian_coulomb_delta_elec)

### Linear Model ###

In [None]:
# linear_lexi_delta_tot
# Learning-curve table for a Ridge model evaluated on (X_lexi, y_delta_energy).

# The regularisation strength is a fixed literal in this cell
# (presumably chosen by an earlier search — TODO confirm).
best_params = {'alpha': 219}
model_linear_lexi_delta_tot = Ridge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_linear_lexi_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8, 16; num_trials is passed through to evaluate_performance.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Rows of the result table are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_linear_lexi_delta_tot, X_lexi, y_delta_energy, num_training_sample, num_trials
        )
        # Both error figures are multiplied by 1000 before being stored in the
        # "(mHa)" columns (assumes evaluate_performance reports Ha — TODO confirm).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column_name, cell_value in zip(columns, row_values):
            model_performance_linear_lexi_delta_tot.at[index, column_name] = cell_value

display(model_performance_linear_lexi_delta_tot)

In [None]:
# linear_lexi_nd_delta_tot
# Learning-curve table for a Ridge model evaluated on (X_lexi_nd, y_delta_energy).

# The regularisation strength is a fixed literal in this cell
# (presumably chosen by an earlier search — TODO confirm).
best_params = {'alpha': 0.0367}
model_linear_lexi_nd_delta_tot = Ridge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_linear_lexi_nd_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8, 16; num_trials is passed through to evaluate_performance.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Rows of the result table are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_linear_lexi_nd_delta_tot, X_lexi_nd, y_delta_energy, num_training_sample, num_trials
        )
        # Both error figures are multiplied by 1000 before being stored in the
        # "(mHa)" columns (assumes evaluate_performance reports Ha — TODO confirm).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column_name, cell_value in zip(columns, row_values):
            model_performance_linear_lexi_nd_delta_tot.at[index, column_name] = cell_value

display(model_performance_linear_lexi_nd_delta_tot)

In [None]:
# linear_sorted_delta_tot
# Learning-curve table for a Ridge model evaluated on (X_sorted, y_delta_energy).

# The regularisation strength is a fixed literal in this cell
# (presumably chosen by an earlier search — TODO confirm).
best_params = {'alpha': 0.002}
model_linear_sorted_delta_tot = Ridge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_linear_sorted_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8, 16; num_trials is passed through to evaluate_performance.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Rows of the result table are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_linear_sorted_delta_tot, X_sorted, y_delta_energy, num_training_sample, num_trials
        )
        # Both error figures are multiplied by 1000 before being stored in the
        # "(mHa)" columns (assumes evaluate_performance reports Ha — TODO confirm).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column_name, cell_value in zip(columns, row_values):
            model_performance_linear_sorted_delta_tot.at[index, column_name] = cell_value

display(model_performance_linear_sorted_delta_tot)

In [None]:
# linear_coulomb_delta_tot
# Learning-curve table for a Ridge model evaluated on (X_coulomb, y_delta_energy).

# The regularisation strength is a fixed literal in this cell
# (presumably chosen by an earlier search — TODO confirm).
best_params = {'alpha': 5}
model_linear_coulomb_delta_tot = Ridge(**best_params)

columns = ['training size', 'average MAE (mHa)', 'standard deviation (mHa)']
model_performance_linear_coulomb_delta_tot = pd.DataFrame(columns=columns)

# Training-set sizes 2, 4, 8, 16; num_trials is passed through to evaluate_performance.
training_size = [2 ** exponent for exponent in range(1, 5)]
num_trials = 20

with warnings.catch_warnings():
    # Rows of the result table are labelled 1..len(training_size).
    for index, num_training_sample in enumerate(training_size, start=1):
        warnings.filterwarnings("ignore")
        average_error, std_dev_error = evaluate_performance(
            model_linear_coulomb_delta_tot, X_coulomb, y_delta_energy, num_training_sample, num_trials
        )
        # Both error figures are multiplied by 1000 before being stored in the
        # "(mHa)" columns (assumes evaluate_performance reports Ha — TODO confirm).
        row_values = (num_training_sample, average_error * 1000, std_dev_error * 1000)
        for column_name, cell_value in zip(columns, row_values):
            model_performance_linear_coulomb_delta_tot.at[index, column_name] = cell_value

display(model_performance_linear_coulomb_delta_tot)

### Graphing ###

In [None]:
# No error bar
# Learning curves (average MAE vs. training size) for all twelve
# model/representation tables, drawn on a single figure and saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 8))

# Load the data
mae_column = 'average MAE (mHa)'
x = model_performance_poly_lexi_delta_tot['training size']

y1, y2, y3, y4 = (
    table[mae_column]
    for table in (
        model_performance_poly_lexi_delta_tot,
        model_performance_poly_lexi_nd_delta_tot,
        model_performance_poly_sorted_delta_tot,
        model_performance_poly_coulomb_delta_tot,
    )
)
y5, y6, y7, y8 = (
    table[mae_column]
    for table in (
        model_performance_gaussian_lexi_delta_tot,
        model_performance_gaussian_lexi_nd_delta_tot,
        model_performance_gaussian_sorted_delta_tot,
        model_performance_gaussian_coulomb_delta_tot,
    )
)
y9, y10, y11, y12 = (
    table[mae_column]
    for table in (
        model_performance_linear_lexi_delta_tot,
        model_performance_linear_lexi_nd_delta_tot,
        model_performance_linear_sorted_delta_tot,
        model_performance_linear_coulomb_delta_tot,
    )
)

# Plotting: one (series, legend label, marker) entry per curve, in draw order.
# The marker identifies the model family: 'o' polynomial, '^' Gaussian, '*' linear.
linewidth = 4
curves = [
    (y1, 'Polynomial KRR (lexi)', 'o'),
    (y2, 'Polynomial KRR (lexi, num_doped)', 'o'),
    (y3, 'Polynomial KRR (sorted)', 'o'),
    (y4, 'Polynomial KRR (coulomb)', 'o'),
    (y5, 'Gaussian KRR (lexi)', '^'),
    (y6, 'Gaussian KRR (lexi, num_doped)', '^'),
    (y7, 'Gaussian KRR (sorted)', '^'),
    (y8, 'Gaussian KRR (coulomb)', '^'),
    (y9, 'Linear Model (lexi)', '*'),
    (y10, 'Linear Model (lexi, num_doped)', '*'),
    (y11, 'Linear Model (sorted)', '*'),
    (y12, 'Linear Model (coulomb)', '*'),
]
for curve, curve_label, curve_marker in curves:
    ax.plot(x, curve, label=curve_label, marker=curve_marker, linestyle='-', linewidth=linewidth)

# Customize the plot
ax.set_title('Learning curve for BN-Doped Benzene molecule total energy prediction')
ax.set_xlabel('Training Size')
ax.set_ylabel('Average MAE [mHa]')
ax.legend()

# Log scales: base 2 on x, base 10 on y, with explicit tick positions and labels.
ax.set_xscale('log', base=2)
ax.set_yscale('log', base=10)

xticks = [2 ** exponent for exponent in range(1, 5)]
yticks = [10 ** exponent for exponent in range(1, 4)]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Save, then display the plot
fig.savefig('../Graph/[Benz] [2.0] [all] Learning curve for BN-Doped Benzene molecule total energy prediction.png', dpi=300)
plt.show()


In [None]:
# No error bar
# Learning curves (average MAE vs. training size) for the four polynomial
# KRR tables, saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 8))

# Load the data
mae_column = 'average MAE (mHa)'
x = model_performance_poly_lexi_delta_tot['training size']

y1, y2, y3, y4 = (
    table[mae_column]
    for table in (
        model_performance_poly_lexi_delta_tot,
        model_performance_poly_lexi_nd_delta_tot,
        model_performance_poly_sorted_delta_tot,
        model_performance_poly_coulomb_delta_tot,
    )
)

# Plotting: one (series, legend label) entry per curve, in draw order.
linewidth = 4
curves = [
    (y1, 'Polynomial KRR (lexi)'),
    (y2, 'Polynomial KRR (lexi, num_doped)'),
    (y3, 'Polynomial KRR (sorted)'),
    (y4, 'Polynomial KRR (coulomb)'),
]
for curve, curve_label in curves:
    ax.plot(x, curve, label=curve_label, marker='o', linestyle='-', linewidth=linewidth)

# Customize the plot
ax.set_title('Learning curve for BN-Doped Benzene molecule total energy prediction')
ax.set_xlabel('Training Size')
ax.set_ylabel('Average MAE [mHa]')
ax.legend()

# Log scales: base 2 on x, base 10 on y, with explicit tick positions and labels.
ax.set_xscale('log', base=2)
ax.set_yscale('log', base=10)

xticks = [2 ** exponent for exponent in range(1, 5)]
yticks = [10 ** exponent for exponent in range(1, 4)]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Save, then display the plot
fig.savefig('../Graph/[Benz] [2.0] [Poly] Learning curve for BN-Doped Benzene molecule total energy prediction.png', dpi=300)
plt.show()


In [None]:
# No error bar
# Learning curves (average MAE vs. training size) for the four Gaussian
# KRR tables, saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 8))

# Load the data
# NOTE(review): x is read from the polynomial-lexi table even though only
# Gaussian tables are plotted; this assumes every table shares the same
# 'training size' values — confirm.
mae_column = 'average MAE (mHa)'
x = model_performance_poly_lexi_delta_tot['training size']

y5, y6, y7, y8 = (
    table[mae_column]
    for table in (
        model_performance_gaussian_lexi_delta_tot,
        model_performance_gaussian_lexi_nd_delta_tot,
        model_performance_gaussian_sorted_delta_tot,
        model_performance_gaussian_coulomb_delta_tot,
    )
)

# Plotting: one (series, legend label) entry per curve, in draw order.
linewidth = 4
curves = [
    (y5, 'Gaussian KRR (lexi)'),
    (y6, 'Gaussian KRR (lexi, num_doped)'),
    (y7, 'Gaussian KRR (sorted)'),
    (y8, 'Gaussian KRR (coulomb)'),
]
for curve, curve_label in curves:
    ax.plot(x, curve, label=curve_label, marker='^', linestyle='-', linewidth=linewidth)

# Customize the plot
ax.set_title('Learning curve for BN-Doped Benzene molecule total energy prediction')
ax.set_xlabel('Training Size')
ax.set_ylabel('Average MAE [mHa]')
ax.legend()

# Log scales: base 2 on x, base 10 on y, with explicit tick positions and labels.
ax.set_xscale('log', base=2)
ax.set_yscale('log', base=10)

xticks = [2 ** exponent for exponent in range(1, 5)]
yticks = [10 ** exponent for exponent in range(1, 4)]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Save, then display the plot
fig.savefig('../Graph/[Benz] [2.0] [Gaussian] Learning curve for BN-Doped Benzene molecule total energy prediction.png', dpi=300)
plt.show()


In [None]:
# No error bar
# Learning curves (average MAE vs. training size) for the four linear-model
# tables, saved as a PNG.
fig, ax = plt.subplots(figsize=(12, 8))

# Load the data
# NOTE(review): x is read from the polynomial-lexi table even though only
# linear-model tables are plotted; this assumes every table shares the same
# 'training size' values — confirm.
mae_column = 'average MAE (mHa)'
x = model_performance_poly_lexi_delta_tot['training size']

y9, y10, y11, y12 = (
    table[mae_column]
    for table in (
        model_performance_linear_lexi_delta_tot,
        model_performance_linear_lexi_nd_delta_tot,
        model_performance_linear_sorted_delta_tot,
        model_performance_linear_coulomb_delta_tot,
    )
)

# Plotting: one (series, legend label) entry per curve, in draw order.
linewidth = 4
curves = [
    (y9, 'Linear Model (lexi)'),
    (y10, 'Linear Model (lexi, num_doped)'),
    (y11, 'Linear Model (sorted)'),
    (y12, 'Linear Model (coulomb)'),
]
for curve, curve_label in curves:
    ax.plot(x, curve, label=curve_label, marker='*', linestyle='-', linewidth=linewidth)

# Customize the plot
ax.set_title('Learning curve for BN-Doped Benzene molecule total energy prediction')
ax.set_xlabel('Training Size')
ax.set_ylabel('Average MAE [mHa]')
ax.legend()

# Log scales: base 2 on x, base 10 on y, with explicit tick positions and labels.
ax.set_xscale('log', base=2)
ax.set_yscale('log', base=10)

xticks = [2 ** exponent for exponent in range(1, 5)]
yticks = [10 ** exponent for exponent in range(1, 4)]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Save, then display the plot
fig.savefig('../Graph/[Benz] [2.0] [Linear] Learning curve for BN-Doped Benzene molecule total energy prediction.png', dpi=300)
plt.show()


In [None]:
# Best Performing
# Learning curves (average MAE vs. training size) for a hand-picked subset:
# two Gaussian KRR tables (solid lines) and two linear-model tables (dashed).

# No error bar
fig, ax = plt.subplots(figsize=(12, 8))

# Load the data
# NOTE(review): x is read from the polynomial-lexi table although no
# polynomial curve is plotted; this assumes every table shares the same
# 'training size' values — confirm.
mae_column = 'average MAE (mHa)'
x = model_performance_poly_lexi_delta_tot['training size']

y6, y7 = (
    table[mae_column]
    for table in (
        model_performance_gaussian_lexi_nd_delta_tot,
        model_performance_gaussian_sorted_delta_tot,
    )
)
y10, y11 = (
    table[mae_column]
    for table in (
        model_performance_linear_lexi_nd_delta_tot,
        model_performance_linear_sorted_delta_tot,
    )
)

# Plotting: one (series, legend label, marker, line style) entry per curve, in draw order.
linewidth = 4
markersize = 20
curves = [
    (y6, 'Gaussian KRR (lexi, num_doped)', '^', '-'),
    (y7, 'Gaussian KRR (sorted)', '^', '-'),
    (y10, 'Linear Model (lexi, num_doped)', '*', '--'),
    (y11, 'Linear Model (sorted)', '*', '--'),
]
for curve, curve_label, curve_marker, curve_linestyle in curves:
    ax.plot(
        x,
        curve,
        label=curve_label,
        marker=curve_marker,
        linestyle=curve_linestyle,
        linewidth=linewidth,
        markersize=markersize,
    )

# Customize the plot
ax.set_title('Learning curve for BN-Doped Benzene molecule total energy prediction')
ax.set_xlabel('Training Size')
ax.set_ylabel('Average MAE [mHa]')
ax.legend()

# Log scales: base 2 on x, base 10 on y, with explicit tick positions and labels.
ax.set_xscale('log', base=2)
ax.set_yscale('log', base=10)

xticks = [2 ** exponent for exponent in range(1, 5)]
yticks = [10 ** exponent for exponent in range(1, 4)]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Save, then display the plot
fig.savefig('../Graph/[Benz] [2.0] [best] Learning curve for BN-Doped Benzene molecule total energy prediction.png', dpi=300)
plt.show()


In [None]:
# NOTE(review): this entire cell is disabled — every line below is commented out.
# It refers to model_performance_poly_KRR / model_performance_gaussian_KRR /
# model_performance_ridge_regression and to 'average RMSE' / 'standard deviation'
# columns, whereas the active cells in this part of the notebook build tables
# with 'average MAE (mHa)' / 'standard deviation (mHa)' columns. Confirm those
# names still exist before re-enabling it.
# # With Error Bar
# # Set figure size
# plt.figure(figsize=(10, 6))

# # Load the data
# x = model_performance_poly_KRR['training size']
# y1 = model_performance_poly_KRR['average RMSE']
# y2 = model_performance_gaussian_KRR['average RMSE']
# y3 = model_performance_ridge_regression['average RMSE']
# y1_error = model_performance_poly_KRR['standard deviation']
# y2_error = model_performance_gaussian_KRR['standard deviation']
# y3_error = model_performance_ridge_regression['standard deviation']

# # Plotting
# # plt.plot(x, y1, label='Polynomial KRR', marker='o', linestyle='-', linewidth=2.5)
# # plt.plot(x, y2, label='Gaussian KRR', marker='o', linestyle='-', linewidth=2.5)
# # plt.plot(x, y3, label='Ridge Regression', marker='o', linestyle='-', linewidth=2.5)
# plt.errorbar(x, y1, label='Polynomial KRR', yerr=y1_error, marker='o', linestyle='-', capsize=3)
# plt.errorbar(x, y2, label='Gaussian KRR', yerr=y2_error, marker='o', linestyle='-', capsize=3)
# plt.errorbar(x, y3, label='Ridge Regression', yerr=y3_error, marker='o', linestyle='-', capsize=3)

# # Customize the plot
# plt.title('Learning curve for BN-Doped Benzene molecule energy prediction')
# plt.xlabel('Training Size (log)')
# plt.ylabel('Average RMSE [Ha] (log)')
# plt.legend()

# # Create log scale
# plt.xscale('log', base=2)
# plt.yscale('log', base=2)

# # yticks = [2**i for i in range(-4, 7)]
# yticks = [2**i for i in range(0, 7)]
# plt.yticks(yticks, labels = yticks)

# # Display the plot
# plt.savefig('../Graph/[Benz] learning_curve_16_points_with_err_bar.png', dpi=300)
# plt.show()


### Comparing to Prediction ###

In [None]:
# Create a dataframe that contains the energies predicted by the three models
# (polynomial KRR, Gaussian KRR, ridge regression) next to the actual energy.
# NOTE(review): benzene_data, X, y, gaussian_KRR_model, ridge_model and
# poly_KRR_model are not defined in the nearby cells — confirm they are still
# created earlier in the notebook before running this cell.

# Specifies the columns of the dataframe and create an empty dataframe
columns = ['Elements', 'Poly KRR prediction', 'Gaussian KRR prediction', 'Ridge regression prediction', 'Actual energy']
model_prediction = pd.DataFrame(columns=columns)

# fill in elements and the actual energy values from the original benzene_data dataframe
model_prediction['Elements'] = benzene_data['Elements']
model_prediction['Actual energy'] = benzene_data['Energy']

# Fit all three models on the full (X, y) data. The predictions below are made
# on rows of the same X, so they are in-sample predictions.
gaussian_KRR_model.fit(X, y)
ridge_model.fit(X, y)
poly_KRR_model.fit(X, y)

# Select the rows of X in the same order as the rows of model_prediction and
# predict them in one batch per model, instead of one predict() call per row.
X_rows = X.loc[model_prediction.index]

# np.ravel flattens an (n,) or (n, 1) prediction array to one value per row,
# so the prediction columns hold plain numbers rather than objects.
model_prediction['Poly KRR prediction'] = np.ravel(poly_KRR_model.predict(X_rows))
model_prediction['Gaussian KRR prediction'] = np.ravel(gaussian_KRR_model.predict(X_rows))
model_prediction['Ridge regression prediction'] = np.ravel(ridge_model.predict(X_rows))

# display the data
display(model_prediction)


In [None]:
# Graph the results
# Plots each model's predicted energy and the actual energy against the row
# position in model_prediction, then saves the figure as a PNG.

plt.figure(figsize=(10, 6))

# One x position per row of model_prediction. Deriving the length from the
# table (rather than hard-coding 17) keeps x and the y series the same length
# whatever the size of the data set.
x = list(range(len(model_prediction)))
y_pred_1 = model_prediction['Gaussian KRR prediction']
y_pred_2 = model_prediction['Ridge regression prediction']
y_pred_3 = model_prediction['Poly KRR prediction']
y = model_prediction['Actual energy']

plt.plot(x, y_pred_1, label='Gaussian KRR prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(x, y_pred_2, label='Ridge regression prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(x, y_pred_3, label='Poly KRR prediction', marker='o', linestyle='-', linewidth=2.5)
plt.plot(x, y, label='Actual energy', marker='o', linestyle='-', linewidth=2.5)

plt.title('Comparing model prediction with actual energy')
plt.xlabel('Element index')
plt.ylabel('Energy [Ha]')
plt.legend()

# Fixed y ticks: 11 evenly spaced values from -230 down to -240.
yticks = np.linspace(-230, -240, num=11)
plt.yticks(yticks, labels = yticks)

plt.savefig('../Graph/[Benz] comparing_model_performance.png', dpi=300)
plt.show()
