In [1]:
import sys
sys.path.append('../..')
sys.path.append('../../APDFT')
sys.path.append('../../helper_code')
sys.path.append('../data')

import pickle
from pyscf import gto, scf, dft, cc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import basis_set_exchange as bse
from APDFT.FcMole import *
import os
import ast
from copy import deepcopy
from IPython.display import display
from helper_code.data_processing import *


%load_ext autoreload
%autoreload 2
from APDFT.AP_class import APDFT_perturbator as AP

## Load Dataset ##

In [2]:
# Atomic coordinates of the benzene molecule (the reference molecule for the ANM
# calculations). One atom per line as "<element> x y z": the six ring carbons
# first, followed by the six hydrogens.
# NOTE(review): the nearest-neighbour C-C spacing in this geometry is ~2.65,
# which looks like Bohr rather than Angstrom -- confirm the unit expected by
# load_data / create_coord_array.

benz_atom="""
C        3.22272669       0.22711285       0.00013582
C        5.87141753       0.22698034       0.00094988
C        7.19597908       2.52071412      -0.00011471
C        5.87164800       4.81458054      -0.00200817
C        3.22295713       4.81471307      -0.00280461
C        1.89839559       2.52097926      -0.00174231
H        2.18773340      -1.56549239       0.00096741
H        6.90623079      -1.56572844       0.00241360
H        9.26591446       2.52061061       0.00051784
H        6.90664130       6.60718579      -0.00284841
H        2.18814386       6.60742187      -0.00426425
H       -0.17153979       2.52108280      -0.00237226
"""

In [3]:
# Per-element basis-set specification: hydrogen is given by the basis name
# "pc-2", while C, N and O use the pcX-2 definitions fetched through
# basis_set_exchange in NWChem format.
basis_pcx2 = {
    "H": "pc-2",
    **{
        symbol: bse.get_basis("pcX-2", fmt="nwchem", elements=[atomic_number])
        for symbol, atomic_number in (("C", 6), ("N", 7), ("O", 8))
    },
}

In [4]:
# Destination CSV for the processed table, plus the two raw .npz energy archives.
dest_csv_path = "../data/benzene_processed_data/benzene_energy_data.csv"
raw_tot_energy_path = "../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_opt.npz"
raw_elec_energy_path = "../data/benzene_raw_data/Benzene_BNdoping_PBE0_pcX2_opt_electronic.npz"
# The original cell spelled this name "row_..."; keep it as an alias so any
# other cell that refers to the old name keeps working.
row_elec_energy_path = raw_elec_energy_path

# Fix: the total-energy archive used to be passed twice, leaving the
# electronic-energy archive defined but never used.
# NOTE(review): assumes the fifth positional argument of load_data is the
# electronic-energy archive -- confirm against helper_code.data_processing.
benzene_energy_data = load_data(
    benz_atom,
    basis_pcx2,
    dest_csv_path,
    raw_tot_energy_path,
    raw_elec_energy_path,
)
display(benzene_energy_data)

Load data complete!


Unnamed: 0,charges,elements,total energy,electronic energy,delta total energy,delta electronic energy
0,"[7, 5, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, B, C, C, C, C, H, H, H, H, H, H]",-232.488488,-336.906006,2.576317,-0.077315
1,"[7, 6, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, B, C, C, C, H, H, H, H, H, H]",-232.427609,-336.995987,2.515439,0.012665
2,"[7, 6, 6, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, C, B, C, C, H, H, H, H, H, H]",-232.433092,-337.004116,2.520922,0.020794
3,"[7, 7, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, B, C, C, H, H, H, H, H, H]",-235.671427,-340.400298,5.759256,3.416976
4,"[7, 7, 5, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, C, B, C, H, H, H, H, H, H]",-235.708812,-340.318992,5.796641,3.33567
5,"[7, 7, 5, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, N, B, C, C, B, H, H, H, H, H, H]",-235.785018,-340.193778,5.872848,3.210456
6,"[7, 7, 6, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, C, B, B, C, H, H, H, H, H, H]",-235.595264,-340.510269,5.683093,3.526948
7,"[7, 5, 7, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, B, N, C, C, B, H, H, H, H, H, H]",-235.875126,-340.067245,5.962955,3.083924
8,"[7, 5, 7, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, B, N, C, B, C, H, H, H, H, H, H]",-235.813393,-340.126176,5.901222,3.142854
9,"[7, 6, 7, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, C, N, B, B, C, H, H, H, H, H, H]",-235.694052,-340.325617,5.781881,3.342296


## Molecular Representation ##

### Hessian-based ANM ###

In [5]:
# Diagonalise the matrix returned by get_hessian(): the columns of Q are the
# eigenvectors used as the ANM basis and Q_eig_val holds the eigenvalues in the
# same column order.
# NOTE(review): H is symmetric in the displayed output, so np.linalg.eigh would
# also apply, but it returns eigenvalues in ascending order and would therefore
# permute the basis columns every downstream dataset is built on; np.linalg.eig
# is kept on purpose.
H = get_hessian()
Q_eig_val, Q = np.linalg.eig(H)

# np.savetxt does not create missing folders, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('CCS_basis', exist_ok=True)
np.savetxt('CCS_basis/ANM_basis.txt', Q)

H_df = pd.DataFrame(H)
Q_df = pd.DataFrame(Q)
print(Q_eig_val)
display(H_df)
display(Q_df)


[-2.61645906 -3.59391232 -3.56441216 -3.50145981 -3.51709044 -3.51211196]


Unnamed: 0,0,1,2,3,4,5
0,-3.375838,0.163026,0.150611,0.139364,0.164915,0.143651
1,0.163026,-3.375836,0.143651,0.164914,0.139366,0.150608
2,0.150611,0.143651,-3.401051,0.143651,0.150607,0.191661
3,0.139364,0.164914,0.143651,-3.375837,0.163026,0.15061
4,0.164915,0.139366,0.150607,0.163026,-3.375835,0.143651
5,0.143651,0.150608,0.191661,0.15061,0.143651,-3.401048


Unnamed: 0,0,1,2,3,4,5
0,-0.40926,-0.060626,0.500021,0.287227,0.499965,0.496332
1,-0.40926,0.060588,-0.49998,0.287228,0.500048,-0.496293
2,-0.406216,0.701905,-1.4e-05,-0.578764,1e-06,0.085726
3,-0.40926,0.060596,0.500007,0.287202,-0.499964,-0.496365
4,-0.409261,-0.060578,-0.499991,0.287298,-0.500023,0.496268
5,-0.406218,-0.701884,-4.2e-05,-0.578798,-2.7e-05,-0.085668


### Inverse Distance Matrix ###

In [6]:
# Build the coordinate array from the benzene geometry string (second argument
# 12, matching the twelve atoms listed in benz_atom), derive the matrix returned
# by get_inv_dist_M and diagonalise it. The eigenvector columns of Q_inv_dist
# form the inverse-distance basis.
coord = create_coord_array(benz_atom, 12)
M_inv_dist = get_inv_dist_M(coord)
Q_inv_dist_eig_val, Q_inv_dist = np.linalg.eig(M_inv_dist)

# np.savetxt does not create missing folders, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('CCS_basis', exist_ok=True)
np.savetxt('CCS_basis/Inverse_distance_basis.txt', Q_inv_dist)

print(Q_inv_dist_eig_val)
M_inv_dist_df = pd.DataFrame(M_inv_dist)
Q_inv_dist_df = pd.DataFrame(Q_inv_dist)
display(M_inv_dist_df)
display(Q_inv_dist_df)

[7.27710313 5.38938855 5.86808844 5.86809913 5.49054754 5.4905542 ]


Unnamed: 0,0,1,2,3,4,5
0,5.897297,0.377545,0.217972,0.188775,0.217979,0.377542
1,0.377545,5.897297,0.377542,0.217979,0.188775,0.217972
2,0.217972,0.377542,5.897297,0.377542,0.217972,0.188765
3,0.188775,0.217979,0.377542,5.897297,0.377545,0.217972
4,0.217979,0.188775,0.217972,0.377545,5.897297,0.377542
5,0.377542,0.217972,0.188765,0.217972,0.377542,5.897297


Unnamed: 0,0,1,2,3,4,5
0,0.40825,0.40825,-0.50017,-0.288379,0.499785,0.289045
1,0.40825,-0.40825,-0.49983,0.288967,-0.500215,0.288301
2,0.408245,0.408245,0.00034,0.577353,0.00043,-0.577352
3,0.40825,-0.40825,0.50017,0.288379,0.499785,0.289045
4,0.40825,0.40825,0.49983,-0.288967,-0.500215,0.288301
5,0.408245,-0.408245,-0.00034,-0.577353,0.00043,-0.577352


### Random Projection Matrix ###

In [7]:
# Fixed, hard-coded symmetric matrix with unit diagonal (described by the author
# as a random symmetric positive definite matrix). Keeping the literal values in
# the notebook makes the resulting basis reproducible.

M_rand = np.array([[1., 0.06029536, 0.44295679, 0.27515277, 0.29444547, 0.45542014],
                   [0.06029536, 1., 0.04501127, 0.30063993, 0.44013144, 0.31636733],
                   [0.44295679, 0.04501127, 1., 0.26095881, 0.24953525, 0.32341057],
                   [0.27515277, 0.30063993, 0.26095881, 1., 0.89543119, 0.67206881],
                   [0.29444547, 0.44013144, 0.24953525, 0.89543119, 1., 0.90339149],
                   [0.45542014, 0.31636733, 0.32341057, 0.67206881, 0.90339149, 1.]])
Q_rand_eig_val, Q_rand = np.linalg.eig(M_rand)

# np.savetxt does not create missing folders, so make sure the output folder
# exists before writing (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('CCS_basis', exist_ok=True)
np.savetxt('CCS_basis/Random_matrix_basis.txt', Q_rand)

Q_rand_df = pd.DataFrame(Q_rand)
display(Q_rand_df)

Unnamed: 0,0,1,2,3,4,5
0,0.299522,0.578496,-0.083805,-0.257781,0.687691,-0.170984
1,0.261689,-0.456619,0.085953,-0.061749,0.048034,-0.842324
2,0.260628,0.612509,-0.024358,0.066004,-0.680893,-0.29722
3,0.481236,-0.170599,0.396131,-0.666556,-0.187974,0.320556
4,0.532336,-0.228489,-0.783564,0.088042,-0.052447,0.199843
5,0.508831,-0.019035,0.462713,0.68799,0.151898,0.173844


## Delta Learning ##

In [10]:
# Scratch check that subtracting two NumPy arrays works element-wise.
# NOTE(review): arr1 / arr2 are not used by any other cell visible here, so this
# cell looks like leftover scratch work and is a candidate for removal.
arr1 = np.array([3, 4, 5])
arr2 = np.array([1, 2, 3])
print(np.subtract(arr1, arr2))

[2 2 2]


In [14]:
# Nuclear charges of the six ring carbons in the benzene reference. This is
# loop-invariant, so it is built once instead of once per row.
ref_charge = np.array([6, 6, 6, 6, 6, 6])

# Second-order term 0.5 * dx^T H dx for every molecule, where dx is the change
# in nuclear charge on the first six atoms relative to the reference.
alchemy_predictions = []
for charges in benzene_energy_data['charges']:
    dx = np.array(charges[:6]) - ref_charge
    alchemy_predictions.append(0.5 * (dx @ H @ dx))

# Assign the whole column in one step so it is float64. The previous version
# initialised the column with None and filled it row by row through
# iterrows()/.at, which left it with object dtype.
benzene_energy_data['delta delta total energy'] = (
    benzene_energy_data['delta total energy']
    + np.array(alchemy_predictions, dtype=float)
)

display(benzene_energy_data)


Unnamed: 0,charges,elements,total energy,electronic energy,delta total energy,delta electronic energy,delta delta total energy
0,"[7, 5, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, B, C, C, C, C, H, H, H, H, H, H]",-232.488488,-336.906006,2.576317,-0.077315,-0.962546
1,"[7, 6, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, B, C, C, C, H, H, H, H, H, H]",-232.427609,-336.995987,2.515439,0.012665,-1.023617
2,"[7, 6, 6, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, C, C, B, C, C, H, H, H, H, H, H]",-232.433092,-337.004116,2.520922,0.020794,-0.99428
3,"[7, 7, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, B, C, C, H, H, H, H, H, H]",-235.671427,-340.400298,5.759256,3.416976,-1.296888
4,"[7, 7, 5, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, B, C, B, C, H, H, H, H, H, H]",-235.708812,-340.318992,5.796641,3.33567,-1.252549
5,"[7, 7, 5, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, N, B, C, C, B, H, H, H, H, H, H]",-235.785018,-340.193778,5.872848,3.210456,-1.137871
6,"[7, 7, 6, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, N, C, B, B, C, H, H, H, H, H, H]",-235.595264,-340.510269,5.683093,3.526948,-1.351087
7,"[7, 5, 7, 6, 6, 5, 1, 1, 1, 1, 1, 1]","[N, B, N, C, C, B, H, H, H, H, H, H]",-235.875126,-340.067245,5.962955,3.083924,-1.154701
8,"[7, 5, 7, 6, 5, 6, 1, 1, 1, 1, 1, 1]","[N, B, N, C, B, C, H, H, H, H, H, H]",-235.813393,-340.126176,5.901222,3.142854,-1.19528
9,"[7, 6, 7, 5, 5, 6, 1, 1, 1, 1, 1, 1]","[N, C, N, B, B, C, H, H, H, H, H, H]",-235.694052,-340.325617,5.781881,3.342296,-1.2673


## Data Transformation ##

In [15]:
def compute_lambda_c_square(c_arr, eig_val_arr):
    """
    Weight every squared coefficient by its matching eigenvalue.

    Entry i of the result is eig_val_arr[i] * c_arr[i]**2. The two inputs are
    walked in lockstep, so the result is as long as the shorter of the two.

    Args:
        c_arr (list): expansion coefficients, one per basis vector
        eig_val_arr (list): eigenvalues, in the same order as c_arr
    Returns:
        list: the eigenvalue-weighted squared coefficients
    """
    weighted_squares = []
    for eigenvalue, coefficient in zip(eig_val_arr, c_arr):
        weighted_squares.append(eigenvalue * coefficient**2)
    return weighted_squares

In [16]:
# Express every molecule in the ANM basis Q. The helper returns the augmented
# table together with the names of the dx-type and coefficient-type columns,
# which are printed below.
benzene_data_ANM_basis, dx_col, c_col = generate_coef_with_specific_basis(
    benzene_energy_data, Q, coord, ref_charge=6
)
print(dx_col)
print(c_col)

# For each coefficient variant, store eigenvalue * coefficient**2 in a new column.
for coef_column, square_eig_column in (
    ('c', 'c_square_eig'),
    ('lexi_c', 'lexi_c_square_eig'),
    ('c_inv', 'c_inv_square_eig'),
    ('lexi_c_inv', 'lexi_c_inv_square_eig'),
):
    benzene_data_ANM_basis[square_eig_column] = benzene_data_ANM_basis[coef_column].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_eig_val)
    )

print(benzene_data_ANM_basis.columns)


['dx', 'sorted_dx', 'lexi_dx', 'num_dope']
['c', 'c_inv', 'sorted_c', 'lexi_c', 'lexi_c_inv', 'coulomb_sort_c']
Index(['charges', 'elements', 'total energy', 'electronic energy',
       'delta total energy', 'delta electronic energy',
       'delta delta total energy', 'dx', 'sorted_dx', 'lexi_dx', 'c', 'c_inv',
       'sorted_c', 'lexi_c', 'lexi_c_inv', 'coulomb_sort_c', 'num_dope',
       'c_square_eig', 'lexi_c_square_eig', 'c_inv_square_eig',
       'lexi_c_inv_square_eig'],
      dtype='object')


In [44]:
# Same transformation as for the ANM basis, repeated for the inverse-distance
# and the random-matrix bases: expand in the basis, then store
# eigenvalue * coefficient**2 for each coefficient variant.
square_eig_columns = (
    ('c', 'c_square_eig'),
    ('lexi_c', 'lexi_c_square_eig'),
    ('c_inv', 'c_inv_square_eig'),
    ('lexi_c_inv', 'lexi_c_inv_square_eig'),
)

benzene_data_inv_dist_basis, dx_col, c_col = generate_coef_with_specific_basis(
    benzene_energy_data, Q_inv_dist, coord, ref_charge=6
)
for coef_column, square_eig_column in square_eig_columns:
    benzene_data_inv_dist_basis[square_eig_column] = benzene_data_inv_dist_basis[coef_column].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_inv_dist_eig_val)
    )

benzene_data_rand_basis, dx_col, c_col = generate_coef_with_specific_basis(
    benzene_energy_data, Q_rand, coord, ref_charge=6
)
for coef_column, square_eig_column in square_eig_columns:
    benzene_data_rand_basis[square_eig_column] = benzene_data_rand_basis[coef_column].apply(
        lambda c_arr: compute_lambda_c_square(c_arr, Q_rand_eig_val)
    )

## Delta learning Data ##

In [38]:
# Second-order alchemy term: half the sum of the eigenvalue-weighted squared
# coefficients taken from the 'lexi_c_inv_square_eig' column.
benzene_data_ANM_basis['second order alchemy'] = benzene_data_ANM_basis['lexi_c_inv_square_eig'].apply(
    lambda weighted_terms: 0.5 * sum(weighted_terms)
)

# Add that term to both delta energies. 'delta delta total energy' already
# exists in this table (carried over from benzene_energy_data) and is
# overwritten here.
for delta_column, delta_delta_column in (
    ('delta total energy', 'delta delta total energy'),
    ('delta electronic energy', 'delta delta electronic energy'),
):
    benzene_data_ANM_basis[delta_delta_column] = (
        benzene_data_ANM_basis[delta_column] + benzene_data_ANM_basis['second order alchemy']
    )

summary_columns = [
    'second order alchemy',
    'delta total energy',
    'delta electronic energy',
    'delta delta total energy',
    'delta delta electronic energy',
]
display(benzene_data_ANM_basis[summary_columns])

Unnamed: 0,second order alchemy,delta total energy,delta electronic energy,delta delta total energy,delta delta electronic energy
0,-3.532093,2.576317,-0.077315,-0.955776,-3.609409
1,-3.540752,2.515439,0.012665,-1.025313,-3.528087
2,-3.515202,2.520922,0.020794,-0.99428,-3.494408
3,-7.056141,5.759256,3.416976,-1.296885,-3.639165
4,-7.133459,5.796641,3.33567,-1.336818,-3.797789
5,-7.103467,5.872848,3.210456,-1.230619,-3.89301
6,-7.034181,5.683093,3.526948,-1.351087,-3.507233
7,-7.038445,5.962955,3.083924,-1.07549,-3.954521
8,-7.096513,5.901222,3.142854,-1.195291,-3.953659
9,-7.04918,5.781881,3.342296,-1.267298,-3.706884


In [39]:
# Mean of the 'delta delta total energy' column over all molecules in the table.
mean_delta_delta = benzene_data_ANM_basis['delta delta total energy'].mean()
print(mean_delta_delta)

-1.20668734505365


In [40]:
# Write the delta-delta total-energy target to its own CSV (row index omitted).
y_delta_delta = benzene_data_ANM_basis['delta delta total energy']
y_delta_delta.to_csv(
    '../data/benzene_training_data/[Benz] y_delta_delta_ANM.csv',
    index=False,
)

## Export Training Data ##

In [50]:
# Build the input datasets for each of the three bases.
ANM_basis_datasets = generate_input_training_data(benzene_data_ANM_basis)
inv_dist_basis_datasets = generate_input_training_data(benzene_data_inv_dist_basis)
rand_basis_datasets = generate_input_training_data(benzene_data_rand_basis)

# File-name prefixes, listed in the same order as the dataset groups below.
prefixes = ['ANM', 'inv_dist', 'rand']
complete_dataset = [ANM_basis_datasets, inv_dist_basis_datasets, rand_basis_datasets]

# Names handed to the export helper together with each group of datasets.
dataset_names = [
    'X',
    'X_inv',
    'X_sorted',
    'X_lexi',
    'X_lexi_inv',
    'X_nd',
    'X_lexi_nd',
    'X_coulomb',
    'X_square_eig',
    'X_inv_square_eig',
    'X_lexi_square_eig',
    'X_lexi_inv_square_eig',
]

dest_folder = "../data/benzene_training_data"
for prefix, datasets in zip(prefixes, complete_dataset):
    export_to_csv_custom(datasets, dataset_names, prefix, dest_folder)

In [47]:
# Save the four target series to CSV, one file each, without the row index.

y_energy, y_elec, y_delta_energy, y_delta_elec = generate_target_training_data(benzene_energy_data)

target_exports = [
    (y_energy, '../data/benzene_training_data/[Benz] y_energy.csv'),
    (y_elec, '../data/benzene_training_data/[Benz] y_elec.csv'),
    (y_delta_energy, '../data/benzene_training_data/[Benz] y_delta_energy.csv'),
    (y_delta_elec, '../data/benzene_training_data/[Benz] y_delta_elec.csv'),
]
for target, csv_path in target_exports:
    target.to_csv(csv_path, index=False)