In [5]:
import sys
sys.path.append('../..')
sys.path.append('../../APDFT')
sys.path.append('../../helper_code')
sys.path.append('../data')

import pickle
from pyscf import gto, scf, dft, cc
from ase import Atoms
import numpy as np
import pandas as pd
import pyscf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import basis_set_exchange as bse
from APDFT.FcMole import *
import os
import ast
from copy import deepcopy
from IPython.display import display
from helper_code.data_processing import *
import qml
from helper_code.util import charge_arr_to_str
from ase import Atoms
from ase.io import write

%load_ext autoreload
%autoreload 2
from APDFT.AP_class import APDFT_perturbator as AP

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the raw coronene-mutant results and assemble them into a DataFrame with
# one row per mutant: its nuclear-charge vector, its total energy, and the
# energy difference to the coronene reference.
#
# SECURITY NOTE: allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code. Only use it on archives from a trusted source.
# The archive is opened as a context manager so the underlying file handle is
# closed as soon as the two arrays have been read into memory.
with np.load("../data/coronene_raw_data/coronene_mutants_pbe0_pcx2.npz", allow_pickle=True) as coronene_energy_raw_data:
    charges = coronene_energy_raw_data['charges']
    total_energy = coronene_energy_raw_data['energies']

columns = ['charges', 'total energy']
coronene_energy_data = pd.DataFrame(
    {
        'charges': charges.tolist(),
        'total energy': total_energy.tolist(),
    },
    columns=columns,
)

# Reference energy used as the zero point for 'delta total energy'.
# NOTE(review): presumably the energy of unsubstituted coronene at the same
# level of theory as the mutants - confirm the provenance of this value.
CORONENE_REF_ENERGY = -909.721935153841

# Sign convention: reference minus mutant (vectorized over the whole column).
coronene_energy_data['delta total energy'] = CORONENE_REF_ENERGY - coronene_energy_data['total energy']

display(coronene_energy_data.head(3))

Unnamed: 0,charges,total energy,delta total energy
0,"[7, 5, 5, 5, 5, 7, 5, 7, 5, 5, 7, 7, 5, 7, 6, ...",-946.036986,36.315051
1,"[6, 5, 7, 6, 6, 6, 6, 6, 6, 6, 5, 7, 6, 5, 7, ...",-929.442623,19.720687
2,"[6, 5, 7, 5, 5, 5, 7, 5, 5, 6, 5, 7, 7, 5, 6, ...",-942.508112,32.786177


In [3]:
# Read the coronene geometry from an XYZ file into two parallel lists:
# `symbols` (first token of every atom record) and `all_atoms_coord`
# (the following three tokens converted to floats).
xyz_file = '../data/coronene_raw_data/coronene_opt_pbe0_pcX2.xyz'
with open(xyz_file, 'r') as f:
    lines = f.readlines()[2:]  # Skip the first two lines (atom count and comment)

# Tokenise every atom record. Blank lines (e.g. a trailing newline at the end
# of the file) are dropped here; otherwise they would produce an empty token
# list and raise IndexError in the extraction below.
atoms = [line.split() for line in lines if line.strip()]

# Extract atomic symbols and coordinates
symbols = [atom[0] for atom in atoms]
all_atoms_coord = [[float(atom[1]), float(atom[2]), float(atom[3])] for atom in atoms]

In [13]:
# Compute a Coulomb-matrix (CM) representation for every row of
# `coronene_energy_data` and store it in a new object column 'CM'.
#
# qml.Compound is built from an XYZ file here, so each molecule is round-tripped
# through a scratch file: ASE writes it, qml reads it back, and the file is
# removed again.

# Size passed to qml's generate_coulomb_matrix.
# NOTE(review): presumably the number of atoms in the molecule - confirm it
# matches len(all_atoms_coord).
CM_SIZE = 36
# Scratch file in the current working directory; it is overwritten per row.
TEMP_XYZ = "temp_mol.xyz"

coronene_energy_data['CM'] = None
for index, charge_arr in coronene_energy_data['charges'].items():
    # charge_arr_to_str is a project helper; its result is handed to ASE as
    # the `symbols` argument. NOTE(review): presumably a chemical-formula
    # string whose atom order matches all_atoms_coord - verify in helper_code.util.
    compressed_str = charge_arr_to_str(charge_arr)
    ase_mol = Atoms(symbols=compressed_str, positions=all_atoms_coord)

    try:
        write(TEMP_XYZ, ase_mol, format='xyz')
        qml_mol = qml.Compound(xyz=TEMP_XYZ)
        qml_mol.generate_coulomb_matrix(size=CM_SIZE, sorting="row-norm")
    finally:
        # Always clean up the scratch file, even when one of the steps above
        # raises, so a failed run does not leave a stale temp_mol.xyz behind.
        if os.path.exists(TEMP_XYZ):
            os.remove(TEMP_XYZ)

    # .at stores the whole array in a single cell of the object column.
    coronene_energy_data.at[index, 'CM'] = qml_mol.representation

In [15]:
# Expand the per-row CM vectors stored in coronene_energy_data['CM'] into a
# wide DataFrame with one column per vector component (coord0, coord1, ...).
#
# The table is built in a single step by stacking the row vectors into a 2-D
# array. Inserting the columns one at a time (one Python-level `apply` per
# column) is much slower and leaves pandas with a fragmented frame.
num_column = coronene_energy_data['CM'][0].shape[0]
columns = [f"coord{i}" for i in range(num_column)]

# np.vstack raises if the vectors do not all have the same length.
cm_matrix = np.vstack(coronene_energy_data['CM'].tolist())
# Fail loudly instead of silently truncating if the width differs from row 0.
assert cm_matrix.shape[1] == num_column, "CM vectors do not match the length of the first row"

# Re-use the source index so rows stay aligned with coronene_energy_data.
CM_rep = pd.DataFrame(cm_matrix, index=coronene_energy_data.index, columns=columns)

display(CM_rep.head())

Unnamed: 0,coord0,coord1,coord2,coord3,coord4,coord5,coord6,coord7,coord8,coord9,...,coord656,coord657,coord658,coord659,coord660,coord661,coord662,coord663,coord664,coord665
0,53.358707,34.053598,53.358707,34.368323,19.751342,53.358707,12.913381,19.751462,9.875736,53.358707,...,0.402831,0.119825,0.399055,0.14694,0.103771,0.20754,0.107468,0.207544,0.119824,0.5
1,53.358707,19.751342,53.358707,12.913317,8.552605,53.358707,19.885231,11.390971,9.539765,53.358707,...,0.399079,0.107396,0.119824,0.402834,0.107468,0.146938,0.146572,0.20754,0.103772,0.5
2,53.358707,34.368245,53.358707,19.751462,17.105333,53.358707,19.751342,17.105179,9.875736,53.358707,...,0.207545,0.119824,0.14657,0.402824,0.107395,0.107468,0.207542,0.14694,0.399078,0.5
3,53.358707,34.053598,53.358707,34.054102,19.661149,53.358707,34.368245,19.75113,19.751262,53.358707,...,0.146571,0.107467,0.107395,0.146938,0.399074,0.207539,0.402831,0.103771,0.119824,0.5
4,53.358707,34.053927,53.358707,34.053598,19.660937,53.358707,12.913298,19.751239,11.385929,53.358707,...,0.103771,0.14657,0.207545,0.399056,0.119825,0.107396,0.119824,0.207539,0.402833,0.5


In [16]:
# Persist the CM feature table as CSV (without the index column).
# The output directory is created first, because DataFrame.to_csv does not
# create missing parent directories and would fail on a fresh checkout.
cm_rep_csv_path = "../data/coronene_training_data/CM_rep.csv"
os.makedirs(os.path.dirname(cm_rep_csv_path), exist_ok=True)
CM_rep.to_csv(cm_rep_csv_path, index=False)