In [1]:
import sys
sys.path.append('../..')
sys.path.append('../../APDFT')
sys.path.append('../../helper_code')
sys.path.append('../data')

import pickle
from pyscf import gto, scf, dft, cc
from ase import Atoms
from ase.io import write
from ase.visualize import view
import numpy as np
import pandas as pd
import pyscf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import basis_set_exchange as bse
from APDFT.FcMole import *
import os
import ast
from copy import deepcopy
from IPython.display import display
from helper_code.data_processing import *
from helper_code.util import charge_arr_to_str

%load_ext autoreload
%autoreload 2
from APDFT.AP_class import APDFT_perturbator as AP

## Set up ##

In [2]:
# Load the mutant data set: per-molecule nuclear charges and total energies.
# np.load on an .npz returns an NpzFile that keeps the archive open, so the
# arrays are pulled out inside a context manager and the file is closed after.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on trusted files, and drop it if the archive holds plain arrays.
with np.load("../data/coronene_raw_data/coronene_mutants_pbe_pcx2_corrected3.npz",
             allow_pickle=True) as coronene_energy_raw_data:
    charges, total_energy = coronene_energy_raw_data['charges'], coronene_energy_raw_data['energies']
num_mol = charges.shape[0]

xyz_file = '../data/coronene_raw_data/coronene_opt_pbe0_pcX2.xyz'
with open(xyz_file, 'r') as f:
    lines = f.readlines()[2:]  # Skip the first two lines (atom count and comment)
# Ignore blank lines (e.g. a trailing newline at the end of the file), which
# would otherwise become empty token lists and fail on atom[0] below.
atoms = [line.split() for line in lines if line.strip()]

# Extract atomic symbols and coordinates
symbols = [atom[0] for atom in atoms]
all_atoms_coord = [[float(atom[1]), float(atom[2]), float(atom[3])] for atom in atoms]
# Repeat the single geometry once per molecule: shape (num_mol, n_atoms, 3).
coordinates = np.tile(np.array(all_atoms_coord), (num_mol, 1, 1))

print(charges.shape)
print(total_energy.shape)

(2043, 36)
(2043,)


In [6]:
# Build the matrix returned by get_inv_dist_M for the first geometry
# (presumably the inverse-distance matrix of the first 24 atoms -- confirm
# against helper_code) and diagonalise it.
IDM = get_inv_dist_M(coordinates[0], 24)

# NOTE(review): np.linalg.eig returns eigenvalues in no particular order and
# does not guarantee mutually orthogonal eigenvectors. If IDM is symmetric,
# np.linalg.eigh would give an orthonormal Q, but it would also reorder the
# eigenpairs and therefore the exported coord columns -- confirm before changing.
eig_decomposition = np.linalg.eig(IDM)
IDM_eigval, Q = eig_decomposition

print(Q.shape, IDM_eigval, sep="\n")

(24, 24)
[ 8.04766557  2.71957365  2.71961208  1.3213949   1.46824254  1.46823411
  0.94975959  0.85080608  0.76852506  0.76850603  0.59986358  0.5998329
 -0.04001478  0.01639216  0.01638804  0.1217896   0.29444385  0.29442023
  0.09569998  0.09570227  0.19075144  0.19311639  0.21965206  0.21964266]


## Process Data ##

In [12]:
# Index permutations on the 24 sites 0..23 (presumably the carbon atoms of
# coronene, in the order used by the xyz file -- confirm).

# The rotation is given as its four disjoint 6-cycles; every index maps to the
# next one in its cycle and the last wraps around to the first.
rotation_mapping = {
    src: dst
    for cycle in (
        (0, 21, 22, 15, 12, 7),
        (1, 20, 23, 14, 13, 6),
        (2, 19, 16, 11, 8, 5),
        (3, 18, 17, 10, 9, 4),
    )
    for src, dst in zip(cycle, cycle[1:] + cycle[:1])
}

# First reflection: an involution given as swapped pairs; (k, k) marks an
# index that maps to itself (5, 4, 17 and 16).
reflection_mapping1 = {
    src: dst
    for a, b in (
        (0, 6), (7, 1), (5, 5),
        (8, 2), (4, 4), (13, 21),
        (9, 3), (12, 20), (10, 18),
        (11, 19), (17, 17), (14, 22), (16, 16),
        (15, 23),
    )
    for src, dst in ((a, b), (b, a))
}

# Second reflection: an involution with no fixed index, again as swapped pairs.
reflection_mapping2 = {
    src: dst
    for a, b in (
        (0, 1), (6, 21), (5, 2),
        (7, 20), (4, 3), (8, 19),
        (9, 18), (13, 22), (10, 17),
        (12, 23), (11, 16), (14, 15),
    )
    for src, dst in ((a, b), (b, a))
}

In [10]:
# One row per molecule; 'charges' holds the full per-atom charge list.
coronene_energy_data = pd.DataFrame()
coronene_energy_data['charges'] = charges.tolist()

# dx = (first 24 charges) - 6, stored as one integer array per row.
# Presumably these are the 24 carbon sites (Z = 6) ahead of the hydrogens -- confirm.
# The reference vector is built once instead of once per row, and the column is
# filled with a single apply rather than an iterrows/.at loop over a None placeholder.
ref_charge = np.full(24, 6)
coronene_energy_data['dx'] = coronene_energy_data['charges'].apply(
    lambda mol_charges: np.array(mol_charges[:24]) - ref_charge)

display(coronene_energy_data.head(3))

Unnamed: 0,charges,dx
0,"[7, 6, 5, 6, 7, 7, 6, 7, 7, 5, 6, 7, 6, 6, 6, ...","[1, 0, -1, 0, 1, 1, 0, 1, 1, -1, 0, 1, 0, 0, 0..."
1,"[5, 5, 7, 5, 5, 7, 5, 5, 7, 7, 7, 7, 5, 7, 5, ...","[-1, -1, 1, -1, -1, 1, -1, -1, 1, 1, 1, 1, -1,..."
2,"[5, 6, 6, 7, 7, 6, 7, 6, 5, 6, 5, 5, 7, 7, 5, ...","[-1, 0, 0, 1, 1, 0, 1, 0, -1, 0, -1, -1, 1, 1,..."


In [13]:
def _to_eigenbasis(vec):
    """Return the row-vector product vec @ Q as a plain list of 24 numbers.

    vec @ Q equals Q.T @ vec in column-vector form; the original comment reads
    this as c = Q_inv @ dx.
    NOTE(review): that only holds if Q is orthogonal -- confirm.
    """
    return (vec.reshape(1, 24) @ Q).reshape(24,).tolist()


# Representative of each dx as returned by coronene_lexi for the rotation and
# the two reflections (presumably a lexicographic canonical form -- confirm
# against helper_code).
coronene_energy_data['dx_lexi'] = coronene_energy_data['dx'].apply(
    lambda dx_vec: coronene_lexi(dx_vec.tolist(), rotation_mapping, reflection_mapping1, reflection_mapping2))

# Eigenbasis coefficients for the raw and the '_lexi' charge differences.
for suffix in ('', '_lexi'):
    coronene_energy_data['c' + suffix] = coronene_energy_data['dx' + suffix].apply(_to_eigenbasis)

# Eigenvalue-dependent representations computed from c / c_lexi and IDM_eigval.
# Columns are created in the order CE, CE_lexi, CSE, CSE_lexi.
for label, transform in (('CE', compute_lambda_c), ('CSE', compute_lambda_c_square)):
    for suffix in ('', '_lexi'):
        coronene_energy_data[label + suffix] = coronene_energy_data['c' + suffix].apply(
            lambda coeffs: transform(coeffs, IDM_eigval))

## Export ##

In [14]:
# Representations to export, and the 24 per-component column names.
datasets = ['c', 'c_lexi', 'CE', 'CE_lexi', 'CSE', 'CSE_lexi']
columns = [f"coord{i}" for i in range(24)]

# Expand each vector-valued column into a 24-column frame (one row per molecule),
# taking component k of every row's vector for column coord{k}.
dataset_dict = {}
for data in datasets:
    vectors = coronene_energy_data[data]
    dataset_dict[data] = pd.DataFrame({
        col_name: vectors.apply(lambda arr, k=k: arr[k])
        for k, col_name in enumerate(columns)
    })

In [15]:
# Write one CSV per representation, named '[IDM]<key>.csv', without the index.
# The target directory is created first so the export also works on a fresh
# checkout where it does not exist yet.
output_dir = '../data/coronene_training_data'
os.makedirs(output_dir, exist_ok=True)
for key, dataframe in dataset_dict.items():
    dataframe.to_csv(f'{output_dir}/[IDM]{key}.csv', index=False)