In [1]:
import sys
# !{sys.executable} -m pip install --upgrade  xeus-python notebook
#######################################################
#Import packages
import numpy as np
import os
os.environ['MOLCAS']='/home/grierjones/Test/build'
os.environ['MOLCAS_WORKDIR']='/tmp'
import re
from math import sin, cos, pi
from glob import glob
import subprocess
import pickle
from subprocess import call, check_output
import pandas as pd
# import psi4
from joblib import Parallel,effective_n_jobs,delayed
from time import time
import matplotlib.pyplot as plt
from plumbum.cmd import grep, awk

import shutil
import random
import sklearn
from shutil import copy
import csv
import h5py as h5
import seaborn as sns; sns.set(style="ticks", color_codes=True)

from sklearn.model_selection import train_test_split

# To-do 
- Two-electron integral features
- CASSCF Fock matrix
- SCF Fock matrix
- SCF MO features
- CASSCF MO features
- Generality

In [2]:
#######################################################
# Obital labels
## Inactive i,j
## Active t,u,v
## Virtual a,b

## Type 1: IA->AA
## Type 2: II->AA (P)
## Type 3: II->AA (M)
## Type 4: AA->VA
## Type 5: IA->VA/AV
## Type 6: II->AV (P)
## Type 7: II->AV (M)
## Type 8: AA->VV (P)
## Type 9: AA->VV (M)
## Type 10: IA->VV (P)
## Type 11: IA->VV (M)
## Type 12: II->VV (P)
## Type 13: II->VV (M)

## A: IA->AA
## B: II->AA
## C: AA->VA
## D: IA->VA/AV
## E: II->AV
## F: AA->VV
## G: IA->VV 
## H: II->VV
#######################################################


In [3]:
# Delete excessive extra files
def del_useless():
    '''
    Delete the extra files
    '''
    for root, dirs, files in os.walk(os.getcwd()):
        for file in files:
            for j in ['status','GssOrb','LprOrb','LoProp','guessorb','xmldump','RasOrb','SpdOrb']:
                if j in file:
    #                 print(root,dirs,file)
                    os.remove(os.path.join(root,file))

In [4]:
# When restarting a setr of calculations just clear everyting out
def clean_dir():
    for entry in os.scandir(path=os.getcwd()):
        if entry.is_dir():
            if entry.name=='Fock':
                shutil.rmtree(entry.name)
            if entry.name=='hdf5':
                shutil.rmtree(entry.name)
            if entry.name=='e2':
                shutil.rmtree(entry.name)                
            if entry.name=='Labels':
                shutil.rmtree(entry.name)
            if entry.name=='Coords':
                shutil.rmtree(entry.name)
            if 'dir' in entry.name:
                shutil.rmtree(entry.name)
                


In [5]:
def gen_gateway(name,basis_set):
    string=f'''&GATEWAY 
coord={f'{name}.xyz'}
Basis = {basis_set}
Group = nosymm
Expert
End of Input

'''
    return string

def gen_seward():
    string=f'''&SEWARD
End of Input

'''
    return string

def gen_motra(name):
    string=f'''&MOTRA
Frozen=0
LUMORB
>>> COPY $WorkDir/GMJ_one_int_indx.csv $CurrDir/{name}.GMJ_one_int_indx.csv
>>> COPY $WorkDir/GMJ_one_int.csv $CurrDir/{name}.GMJ_one_int.csv
>>> COPY $WorkDir/GMJ_two_int_indx.csv $CurrDir/{name}.GMJ_two_int_indx.csv
>>> COPY $WorkDir/GMJ_two_int.csv $CurrDir/{name}.GMJ_two_int.csv

'''
    return string

def gen_scf(name):
    string=f"""&SCF &END
>>> COPY $WorkDir/{name}.scf.h5 $CurrDir/

"""
    return string    


def gen_rasscf(name,electrons,occupied,inactive,symmetry=1,spin=1,previous=None):
    start_string="""&RASSCF &END
Title= RASSCF
"""
    if previous!=None:
        fileorb=f"""FileOrb
{previous}
"""
    else:
        fileorb=''

    end_string=f"""NACTEL
{electrons} 0 0
Inactive
{inactive}
RAS2
{occupied}
Symmetry
{symmetry}
Spin
{spin}
orblisting
all
ITERation
200 100
CIMX
200
SDAV
500

>>> COPY $WorkDir/{name}.rasscf.h5 $CurrDir/
>>> COPY $WorkDir/GMJ_Fock_MO.csv $CurrDir/{name}.GMJ_Fock_MO.csv
>>> COPY $WorkDir/GMJ_PT2_Fock_MO.csv $CurrDir/{name}.GMJ_PT2_Fock_MO.csv

"""
    return start_string+fileorb+end_string 

def gen_caspt2():
    string="""&CASPT2 &END
Frozen 
0

>>foreach i in (B,E,F,G,H)
>>foreach j in (P,M)
>>if ( -FILE GMJ_e2_${i}_${j}.csv )
>>> COPY $WorkDir/GMJ_RHS_${i}_${j}.csv $CurrDir/GMJ_RHS_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_IVECW_${i}_${j}.csv $CurrDir/GMJ_IVECW_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_IVECX_${i}_${j}.csv $CurrDir/GMJ_IVECX_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_IVECC2_${i}_${j}.csv $CurrDir/GMJ_IVECC2_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_e2_${i}_${j}.csv $CurrDir/GMJ_e2_${i}_${j}.csv
>>endif
>>enddo
>>enddo

>>foreach i in (A,C,D)
>>if ( -FILE GMJ_e2_$i.csv )
>>> COPY $WorkDir/GMJ_RHS_$i.csv $CurrDir/GMJ_RHS_$i.csv
>>> COPY $WorkDir/GMJ_IVECW_$i.csv $CurrDir/GMJ_IVECW_$i.csv
>>> COPY $WorkDir/GMJ_IVECX_$i.csv $CurrDir/GMJ_IVECX_$i.csv
>>> COPY $WorkDir/GMJ_IVECC2_$i.csv $CurrDir/GMJ_IVECC2_$i.csv
>>> COPY $WorkDir/GMJ_e2_$i.csv $CurrDir/GMJ_e2_$i.csv
>>endif
>>enddo

"""
    return string    





In [6]:
basis_set='ANO-RCC-VDZP'
top=os.getcwd()

In [7]:
# with open(os.path.join(f'O3_106.00.xyz'),'w') as f:
#     f.write(f'2\n\n')
#     for j in range(2):
#         f.write(f'H {0:>8f} {0:>8f} {j*0.94:>8f}\n')

In [8]:
def write_input(path,basis_set,name,electrons,occupied,inactive,previous=None):
       # Write input
        with open(os.path.join(path,f'{name}.input'),'wb') as g:
            g.write(gen_gateway(name,basis_set).encode())
            g.write(gen_seward().encode())
            g.write(gen_scf(name).encode())   
            g.write(gen_rasscf(name,electrons,occupied,inactive,previous=previous).encode())
            g.write(gen_motra(name).encode())
            g.write(gen_caspt2().encode())    

In [9]:
# write_input('./',basis_set,'O3_106.00',2,2,0,previous=None)
write_input('./',basis_set,'O3_106.00',4,3,10,previous=None)

In [10]:
call(['pymolcas','-new','-clean',f'O3_106.00.input', '-oe', f'O3_106.00.output'])

0

In [11]:
# Grab energies
path_check = 'O3_106.00.output'

E2 = float((grep['-i', 'E2 (Variational):',path_check] | awk['{print $NF }'])())
CASSCF_E = float((grep['-i', '::    RASSCF root number  1',path_check] | awk['{print $8 }'])())
CASPT2_E = float((grep['-i', '::    CASPT2',path_check] | awk['{print $NF }'])())

#Grab basis information
fro=int(subprocess.Popen(f"grep -i 'Frozen orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of inactive orbitals
inact=int(subprocess.Popen(f"grep -i 'Inactive orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of active orbitals
act=int(subprocess.Popen(f"grep -i 'Active orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of seconary orbitals
virt=int(subprocess.Popen(f"grep -i 'Secondary orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of basis functions for sanity check
bas_check=int(subprocess.Popen(f"grep -i 'Number of basis functions' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])

Basis_Indices=[]
for i in range(fro):
    Basis_Indices.append(f'F{i+1}')
for i in range(inact):
    Basis_Indices.append(f'I{i+1}')
for i in range(act):
    Basis_Indices.append(f'A{i+1}')
for i in range(virt):
    Basis_Indices.append(f'S{i+1}')   


basis_dict = {v:k for k,v in dict(enumerate(Basis_Indices)).items()}

In [12]:
# Load the PT2 Fock elements
# Columns are as follows:
# IT,IU,F(global index),FI(global index),fa(global index),d(global index)
pt2fock = 'O3_106.00.GMJ_PT2_Fock_MO.csv'
pt2fock_values = np.nan_to_num(np.fromfile(pt2fock,dtype=float).reshape(-1,6)[:,3:])
pt2fock_idx = np.fromfile(pt2fock,dtype=int).reshape(-1,6)[:,0:3]-1
pt2fock_stacked = np.hstack([pt2fock_idx,pt2fock_values])


In [13]:
# Read CASSCF Fock from file
CASSCF_fock = np.fromfile('O3_106.00.GMJ_Fock_MO.csv')

In [14]:
# Load one-electron integrals
oneelint = np.fromfile('O3_106.00.GMJ_one_int.csv').reshape(-1,1)
oneelint_idx = np.fromfile('O3_106.00.GMJ_one_int_indx.csv',dtype=int).reshape(-1,4)[:,0:2]-1
h_stacked = np.hstack([oneelint_idx,oneelint])

# Load two-electron integrals
twoelint = np.fromfile('O3_106.00.GMJ_two_int.csv').reshape(-1,1)
twoelint_idx = np.fromfile('O3_106.00.GMJ_two_int_indx.csv',dtype=int).reshape(-1,4)-1
twostacked = np.hstack([twoelint_idx,twoelint])

In [15]:
# Grab rasscf and scf hdf5 data
rasscf_h5 = h5.File('O3_106.00.rasscf.h5', 'r')
scf_h5 = h5.File('O3_106.00.scf.h5', 'r')

datasetNames = [n for n in rasscf_h5.keys()]
NBAS = rasscf_h5.attrs['NBAS']
NACTEL = rasscf_h5.attrs['NACTEL']

#Keys: MO_VECTORS, MO_ENERGIES, MO_OCCUPATIONS
casMO_dict = {k:np.array(rasscf_h5[k]) for k in datasetNames if "MO" in k}
scfMO_dict = {k:np.array(scf_h5[k]) for k in datasetNames if "MO" in k}

In [16]:
# Get two-electron indices
two_el_ex_labels = {i.split('.')[0].replace("GMJ_RHS_",""):[re.sub(r'(?<!\d)0+(\d+)', r'\1', j) for j in pd.read_csv(i,header=None)[0].values] for i in glob("GMJ_RHS_*.csv")}

pair_labels = {i.split('.')[0].replace("GMJ_RHS_",""):['_'.join(re.sub(r'(?<!\d)0+(\d+)', r'\1', j).split('_')[0:2]) for j in pd.read_csv(i,header=None)[0].values] for i in glob("GMJ_RHS_*.csv")}

In [17]:
np.genfromtxt('GMJ_e2_C.csv',skip_header=True).flatten().sum()+np.genfromtxt('GMJ_e2_F_P.csv',skip_header=True).flatten().sum()

np.float64(-0.023455826504621642)

In [18]:
def strip(lst):   
    return '_'.join(re.sub(r'(?<!\d)0+(\d+)', r'\1', i) for i in lst.split('_'))


Dimension check for DDCASPT2: check the ordering of the pair-energies,
this notation follows a mix of the papers and code.

A (IA->AA): \ TIUV \ E$_{ti}$ E$_{uv}$ \ pqrs=tiuv=0123 \    
B_P (II->AA) (P): \ IJTU \ E$_{ti}$ E$_{uj}$ \ pqrs=tiuj=2031 \
B_M (II->AA) (M): \ IJTU \ E$_{ti}$ E$_{uj}$ \ pqrs=tiuj=2031 \
C (AA->VA): \ UVAT \ E$_{at}$ E$_{uv}$ \ pqrs=atuv=2301 \
D (IA->VA/AV): \ IUAT/IUTA \ E$_{ai}$ E$_{tu}$/E$_{ti}$ E$_{au}$ \ pqrs=(a/t)i(t/a)u=2031 \
E_P (II->AV) (P): \ IJAT \ E$_{ti}$ E$_{aj}$ \ pqrs=tiaj=3021 \
E_M (II->AV) (M): \ IJAT \ E$_{ti}$ E$_{aj}$ \ pqrs=tiaj=3021 \
F_P (AA->VV) (P): \ TUAB \ E$_{at}$ E$_{bu}$ \ pqrs=atbu=2031 \
F_M (AA->VV) (M): \ TUAB \ E$_{at}$ E$_{bu}$ \ pqrs=atbu=2031 \
G_P (IA->VV) (P): \ ITAB \ E$_{ai}$ E$_{bt}$ \ pqrs=aibt=2031 \
G_M (IA->VV) (M): \ ITAB \ E$_{ai}$ E$_{bt}$ \ pqrs=aibt=2031 \
H_P (II->VV) (P): \ IJAB \ E$_{ai}$ E$_{bj}$ \ pqrs=aibj=2031 \
H_M (II->VV) (M): \ IJAB \ E$_{ai}$ E$_{bj}$ \ pqrs=aibj=2031 \


In [19]:
# CASPT2 E_pq E_rs ordering
index_dict = {"A":{"p":0,"q":1,"r":2,"s":3},
"B_P":{"p":2,"q":0,"r":3,"s":1},
"B_M":{"p":2,"q":0,"r":3,"s":1},
"C":{"p":2,"q":3,"r":0,"s":1},
"D":{"p":2,"q":0,"r":3,"s":1},
"E_P":{"p":3,"q":0,"r":2,"s":1},
"E_M":{"p":3,"q":0,"r":2,"s":1},
"F_P":{"p":2,"q":0,"r":3,"s":1},
"F_M":{"p":2,"q":0,"r":3,"s":1},
"G_P":{"p":2,"q":0,"r":3,"s":1},
"G_M":{"p":2,"q":0,"r":3,"s":1},
"H_P":{"p":2,"q":0,"r":3,"s":1},
"H_M":{"p":2,"q":0,"r":3,"s":1}}

In [33]:
typedict = {v:k for k,v in dict(enumerate(["A", "B_P", "B_M", "C", "D", "E_P", "E_M", "F_P", "F_M", "G_P", "G_M", "H_P", "H_M"])).items()}

In [34]:
typedict

{'A': 0,
 'B_P': 1,
 'B_M': 2,
 'C': 3,
 'D': 4,
 'E_P': 5,
 'E_M': 6,
 'F_P': 7,
 'F_M': 8,
 'G_P': 9,
 'G_M': 10,
 'H_P': 11,
 'H_M': 12}

In [20]:
# IVECW and IRHS should have same indices
# Same as IVECC2, it should all be element wise
pairs = []
for i in glob('GMJ_e2_*.csv'):
    typ = i.split('.')[0].replace('GMJ_e2_','')
    # print(typ)
    IVEC = pd.read_csv(f'GMJ_IVECW_{typ}.csv',sep='\s+',header=None,skiprows=[0])
    RHS = pd.read_csv(f'GMJ_RHS_{typ}.csv',sep=',',header=None,index_col=0)
    RHS.index = list(map(strip,RHS.index))
    RHS = np.array(RHS.index).reshape(IVEC.shape)
    
    e2 = np.genfromtxt(f'GMJ_e2_{typ}.csv',skip_header=True)
    IVECX = pd.read_csv(f'GMJ_IVECX_{typ}.csv',sep='\s+',header=None,skiprows=[0])
    IVECC2 = pd.read_csv(f'GMJ_IVECC2_{typ}.csv',sep='\s+',header=None,skiprows=[0])    
    for idxi,i in enumerate(RHS):
        for idxj,j in enumerate(i):
            # Split the index and enforce a standardization of p,q,r,s 
            split_index = j.split('_')
            type_idx = index_dict[typ]
            p,q,r,s = split_index[type_idx['p']],split_index[type_idx['q']],split_index[type_idx['r']],split_index[type_idx['s']]
            # typ, pq,rs,qs,e2
            pairs.append((typ,'_'.join((p,q)),'_'.join((r,s)),'_'.join((q,s)),e2[idxi,idxj]))

pairs = np.array(pairs)

In [21]:
# qs pairs!
uniquepairs = np.unique(pairs[:,3])

In [24]:
def caspt2_fock_indexing(u,v):
    '''
    Fast way to generate feature labels for the CASPT2 style Fock featues
    '''
    return ["F$_{"+f"{u}{v}"+"}$","FI$_{"+f"{u}{v}"+"}$","FA$_{"+f"{u}{v}"+"}$","D$_{"+f"{u}{v}"+"}$"]

In [72]:
checkE2=0
pairenergy_df = pd.DataFrame(range(len(uniquepairs)),index=uniquepairs,columns=['Pair_Energies'])

h_features = []
CASPT2Fockfeatures = []
b4_type = []
binary_feat = []
for i in uniquepairs:
    q,s = i.split('_')
    qidx = basis_dict[q]
    sidx = basis_dict[s]

    # From same orbital = 1, else 0
    if q==s:
        binary_feat.append((i,1))
    else:
        binary_feat.append((i,0))

    # CASPT2 style Fock features
    if qidx>=sidx:
        CASPT2Fockfeatures.append((i,dict(zip(caspt2_fock_indexing('q','s'),pt2fock_stacked[:,2:][(pt2fock_stacked[:,0]==qidx)&(pt2fock_stacked[:,1]==sidx)].flatten()))))
    else:
        CASPT2Fockfeatures.append((i,dict(zip(caspt2_fock_indexing('q','s'),pt2fock_stacked[:,2:][(pt2fock_stacked[:,0]==sidx)&(pt2fock_stacked[:,1]==qidx)].flatten()))))

    # Get the pair-energies that share the same qs
    subpairs = pairs[pairs[:,3]==i]

    # Grab the largest 4 two-electron contributers
    best4 = subpairs[np.argsort(abs(subpairs[:,-1].astype(float)))][-4:]

    # Loop over best four 
    for b4idx, (typ, pq, rs) in enumerate(best4[:,0:3]):
        b4_type.append((i,f"typ_{b4idx}",typedict[typ]))
        p,q = pq.split('_')
        r,s = rs.split('_')
        pidx = basis_dict[p]
        qidx = basis_dict[q]
        ridx = basis_dict[r]
        sidx = basis_dict[s]
    
        # Set of label index pairs
        pqrsindex_dict = {"p":[p,pidx],"q":[q,qidx],"r":[r,ridx],"s":[s,sidx]}

        # All possible two-index pairs
        twoidxpairs = [['p','q'],['r','s'],['p','r'],['q','s']]
        # h_{ij} features
        for u,v in twoidxpairs:
            u_item, u_idx = pqrsindex_dict[u]
            v_item, v_idx = pqrsindex_dict[v]
            if u_idx>=v_idx:
                h_features.append((i,"h$_{"+f"{u}{v}"+"}^{"+f"{b4idx}"+"}$",h_stacked[(oneelint_idx[:,0]==u_idx)&(oneelint_idx[:,1]==v_idx)].flatten()[-1]))     
            else:
                h_features.append((i,"h$_{"+f"{u}{v}"+"}^{"+f"{b4idx}"+"}$",h_stacked[(oneelint_idx[:,0]==v_idx)&(oneelint_idx[:,1]==u_idx)].flatten()[-1]))
            

    
    # Pair-energies
    pairenergy = np.sum(subpairs[:,-1].astype(float))
    pairenergy_df.loc[i] = pairenergy
    checkE2 += pairenergy

  pairenergy_df.loc[i] = pairenergy


In [79]:
print(f"Pair-energies sum up to calculated correlation energy? {np.isclose(checkE2,E2)}")

Pair-energies sum up to calculated correlation energy? True


In [68]:
# IT,IU,F(global index),FI(global index),fa(global index),d(global index)

caspt2fockdf = pd.concat([pd.DataFrame.from_dict(vals,orient='index',columns=[idx]) for idx, vals in CASPT2Fockfeatures],axis=1).T

In [75]:
# binary feature df
bindf = pd.DataFrame(binary_feat).set_index(0).rename(columns={0:'binary'})

In [49]:
# one-electron dataframe
h_df = pd.DataFrame(h_features).pivot(index=0, columns=1)
h_df.columns=h_df.columns.droplevel()
h_df.drop(columns=["h$_{qs}^{3}$","h$_{qs}^{1}$","h$_{qs}^{2}$"],inplace=True)
h_df.rename(columns={"h$_{qs}^{0}$":"h$_{qs}$"},inplace=True)

# Important 4 types
important2e = pd.DataFrame(b4_type).pivot(index=0, columns=1)
important2e.columns=important2e.columns.droplevel()

In [80]:
# Everything together so far
concatdf = pd.concat([h_df,important2e,bindf,caspt2fockdf,pairenergy_df],axis=1)