In [1]:
#######################################################
#Import packages
import numpy as np
import os
import re
from math import sin, cos, pi
import glob
import subprocess
import pickle
from subprocess import call, check_output
import pandas as pd
import sys
from time import time
import matplotlib.pyplot as plt
from plumbum.cmd import grep, awk

import shutil
import random
import sklearn
from shutil import copy
import csv
import h5py as h5
import seaborn as sns; sns.set(style="ticks", color_codes=True)

In [2]:
#######################################################
# Obital labels
## Inactive i,j
## Active t,u,v
## Virtual a,b

## Type 1: IA->AA
## Type 2: II->AA (P)
## Type 3: II->AA (M)
## Type 4: AA->VA
## Type 5: IA->VA/AV
## Type 6: II->AV (P)
## Type 7: II->AV (M)
## Type 8: AA->VV (P)
## Type 9: AA->VV (M)
## Type 10: IA->VV (P)
## Type 11: IA->VV (M)
## Type 12: II->VV (P)
## Type 13: II->VV (M)

## A: IA->AA
## B: II->AA
## C: AA->VA
## D: IA->VA/AV
## E: II->AV
## F: AA->VV
## G: IA->VV 
## H: II->VV
#######################################################


In [3]:
len(np.arange(106,181,0.5))

150

In [39]:
# Set precision, we can change this at will
np.set_printoptions(precision=4)
pd.set_option('precision', 4)

# This tells us the number of structures and how theyre labeled
# structs=np.arange(106,181,1)
structs=np.round(np.arange(0.2,4,0.1), 2)
# structs=np.arange(150,171,20)
# CASPT2 excitation types
excittype=np.arange(1,14,1)
# Define molecule
typ='FeO'

# Find structures
struct_name=[i for i in os.listdir() if i.endswith('.xyz') and i.startswith(f'{typ}') and '_' in i]

if len(struct_name)==0:
    for i in structs:
        if os.path.exists(os.path.join(os.getcwd(),'Coords',f'FeO_{i}.xyz')):
            struct_name.append(f'FeO_{i}.xyz')


In [40]:
print(len(structs))

38


In [41]:
# Delete excessive extra files
def del_useless():
    '''
    Delete the extra files
    '''
    for i in os.listdir():
        for j in ['status','GssOrb','LprOrb','LoProp','guessorb','xmldump','RasOrb','SpdOrb']:
            if j in i:
                os.remove(i)

In [42]:
# When restarting a setr of calculations just clear everyting out
def clean_dir():
    for entry in os.scandir(path=os.getcwd()):
        if entry.is_dir():
            if entry.name=='Fock':
                shutil.rmtree(entry.name)
            if entry.name=='hdf5':
                shutil.rmtree(entry.name)
            if entry.name=='e2':
                shutil.rmtree(entry.name)                
            if entry.name=='Labels':
                shutil.rmtree(entry.name)
            if entry.name=='Coords':
                shutil.rmtree(entry.name)
            if 'dir' in entry.name:
                shutil.rmtree(entry.name)
                


In [43]:
# Run this before clean_dir, this pulls the xyz files out just to 
def pull_xyz():
    import re
    for i in struct_name:
        if os.path.exists(os.path.join(os.getcwd(),i))==False and os.path.exists(os.path.join(os.getcwd(),'Coords')):
            shutil.copy(os.path.join(os.getcwd(),'/'.join(('Coords',i))),os.path.join(os.getcwd(),i))


In [44]:
def gen_gateway(angle):
    string=f'''&GATEWAY 
coord={f'FeO_{angle}.xyz'}
Basis = ANO-RCC-MB
Group = nosymm
End of Input


'''
    return string
def gen_seward():
    string=f'''&SEWARD
End of Input
'''
    return string

def gen_motra(angle):
    string=f'''&MOTRA
Frozen=0
>>> COPY $WorkDir/GMJ_one_int_indx.csv $CurrDir/{angle}.GMJ_one_int_indx.csv
>>> COPY $WorkDir/GMJ_one_int.csv $CurrDir/{angle}.GMJ_one_int.csv
>>> COPY $WorkDir/GMJ_AO_INT.csv $CurrDir/{angle}.GMJ_AO_INT.csv

'''
    return string

def gen_scf(angle):
    string=f"""&SCF &END
>>> COPY $WorkDir/PT2_iron_oxo_{angle}.scf.h5 $CurrDir/
"""
    return string    


def gen_rasscf(angle):
    string=f"""&RASSCF &END
Title= RASSCF
fileorb
{fileorb}
NACTEL
12 0 0
Inactive
11
RAS2
9
Symmetry
1
Spin
1
orblisting
all
ITERation
200 100
CIMX
200
SDAV
500
PRWF
0

>>> COPY $WorkDir/PT2_iron_oxo_{angle}.rasscf.h5 $CurrDir/
>>> COPY $WorkDir/GMJ_Fock_MO.csv $CurrDir/{angle}.GMJ_Fock_MO.csv
"""
    return string    

def gen_caspt2():
    string="""&CASPT2 &END
Frozen 
0
Imaginary Shift
0.0

>>foreach i in (B,E,F,G,H)
>>foreach j in (P,M)
>>if ( -FILE GMJ_e2_${i}_${j}.csv )
>>> COPY $WorkDir/GMJ_RHS_${i}_${j}.csv $CurrDir/GMJ_RHS_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_IVECW_${i}_${j}.csv $CurrDir/GMJ_IVECW_${i}_${j}.csv
>>> COPY $WorkDir/GMJ_e2_${i}_${j}.csv $CurrDir/GMJ_e2_${i}_${j}.csv
>>endif
>>enddo
>>enddo

>>foreach i in (A,C,D)
>>if ( -FILE GMJ_e2_$i.csv )
>>> COPY $WorkDir/GMJ_RHS_$i.csv $CurrDir/GMJ_RHS_$i.csv
>>> COPY $WorkDir/GMJ_IVECW_$i.csv $CurrDir/GMJ_IVECW_$i.csv
>>> COPY $WorkDir/GMJ_e2_$i.csv $CurrDir/GMJ_e2_$i.csv
>>endif
>>enddo

"""
    return string    





In [52]:
def gen_XYZ(angle):
    f=open(f'FeO_{angle}.xyz','w+')
#     N        0.0000000000      0.0000000000      0.0000000000                 
#     N        1.4485557861      0.0000000000      0.0000000000
    Fe=f'Fe {0.0000:.6f} {0.0000:.6f} {0.0000:.6f}'
    O=f'O {angle:.6f} {0.0000:.6f} {0.0000:.6f}'    
    mylist=[Fe, O]
    f.write(str(len(mylist))+'\n'+'\n')
    [f.write(i+'\n') for i in mylist]
    f.close()
    return mylist
    

def gen_eq_orbs():
    angle=1.558
    gen_XYZ(angle)
    new_input=f'''&GATEWAY 
coord={f'FeO_{angle}.xyz'}
Basis = ANO-RCC-MB
Group = nosymm
End of Input



&SEWARD
End of Input

&RASSCF &END
Title= RASSCF
NACTEL
12 0 0
Inactive
11
RAS2
9
Symmetry
1
Spin
1
orblisting
all
ITERation
200 100
CIMX
200
SDAV
500
PRWF
0


&CASPT2 &END
Frozen
0
Imaginary Shift
0.0
'''
    g=open(f'eq_orbs.input', 'wb')
    g.write(new_input.encode())
    g.close()
    call(['pymolcas', f'eq_orbs.input', '-oe', f'eq_orbs.output'])

# ALTEr=3; 1 10 11; 1 9 12; 1 13 13

fileorb=os.path.join(os.getcwd(),'eq_orbs.RasOrb')
def Gen_Scan():
    import re
    for angle in structs:
        gen_XYZ(angle)
        g=open(f'PT2_iron_oxo_{angle}.input', 'wb')
        g.write(gen_gateway(angle).encode())
        g.write(gen_seward().encode())
        g.write(gen_motra(angle).encode())
        g.write(gen_scf(angle).encode())        
        g.write(gen_rasscf(angle).encode())        
        g.write(gen_caspt2().encode())
        g.close()
        call(['pymolcas','-new','-clean',f'PT2_iron_oxo_{angle}.input', '-oe', f'PT2_iron_oxo_{angle}.output'])
        [shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),f'{angle}.{i}')) for i in glob.glob(f'GMJ_RHS_*')]
        [shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),f'{angle}.{i}')) for i in glob.glob(f'GMJ_IVECW_*')]        
        [shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),f'{angle}.{i}')) for i in glob.glob(f'GMJ_e2_*')]

In [53]:
# Move input and output files to their proper directory
def mv_fils():
    '''
    Move the files to directories named for them
    '''
    import shutil
    import os
    for i in structs:
        if os.path.exists(os.path.join(os.getcwd(),f'{i}_dir')) is True:
            if os.path.exists(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.output'))==True:
                shutil.move(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.output'),os.path.join(os.getcwd(),f'{i}_dir',f'PT2_iron_oxo_{i}.output'))
            if os.path.exists(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.input'))==True:
                shutil.move(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.input'),os.path.join(os.getcwd(),f'{i}_dir',f'PT2_iron_oxo_{i}.input'))
        else:
            os.mkdir(f'{i}_dir')
            if os.path.exists(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.output'))==True:
                shutil.move(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.output'),os.path.join(os.getcwd(),f'{i}_dir',f'PT2_iron_oxo_{i}.output'))
            if os.path.exists(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.input'))==True:
                shutil.move(os.path.join(os.getcwd(),f'PT2_iron_oxo_{i}.input'),os.path.join(os.getcwd(),f'{i}_dir',f'PT2_iron_oxo_{i}.input'))




In [54]:
# Move data files to directories
def mv_dat():
# Move h5 and coord files to dirs
    for i in os.listdir():
        if i.endswith('h5'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'hdf5')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('hdf5',i))))
                else:
                    os.mkdir(f'hdf5')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('hdf5',i))))
# Move xyz files
        if i.endswith('.xyz') and i.startswith(f'{typ}') and '_' in i:
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'Coords')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Coords',i))))
                else:
                    os.mkdir(f'Coords')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Coords',i))))
# Move pair energy files
        if '_e2' in i and i.endswith('csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'e2')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('e2',i))))
                else:
                    os.mkdir(f'e2')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('e2',i))))
# Move rhs files which tell how the ivvecc2 and ivecw files are indexed
        if 'RHS' in i and i.endswith('csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'Labels')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Labels',i))))
                else:
                    os.mkdir(f'Labels')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Labels',i))))
        if 'IVECW' in i and i.endswith('csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'IVECW')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('IVECW',i))))
                else:
                    os.mkdir(f'IVECW')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('IVECW',i))))

                    
# MO fock matrix
        if 'Fock' in i and i.endswith('csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'Fock')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Fock',i))))
                else:
                    os.mkdir(f'Fock')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('Fock',i))))
        if i.endswith('_one_int_indx.csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'1_int_idx')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('1_int_idx',i))))
                else:
                    os.mkdir(f'1_int_idx')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('1_int_idx',i))))
        if i.endswith('_two_int_indx.csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'2_int_idx')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('2_int_idx',i))))
                else:
                    os.mkdir(f'2_int_idx')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('2_int_idx',i))))
        if i.endswith('GMJ_one_int.csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'1_ints')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('1_ints',i))))
                else:
                    os.mkdir(f'1_ints')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('1_ints',i))))
        if i.endswith('GMJ_two_int.csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'2_ints')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('2_ints',i))))
                else:
                    os.mkdir(f'2_ints')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('2_ints',i))))
        if i.endswith('GMJ_AO_INT.csv'):
            if os.path.exists(os.path.join(os.getcwd(),i))==True:
                if os.path.exists(os.path.join(os.getcwd(),f'AO_ints')) is True:
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('AO_ints',i))))
                else:
                    os.mkdir(f'AO_ints')
                    shutil.move(os.path.join(os.getcwd(),i),os.path.join(os.getcwd(),'/'.join(('AO_ints',i))))





In [55]:


def run():
    # Pull xyz structures
    pull_xyz()
    # The rest follows as labeled above
    clean_dir()
    gen_eq_orbs()    
    Gen_Scan() 
    mv_fils()
    del_useless()
    mv_dat()

In [56]:
# Do a new run
run()

In [57]:
# Grab caspt2 energies
def caspt2():
    PT2=[]
    PT2List=[]
    PT2labels=[]

    for j in structs:
        if os.path.exists(os.path.join(os.getcwd(),f'{j}_dir'))==True:
            path=os.path.join(os.getcwd(),f'{j}_dir',f'PT2_iron_oxo_{j}.output')
            PT2labels.append(j)
            PT2.append(float((grep['-i', '::    CASPT2',path] | awk['{print $NF }'])()))
    PT2List=np.array(PT2)
    PT2Dict=pd.DataFrame({'Label':PT2labels,'PT2':PT2List})
    PT2Dict.to_csv('CASPT2.csv')        
    return PT2Dict.set_index('Label')

In [58]:
df_CASPT2=caspt2()

ProcessExecutionError: Unexpected exit code: 1
Command line: | /usr/bin/grep -i '::    CASPT2' /Users/gjonesresearch/DDCASPT2/MolcasProjects/iron_oxo/2.9_dir/PT2_iron_oxo_2.9.output

In [None]:
plt.plot(df_CASPT2-df_CASPT2.max())
plt.ylim(-176.25,-176)

In [None]:
# How to grab the whole row
# caspt2().loc[caspt2()['Labels'] =='ethylene_7']

In [None]:
# Grab casscf energies 
def casscf():
    SCF=[]
    SCFList=[]
    SCFlabels=[]
    for j in structs:
        if os.path.exists(os.path.join(os.getcwd(),f'{j}_dir'))==True:
            path=os.path.join(os.getcwd(),f'{j}_dir',f'PT2_iron_oxo_{j}.output')
            SCFlabels.append(j)
            SCF.append(float((grep['-i', '::    RASSCF root number  1',path] | awk['{print $8 }'])()))
    SCFList=np.array(SCF)
    SCFDict=pd.DataFrame({'Label':SCFlabels,'SCF':SCFList})
    SCFDict.to_csv('CASSCF.csv')    
    return SCFDict.set_index('Label')

In [None]:
df_CASSCF=casscf()


In [None]:
df_CASSCF

In [None]:
# Grab PT2 energies 
def PT2():
    E2=[]
    E2List=[]
    E2labels=[]
    ES_E2=[]
    ES_E2List=[]
    ES_E2labels=[]

    for j in structs:
        if os.path.exists(os.path.join(os.getcwd(),f'{j}_dir'))==True:
            path=os.path.join(os.getcwd(),f'{j}_dir',f'PT2_iron_oxo_{j}.output')
            E2labels.append(j)
            E2.append(float((grep['-i', 'E2 (Variational):',path] | awk['{print $NF }'])()))
    E2List=np.array(E2)
    E2Dict=pd.DataFrame({'Label':E2labels,'E2':E2List})
    df=E2Dict.set_index('Label')
    df.to_csv('E2.csv')
    return df

In [None]:
E2Dict=PT2()

In [None]:
E2Dict

In [None]:
# Grab file paths for the directories
# onlyfiles = [f for f in os.listdir(os.getcwd()) if os.path.isdir(os.path.join(os.getcwd(), f))]
onlyfiles = (f for f in os.listdir(os.getcwd()) if os.path.isdir(os.path.join(os.getcwd(), f)))
for entry in os.scandir(path=os.getcwd()):
    if entry.is_dir():
        if entry.name=='Fock':
            path_to_Focks=os.path.join(os.getcwd(),entry.name)
        if entry.name=='e2':
            path_to_e2=os.path.join(os.getcwd(),entry.name)
        if entry.name=='Labels':
            path_to_Labels=os.path.join(os.getcwd(),entry.name)
        if entry.name=='IVECW':
            path_to_IVECW=os.path.join(os.getcwd(),entry.name)
        if entry.name=='1_ints':
            path_to_1_ints=os.path.join(os.getcwd(),entry.name)
        if entry.name=='2_ints':
            path_to_2_ints=os.path.join(os.getcwd(),entry.name)
        if entry.name=='1_int_idx':
            path_to_1_int_idx=os.path.join(os.getcwd(),entry.name)
        if entry.name=='2_int_idx':
            path_to_2_int_idx=os.path.join(os.getcwd(),entry.name)


In [None]:

typ_exists=sorted(sum(list([j.replace('GMJ_e2_','') for j in i.split('/')[-1].split('.') if 'GMJ' in j] for i in glob.glob(os.path.join(path_to_e2,f'{structs[0]}.GMJ_e2_*'))),[]))


In [None]:
typ_exists

In [None]:
with open('exists.pickle', 'wb') as handle:
    pickle.dump(typ_exists, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [None]:
cwd = os.getcwd()
#   Keep everything at float64
DTYPE = np.float_
# DTYPE = np.float16

#   Create an array with the easy data
def createArrray(filename):
    files = sorted(glob.glob(filename))
    arrayname = []
    for i in sorted(files):
        arrayname.append(
            np.stack(
                np.array(pd.read_csv(i, header=None),
                         dtype=DTYPE,
                         copy=False).flatten()))

    arrayname = np.asarray(arrayname, dtype=DTYPE)
    return arrayname
#   Start transforming the HDF5 files from the data directory
h5list = []
for i in structs:
    h5list.append(os.path.join(os.getcwd(),'hdf5',f'PT2_iron_oxo_{str(i)}.rasscf.h5'))
# Grab AO_OVERLAP_MATRIX,MO_ENERGIES,MO_OCCUPATIONS,MO_VECTORS,MO_TYPEINDICES,NACTEL,NBAS from hdf5
def h5feats(h5list):
    f = h5.File(h5list[0], 'r')
    datasetNames = [n for n in f.keys()]
    b = []
    labels = []
    # AO_FOCKINT_MATRIX=[]
    # Useful attributes from the hdf5 files
    NBAS=[]
    NACTEL=[]
    for k, elem in enumerate(structs):
        for count, ele in enumerate([i for i in f.attrs]):
            if ele =='NBAS':
                for i, elemt in enumerate(np.array(h5.File(h5list[k],'r').attrs[ele]).reshape(-1)):
                    NBAS.append(elemt)
            if ele =='NACTEL':
                for i, elemt in enumerate(np.array(h5.File(h5list[k],'r').attrs[ele]).reshape(-1)):
                    NACTEL.append(elemt)


    MO_ENERGIES=[]
    MO_OCCUPATIONS=[]
    MO_TYPEINDICES=[]
    MO_VECTORS=[]
    t0=time()
    #   Eliminate certain features that won't be good for regression
    for k, elem in enumerate(structs):
        for count, ele in enumerate([n for n in h5.File(h5list[k], 'r').keys()]):
            if ele =='MO_TYPEINDICES':
                for i, elemt in enumerate(np.array(h5.File(h5list[k],'r')[ele]).reshape(-1)):
                    MO_TYPEINDICES.append(elemt)

            if ele =='MO_ENERGIES':
                for i, elemt in enumerate(np.array(h5.File(h5list[k],'r')[ele]).reshape(-1)):
                    MO_ENERGIES.append(elemt)

            if ele =='MO_OCCUPATIONS':
                for i, elemt in enumerate(np.array(h5.File(h5list[k],'r')[ele]).reshape(-1)):
                    MO_OCCUPATIONS.append(elemt)


                    

    print(f'time: {time()-t0} s')
    # AO_FOCKINT_MATRIX=np.array(AO_FOCKINT_MATRIX).reshape(len(dislist),int(NBAS[0]),int(NBAS[0]))
    MO_ENERGIES= np.array(MO_ENERGIES).reshape(len(structs),int(NBAS[0]))
    MO_OCCUPATIONS= np.array(MO_OCCUPATIONS).reshape(len(structs),int(NBAS[0]))
    MO_TYPEINDICES=np.array(MO_TYPEINDICES).reshape(len(structs),int(NBAS[0]))
    
    return MO_ENERGIES,MO_OCCUPATIONS,MO_TYPEINDICES,NACTEL,NBAS


MO_ENERGIES,MO_OCCUPATIONS,MO_TYPEINDICES,NACTEL,NBAS=h5feats(h5list)


h5list_scf = []
for i in structs:
    h5list_scf.append(os.path.join(os.getcwd(),'hdf5',f'PT2_iron_oxo_{str(i)}.scf.h5'))
# Grab AO_OVERLAP_MATRIX,MO_ENERGIES,MO_OCCUPATIONS,MO_VECTORS,MO_TYPEINDICES,NACTEL,NBAS from hdf5
def MO_VEC(h5list_scf):
    f = h5.File(h5list_scf[0], 'r')
    datasetNames = [n for n in f.keys()]
    b = []
    labels = []
    # AO_FOCKINT_MATRIX=[]
    # Useful attributes from the hdf5 files
    NBAS=[]
    NACTEL=[]
    for k, elem in enumerate(structs):
        for count, ele in enumerate([i for i in f.attrs]):
            if ele =='NBAS':
                for i, elemt in enumerate(np.array(h5.File(h5list_scf[k],'r').attrs[ele]).reshape(-1)):
                    NBAS.append(elemt)
    MO_VECTORS=[]
    MO_ENERGIES=[]  
    MO_OCCUPATIONS=[]
    t0=time()
    #   Eliminate certain features that won't be good for regression
    for k, elem in enumerate(structs):
        for count, ele in enumerate([n for n in h5.File(h5list_scf[k], 'r').keys()]):
            if ele =='MO_VECTORS':
                for i, elemt in enumerate(np.array(h5.File(h5list_scf[k],'r')[ele]).reshape(-1)):
                    MO_VECTORS.append(elemt)
            if ele =='MO_ENERGIES':
                for i, elemt in enumerate(np.array(h5.File(h5list_scf[k],'r')[ele]).reshape(-1)):
                    MO_ENERGIES.append(elemt)
            if ele =='MO_OCCUPATIONS':
                for i, elemt in enumerate(np.array(h5.File(h5list_scf[k],'r')[ele]).reshape(-1)):
                    MO_OCCUPATIONS.append(elemt)

                    

                    

                    

    print(f'time: {time()-t0} s')
    MO_ENERGIES= np.array(MO_ENERGIES).reshape(len(structs),int(NBAS[0]))
    MO_OCCUPATIONS= np.array(MO_OCCUPATIONS).reshape(len(structs),int(NBAS[0]))
    MO_VECTORS=np.array(MO_VECTORS).reshape(len(structs),int(NBAS[0]),int(NBAS[0]))
    
    return MO_VECTORS,MO_ENERGIES,MO_OCCUPATIONS


MO_VECTORS,scf_F,scf_OCC=MO_VEC(h5list_scf)



In [None]:
def gen_indx(list_of_dicts):
    indx=[]
    for i in list_of_dicts.keys():
        if len(list_of_dicts[i])>0:
            indx.append(list(list_of_dicts[i].keys()))
    return indx[0]


path_check=os.path.join(os.getcwd(),f'{structs[0]}_dir',f'PT2_iron_oxo_{structs[0]}.output')

# Sanity check...
# REMEMBER FROZEN CORE APPROXIMATION
# Number of frozen orbitals
fro=int(subprocess.Popen(f"grep -i 'Frozen orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of inactive orbitals
inact=int(subprocess.Popen(f"grep -i 'Inactive orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of active orbitals
act=int(subprocess.Popen(f"grep -i 'Active orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of seconary orbitals
virt=int(subprocess.Popen(f"grep -i 'Secondary orbitals' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])
# Number of basis functions for sanity check
bas_check=int(subprocess.Popen(f"grep -i 'Number of basis functions' {path_check} | tail -n 1",shell=True,stdout=subprocess.PIPE,stderr=subprocess.STDOUT).communicate()[0].split()[-1])

Basis_Indices=[]
for i in range(fro):
    Basis_Indices.append(f'F{i+1}')
for i in range(inact):
    Basis_Indices.append(f'I{i+1}')
for i in range(act):
    Basis_Indices.append(f'A{i+1}')
for i in range(virt):
    Basis_Indices.append(f'S{i+1}')    
    
print(f'Basis sanity check passed={bas_check==len(Basis_Indices)}') 

In [None]:
# Generate the labels that match the IVECW and IVECC2 files
def gen_labels(typ):
    Labels=[]
    Indexes=[]
    i=structs[0]
    return [j.split()[0].replace('\n','').replace('00','').replace('S0','S').replace('I0','I').replace(',','') for j in open(os.path.join(path_to_Labels,f'{i}.GMJ_RHS_{typ}.csv'),'r').readlines()]




In [None]:
def gen_pair_labels(typ):
    Labels=[]
    Indexes=[]
    i=structs[0]
    return sorted(set(['_'.join(j.split()[0].replace('\n','').replace('00','').replace('S0','S').replace('I0','I').replace(',','').split('_')[0:2]) for j in open(os.path.join(path_to_Labels,f'{i}.GMJ_RHS_{typ}.csv'),'r').readlines()]))



In [None]:
print([(typ,len(gen_labels(typ)),len(gen_pair_labels(typ))) for typ in typ_exists])

In [None]:
def gen_dim_dict(typ_exists):
    '''    
    Dimension check for DDCASPT2: check the ordering of the pair-energies,
    this notation follows a mix of the papers and code.
    
    A (IA->AA): \n TIUV \n E_{ti} E_{uv} \n pqrs=tiuv=0123 \n    
    B_P (II->AA) (P): \n IJTU \n E_{ti} E_{uj} \n pqrs=tiuj=2031 \n
    B_M (II->AA) (M): \n IJTU \n E_{ti} E_{uj} \n pqrs=tiuj=2031 \n
    C (AA->VA): \n UVAT \n E_{at} E_{uv} \n pqrs=atuv=2301 \n
    D (IA->VA/AV): \n IUAT/IUTA \n E_{ai} E_{tu}/E_{ti} E_{au} \n pqrs=(a/t)i(t/a)u=2031 \n
    E_P (II->AV) (P): \n IJAT \n E_{ti} E_{aj} \n pqrs=tiaj=3021 \n
    E_M (II->AV) (M): \n IJAT \n E_{ti} E_{aj} \n pqrs=tiaj=3021 \n
    F_P (AA->VV) (P): \n TUAB \n E_{at} E_{bu} \n pqrs=atbu=2031 \n
    F_M (AA->VV) (M): \n TUAB \n E_{at} E_{bu} \n pqrs=atbu=2031 \n
    G_P (IA->VV) (P): \n ITAB \n E_{ai} E_{bt} \n pqrs=aibt=2031 \n
    G_M (IA->VV) (M): \n ITAB \n E_{ai} E_{bt} \n pqrs=aibt=2031 \n
    H_P (II->VV) (P): \n IJAB \n E_{ai} E_{bj} \n pqrs=aibj=2031 \n
    H_M (II->VV) (M): \n IJAB \n E_{ai} E_{bj} \n pqrs=aibj=2031 \n
    '''    
    i=structs[0]
    dims=[]
    for typ in typ_exists:
        dims.append((typ,np.array([i.split('=')[-1].split('x') for i in open(os.path.join(path_to_e2,f'{i}.GMJ_e2_{typ}.csv'),'r').readlines() if 'mat. size =' in i ]).flatten().astype(int)))
    return dict(dims)

In [None]:
typ_exists

In [None]:
dims_dict=gen_dim_dict(typ_exists)

In [None]:
dims_dict

In [None]:
def strip(lst):   
    return '_'.join(i.replace('A00','A').replace('I00','I').replace('S00','S').replace('I0','I').replace('A0','A').replace('S0','S') for i in lst.split('_'))

In [None]:
def gen_ordered(typ):
    '''
    Return a dataframe for each type
    Index=proper indexing
    level_0=row
    level_1=column
    0=W value
    '''
    i=structs[0]
    ordered=pd.read_csv(os.path.join(path_to_IVECW,f'{i}.GMJ_IVECW_{typ}.csv'),delim_whitespace=True, skiprows=[0],header=None).astype(np.float64).dropna(axis=1)
    ordered.columns=list(range(len(ordered.columns)))
    ordered=ordered.stack()
    df=pd.read_csv(os.path.join(path_to_Labels,f'{i}.GMJ_RHS_{typ}.csv'),header=None,delimiter=',',index_col=0)
    df.index=list(map(strip,df.index))
    merged=ordered.reset_index().sort_values(by=0).set_index(df.sort_values(by=1).index).sort_values(['level_0','level_1'])    

#     print(f'Type {typ} all are correct={(merged[0].values.flatten()==ordered.values.flatten()).all()==True}')
    return merged

In [None]:
## Generate IVECW
def gen_e2(typ):
    e2=[]
    proper_labels=gen_labels(typ)
    for i in structs:
        df=pd.read_csv(os.path.join(path_to_e2,f'{i}.GMJ_e2_{typ}.csv'),delim_whitespace=True, skiprows=[0],header=None).astype(np.float64).dropna(axis=1).stack()
        df.index=gen_ordered(typ).index
        df=df.to_frame(name=str(i))
        e2.append(df)
    df1=pd.concat(e2,axis=1).loc[proper_labels]
    df1.index=[i for idx,i in enumerate(proper_labels)]
    return df1

In [None]:
def gen_pair(typ):
    Y=gen_e2(typ).astype(float)
# Needs to be qs, we're summing over the occupied orbitals    
    Y_pair_set=list(set(['_'.join((i.split('_')[0],i.split('_')[1]))+'_' for i in Y.index.tolist()]))
    Y_pair_df=pd.concat([Y[Y.index.str.find(j)==0].sum() for j in Y_pair_set],axis=1)
    Y_pair_df.columns=list(set(['_'.join((i.split('_')[0],i.split('_')[1])) for i in Y.index.tolist()]))
    return Y_pair_df.T.sort_index().groupby(level=0).sum()



In [None]:
def stack_label(typ):
    if f'{typ}_M' in typ_exists and f'{typ}_P' in typ_exists:
        return gen_pair_labels(f'{typ}_P')+gen_pair_labels(f'{typ}_M')
    elif f'{typ}_P' in typ_exists:
        return gen_pair_labels(f'{typ}_P')
    elif f'{typ}_M' in typ_exists:
        return gen_pair_labels(f'{typ}_M')
        

In [None]:
def stack_e2(typ):
    if f'{typ}_M' in typ_exists and f'{typ}_P' in typ_exists:
        df=pd.concat([gen_pair(f'{typ}_M'),gen_pair(f'{typ}_P')],axis=0).groupby(level=0).sum()
#         df.index=[i for idx,i in enumerate(stack_label(typ))]
        return df
    elif f'{typ}_P' in typ_exists:
        return gen_pair(f'{typ}_P').groupby(level=0).sum()
    elif f'{typ}_M' in typ_exists:
        return gen_pair(f'{typ}_M').groupby(level=0).sum()
        

In [None]:
# Generate the data
for typ in set([i.split('_')[0] for i in typ_exists ]):
    if typ=='A':
        typA_e2=gen_pair(f'{typ}')        
        typA_labels=gen_pair_labels(typ)
    if typ=='B':        
        typB_e2=stack_e2(typ)
        typB_labels=stack_label(typ)        
    if typ=='C':
        typC_e2=gen_pair(f'{typ}')
        typC_labels=gen_pair_labels(f'{typ}')
    if typ=='D':        
        typD_labels=gen_pair_labels(f'{typ}')
        typD_e2=gen_pair(f'{typ}')
    if typ=='E':
        typE_e2=stack_e2(typ)
        typE_labels=stack_label(typ)
    if typ=='F':        
        typF_e2=stack_e2(typ)
        typF_labels=stack_label(typ)
    if typ=='G':        
        typG_e2=stack_e2(typ)
        typG_labels=stack_label(typ)
    if typ=='H':  
        typH_e2=stack_e2(typ)
        typH_labels=stack_label(typ)



In [None]:
len(typA_labels+typB_labels+typC_labels+typD_labels+typE_labels+typF_labels+typG_labels+typH_labels)

In [None]:
stacked_e2=pd.concat([gen_e2(typ) for typ in typ_exists]).groupby(level=0).sum()

In [None]:
stacked_e2.shape

In [None]:
plt.scatter(E2Dict,stacked_e2.sum())

In [None]:
# big_40=stacked_e2.T.abs().describe().loc['mean'].sort_values(ascending=False).nlargest(500).index


In [None]:
stacked_pairs=pd.concat([typA_e2,typB_e2,typC_e2,typD_e2,typE_e2,typF_e2,typG_e2,typH_e2]).groupby(level=0).sum()
pair_labels=stacked_pairs.index.tolist()

In [None]:
typ_exists

In [None]:
pd.concat([gen_pair(typ) for typ in typ_exists]).groupby(level=0).sum()

In [None]:
dummy_stack=pd.concat([gen_e2(typ) for typ in typ_exists])

In [None]:
dummy_stack

In [None]:
# Grab molecular orbital occupations and make it into a dataframe labeled with xyz file name
MO_OCC=[]
for j in range(len(structs)):
    MO_OCC.append(dict(zip(Basis_Indices,[i for i in list(MO_OCCUPATIONS[j])])))
MO_OCC_Dict=dict(zip([str(k) for k in structs],MO_OCC))
MO_OCC_DF=pd.DataFrame(MO_OCC_Dict)

# Dataframe of MO occupation, index=basis indices and columns=structs
MO_OCCUPATIONS_DF=pd.DataFrame(MO_OCCUPATIONS,index=structs,columns=Basis_Indices).transpose()



In [None]:
# 
# Keep in mind HDF5 zeroes out the actrive orbitals... we'll use the Fock matrix to recover these
# 
# Grab molecular orbital energy and make it into a dataframe labeled with xyz file name
MO_Energ=[]
for j in range(len(structs)):
    MO_Energ.append(dict(zip(Basis_Indices,[i for i in list(MO_ENERGIES[j])])))

MO_Energ_Dict=dict(zip([str(k) for k in structs],MO_Energ))
MO_Energy_DF=pd.DataFrame(MO_Energ_Dict)

# Dataframe of MO energies, index=basis indices and columns=structs
MO_ENERGIES_DF=pd.DataFrame(MO_ENERGIES,index=structs,columns=Basis_Indices).transpose()

# Generate fock matrix
Fock_mats=[]
for indx,j in enumerate(structs):
    fileFock=open(os.path.join(path_to_Focks,f'{j}.GMJ_Fock_MO.csv'),'r').readlines()
    shape=[int(k) for k in fileFock[0].split('x')]
    Fock_mats.append(np.array([float(i.split()[2]) for i in fileFock[1:]]).reshape([int(k) for k in fileFock[0].split('x')]))
Fock_mats=np.array(Fock_mats)    
Fock_dict=dict(zip([str(z) for z in structs],Fock_mats))
labeled_fock_dict=dict(zip([str(z) for z in structs],[pd.DataFrame(Fock_dict[str(x)],index=Basis_Indices,columns=Basis_Indices) for x in structs]))


# List of MOs Energies
TEST_New_MO_Energies=[]
for i in structs:
    for indx,nam in enumerate(Basis_Indices):
        if nam[0]=='A':
            TEST_New_MO_Energies.append((nam,float(0.5*labeled_fock_dict[str(i)][nam][nam])))
        if nam[0]!='A':
            TEST_New_MO_Energies.append((nam,float(MO_Energ_Dict[str(i)][nam])))

New_MO_Dict=dict(zip([str(z) for z in structs],[dict(np.array(TEST_New_MO_Energies).reshape(len(structs),len(Basis_Indices),-1)[i]) for i in range(len(structs))]))





In [None]:
for indx,k in enumerate(structs):
    labeled_fock_dict[str(k)]=labeled_fock_dict[str(k)]*0.5
    for i in Basis_Indices:
        for j in Basis_Indices:
            labeled_fock_dict[str(k)][j][i]=labeled_fock_dict[str(k)][i][j]
            if i==j and i.startswith('S') and j.startswith('S'):
                labeled_fock_dict[str(k)][i][j]=MO_ENERGIES_DF[k].loc[i]

In [None]:
labeled_fock_dict['106.0']['A1']['A1']

In [None]:
New_MO_Dict['106.0']['A1']

In [None]:
def gen_one_int():
    one_int=[]
    Labels=[]
    Indexes=[]
    upd_1int_indx=[]
    def one_gener(i):
        for j in open(os.path.join(path_to_1_ints,f'{i}.GMJ_one_int.csv'),'r').readlines():
            yield float(j.replace('\n',''))
    def one_lbl_gen(i):
        for j in open(os.path.join(path_to_1_int_idx,f'{i}.GMJ_one_int_indx.csv'),'r').readlines():
            yield '_'.join(j.replace('\n','').split()) 

    for i in one_lbl_gen(structs[0]):
        upd_1int_indx.append(Basis_Indices[int(i.split('_')[0])-1]+'_'+Basis_Indices[int(i.split('_')[1])-1])

#     Dict=dict(zip(Indexes,Labels))
    return pd.concat([pd.DataFrame(one_gener(str(i)),index=upd_1int_indx,columns=[str(i)]) for ind,i in enumerate(structs)],axis=1)





In [None]:
t0=time()
int1=gen_one_int()
print(f'Integrals loaded in {time()-t0:0.4f} s')

In [None]:
def fix1(g):
    II=[]
    AA=[]
    SS=[]
    IA=[]
    AS=[]
    SI=[]    
    ad_df=[]


# II
    for i in range(len([i for i in Basis_Indices if i.startswith('I')])):
        for j in range(i,len([i for i in Basis_Indices if i.startswith('I')])):
            if f'I{i+1}'!=f'I{j+1}':
                II.append((f'I{i+1}_I{j+1}',int1[str(g)].loc[f'I{j+1}_I{i+1}']))

    # AA        
    for i in range(len([i for i in Basis_Indices if i.startswith('A')])):
        for j in range(i,len([i for i in Basis_Indices if i.startswith('A')])):
            if f'A{i+1}'!=f'A{j+1}':
                AA.append((f'A{i+1}_A{j+1}',int1[str(g)].loc[f'A{j+1}_A{i+1}']))

    # SS    
    for i in range(len([i for i in Basis_Indices if i.startswith('S')])):
        for j in range(i,len([i for i in Basis_Indices if i.startswith('S')])):
            if f'S{i+1}'!=f'S{j+1}':
                SS.append((f'S{i+1}_S{j+1}',int1[str(g)].loc[f'S{j+1}_S{i+1}']))

# Off diagonal        

# IA
    for i in range(len([i for i in Basis_Indices if i.startswith('I')])):
        for j in range(len([i for i in Basis_Indices if i.startswith('A')])):
            IA.append((f'I{i+1}_A{j+1}',int1[str(g)].loc[f'A{j+1}_I{i+1}']))


# SA
    for i in range(len([i for i in Basis_Indices if i.startswith('A')])):
        for j in range(len([i for i in Basis_Indices if i.startswith('S')])):
            AS.append((f'A{i+1}_S{j+1}',int1[str(g)].loc[f'S{j+1}_A{i+1}']))
           
# SI
    for i in range(len([i for i in Basis_Indices if i.startswith('I')])):
        for j in range(len([i for i in Basis_Indices if i.startswith('S')])):
            SI.append((f'I{i+1}_S{j+1}',int1[str(g)].loc[f'S{j+1}_I{i+1}']))


    
    
    return pd.concat([pd.DataFrame(int1[str(g)],columns=[str(g)]),pd.DataFrame(II,columns=[0,str(g)]).set_index(0),pd.DataFrame(AA,columns=[0,str(g)]).set_index(0),pd.DataFrame(SS,columns=[0,str(g)]).set_index(0),pd.DataFrame(IA,columns=[0,str(g)]).set_index(0),pd.DataFrame(AS,columns=[0,str(g)]).set_index(0),pd.DataFrame(SI,columns=[0,str(g)]).set_index(0)],axis=0)

In [None]:
fxd_1ints=pd.concat([fix1(str(i)) for i in structs],axis=1)

In [None]:
# pd.set_option("precision", 2)
# np.set_printoptions(precision=2)
# pd.options.display.float_format = '{:,.2f}'.format



nmo=len(Basis_Indices)
indice=[]
ad_ind=[]
for ind,i in enumerate(range(nmo)):
    for indx,j in enumerate(range(nmo)):
        ad_ind.append(f'{i+1}_{j+1}')
        if j<=i:
            indice.append(f'{i+1}_{j+1}')

In [None]:
len(indice),len(ad_ind)

In [None]:
import itertools

def gen_MO(k):
    if os.path.exists(os.path.join(os.getcwd(),f'AO_ints/{k}.GMJ_AO_INT.csv'))==True:
        AO_ints=np.genfromtxt(fname=f'AO_ints/{k}.GMJ_AO_INT.csv', dtype='float').reshape(len(indice),nmo*nmo)
        AO_DF=pd.DataFrame(AO_ints,index=indice)
        for ind,i in enumerate(range(nmo)):
            for indx,j in enumerate(range(nmo)):
                if j<=i and i!=j:
                    AO_DF.loc[f'{j+1}_{i+1}']=AO_DF.loc[f'{i+1}_{j+1}']
        AO_DF=AO_DF.sort_index()
        AO_ints=AO_DF.values.reshape(nmo,nmo,nmo,nmo)    
# Use molecular orbital coefficients from hdf5        
        CMO=dict(zip(structs.astype(str),MO_VECTORS))[k].T
#         CMO=pd.read_csv(f'CMO/{k}.GMJ_CMO.csv',header=None).values.reshape(nmo,nmo).T
# Kill Brute force einsum is 100xs faster
        MO=np.einsum("iqrs,ip->pqrs",np.einsum("ijrs,jq->iqrs",np.einsum("ijks,kr->ijrs",np.einsum("ijkl,ls->ijks",AO_ints,CMO),CMO),CMO),CMO)
        # physicist notation
        MO=MO.swapaxes(1, 2)
#     return pd.DataFrame(MO.reshape(nmo**2,nmo**2),index=ad_ind,columns=ad_ind)
    return MO


In [None]:

froz=[indx for indx,i in enumerate(Basis_Indices) if i.startswith('F')]
inact=[indx for indx,i in enumerate(Basis_Indices) if i.startswith('I')]
act=[indx for indx,i in enumerate(Basis_Indices) if i.startswith('A')]
virt=[indx for indx,i in enumerate(Basis_Indices) if i.startswith('S')]


In [None]:
gen_F=dict(zip(structs.astype(str),scf_F))
gen_occ=dict(zip(structs.astype(str),scf_OCC))

In [None]:
(scf_OCC[0]==2).sum()

In [None]:
len(scf_OCC[0])-(scf_OCC[0]==2).sum()

In [None]:

full_set=sorted(set(sum([gen_pair_labels(typ) for typ in typ_exists],[])))

In [None]:
occcc=len(MO_OCC_DF.T.describe().loc['mean'][MO_OCC_DF.T.describe().loc['mean']!=0])
virttt=len(MO_OCC_DF.T.describe().loc['mean'][MO_OCC_DF.T.describe().loc['mean']==0])




In [None]:

class gen_two_ints(object):
# Some of the D_{ij}^{ab}=f_{ii}+f_{jj}-f_{aa}-f_{bb}=e_{ii}+e_{jj}-e_{aa}-e_{bb} elements 
# will be 0 since ij and ab overlap for CASPT2
# ij \in {I,A}
# ab \in {A,V}
# So ignore the warnings since they'll only be 0 when ijab \in {A} only
    import warnings
    warnings.simplefilter('ignore')
    
    def get_MO(self, string):
        return self.MO[self.slice_dict[string[0]], self.slice_dict[string[1]],self.slice_dict[string[2]], self.slice_dict[string[3]]]

    def get_F(self, string):
        return self.F[self.slice_dict[string[0]], self.slice_dict[string[1]]] 
    
    def compute_pairmatrix(self,selft2start,selfdoublecheck):
        test = 2*selft2start*selfdoublecheck
        test -= np.swapaxes(selft2start,2,3)*selft2start
        c=np.sum(test,axis=(2,3))
        return c    
    
    def build_tau(self,t2,t1):
        ttau = t2.copy()
        tmp = np.einsum('ia,jb->ijab', t1, t1,optimize=True)
        ttau += tmp
        return ttau    
    
    def __init__(self):
        nocc=occcc
        nvirt=virttt
#         nocc=(scf_OCC[0]==2).sum()
#         nvirt=(len(scf_OCC[0])-(scf_OCC[0]==2).sum())
        self.nfzc=len(froz)
        self.nocc=nocc
        self.nvirt=nvirt
        self.nmo=nocc+nvirt
#         all_nmo=act+virt
        nmo=nocc+nvirt
        self.slice_o = slice(0,nocc)
        self.slice_v = slice(nocc,nocc+nvirt)
        self.slice_a = slice(0,nmo)    

        self.slice_dict = {
            'o': self.slice_o,
            'v': self.slice_v,
            'a': self.slice_a
        }
        
        featurelist=list()
        def gen_feat(k):
            
            values=4
            self.empty=np.zeros((nvirt,))
            self.occupado=np.zeros((nocc,))
            self.MO=gen_MO(k)
            self.F=gen_F[k]
            self.t1=np.zeros((nocc,nvirt))
            
            Focc = self.F[self.slice_o]
            Fvir = self.F[self.slice_v]  
            self.orbocc=Focc
            self.orbvirt=Fvir

            self.Dia = Focc.reshape(-1, 1) - Fvir
            self.Dijab = Focc.reshape(-1, 1, 1, 1) + Focc.reshape(-1, 1, 1) - Fvir.reshape(-1, 1) - Fvir 
# Clean up AA block to get rid of infinities, we do not need this anyway              

            self.t2start=self.MO[self.slice_o, self.slice_o, self.slice_v,self.slice_v] / self.Dijab
            self.t2start[np.isinf(self.t2start)]=0
        
            self.triplecheck=2*self.t2start*self.get_MO('oovv')
            self.triplecheck -=  np.swapaxes(self.get_MO('oovv'),2,3)*self.t2start 
            
            
            self.doublecheck = self.MO[self.slice_o, self.slice_o, self.slice_v, self.slice_v]   
# Pair energy and pairs is not reasonable            
            self.pairenergy=(np.zeros(self.doublecheck.shape)+self.compute_pairmatrix(self.t2start,self.doublecheck)[:,:,np.newaxis,np.newaxis])
#             print(self.pairenergy,np.einsum('ijab->',self.pairenergy))
            tmp_tau = self.build_tau(self.t2start,self.t1)
            self.pairs=2*tmp_tau*self.get_MO('oovv')
            self.pairs-= np.swapaxes(self.get_MO('oovv'),2,3)*tmp_tau
            self.pairs = np.sum(self.pairs,axis=(2,3))
            
#             print(self.pairs,np.einsum('ij->',self.pairs))

            
            
            test=np.zeros(self.t2start.shape)
            self.diag=test
            for i in range (0,self.nocc):
                for j in range (0,self.nocc):
                    np.fill_diagonal(self.diag[i,j,:,:],1)
                    
           #I think those features are not great, try <ii,aa>
            temp=np.zeros((self.nocc,self.nvirt))
            for i in range (0,self.nocc):
                for j in range (0,self.nvirt):
                    temp[i,j]=self.doublecheck[i,i,j,j]
            test1=np.zeros((self.t2start.shape))
            self.screen1=test1+temp[:,np.newaxis,:,np.newaxis]
            self.screen2=test1+temp[np.newaxis,:,np.newaxis,:]
# nfzc=# frozen core
            val=self.nmo-self.nfzc
            temp=np.zeros((val,val))        
            for i in range (0,val):
                for j in range (0,val):
                    temp[i,j]=self.MO[i,i,j,j]
            temp =temp[self.slice_v,self.slice_v]
            self.screenvirt=test1+temp[np.newaxis,np.newaxis,:,:]

            b=self.triplecheck
            diag_indx=[]
            off_diag_indx=[]      

            for i in range(b.shape[0]):
                for j in range (b.shape[1]):
                    featurelist.clear()                    
                    ind=np.argsort(b[i,j].flatten(),axis=0)
                    new=np.sum(b[i,j])#0
                    
                    featurelist.append('Pair_Energy')
                    new=np.hstack((new,self.MO[i,i,j,j]))
                    
                    featurelist.append('coulomb_ij')
                    new=np.hstack((new,np.take_along_axis(self.screen1[i,j].flatten(), ind, axis=0)[:values]))#1,2,3,4
                    
                    featurelist.append('i_screen1')
                    new=np.hstack((new,np.take_along_axis(self.screen2[i,j].flatten(), ind, axis=0)[:values]))#9,10,11,12
                    
                    featurelist.append('j_screen1')
                    new=np.hstack((new,np.take_along_axis(self.screenvirt[i,j].flatten(), ind, axis=0)[:values]))#9,10,11,12
                    
                    featurelist.append('triplecheck1')                    
                    new=np.hstack((new,np.take_along_axis(b[i,j].flatten(), ind, axis=0)[:values]))#17,18,19,20
                    if i==j:
#                         print(Basis_Indices[i],Basis_Indices[j])
                        ind=np.argsort(b[i,j].flatten(),axis=0)
                        
# Eij:dim(1)            
                        featurelist.append('MP2_Pair_Energy')
                        new=np.sum(b[i,j])#0
# <ii||jj>:dim(1)        
                        new=np.hstack((new,self.MO[i,i,j,j]))
                        featurelist.append('coulomb_ij')    
# e_ij^ab:dim(4)                  
                        featurelist.append('coulomb_ij')
                        new=np.hstack((new,np.take_along_axis(self.screen1[i,j].flatten(), ind, axis=0)[:values]))#1,2,3,4
# <ij||ab>:dim(4)
                        featurelist.append('i_screen1')
                        new=np.hstack((new,np.take_along_axis(self.screen2[i,j].flatten(), ind, axis=0)[:values]))#9,10,11,12
# <ii||aa>:dim(4)                       
                        featurelist.append('j_screen1')
                        new=np.hstack((new,np.take_along_axis(self.screen1[i,j].flatten(), ind, axis=0)[-values:]))#1,2,3,4
# <jj||bb>:dim(4)                      
                        featurelist.append('i_screen1')
                        new=np.hstack((new,np.take_along_axis(self.screen2[i,j].flatten(), ind, axis=0)[-values:]))#9,10,11,12
# <aa||bb>:dim(4)                      
                        featurelist.append('j_screen1')
                        new=np.hstack((new,np.take_along_axis(b[i,j].flatten(), ind, axis=0)[:values]))#17,18,19,20
# t2 matrix:dim(4)                      
                        new=np.hstack((new,np.take_along_axis(b[i,j].flatten(), ind, axis=0)[-values:]))#17,18,19,20
# missing Eij:dim(1)                      
                        one=np.sum(np.take_along_axis(b[i,j].flatten(), ind, axis=0)[-values:])
                        two=np.sum(np.take_along_axis(b[i,j].flatten(), ind, axis=0)[:values])
                        new=np.hstack((new, np.sum(b[i,j])-one-two))
                        featurelist.append('triplecheck1')


                    else:
#                         print(Basis_Indices[i],Basis_Indices[j])
                        ind=np.argsort(b[i,j].flatten(),axis=0)
# Eij:dim(1)                        
                        new=np.sum(b[i,j])#0
                        featurelist.append('Pair_Energy')
# <ii||jj>:dim(1)        
                        new=np.hstack((new,self.MO[i,i,j,j]))
                        featurelist.append('coulomb_ij')
# e_ij^ab:dim(4)                          
                        new=np.hstack((new,np.take_along_axis(self.screen1[i,j].flatten(), ind, axis=0)[:values]))#1,2,3,4
                        featurelist.append('i_screen1')
# <ij||ab>:dim(4)        
                        new=np.hstack((new,np.take_along_axis(self.screen2[i,j].flatten(), ind, axis=0)[:values]))#9,10,11,12
                        featurelist.append('j_screen1')
                        featurelist.append('triplecheck1')
# <ii||aa>:dim(4)            
                        new=np.hstack((new,np.take_along_axis(self.screen1[i,j].flatten(), ind, axis=0)[-values:]))#1,2,3,4
                        featurelist.append('i_screen1')
# <jj||bb>:dim(4)                              
                        new=np.hstack((new,np.take_along_axis(self.screen2[i,j].flatten(), ind, axis=0)[-values:]))#9,10,11,12
                        featurelist.append('j_screen1')
# <aa||bb>:dim(4)                        
                        new=np.hstack((new,np.take_along_axis(b[i,j].flatten(), ind, axis=0)[:values]))#17,18,19,20
# t2 matrix:dim(4)    
                        new=np.hstack((new,np.take_along_axis(b[i,j].flatten(), ind, axis=0)[-values:]))#17,18,19,20
                        one=np.sum(np.take_along_axis(b[i,j].flatten(), ind, axis=0)[-values:])
                        two=np.sum(np.take_along_axis(b[i,j].flatten(), ind, axis=0)[:values])
# missing Eij:dim(1)            
                        new=np.hstack((new, np.sum(b[i,j])-one-two))
                        featurelist.append('triplecheck1')

          
                    if ((i==0) and (j==0)):
                        diag_indx.append(Basis_Indices[i]+'_'+Basis_Indices[j])
                        self.a=new.copy()
                    elif ((i==0) and (j==1)):
                        off_diag_indx.append(Basis_Indices[i]+'_'+Basis_Indices[j])
                        self.g=new.copy()
                    elif (i==j):
                        diag_indx.append(Basis_Indices[i]+'_'+Basis_Indices[j])
                        self.a=np.vstack((self.a,new))#41
                    elif i!=j:  
                        off_diag_indx.append(Basis_Indices[i]+'_'+Basis_Indices[j])        
                        self.g=np.vstack((self.g,new))#41


            
            return pd.concat([pd.DataFrame(self.a,index=diag_indx),pd.DataFrame(self.g,index=off_diag_indx)]).loc[full_set]

        
        self.dict=dict([(k,gen_feat(k))for k in structs.astype(str)])




In [None]:
def elements(x):
    '''
    Takes an integer, x, and returns the number of off-diagonal elements of an upper triangular matrix
    f(x)=(x*(x-1))/2
    '''
    return (x*(x-1))/2

In [None]:
full_set

In [None]:

def gen_two_el():
# Eij:dim(1)                        
# <ii||jj>:dim(1)        
# e_ij^ab:dim(4)                          
# <ij||ab>:dim(4)        
# <ii||aa>:dim(4)            
# <jj||bb>:dim(4)                              
# <aa||bb>:dim(4)                        
# t2 matrix:dim(4)    
# missing Eij:dim(1)            
    return gen_two_ints().dict


def gen_one_diag_int():
    keys=[] 
    featind=[]
    h_qq=[]
    h_ss=[]
    for i in structs:
        k=str(i)
        keys.append(k)
        ints=fxd_1ints[k]
    # Epq Ers
    # # e_q+e_s-e_p-e_r
    # TIUV       
    #'A'+g[0]+'_I'+g[1]+''+g[2]+''+g[3]
    # Eti Euv (E01 E23)
    # e_i+e_v-e_u-e_t
    # e[0] + e[3] - e[1] - e[2]
        for ind,g in enumerate(full_set):
            featind.append(g)            
            idx=g.split('_')
            q=idx[0]
            s=idx[1]
            h_qq.append(ints.loc[str(q+'_'+q)])
            h_ss.append(ints.loc[str(s+'_'+s)])            
    return dict([(z,pd.DataFrame({'h_qq':np.array(h_qq).reshape(len(structs),-1)[idx],'h_ss':np.array(h_ss).reshape(len(structs),-1)[idx]},index=np.array(featind).reshape(len(structs),-1)[idx])) for idx,z in enumerate(keys)])                          

def gen_one_off_diag_int():
    keys=[] 
    featind=[]
    h_qs=[]
    for i in structs:
        k=str(i)
        keys.append(k)
        ints=fxd_1ints[k]
    # Epq Ers
    # # e_q+e_s-e_p-e_r
    # TIUV       
    #'A'+g[0]+'_I'+g[1]+''+g[2]+''+g[3]
    # Eti Euv (E01 E23)
    # e_i+e_v-e_u-e_t
    # e[0] + e[3] - e[1] - e[2]
        for ind,g in enumerate(full_set):
            featind.append(g)            
            idx=g.split('_')
            q=idx[0]
            s=idx[1]
            h_qs.append(ints.loc[str(q+'_'+s)]) 
                        
    return dict([(z,pd.DataFrame({'h_qs':np.array(h_qs).reshape(len(structs),-1)[idx]},index=np.array(featind).reshape(len(structs),-1)[idx])) for idx,z in enumerate(keys)])                          

def gen_bin():
    # FSO=From same orbital
    FSO=[]
    # Indexing
    featind=[]
    # Keys=Structs in string format
    keys=[]
    for i in structs:
        k=str(i)
        keys.append(k)
        for ind,g in enumerate(full_set):
# Epq Ers
# # e_q+e_s-e_p-e_r
# TIUV
#'A'+g[0]+'_I'+g[1]+''+g[2]+''+g[3]
# Eti Euv (E01 E23)
# e_i+e_v-e_u-e_t
# e[0] + e[3] - e[1] - e[2]
            featind.append(g)
            idx=g.split('_')
            q=idx[0]
            s=idx[1]
# Since I!=A just append 0 since they'll never both come from the same orbital                
            if q==s:
                FSO.append(1)
            else:
                FSO.append(0)
    return dict([(z,pd.DataFrame({'From_Same_Orbital':np.array(FSO).reshape(len(structs),-1)[idx]},index=np.array(featind).reshape(len(structs),-1)[idx])) for idx,z in enumerate(keys)])               

def gen_dbl_index():
    MO_Fock_Mat_Q_Q=[]
    MO_Fock_Mat_S_S=[]
    MO_Fock_Mat_Q_S=[]
    featind=[]
    keys=[]
    for i in structs:
        k=str(i)
        keys.append(k)
        for ind,g in enumerate(full_set):
# Epq Ers
# # e_q+e_s-e_p-e_r
# TIUV
#'A'+g[0]+'_I'+g[1]+''+g[2]+''+g[3]
# Eti Euv (E01 E23)
# e_i+e_v-e_u-e_t
# e[0] + e[3] - e[1] - e[2]
            featind.append(g)
            idx=g.split('_')
            q=idx[0]
            s=idx[1]
            MO_Fock_Mat_Q_Q.append(labeled_fock_dict[k][q][q])            
            MO_Fock_Mat_S_S.append(labeled_fock_dict[k][s][s])
            MO_Fock_Mat_Q_S.append(labeled_fock_dict[k][q][s])
    return dict([(z,pd.DataFrame({'MO_Fock_Mat_Q_Q':np.array(MO_Fock_Mat_Q_Q).reshape(len(structs),-1)[idx],'MO_Fock_Mat_S_S':np.array(MO_Fock_Mat_S_S).reshape(len(structs),-1)[idx],'MO_Fock_Mat_Q_S':np.array(MO_Fock_Mat_Q_S).reshape(len(structs),-1)[idx]},index=np.array(featind).reshape(len(structs),-1)[idx])) for idx,z in enumerate(keys)])                                       

def gen_sif():
    MO_Energy_Q=[]
    MO_Energy_S=[]
    MO_OCC_Q=[]
    MO_OCC_S=[]
    featind=[]
    # Keys=Structs in string format
    keys=[]
    for i in structs:
        k=str(i)
        keys.append(k)
        for ind,g in enumerate(full_set):
# Epq Ers
# # e_q+e_s-e_p-e_r
# TIUV
#'A'+g[0]+'_I'+g[1]+''+g[2]+''+g[3]
# Eti Euv (E01 E23)
# e_i+e_v-e_u-e_t
# e[0] + e[3] - e[1] - e[2]
            featind.append(g)
            idx=g.split('_')
            q=idx[0]
            s=idx[1]
            MO_Energy_Q.append(float(New_MO_Dict[k][q]))
            MO_Energy_S.append(float(New_MO_Dict[k][s]))
            MO_OCC_Q.append(float(MO_OCC_DF[k][q]))
            MO_OCC_S.append(float(MO_OCC_DF[k][s]))
            
    return dict([(z,pd.DataFrame({'MO_Energy_Q':np.array(MO_Energy_Q).reshape(len(structs),-1)[idx],'MO_Energy_S':np.array(MO_Energy_S).reshape(len(structs),-1)[idx],'MO_OCC_Q':np.array(MO_OCC_Q).reshape(len(structs),-1)[idx],'MO_OCC_S':np.array(MO_OCC_S).reshape(len(structs),-1)[idx]},index=np.array(featind).reshape(len(structs),-1)[idx])) for idx,z in enumerate(keys)]) 







In [None]:


# gen_names=['MP2_pair', 'MP2_Denom', 
#            'MP2_amp', 'MP2_amp_mag', 'MP2_amp_sign',
#            'From_Same_Orbital', 'To_Same_Orbital', 
#            'Jia1', 'Jia2', 'Kia1', 'Kia2', 
#            'two_el',  
#            'Jia1mag', 'Jia2mag', 'Kia1mag', 'Kia2mag', 
#            'Jpp', 'Jqq', 'Jrr', 'Jss', 
#            'Kpp', 'Kqq', 'Krr', 'Kss', 
#            'h_pp', 'h_qq', 'h_rr', 'h_ss', 
#            'h_qp', 'h_sr', 'h_qr', 'h_sp', 
#            'MO_Fock_Mat_P_P', 'MO_Fock_Mat_Q_Q', 'MO_Fock_Mat_S_S', 'MO_Fock_Mat_R_R', 
#            'MO_Fock_Mat_Q_P', 'MO_Fock_Mat_S_R', 'MO_Fock_Mat_Q_R', 'MO_Fock_Mat_S_P', 
#            'MO_Energy_P', 'MO_Energy_Q', 'MO_Energy_R', 'MO_Energy_S', 
#            'MO_OCC_P', 'MO_OCC_Q', 'MO_OCC_R', 'MO_OCC_S']

In [None]:
with open('keys.pickle', 'wb') as handle:
    pickle.dump([str(i) for i in structs], handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# '''
# First Filter Step: Just the contributers
# '''

# # eq_dict=dict(zip(list([str(i) for i in np.arange(1,14,1)]),[i.split()[1] for i in (grep['-i','-A 14', 'Correlation energy /Case, /Symm, and sums:','/'.join((os.getcwd(),'120_dir/PT2_iron_oxo_120.output'))])().split('\n')[1:-2]]))
# eq_dict=dict(zip(list([str(i) for i in np.arange(1,14,1)]),[i.split()[1] for i in (grep['-i','-A 14', 'Correlation energy /Case, /Symm, and sums:','/'.join((os.getcwd(),f'eq_orbs.output'))])().split('\n')[1:-2]]))

# type_drop=[]
# if float(eq_dict['1'])==0:
#     type_drop.append('A')
# if float(eq_dict['2'])+float(eq_dict['3'])==0:
#     type_drop.append('B')    
# if float(eq_dict['4'])==0:
#     type_drop.append('C')    
# if float(eq_dict['5'])==0:
#     type_drop.append('D')    
# if float(eq_dict['6'])+float(eq_dict['7'])==0:
#     type_drop.append('E')    
# if float(eq_dict['8'])+float(eq_dict['9'])==0:
#     type_drop.append('F')    
# if float(eq_dict['10'])+float(eq_dict['11'])==0:
#     type_drop.append('G')    
# if float(eq_dict['12'])+float(eq_dict['13'])==0:
#     type_drop.append('H')    
    
# dummytyp=sorted([i for i in list(set([i.split('_')[0] for i in typ_exists])-set(type_drop))])

# with open('type_drop.pickle', 'wb') as handle:
#     pickle.dump(dummytyp, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [None]:

def Big_Data_GS():
    t0=time()
    with open('feats.pickle', 'wb') as handle:
        pickle.dump(pd.concat([pd.concat({k: v for k,v in gen_bin().items()},axis=0),pd.concat({k: v for k,v in gen_two_el().items()},axis=0),pd.concat({k: v for k,v in gen_one_diag_int().items()},axis=0),pd.concat({k: v for k,v in gen_one_off_diag_int().items()},axis=0),pd.concat({k: v for k,v in gen_dbl_index().items()},axis=0),pd.concat({k: v for k,v in gen_sif().items()},axis=0)],axis=1), handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('targets.pickle', 'wb') as handle:
        pickle.dump(stacked_pairs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(time()-t0)


In [None]:
Big_Data_GS()


In [None]:
# pd.read_pickle(f'typH_feats.pickle'),
# pd.read_pickle(f'typH_targets.pickle').plot()