In [1]:
import pandas as pd
import pathlib

In [2]:
def getLines(f):
    """Return the lines of file *f* as a list, each stripped of surrounding whitespace.

    Blank lines are kept and appear as empty strings.
    """
    stripped = []
    with open(f) as handle:
        # Iterating the handle yields the same lines as readlines().
        for raw in handle:
            stripped.append(raw.strip())
    return stripped

def getEnergy(out):
    """Return the total energy reported in output file *out*.

    The value is the fifth whitespace-separated token of the first line
    that starts with 'FINAL SINGLE POINT ENERGY', converted to float.
    Returns None when no such line exists.
    """
    marker = 'FINAL SINGLE POINT ENERGY'
    matches = (text for text in getLines(out) if text.startswith(marker))
    first = next(matches, None)
    if first is None:
        return None
    return float(first.split()[4])

def getNumCores(inp):
    """Returns number of CPU cores from input file.

    Looks for a ``%pal`` block and returns the integer that follows its
    ``nprocs`` keyword. Both the single-line form (``%pal nprocs 4 end``)
    and the form spread over several lines are handled::

        %pal
          nprocs 4
        end

    Matching is case-insensitive (assumes the input format treats
    keywords case-insensitively -- confirm against the ORCA manual).

    Args:
        inp: Path of the input file.

    Returns:
        The number of cores as an int, or None if no ``%pal`` block with
        an ``nprocs`` value is found.

    Raises:
        ValueError: If the token after ``nprocs`` is not an integer.
    """
    inside_pal = False
    for line in getLines(inp):
        tokens = line.lower().split()
        if not inside_pal:
            # Skip everything until the line that opens the %pal block.
            if not tokens or not tokens[0].startswith('%pal'):
                continue
            inside_pal = True
        if 'nprocs' in tokens:
            value_pos = tokens.index('nprocs') + 1
            if value_pos < len(tokens):
                return int(tokens[value_pos])
        if 'end' in tokens:
            # Block closed without an nprocs value; keep scanning in case
            # another %pal block follows.
            inside_pal = False
    return None

def getNumSCFIterations(out):
    """Return the SCF iteration count reported in output file *out*.

    The value is the fifth whitespace-separated token of the first line
    containing 'SCF CONVERGED AFTER', converted to int. Returns None when
    no such line exists.
    """
    marker = 'SCF CONVERGED AFTER'
    matches = (text for text in getLines(out) if marker in text)
    first = next(matches, None)
    if first is None:
        return None
    return int(first.split()[4])

def getSCFTiming(out):
    """Return the SCF module timing read from output file *out*.

    Finds the first line starting with 'TIMINGS', moves a fixed five
    lines below it and returns the fourth whitespace-separated token of
    that line as a float (presumably seconds -- confirm against the
    output format). Returns None when there is no 'TIMINGS' line.
    """
    rows = getLines(out)
    header_at = next(
        (pos for pos, text in enumerate(rows) if text.startswith('TIMINGS')),
        None,
    )
    if header_at is None:
        return None
    # The value sits a fixed number of lines below the section header.
    timing_row = rows[header_at + 5]
    return float(timing_row.split()[3])

def getWalltime(out):
    """Return the total wall time in seconds from output file *out*.

    Scans the file from the end for a line starting with 'TOTAL RUN TIME'
    and combines its days / hours / minutes / seconds / milliseconds
    fields into one float. The line must split into exactly 13 tokens,
    otherwise the unpacking raises ValueError. Returns None when no such
    line exists.
    """
    for entry in getLines(out)[::-1]:
        if not entry.startswith('TOTAL RUN TIME'):
            continue
        # Tokens alternate between values and their unit labels after the
        # three-word 'TOTAL RUN TIME:' prefix.
        (_, _, _, days, _, hours, _, minutes, _,
         seconds, _, millis, _) = entry.split()
        total = float(days)*60*60*24
        total = total + float(hours)*60*60
        total = total + float(minutes)*60
        total = total + float(seconds)
        total = total + float(millis)/1000
        return total
    return None

def getMemoryUsage(log):
    """Returns memory usage parsed from the job log file.

    Finds a line starting with 'Memory usage stats:' and, from there on,
    the first line whose first field ends with '.batch'. The second field
    of that line has its last character dropped (presumably a unit suffix
    such as 'K' -- TODO confirm against the scheduler's log format) and is
    integer-divided by 1,000,000.

    Args:
        log: Path of the log file.

    Returns:
        The scaled memory figure as an int, or None if no matching
        '.batch' line is found.

    Raises:
        ValueError: If the memory field, minus its last character, is not
            an integer.
    """
    lines = getLines(log)
    for i, line in enumerate(lines):
        if not line.startswith('Memory usage stats:'):
            continue
        for subline in lines[i:]:
            fields = subline.split()
            # Blank lines strip to '' and split to []; indexing them
            # directly would raise IndexError, so skip short lines.
            if len(fields) < 2 or not fields[0].endswith('.batch'):
                continue
            return int(fields[1][:-1]) // 1000000
    return None
                
def getORCAVersion(out):
    """Return the program version string from output file *out*.

    The value is the third whitespace-separated token of the first line
    starting with 'Program Version'. Returns None when no such line exists.
    """
    matches = (
        text for text in getLines(out) if text.startswith('Program Version')
    )
    version_line = next(matches, None)
    if version_line is None:
        return None
    return version_line.split()[2]

In [3]:
# Collect one row per input file under ./calcs; the matching .out and .log
# files are expected to sit next to each .inp file.
root = pathlib.Path('').absolute()
inputs = (root / 'calcs').glob('*.inp')
mol = 'Valinomycine'

columns = (
    'BasisSet',
    'Molecule',
    'Energy',
    'NumCores',
    'NumIter',
    'SCFTime',
    'WallTime',
    'Memory',
    'ORCAVersion',
)

table = []
for inp in inputs:
    # The input file's stem is used as the basis-set label.
    basis = inp.stem
    out = inp.with_suffix('.out')
    log = inp.with_suffix('.log')
    row = (
        basis,
        mol,
        getEnergy(out),
        getNumCores(inp),
        getNumSCFIterations(out),
        getSCFTiming(out),
        getWalltime(out),
        getMemoryUsage(log),
        getORCAVersion(out),
    )
    table.append(row)

# Sort by basis-set label and write the result without the index column.
df = pd.DataFrame(table, columns=columns)
df = df.sort_values(by='BasisSet')
df.to_csv('data.csv', index=False)