In [2]:
import re
import pandas as pd
import numpy as np

In [3]:
SECTION_REPLACEMENTS = {
    "Sample parameters": "entry/sample",
    "Measurements": "entry/measurement"
}

MEASUREMENT_REPLACEMENTS = {
    "IV Curve Measurement": "iv_curve",
    "Variable Field Measurement": "variable_field"
}

MEASUREMENT_KEYS = ['Contact Sets']

def is_section(expr):
    return bool(re.search(r'^\[.+\]$', expr))

def is_measurement(expr):
    return bool(re.search(r'^\<.+\>$', expr))

def is_key(expr):
    return bool(re.search(r'^.+\s*[:|=]\s*.+$', expr))

def is_meas_header(expr):
    return bool(re.search(r'^[^\]]+\[[^\]]+\]', expr))

def get_unique_dkey(dic, dkey):
    suffix = 0
    while f'{dkey}{suffix}' in dic:
        suffix += 1
    
    return f'{dkey}{suffix}'

def split_add_key(fobj, dic, prefix, expr):
    key, *val = re.split('\s*[:|=]\s*', expr)
    jval = ''.join(val).strip()
    
    if key in MEASUREMENT_KEYS:
        data = []
        for line in fobj:
            if not line.strip():
                break
            if is_key(line):
                split_add_key(None,# There should be no deeper measurement, prevent further consum of lines 
                              dic, 
                              f'{prefix}/{key}/{jval}',
                              line)
            else:
                data.append(list(map(lambda x: x.strip(), re.split('\t+', line))))
        
        dkey = get_unique_dkey(dic,
                               f"{prefix}/{key}/{jval}/data")
        dic[dkey] = pd.DataFrame(
            np.array(data[1:], dtype=np.float64), columns=data[0]
        )
    else:
        dic[f'{prefix}/{key}'] = jval
        

def read_template_from_file(fname, encoding="iso-8859-1"):
    template = {}
    current_section = '/entry'
    current_measurement = ''
    with open(fname, encoding=encoding) as f:
        for line in f:
            if is_section(line):
                sline = line.strip()[1:-1]
                current_section = f'/{SECTION_REPLACEMENTS.get(sline, sline)}'
                current_measurement = ''
            elif is_measurement(line):
                step, _, *meas = line.partition(":")
                sline = f'{step[6:]}_' + ''.join(meas).strip()[:-1]
                current_measurement = f'/{MEASUREMENT_REPLACEMENTS.get(sline, sline)}'
            elif is_key(line):
                split_add_key(f, template,
                              f'{current_section}{current_measurement}',
                              line)
            elif is_meas_header(line):
                data = []
                for mline in f:
                    if not mline.strip():
                        break
                    data.append(list(map(lambda x: x.strip(), re.split('\t+', mline))))

                header = list(map(lambda x: x.strip(), re.split('\t+', line)))
                dkey = get_unique_dkey(template,
                                       f'{current_section}{current_measurement}/data')
                template[dkey] = \
                    pd.DataFrame(np.array(data, dtype=np.float64),
                                 columns=header)
                
    return template
            
read_template_from_file('22-127-G_Hall-RT_TT-Halter.txt')
read_template_from_file('22-127-G_20K-320K_TT-Halter_WDH_060722.txt')

/entry/measurement/2_Variable Field Measurement/data0
/entry/measurement/1_Variable Temperature Measurement/data0
/entry/measurement/1_Variable Temperature Measurement/data1
/entry/measurement/2_Variable Temperature Measurement/data0
/entry/measurement/2_Variable Temperature Measurement/data1
/entry/measurement/3_Variable Temperature Measurement/data0
/entry/measurement/3_Variable Temperature Measurement/data1
/entry/measurement/4_Variable Temperature Measurement/data0
/entry/measurement/4_Variable Temperature Measurement/data1
/entry/measurement/5_Variable Temperature Measurement/data0
/entry/measurement/5_Variable Temperature Measurement/data1
/entry/measurement/6_Variable Temperature Measurement/data0
/entry/measurement/6_Variable Temperature Measurement/data1


{'/entry/sample/Sample Type': 'van der Pauw',
 '/entry/sample/Hall Factor': '1.0',
 '/entry/sample/Thickness': '1.8 [µm]',
 '/entry/sample/L': '15.0 [mm]',
 '/entry/sample/Depletion Layer Correction': 'Off',
 '/entry/measurement/1_Variable Temperature Measurement/Start Time': '07/06/22 151551',
 '/entry/measurement/1_Variable Temperature Measurement/Time Completed': '07/06/22 160752',
 '/entry/measurement/1_Variable Temperature Measurement/Elapsed Time': '0521',
 '/entry/measurement/1_Variable Temperature Measurement/Starting Temperature': '20.0 [K]',
 '/entry/measurement/1_Variable Temperature Measurement/Ending Temperature': '28.0 [K]',
 '/entry/measurement/1_Variable Temperature Measurement/Spacing': 'Linear Spacing',
 '/entry/measurement/1_Variable Temperature Measurement/Temperature Step': '2.0 [K]',
 '/entry/measurement/1_Variable Temperature Measurement/Field at': '3.0 [kG]',
 '/entry/measurement/1_Variable Temperature Measurement/Measurement Type': 'Hall and Resistivity Measure