In [20]:
import csv
import json
import flatten_json
from tqdm.auto import tqdm

In [17]:
def file_to_headers(in_path):
    """Return the column names found in the first row of a tab-separated file.

    Args:
        in_path: Path of the UTF-8 encoded, tab-delimited file to inspect.

    Returns:
        The fields of the first row as a list of strings, or an empty list
        when the file contains no rows at all.
    """
    # The context manager closes the handle even if reading the row fails;
    # newline="" is how the csv module expects its input files to be opened.
    with open(in_path, encoding="utf-8", newline="") as in_file:
        reader = csv.reader(in_file, delimiter='\t')
        # Default of [] avoids leaking a bare StopIteration for an empty file.
        return next(reader, [])

# Read the column names from the first row of the tab-separated dump file.
# `original_headers` is reused by later cells (row filtering, column ordering).
in_path = 'laborwerte/dump.txt'
original_headers = file_to_headers(in_path)
print(original_headers)

['version', 'reportnr_labor_ano', 'patnr_ano', 'fallnr_ano', 'dat_sekunden_nach_fall_eindat', 'content_ano']


In [21]:
def file_to_list(in_path, error_path):
    """Parse the tab-separated dump at ``in_path`` into a list of flat dicts.

    Every row contributes those of its columns that are listed in the
    module-level ``original_headers[:7]``, merged with the flattened JSON
    document stored in its ``content_ano`` column (apostrophes are stripped
    from that column before it is parsed).

    Rows that cannot be processed are skipped: their raw ``content_ano``
    value is written to ``error_path`` (one record per line) and a message
    naming the report is printed.

    Args:
        in_path: Path of the UTF-8 encoded, tab-delimited input file.
        error_path: Path of the file that receives unparseable content;
            it is truncated on every call.

    Returns:
        A list with one dict per successfully processed row.
    """
    reports = []
    kept_columns = original_headers[:7]
    # utf-8 matches how file_to_headers reads the same dump; newline="" is
    # how the csv module expects its input files to be opened.
    with open(in_path, encoding="utf-8", newline="") as csv_file, \
            open(error_path, 'w', encoding="utf-8") as error_file:
        reader = csv.DictReader(csv_file, delimiter='\t')
        for row in tqdm(reader):
            report = {k: row[k] for k in row if k in kept_columns}
            try:
                pruned = row['content_ano'].replace("'", "")
                j = json.loads(pruned)
                j = flatten_json.flatten(j)
                report.update(j)
                reports.append(report)
            except Exception as e:
                # Best-effort load: a bad row (missing/short column, invalid
                # JSON, non-dict document) is recorded and skipped, not fatal.
                # The original `except Error` named an undefined class and so
                # raised NameError instead of reaching this handler.
                error_file.write((row.get('content_ano') or '') + "\n")
                print("Error during loading {}: {}".format(row.get('reportnr_labor_ano'), e))
    return reports

# Load every row of the dump into `reports`; `error_path` is the file that
# file_to_list opens for the content of rows it fails to parse.
in_path = 'laborwerte/dump.txt'
error_path = 'errors.txt'
reports = file_to_list(in_path, error_path)

0it [00:00, ?it/s]

In [22]:
def list_to_headers(haystack):
    """Return the set of all keys that occur in any dict of ``haystack``."""
    seen = set()
    for entry in haystack:
        seen.update(entry.keys())
    return seen

def order_headers(headers, order):
    """Return ``order`` followed by the remaining headers in sorted order.

    Headers already present in ``order`` are not repeated, and duplicates
    among the remaining headers are collapsed. ``order`` itself is not
    modified; a new list is returned.
    """
    leftover = set()
    for name in headers:
        if name not in order:
            leftover.add(name)
    tail = sorted(leftover)
    return order + tail

# Collect every key that occurs in any report, then put the original columns
# (all but the last one) first and the remaining keys after them, sorted.
all_headers = list_to_headers(reports)
ordered_headers = order_headers(all_headers, original_headers[:-1])
print(ordered_headers[:15])

['version', 'reportnr_labor_ano', 'patnr_ano', 'fallnr_ano', 'dat_sekunden_nach_fall_eindat', '5HIEQUO:STD_DAT', '5HIEQUO:STD_DIM', '5HIEQUO:STD_EINDAT', '5HIEQUO:STD_NAM', '5HIEQUO:STD_REF', '5HIEQUO:STD_RES', '5HIEQUO:STD_RESDAT', '5HIEQUO:STD_STAT', '5HIEX:STD_DAT', '5HIEX:STD_DIM']


In [23]:
def list_to_file(reports, out_path, headers):
    """Write all reports to one comma-separated file.

    Args:
        reports: Iterable of dicts; each becomes one CSV row. Keys missing
            from a report are left empty in its row.
        out_path: Destination path; an existing file is overwritten.
        headers: Column names, in output order. Every key used by any
            report must be listed, otherwise ``csv.DictWriter`` raises
            ``ValueError``.
    """
    # newline="" lets the csv module control line endings itself (without it
    # Windows output gets an extra blank line after every row); utf-8 keeps
    # the result independent of the platform's default encoding.
    with open(out_path, 'w', encoding="utf-8", newline="") as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=headers, delimiter=",", quotechar='"')
        dict_writer.writeheader()
        dict_writer.writerows(tqdm(reports))

# Write all reports into a single CSV file, using the ordered header list as columns.
out_path = 'laborwerte_converted.csv'
list_to_file(reports, out_path, ordered_headers)

  0%|          | 0/133479 [00:00<?, ?it/s]

In [24]:
def tidy_list(reports):
    """Strip every report down to its non-empty string values, in place.

    Entries whose value is not a ``str``, or is the empty string, are
    removed from each dictionary. Nothing is returned.
    """
    for entry in reports:
        # Collect the keys first: a dict must not change size while iterated.
        stale_keys = [
            key
            for key, value in entry.items()
            if not isinstance(value, str) or value == ""
        ]
        for key in stale_keys:
            del entry[key]
            
# Drop non-string and empty-string values from every report; `reports` is modified in place.
tidy_list(reports)

In [27]:
import os
def list_to_files(reports, out_directory, headers):
    """Write each report to its own text file inside ``out_directory``.

    The file is named ``<reportnr_labor_ano>.txt``. For every header that the
    report contains, the upper-cased header is written on one line, the value
    on the next, followed by a blank line.

    Args:
        reports: Iterable of dicts; each must contain a string
            ``'reportnr_labor_ano'`` entry, which is used as the file name.
        out_directory: Target directory; created if it does not exist yet.
        headers: Keys to emit, in output order.

    Raises:
        KeyError: If a report has no ``'reportnr_labor_ano'`` entry.
    """
    # Without this, the first open() fails when the directory is missing.
    os.makedirs(out_directory, exist_ok=True)
    for report in tqdm(reports):
        # NOTE: the original comment says to use 'reportnr_ano' as the key
        # for the "berichte" (reports) input instead.
        out_path = os.path.join(out_directory, report['reportnr_labor_ano'] + ".txt")
        with open(out_path, 'w', encoding="utf-8") as out_file:
            for header in headers:
                if header in report:
                    value = report[header]
                    if not isinstance(value, str):
                        # Previously such a value was printed and then crashed
                        # the string concatenation below with a TypeError.
                        value = str(value)
                    out_file.write(header.upper() + "\n")
                    out_file.write(value + "\n\n")

# Write one text file per report into `out_directory`, emitting fields in header order.
out_directory = 'laborwerte_converted'
list_to_files(reports, out_directory, ordered_headers)

  0%|          | 0/133479 [00:00<?, ?it/s]