# Importing InKind from FileMaker

We work from XML exports of the tables in the FileMaker InKind database, one XML file per table.

In [70]:
import os,sys,re,collections
from os.path import splitext, basename
from copy import deepcopy
from functools import reduce
from glob import glob
from lxml import etree

## Table and Field organization

In [71]:
# Name of the database that the generated SQL creates and fills.
DB_NAME = 'inkind_data'

# Per table: target field -> tuple of source fields whose values are merged
# into the target field (the source fields then disappear).
MERGE_FIELDS = dict(
    contrib=dict(
        academic_entity_url=('academic_entity_url_2',),
        contribution_url=('contribution_url_2',),
        contact_person_mail=('contact_person_mail_2',),
    ),
)

# Per table: fields that are dropped from the field definitions and
# ignored when the rows are transformed.
SKIP_FIELDS = dict(
    contrib={
        'teller',
        'whois',
        'help_text',
        'help_description',
        'total_costs_total',
        'goldpassword',
        'gnewpassword',
        'gnewpassword2',
    },
)

# Per source table: new table -> set of fields that are moved out of the
# source table into the new table. The new table also gets a
# `<source table>_id` column.
MOVE_FIELDS = dict(
    contrib=dict(
        assessment={
            'submit',
            'approved',
            'vcchead_approval',
            'vcchead_disapproval',
            'dateandtime_approval',
            'dateandtime_cioapproval',
            'dateandtime_ciozero',
            'vcc_head_decision',
            'vcc_head_decision_vcc11',
            'vcc_head_decision_vcc12',
            'vcc_head_decision_vcc21',
            'vcc_head_decision_vcc22',
            'vcc_head_decision_vcc31',
            'vcc_head_decision_vcc32',
            'vcc_head_decision_vcc41',
            'vcc_head_decision_vcc42',
            'vcc11_name',
            'vcc12_name',
            'vcc21_name',
            'vcc22_name',
            'vcc31_name',
            'vcc32_name',
            'vcc41_name',
            'vcc42_name',
        },
    ),
)

# Per table: field -> type that replaces the type declared in the XML metadata.
# The type must be one of TYPES.
# NOTE(review): getfielddefs() looks overrides up under the name of the table
# being read from XML. The `assessment` fields are still part of `contrib` at
# that point (they are moved later), so the `assessment` entry below looks
# like it is never consulted -- confirm whether these overrides take effect.
FIELD_TYPE_OVERRIDE = dict(
    contrib=dict(
        costs_total='valuta',
        total_costs_total='valuta',
        creation_date_time='datetime',
        modification_date_time='datetime',
    ),
    assessment=dict(
        dateandtime_approval='datetime',
        dateandtime_cioapproval='datetime',
        dateandtime_ciozero='datetime',
    ),
)

# Per table: field -> name of the entry in `splitters` that is used to split
# each raw value of that field into several values.
SPLIT_FIELDS = dict(
    contrib=dict(
        disciplines_associated='generic',
        other_keywords='generic_comma',
        tadirah_research_activities='generic',
        tadirah_research_objects='generic',
        tadirah_research_techniques='generic',
        type_of_inkind='generic',
        vcc='generic',
    ),
)

# Per table: fields whose values are extracted into a separate value table,
# in addition to the fields that can hold more than one value.
VALUE_FIELDS = dict(
    contrib={
        'country',
        'creator',
        'last_modifier',
        'other_type_of_inkind',
        'year',
    },
)

# Per table: field (named after an existing main table) -> the field of that
# main table whose values the field contains. Such a field is re-linked to the
# rows of the main table instead of getting a value table of its own.
FIX_FIELDS = dict(
    contrib=dict(
        country='countrycode',
    ),
)

LIMIT_ROWS = 50 # maximum number of rows to be written in one sql insert statement

## Field types

In [72]:
# The field types for which a conversion to SQL is defined.
TYPES = {'number', 'text', 'valuta', 'date', 'datetime'}

MIN_M = 5       # minimum varchar size = 2**MIN_M
MAX_M = 13      # maximum varchar size = 2**MAX_M

# The patterns are raw strings, so that `\s` reaches the regex engine as
# written instead of being an invalid (and deprecated) string escape.

# two digits / two digits / four digits, leading white space allowed
DATE_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})$'
)
# four digits - two digits - two digits, leading white space allowed
DATE2_PATTERN = re.compile(
    r'^\s*([0-9]{4})-([0-9]{2})-([0-9]{2})$'
)
# a date as in DATE_PATTERN followed by hh:mm with optional :ss
DATETIME_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})\s+([0-9]{2}):([0-9]{2})(?::([0-9]{2}))?$'
)
# Raw values that carry no information and are stored as NULL.
NULL_VALUES = {
    'http://',
    'https://',
}

# Suffix for the name of a value table that still has to be linked to a main table.
TBF = '_tobefixed'

# Separators for fields that hold several values in one string:
# a newline or `+` (and for `generic_comma` also a comma), with surrounding blanks.
splitters = dict(
    generic=re.compile('[ \t]*[\n+][ \t\n]*'),
    generic_comma=re.compile('[ \t]*[\n+,][ \t\n]*'),
)

## Config settings

In [73]:
# Locations

# All working directories live below the user's home directory;
# backslashes are normalized to forward slashes.
HOME_DIR = os.path.expanduser('~').replace('\\', '/')
BASE_DIR = '/'.join((HOME_DIR, 'projects/has/dacs'))
FM_DIR = '/'.join((BASE_DIR, 'fm'))          # the FileMaker XML exports are read from here
TEMP_DIR = '/'.join((BASE_DIR, 'tmp'))       # row dumps are written here
RESULT_DIR = '/'.join((BASE_DIR, 'sql'))     # the generated SQL files are written here

# XML namespace prefix of the elements in the export files.
FMNS = '{http://www.filemaker.com/fmpxmlresult}'

# Path prefixes and extension of the row dumps (raw and transformed);
# the table name is appended per table.
ROW_RAW_FILE = '/'.join((TEMP_DIR, 'row_raw_file'))
ROW_FILE = '/'.join((TEMP_DIR, 'row_file'))
ROW_EXT = 'txt'

## Config checks

In [74]:
# Number of counted warnings since the last call to resetw().
nwarnings = 0

def _emit(stream, text):
    """Write `text` plus a newline to `stream` and flush immediately."""
    stream.write('{}\n'.format(text))
    stream.flush()

def resetw():
    """Set the warning counter back to zero."""
    global nwarnings
    nwarnings = 0

def info(msg):
    """Print a plain message on standard output."""
    _emit(sys.stdout, msg)

def note(msg):
    """Print a message on standard output, marked with `NB:`."""
    _emit(sys.stdout, 'NB: {}'.format(msg))

def warning(msg, count=True):
    """Print a warning on standard error.

    The warning counter is incremented unless `count` is false
    (used for summary lines that should not count as warnings themselves).
    """
    global nwarnings
    _emit(sys.stderr, '!!!!! WARNING: {}'.format(msg))
    if count:
        nwarnings = nwarnings + 1

def finalw():
    """Report how many warnings were counted since the last reset."""
    if nwarnings != 0:
        warning('There were {} warnings'.format(nwarnings), count=False)
        return
    info('OK, no warnings')

def check_config():
    """Check that BASE_DIR exists and make sure TEMP_DIR and RESULT_DIR exist.

    Missing TEMP_DIR / RESULT_DIR are created. Checking stops at the first
    problem, which is reported as a (counted) warning. The outcome is reported
    with either `Configuration OK` or an uncounted summary warning.
    """
    def dirs_ready():
        if not os.path.exists(BASE_DIR):
            warning('BASE_DIR does not exist: {}'.format(BASE_DIR))
            return False
        for cdir in (TEMP_DIR, RESULT_DIR):
            if os.path.exists(cdir):
                continue
            try:
                os.makedirs(cdir)
            except OSError:
                warning('{} could not be created.'.format(cdir))
                return False
        return True

    if dirs_ready():
        info('Configuration OK')
    else:
        warning('There were configuration errors', count=False)

## Value validation and transformation

In [75]:
def date_repl(match):
    """Return the three groups of `match` in reverse order, joined by `-`.

    The groups are taken to be day, month, year, so the result is year-month-day.
    """
    (day, month, year) = match.groups()
    return '{y}-{m}-{d}'.format(y=year, m=month, d=day)
    
def date2_repl(match):
    """Return the three groups of `match` (year, month, day) joined by `-`, in that order."""
    (year, month, day) = match.groups()
    return '{y}-{m}-{d}'.format(y=year, m=month, d=day)
    
def datetime_repl(match):
    """Format the six groups of `match` as `year-month-dayThour:minute:second`.

    The groups arrive as day, month, year, hour, minute, second;
    an absent (or empty) seconds group becomes `00`.
    """
    (day, month, year, hour, minute, second) = match.groups()
    if not second:
        second = '00'
    return '{}-{}-{}T{}:{}:{}'.format(year, month, day, hour, minute, second)
    
def sq(v_raw):
    """Turn a raw text value into a single-quoted SQL string literal.

    The value is stripped of surrounding white space. Single quotes are
    doubled, and tabs and newlines are written as the escapes `\\t` and `\\n`.

    Backslashes that are already present in the data are doubled first.
    Since the output relies on backslash escapes for tab and newline, a
    backslash in the data would otherwise be read as the start of an escape;
    in particular a value ending in a backslash would swallow the closing quote.
    (This assumes the SQL is loaded by a server that interprets backslash
    escapes in string literals, as MySQL does by default.)
    """
    text = v_raw.strip()
    # order matters: backslashes must be doubled before new ones are introduced
    for (plain, escaped) in (
        ('\\', '\\\\'),
        ("'", "''"),
        ('\t', '\\t'),
        ('\n', '\\n'),
    ):
        text = text.replace(plain, escaped)
    return "'{}'".format(text)

def num(v_raw, i, t, fname):
    """Convert a raw value of a number field to an int.

    Surrounding white space and a leading sign are accepted. Only the ASCII
    digits 0-9 count as digits: `str.isdigit()` also accepts characters such
    as superscript digits, on which `int()` raises a ValueError.

    If the value is not an integer, a warning is issued that mentions table
    `t`, field `fname` and record number `i`, and the raw value is returned
    unchanged.
    """
    candidate = v_raw.strip()
    if re.match(r'[+-]?[0-9]+\Z', candidate):
        return int(candidate)
    warning(
        'table `{}` field `{}` record {}: not an integer: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

# Filled by money(). Both map `table:field` to a dict that maps each
# normalized amount to the set of raw values it was derived from.
money_warnings = {}   # amounts that could not be normalized to a decimal number
money_notes = {}      # amounts that contained a `.` or `,` and were normalized

def money(v_raw, i, t, fname):
    """Normalize a raw amount to digits with at most a `.` as decimal point.

    Blanks, non-breaking spaces and euro signs are removed. A `.` or `,` that
    is followed by fewer than three characters at the end of the value is
    taken as the decimal point; all other `.` and `,` are dropped as
    thousands separators.

    If the result is not a decimal number, a warning is issued (mentioning
    table `t`, field `fname` and record number `i`) and the value is recorded
    in `money_warnings`. Otherwise, if the raw value contained a `.` or `,`,
    the conversion is recorded in `money_notes`. The normalized value is
    returned in both cases.
    """
    has_separator = '.' in v_raw or ',' in v_raw
    v = v_raw.strip()
    for junk in (' ', '€', '\u00a0'):
        v = v.replace(junk, '')
    for p in (2, 3):
        if len(v) < p or v[-p] not in '.,':
            continue
        # mark the last occurrence of this separator as the decimal point
        cut = v.rfind(v[-p])
        v = v[:cut] + 'D' + v[cut + 1:]
    v = v.replace('.', '').replace(',', '').replace('D', '.')
    key = '{}:{}'.format(t, fname)
    if not v.replace('.', '').isdigit():
        warning(
            'table `{}` field `{}` record {}: not a decimal number: "{}" <= "{}"'.format(
                t, fname, i, v, v_raw,
        ))
        money_warnings.setdefault(key, {}).setdefault(v, set()).add(v_raw)
    elif has_separator:
        money_notes.setdefault(key, {}).setdefault(v, set()).add(v_raw)
    return v

def dt(v_raw, i, t, fname):
    """Convert a raw value of a date field to a quoted SQL date.

    A value that matches DATE2_PATTERN is rewritten by `date2_repl` and put
    between single quotes. Any other value triggers a warning (mentioning
    table `t`, field `fname` and record number `i`) and is returned unchanged.
    """
    if DATE2_PATTERN.match(v_raw):
        converted = DATE2_PATTERN.sub(date2_repl, v_raw)
        return "'{}'".format(converted)
    warning(
        'table `{}` field `{}` record {}: not a valid date: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

def dtm(v_raw, i, t, fname):
    """Convert a raw value of a datetime field to a quoted SQL datetime.

    A value that matches DATETIME_PATTERN is rewritten by `datetime_repl` and
    put between single quotes. Any other value triggers a warning (mentioning
    table `t`, field `fname` and record number `i`) and is returned unchanged.
    """
    if DATETIME_PATTERN.match(v_raw):
        converted = DATETIME_PATTERN.sub(datetime_repl, v_raw)
        return "'{}'".format(converted)
    warning(
        'table `{}` field `{}` record {}: not a valid date time: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

## Parse the XML file

In [76]:
def read_fm():
    """Parse every `*.xml` file in FM_DIR.

    The table name is the file name without directory and extension.
    The files are processed in sorted order: `glob` returns its results in
    arbitrary order, and the order of the tables determines the order in
    which tables are reported and written to the SQL files.

    Returns a tuple `(root, main_tables_raw)`:
    `root` maps each table name to the root element of its XML document,
    `main_tables_raw` is the list of table names in processing order.
    """
    main_tables_raw = []
    parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
    root = {}
    for infile in sorted(glob('{}/*.xml'.format(FM_DIR))):
        tname = basename(splitext(infile)[0])
        print('Parsing {}'.format(tname))
        root[tname] = etree.parse(infile, parser).getroot()
        main_tables_raw.append(tname)
    return (root, main_tables_raw)

## Process field specs

In [77]:
def check_merge():
    """Invert MERGE_FIELDS for the tables that were read.

    Returns a dict that maps each table to a dict `source field -> target field`.
    A source field that is listed under more than one target is reported as a
    merge error; the last target listed wins.
    """
    merge_errors = 0
    merged = {}

    for t in main_tables_raw:
        spec = MERGE_FIELDS.get(t, {})
        for target in spec:
            for source in spec[target]:
                seen = merged.get(t, {})
                if source in seen:
                    warning(
    'table `{}` field `{}` already merged into `{}` now to be merged into `{}`'.format(
                        t, source, seen[source], target,
                    ))
                    merge_errors += 1
                merged.setdefault(t, {})[source] = target
    if not merge_errors:
        info('Merge definitions OK')
    else:
        warning('There were {} merge errors'.format(merge_errors), count=False)
    return merged

def getfielddefs():
    """Collect the field definitions from the METADATA element of every table.

    Field names are lowercased, with blanks and colons replaced by underscores.
    The type is taken from FIELD_TYPE_OVERRIDE if the field is listed there for
    its table, otherwise from the (lowercased) TYPE attribute. The multiplicity
    is the MAXREPEAT attribute, plus one for fields listed in SPLIT_FIELDS.
    Types outside TYPES are reported as field definition errors.

    Returns a tuple `(tfields, field_defs_raw)`:
    `tfields` maps each table to its field names in document order,
    `field_defs_raw` maps each table to a dict `field name -> [type, multiplicity]`.
    """
    defs = {}
    names = {}
    fd_errors = 0
    for t in main_tables_raw:
        fieldroot = list(root[t].iter(FMNS+'METADATA'))[0]
        names[t] = []
        overrides = FIELD_TYPE_OVERRIDE.get(t, {})
        split_here = SPLIT_FIELDS.get(t, {})
        for x in fieldroot.iter(FMNS+'FIELD'):
            fname = x.get('NAME').lower().replace(' ','_').replace(':', '_')
            ftype = overrides.get(fname, None) or x.get('TYPE').lower()
            fmult = int(x.get('MAXREPEAT'))
            if fname in split_here:
                fmult += 1
            names[t].append(fname)
            defs.setdefault(t, {})[fname] = [ftype, fmult]
            if ftype in TYPES:
                continue
            warning('table `{}` field `{}` has unknown type "{}"'.format(
                t, fname, ftype,
            ))
            fd_errors += 1
        info('Table {:<20}: {:>2} fields'.format(t, len(names[t])))
    if not fd_errors:
        info('Field definitions OK')
    else:
        warning('There were {} field definition errors'.format(fd_errors), count=False)
    return (names, defs)

def check_merge_more():
    """Apply the merges in `merge_fields` to `field_defs_raw`, in place.

    For every source field: the target field is created if needed (with the
    type of the source), the multiplicity of the source is added to that of
    the target, and the definition of the source is deleted.
    An unknown source field or a type difference between source and target
    is reported as a merge error.
    """
    merge_errors = 0
    for t in main_tables_raw:
        for (f, ftarget) in merge_fields.get(t, {}).items():
            defs = field_defs_raw[t]
            if f not in defs:
                warning(
                    'table `{}`: cannot merge unknown field `{}`'.format(
                    t, f,
                ))
                merge_errors += 1
                continue
            (ftype, fmult) = defs[f]
            if ftarget not in defs:
                defs[ftarget] = [ftype, 0]
            (ttype, tmult) = defs[ftarget]
            if ttype != ftype:
                warning(
                    'table `{}` field `{}` of type "{}" is merged into field `{}` of other type "{}"'.format(
                        t, f, ftype, ftarget, ttype,
                ))
                merge_errors += 1
            defs[ftarget][1] += fmult
            del defs[f]
    if not merge_errors:
        info('Merge OK')
    else:
        warning('There were {} merge errors'.format(merge_errors), count=False)

def do_skips():
    """Remove the SKIP_FIELDS from `field_defs_raw` (in place) and list the remaining fields.

    A skip field without a definition is reported as a field skip error.
    Returns a dict that maps each table to the sorted list of its remaining
    field names, including all merge targets of that table.
    """
    remaining = {}
    s_errors = 0
    for t in main_tables_raw:
        for f in SKIP_FIELDS.get(t, set()):
            if f in field_defs_raw[t]:
                del field_defs_raw[t][f]
            else:
                warning('table `{}`: unknown skip field `{}`'.format(t,f))
                s_errors += 1
        names = set(field_defs_raw[t].keys())
        names.update(merge_fields.get(t, {}).values())
        remaining[t] = sorted(names)
    if not s_errors:
        info('Field skips OK')
    else:
        warning('There were {} field skip errors'.format(s_errors), count=False)

    return remaining

## Get the data

In [78]:
def getdata():
    """Read the rows of every table from the RESULTSET element of its XML document.

    A row is a list with one entry per COL element; each entry is the list of
    the texts of the DATA elements in that column. Rows whose number of
    columns differs from the number of fields of the table are reported by
    their row number (the rows are kept nevertheless).

    As a side effect the raw rows of each table are dumped to
    `<ROW_RAW_FILE>_<table>.<ROW_EXT>` for inspection.

    Returns a dict that maps each table to its list of rows.
    """
    rows_raw = {}
    errors = {}

    for t in main_tables_raw:
        dataroot = list(root[t].iter(FMNS+'RESULTSET'))[0]
        rows_raw[t] = []

        for (i, r) in enumerate(dataroot.iter(FMNS+'ROW')):
            row = [
                [x.text for x in c.iter(FMNS+'DATA')]
                for c in r.iter(FMNS+'COL')
            ]
            if len(row) != len(tfields[t]):
                errors.setdefault(t, {}).setdefault('Number of fields', []).append(i)
            rows_raw[t].append(row)

        # the context manager closes the dump even if writing fails half way
        with open('{}_{}.{}'.format(ROW_RAW_FILE, t, ROW_EXT), 'w') as rf:
            for row in rows_raw[t]:
                for (fname, values) in zip(tfields[t], row):
                    rf.write('@{:>30} = {}\n'.format(
                        fname,
                        ' | '.join('{}'.format(v) for v in values),
                    ))
                rf.write('{}\n'.format('='*100))
        info('Table {:<20}: {:>4} rows read'.format(t, len(rows_raw[t])))
    if errors:
        for t in sorted(errors):
            for k in sorted(errors[t]):
                # the row numbers are stored per table and per kind of error,
                # and they are ints, so they must be converted before joining
                warning('table {:<20}: {:<20}: {}'.format(
                    t, k, ','.join(str(i) for i in errors[t][k]),
                ))
    else:
        info('Data import OK')
    return rows_raw

## Transform the values

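Non-informational values (missing data, and values that consist only of `http://` or `https://`) are converted to NULL.
The values of a field are also deduplicated:
identical values within one field of a row are reduced to a single copy.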

In [79]:
def transformrows():
    """Convert the raw rows of every table to rows of SQL-ready values.

    Per row a dict is built that maps each (merge target) field to the set of
    its converted values. Skipped fields are left out, split fields are split
    with their splitter, missing values and NULL_VALUES become 'NULL', and
    all other values are converted according to the type of the field.
    For fields that can hold more than one value, 'NULL' is dropped.

    As a side effect the transformed rows of each table are dumped to
    `<ROW_FILE>_<table>.<ROW_EXT>`, and the notes and warnings that money()
    collected are reported.

    Returns a dict that maps each table to its list of row dicts.
    """
    rows = {}
    # money() accumulates into both module-level collections; reset both,
    # otherwise a re-run reports notes left over from an earlier run
    money_warnings.clear()
    money_notes.clear()
    for t in main_tables_raw:
        for (i, row_raw) in enumerate(rows_raw.get(t, [])):
            values = {}
            for (fname, values_raw) in zip(tfields[t], row_raw):
                if fname in SKIP_FIELDS.get(t, set()): continue
                sep = SPLIT_FIELDS.get(t, {}).get(fname, None)
                if sep is not None:
                    values_raw = sorted(reduce(
                        set.union,
                        [set(splitters[sep].split(v)) for v in values_raw if v is not None],
                        set(),
                    ))
                    if '' in values_raw: values_raw.remove('')
                ftarget = merge_fields.get(t, {}).get(fname, fname)
                (ftype, fmult) = field_defs_raw[t][ftarget]
                valset = set()
                for v_raw in values_raw:
                    if v_raw is None or v_raw in NULL_VALUES: v = 'NULL'
                    elif ftype == 'text': v = sq(v_raw)
                    elif ftype == 'number': v = num(v_raw, i, t, fname)
                    elif ftype == 'valuta': v = money(v_raw, i, t, fname)
                    elif ftype == 'date': v = dt(v_raw, i, t, fname)
                    elif ftype == 'datetime': v = dtm(v_raw, i, t, fname)
                    else: v = v_raw
                    valset.add(v)
                if fmult > 1: valset.discard('NULL')
                # merged fields end up in the same set as their target
                these_values = values.setdefault(ftarget, set())
                these_values |= valset
            rows.setdefault(t, []).append(values)
        info('Table `{}`: {:>5} rows checked'.format(t, len(rows[t])))

        # the context manager closes the dump even if writing fails half way
        with open('{}_{}.{}'.format(ROW_FILE, t, ROW_EXT), 'w') as rf:
            for row in rows[t]:
                for (fname, values) in sorted(row.items()):
                    rf.write('@{:>30} = {}\n'.format(
                        fname,
                        ' | '.join('{}'.format(v) for v in sorted(values)),
                    ))
                rf.write('{}\n'.format('='*100))

    # the raw values are sorted so that the report is the same on every run
    if money_notes:
        for tf in sorted(money_notes):
            for v in sorted(money_notes[tf]):
                note('{} "{}" <= {}'.format(
                    tf, v,
                    ' | '.join(sorted(money_notes[tf][v])),
            ))

    if money_warnings:
        for tf in sorted(money_warnings):
            for v in sorted(money_warnings[tf]):
                warning('{} "{}" <= {}'.format(
                    tf, v,
                    ' | '.join(sorted(money_warnings[tf][v])),
            ))
    else:
        info('Money OK')
    return rows

## Turn the data into a dict

We represent the data of each table with a dictionary whose keys are the field names.
Each value is a list with one entry per row, in row order, so the position in the list serves as the id of the row; the entry is the set of values that the row has for that field.

In [80]:
def pivot():
    """Turn the rows of every table into columns.

    Returns a dict that maps each table to a dict `field name -> list of value
    sets`, with one list entry per row, in row order. Afterwards every column
    is checked to have as many entries as the table has rows.
    """
    columns = {}
    for t in main_tables_raw:
        for row in rows[t]:
            for fname in sorted(row):
                columns.setdefault(t, {}).setdefault(fname, []).append(row[fname])
        info('Table `{}`: {:<5} records and {:<2} fields pivoted'.format(
            t, len(rows[t]), len(columns[t]),
        ))

    # check
    good = True
    for t in columns:
        for f in columns[t]:
            n_found = len(columns[t][f])
            n_expected = len(rows[t])
            if n_found == n_expected:
                continue
            warning(
        'table `{}`, field `{}`: wrong number of records: {} instead of {}'.format(
                t, f, len(columns[t][f]), len(rows[t]),
            ))
            good = False
    if not good:
        warning('There were errors', count=False)
    else:
        info('Pivot OK')
    return columns

## Move fields

In [81]:
def move_fields():
    """Move the fields listed in MOVE_FIELDS from their table to a new table.

    Works on deep copies of the raw tables, fields, field definitions and
    field data. Every new table gets a `<source table>_id` column that holds
    the row number of the source row, defined as a reference to the `id` of
    the source table. A source table or field that does not exist is reported
    as an error.

    Returns a tuple `(main_tables, fields, field_defs, field_data)`.
    """
    errors = 0
    main_tables = deepcopy(main_tables_raw)
    fields = deepcopy(fields_raw)
    field_defs = deepcopy(field_defs_raw)
    field_data = deepcopy(field_data_raw)
    for (t, targets) in MOVE_FIELDS.items():
        if t not in field_data:
            warning('move fields from table `{}`: this table does not exist'.format(
                t,
            ))
            errors += 1
            continue
        for t_new in targets:
            main_tables.append(t_new)
            nid = '{}_id'.format(t)
            new_data = field_data.setdefault(t_new, {})
            new_defs = field_defs.setdefault(t_new, {})
            new_data[nid] = [{i} for i in range(len(rows[t]))]
            new_defs[nid] = ((t, 'id'), 1)
            moving = set(targets[t_new])
            for f in sorted(moving):
                if f not in field_data[t]:
                    warning(
            'table `{}`: move field `{}` to `{}`: this field does not exist'.format(
                        t, f, t_new,
                    ))
                    errors += 1
                    moving.remove(f)
                    continue
                new_data[f] = field_data[t][f]
                del field_data[t][f]
                new_defs[f] = field_defs[t][f]
                del field_defs[t][f]
            moved = sorted(moving)
            fields[t] = sorted(set(fields[t]) - moving)
            fields[t_new] = [nid] + moved
            info('moved fields\n\t`{}`\nfrom `{}` to `{}`'.format(
                '`\n\t`'.join(moved), t, t_new,
            ))

    if not errors:
        info('Move fields OK')
    else:
        warning('There were {} errors'.format(errors), count=False)
    return (main_tables, fields, field_defs, field_data)

## Extract related tables

In [82]:
def extract(t, fname, maindata, relvalues, relindex, reltables, relxtables, relfieldindex):
    """Extract the values of field `fname` of table `t` into a value table.

    Every distinct value gets an id (counted per field in `relindex`) and is
    added once, as `(id, value)`, to the value table in `reltables`.
    For a field with a single value per row, the value in `maindata` is
    replaced by the id. For a field with multiple values, `(row number, id)`
    pairs are added to the cross table in `relxtables` and the field is
    removed from `maindata`.

    If `fname` is also the name of a main table, the value table and cross
    table are stored under `fname` + TBF instead of `fname`.

    Returns True if a value table for `fname` had been extracted before
    (from the table recorded in `relfieldindex`), else False.
    """
    if fname in main_tables:
        link_field = FIX_FIELDS.get(t, {}).get(fname, None)
        if not link_field:
            warning('table `{}`: value field `{}` is already a table!'.format(t, fname))
        else:
            note('table `{}`: value field `{}` will be linked to `{}:{}'.format(
                t, fname, fname, link_field,
        ))
        store_name = '{}{}'.format(fname, TBF)
    else:
        store_name = fname
    is_single = field_defs[t][fname][1] == 1 # single value or multiple values
    seen_before = fname in relfieldindex
    if seen_before:
        warning(
    'related table `{}` extracted from `{}` and earlier from `{}`'.format(
            fname, t, relfieldindex[fname],
        ))
    relfieldindex[fname] = t
    for (i, values) in enumerate(field_data[t][fname]):
        for value in values:
            known = relvalues.setdefault(fname, {})
            vid = known.get(value, None)
            if vid is None:
                # first time we see this value: hand out the next id
                relindex[fname] += 1
                vid = relindex[fname]
                reltables.setdefault(store_name, []).append((vid, value))
            known[value] = vid
            if is_single:
                maindata[t][fname][i] = {vid}
            else:
                relxtables.setdefault(store_name, []).append((i, vid))
    if not is_single:
        del maindata[t][fname]
    return seen_before

def transform_data():
    """Extract value tables and cross tables from the main tables.

    Per table the fields in VALUE_FIELDS plus all fields that can hold more
    than one value are extracted (see `extract`). A field without a definition
    and a value table that is extracted twice both count as extraction errors.

    Returns a tuple `(maindata, reltables, relxtables, relvalues, relfieldindex)`,
    where `maindata` is a modified deep copy of `field_data`.
    """
    maindata = deepcopy(field_data)
    relvalues = {} # dicts
    relindex = collections.Counter()
    reltables = {} # lists
    relxtables = {} # lists
    relfieldindex = {}
    errors = 0
    for t in main_tables:
        multi_valued = {f for f in fields[t] if field_defs[t][f][1] > 1}
        for fname in VALUE_FIELDS.get(t, set()) | multi_valued:
            if fname not in field_defs[t]:
                warning('table `{}`: wrong field {}'.format(t, fname))
                errors += 1
                continue
            if extract(
                t, fname, maindata,
                relvalues, relindex, reltables, relxtables, relfieldindex,
            ):
                errors += 1
    if not errors:
        info('Extraction OK')
    else:
        warning('There were {} extraction errors'.format(errors), count=False)
    return (maindata, reltables, relxtables, relvalues, relfieldindex)

## Fix fields

In [83]:
def getmapping(main_t, main_f):
    """Map the ids of the to-be-fixed value table of `main_t` to row numbers of `main_t`.

    The value table is found in `reltables` under `main_t` + TBF. Each of its
    values is looked up in field `main_f` of main table `main_t`; the result
    maps the id of the value to the number of the row that holds it.
    """
    rel_t = '{}{}'.format(main_t, TBF)
    main_codes = maindata[main_t][main_f]
    rel_codes = reltables[rel_t]
    # value in the main table -> row number (a later row wins on duplicates)
    row_of_code = {list(c)[0]: i for (i, c) in enumerate(main_codes)}
    code_of_id = {vid: code for (vid, code) in rel_codes}
    return {vid: row_of_code[code] for (vid, code) in code_of_id.items()}

def fix(t, main_t, main_f):
    """Re-link field `main_t` of table `t` to the rows of main table `main_t`.

    The ids in the field, which refer to the to-be-fixed value table, are
    replaced by row numbers of `main_t` (see `getmapping`). The value table is
    dropped from `reltables` and `main_t` is moved to the front of `main_tables`.
    """
    mapping = getmapping(main_t, main_f)
    reltables.pop('{}{}'.format(main_t, TBF), None)

    relinked = []
    for ids in maindata[t][main_t]:
        relinked.append({mapping[list(ids)[0]]})
    maindata[t][main_t] = relinked

    main_tables.remove(main_t)
    main_tables.insert(0, main_t)

def fix_them():
    """Apply `fix` to every field listed in FIX_FIELDS, announcing each link with a note."""
    for (t, links) in FIX_FIELDS.items():
        for (main_t, link_field) in links.items():
            note('linking `{}:{}` to table `{}` on `{}`'.format(
                t, main_t, main_t, link_field,
            ))
            fix(t, main_t, link_field)

## Write sql to file

In [84]:
def getsize(source, fname):
    """Determine the varchar size needed for the values in `source`.

    `source` is an iterable of collections of values; 'NULL' is ignored.
    Returns the smallest power of two between 2**MIN_M and 2**MAX_M that is
    at least the length of the longest value. If the longest value exceeds
    2**MAX_M, a note about field `fname` is issued and False is returned,
    meaning that the field needs type TEXT.
    """
    values = set()
    for vals in source: values |= set(vals)
    maxlen = max((len(x) for x in values if x != 'NULL'), default=0)
    # return the size from inside the loop instead of relying on the loop
    # variable still being bound after the loop has ended
    for m in range(MIN_M, MAX_M+1):
        if maxlen <= 2**m:
            return 2**m
    note(
        'Field `{}`: value with length {} gets type TEXT'.format(
            fname, maxlen,
    ))
    return False

def getdef(source, t, fname, newfname, warn_mult=True):
    """Build the SQL column definition for field `fname` of table `t`.

    The column is called `newfname`. `source` holds the values of the field
    and is only used to size varchar columns. A field whose type is a tuple
    `(table, field)` becomes an int column with a foreign key to that table.

    Returns None, after a warning, if the field has an unknown type or if it
    can hold more than one value (the latter only when `warn_mult` is true).
    """
    (ft, fmult) = field_defs[t][fname]
    if warn_mult and fmult > 1:
        warning(
            'skipping field `{}` because it contains multiple values'.format(
                fname,
        ))
        return None

    # type -> (sql type, size, extra) for the types that need no computation
    fixed = {
        'number': ('int', '(4)', ''),
        'valuta': ('decimal', '(10,2)', ''),
        'date': ('datetime', '', ''),
        'datetime': ('datetime', '', ''),
    }
    if type(ft) is tuple:
        (ftable, ffield) = ft
        ftype = 'int'
        fsize = '(4)'
        fext = ',\n\tforeign key ({}) references {}({})'.format(fname, ftable, ffield)
    elif ft == 'text':
        fsize_raw = getsize(source, fname)
        # getsize returns False if the values are too long for a varchar
        ftype = 'varchar' if fsize_raw else 'text'
        fsize = '({})'.format(fsize_raw) if fsize_raw else ''
        fext = 'character set utf8'
    elif ft in fixed:
        (ftype, fsize, fext) = fixed[ft]
    else:
        warning('skipping field `{}` because it has unknown type `{}`'.format(
            fname, ft,
        ))
        return None
    return '{} {}{} {}'.format(newfname, ftype, fsize, fext)

def getrdef(fname):
    """Build the SQL definition of the column `<fname>_id` that refers to the `id` of table `fname`."""
    column = '{}_id'.format(fname)
    return '{} int(4),\n    foreign key ({}) references {}(id)'.format(
        column, column, fname,
    )

def sql_data(df, tname, flist, rows, limit=None):
    """Write SQL insert statements for `rows` of table `tname` to file `df`.

    `flist` is the list of column names and every row is a sequence of
    SQL-ready values. The rows are spread over insert statements of at most
    `limit` rows each (default: LIMIT_ROWS); every statement is preceded by a
    `select` that reports the progress when the file is executed.

    Nothing is written if there are no rows: a statement terminator on its
    own is an empty SQL statement.
    """
    if limit is None:
        limit = LIMIT_ROWS
    head = 'insert into {} ({}) values'.format(tname, ','.join(flist))
    wrote_any = False
    for (i, row) in enumerate(rows):
        if i % limit == 0:
            # close the previous statement and start a new one
            if i > 0: df.write(';')
            df.write('\n')
            df.write('select "table {} row {}" as " ";\n'.format(tname, i))
            df.write(head)
            sep = ''
        df.write('\n{}\t'.format(sep))
        sep = ','
        df.write('({})'.format(','.join(str(x) for x in row)))
        wrote_any = True
    if wrote_any:
        df.write(';\n')
        
def print_maintables(maindata, reltables, cf, df):
    """Write the create statements (to `cf`) and the data (to `df`) of the main tables.

    Every table gets an `id` column that holds the row number. A field that
    has a value table in `reltables`, or that is listed in FIX_FIELDS for the
    table, becomes a `<field>_id` reference column; other fields get a column
    according to their definition (see `getdef`).

    A field for which no column definition can be made is left out, both from
    the table definition and from the data: leaving it out of the columns but
    not out of the rows would give insert statements with more values than
    columns.

    Returns the number of fields that were left out.
    """
    errors = 0
    for t in main_tables:
        fdefs = ['id int(4) primary key']
        kept = []       # fields of maindata that get a column
        fnewlist = []   # the names of those columns
        for fname in sorted(maindata[t]):
            if fname in reltables or fname in FIX_FIELDS.get(t, {}):
                fdef = getrdef(fname)
                fnewname = '{}_id'.format(fname)
            else:
                fdef = getdef(field_data[t][fname], t, fname, fname)
                if fdef is None:
                    errors += 1
                    continue
                fnewname = fname
            fdefs.append(fdef)
            kept.append(fname)
            fnewlist.append(fnewname)
        cf.write('''
create table {} (
    {}
);
    '''.format(t, ',\n\t'.join(fdefs)))
        maintable_raw = zip(*(maindata[t][f] for f in kept))
        # every cell is a set of values; the first one in sorted order is written
        maintable = [
            [i]+[sorted(vals)[0] for vals in row] for (i, row) in enumerate(maintable_raw)
        ]
        sql_data(df, t, ['id'] + fnewlist, maintable)
    return errors

def print_reltables(reltables, relvalues, cf, df):
    """Write the create statements (to `cf`) and the data (to `df`) of the value tables.

    A value table has the columns `id` and `val`. The definition of `val` is
    derived from the field the values were extracted from; for a table whose
    name ends in TBF, that is the field named by the part before TBF.

    Returns the number of value tables that were skipped because no column
    definition could be made.
    """
    errors = 0
    for tname_alt in sorted(reltables):
        pos = tname_alt.rfind(TBF)
        tname = tname_alt[0:pos] if pos > 0 else tname_alt
        fdef = getdef(
            [relvalues[tname].keys()],
            relfieldindex[tname],
            tname, 'val', warn_mult=False,
        )
        if fdef is None:
            errors += 1
            continue
        columns = ',\n\t'.join(['id int(4) primary key', fdef])
        cf.write('\ncreate table {} (\n    {}\n);\n'.format(tname_alt, columns))
        sql_data(df, tname_alt, ['id', 'val'], reltables[tname_alt])
    return errors

def print_relxtables(relxtables, cf, df):
    """Write the create statements (to `cf`) and the data (to `df`) of the cross tables.

    A cross table is named `<main table>_<value table>` and has two columns:
    `<main table>_id` and `<value field>_id`, each with a foreign key.
    For a value table whose name ends in TBF, the value field is the part
    before TBF.

    Returns the number of errors, which is always 0.
    """
    errors = 0
    for tname_alt in sorted(relxtables):
        pos = tname_alt.rfind(TBF)
        tname = tname_alt[0:pos] if pos > 0 else tname_alt
        t = relfieldindex[tname]
        tname_rep = '{}_{}'.format(t, tname_alt)
        main_id = '{}_id'.format(t)
        val_id = '{}_id'.format(tname)
        column_lines = (
            '{} int(4),'.format(main_id),
            '{} int(4),'.format(val_id),
            'foreign key ({}) references {}(id),'.format(main_id, t),
            'foreign key ({}) references {}(id)'.format(val_id, tname),
        )
        fdefs = ''.join('\n    {}'.format(line) for line in column_lines) + '\n'
        cf.write('\ncreate table {} ({});\n'.format(tname_rep, fdefs))
        sql_data(df, tname_rep, [main_id, val_id], relxtables[tname_alt])
    return errors


def sql_export():
    """Write `create.sql` and `data.sql` in RESULT_DIR.

    `create.sql` drops and creates database DB_NAME and creates all tables;
    `data.sql` fills them. In both files the value tables come first, then
    the main tables, then the cross tables. The number of errors that the
    table writers return is reported at the end.

    Both files are opened in a `with` statement, so that they are closed
    even if one of the table writers raises an exception.
    """
    errors = 0
    with open('{}/create.sql'.format(RESULT_DIR), 'w') as cf, \
         open('{}/data.sql'.format(RESULT_DIR), 'w') as df:
        df.write('''
select "FILL TABLES OF DATABASE {db}" as " ";

use {db};

'''.format(db=DB_NAME))

        cf.write('''
select "CREATE DATABASE {db} AND TABLES" as " ";

drop database if exists {db};
create database {db} character set utf8;
use {db};

'''.format(db=DB_NAME))
        cf.write('/* value tables */\n')
        df.write('\n/* value tables */\n')
        errors += print_reltables(reltables, relvalues, cf, df)

        cf.write('/* main tables */\n')
        df.write('\n/* main tables */\n')
        errors += print_maintables(maindata, reltables, cf, df)

        cf.write('/* cross tables */\n')
        df.write('\n/* cross tables */\n')
        errors += print_relxtables(relxtables, cf, df)

    if errors:
        warning('There were {} errors'.format(errors), count=False)
    else:
        info('SQL OK')

## Call everything

In [85]:
def banner(title):
    """Announce a processing stage: the title centered in a line of 80 `=` signs."""
    info('{:=^80}'.format(title))

# The stages communicate through module-level names: every stage reads the
# results that earlier stages have assigned here, so the order matters.

banner('BEGIN PROCESSING')
resetw()

banner('CHECK CONFIG')
check_config()

banner('READ FM')
(root, main_tables_raw) = read_fm()

banner('MERGE pre CHECK')
merge_fields = check_merge()

banner('FIELD DEFINITIONS')
(tfields, field_defs_raw) = getfielddefs()

banner('MERGE post CHECK')
check_merge_more()

banner('SKIP FIELDS')
fields_raw = do_skips()

banner('READ DATA')
rows_raw = getdata()

banner('TRANSFORM ROWS')
rows = transformrows()

banner('PIVOT DATA')
field_data_raw = pivot()

banner('MOVE FIELDS')
(main_tables, fields, field_defs, field_data) = move_fields()

banner('REMODEL DATA')
(maindata, reltables, relxtables, relvalues, relfieldindex) = transform_data()

banner('FIX LINKED DATA')
fix_them()

banner('WRITE SQL')
sql_export()

banner('END PROCESSING')
finalw()

Configuration OK
Parsing contrib
Parsing country
Parsing help
Parsing remark
Parsing vcchead
Parsing workinggroup
Merge definitions OK
Table contrib             : 60 fields
Table country             :  2 fields
Table help                :  2 fields
Table remark              :  5 fields
Table vcchead             :  2 fields
Table workinggroup        : 27 fields
Field definitions OK
Merge OK
Field skips OK
Table contrib             :  309 rows read
Table country             :   24 rows read
Table help                :    1 rows read
Table remark              :  176 rows read
Table vcchead             :   14 rows read
Table workinggroup        :   20 rows read
Data import OK
Table `contrib`:   309 rows checked
Table `country`:    24 rows checked
Table `help`:     1 rows checked
Table `remark`:   176 rows checked
Table `vcchead`:    14 rows checked
Table `workinggroup`:    20 rows checked
NB: contrib:costs_total "1000000" <= 1,000,000 

NB: contrib:costs_total "120000" <= 120,000
NB: contr

## Exploration

In [86]:
def pprintf(tname, fname):
    """Print the distinct values of field `fname` of table `tname`, sorted, one per line.

    The values are taken from `field_data`; 'NULL' is left out.
    """
    distinct = set().union(*field_data[tname][fname])
    shown = sorted(distinct - {'NULL'})
    print('\n'.join('{}'.format(v) for v in shown))

In [87]:
# List the distinct non-NULL values of the `country` field of `contrib`.
pprintf('contrib', 'country')

'AT'
'BE'
'DE'
'FR'
'GR'
'HR'
'IE'
'IT'
'LU'
'NL'
'RS'
'SI'
