# Importing InKind from FileMaker

We use an XML export of the various tables in the FileMaker Inkind database.

In [1]:
import os,sys,re,collections
from os.path import splitext, basename
from copy import deepcopy
from functools import reduce
from glob import glob
from lxml import etree

## Config.

In [235]:
# Locations
#
# All paths are built with forward slashes; backslashes in the home directory
# (as returned on Windows) are normalised first.

HOME_DIR = os.path.expanduser('~').replace('\\', '/')
BASE_DIR = HOME_DIR + '/projects/has/dacs'
FM_DIR = BASE_DIR + '/fm'                   # directory scanned for the *.xml exports
TEMP_DIR = BASE_DIR + '/tmp'                # intermediate dumps and the model report
RESULT_DIR = BASE_DIR + '/sql'              # generated create.sql and data.sql
FMNS = '{http://www.filemaker.com/fmpxmlresult}'  # namespace prefix for element tags
ROW_RAW_FILE = TEMP_DIR + '/row_raw_file'   # path prefix of the raw row dumps
ROW_FILE = TEMP_DIR + '/row_file'           # path prefix of the transformed row dumps
ROW_EXT = 'txt'                             # extension of both kinds of row dump
MODEL_FILE = TEMP_DIR + '/model.txt'        # field model report

# data type details

MIN_M = 5       # minimum varchar size = 2**MIN_M
MAX_M = 13      # maximum varchar size = 2**MAX_M
LIMIT_ROWS = 50 # maximum number of rows to be written in one sql insert statement
TYPES = {'number', 'text', 'valuta', 'date', 'datetime'}

# The patterns are raw strings: `\s` is not a valid escape sequence in an
# ordinary Python string literal and triggers a warning there.

# dd/mm/yyyy (groups: day, month, year), leading whitespace allowed
DATE_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})$'
)
# yyyy-mm-dd (groups: year, month, day), leading whitespace allowed
DATE2_PATTERN = re.compile(
    r'^\s*([0-9]{4})-([0-9]{2})-([0-9]{2})$'
)
# dd/mm/yyyy hh:mm[:ss] (groups: day, month, year, hour, minute, optional second)
DATETIME_PATTERN = re.compile(
    r'^\s*([0-9]{2})/([0-9]{2})/([0-9]{4})\s+([0-9]{2}):([0-9]{2})(?::([0-9]{2}))?$'
)
# raw values that are converted to NULL during the transformation
NULL_VALUES = {
    'http://',
    'https://',
}

# Regular expressions used to cut a multi-valued text field into its parts.
splitters = {
    # separator: a newline or a `+`, with surrounding blanks/tabs/newlines
    'generic': re.compile('[ \t]*[\n+][ \t\n]*'),
    # same, but a comma separates values as well
    'generic_comma': re.compile('[ \t]*[\n+,][ \t\n]*'),
}

def _prepare_dirs():
    """Check that BASE_DIR exists and make sure the output directories exist.

    Returns True when BASE_DIR is present and TEMP_DIR and RESULT_DIR exist
    (or could be created); otherwise prints the first problem found and
    returns False.
    """
    if not os.path.exists(BASE_DIR):
        print('BASE_DIR does not exist: {}'.format(BASE_DIR))
        return False
    for cdir in (TEMP_DIR, RESULT_DIR):
        try:
            # exist_ok avoids the race between an existence test and the creation
            os.makedirs(cdir, exist_ok=True)
        except OSError:
            print('{} could not be created.'.format(cdir))
            return False
    return True

# `good` stays available as a module-level flag, as before.
good = _prepare_dirs()
if not good:
    print('There were configuration errors')
else:
    print('Configuration OK')

Configuration OK


## Value validation and transformation

In [289]:
def date_repl(match):
    """Rewrite a match with the groups (day, month, year) as `year-month-day`."""
    day, month, year = match.groups()
    return '{y}-{m}-{d}'.format(y=year, m=month, d=day)
    
def date2_repl(match):
    """Rewrite a match with the groups (year, month, day) as `year-month-day`."""
    year, month, day = match.groups()
    return '{y}-{m}-{d}'.format(y=year, m=month, d=day)
    
def datetime_repl(match):
    """Rewrite a match with the groups (day, month, year, hour, minute, second).

    The result has the form `year-month-dayThour:minute:second`; a missing or
    empty seconds group is rendered as `00`.
    """
    day, month, year, hours, minutes, seconds = match.groups()
    if not seconds:
        seconds = '00'
    return '{}-{}-{}T{}:{}:{}'.format(year, month, day, hours, minutes, seconds)
    
def sq(v_raw):
    """Return `v_raw` as a single-quoted SQL string literal.

    The value is stripped of surrounding whitespace. Single quotes are doubled
    and tabs and newlines are written as the escapes `\\t` and `\\n`.

    Backslashes are doubled first: the output relies on backslash escapes, so
    a literal backslash in the data would otherwise be read as the start of an
    escape (a trailing backslash would even swallow the closing quote).
    """
    escaped = (
        v_raw.strip()
        .replace('\\', '\\\\')   # must come before the \t and \n escapes are added
        .replace("'", "''")
        .replace('\t', '\\t')
        .replace('\n', '\\n')
    )
    return "'{}'".format(escaped)

def num(v_raw, i, t, fname):
    """Convert a raw value of a number field to an int.

    v_raw: the raw string value.
    i, t, fname: record index, table name and field name, used only in the
        warning message.

    Returns the int when `v_raw` consists of digits only; otherwise prints a
    warning and returns `v_raw` unchanged (a string, not quoted).
    """
    if v_raw.isdigit():
        try:
            return int(v_raw)
        except ValueError:
            # str.isdigit() also accepts characters such as superscript digits,
            # which int() cannot convert; treat those as non-integers
            pass
    print(
        'WARNING: table `{}` field `{}` record {}: not an integer: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

# Collected by `money`: {'table:field': {normalised value: {raw values}}}
money_warnings = {}

def money(v_raw, i, t, fname):
    """Normalise a raw amount to a string with `.` as the decimal point.

    v_raw: the raw string value.
    i, t, fname: record index, table name and field name, used for reporting.

    Blanks, no-break spaces and euro signs are removed. A `.` or `,` that is
    followed by one or two characters at the end is taken as the decimal
    point; every other `.` and `,` is dropped.

    Every raw value containing a `.` or `,`, and every value that does not end
    up as a decimal number, is recorded in `money_warnings`; the latter are
    reported on stdout as well. Returns the normalised string.
    """
    has_separator = '.' in v_raw or ',' in v_raw
    amount = v_raw.strip()
    for junk in (' ', '€', '\u00a0'):
        amount = amount.replace(junk, '')

    # a separator in the second or third position from the end becomes the
    # placeholder D (its last occurrence is the one that is replaced)
    for pos in (2, 3):
        if len(amount) < pos:
            continue
        mark = amount[-pos]
        if mark == ',' or mark == '.':
            amount = 'D'.join(amount.rsplit(mark, 1))

    # remaining separators are thousands separators; D is the decimal point
    amount = amount.replace('.', '').replace(',', '').replace('D', '.')

    is_decimal = amount.replace('.', '').isdigit()
    if not is_decimal:
        print(
            'WARNING: table `{}` field `{}` record {}: not a decimal number: "{}" <= "{}"'.format(
                t, fname, i, amount, v_raw,
        ))
    if has_separator or not is_decimal:
        key = '{}:{}'.format(t, fname)
        money_warnings.setdefault(key, {}).setdefault(amount, set()).add(v_raw)
    return amount

def dt(v_raw, i, t, fname):
    """Convert a raw value of a date field to a quoted date literal.

    v_raw: the raw string value, expected to match DATE2_PATTERN.
    i, t, fname: record index, table name and field name, used only in the
        warning message.

    Returns the date wrapped in single quotes, or - after printing a warning -
    `v_raw` unchanged when it does not match.
    """
    if DATE2_PATTERN.match(v_raw):
        return "'{}'".format(DATE2_PATTERN.sub(date2_repl, v_raw))
    print(
        'WARNING: table `{}` field `{}` record {}: not a valid date: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

def dtm(v_raw, i, t, fname):
    """Convert a raw value of a datetime field to a quoted datetime literal.

    v_raw: the raw string value, expected to match DATETIME_PATTERN.
    i, t, fname: record index, table name and field name, used only in the
        warning message.

    Returns the datetime wrapped in single quotes, or - after printing a
    warning - `v_raw` unchanged when it does not match.
    """
    if DATETIME_PATTERN.match(v_raw):
        return "'{}'".format(DATETIME_PATTERN.sub(datetime_repl, v_raw))
    print(
        'WARNING: table `{}` field `{}` record {}: not a valid date time: "{}"'.format(
            t, fname, i, v_raw
    ))
    return v_raw

## Parse the XML file

In [291]:
# Names of the tables that have been read, in processing order.
main_tables = []

def read_fm():
    """Parse every `*.xml` file in FM_DIR.

    The table name is the file name without directory and extension.
    `main_tables` is emptied and refilled with the table names.

    Returns a dict mapping each table name to the root element of its file.
    """
    main_tables.clear()
    parser = etree.XMLParser(remove_blank_text=True, ns_clean=True)
    root = {}
    # glob returns files in arbitrary order; sorting makes the table order,
    # and everything derived from it, the same on every run and platform
    for infile in sorted(glob('{}/*.xml'.format(FM_DIR))):
        tname = basename(splitext(infile)[0])
        print('Parsing {}'.format(tname))
        root[tname] = etree.parse(infile, parser).getroot()
        main_tables.append(tname)
    return root

## Fields and their types

In [292]:
# Name of the database that the generated SQL creates and fills.
DB_NAME = 'inkind_data'

# Per table: fields whose type from the export is replaced by the type given here.
FIELD_TYPE_OVERRIDE = {
    'contrib': {
        'costs_total': 'valuta',
        'total_costs_total': 'valuta',
        'whois': 'text',
        'creation_date_time': 'datetime',
        'modification_date_time': 'datetime',
        'dateandtime_approval': 'datetime',
        'dateandtime_cioapproval': 'datetime',
        'dateandtime_ciozero': 'datetime',
    },
}

# Per table: fields whose values are split into several values, with the
# regular expression that does the splitting.
SPLIT_FIELDS = {
    'contrib': {
        'disciplines_associated': splitters['generic'],
        'other_keywords': splitters['generic_comma'],
        'tadirah_research_activities': splitters['generic'],
        'tadirah_research_objects': splitters['generic'],
        'tadirah_research_techniques': splitters['generic'],
        'type_of_inkind': splitters['generic'],
        'vcc': splitters['generic'],
    },
}

# Per table: target field -> tuple of source fields that are merged into it.
MERGE_FIELDS = {
    'contrib': {
        'academic_entity_url': ('academic_entity_url_2',),
        'contribution_url': ('contribution_url_2',),
        'contact_person_mail': ('contact_person_mail_2',),
        'gnewpassword': ('gnewpassword2',),
        'vcc_head_decision_vcc1': ('vcc_head_decision_vcc11', 'vcc_head_decision_vcc12'),
        'vcc_head_decision_vcc2': ('vcc_head_decision_vcc21', 'vcc_head_decision_vcc22'),
        'vcc_head_decision_vcc3': ('vcc_head_decision_vcc31', 'vcc_head_decision_vcc32'),
        'vcc_head_decision_vcc4': ('vcc_head_decision_vcc41', 'vcc_head_decision_vcc42'),
        'vcc_name': (
            'vcc11_name', 'vcc12_name',
            'vcc21_name', 'vcc22_name',
            'vcc31_name', 'vcc32_name',
            'vcc41_name', 'vcc42_name',
        ),
    },
}

# Per table: fields that are left out of the result.
SKIP_FIELDS = {
    'contrib': {
        'teller',
        'whois',
        'help_text',
        'help_description',
    },
}

# Per table: fields that are extracted into a related value table, in
# addition to the fields that hold multiple values.
VALUE_FIELDS = {
    'contrib': {
        'country',
    },
}

## Process field specs

In [293]:
def check_merge():
    """Invert MERGE_FIELDS into a source -> target mapping per table.

    Only tables in `main_tables` are considered. A source field that is
    listed under more than one target is reported and counted as an error;
    the last target wins.

    Returns {table: {source field: target field}}; tables without merge
    definitions have no entry.
    """
    conflicts = 0
    merge_fields = {}

    for table in main_tables:
        table_spec = MERGE_FIELDS.get(table, {})
        for (target, sources) in table_spec.items():
            for source in sources:
                seen = merge_fields.get(table, {})
                if source in seen:
                    print(
                        'WARNING: table `{}` field `{}` already merged into `{}` now to be merged into `{}`'.format(
                            table, source, seen[source], target,
                    ))
                    conflicts += 1
                merge_fields.setdefault(table, {})[source] = target

    if conflicts:
        print('There were {} merge errors'.format(conflicts))
    else:
        print('Merge definitions OK')
    return merge_fields

def check_merge_more():
    """Apply the merge mapping to the global `field_defs`, in place.

    For every source field in the global `merge_fields`:
    - an unknown source field is reported and skipped;
    - a target field that does not exist yet is created with the type of the
      source and multiplicity 0;
    - a type difference between source and target is reported;
    - the multiplicity of the source is added to that of the target and the
      source definition is removed.

    Prints a summary; returns nothing.
    """
    problems = 0
    for table in main_tables:
        for (source, target) in merge_fields.get(table, {}).items():
            defs = field_defs[table]
            if source not in defs:
                print(
                    'WARNING: table `{}`: cannot merge unknown field `{}`'.format(
                    table, source,
                ))
                problems += 1
                continue
            (source_type, source_mult) = defs[source]
            if target not in defs:
                defs[target] = [source_type, 0]
            (target_type, _target_mult) = defs[target]
            if target_type != source_type:
                print(
                    'WARNING: table `{}` field `{}` of type "{}" is merged into field `{}` of other type "{}"'.format(
                        table, source, source_type, target, target_type,
                ))
                problems += 1
            defs[target][1] += source_mult
            del defs[source]

    if problems:
        print('There were {} merge errors'.format(problems))
    else:
        print('Merge OK')

def getfielddefs():
    """Read the field definitions from the first METADATA element of each table.

    Field names are lower-cased, with spaces replaced by underscores. The type
    comes from FIELD_TYPE_OVERRIDE when present there, otherwise from the
    lower-cased TYPE attribute. The multiplicity is MAXREPEAT, plus one for
    fields listed in SPLIT_FIELDS. Types outside TYPES are reported.

    Returns (tfields, field_defs):
      tfields:    {table: [field names in document order]}
      field_defs: {table: {field name: [type, multiplicity]}}
    """
    field_defs = {}
    tfields = {}
    unknown_types = 0
    for table in main_tables:
        metadata = list(root[table].iter(FMNS+'METADATA'))[0]
        names = tfields[table] = []
        overrides = FIELD_TYPE_OVERRIDE.get(table, {})
        split_names = SPLIT_FIELDS.get(table, {})
        for node in metadata.iter(FMNS+'FIELD'):
            fname = node.get('NAME').lower().replace(' ','_')
            ftype = overrides.get(fname, None) or node.get('TYPE').lower()
            fmult = int(node.get('MAXREPEAT'))
            if fname in split_names:
                fmult += 1
            names.append(fname)
            field_defs.setdefault(table, {})[fname] = [ftype, fmult]
            if ftype not in TYPES:
                print('WARNING: table `{}` field `{}` has unknown type "{}"'.format(
                    table, fname, ftype,
                ))
                unknown_types += 1
        print('Table {:<20}: {:>2} fields'.format(table, len(names)))

    if unknown_types:
        print('There were {} field definition errors'.format(unknown_types))
    else:
        print('Field definitions OK')
    return (tfields, field_defs)

def do_skips():
    """Remove the SKIP_FIELDS from the global `field_defs` and list what remains.

    Returns {table: sorted list of field names}: the remaining field
    definitions together with the merge targets of that table.
    """
    fields = {}
    for t in main_tables:
        table_defs = field_defs[t]
        for f in SKIP_FIELDS.get(t, set()):
            # pop with a default: a skip field may be absent from the export,
            # or already removed when this cell is run a second time
            table_defs.pop(f, None)
        fields[t] = sorted(
            set(table_defs.keys()) | set(merge_fields.get(t, {}).values())
        )
    return fields

def report_model():
    """Write the retained fields of every table to MODEL_FILE.

    Per table the file gets a header line with the field count, followed by
    one line per field with its type, multiplicity and name. The number of
    retained fields per table is printed as well.
    """
    # the with-block closes the file even when formatting a field fails
    with open(MODEL_FILE, 'w') as mf:
        for t in main_tables:
            mf.write('[{}] ({} fields)\n\t{}\n\n'.format(
                t,
                len(fields[t]),
                '\n\t'.join('{:<8}{} {}'.format(
                    *field_defs[t][f],
                    f,
                ) for f in fields[t])
            ))
            print('Table {:<20}: {:>2} retained fields'.format(
                t,
                len(fields[t]),
            ))

## Get the data

In [294]:
def getdata():
    """Collect the raw rows of every table from its first RESULTSET element.

    A row is a list with one entry per COL element; each entry is the list of
    texts of the DATA elements in that column. Rows whose number of columns
    differs from the number of fields of the table are recorded as errors.

    The raw rows of each table are also dumped to
    `<ROW_RAW_FILE>_<table>.<ROW_EXT>` for inspection.

    Returns {table: [row, ...]}.
    """
    rows_raw = {}
    errors = {}

    for t in main_tables:
        dataroots = [x for x in root[t].iter(FMNS+'RESULTSET')]
        dataroot = dataroots[0]
        rows_raw[t] = []

        for (i, r) in enumerate(dataroot.iter(FMNS+'ROW')):
            row = []
            for c in r.iter(FMNS+'COL'):
                data = [x.text for x in c.iter(FMNS+'DATA')]
                row.append(data)
            if len(row) != len(tfields[t]):
                errors.setdefault(t, {}).setdefault('Number of fields', []).append(i)
            rows_raw[t].append(row)

        # the with-block closes the dump even when writing a row fails
        with open('{}_{}.{}'.format(ROW_RAW_FILE, t, ROW_EXT), 'w') as rf:
            for row in rows_raw[t]:
                for (fname, values) in zip(tfields[t], row):
                    rf.write('@{:>30} = {}\n'.format(
                        fname,
                        ' | '.join('{}'.format(v) for v in values),
                    ))
                rf.write('{}\n'.format('='*100))
        print('Table {:<20}: {:>4} rows read'.format(t, len(rows_raw[t])))

    if errors:
        for t in sorted(errors):
            for k in sorted(errors[t]):
                # the row numbers are stored per table and per kind of error,
                # as ints, so they are converted before joining
                print('Table {:<20}: {:<20}: {}'.format(
                    t, k, ','.join(str(i) for i in errors[t][k]),
                ))
    else:
        print('Data import OK')
    return rows_raw

## Transform the values

Various non-informational values are converted to NULL.
The values are also thinned: identical values within one field of a row are reduced to a single copy.

In [295]:
def transformrows():
    """Validate and convert the raw values of all rows.

    For every row of every table:
    - fields in SKIP_FIELDS are ignored;
    - values of fields in SPLIT_FIELDS are split, with empty parts dropped;
    - a value is stored under the merge target of its field, if there is one;
    - None and the NULL_VALUES become 'NULL'; other values are converted
      according to the type of the (target) field;
    - for fields with a multiplicity above 1, 'NULL' is dropped.

    The transformed rows of each table are dumped to
    `<ROW_FILE>_<table>.<ROW_EXT>`, and the collected money warnings are
    printed at the end.

    Returns {table: [{field name: set of values}, ...]}.
    """
    rows = {}
    money_warnings.clear()
    for t in main_tables:
        # make sure the table has an entry, also when it has no rows at all
        rows[t] = []
        skipped = SKIP_FIELDS.get(t, set())
        for (i, row_raw) in enumerate(rows_raw.get(t, [])):
            values = {}
            for (fname, values_raw) in zip(tfields[t], row_raw):
                if fname in skipped: continue
                sep = SPLIT_FIELDS.get(t, {}).get(fname, None)
                if sep is not None:
                    parts = set()
                    for v in values_raw:
                        if v is not None:
                            parts.update(sep.split(v))
                    parts.discard('')
                    values_raw = sorted(parts)
                ftarget = merge_fields.get(t, {}).get(fname, fname)
                (ftype, fmult) = field_defs[t][ftarget]
                valset = set()
                for v_raw in values_raw:
                    if v_raw is None or v_raw in NULL_VALUES: v = 'NULL'
                    elif ftype == 'text': v = sq(v_raw)
                    elif ftype == 'number': v = num(v_raw, i, t, fname)
                    elif ftype == 'valuta': v = money(v_raw, i, t, fname)
                    elif ftype == 'date': v = dt(v_raw, i, t, fname)
                    elif ftype == 'datetime': v = dtm(v_raw, i, t, fname)
                    else: v = v_raw
                    valset.add(v)
                if fmult > 1: valset.discard('NULL')
                # several source fields may feed the same target field
                values.setdefault(ftarget, set()).update(valset)
            rows[t].append(values)
        print('Table `{}`: {:>5} rows checked'.format(t, len(rows[t])))

        # the with-block closes the dump even when writing a row fails
        with open('{}_{}.{}'.format(ROW_FILE, t, ROW_EXT), 'w') as rf:
            for row in rows[t]:
                for (fname, values) in sorted(row.items()):
                    rf.write('@{:>30} = {}\n'.format(
                        fname,
                        ' | '.join('{}'.format(v) for v in sorted(values)),
                    ))
                rf.write('{}\n'.format('='*100))

    if money_warnings:
        for tf in sorted(money_warnings):
            for v in sorted(money_warnings[tf]):
                # raw values are sorted so that the report is the same on every run
                print('WARNING: {} "{}" <= {}'.format(
                    tf, v,
                    ' | '.join(sorted(money_warnings[tf][v])),
            ))
    else:
        print('Money OK')
    return rows

## Turn the data into a dict

We represent the data of each table with a dictionary whose keys are the field names.
Each value is a list with one entry per row, so the position in the list serves as the id of the row; the entry holds the values that the row has for that field.

In [325]:
def pivot():
    """Turn the rows of every table into one list of values per field.

    Returns {table: {field name: [values of row 0, values of row 1, ...]}}.
    Every table in `main_tables` gets an entry, also when it has no rows.

    Afterwards it is checked that every field has exactly one entry per row;
    deviations are reported.
    """
    field_data = {}
    for t in main_tables:
        # .get / setdefault: a table without rows must not raise a KeyError
        table_rows = rows.get(t, [])
        table_data = field_data.setdefault(t, {})
        for row in table_rows:
            for (fname, values) in sorted(row.items()):
                table_data.setdefault(fname, []).append(values)
        print('Table `{}`: {:<5} records and {:<2} fields pivoted'.format(
            t, len(table_rows), len(table_data),
        ))

    # check
    good = True
    for t in field_data:
        n_rows = len(rows.get(t, []))
        for f in field_data[t]:
            if len(field_data[t][f]) != n_rows:
                print(
                    'WARNING: table `{}`, field `{}`: wrong number of records: {} instead of {}'.format(
                        t, f, len(field_data[t][f]), n_rows,
                ))
                good = False
    if good:
        print('OK')
    else:
        print('There were errors')
    return field_data

## Extract related tables

In [332]:
def extract(t, fname, maindata, relvalues, relindex, reltables, relxtables, relfieldindex):
    """Move the values of field `fname` of table `t` into a related value table.

    Every distinct value gets an id, numbered from 1 per field name. The
    arguments are updated in place:
      relvalues[fname]      value -> id
      relindex[fname]       last id handed out
      reltables[fname]      list of (id, value)
      relfieldindex[fname]  the table the field was extracted from
    For a field with multiplicity 1 the value in `maindata` is replaced by
    `[id]`. Otherwise (row index, id) pairs are added to `relxtables[fname]`
    and the field is removed from `maindata[t]`.

    Returns True when a field with this name had been extracted before
    (a warning is printed), False otherwise.
    """
    single_valued = field_defs[t][fname][1] == 1 # single value or multiple values
    duplicate = fname in relfieldindex
    if duplicate:
        print('WARNING: Related table `{}` extracted from `{}` and earlier from `{}`'.format(
            fname, t, relfieldindex[fname],
        ))
    relfieldindex[fname] = t

    for (row_id, values) in enumerate(field_data[t][fname]):
        for value in values:
            known = relvalues.setdefault(fname, {})
            vid = known.get(value)
            if vid is None:
                # first occurrence of this value: hand out the next id
                relindex[fname] += 1
                vid = relindex[fname]
                reltables.setdefault(fname, []).append((vid, value))
            known[value] = vid
            if single_valued:
                maindata[t][fname][row_id] = [vid]
            else:
                relxtables.setdefault(fname, []).append((row_id, vid))

    if not single_valued:
        del maindata[t][fname]
    return duplicate

def transform_data():
    """Extract the value fields and the multi-valued fields into related tables.

    Works on a deep copy of the global `field_data`. Per table, the fields in
    VALUE_FIELDS and the fields with a multiplicity above 1 are handed to
    `extract`; fields without a definition are reported and skipped.

    Returns (maindata, reltables, relxtables, relvalues, relfieldindex).
    """
    maindata = deepcopy(field_data)
    relvalues = {}      # field name -> {value: id}
    relindex = collections.Counter()
    reltables = {}      # field name -> [(id, value)]
    relxtables = {}     # field name -> [(row index, id)]
    relfieldindex = {}
    n_errors = 0

    for table in main_tables:
        to_extract = VALUE_FIELDS.get(table, set()) | {
            f for f in fields[table] if field_defs[table][f][1] > 1
        }
        for fname in to_extract:
            if fname not in field_defs[table]:
                print('ERROR: table `{}`: wrong field {}'.format(table, fname))
                continue
            failed = extract(
                table, fname,
                maindata, relvalues, relindex, reltables, relxtables, relfieldindex,
            )
            if failed:
                n_errors += 1

    if n_errors:
        print('There were {} extraction errors'.format(n_errors))
    else:
        print('Extraction OK')
    return (maindata, reltables, relxtables, relvalues, relfieldindex)

## Write sql to file

In [339]:
def getsize(source, fname):
    """Determine the varchar size needed for the values in `source`.

    source: iterable of collections of values; 'NULL' is not measured.
    fname: field name, used only in the message.

    Returns the smallest power of two between 2**MIN_M and 2**MAX_M that is
    not smaller than the longest value. When the longest value exceeds
    2**MAX_M a message is printed and False is returned.
    """
    distinct = set()
    for vals in source:
        distinct.update(vals)
    longest = max((len(x) for x in distinct if x != 'NULL'), default=0)

    if longest > 2**MAX_M:
        print(
            'Field `{}`: value with length {} gets type TEXT'.format(
                fname, longest,
        ))
        return False
    for m in range(MIN_M, MAX_M+1):
        if longest <= 2**m:
            return 2**m

def getdef(source, t, fname, newfname, warn_mult=True):
    """Build the SQL column definition for field `fname` of table `t`.

    source: the values of the field, passed to `getsize` for text fields.
    newfname: the column name to use in the definition.
    warn_mult: when true, a field with a multiplicity above 1 is refused.

    Returns the definition as a string, or None - after printing a warning -
    when the field is refused or has an unknown type.
    """
    (ft, fmult) = field_defs[t][fname]
    if warn_mult and fmult > 1:
        print(
            'WARNING: skipping field `{}` because it contains multiple values'.format(
                fname,
        ))
        return None

    # type -> (sql type, size, extension) for the types with a fixed definition
    fixed = {
        'number': ('int', '(4)', ''),
        'valuta': ('decimal', '(10,2)', ''),
        'date': ('datetime', '', ''),
        'datetime': ('datetime', '', ''),
    }
    if ft == 'text':
        size = getsize(source, fname)
        if size:
            (ftype, fsize) = ('varchar', '({})'.format(size))
        else:
            # too long for a varchar
            (ftype, fsize) = ('text', '')
        fext = 'character set utf8'
    elif ft in fixed:
        (ftype, fsize, fext) = fixed[ft]
    else:
        print('WARNING: skipping field `{}` because it has unknown type `{}`'.format(
            fname, ft,
        ))
        return None
    return '{} {}{} {}'.format(newfname, ftype, fsize, fext)

def getrdef(fname):
    """Return the column definition plus foreign key clause for a field that
    refers to the value table named `fname`."""
    column = '{}_id'.format(fname)
    return '{col} int(4),\n    foreign key ({col}) references {tab}(id)'.format(
        col=column, tab=fname,
    )

def sql_data(df, tname, flist, rows, limit_rows=None):
    """Write insert statements for `rows` of table `tname` to the file `df`.

    df: an object with a `write` method.
    flist: the column names.
    rows: iterable of rows; the values of a row are written with `str`.
    limit_rows: maximum number of rows per insert statement; defaults to
        LIMIT_ROWS.

    Every insert statement is preceded by a `select` that reports the table
    and the number of the first row of the batch. Nothing is written when
    there are no rows, so that no bare `;` (an empty statement) ends up in
    the file.
    """
    if limit_rows is None:
        limit_rows = LIMIT_ROWS
    head = 'insert into {} ({}) values'.format(tname, ','.join(flist))
    wrote_rows = False
    for (i, row) in enumerate(rows):
        if i % limit_rows == 0:
            # start a new batch, closing the previous statement if there is one
            if i > 0: df.write(';')
            df.write('\n')
            df.write('select "table {} row {}" as " ";\n'.format(tname, i))
            df.write(head)
            sep = ''
        df.write('\n{}\t'.format(sep))
        sep = ','
        df.write('({})'.format(','.join(str(x) for x in row)))
        wrote_rows = True
    if wrote_rows:
        df.write(';\n')
        
def print_maintables(maindata, reltables, cf, df):
    """Write the create statement and the rows of every main table.

    maindata: {table: {field name: list with the values per row}}.
    reltables: the value tables; a field with the name of a value table
        becomes a foreign key column `<field>_id`.
    cf, df: the open files for the create statements and for the data.

    The row index is used as `id`. Fields for which `getdef` yields no
    definition (it prints a warning) are left out of both the table
    definition and the data.
    """
    for t in maindata:
        fdefs = ['id int(4) primary key']
        flist = []
        fnewlist = []
        for fname in sorted(maindata[t]):
            if fname in reltables:
                fdef = getrdef(fname)
                fnewname = '{}_id'.format(fname)
            else:
                fdef = getdef(field_data[t][fname], t, fname, fname)
                fnewname = fname
            if fdef is None:
                # no usable definition: a None would break the join below
                continue
            flist.append(fname)
            fdefs.append(fdef)
            fnewlist.append(fnewname)
        cf.write('''
create table {} (
    {}
);
    '''.format(t, ',\n\t'.join(fdefs)))
        maintable_raw = zip(*(maindata[t][f] for f in flist))
        maintable = [
            # a field without any value in a row is written as NULL
            [i]+[sorted(vals)[0] if vals else 'NULL' for vals in row]
            for (i, row) in enumerate(maintable_raw)
        ]
        sql_data(df, t, ['id'] + fnewlist, maintable)

def print_reltables(reltables, relvalues, cf, df):
    """Write the create statement and the rows of every value table.

    reltables: {table name: [(id, value)]}.
    relvalues: {table name: {value: id}}; the values determine the size of
        the `val` column.
    cf, df: the open files for the create statements and for the data.

    A value table has the columns `id` and `val`. Tables for which `getdef`
    yields no definition are skipped entirely.
    """
    for tname in sorted(reltables):
        val_def = getdef(
            [relvalues[tname].keys()],
            relfieldindex[tname],
            tname, 'val', warn_mult=False,
        )
        if val_def is None:
            continue
        columns = ['id int(4) primary key', val_def]
        statement = '''
create table {} (
    {}
);
'''.format(tname, ',\n\t'.join(columns))
        cf.write(statement)
        sql_data(df, tname, ['id', 'val'], reltables[tname])

def print_relxtables(relxtables, cf, df):
    """Write the create statement and the rows of every cross table.

    relxtables: {value table name: [(row index in the main table, value id)]}.
    cf, df: the open files for the create statements and for the data.

    The cross table is named `<main table>_<value table>` and has two foreign
    key columns, `<main table>_id` and `<value table>_id`.
    """
    for value_table in sorted(relxtables):
        main_table = relfieldindex[value_table]
        cross_table = '{}_{}'.format(main_table, value_table)
        main_col = '{}_id'.format(main_table)
        value_col = '{}_id'.format(value_table)
        columns = '''
    {mi} int(4),
    {vi} int(4),
    foreign key ({mi}) references {mt}(id),
    foreign key ({vi}) references {tn}(id)
'''.format(mi=main_col, vi=value_col, mt=main_table, tn=value_table)
        statement = '''
create table {} ({});
'''.format(cross_table, columns)
        cf.write(statement)
        sql_data(df, cross_table, [main_col, value_col], relxtables[value_table])


def sql_export():
    """Write `create.sql` and `data.sql` into RESULT_DIR.

    `create.sql` drops and recreates the database DB_NAME and creates the
    tables; `data.sql` fills them. Both files list the value tables first,
    then the main tables, then the cross tables.

    NOTE(review): value tables are named after the extracted field and main
    tables after their export file. With `country` both as a value field of
    `contrib` and as an exported table, `create table country` seems to be
    written twice - confirm against the generated create.sql.
    """
    # the with-block closes both files even when one of the writers fails
    with open('{}/create.sql'.format(RESULT_DIR), 'w') as cf, \
            open('{}/data.sql'.format(RESULT_DIR), 'w') as df:
        df.write('''
select "FILL TABLES OF DATABASE {db}" as " ";

use {db};

'''.format(db=DB_NAME))

        cf.write('''
select "CREATE DATABASE {db} AND TABLES" as " ";

drop database if exists {db};
create database {db} character set utf8;
use {db};

'''.format(db=DB_NAME))
        cf.write('/* value tables */\n')
        df.write('\n/* value tables */\n')
        print_reltables(reltables, relvalues, cf, df)

        cf.write('/* main tables */\n')
        df.write('\n/* main tables */\n')
        print_maintables(maindata, reltables, cf, df)

        cf.write('/* cross tables */\n')
        df.write('\n/* cross tables */\n')
        print_relxtables(relxtables, cf, df)

    print('SQL written')

## Call everything

In [340]:
# The steps communicate through the module-level names assigned here,
# so they must run in exactly this order.
root = read_fm()                          # parse the XML exports; fills main_tables
merge_fields = check_merge()              # source field -> target field, per table
(tfields, field_defs) = getfielddefs()    # field order and [type, multiplicity] per field
check_merge_more()                        # apply the merges to field_defs (in place)
fields = do_skips()                       # drop the skipped fields; list the retained ones
report_model()                            # write the retained fields to MODEL_FILE
rows_raw = getdata()                      # raw values per table, row and column
rows = transformrows()                    # validated and converted values per row
field_data = pivot()                      # per field: the values of all rows
(maindata, reltables, relxtables, relvalues, relfieldindex) = transform_data()
sql_export()                              # write create.sql and data.sql

Parsing contrib
Parsing country
Parsing help
Parsing remark
Parsing vcc
Parsing workinggroup
Merge definitions OK
Table contrib             : 60 fields
Table country             :  2 fields
Table help                :  2 fields
Table remark              :  5 fields
Table vcc                 :  2 fields
Table workinggroup        : 27 fields
Field definitions OK
Merge OK
Table contrib             : 41 retained fields
Table country             :  2 retained fields
Table help                :  2 retained fields
Table remark              :  5 retained fields
Table vcc                 :  2 retained fields
Table workinggroup        : 27 retained fields
Table contrib             :  309 rows read
Table country             :   24 rows read
Table help                :    1 rows read
Table remark              :  176 rows read
Table vcc                 :   14 rows read
Table workinggroup        :   20 rows read
Data import OK
Table `contrib`:   309 rows checked
Table `country`:    24 rows checked
T

## Exploration

In [341]:
def pprintf(tname, fname):
    """Print the distinct values of field `fname` in table `tname`, sorted,
    one per line; 'NULL' is left out."""
    distinct = set()
    for cell in field_data[tname][fname]:
        distinct.update(cell)
    lines = ['{}'.format(v) for v in sorted(v for v in distinct if v != 'NULL')]
    print('\n'.join(lines))

In [342]:
# Show the distinct values of the merged field `vcc_name` of table `contrib`.
pprintf('contrib', 'vcc_name')

'Dirk Wintergrün'
'Hansmichael Hohenegger'
'Hella Hollander'
'Marianne Huan'
'Sophie David'
'Susan Schreibman'
'Tibor Kálmán'
