In [1]:
import json
import itertools
from collections import Counter

import fuzzysearch
from genson import SchemaBuilder
from typing import Dict

### Load the raw data

In [2]:
# would really like to use pandas, but it doesn't really support nested data
# so I invented stuff myself
with open("raw_companies.json") as f:
    raw_companies = json.load(f)

### Print all the top-level keys

In [3]:
company: dict
top_level_keys = set(itertools.chain(*(company.keys() for company in raw_companies)))

print(*sorted(top_level_keys), sep='\n')

antalAnsatte
ejerforhold
foreningsrepraesentanter
harManuelSignering
historiskStamdata
hovedselskab
oplysningerOmRevisionsvirksomhed
personkreds
produktionsenheder
sammenhaengendeRegnskaber
skjulOevrigeDokumenter
stamdata
udvidedeOplysninger
virksomhedRegistreringer
virksomhedsMeddelelser


It's all in Danish!

In [4]:
# just a cell for looking at various fields
# I use PyCharm to view it
test = [ c['antalAnsatte'] for c in raw_companies]

### Load the pre-downloaded translations

In [5]:
# oh wow, they have the translations on the site!
# though it might be not so easy to interpret
with open('raw_translations.json') as f:
    raw_translations = json.load(f)

# filter to include only the english ones
raw_translations = [t for t in raw_translations if t['locale'] == 'en']

# convert to an easily-searchable form
raw_translations_text = []
for trans in raw_translations:
    assert trans['type'] == 'Text'
    raw_translations_text.append(f"{trans['code']:100}: {trans['message']}")

# write it out
with open('en_translations.txt', 'w') as f:
    f.write('\n'.join(raw_translations_text))

In [6]:
def collect_keys(obj):
    """
    Collect all the JSON keys used in the document
    """
    ty = type(obj)
    if ty is list:
        return Counter(itertools.chain(*(collect_keys(obj) for obj in obj)))
    elif ty is dict:
        obj: dict
        res = Counter()
        for k, v in obj.items():
            res[k] += 1
            res |= collect_keys(v)
        return res

    elif ty in [str, int, bool, float, type(None)]:
        return Counter()
    else:
        raise RuntimeError(f"Don't know how to handle type {ty}")


### Define the mapping from the danish names to English

In [7]:
# mostly based on google translate and common sense
translations = {
    'antalAnsatte':             'number_of_employees',
    'ejerforhold':              'ownership',
    'foreningsrepraesentanter': 'association_representatives', # like a person that can operate in the name of the company?
    'harManuelSignering':       'has_manual_signing', # a boolean, so prolly like a confirmation?

    'stamdata':             'basic_data', # master data seems to be the main info about the company
    'historiskStamdata':    'historic_basic_data', # old values of the master data

    'hovedselskab':                         'parent_company',
    'oplysningerOmRevisionsvirksomhed':     'information_about_auditing',
    'personkreds':                          'circle_of_people', # maybe line contacts? Or just people connected to the company?
    'produktionsenheder':                   'production_units',
    'sammenhaengendeRegnskaber':            'coherent_accounts',
    'skjulOevrigeDokumenter':               'hide_other_documents',
    'udvidedeOplysninger':                  'extended_information',
    'virksomhedRegistreringer':             'company_registrations',
    'virksomhedsMeddelelser':               'company_announcements',

    'adresse': 'address',
    'aktiveLegaleEjere':    'active_legal_owners',
    'aktiveReelleEjere':    'active_beneficial_owners',
    'formaal':              'purpose', # seems like a mission statement or smth
    'virksomhedsform':      'bussiness_type',

    'hovedbranche':     'main_industry',
    'bibrancher':       'additional_industries',
    'branchekode':      'industry_code',
    'titel':            'title',
    'bibranche':        'additional_industries',

    'navn':                         'name',
    'ophoerteLegaleEjere':          'terminated_legal_owners',
    'begunstigetGruppeNavn':        'beneficiary_group_name',
    'begunstigetGruppeRetskrav':    'beneficiary_group_legal_claim',
    'status':                       'status',
    'email':                        'email',
    'pnummer':                      'p_number', # some domain-specific identifier?
    'cvrnummer':                    'cvr_number',
    'ophoersdato':                  'cessation_date',
    'regnummer':                    'registration_number',
    'telefon':                      'telephone',
    'startdato':                    'start_date',
    'virksomhedsnavn':              'company_name',
    'binavne':                      'secondary_names',
    'reklamebeskyttet':             'advertising_protection',
}

### Print the missing translations with candidates found via fuzzy search

In [8]:
keys = collect_keys(raw_companies)

print("Missing translations:\n")
untranslated_count = 0
for k, count in keys.most_common():
    if k not in translations:
        untranslated_count += 1

        # emitting too much info is not really convienent
        if untranslated_count < 40:
            print(f"{k:70} ({count} counts)")

            # search for possible candidates in the raw_translations_text

            r = []
            if len(k) > 4:
                for text in raw_translations_text:
                    if fuzzysearch.find_near_matches(k, text, max_l_dist=4):
                        r.append(text)

            if r:
                print(*r, sep='\n')
            print()

print(f"\nIn total {untranslated_count} untranslated keys")

Missing translations:

kvartalsbeskaeftigelse                                                 (1578 counts)

maanedsbeskaeftigelse                                                  (1578 counts)

bestyrelseAnsesSomReelleEjere                                          (1578 counts)

ejerregistreringUnderFemProcent                                        (1578 counts)

ophoerteReelleEjere                                                    (1578 counts)
soeg-filter-personrolle-ophoert_reelle_ejere-label                                                  : Beneficial owners
ejerforhold-ophoerte-reelle-ejere-label                                                             : Terminated beneficial owners

virksomhedHarIkkeKunnetIdentificereReelleEjereLedelseErIndsat          (1578 counts)

virksomhedHarIkkeReelleEjereOgLedelseErIndsat                          (1578 counts)

registreretKapital                                                     (1578 counts)
historisk-stamdata-registreret_kapital-

### Now translate the keys using the translation mapping defined above

In [9]:
def translate_keys(obj, translations: Dict[str, object]):
    ty = type(obj)
    if ty is list:
        return [ translate_keys(obj, translations) for obj in obj ]
    elif ty is dict:
        obj: dict
        res = dict()
        for k, v in obj.items():
            k = translations.get(k, k)
            res[k] = translate_keys(v, translations)
        return res

    elif ty in [str, int, bool, float, type(None)]:
        return obj
    else:
        raise RuntimeError(f"Don't know how to handle type {ty}")

In [10]:
translated_companies = translate_keys(raw_companies, translations)

### Build the schema using `genson`

In [11]:
schema_builder = SchemaBuilder()
for company in translated_companies:
    schema_builder.add_object(company)

schema = schema_builder.to_schema()

### Save the results

In [12]:
# translations
with open('translations.json', 'w') as f:
    json.dump(translations, f, indent=4)

In [13]:
# schema
with open('schema.json', 'w') as f:
    json.dump(schema, f, indent=4)

In [14]:
# translated companies
with open('companies.json', 'w') as f:
    json.dump(translated_companies, f, indent=4)