# Complete GOV extract #10
Issue link: https://github.com/CorrelAid/compgen-ii-cgv/issues/10

In [1]:
%load_ext autoreload
%autoreload 2
#%load_ext memory_profiler
#%load_ext line_profiler

In [2]:
import pandas as pd
import numpy as np
from compgen2 import Gov, Matcher, const
from pathlib import Path
import sys

## Initializing GOV instance

In [3]:
data_root = "../data"
gov = Gov(data_root)

In [4]:
gov.load_data()

In [5]:
gov.build_indices()

In [6]:
#gov.clear_data() # needed when pickled in pipeline

### Attributes

In [7]:
for attr in ["items", "names", "types", "relations", "type_names"]:
    print(f"Data: {attr}")
    print(getattr(gov, attr).head())

Data: items
    id    textual_id  deleted
0  500  ADLING_W8351    False
1  501  ADLING_W8359    False
2  502  ADLITZ_W8581    False
3  503  ADLKELKO14FU    False
4  504  ADLLDEKO14FT    False
Data: names
    id                   content language  time_begin    time_end
0  500                Adlwarting      deu -2147483648  2147483647
1  501                 Adlmaning      deu -2147483648  2147483647
2  502                    Adlitz      deu -2147483648  2147483647
3  503               Adlerwinkel      deu    24289000  2147483647
4  503  Forsthaus Laukehlischken      deu -2147483648    24292640
Data: types
    id  content  time_begin    time_end
0  500       55 -2147483648  2147483647
1  501       69 -2147483648  2147483647
2  502       55 -2147483648  2147483647
3  503      129 -2147483648    24318210
4  504      129 -2147483648  2147483647
Data: relations
    child  parent  time_begin    time_end
0  105172     849 -2147483648  2147483647
1  120161     849 -2147483648  2147483647
2   98

In [8]:
# all paths are accessible via all_paths attribute
paths = gov.all_paths
list(paths)[:10]

[(190315, 217953, 1354, 795, 300318, 204698),
 (306245, 354642, 1047424, 355595, 269242, 819656),
 (190315, 190317, 1312, 1311, 1245, 1041974, 111117),
 (190315, 190317, 1281054, 1320, 190259, 1077163, 15376),
 (190315, 190729, 293588, 190735, 265071, 1261010),
 (306245, 354642, 276529, 356661, 362642, 1223707),
 (190315, 190317, 190337, 190356, 311757, 142216, 129341),
 (306245, 318701, 316711, 309960, 309970, 1115310),
 (190315, 352387, 578, 385818, 142603),
 (306245, 318701, 215344, 215408, 364134, 247107, 237565)]

In [9]:
gov.items_by_id

{218129: 'object_218129',
 220100: 'object_220100',
 306245: 'object_306245',
 191050: 'object_191050',
 190315: 'object_190315',
 191059: 'object_191059',
 220707: 'MAUICHJN39DO',
 191063: 'object_191063',
 404923: 'object_404923',
 191056: 'object_191056',
 220764: 'MAIFERJN39CM',
 220496: 'JENHOFJN29XT',
 1285491: 'QINDAOPM06EC',
 220105: 'SCHAANJN47SE',
 352387: 'object_352387',
 191057: 'object_191057',
 220645: 'WEIHOFJN39BO',
 190317: 'object_190317',
 190873: 'object_190873',
 191072: 'object_191072',
 220103: 'TRISENJN47SC',
 1331: 'adm_369040',
 220840: 'MOUENKJN39CR',
 218152: 'object_218152',
 191065: 'object_191065',
 220107: 'PLAKENJN47SE',
 220732: 'GREUERJN39CP',
 191069: 'object_191069',
 220848: 'SCHHLEJN39CR',
 220785: 'WICEIDJN39CS',
 220521: 'MAILTZJN29XQ',
 191071: 'object_191071',
 220454: 'TOCHLEJN29VW',
 191075: 'object_191075',
 220109: 'GAMERNJN47SF',
 220663: 'LEIHOFJN39BT',
 217750: 'object_217750',
 220603: 'GRUHLEJN29WV',
 220335: 'HAUZIGJN39AT',
 1085290

In [10]:
gov.names_by_id

defaultdict(None,
            {218129: {'luxemburg'},
             220100: {'liechtenstein'},
             306245: {'österreich-ungarn (monarchie)'},
             191050: {'schweiz', 'schweizerische eidgenossenschaft'},
             190315: {'deutsches reich'},
             191059: {'obwalden'},
             220707: {'maulin diederich'},
             191063: {'luzern'},
             404923: {'deutsch-südwestafrika'},
             191056: {'neuenburg'},
             220764: {'maison peffer'},
             220496: {'jenkenhof'},
             1285491: {'tsingtau'},
             220105: {'schaan'},
             352387: {'oldenburg'},
             191057: {'wallis'},
             220645: {'weimershof'},
             190317: {'preußen'},
             190873: {'anhalt'},
             191072: {'st. gallen'},
             220103: {'triesen'},
             1331: {'bremen', 'freie hansestadt bremen'},
             220840: {'moulin schenk'},
             218152: {'waldeck-pyrmont'},
             1

In [11]:
gov.ids_by_name

defaultdict(None,
            {'luxemburg': {80090, 218129, 220828, 1260199},
             'liechtenstein': {220100},
             'österreich-ungarn (monarchie)': {306245},
             'schweizerische eidgenossenschaft': {191050},
             'schweiz': {114189, 191050},
             'deutsches reich': {190315},
             'obwalden': {191059, 191127},
             'maulin diederich': {220707},
             'luzern': {191063, 191117, 191874},
             'deutsch-südwestafrika': {404923},
             'neuenburg': {90586,
              90597,
              90608,
              90609,
              191056,
              191254,
              284661,
              1050461,
              1072404,
              1196006},
             'maison peffer': {220764},
             'jenkenhof': {220199, 220496},
             'tsingtau': {1285491, 1285812},
             'schaan': {112899, 220105, 220116},
             'oldenburg': {573,
              583,
              96362,
              963

In [12]:
gov.types_by_id

defaultdict(None,
            {218129: {61},
             220100: {60},
             306245: {71},
             191050: {50},
             190315: {130},
             191059: {25},
             220707: {39},
             191063: {25},
             404923: {277},
             191056: {25},
             220764: {39},
             220496: {39},
             1285491: {277},
             220105: {18},
             352387: {61},
             191057: {25},
             220645: {39},
             190317: {31},
             190873: {23},
             191072: {25},
             220103: {18},
             1331: {16},
             220840: {39},
             218152: {60},
             191065: {25},
             220107: {18},
             220732: {39},
             191069: {25},
             220848: {39},
             220785: {39},
             220521: {39},
             191071: {25},
             220454: {39},
             191075: {25},
             220109: {18},
             220663: {39},
        

In [13]:
gov.all_reachable_nodes_by_id

defaultdict(set,
            {190315: {1048576,
              1048580,
              1048583,
              1048584,
              1048585,
              1048587,
              1048588,
              1048591,
              1048592,
              1048593,
              208416,
              1048595,
              1048596,
              1048597,
              1048598,
              1048599,
              208417,
              1048601,
              1048594,
              1048603,
              208418,
              1048605,
              1048606,
              1048607,
              1048608,
              1048609,
              1048610,
              1048611,
              1048612,
              1048613,
              1048614,
              1048615,
              1048616,
              1048617,
              1048618,
              1048619,
              1048620,
              1048621,
              1048622,
              208423,
              208425,
              1250489,
              

## Using GOV instance

In [14]:
pmax = max(paths, key=lambda p: len(p))
pmin = min(paths, key=lambda p: len(p))

In [15]:
pmax, pmin

((190315, 190317, 190325, 1335, 621, 1140583, 1131062, 20832, 45811, 111286),
 (218129, 220838))

In [16]:
gov.decode_path_name(pmax)

('deutsches reich',
 'preußen',
 'westphalen',
 'münster',
 'coesfeld',
 'rorup',
 'darup',
 'darup',
 'hanrorup',
 'rüsch')

In [17]:
gov.decode_path_id(pmax)

('object_190315',
 'object_190317',
 'object_190325',
 'adm_369055',
 'adm_135558',
 'object_1140583',
 'DARRU1JO31PW',
 'DARRUPJO31PW',
 'HANRUPJO31PW',
 'RUSSCHJO31PV')

In [18]:
gov.names_by_id[190315]

{'deutsches reich'}

In [19]:
gov.decode_path_type(pmax)

('Bundesstaat',
 'Königreich',
 'Provinz',
 'Regierungsbezirk',
 'Landkreis',
 'Amt',
 'Gemeinde',
 'Dorf',
 'Bauerschaft',
 'Hof')

In [20]:
gov.decode_path_name(pmin)

('luxemburg', 'grund')

In [21]:
gov.decode_path_id(pmin)

('object_218129', 'GRUUNDJN39BO')

In [22]:
gov.decode_path_type(pmin)

('Großherzogtum', 'Ort')

In [23]:
gov.ids_by_name["krefeld"]

{593, 70108, 311786, 1160594}

In [24]:
gov.decode_path_name(tuple(gov.ids_by_name["krefeld"]))

('krefeld', 'krefeld', 'krefeld', 'krefeld')

## Using Matcher

In [25]:
matcher = Matcher(gov)

In [26]:
matcher.get_match_for_locations(["Blasdorf, Landeshut"])

Processing locations: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10754.63it/s]


In [27]:
matcher.results

{'Blasdorf, Landeshut': {'parts': {'blasdorf': {'in_gov': True,
    'candidates': ['blasdorf'],
    'anchor': True},
   'landeshut': {'in_gov': True, 'candidates': ['landeshut'], 'anchor': True}},
  'anchor_method': 'gov complete',
  'possible_matches': [{'blasdorf': {'gov_id': 187906,
     'textual_id': 'object_187906',
     'type_ids': [55],
     'type_names': ['Dorf']},
    'landeshut': {'gov_id': 1227,
     'textual_id': 'adm_169760',
     'type_ids': [36],
     'type_names': ['Landkreis']}}]}}

## Weitere Beispiele

In [28]:
matcher.get_match_for_locations("Aach, Freudenstadt")

Processing locations: 100%|████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:10<00:00,  1.74it/s]


In [29]:
matcher.results

{'Blasdorf, Landeshut': {'parts': {'blasdorf': {'in_gov': True,
    'candidates': ['blasdorf'],
    'anchor': True},
   'landeshut': {'in_gov': True, 'candidates': ['landeshut'], 'anchor': True}},
  'anchor_method': 'gov complete',
  'possible_matches': [{'blasdorf': {'gov_id': 187906,
     'textual_id': 'object_187906',
     'type_ids': [55],
     'type_names': ['Dorf']},
    'landeshut': {'gov_id': 1227,
     'textual_id': 'adm_169760',
     'type_ids': [36],
     'type_names': ['Landkreis']}}]},
 'A': {'parts': {'a': {'in_gov': False,
    'candidates': ['ay', 'au', 'aa'],
    'anchor': True}},
  'anchor_method': 'ALL GOV | Cost 3',
  'possible_matches': [{'ay': {'gov_id': 5776,
     'textual_id': 'AYXXAY_W8313',
     'type_ids': [67],
     'type_names': ['Einöde']}},
   {'ay': {'gov_id': 5777,
     'textual_id': 'AYXXAY_W8315',
     'type_ids': [67],
     'type_names': ['Einöde']}},
   {'ay': {'gov_id': 211469,
     'textual_id': 'AYXXAY_W8445',
     'type_ids': [69],
     'type_nam

In [30]:
matcher.get_match_for_locations("Neustadt, Sachsen")

Processing locations: 100%|████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:09<00:00,  1.74it/s]


In [31]:
matcher.results

{'Blasdorf, Landeshut': {'parts': {'blasdorf': {'in_gov': True,
    'candidates': ['blasdorf'],
    'anchor': True},
   'landeshut': {'in_gov': True, 'candidates': ['landeshut'], 'anchor': True}},
  'anchor_method': 'gov complete',
  'possible_matches': [{'blasdorf': {'gov_id': 187906,
     'textual_id': 'object_187906',
     'type_ids': [55],
     'type_names': ['Dorf']},
    'landeshut': {'gov_id': 1227,
     'textual_id': 'adm_169760',
     'type_ids': [36],
     'type_names': ['Landkreis']}}]},
 'A': {'parts': {'a': {'in_gov': False,
    'candidates': ['ay', 'au', 'aa'],
    'anchor': True}},
  'anchor_method': 'ALL GOV | Cost 3',
  'possible_matches': [{'ay': {'gov_id': 5776,
     'textual_id': 'AYXXAY_W8313',
     'type_ids': [67],
     'type_names': ['Einöde']}},
   {'ay': {'gov_id': 5777,
     'textual_id': 'AYXXAY_W8315',
     'type_ids': [67],
     'type_names': ['Einöde']}},
   {'ay': {'gov_id': 211469,
     'textual_id': 'AYXXAY_W8445',
     'type_ids': [69],
     'type_nam