In [1]:
# only run if you are developing/debugging
# !pip install line_profiler
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import random

from collections import Counter

from compgen2 import Gov, Matcher, GovTestData, Preprocessing
from compgen2.const import FILENAME_VL, FILENAME_GOV_TEST_SET
from compgen2.testdata import sample_test_set_from_gov, Synthetic, get_accuracy

#random.seed(1337)

In [3]:
# Directory (relative to the notebook) that all input files are read from.
data_root = Path("..") / "data"

## Test sets
We use 4 test sets:
- different sample sets from the file "deutsche-verlustlisten-1wk"
- different sample sets from the gov database 
- different sample sets from a synthetic data set that tries to mimic the errors found in the original "verlustliste"
- manually collected correction suggestions from http://wiki-de.genealogy.net/Verlustlisten_Erster_Weltkrieg/Projekt/Ortsnamen

All test sets change when preprocessing is applied.

**Note**: We have a ground truth for all test sets but the first one, as the "verlustliste" is the actual problem we want to solve. So for the test set "verlustliste" we cannot calculate an accuracy score, but we can compare how many items we were able to match. The assumption is that more matches are, in general, better.

## Test suite without Preprocessing -> Baseline

In [4]:
# Collects one row of metrics (a list) per test-suite configuration.
final_results = list()

In [5]:
# Create the Gov instance (from the files below data_root) that the baseline
# test sets and matchers are built on.
gov = Gov(data_root)
gov.load_data()
# presumably this populates the lookup tables (e.g. ids_by_name) read later — TODO confirm
gov.build_indices()

In [6]:
# Test Set 1: VL
assert (data_root / FILENAME_VL).exists()
test_set_size = 100

# The "location" column holds the test data; the ground truth is unknown.
vl = pd.read_parquet(data_root / FILENAME_VL)

# One random sample per number of location parts (1, 2 and 3) ...
vl_test_sets = [
    (
        f"vl test set with loc_count={parts_count}",
        vl.query(f"loc_parts_count == {parts_count}").sample(test_set_size),
    )
    for parts_count in (1, 2, 3)
]

# ... plus one random sample of locations that contain a literal '.'.
has_dot = vl.location.str.contains(".", regex=False)
vl_test_sets.append(("vl test set containing '.'", vl[has_dot].sample(test_set_size)))

In [7]:
# Test Set 2: Gov database
test_set_size = 100

# One sample per combination of number of parts (1-3) and `valid` value (1, 0.7).
gov_test_sets = []
for num_parts in (1, 2, 3):
    for valid in (1, 0.7):
        label = f"gov db test set with loc_count={num_parts} and valid={valid}"
        sample = sample_test_set_from_gov(gov, size=test_set_size, num_parts=num_parts, valid=valid)
        gov_test_sets.append((label, sample))

In [16]:
# Test Set 3: Synthetic
test_set_size = 100

syn = Synthetic(gov)

# (label used in the test-set name, distortion factor passed to the generator).
# The labels are spelled out so the test-set names keep their "0." / "1." form.
distortion_levels = [("0.", 0.0), ("1.", 1.0)]

# One synthetic test set per combination of number of parts (1-3) and distortion.
# Fixes: the original cell contained the syntax errors `syn_test_sets. = []` and
# `syn_test_sets..append(...)`, and one call passed a stray positional `gov`
# argument although `syn` was already constructed from `gov`.
syn_test_sets = []
for num_parts in (1, 2, 3):
    for label, distortion_factor in distortion_levels:
        syn_test_sets.append(
            (
                f"syn test set with loc_count={num_parts} and distortion={label}",
                syn.create_synthetic_test_set(
                    size=test_set_size,
                    num_parts=num_parts,
                    distortion_factor=distortion_factor,
                ),
            )
        )

In [18]:
# Test Set 4: GovTestData
assert (data_root / FILENAME_GOV_TEST_SET).exists()

gtd = GovTestData(gov)
# A single test set, wrapped in a list so it is handled like the other suites.
gtd_test_sets = [("gov web test set", gtd.get_test_set())]

### Run the tests

In [19]:
# Metrics of the baseline run; filled by the test loops below.
result_row = list()

Test Set 1

In [24]:
# Run the matcher on every VL test set. There is no ground truth here, so only
# the share of locations with at least one possible match is recorded.
test_results = {}
for name, test_set in vl_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")
    print(Counter(matcher.anchor_method))

    # Keep the matcher so its results can be inspected afterwards.
    test_results[name] = matcher
    print()

    result_row.append(match_rate)

Running vl test set with loc_count=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:54<00:00,  1.83it/s]


Total matches: 71 (71.0%).
Counter({'gov only': 32, 'No anchor at all': 29, 'Cost 2': 20, 'Cost 3': 17, 'Cost 1': 2})

Running vl test set with loc_count=2


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.72it/s]


Total matches: 92 (92.0%).
Counter({'gov only': 87, 'Cost 2': 6, 'Cost 1': 4, 'Cost 3': 3})

Running vl test set with loc_count=3


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  7.87it/s]


Total matches: 64 (64.0%).
Counter({'gov only': 93, 'Cost 2': 4, 'Cost 3': 3})

Running vl test set containing '.'


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.06it/s]

Total matches: 88 (88.0%).
Counter({'gov only': 60, 'Cost 3': 15, 'Cost 2': 13, 'No anchor at all': 11, 'Cost 1': 1})






In [25]:
# Display the raw matcher results stored for the first VL test set.
test_results["vl test set with loc_count=1"].results

{'Pfaffen': {'parts': {'pfaffen': {'in_gov': True, 'candidates': ['pfaffen']}},
  'possible_matches': [{'pfaffen': {'gov_id': 246046,
     'textual_id': 'PFAFENJN77VM',
     'type_ids': [39],
     'type_names': ['Ort']}}]},
 'Roßnow': {'parts': {'roßnow': {'in_gov': True, 'candidates': ['roßnow']}},
  'possible_matches': [{'roßnow': {'gov_id': 109724,
     'textual_id': 'ROSSNOJO84DB',
     'type_ids': [85],
     'type_names': ['Landgemeinde']}},
   {'roßnow': {'gov_id': 325893,
     'textual_id': 'object_325893',
     'type_ids': [108],
     'type_names': ['Gutsbezirk']}}]},
 'Liebenau b. Hannover': {'parts': {'liebenau b. hannover': {'in_gov': False,
    'candidates': []}},
  'possible_matches': []},
 'Oberteuringen': {'parts': {'oberteuringen': {'in_gov': True,
    'candidates': ['oberteuringen']}},
  'possible_matches': [{'oberteuringen': {'gov_id': 197211,
     'textual_id': 'OBEGENJN47RR',
     'type_ids': [65],
     'type_names': ['Pfarrdorf']}}]},
 'Tauberbischofsheim i. Bad.':

Test Set 2

In [12]:
# Run the matcher on every gov-db test set and record match rate and accuracy.
for name, test_set in gov_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running gov db test set with loc_count=1 and valid=1


Processing locations: 100%|████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 46015.40it/s]

Total matches: 998 (99.8%).
Accuracy (entries where all parts of truth are in possible matches): 0.998






Running gov db test set with loc_count=1 and valid=0.7


Processing locations: 100%|████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 49200.05it/s]

Total matches: 700 (70.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.699






Running gov db test set with loc_count=2 and valid=1


Processing locations: 100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 813.93it/s]


Total matches: 993 (99.3%).
Accuracy (entries where all parts of truth are in possible matches): 0.992

Running gov db test set with loc_count=2 and valid=0.7


Processing locations: 100%|████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 28703.34it/s]

Total matches: 698 (69.8%).
Accuracy (entries where all parts of truth are in possible matches): 0.696






Running gov db test set with loc_count=3 and valid=1


Processing locations: 100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:16<00:00, 60.27it/s]


Total matches: 994 (99.4%).
Accuracy (entries where all parts of truth are in possible matches): 0.991

Running gov db test set with loc_count=3 and valid=0.7


Processing locations: 100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 771.78it/s]

Total matches: 696 (69.6%).
Accuracy (entries where all parts of truth are in possible matches): 0.696






Test Set 3

In [None]:
# Run the matcher on every synthetic test set and record match rate and accuracy.
for name, test_set in syn_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Test Set 4

In [14]:
# Run the matcher on the manually collected gov web test set and record match
# rate and accuracy.
for name, test_set in gtd_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running gov web test set


Processing locations: 100%|████████████████████████████████████████████████████████| 5641/5641 [06:43<00:00, 13.98it/s]

Total matches: 5029 (90.1739%).
Accuracy (entries where all parts of truth are in possible matches): 0.6462






In [15]:
# Store the metrics of the baseline run as one row of the final results.
# NOTE: re-running this cell appends the same row a second time.
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters, no substitution

In [26]:
# Fresh metrics row for the run with corrections + character replacement.
result_row = list()

In [27]:
# Apply the VL preprocessing (corrections first, then character replacement) to
# the "location" column of every test set and, where present, to "truth".
# NOTE: the test sets are modified in place, so this cell is not idempotent.
vl_steps = (Preprocessing.replace_corrections_vl, Preprocessing.replace_characters_vl)
for _, test_set in vl_test_sets + gov_test_sets + syn_test_sets + gtd_test_sets:
    columns = ["location", "truth"] if "truth" in test_set else ["location"]
    for column in columns:
        for step in vl_steps:
            test_set[column] = step(test_set[column])

In [33]:
from collections import defaultdict

In [36]:
# Reload Gov from scratch, then re-key its name indices by the preprocessed names.
gov = Gov(data_root)
gov.load_data()
gov.build_indices()

old_names = list(gov.ids_by_name.keys())
new_names = Preprocessing.replace_characters_gov(pd.Series(old_names, dtype=str))

# Preprocessed name -> ids; names that collapse to the same string merge their ids.
ids_by_pname = defaultdict(set)
for original_name, processed_name in zip(old_names, new_names):
    ids_by_pname[processed_name].update(gov.ids_by_name[original_name])
# Disable the default factory so unknown names raise KeyError instead of being created.
ids_by_pname.default_factory = None
gov.ids_by_name = ids_by_pname

# Inverse mapping: id -> preprocessed names.
pnames_by_id = defaultdict(set)
for processed_name, gov_ids in ids_by_pname.items():
    for gov_id in gov_ids:
        pnames_by_id[gov_id].add(processed_name)
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [37]:
# Run the matcher on the preprocessed VL test sets. There is no ground truth,
# so only the share of locations with at least one possible match is recorded.
for name, test_set in vl_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")
    print()

    result_row.append(match_rate)

Running vl test set with loc_count=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:50<00:00,  1.97it/s]


Total matches: 76 (76.0%).

Running vl test set with loc_count=2


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.72it/s]


Total matches: 92 (92.0%).

Running vl test set with loc_count=3


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.44it/s]


Total matches: 63 (63.0%).

Running vl test set containing '.'


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:46<00:00,  2.16it/s]

Total matches: 87 (87.0%).






Test Set 2

In [38]:
# Run the matcher on the preprocessed gov-db test sets and record match rate
# and accuracy.
for name, test_set in gov_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running gov db test set with loc_count=1 and valid=1


Processing locations: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 50957.40it/s]

Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running gov db test set with loc_count=1 and valid=0.7


Processing locations: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 184.45it/s]


Total matches: 70 (70.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.69

Running gov db test set with loc_count=2 and valid=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.79it/s]


Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0

Running gov db test set with loc_count=2 and valid=0.7


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.39it/s]


Total matches: 68 (68.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.67

Running gov db test set with loc_count=3 and valid=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.43it/s]


Total matches: 98 (98.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.98

Running gov db test set with loc_count=3 and valid=0.7


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.12it/s]

Total matches: 66 (66.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.66






Test Set 3

In [39]:
# Run the matcher on the preprocessed synthetic test sets and record match rate
# and accuracy.
for name, test_set in syn_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running syn test set with default probabilities


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  7.09it/s]

Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.78






Test Set 4

In [None]:
# Run the matcher on the preprocessed gov web test set and record match rate
# and accuracy.
for name, test_set in gtd_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

In [None]:
# Store the metrics of the corrections + characters run as one row of the final results.
# NOTE: re-running this cell appends the same row a second time.
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters + substitution

In [41]:
# Fresh metrics row for the run that additionally applies the substitutions.
result_row = list()

In [42]:
# Apply the word substitutions (partial words, then delete words, then full
# words) to the "location" column of every test set and, where present, to "truth".
# NOTE: the test sets are modified in place, so this cell is not idempotent.
substitution_steps = (
    Preprocessing.substitute_partial_words,
    Preprocessing.substitute_delete_words,
    Preprocessing.substitute_full_words,
)
for _, test_set in vl_test_sets + gov_test_sets + syn_test_sets + gtd_test_sets:
    columns = ["location", "truth"] if "truth" in test_set else ["location"]
    for column in columns:
        for step in substitution_steps:
            test_set[column] = step(test_set[column], data_root)

In [43]:
# Reload Gov from scratch, then re-key its name indices by the fully
# preprocessed names (character replacement followed by the three substitutions).
gov = Gov(data_root)
gov.load_data()
gov.build_indices()

old_names = list(gov.ids_by_name.keys())
new_names = Preprocessing.replace_characters_gov(pd.Series(old_names, dtype=str))
for substitute in (
    Preprocessing.substitute_partial_words,
    Preprocessing.substitute_delete_words,
    Preprocessing.substitute_full_words,
):
    new_names = substitute(pd.Series(new_names), data_root)

# Preprocessed name -> ids; names that collapse to the same string merge their ids.
ids_by_pname = defaultdict(set)
for original_name, processed_name in zip(old_names, new_names):
    ids_by_pname[processed_name].update(gov.ids_by_name[original_name])
# Disable the default factory so unknown names raise KeyError instead of being created.
ids_by_pname.default_factory = None
gov.ids_by_name = ids_by_pname

# Inverse mapping: id -> preprocessed names.
pnames_by_id = defaultdict(set)
for processed_name, gov_ids in ids_by_pname.items():
    for gov_id in gov_ids:
        pnames_by_id[gov_id].add(processed_name)
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [None]:
# Run the matcher on the substituted VL test sets. There is no ground truth,
# so only the share of locations with at least one possible match is recorded.
for name, test_set in vl_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")
    print()

    result_row.append(match_rate)

Test Set 2

In [45]:
# Run the matcher on the substituted gov-db test sets and record match rate
# and accuracy.
for name, test_set in gov_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running gov db test set with loc_count=1 and valid=1


Processing locations: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 29976.44it/s]

Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running gov db test set with loc_count=1 and valid=0.7


Processing locations: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 176.99it/s]


Total matches: 70 (70.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.69

Running gov db test set with loc_count=2 and valid=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.86it/s]


Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0

Running gov db test set with loc_count=2 and valid=0.7


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.51it/s]


Total matches: 68 (68.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.67

Running gov db test set with loc_count=3 and valid=1


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.55it/s]


Total matches: 97 (97.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.97

Running gov db test set with loc_count=3 and valid=0.7


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.26it/s]

Total matches: 66 (66.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.66






Test Set 3

In [46]:
# Run the matcher on the substituted synthetic test sets and record match rate
# and accuracy.
for name, test_set in syn_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

Running syn test set with default probabilities


Processing locations: 100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.36it/s]

Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.78






Test Set 4

In [None]:
# Run the matcher on the substituted gov web test set and record match rate
# and accuracy.
for name, test_set in gtd_test_sets:
    matcher = Matcher(gov)
    print("Running", name)
    matcher.get_match_for_locations(test_set.location)

    # An entry counts as matched when its "possible_matches" value is truthy.
    matched = sum(1 for entry in matcher.results.values() if entry.get("possible_matches"))
    match_rate = matched / test_set.location.nunique()
    print(f"Total matches: {matched} ({round(match_rate * 100, 4)}%).")

    accuracy = get_accuracy(matcher.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_rate, accuracy])

In [None]:
# Store the metrics of the corrections + characters + substitution run as one row
# of the final results.
# NOTE: re-running this cell appends the same row a second time.
final_results.append(result_row)

## Evaluation

In [None]:
# Build the column names in the order the metrics were appended to each result
# row: VL test sets contribute only "total matches" (no ground truth), all
# other test sets contribute "total matches" followed by "accuracy".
names = []
for name, _test_set in vl_test_sets + gov_test_sets + syn_test_sets + gtd_test_sets:
    metrics = ['total matches'] if name.startswith("vl") else ['total matches', 'accuracy']
    names.extend([f"{name} {metric}" for metric in metrics])

# NOTE: rebinds final_results from a list of rows to a DataFrame.
final_results = pd.DataFrame(final_results, columns=names)

In [None]:
# One label per row appended to final_results, in append order: baseline,
# corrections + characters, corrections + characters + substitution.
# The original list had a fourth label ("Preprocessing VL (corrections +
# characters)") without a matching run, so the assignment failed with a
# length mismatch against the three collected rows.
final_results["test set"] = [
    "Baseline",
    "Preprocessing VL + Gov (corrections + characters)",
    "Preprocessing VL + Gov (corrections + characters + substitution)",
]

In [None]:
# Use the configuration label as the row index of the results table.
final_results = final_results.set_index("test set")

In [None]:
# Write the results table to a CSV file (relative path, i.e. the current working directory).
final_results.to_csv("final_results.csv")