In [1]:
# only run if you are developping/debugging
# !pip install line_profiler
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import random

from collections import Counter

from compgen2 import Gov, Matcher, GovTestData, Preprocessing
from compgen2.const import FILENAME_VL, FILENAME_GOV_TEST_SET
from compgen2.testdata import sample_test_set_from_gov, Synthetic, get_accuracy

random.seed(1337)

In [3]:
data_root= Path("../data")

In [4]:
# settings for matcher
matcher_params = {
    "use_difflib": True,  # False will use levenshtein
    "use_phonetic": True,
    "max_cost": 3,
    "search_kreis_first": True,
}

## Test sets
We use 4 test sets:
- different sample sets from the file "deutsche-verlustlisten-1wk"
- different sample sets from the gov database 
- different sample sets from a synthetic data set that tries to mimic the errors found in the original "verlustliste"
- manually collected correction suggestions from http://wiki-de.genealogy.net/Verlustlisten_Erster_Weltkrieg/Projekt/Ortsnamen

All test sets change when preprocessing is applied.

**Note**: We have a ground truth for all test setsbut the first one as the "verlustliste" is the actual problem we want to solve. So for the test set "verlustliste" we cannot calculate an accuracy score but we can compare who many items we were able to match. Assumption is that more matches are, in general, better.

## Test suite without Preprocessing -> Baseline

In [5]:
final_results = []

In [6]:
gov = Gov(data_root)
gov.load_data()
gov.build_indices()

In [7]:
# Test Set 1: VL
assert data_root.joinpath(FILENAME_VL).exists()
test_set_size = 100

vl = pd.read_parquet(data_root / FILENAME_VL)  # location column has the test data, truth is unknown

vl_test_sets = []
vl_test_sets.append(("vl test set with loc_count=1", vl.query("loc_parts_count == 1").sample(test_set_size)))
vl_test_sets.append(("vl test set with loc_count=2", vl.query("loc_parts_count == 2").sample(test_set_size)))
vl_test_sets.append(("vl test set with loc_count=3", vl.query("loc_parts_count == 3").sample(test_set_size)))
vl_test_sets.append(("vl test set containing '.'", vl[vl.location.str.contains(".", regex=False)].sample(test_set_size)))

# Test Set 2: Gov database
test_set_size = 100

gov_test_sets = []
gov_test_sets.append(("gov db test set with loc_count=1 and valid=1", sample_test_set_from_gov(gov, size=test_set_size, num_parts=1, valid=1)))
gov_test_sets.append(("gov db test set with loc_count=1 and valid=0.7", sample_test_set_from_gov(gov, size=test_set_size, num_parts=1, valid=0.7)))
gov_test_sets.append(("gov db test set with loc_count=2 and valid=1", sample_test_set_from_gov(gov, size=test_set_size, num_parts=2, valid=1)))
gov_test_sets.append(("gov db test set with loc_count=2 and valid=0.7", sample_test_set_from_gov(gov, size=test_set_size, num_parts=2, valid=0.7)))
gov_test_sets.append(("gov db test set with loc_count=3 and valid=1", sample_test_set_from_gov(gov, size=test_set_size, num_parts=3, valid=1)))
gov_test_sets.append(("gov db test set with loc_count=3 and valid=0.7", sample_test_set_from_gov(gov, size=test_set_size, num_parts=3, valid=0.7)))

In [8]:
# Test Set 3: Synthetic
test_set_size = 100

syn = Synthetic(gov)

syn_test_sets = []
syn_test_sets.append(("syn test set with loc_count=1 and distortion=0.", syn.create_synthetic_test_set(size=test_set_size, num_parts=1, distortion_factor=0.)))
syn_test_sets.append(("syn test set with loc_count=1 and distortion=1.", syn.create_synthetic_test_set(size=test_set_size, num_parts=1, distortion_factor=1.)))
syn_test_sets.append(("syn test set with loc_count=2 and distortion=0.", syn.create_synthetic_test_set(size=test_set_size, num_parts=2, distortion_factor=0.)))
syn_test_sets.append(("syn test set with loc_count=2 and distortion=1.", syn.create_synthetic_test_set(size=test_set_size, num_parts=2, distortion_factor=1.)))
syn_test_sets.append(("syn test set with loc_count=3 and distortion=0.", syn.create_synthetic_test_set(size=test_set_size, num_parts=3, distortion_factor=0.)))
syn_test_sets.append(("syn test set with loc_count=3 and distortion=1.", syn.create_synthetic_test_set(size=test_set_size, num_parts=3, distortion_factor=1.)))

In [9]:
# Test Set 4: GovTestData
assert data_root.joinpath(FILENAME_GOV_TEST_SET).exists()

gtd = GovTestData(gov)
gtd_test_sets = []
gtd_test_sets.append(("gov web test set", gtd.get_test_set()))

### Run the tests

In [10]:
result_row = []

Test Set 1

In [11]:
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Rsnunning", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    print()
    
    result_row.append(total_matches / test_set.location.nunique())

Rsnunning vl test set with loc_count=1


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:26<00:00,  3.81it/s]


Counter({'Phonetic': 38, 'gov complete': 28, 'ALL GOV | Cost 3': 19, 'ALL GOV | Cost 2': 8, 'No anchor at all': 5, 'ALL GOV | Cost 1': 2})
Total matches: 95 (95.0%).

Rsnunning vl test set with loc_count=2


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 201.56it/s]


Counter({'gov partial': 64, 'gov complete': 24, 'KREISORSTADT | Cost 2': 6, 'KREISORSTADT | Cost 3': 5, 'KREISORSTADT | Cost 1': 1})
Total matches: 86 (86.0%).

Rsnunning vl test set with loc_count=3


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 104.70it/s]


Counter({'gov partial': 73, 'gov complete': 21, 'KREISORSTADT | Cost 2': 3, 'KREISORSTADT | Cost 1': 2, 'KREISORSTADT | Cost 3': 1})
Total matches: 65 (65.0%).

Rsnunning vl test set containing '.'


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.40it/s]

Counter({'gov partial': 62, 'KREISORSTADT | Cost 3': 13, 'KREISORSTADT | Cost 2': 9, 'ALL GOV | Cost 3': 5, 'Phonetic': 4, 'KREISORSTADT | Cost 1': 2, 'ALL GOV | Cost 2': 2, 'gov complete': 1, 'ALL GOV | Cost 1': 1, 'No anchor at all': 1})
Total matches: 95 (95.0%).






Test Set 2

for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Test Set 3

In [12]:
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running syn test set with loc_count=1 and distortion=0.


Processing locations: 100%|████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 50021.51it/s]

Counter({'gov complete': 100})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running syn test set with loc_count=1 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 39.68it/s]


Counter({'gov complete': 75, 'Phonetic': 18, 'ALL GOV | Cost 1': 5, 'ALL GOV | Cost 2': 2})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.84

Running syn test set with loc_count=2 and distortion=0.


Processing locations: 100%|████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 33338.40it/s]

Counter({'gov complete': 98, 'gov partial': 1})
Total matches: 99 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running syn test set with loc_count=2 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.55it/s]


Counter({'gov complete': 46, 'gov partial': 44, 'KREISORSTADT | Cost 1': 8, 'KREISORSTADT | Cost 3': 1, 'Phonetic': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.92

Running syn test set with loc_count=3 and distortion=0.


Processing locations: 100%|████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 25000.32it/s]

Counter({'gov complete': 94, 'gov partial': 6})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running syn test set with loc_count=3 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:18<00:00,  5.39it/s]

Counter({'gov partial': 69, 'gov complete': 27, 'KREISORSTADT | Cost 1': 2, 'KREISORSTADT | Cost 2': 2})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.86






Test Set 4

In [13]:
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [01:50<00:00, 51.03it/s]


Counter({'gov partial': 3546, 'gov complete': 602, 'Phonetic': 512, 'KREISORSTADT | Cost 2': 435, 'KREISORSTADT | Cost 3': 303, 'KREISORSTADT | Cost 1': 59, 'ALL GOV | Cost 1': 48, 'ALL GOV | Cost 2': 40, 'ALL GOV | Cost 3': 26, 'No anchor at all': 6})
Total matches: 5065 (90.8194%).
Accuracy (entries where all parts of truth are in possible matches): 0.6603



In [14]:
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters, no substitution

In [15]:
result_row = []

In [16]:
for _, test_set in vl_test_sets + syn_test_sets + gtd_test_sets:
    test_set.location = Preprocessing.replace_corrections_vl(test_set.location)
    test_set.location = Preprocessing.replace_characters_vl(test_set.location)
    
    if "truth" in test_set:
        test_set.truth = Preprocessing.replace_corrections_vl(test_set.truth)
        test_set.truth = Preprocessing.replace_characters_vl(test_set.truth)

In [17]:
from collections import defaultdict

In [18]:
old_names = list(gov.ids_by_name.keys())
new_names = Preprocessing.replace_characters_gov(pd.Series(old_names, dtype=str))

ids_by_pname = defaultdict(set)
for old_name, new_name in zip(old_names, new_names):
    ids_by_pname[new_name] |= gov.ids_by_name[old_name]
ids_by_pname.default_factory = None
gov.ids_by_name = ids_by_pname
    
pnames_by_id = defaultdict(set)
for k, v in ids_by_pname.items():
    for i in v:
        pnames_by_id[i] |= {k}
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [19]:
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    print()
    
    result_row.append(total_matches / test_set.location.nunique())

Running vl test set with loc_count=1


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.93it/s]


Counter({'gov complete': 43, 'Phonetic': 38, 'ALL GOV | Cost 3': 9, 'ALL GOV | Cost 2': 8, 'ALL GOV | Cost 1': 2})
Total matches: 100 (100.0%).

Running vl test set with loc_count=2


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 204.76it/s]


Counter({'gov partial': 63, 'gov complete': 25, 'KREISORSTADT | Cost 2': 6, 'KREISORSTADT | Cost 3': 5, 'KREISORSTADT | Cost 1': 1})
Total matches: 85 (85.0%).

Running vl test set with loc_count=3


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 113.77it/s]


Counter({'gov partial': 68, 'gov complete': 28, 'KREISORSTADT | Cost 2': 4})
Total matches: 64 (64.0%).

Running vl test set containing '.'


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.64it/s]

Counter({'gov partial': 63, 'KREISORSTADT | Cost 3': 12, 'KREISORSTADT | Cost 2': 9, 'ALL GOV | Cost 3': 5, 'Phonetic': 4, 'KREISORSTADT | Cost 1': 2, 'gov complete': 2, 'ALL GOV | Cost 2': 2, 'ALL GOV | Cost 1': 1})
Total matches: 96 (96.0%).






Test Set 2

for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Test Set 3

In [20]:
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running syn test set with loc_count=1 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 68.16it/s]


Counter({'gov complete': 99, 'ALL GOV | Cost 3': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.99

Running syn test set with loc_count=1 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.54it/s]


Counter({'gov complete': 74, 'Phonetic': 19, 'ALL GOV | Cost 1': 4, 'ALL GOV | Cost 2': 2, 'ALL GOV | Cost 3': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.82

Running syn test set with loc_count=2 and distortion=0.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 150.13it/s]


Counter({'gov complete': 90, 'gov partial': 9})
Total matches: 99 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.91

Running syn test set with loc_count=2 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.34it/s]


Counter({'gov complete': 45, 'gov partial': 41, 'KREISORSTADT | Cost 1': 9, 'KREISORSTADT | Cost 3': 4, 'Phonetic': 1})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.79

Running syn test set with loc_count=3 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 52.82it/s]


Counter({'gov complete': 81, 'gov partial': 19})
Total matches: 97 (97.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.84

Running syn test set with loc_count=3 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:17<00:00,  5.57it/s]

Counter({'gov partial': 71, 'gov complete': 25, 'KREISORSTADT | Cost 2': 3, 'KREISORSTADT | Cost 1': 1})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.71






Test Set 4

In [21]:
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [01:45<00:00, 53.31it/s]


Counter({'gov partial': 3545, 'gov complete': 609, 'Phonetic': 513, 'KREISORSTADT | Cost 2': 434, 'KREISORSTADT | Cost 3': 297, 'KREISORSTADT | Cost 1': 61, 'ALL GOV | Cost 1': 48, 'ALL GOV | Cost 2': 42, 'ALL GOV | Cost 3': 22, 'No anchor at all': 5})
Total matches: 5063 (90.7999%).
Accuracy (entries where all parts of truth are in possible matches): 0.6621



In [22]:
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters + substitution

In [23]:
result_row = []

In [24]:
for _, test_set in vl_test_sets + syn_test_sets + gtd_test_sets:
    test_set.location = Preprocessing.substitute_partial_words(test_set.location, data_root)
    test_set.location = Preprocessing.substitute_delete_words(test_set.location, data_root)
    test_set.location = Preprocessing.substitute_full_words(test_set.location, data_root)
    
    if "truth" in test_set:
        test_set.truth = Preprocessing.substitute_partial_words(test_set.truth, data_root)
        test_set.truth = Preprocessing.substitute_delete_words(test_set.truth, data_root)
        test_set.truth = Preprocessing.substitute_full_words(test_set.truth, data_root)

In [25]:
old_names = list(gov.ids_by_name.keys())
new_names = Preprocessing.substitute_partial_words(pd.Series(old_names), data_root)
new_names = Preprocessing.substitute_delete_words(pd.Series(new_names), data_root)
new_names = Preprocessing.substitute_full_words(pd.Series(new_names), data_root)

ids_by_pname = defaultdict(set)
for old_name, new_name in zip(old_names, new_names):
    ids_by_pname[new_name] |= gov.ids_by_name[old_name]
ids_by_pname.default_factory = None
gov.ids_by_name = ids_by_pname
    
pnames_by_id = defaultdict(set)
for k, v in ids_by_pname.items():
    for i in v:
        pnames_by_id[i] |= {k}
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [26]:
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    print()
    
    result_row.append(total_matches / test_set.location.nunique())

Running vl test set with loc_count=1


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.34it/s]


Counter({'gov complete': 45, 'Phonetic': 37, 'ALL GOV | Cost 3': 9, 'ALL GOV | Cost 2': 8, 'ALL GOV | Cost 1': 1})
Total matches: 100 (100.0%).

Running vl test set with loc_count=2


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 232.45it/s]


Counter({'gov complete': 49, 'gov partial': 47, 'KREISORSTADT | Cost 2': 4})
Total matches: 79 (79.0%).

Running vl test set with loc_count=3


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 95.98it/s]


Counter({'gov complete': 53, 'gov partial': 47})
Total matches: 58 (58.0%).

Running vl test set containing '.'


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 13.24it/s]

Counter({'gov partial': 43, 'gov complete': 37, 'ALL GOV | Cost 3': 5, 'KREISORSTADT | Cost 2': 5, 'Phonetic': 3, 'ALL GOV | Cost 2': 3, 'KREISORSTADT | Cost 3': 2, 'KREISORSTADT | Cost 1': 2})
Total matches: 88 (88.0%).






In [27]:
vl_test_sets[0][1].location.head(10)

1009502                  stabencinek
690804                         moson
702232                     nagratten
282295                    gammellund
909701          rothenburg a. tauber
595044           lauban i. schlesien
1151499                       wiese 
37286            arendal i. norwegen
55709                     bargerfehn
392317     harthausen i. württemberg
Name: location, dtype: object

In [28]:
m.get_match_for_locations(["rott b. reisdingen"])

Processing locations: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.32s/it]


Test Set 2

for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Test Set 3

In [29]:
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running syn test set with loc_count=1 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 69.44it/s]


Counter({'gov complete': 98, 'ALL GOV | Cost 3': 1, 'Phonetic': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.98

Running syn test set with loc_count=1 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.39it/s]


Counter({'gov complete': 75, 'Phonetic': 19, 'ALL GOV | Cost 1': 4, 'ALL GOV | Cost 2': 1, 'ALL GOV | Cost 3': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.82

Running syn test set with loc_count=2 and distortion=0.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 143.47it/s]


Counter({'gov complete': 89, 'gov partial': 10})
Total matches: 99 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.9

Running syn test set with loc_count=2 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 14.08it/s]


Counter({'gov complete': 45, 'gov partial': 42, 'KREISORSTADT | Cost 1': 9, 'KREISORSTADT | Cost 3': 4})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.76

Running syn test set with loc_count=3 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 44.23it/s]


Counter({'gov complete': 79, 'gov partial': 21})
Total matches: 97 (97.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.82

Running syn test set with loc_count=3 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:18<00:00,  5.36it/s]

Counter({'gov partial': 72, 'gov complete': 25, 'KREISORSTADT | Cost 2': 2, 'KREISORSTADT | Cost 1': 1})
Total matches: 98 (98.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.71






Test Set 4

In [30]:
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))
    
    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")
    
    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()
    
    result_row.append(total_matches / test_set.location.nunique())
    result_row.append(accuracy)

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [01:29<00:00, 63.14it/s]


Counter({'gov partial': 3960, 'gov complete': 700, 'Phonetic': 513, 'KREISORSTADT | Cost 1': 141, 'KREISORSTADT | Cost 2': 84, 'KREISORSTADT | Cost 3': 52, 'ALL GOV | Cost 1': 48, 'ALL GOV | Cost 2': 45, 'ALL GOV | Cost 3': 19, 'No anchor at all': 3})
Total matches: 4980 (89.4879%).
Accuracy (entries where all parts of truth are in possible matches): 0.7148



In [31]:
final_results.append(result_row)

## Auswertung

In [32]:
names = []
for name, test_set in vl_test_sets + syn_test_sets + gtd_test_sets:
    for metric in ['total matches', 'accuracy']:
        if name.startswith("vl") and metric == 'accuracy':
            continue
            
        names.append(name + ' ' + metric)
        
final_results = pd.DataFrame(final_results, columns=names)

In [33]:
final_results["test set"] = ["Baseline", "Preprocessing VL + Gov (corrections + characters)",  "Preprocessing VL + Gov (corrections + characters + substitution)"]

In [34]:
final_results = final_results.set_index("test set")

In [35]:
from datetime import datetime

In [36]:
final_results.to_csv(f"{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}_final_results.csv")