In [5]:
# Only needed while developing/debugging.
# If the profiler is missing, install it into the kernel's environment first:
# %pip install line_profiler
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from pathlib import Path

import pandas as pd
import random

from collections import Counter

from compgen2 import Gov, Matcher, GovTestData, Preprocessing
from compgen2.const import FILENAME_VL, FILENAME_GOV_TEST_SET
from compgen2.testdata import sample_test_set_from_gov, Synthetic, get_accuracy

# Seeds only Python's built-in `random` module. pandas' DataFrame.sample() does not
# draw from it unless an explicit random_state is passed.
# NOTE(review): whether the compgen2 samplers rely on `random` is not visible here — confirm.
random.seed(1337)

In [7]:
# Directory holding the input data files (one level above the working directory).
data_root = Path("..") / "data"

In [24]:
# Keyword arguments forwarded unchanged to every Matcher(gov, **matcher_params) call.
matcher_params = dict(
    use_difflib=True,
    use_phonetic=True,
    max_cost=3,
    search_kreis_first=True,
)

## Test sets
We use 4 test sets:
- different sample sets from the file "deutsche-verlustlisten-1wk"
- different sample sets from the gov database 
- different sample sets from a synthetic data set that tries to mimic the errors found in the original "verlustliste"
- manually collected correction suggestions from http://wiki-de.genealogy.net/Verlustlisten_Erster_Weltkrieg/Projekt/Ortsnamen

All test sets change when preprocessing is applied.

**Note**: We have a ground truth for all test sets but the first one, as the "verlustliste" is the actual problem we want to solve. So for the test set "verlustliste" we cannot calculate an accuracy score, but we can compare how many items we were able to match. The assumption is that more matches are, in general, better.

## Test suite without Preprocessing -> Baseline

In [9]:
# Collects one row of metrics per test suite (baseline and each preprocessing variant).
final_results = list()

In [10]:
# Create the Gov object from the data directory; it is shared by all test sets and matchers below.
gov = Gov(data_root)
gov.load_data()  # presumably reads the GOV source files found under data_root — TODO confirm
gov.build_indices()  # presumably builds lookups such as ids_by_name / names_by_id used later — TODO confirm

In [11]:
# Test Set 1: VL
assert data_root.joinpath(FILENAME_VL).exists()
test_set_size = 100
# DataFrame.sample() does not honour random.seed(); without an explicit random_state
# every run draws different rows and the reported match rates are not reproducible.
sample_seed = 1337

vl = pd.read_parquet(data_root / FILENAME_VL)  # location column has the test data, truth is unknown

# One sample per number of location parts (1 to 3) ...
vl_test_sets = [
    (
        f"vl test set with loc_count={loc_count}",
        vl.query(f"loc_parts_count == {loc_count}").sample(test_set_size, random_state=sample_seed),
    )
    for loc_count in (1, 2, 3)
]
# ... plus one sample of locations containing a literal "." (matched as plain text, not as a regex).
vl_test_sets.append(
    (
        "vl test set containing '.'",
        vl[vl.location.str.contains(".", regex=False)].sample(test_set_size, random_state=sample_seed),
    )
)

# Test Set 2: Gov database
test_set_size = 100

# For each number of location parts, draw one set with valid=1 and one with valid=0.7.
gov_test_sets = [
    (
        f"gov db test set with loc_count={num_parts} and valid={valid}",
        sample_test_set_from_gov(gov, size=test_set_size, num_parts=num_parts, valid=valid),
    )
    for num_parts in (1, 2, 3)
    for valid in (1, 0.7)
]

In [12]:
# Test Set 3: Synthetic
test_set_size = 100

syn = Synthetic(gov)

# For each number of location parts, create one undistorted (0.) and one fully distorted (1.) set.
syn_test_sets = []
for num_parts in (1, 2, 3):
    for distortion_factor in (0., 1.):
        label = f"syn test set with loc_count={num_parts} and distortion={int(distortion_factor)}."
        samples = syn.create_synthetic_test_set(
            size=test_set_size,
            num_parts=num_parts,
            distortion_factor=distortion_factor,
        )
        syn_test_sets.append((label, samples))

In [13]:
# Test Set 4: GovTestData
assert data_root.joinpath(FILENAME_GOV_TEST_SET).exists()

gtd = GovTestData(gov)
# A single test set, kept in the same (name, data) list shape as the other suites.
gtd_test_sets = [("gov web test set", gtd.get_test_set())]

### Run the tests

In [15]:
# Metrics of the baseline suite, filled in by the loops below.
result_row = list()

Test Set 1

In [16]:
# Baseline run on the VL samples. These have no ground truth, so the only metric
# recorded is the share of unique locations with at least one possible match.
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    # Label fixed from the garbled "Rsnunning" so it matches the captured output and the other cells.
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    # Which matching strategy produced the anchor for each location.
    print(Counter(v["anchor_method"] for v in m.results.values()))

    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches / test_set.location.nunique() * 100, 4)}%).")
    print()

    result_row.append(total_matches / test_set.location.nunique())

Running vl test set with loc_count=1


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.10it/s]


Counter({'KREISORSTADT | Cost 3': 43, 'gov complete': 22, 'KREISORSTADT | Cost 2': 14, 'ALL GOV | Cost 3': 12, 'No anchor at all': 6, 'Phonetic': 2, 'KREISORSTADT | Cost 1': 1})
Total matches: 94 (94.0%).

Running vl test set with loc_count=2


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 208.31it/s]


Counter({'gov partial': 51, 'gov complete': 36, 'KREISORSTADT | Cost 2': 8, 'KREISORSTADT | Cost 3': 5})
Total matches: 80 (80.0%).

Running vl test set with loc_count=3


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 86.84it/s]


Counter({'gov partial': 77, 'gov complete': 20, 'KREISORSTADT | Cost 3': 1, 'KREISORSTADT | Cost 2': 1, 'KREISORSTADT | Cost 1': 1})
Total matches: 68 (68.0%).

Running vl test set containing '.'


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 19.76it/s]

Counter({'gov partial': 61, 'KREISORSTADT | Cost 3': 23, 'KREISORSTADT | Cost 2': 9, 'Phonetic': 3, 'ALL GOV | Cost 3': 2, 'No anchor at all': 2})
Total matches: 96 (96.0%).






Test Set 2

# Baseline run on the gov database samples: match share and accuracy per set.
# NOTE(review): this cell has no execution prompt and the final column names are built
# from the vl, syn and gtd sets only; executing it adds values to result_row that have
# no matching column.
for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

Test Set 3

In [17]:
# Baseline run on the synthetic samples: anchor-method breakdown, match share and accuracy.
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    # Order matters: the column names are generated as "<name> total matches", "<name> accuracy".
    result_row.extend([match_share, accuracy])

Running syn test set with loc_count=1 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████| 100/100 [00:00<00:00, 107353.57it/s]

Counter({'gov complete': 100})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running syn test set with loc_count=1 and distortion=1.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 229.37it/s]


Counter({'gov complete': 67, 'KREISORSTADT | Cost 2': 14, 'KREISORSTADT | Cost 3': 13, 'KREISORSTADT | Cost 1': 3, 'Phonetic': 3})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.82

Running syn test set with loc_count=2 and distortion=0.


Processing locations: 100%|████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 33327.80it/s]

Counter({'gov complete': 99, 'gov partial': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 1.0






Running syn test set with loc_count=2 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.32it/s]


Counter({'gov partial': 44, 'gov complete': 43, 'KREISORSTADT | Cost 2': 6, 'KREISORSTADT | Cost 1': 4, 'KREISORSTADT | Cost 3': 2, 'ALL GOV | Cost 3': 1})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.87

Running syn test set with loc_count=3 and distortion=0.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 166.66it/s]


Counter({'gov complete': 90, 'gov partial': 10})
Total matches: 99 (99.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.98

Running syn test set with loc_count=3 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.11it/s]

Counter({'gov partial': 57, 'gov complete': 41, 'KREISORSTADT | Cost 1': 1, 'KREISORSTADT | Cost 2': 1})
Total matches: 96 (96.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.89






Test Set 4

In [18]:
# Baseline run on the manually collected gov web test set.
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [01:01<00:00, 92.13it/s]


Counter({'gov partial': 3546, 'KREISORSTADT | Cost 3': 841, 'gov complete': 602, 'KREISORSTADT | Cost 2': 521, 'KREISORSTADT | Cost 1': 28, 'Phonetic': 19, 'ALL GOV | Cost 3': 11, 'No anchor at all': 6, 'ALL GOV | Cost 2': 3})
Total matches: 5065 (90.8194%).
Accuracy (entries where all parts of truth are in possible matches): 0.6029



In [31]:
# Diagnostic re-run of Test Set 4 with whatever matcher_params are currently set.
# The metrics are printed only and deliberately NOT appended to result_row: the
# identical cell directly above already recorded them, and a second pair of values
# would leave the baseline row with more entries than there are column names, so
# building the results DataFrame would fail.
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    print(Counter(v["anchor_method"] for v in m.results.values()))

    total_matches = len([match for match in m.results.values() if match.get("possible_matches")])
    print(f"Total matches: {total_matches} ({round(total_matches /  test_set.location.nunique() * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [01:50<00:00, 51.11it/s]


Counter({'gov partial': 3546, 'gov complete': 602, 'Phonetic': 512, 'KREISORSTADT | Cost 2': 435, 'KREISORSTADT | Cost 3': 303, 'KREISORSTADT | Cost 1': 59, 'ALL GOV | Cost 1': 48, 'ALL GOV | Cost 2': 40, 'ALL GOV | Cost 3': 26, 'No anchor at all': 6})
Total matches: 5065 (90.8194%).
Accuracy (entries where all parts of truth are in possible matches): 0.6603



In [14]:
# Store the finished baseline metrics as the first row of the overall results.
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters, no substitution

In [15]:
# Fresh row for the suite that uses corrections + character preprocessing.
result_row = list()

In [16]:
# Apply the VL preprocessing (corrections first, then character replacement) to the
# queries and, where a ground truth column exists, to the truth as well.
# NOTE(review): the test sets are overwritten in place, so re-running this cell applies
# the steps a second time.
vl_steps = (Preprocessing.replace_corrections_vl, Preprocessing.replace_characters_vl)
for _, test_set in vl_test_sets + syn_test_sets + gtd_test_sets:
    for step in vl_steps:
        test_set.location = step(test_set.location)

    if "truth" in test_set:
        for step in vl_steps:
            test_set.truth = step(test_set.truth)

In [17]:
from collections import defaultdict

In [18]:
# Re-key the gov name index by the character-normalised spelling of every name.
old_names = list(gov.ids_by_name)
new_names = Preprocessing.replace_characters_gov(pd.Series(old_names, dtype=str))

ids_by_pname = defaultdict(set)
for old_name, new_name in zip(old_names, new_names):
    # Names that collapse onto the same normalised spelling pool their ids.
    ids_by_pname[new_name].update(gov.ids_by_name[old_name])
ids_by_pname.default_factory = None  # unknown names raise KeyError instead of silently creating entries
gov.ids_by_name = ids_by_pname

# Invert the new mapping so that every id lists its normalised names.
pnames_by_id = defaultdict(set)
for pname, gov_ids in ids_by_pname.items():
    for gov_id in gov_ids:
        pnames_by_id[gov_id].add(pname)
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [19]:
# VL samples after corrections + character preprocessing; only the match share is recorded.
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")
    print()

    result_row.append(match_share)

Running vl test set with loc_count=1


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.79it/s]


Counter({'KREISORSTADT | Cost 3': 52, 'gov complete': 27, 'ALL GOV | Cost 3': 7, 'KREISORSTADT | Cost 2': 7, 'Phonetic': 6, 'ALL GOV | Cost 2': 1})
Total matches: 100 (100.0%).

Running vl test set with loc_count=2


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 107.85it/s]


Counter({'gov partial': 54, 'gov complete': 29, 'KREISORSTADT | Cost 3': 9, 'KREISORSTADT | Cost 2': 7, 'KREISORSTADT | Cost 1': 1})
Total matches: 86 (86.0%).

Running vl test set with loc_count=3


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 99.21it/s]


Counter({'gov partial': 75, 'gov complete': 17, 'KREISORSTADT | Cost 3': 4, 'KREISORSTADT | Cost 2': 4})
Total matches: 74 (74.0%).

Running vl test set containing '.'


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 37.14it/s]

Counter({'gov partial': 61, 'KREISORSTADT | Cost 3': 24, 'KREISORSTADT | Cost 2': 11, 'Phonetic': 3, 'ALL GOV | Cost 3': 1})
Total matches: 99 (99.0%).






Test Set 2

# Gov database samples: match share and accuracy per set.
# NOTE(review): this cell has no execution prompt and the final column names are built
# from the vl, syn and gtd sets only; executing it adds values to result_row that have
# no matching column.
for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

Test Set 3

In [20]:
# Synthetic samples after corrections + character preprocessing.
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    # Order matters: the column names are generated as "<name> total matches", "<name> accuracy".
    result_row.extend([match_share, accuracy])

Running syn test set with loc_count=1 and distortion=0.


Processing locations: 100%|█████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4758.14it/s]

Counter({'gov complete': 99, 'KREISORSTADT | Cost 3': 1})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.99






Running syn test set with loc_count=1 and distortion=1.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 301.97it/s]


Counter({'gov complete': 70, 'KREISORSTADT | Cost 3': 12, 'KREISORSTADT | Cost 2': 10, 'Phonetic': 5, 'KREISORSTADT | Cost 1': 3})
Total matches: 100 (100.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.85

Running syn test set with loc_count=2 and distortion=0.


Processing locations: 100%|██████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 109.38it/s]


Counter({'gov complete': 89, 'gov partial': 11})
Total matches: 98 (98.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.89

Running syn test set with loc_count=2 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 16.43it/s]


Counter({'gov complete': 52, 'gov partial': 38, 'KREISORSTADT | Cost 2': 6, 'KREISORSTADT | Cost 3': 3, 'KREISORSTADT | Cost 1': 1})
Total matches: 98 (98.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.82

Running syn test set with loc_count=3 and distortion=0.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 35.16it/s]


Counter({'gov complete': 83, 'gov partial': 17})
Total matches: 96 (96.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.85

Running syn test set with loc_count=3 and distortion=1.


Processing locations: 100%|███████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.17it/s]

Counter({'gov partial': 56, 'gov complete': 40, 'KREISORSTADT | Cost 1': 2, 'KREISORSTADT | Cost 3': 1, 'KREISORSTADT | Cost 2': 1})
Total matches: 96 (96.0%).
Accuracy (entries where all parts of truth are in possible matches): 0.8






Test Set 4

In [21]:
# Gov web test set after corrections + character preprocessing.
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

Running gov web test set


Processing locations: 100%|█████████████████████████████████████████████████████████| 5641/5641 [00:57<00:00, 98.22it/s]


Counter({'gov partial': 3545, 'KREISORSTADT | Cost 3': 834, 'gov complete': 609, 'KREISORSTADT | Cost 2': 522, 'KREISORSTADT | Cost 1': 29, 'Phonetic': 18, 'ALL GOV | Cost 3': 10, 'No anchor at all': 5, 'ALL GOV | Cost 2': 4})
Total matches: 5063 (90.7999%).
Accuracy (entries where all parts of truth are in possible matches): 0.6049



In [22]:
# Store the metrics of the corrections + characters suite as the next row of the overall results.
final_results.append(result_row)

## Test suite with VL + Gov Preprocessing - replace corrections and characters + substitution

In [23]:
# Fresh row for the suite that additionally applies word substitution.
result_row = list()

In [24]:
# Apply the word substitutions (partial words, then deletions, then full words) to the
# queries and, where a ground truth column exists, to the truth as well.
# NOTE(review): the test sets are overwritten in place, so re-running this cell applies
# the steps a second time.
substitution_steps = (
    Preprocessing.substitute_partial_words,
    Preprocessing.substitute_delete_words,
    Preprocessing.substitute_full_words,
)
for _, test_set in vl_test_sets + syn_test_sets + gtd_test_sets:
    for substitute in substitution_steps:
        test_set.location = substitute(test_set.location, data_root)

    if "truth" in test_set:
        for substitute in substitution_steps:
            test_set.truth = substitute(test_set.truth, data_root)

In [25]:
# Re-key the gov name index by the substituted spelling of every name
# (partial words, then deletions, then full words).
# NOTE(review): the VL run after this cell aborted with KeyError: 'st. egidien'.
# Presumably another index inside Gov/Matcher still holds the un-substituted names while
# ids_by_name (default_factory=None) no longer knows them — TODO confirm and rebuild that index too.
old_names = list(gov.ids_by_name)
new_names = old_names
for substitute in (
    Preprocessing.substitute_partial_words,
    Preprocessing.substitute_delete_words,
    Preprocessing.substitute_full_words,
):
    new_names = substitute(pd.Series(new_names), data_root)

ids_by_pname = defaultdict(set)
for old_name, new_name in zip(old_names, new_names):
    # Names that collapse onto the same substituted spelling pool their ids.
    ids_by_pname[new_name].update(gov.ids_by_name[old_name])
ids_by_pname.default_factory = None  # unknown names raise KeyError instead of silently creating entries
gov.ids_by_name = ids_by_pname

# Invert the new mapping so that every id lists its substituted names.
pnames_by_id = defaultdict(set)
for pname, gov_ids in ids_by_pname.items():
    for gov_id in gov_ids:
        pnames_by_id[gov_id].add(pname)
pnames_by_id.default_factory = None
gov.names_by_id = pnames_by_id

Test Set 1

In [26]:
# VL samples after corrections + characters + substitution; only the match share is recorded.
for name, test_set in vl_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")
    print()

    result_row.append(match_share)

Running vl test set with loc_count=1


Processing locations:   8%|████▉                                                        | 8/100 [00:01<00:15,  5.90it/s]


KeyError: 'st. egidien'

In [33]:
# Debugging aid: show the first ten locations of the first VL test set (loc_count=1).
vl_test_sets[0][1].location.head(10)

684840              mölln waren
653560     markranstädt leipzig
783104               oberstelen
307579                gohlefanz
342455         groß bodensleben
884460             reuden calau
38743                  arnsdorf
161495             colmar i. m.
911417       rott b. reisdingen
1118203         warendorf i. w.
Name: location, dtype: object

In [37]:
# Debugging aid: run one location from the listing above through the most recently created matcher `m`.
m.get_match_for_locations(["rott b. reisdingen"])

Processing locations: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]


Test Set 2

# Gov database samples: match share and accuracy per set.
# NOTE(review): this cell has no execution prompt and the final column names are built
# from the vl, syn and gtd sets only; executing it adds values to result_row that have
# no matching column.
for name, test_set in gov_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

Test Set 3

In [None]:
# Synthetic samples after corrections + characters + substitution.
for name, test_set in syn_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    # Order matters: the column names are generated as "<name> total matches", "<name> accuracy".
    result_row.extend([match_share, accuracy])

Test Set 4

In [None]:
# Gov web test set after corrections + characters + substitution.
for name, test_set in gtd_test_sets:
    m = Matcher(gov, **matcher_params)
    print("Running", name)
    m.get_match_for_locations(test_set.location)
    anchor_methods = Counter(v["anchor_method"] for v in m.results.values())
    print(anchor_methods)

    total_matches = sum(1 for match in m.results.values() if match.get("possible_matches"))
    match_share = total_matches / test_set.location.nunique()
    print(f"Total matches: {total_matches} ({round(match_share * 100, 4)}%).")

    accuracy = get_accuracy(m.results, test_set)
    print("Accuracy (entries where all parts of truth are in possible matches):", round(accuracy, 4))
    print()

    result_row.extend([match_share, accuracy])

In [None]:
# Store the metrics of the corrections + characters + substitution suite as the last row.
final_results.append(result_row)

## Evaluation (Auswertung)

In [None]:
# Column names must follow the order in which each suite filled its result row:
# VL sets contribute one value (no ground truth, hence no accuracy); synthetic and
# gov web sets contribute two values each.
names = [
    name + ' ' + metric
    for name, test_set in vl_test_sets + syn_test_sets + gtd_test_sets
    for metric in ('total matches', 'accuracy')
    if not (name.startswith("vl") and metric == 'accuracy')
]

# From here on final_results is a DataFrame with one row per suite, no longer a list of rows.
final_results = pd.DataFrame(final_results, columns=names)

In [None]:
# One label per suite, in the order the rows were appended.
suite_labels = [
    "Baseline",
    "Preprocessing VL + Gov (corrections + characters)",
    "Preprocessing VL + Gov (corrections + characters + substitution)",
]
final_results["test set"] = suite_labels

In [None]:
# Use the suite label as the row index so the exported table is keyed by suite name.
final_results = final_results.set_index("test set")

In [None]:
from datetime import datetime

In [None]:
# Write the table to a timestamped CSV in the working directory so earlier runs are not overwritten.
timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
final_results.to_csv(f"{timestamp}_final_results.csv")