In [None]:
import concurrent.futures
import pathlib
import pickle

import matplotlib.pyplot as plt
import pandas as pd

## Uniqueify by taking max both ways

### Experiment

#### Load and see how many candidates there are

In [None]:
# Candidate matches for the experiment; the file is pipe-separated.
matches = pd.read_csv("m-TM.csv", sep="|")

In [None]:
len(matches)

In [None]:
# For computing an upper bound on the number of extracted matches: count the
# distinct (Kipnr, Løbenr) records per a_FT value and per b_FT value.
# Both groupbys keep their keys in the index (no as_index=False), so .size()
# is a MultiIndex Series and level 0 is the FT value on either side.
g1 = matches.groupby(by=["a_FT", "a_Kipnr", "a_Løbenr"])
g2 = matches.groupby(by=["b_FT", "b_Kipnr", "b_Løbenr"])
# Series.iteritems() was removed in pandas 2.0; .items() yields the same pairs.
l1 = list(g1.size().groupby(level=0).size().items())
l2 = list(g2.size().groupby(level=0).size().items())
print(l1)
print(l2)

In [None]:
# For every a-record keep only the row(s) whose probability equals that
# record's maximum; rows tied at the maximum are all kept.
g1 = matches.groupby(by=["a_FT", "a_Kipnr", "a_Løbenr"])
# Pass "max" by name: handing the builtin `max` to transform is deprecated
# in recent pandas releases.
idx = g1["p"].transform("max") == matches["p"]
matches = matches[idx]
print(len(matches))

In [None]:
# Same filter in the other direction: for every b-record keep only the
# row(s) whose probability equals that record's maximum (ties are kept).
g2 = matches.groupby(by=["b_FT", "b_Kipnr", "b_Løbenr"])
# Pass "max" by name: handing the builtin `max` to transform is deprecated
# in recent pandas releases.
idx = g2["p"].transform("max") == matches["p"]
matches = matches[idx]
print(len(matches))

In [None]:
len(matches)

In [None]:
# Pair the i-th a-side count with the i-th b-side count; the smaller of the
# two bounds the number of one-to-one matches for that pair. Print every
# bound, then their total.
bounds = [
    (a_year, b_year, min(a_count, b_count))
    for (a_year, a_count), (b_year, b_count) in zip(l1, l2)
]
for a_year, b_year, bound in bounds:
    print(a_year, b_year, bound)
print(sum(bound for _, _, bound in bounds))

In [None]:
# So how many are left? Recount the distinct records per FT value on each
# side, now on the filtered `matches`.
# Both groupbys keep their keys in the index (no as_index=False), so .size()
# is a MultiIndex Series and level 0 is the FT value on either side.
g1 = matches.groupby(by=["a_FT", "a_Kipnr", "a_Løbenr"])
g2 = matches.groupby(by=["b_FT", "b_Kipnr", "b_Løbenr"])
# Series.iteritems() was removed in pandas 2.0; .items() yields the same pairs.
l1 = list(g1.size().groupby(level=0).size().items())
l2 = list(g2.size().groupby(level=0).size().items())

In [None]:
# Pair the i-th a-side count with the i-th b-side count and print the
# smaller of the two for every pair, followed by the total over all pairs.
bounds = [
    (a_year, b_year, min(a_count, b_count))
    for (a_year, a_count), (b_year, b_count) in zip(l1, l2)
]
for a_year, b_year, bound in bounds:
    print(a_year, b_year, bound)
print(sum(bound for _, _, bound in bounds))

#### There are probably still duplicates
Duplicates must remain if `len(matches)` above is greater than the sum printed in the cell directly above.

In [None]:
indexed = matches.set_index(["a_FT", "a_Kipnr", "a_Løbenr"])

In [None]:
# a-keys that occur more than once in the index.
# Index.get_duplicates() was removed in pandas 1.0; this is the replacement
# its deprecation notice recommended.
d = indexed.index[indexed.index.duplicated()].unique()

In [None]:
indexed.loc[d]

In [None]:
len(matches)

In [None]:
# Resolve the remaining ties: keep the first row per a-record, then the first
# row per b-record. Reassigning (instead of inplace=True) avoids mutating a
# frame that was produced by boolean filtering, which can trigger pandas'
# SettingWithCopyWarning.
matches = (
    matches
    .drop_duplicates(subset=["a_FT", "a_Løbenr", "a_Kipnr"])
    .drop_duplicates(subset=["b_FT", "b_Løbenr", "b_Kipnr"])
)

In [None]:
len(matches)

In [None]:
matches

### Apply

In [None]:
def uniqueify_max(matches):
    """Reduce candidate matches to at most one row per a-record and per b-record.

    Parameters
    ----------
    matches : pandas.DataFrame
        Candidate matches with the key columns ``a_FT``, ``a_Kipnr``,
        ``a_Løbenr``, ``b_FT``, ``b_Kipnr``, ``b_Løbenr`` and a probability
        column ``p``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels kept), in which every
        a-key and every b-key occurs at most once.
    """
    a_key = ["a_FT", "a_Kipnr", "a_Løbenr"]
    b_key = ["b_FT", "b_Kipnr", "b_Løbenr"]

    # take max-probability b for each a ("max" by name: passing the builtin
    # `max` to transform is deprecated in recent pandas releases)
    best_for_a = matches.groupby(by=a_key)["p"].transform("max")
    matches = matches[best_for_a == matches["p"]]

    # and then max-probability a for each b
    best_for_b = matches.groupby(by=b_key)["p"].transform("max")
    matches = matches[best_for_b == matches["p"]]

    # Rows tied at the maximum survive both filters, so drop any duplicate a
    # or b, keeping the first occurrence. Non-inplace calls avoid mutating a
    # filtered slice (SettingWithCopyWarning).
    matches = matches.drop_duplicates(subset=a_key)
    return matches.drop_duplicates(subset=b_key)

In [None]:
def process(fn, threshold=0.9):
    """Load one pipe-separated match file and reduce it to unique matches.

    Parameters
    ----------
    fn : str or path-like
        Path of the CSV file to read (converted with ``str`` before reading).
    threshold : float, optional
        Rows are kept only if their ``p`` value is strictly greater than
        this. Defaults to 0.9, the cut-off that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows above the threshold after ``uniqueify_max`` has been applied.
    """
    matches = pd.read_csv(str(fn), delimiter="|")
    # Filter before uniqueifying so low-probability candidates never compete.
    matches = matches[matches.p > threshold]
    return uniqueify_max(matches)

In [None]:
# Sort the input paths: glob() yields files in no guaranteed order, while the
# row order of the concatenated result decides which row a later
# keep-first drop_duplicates retains. Executor.map preserves input order.
csv_paths = sorted(pathlib.Path("nn-match").glob("*.csv"))
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as tpe:
    # Materialise inside the `with` so a failing worker raises in this cell.
    res = list(tpe.map(process, csv_paths))

In [None]:
res = list(res)

In [None]:
len(res)

In [None]:
matches = pd.concat(res)

In [None]:
del(res)

In [None]:
# Total number of lines over all input files, not counting one line per file
# (the CSV header).
s = 0
for csv_path in pathlib.Path("nn-match").glob("*.csv"):
    with csv_path.open("r") as handle:
        line_count = sum(1 for _ in handle)
    s += line_count - 1

In [None]:
s

In [None]:
len(matches)

In [None]:
g = matches.groupby("a_FT")

In [None]:
# Print the number of matches per a_FT value and accumulate the total.
s = 0
# Series.iteritems() was removed in pandas 2.0; .items() yields the same pairs.
for a_year, size in g.size().items():
    print(a_year, size)
    s += size
s

In [None]:
# Load the record table; it is grouped by its "FT" column in the next cell.
# NOTE(review): presumably written by a script in ../scripts — confirm.
# Unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle("../scripts/dataframe.pickled")

In [None]:
g2 = df.groupby("FT")

In [None]:
# (FT value, number of records) pairs.
# Series.iteritems() was removed in pandas 2.0; .items() yields the same pairs.
l = list(g2.size().items())

In [None]:
import math

In [None]:
# One "&"-separated (LaTeX table style) row per pair of consecutive entries
# of `l`: both FT values, our number of matches, the upper limit (the smaller
# of the two record counts) and our share of that limit in percent.
# Series.iteritems() was removed in pandas 2.0; .items() yields the same pairs.
for (a_year, a_size), (b_year, b_size), (our_year, our_size) in zip(l, l[1:], g.size().items()):
    ul = min(a_size, b_size)
    print(a_year, "&", b_year, "&", our_size, "&", ul, "&", round(our_size/ul*100,2))

In [None]:
hmm = matches.p.describe()

In [None]:
print(hmm.to_latex())

In [None]:
matches.p.plot(kind="hist")

In [None]:
matches.p.hist(bins=100)
#plt.show()

In [None]:
# Draw and save within one cell. With Jupyter's default inline backend a
# figure is closed when the cell that drew it finishes, so a bare
# plt.savefig() in a later cell writes an empty figure.
fig, ax = plt.subplots()
matches.p.hist(bins=100, ax=ax)
fig.savefig("test.pdf")

In [None]:
plt.style.use("ggplot")

In [None]:
import matplotlib.pyplot as plt

## There are probably still duplicates
Duplicates must remain if `len(matches)` above is greater than the sum of the per-year upper limits (`ul`) printed above.

In [None]:
indexed = matches.set_index(["a_FT", "a_Kipnr", "a_Løbenr"])

In [None]:
# a-keys that occur more than once in the index.
# Index.get_duplicates() was removed in pandas 1.0; this is the replacement
# its deprecation notice recommended.
d = indexed.index[indexed.index.duplicated()].unique()

In [None]:
indexed.loc[d]

In [None]:
len(matches)

In [None]:
# Keep the first row per a-record, then the first row per b-record.
# Reassigning instead of inplace=True keeps the step a plain expression and
# leaves no frame mutated behind the reader's back.
matches = (
    matches
    .drop_duplicates(subset=["a_FT", "a_Løbenr", "a_Kipnr"])
    .drop_duplicates(subset=["b_FT", "b_Løbenr", "b_Kipnr"])
)

In [None]:
len(matches)

In [None]:
matches

# Now only sensible matches left
So let's look at these

## Inspect some matches

In [None]:
# Load the record table used for the lookup built in the next cell.
# NOTE(review): presumably indexed by (FT, Kipnr, Løbenr), since the lookup
# is later queried with such triples — confirm.
# Unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle("../scripts/indexed.pickled")

In [None]:
# Map every index value of `df` to its full row tuple.
lookup = {row.Index: row for row in df.itertuples()}

In [None]:
del(df)

In [None]:
header = "_|Amt|Herred|Sogn|Navn|Køn|Fødested|Fødeår|Civilstand|Position|Erhverv|Fornavn|Kipnr|Løbenr|Group".split("|")

In [None]:
# Lowest probabilities first. Reassigning instead of inplace=True keeps the
# step a plain expression and avoids in-place mutation of the frame.
matches = matches.sort_values(by="p", ascending=True)

In [None]:
ss = matches[abs(matches.p - 0.935037) < 0.0001]

In [None]:
import textwrap

In [None]:
sh = textwrap.shorten

In [None]:
latex = False

In [None]:
# Print the first 500 matches side by side, either as plain text or (when
# `latex` is true) as LaTeX table rows. Each match is shown as two header
# lines followed by one line per compared field.
count = 0
# The unpacking relies on `matches` having exactly these seven columns, in
# this order, after the index.
for _, a_FT, a_Kip, a_Løb, b_FT, b_Kip, b_Løb, prob in matches.itertuples():
    count += 1
    a_key = (a_FT, a_Kip, a_Løb)
    b_key = (b_FT, b_Kip, b_Løb)
    a = lookup[a_key]
    b = lookup[b_key]
    if latex:
        print("\\multicolumn{3}{l}{" + "Match from " + str(a_FT) + " to " + str(b_FT) + "}\\\\")
        print("\\multicolumn{3}{l}{" + "Assigned probability: " + str(prob) + "}\\\\")
    else:
        print("Match from", a_FT, "to", b_FT)
        print("Assigned probability:", prob)
    # Positions 1..10 of the row tuples are labelled with header[1..10];
    # position 0 of an itertuples() row is the index.
    for i in range(1, 11):
        if latex:
            s = "{}&{}&{}\\\\".format(header[i], sh(str(a[i]), 40), sh(str(b[i]), 40))
        else:
            s = "{:10s} {:30s} {:30s}".format(header[i], str(a[i]), str(b[i]))
        if a[i] != b[i]:
            # Highlight differing fields with an ANSI colour escape.
            # NOTE(review): this also wraps LaTeX rows — confirm that is wanted.
            s = '\033[95m' + s + "\033[0m"
        print(s)
    if latex:
        # Escaped backslash: "\m" is an invalid escape sequence and triggers a
        # warning on recent Python versions. The printed text is unchanged.
        print("\\midrule")
    else:
        print()
    if count == 500:
        break

# How good is this?

In [None]:
import utils

In [None]:
# Known links: pipe-separated, with "#" starting a comment, and the FT and
# Løbenr columns on both sides read as integers.
gold_path = utils.datadir / "links" / "matches.csv"
int_columns = ["a_FT", "b_FT", "a_Løbenr", "b_Løbenr"]
gold = pd.read_csv(
    gold_path,
    sep="|",
    comment="#",
    dtype=dict.fromkeys(int_columns, int),
)

In [None]:
# Map fields 1-3 of every gold row to fields 4-6 of the same row
# (field 0 of an itertuples() row is the index).
known = {}
for gold_row in gold.itertuples():
    known[gold_row[1:4]] = gold_row[4:7]

In [None]:
# Inverse of `known`: value -> key. If several keys share a value, the one
# iterated last wins.
known_rev = {target: source for source, target in known.items()}

In [None]:
len(known_rev)

In [None]:
# Start from every value of `known`.
both_sides = set(known.values())

In [None]:
both_sides.update(known)

In [None]:
ss = matches.set_index(["a_FT", "a_Kipnr", "a_Løbenr"])

In [None]:
ss = ss[ss.index.isin(both_sides)]

In [None]:
len(ss)

In [None]:
# Map the index value of every row of `ss` to the row's first three columns.
d = {row.Index: row[1:4] for row in ss.itertuples()}

In [None]:
del(ss)

In [None]:
len(d)

In [None]:
# Inverse of `d`: value -> key. If several keys share a value, the one
# iterated last wins.
rev = {found: source for source, found in d.items()}

In [None]:
len(d)

In [None]:
d = {k: v for (k, v) in d.items() if k in known or v in known_rev}

In [None]:
# Score every found link a -> b in `d` against the known links.
#   recovered   - a is a key of `known` and b is the value stored for it
#   missed      - a is a key of `known` but a different value is stored for it
#   hmm         - a is not a key of `known`, but b is a key of `known_rev`
#   unnecessary - neither of the above
recovered = 0
missed = 0
unnecessary = 0
hmm = 0
for a, b in d.items():
    if a in known:
        should = known[a]
        if b == should:
            recovered += 1
        else:
            missed += 1
    elif b in known_rev:
        hmm += 1
    # NOTE(review): if the preceding cell has filtered `d` down to entries with
    # `k in known or v in known_rev`, this branch can never run and
    # `unnecessary` stays 0. The original remark here, "found a->b, but b was
    # matched with other", appears to describe the `elif` branch above —
    # confirm which counter was intended.
    else:
        unnecessary += 1

In [None]:
len(known)

In [None]:
recovered / len(known)

In [None]:
recovered, missed, unnecessary, hmm

In [None]:
# NOTE(review): neither `no` nor `yes` (used in the following cells) is
# assigned anywhere in the visible notebook, so these cells raise NameError on
# a fresh kernel. Presumably they correspond to counters from the scoring loop
# above — confirm and define them explicitly.
no

In [None]:
yes

In [None]:
yes / len(known)

In [None]:
yes / (yes + no)