Let's try the `recordlinkage` package for Python!

In [None]:
import pandas as pd
import recordlinkage as rl
import csv
import utils

First, read in an appropriately small subset.

# Manage training data

In [None]:
# Load the training frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
df_train = pd.read_pickle("tmp.pickled")

In [None]:
# Years for which links are collected; buildLinksDict ignores rows from any other year.
relevant = {1845, 1850, 1860, 1880, 1885}

def buildLinksDict(fd):
    """Build a per-year lookup of link IDs from a pipe-delimited links file.

    Parameters
    ----------
    fd : iterable of str
        Open text file (or any iterable of lines). The first line is treated
        as a header and skipped. For every following row, column 0 is the
        link ID, column 1 is passed to ``utils.extractYear``, column 2 is the
        kip and column 3 the løb.

    Returns
    -------
    dict
        ``{year: {(kip, løb): linkID}}`` with one (possibly empty) inner dict
        for every year in ``relevant``. ``kip`` is kept as a string, ``løb``
        is converted to ``int``.

    Raises
    ------
    ValueError
        If the løb column of a kept row cannot be converted to ``int``.
    """
    reader = csv.reader(fd, delimiter="|")
    next(reader, None)  # skip header; the default avoids StopIteration on an empty file
    links = {year: {} for year in relevant}  # year -> (kip, løb) -> linkID
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; indexing them would raise IndexError.
            continue
        year = utils.extractYear(row[1])
        # extractYear may return None; None is never in `relevant`, so this
        # single membership test rejects both missing and irrelevant years.
        if year not in relevant:
            continue
        links[year][(row[2], int(row[3]))] = row[0]
    return links

def _load_links(filename):
    """Parse one file from the links directory with buildLinksDict."""
    with (utils.datadir / "links" / filename).open("r") as fd:
        return buildLinksDict(fd)


dm = _load_links("matches.csv")
dn = _load_links("notmatches.csv")

In [None]:
# Start every row with an empty LinkID; the empty string serves as the "no link" marker.
df_train["LinkID"] = ""

This row-by-row lookup is clumsy. Indexing the frame by the three key columns (`FT`, `Kipnr`, `Løbenr`) would be much neater, but I don't know how to get "back out" to the original index again afterwards.

In [None]:
# Look up each row's link ID via its (Kipnr, Løbenr) key in the dict for the
# row's year, then write all hits into the LinkID column in one assignment.
indices = []
linkIDs = []
for t in df_train.itertuples():
    # dm only has entries for the years in `relevant`; a row from any other
    # year is treated as "no link" instead of raising KeyError.
    s = dm.get(t.FT, {})
    key = (t.Kipnr, t.Løbenr)
    if key in s:
        indices.append(t.Index)
        linkIDs.append(s[key])
df_train.loc[indices, "LinkID"] = linkIDs

Continue

In [None]:
# Keep only the rows whose LinkID is not the empty string.
df_train = df_train[df_train.LinkID != ""]

In [None]:
# One group per distinct LinkID value.
groups = df_train.groupby("LinkID")

In [None]:
# Row labels of every LinkID group, one entry per group.
l = [members.index for _, members in groups]

In [None]:
# Turn each group's row labels into one (id1, id2) entry of a MultiIndex.
# NOTE(review): presumably every LinkID group holds exactly two rows — confirm,
# since groups of any other size do not fit the two level names.
mindex_train = pd.MultiIndex.from_tuples(l, names=["id1", "id2"])

In [None]:
# Sorted-neighbourhood indexer on the Fødeår column with a window of 5.
indexer = rl.SortedNeighbourhoodIndex(on="Fødeår", window=5)

In [None]:
# Split the training rows by their FT value.
by_year = df_train.groupby("FT")

In [None]:
# Print the number of rows in every FT group.
for ft_value, rows in by_year:
    print(ft_value, len(rows))

In [None]:
# Rows whose FT value is 1845.
dfA = by_year.get_group(1845)

In [None]:
# Rows whose FT value is 1850.
dfB = by_year.get_group(1850)

In [None]:
# Candidate record pairs between dfA and dfB, as proposed by the indexer.
pairs = indexer.index(dfA, dfB)

In [None]:
# Number of candidate pairs.
len(pairs)

In [None]:
# Comparison features for the training pairs.
# Navn is compared with Jaro-Winkler so that these features are computed the
# same way as the features built for prediction later in the notebook; a
# classifier trained on one string metric must not be applied to another.
comparer = rl.Compare()
comparer.string("Navn", "Navn", method="jarowinkler")
comparer.string("Fødested", "Fødested")
comparer.string("Erhverv", "Erhverv")
comparer.string("Position", "Position")
comparer.string("Civilstand", "Civilstand")
comparer.numeric("Fødeår", "Fødeår", scale=1)

In [None]:
# Compute the comparison features for every candidate pair of dfA and dfB.
features_train = comparer.compute(pairs, dfA, dfB)

# Train classifier

In [None]:
# Pairs whose feature values sum to more than 4.5.
# Uses features_train: on a fresh kernel `features` does not exist yet at this
# point (it is only created in the prediction section further down).
matches = features_train[features_train.sum(axis=1) > 4.5]

In [None]:
# Print both records of every pair in `matches`; each index entry is a
# (label in dfA, label in dfB) tuple.
for a, b in matches.index:
    print(dfA.loc[a])
    print(dfB.loc[b])

In [None]:
# Logistic-regression classifier. It is trained here because no other cell
# trains it before lr.predict is called; the known matches are the candidate
# pairs that also occur in mindex_train.
lr = rl.LogisticRegressionClassifier()
lr.learn(features_train, features_train.index.intersection(mindex_train))

In [None]:
# Support-vector-machine classifier (trained in a later cell).
svm = rl.SVMClassifier()

In [None]:
# Naive Bayes classifier (trained in a later cell).
nb = rl.NaiveBayesClassifier()

In [None]:
# Train the SVM on the training features; the known matches are the candidate
# pairs that also occur in mindex_train.
# Uses features_train / mindex_train (`features` and `mi` are not defined at
# this point on a fresh kernel) and Index.intersection, because `&` between
# two Index objects is no longer a set operation in current pandas.
svm.learn(features_train, features_train.index.intersection(mindex_train))

In [None]:
# Train Naive Bayes on the training features; the known matches are the
# candidate pairs that also occur in mindex_train.
# Index.intersection replaces `&`, which is no longer a set operation between
# Index objects in current pandas.
nb.learn(features_train, features_train.index.intersection(mindex_train))

In [None]:
# Size of the logistic-regression prediction on the training features.
len(lr.predict(features_train))

In [None]:
# Size of the SVM prediction on the training features.
len(svm.predict(features_train))

In [None]:
# Number of known link pairs, for comparison with the prediction sizes.
# (`mi` is not defined anywhere in the notebook; the link index is mindex_train.)
len(mindex_train)

In [None]:
# Size of the Naive Bayes prediction on the training features.
len(nb.predict(features_train))

In [None]:
# Output of the Naive Bayes `prob` method for the training features
# (presumably match probabilities — confirm against the recordlinkage docs).
nb.prob(features_train)

In [None]:
# Inspect two specific training rows (index labels are hard-coded).
df_train.loc[[1163538, 1953896]]

# Try the classifier!

In [None]:
# Load the frame again from the same local pickle file, this time for prediction.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
df = pd.read_pickle("tmp.pickled")

In [None]:
# Restrict to rows whose Navn starts with "Da".
# NOTE: this rebinds df, so re-running the cell filters the already filtered frame.
df = df[df.Navn.str.startswith("Da")]

In [None]:
# Keep rows whose Køn value equals False.
# NOTE(review): which sex False encodes is not visible here — confirm.
df = df[df.Køn==False]

In [None]:
# Split the filtered rows by their FT value (rebinds by_year from the training section).
by_year = df.groupby("FT")

In [None]:
# Print the number of rows in every FT group.
for ft_value, rows in by_year:
    print(ft_value, len(rows))

In [None]:
# Rows whose FT value is 1845 (rebinds dfA from the training section).
dfA = by_year.get_group(1845)

In [None]:
# Rows whose FT value is 1850 (rebinds dfB from the training section).
dfB = by_year.get_group(1850)

Do some indexing!

In [None]:
# NOTE(review): this indexer is replaced by the sorted-neighbourhood indexer in
# the next cell before it is ever used — keep only the one that is intended.
indexer = rl.FullIndex()

In [None]:
# Sorted-neighbourhood indexer on the Fødeår column with a window of 3.
indexer = rl.SortedNeighbourhoodIndex(on="Fødeår", window=3)

In [None]:
# Candidate record pairs between dfA and dfB, as proposed by the indexer.
pairs = indexer.index(dfA, dfB)

In [None]:
# Number of candidate pairs.
len(pairs)

In [None]:
# Comparison features for prediction: Jaro-Winkler on Navn, the default string
# comparison on the remaining text columns, and a numeric comparison of Fødeår.
comparer = rl.Compare()
comparer.string("Navn", "Navn", method="jarowinkler")
for column in ("Fødested", "Erhverv", "Position", "Civilstand"):
    comparer.string(column, column)
comparer.numeric("Fødeår", "Fødeår", scale=1)

In [None]:
# Compute the comparison features for every candidate pair of dfA and dfB.
features = comparer.compute(pairs, dfA, dfB)

In [None]:
# Logistic-regression prediction for the new candidate pairs.
lr_res = lr.predict(features)

In [None]:
# SVM prediction for the new candidate pairs.
svm_res = svm.predict(features)

In [None]:
# First entry of the SVM prediction.
svm_res[0]

In [None]:
import pickle

In [None]:
# Serialise the trained SVM classifier to test.pickle.
with open("test.pickle", "wb") as fd:
    fd.write(pickle.dumps(svm))

In [None]:
# Size of the SVM prediction.
len(svm_res)

In [None]:
# Print the two records of every predicted pair, separated by a blank line.
for id_a, id_b in svm_res:
    print(df.loc[[id_a, id_b]])
    print()

In [None]:
# Inspect two specific rows (index labels are hard-coded).
df.loc[[439989, 1803687]]

# Normalization stuff

In [None]:
def _first_word(name):
    """Return the first whitespace-separated word of ``name``."""
    return name.split()[0]


# First name = first word of the full name.
df["Fornavn"] = df.Navn.map(_first_word)

In [None]:
import recordlinkage.standardise as rlstd

In [None]:
# Rows whose Navn starts with "kar" or "car" (case-insensitive).
# .copy() gives subset its own data, so the column assignments that follow
# modify subset itself rather than a slice of df (avoids SettingWithCopyWarning).
# NOTE(review): when the cells run top to bottom, df has already been narrowed
# to names starting with "Da", so this filter presumably selects nothing —
# confirm which frame was meant.
subset = df[df.Navn.str.match(r"^(kar|car)", case=False)].copy()

In [None]:
# Apply the utils.trans translation table to the names, then keep the first word.
translated_names = subset.Navn.str.translate(utils.trans)
subset["Fornavn"] = translated_names.map(lambda name: name.split()[0])

In [None]:
# Phonetic encoding of the first names using the "soundex" method.
subset["soundex"] = rlstd.phonetic(subset.Fornavn, method="soundex")

In [None]:
# Phonetic encoding of the first names using the "nysiis" method.
subset["nysiis"] = rlstd.phonetic(subset.Fornavn, method="nysiis")

In [None]:
# Phonetic encoding of the first names using the "metaphone" method.
subset["metaphone"] = rlstd.phonetic(subset.Fornavn, method="metaphone")

In [None]:
# Distinct (first name, soundex, nysiis, metaphone) combinations in subset.
n = {
    (row.Fornavn, row.soundex, row.nysiis, row.metaphone)
    for row in subset.itertuples()
}

In [None]:
# Show the first names next to their soundex codes.
subset.loc[:, ("Fornavn", "soundex")]

In [None]:
# NOTE(review): the same nysiis assignment already appears in an earlier cell;
# this cell recomputes the identical column and can probably be removed.
subset["nysiis"] = rlstd.phonetic(subset.Fornavn, method="nysiis")

In [None]:
# FIXME(review): Series.apply is called without the function to apply, so this
# cell raises TypeError as written. The intended function for the "gb" column
# is not visible in the notebook — supply it or delete the cell.
subset["gb"] = subset.Navn.apply()

# Hmm

In [None]:
# Re-index the frame by the (FT, Kipnr, Løbenr) key columns.
df2 = df.set_index(["FT", "Kipnr", "Løbenr"])

In [None]:
# Flag column, initially False for every row.
df2["Include"] = False

In [None]:
# Flag every row whose (FT, Kipnr, Løbenr) key occurs in either links dict.
# All keys are collected first and applied with a single boolean mask, which
# replaces two copy-pasted loops of per-key .loc assignments.
# NOTE: keys that are absent from df2 are simply ignored here; a per-key .loc
# assignment does not skip them (depending on the pandas version it raises or
# appends a new, otherwise empty row).
link_keys = [
    (year, kip, løb)
    for links in (dm, dn)
    for year, data in links.items()
    for kip, løb in data
]
if link_keys:
    df2.loc[df2.index.isin(link_keys), "Include"] = True