In [None]:
import pandas as pd
import tensorflow as tf
import multiprocessing
import editdistance

In [None]:
# One numeric feature column per pairwise comparison score. The keys are the
# same names used for the score columns of the evaluation frame.
(navn, fødested, civilstand, position, erhverv, køn, fødeår) = map(
    tf.feature_column.numeric_column,
    ("r_name", "r_birthp", "r_civil", "r_pos", "r_job", "m_gender", "d_birthy"),
)

In [None]:
# Linear classifier over the seven comparison features.
# NOTE(review): nothing in this notebook trains the model, so a checkpoint is
# presumably expected to exist already under "model" — confirm.
model2 = tf.estimator.LinearClassifier(
    model_dir="model",
    feature_columns=[navn, fødested, civilstand, position, erhverv, køn, fødeår],
)

In [None]:
# Load the source records from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
df = pd.read_pickle("m-TT.pickled")

In [None]:
# Total number of loaded records.
df.shape[0]

In [None]:
import recordlinkage

In [None]:
# First subset to link: the records whose FT value is 1845.
dfA = df.loc[df["FT"] == 1845]

In [None]:
# Number of records in the 1845 subset.
dfA.shape[0]

In [None]:
# Second subset to link: the records whose FT value is 1850.
dfB = df.loc[df["FT"] == 1850]

In [None]:
# Number of records in the 1850 subset.
dfB.shape[0]

In [None]:
# Size of the full cross product, i.e. how many pairs a full index would produce.
dfA.shape[0] * dfB.shape[0]

In [None]:
# Candidate-pair generation strategy.
# Previously a SortedNeighbourhoodIndex was built in one cell and immediately
# overwritten by a FullIndex in the next, so a top-to-bottom run always used
# the full cross product and the first assignment was dead code. The choice is
# now explicit; the default reproduces the run-all behaviour.
USE_SORTED_NEIGHBOURHOOD = False

if USE_SORTED_NEIGHBOURHOOD:
    # Pair only records that are close to each other when sorted on birth year.
    indexer = recordlinkage.indexing.SortedNeighbourhoodIndex(on="Fødeår", window=5)
else:
    # Pair every record of the first frame with every record of the second.
    indexer = recordlinkage.indexing.FullIndex()

In [None]:
# Build the candidate record pairs between the two subsets with the selected
# indexer; the result is later split into chunks and iterated as (a, b) keys.
index = indexer.index(dfA, dfB)

In [None]:
# Number of candidate pairs that will be scored.
len(index)

In [None]:
def ss(a: str, b: str):
    """Compare two strings and return a score between 0 and 1.

    Returns 0 when either value is null, 1.0 when the two values are equal,
    and otherwise the edit distance divided by the length of the longer string.

    NOTE(review): the last case is a normalised *distance* (close to 0 for
    nearly equal strings, 1 for completely different ones), which runs in the
    opposite direction to the 1.0 returned for equal strings. It is left
    unchanged here because the saved model was presumably trained on exactly
    these values — confirm before switching to ``1 - distance / length``.
    """
    either_missing = pd.isnull(a) or pd.isnull(b)
    if either_missing:
        return 0
    if a != b:
        distance = editdistance.eval(a, b)
        longest = max(len(a), len(b))
        return distance / longest
    return 1.0

In [None]:
def score(a, b):
    """Return the seven comparison values for a pair of records.

    The tuple holds the string scores (from ``ss``) for Navn, Fødested,
    Civilstand, Position and Erhverv, followed by 1.0/0.0 for equal/different
    Køn and the absolute difference of Fødeår.
    """
    text_fields = ("Navn", "Fødested", "Civilstand", "Position", "Erhverv")
    text_scores = tuple(
        ss(getattr(a, field), getattr(b, field)) for field in text_fields
    )
    if a.Køn == b.Køn:
        same_gender = 1.0
    else:
        same_gender = 0.0
    birth_year_gap = abs(a.Fødeår - b.Fødeår)
    return text_scores + (same_gender, birth_year_gap)

In [None]:
# Column names for the seven values returned by score(), in the same order.
score_columns = [
    "r_name",
    "r_birthp",
    "r_civil",
    "r_pos",
    "r_job",
    "m_gender",
    "d_birthy",
]

In [None]:
def generate_scores(d):
    """Score every candidate pair in ``d``.

    ``d`` is an iterable of (key_a, key_b) pairs, where each key is looked up
    with ``.loc`` in ``dfA`` and ``dfB`` respectively. Returns a list with one
    tuple per pair: the FT/Kipnr/Løbenr identifiers of both records followed
    by the values from ``score``.
    """

    def _scored_row(key_a, key_b):
        # Resolve both keys to their records, then prepend the identifiers.
        rec_a = dfA.loc[key_a]
        rec_b = dfB.loc[key_b]
        identifiers = (
            rec_a.FT, rec_a.Kipnr, rec_a.Løbenr,
            rec_b.FT, rec_b.Kipnr, rec_b.Løbenr,
        )
        return identifiers + score(rec_a, rec_b)

    return [_scored_row(key_a, key_b) for (key_a, key_b) in d]

In [None]:
import utils

In [None]:
# Score the candidate pairs in parallel, 1000 pairs per task; res ends up as
# one list of scored rows per chunk.
# NOTE(review): generate_scores is defined in the notebook itself, so this
# presumably relies on a fork-based start method — confirm on the target platform.
with multiprocessing.Pool() as pool:
    pair_chunks = utils.chunks(index, 1000)
    res = pool.map(generate_scores, pair_chunks)

In [None]:
import itertools

In [None]:
# Flatten the per-chunk results into one frame: six identifier columns
# followed by the seven score columns.
df_eval = pd.DataFrame(
    [row for chunk in res for row in chunk],
    columns=["a_FT", "a_Kipnr", "a_Løbenr", "b_FT", "b_Kipnr", "b_Løbenr"] + score_columns,
)

In [None]:
# Wrap the evaluation frame as an Estimator input function. shuffle=False keeps
# the rows in df_eval order, which the inspection loop below depends on when it
# looks a prediction up by position.
input_fn = tf.estimator.inputs.pandas_input_fn(x=df_eval, shuffle=False)

In [None]:
# Materialise the predictions into a list so they can be counted and indexed.
# Each element is read below as a dict with "probabilities" and "classes" keys.
prediction = list(model2.predict(input_fn))

In [None]:
# Key the full record table by (FT, Kipnr, Løbenr) and sort it, so that the
# pair inspection below can fetch records with df.loc[...].
# The earlier inplace=True version raised KeyError when the cell was re-run,
# because the key columns had already been moved into the index. Rebinding
# behind a guard keeps the cell idempotent.
record_key = ["FT", "Kipnr", "Løbenr"]
if list(df.index.names) != record_key:
    df = df.set_index(record_key)
df = df.sort_index()

In [None]:
# Number of predictions produced (expected to equal len(df_eval) — verify).
len(prediction)

In [None]:
# Field labels used when printing two records side by side.
header = [
    "Amt",
    "Herred",
    "Sogn",
    "Navn",
    "Køn",
    "Fødested",
    "Fødeår",
    "Civilstand",
    "Position",
    "Erhverv",
    "Kipnr",
    "Løbenr",
    "Group",
]

In [None]:
# Print every candidate pair whose predicted probability for class 0 is below
# the cut-off, with the two source records side by side for manual inspection.
CLASS0_PROB_CUTOFF = 0.135
N_FIELDS_SHOWN = 11

# The loop variable keeps the name "d" because a later cell displays it.
for pair_no, d in enumerate(prediction):
    if d["probabilities"][0] < CLASS0_PROB_CUTOFF:
        print(d["probabilities"])
        print(d["classes"])
        # Predictions are in df_eval row order (input_fn was built with shuffle=False).
        r = df_eval.iloc[pair_no]
        pair = df.loc[[(r.a_FT, r.a_Kipnr, r.a_Løbenr),
                       (r.b_FT, r.b_Kipnr, r.b_Løbenr)]]
        rec_a = pair.iloc[0]
        rec_b = pair.iloc[1]
        # A separate name for the inner index avoids shadowing the pair index,
        # and .iloc makes the positional access explicit (integer keys on a
        # label-indexed Series are deprecated as positions in recent pandas).
        # NOTE(review): labels come from `header`, not from the frame's own
        # column names — check that they still line up after set_index.
        for field_no in range(N_FIELDS_SHOWN):
            print("{:10s} {:40s} {:40s}".format(
                header[field_no],
                str(rec_a.iloc[field_no]),
                str(rec_b.iloc[field_no]),
            ))
        print()

In [None]:
# Leftover inspection cell: shows the last prediction dict bound to "d" by the
# loop above. Fails with NameError if that loop never ran an iteration.
d

In [None]:
# List the names of the variables held by the estimator.
model2.get_variable_names()

In [None]:
# Print each estimator variable name followed by its current value.
for variable_name in model2.get_variable_names():
    print(variable_name)
    variable_value = model2.get_variable_value(variable_name)
    print(variable_value)