In [None]:
import random
import tensorflow as tf
import numpy as np
import re
import pandas as pd
import utils
import editdistance
import csv
import multiprocessing

# Define scoring

In [None]:
def ss(a: str, b: str):
    """Similarity score for two strings a and b, in the range [0, 1].

    Args:
        a: First string; may be a missing value (anything ``pd.isnull``
            reports as null).
        b: Second string; may be a missing value.

    Returns:
        0.0 if either value is missing, 1.0 if the strings are equal,
        otherwise ``1 - edit_distance / max(len(a), len(b))``, so that more
        similar strings always score higher.
    """
    if pd.isnull(a) or pd.isnull(b):
        return 0.0
    if a == b:
        return 1.0
    # The raw ratio is a *distance* (0 = close, 1 = far). Invert it so the
    # value is consistent with the 1.0 returned for identical strings.
    # max(len) cannot be 0 here: two empty strings are caught by a == b.
    distance = editdistance.eval(a, b)
    return 1.0 - distance / max(len(a), len(b))

Column names used for the score features in the training DataFrame:

In [None]:
# One column name per element of the tuple returned by score(), in order.
score_columns = [
    "r_name",
    "r_birthp",
    "r_civil",
    "r_pos",
    "r_job",
    "m_gender",
    "d_birthy",
]

In [None]:
def score(a, b):
    """Compare two person records and return their feature tuple.

    Args:
        a: Record exposing the attributes Navn, Fødested, Civilstand,
            Position, Erhverv, Køn and Fødeår.
        b: Second record with the same attributes.

    Returns:
        A 7-tuple: ss() scores for name, birthplace, civil status, position
        and occupation, then 1.0/0.0 for equal/unequal gender, then the
        absolute difference of the birth years.
    """
    name_sim = ss(a.Navn, b.Navn)
    birthplace_sim = ss(a.Fødested, b.Fødested)
    civil_sim = ss(a.Civilstand, b.Civilstand)
    position_sim = ss(a.Position, b.Position)
    job_sim = ss(a.Erhverv, b.Erhverv)

    if a.Køn == b.Køn:
        gender_match = 1.0
    else:
        gender_match = 0.0

    birth_year_gap = abs(a.Fødeår - b.Fødeår)

    return (
        name_sim,
        birthplace_sim,
        civil_sim,
        position_sim,
        job_sim,
        gender_match,
        birth_year_gap,
    )

# Get some training data

In [None]:
# Load the known record links from the pipe-separated matches file.
matches = pd.read_csv(
    str(utils.datadir / "links" / "matches.csv"),
    sep="|",
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
matches.head()

In [None]:
# Load the person records and prepare them for key-based lookup in a single
# step. Building the frame in one chain (instead of separate in-place cells)
# keeps this cell idempotent: re-running it never hits an already-consumed
# column in set_index.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
df = (
    pd.read_pickle("dataframe.pickled")
    # Rows without a birth year cannot be scored (score() subtracts Fødeår).
    .dropna(subset=["Fødeår"])
    .set_index(["FT", "Kipnr", "Løbenr"])
    # Sorted so the MultiIndex lookups done later are efficient.
    .sort_index()
)

Getting raw data for all matches:

In [None]:
def to_scored_frame(d):
    """Score every linked record pair in ``d`` and label it as a match.

    Args:
        d: DataFrame whose first three columns hold the index key of one
            record in the global ``df`` and whose remaining columns hold the
            key of the other record.
            NOTE(review): assumes keys are in (FT, Kipnr, Løbenr) order,
            matching the index of ``df`` — confirm against matches.csv.

    Returns:
        DataFrame with one row per pair found in ``df``, the columns of
        ``score_columns`` plus a ``label`` column that is always True.
        Pairs with a key missing from ``df`` are skipped.
    """
    def _lookup(key):
        # Fetch each record by its own key. A single combined list lookup
        # followed by iloc[0]/iloc[1] silently pairs the wrong rows (or
        # raises IndexError) when a key is duplicated or dropped.
        record = df.loc[key]
        # A duplicated index key yields a DataFrame; use its first row.
        if isinstance(record, pd.DataFrame):
            return record.iloc[0]
        return record

    lots = []
    for t in d.itertuples():
        # t[0] is the row index of d; the key columns start at t[1].
        try:
            a = _lookup(t[1:4])
            b = _lookup(t[4:])
        except KeyError:
            # One of the two records is not in df (e.g. dropped for a
            # missing birth year); skip the pair.
            continue
        lots.append(score(a, b) + (True,))
    return pd.DataFrame(lots, columns=score_columns + ["label"])

In [None]:
# Score only the first 10000 known matches to keep the run time manageable.
# NOTE(review): utils.parallelize presumably splits the frame across worker
# processes and concatenates the per-chunk results — confirm in utils.
positive = utils.parallelize(matches.iloc[:10000], to_scored_frame)

Generate negative examples by scoring randomly sampled pairs of records, which are assumed to be non-matches:

In [None]:
import random
import itertools

In [None]:
# Values of the first index level ("FT") from which non-match samples are drawn.
# NOTE(review): presumably the census years present in df — confirm.
years = [1845, 1850, 1860, 1880, 1885]

In [None]:
# Number of records sampled per non-match task; tied to the size of the
# positive set.
ssize = len(positive)
def generate_nonmatch_scores(_):
    """Score consecutive pairs of randomly sampled records as non-matches.

    Picks one entry of the global ``years`` at random, samples ``ssize``
    records of that year from the global ``df`` and scores each record
    against the next one in the sample.

    Args:
        _: Ignored; present only so the function can be used with ``map``.

    Returns:
        List of ``ssize - 1`` tuples: the values of ``score()`` followed by
        the label ``False``.
    """
    # Use a freshly seeded generator per call. DataFrame.sample otherwise
    # draws from numpy's global RNG, whose state forked pool workers inherit
    # unchanged, so several workers could produce identical samples.
    rng = np.random.RandomState()
    year = years[rng.randint(len(years))]
    s = df.loc[year].sample(ssize, random_state=rng)
    return [
        score(s.iloc[i], s.iloc[i + 1]) + (False,)
        for i in range(ssize - 1)
    ]

In [None]:
# Run one sampling task per CPU core; the argument passed to each task is
# ignored by generate_nonmatch_scores.
with multiprocessing.Pool() as p:
    res = p.map(
        generate_nonmatch_scores,
        [None] * multiprocessing.cpu_count(),
    )

In [None]:
# Flatten the per-task lists of score tuples into one frame of negatives.
negative = pd.DataFrame(
    list(itertools.chain.from_iterable(res)),
    columns=score_columns + ["label"],
)

In [None]:
# Drop the raw per-task results now that they are held in `negative`.
del res

In [None]:
# Combine positive and negative examples into one training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise the row
# labels of both inputs are kept and the combined index contains duplicates.
df_train = pd.concat([positive, negative], ignore_index=True)

# Build feature columns

In [None]:
# One numeric feature column per score column of the training frame.
(navn, fødested, civilstand, position, erhverv, køn, fødeår) = (
    tf.feature_column.numeric_column(key)
    for key in (
        "r_name",
        "r_birthp",
        "r_civil",
        "r_pos",
        "r_job",
        "m_gender",
        "d_birthy",
    )
)

# Build model

In [None]:
# Linear classifier over all seven score features; checkpoints are written
# to the "model" directory.
model = tf.estimator.LinearClassifier(
    model_dir="model",
    feature_columns=[
        navn,
        fødested,
        civilstand,
        position,
        erhverv,
        køn,
        fødeår,
    ],
)

In [None]:
# Input function feeding the training frame to the estimators in shuffled
# order, with the "label" column as target.
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x=df_train,
    y=df_train["label"],
    shuffle=True,
)

In [None]:
# Train the linear model for at most 100 steps.
model.train(input_fn_train, steps=100)

In [None]:
# Evaluated on the same data used for training, so these metrics describe
# fit to the training set, not generalization.
model.evaluate(input_fn_train)

# Alternative model: DNN classifier

In [None]:
# Small feed-forward network over the same seven features, with hidden
# layers of 7, 5 and 3 units; checkpoints are written to "model2".
model2 = tf.estimator.DNNClassifier(
    hidden_units=[7, 5, 3],
    model_dir="model2",
    feature_columns=[
        navn,
        fødested,
        civilstand,
        position,
        erhverv,
        køn,
        fødeår,
    ],
)

In [None]:
# No step limit is given, so training runs until the input function is
# exhausted.
model2.train(input_fn=input_fn_train)

In [None]:
# Evaluated on the training data itself (same input function as training).
model2.evaluate(input_fn=input_fn_train)