In [None]:
import random
import tensorflow as tf
import numpy as np
import re
import pandas as pd
import utils
import editdistance

# Define scoring

In [None]:
def ss(a, b):
    """Return a similarity score for the strings ``a`` and ``b``.

    Identical strings score exactly 1.0.  For any other pair the score is
    one minus the edit distance divided by the mean length of the two
    strings, so the value falls steadily as the strings diverge.  It can be
    negative when the edit distance exceeds the mean length.

    Earlier, non-identical strings returned the raw normalised *distance*
    while identical strings returned 1.0, so a one-letter difference scored
    close to 0 and a perfect match scored the same as two unrelated strings.
    """
    if a == b:
        # Also covers two empty strings, where the mean length would be zero.
        return 1.0
    mean_length = (len(a) + len(b)) / 2
    return 1.0 - editdistance.eval(a, b) / mean_length

In [None]:
# Feature names, in the same order as the tuple returned by `score`.
score_columns = [
    "r_name",
    "r_birthp",
    "r_pos",
    "r_job",
    "c_civil",
    "m_gender",
    "d_birthy",
]

In [None]:
def score(a, b):
    """Compare two person records and return their feature tuple.

    The tuple holds, in order: the `ss` scores for name, birthplace,
    position and occupation; the lower-cased civil statuses of ``a`` and
    ``b`` joined by "-"; 1.0 if the genders are equal, else 0.0; and the
    absolute difference between the birth years.
    """
    name_score = ss(a.Navn, b.Navn)
    birthplace_score = ss(a.Fødested, b.Fødested)
    position_score = ss(a.Position, b.Position)
    occupation_score = ss(a.Erhverv, b.Erhverv)

    # Order matters: "x-y" and "y-x" are distinct categories.
    civil_pair = a.Civilstand.lower() + "-" + b.Civilstand.lower()

    if a.Køn == b.Køn:
        same_gender = 1.0
    else:
        same_gender = 0.0

    birth_year_gap = abs(a.Fødeår - b.Fødeår)

    return (name_score, birthplace_score, position_score, occupation_score,
            civil_pair, same_gender, birth_year_gap)

# Get some training data

In [None]:
# Load the table of known links from <datadir>/links/matches.csv; the file
# uses "|" as its field separator.
matches = pd.read_csv(str(utils.datadir / "links" / "matches.csv"),
                     sep="|")

In [None]:
# Load the person records from a pickle located relative to the current
# working directory. Unpickling can execute arbitrary code, so only load
# files you produced yourself.
df = pd.read_pickle("dataframe.pickled")

In [None]:
# Index the records by (FT, Kipnr, Løbenr) so that a link row can address a
# record by that triple. This mutates `df` in place.
# NOTE(review): re-running this cell on the already-indexed frame presumably
# fails because the three columns are no longer regular columns — reload `df` first.
df.set_index(["FT", "Kipnr", "Løbenr"], inplace=True)

In [None]:
# Sort the frame by its index, in place — presumably to keep the tuple
# lookups in the matching loop fast; confirm if the ordering matters otherwise.
df.sort_index(inplace=True)

In [None]:
# Accumulator for training rows: each entry is a `score` tuple with a boolean
# label appended. The cells that follow append to it, so re-run this cell
# before re-running them to avoid duplicated rows.
lots = []

Look up the raw records for every matched pair and score them as positive examples:

In [None]:
# Score every linked pair and store it as a positive (label True) example.
for link in matches.itertuples():
    try:
        # .iloc[0] keeps the first row of each lookup result
        # (assumes the lookup yields a DataFrame — TODO confirm).
        a = df.loc[(link.a_FT, link.a_Kipnr, link.a_Løbenr)].iloc[0]
        b = df.loc[(link.b_FT, link.b_Kipnr, link.b_Løbenr)].iloc[0]
    except KeyError:
        # One side of the link is absent from `df`: report it and skip the pair.
        print("Not found :(")
    else:
        lots.append(score(a, b) + (True,))

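Generate negative examples: draw a random sample of records with `FT == 1845` and treat each pair of neighbouring rows in the sample as a non-match.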

In [None]:
# Draw as many records as there are links, which yields ssize-1 negative pairs.
ssize = len(matches)

# Fixed seed so the sampled non-matches — and therefore the training set and
# the fitted model — are the same on every Restart & Run All.
SAMPLE_SEED = 0
s = df.loc[1845].sample(ssize, random_state=SAMPLE_SEED)

# Neighbouring rows of the random sample are paired and labelled False.
# Sampling is without replacement, so a record is never paired with itself.
# NOTE(review): assumes two distinct rows with FT == 1845 never describe the
# same person — confirm against the data.
for i in range(ssize-1):
    a = s.iloc[i]
    b = s.iloc[i+1]
    lots.append(score(a,b) + (False,))

In [None]:
# One row per scored pair: the feature columns followed by the boolean label.
df_train = pd.DataFrame(
    data=lots,
    columns=[*score_columns, "label"],
)

# Build the feature columns and the model

In [None]:
# Numeric features: one column per numeric entry of the `score` tuple.
navn, fødested, position, erhverv, køn, fødeår = (
    tf.feature_column.numeric_column(key)
    for key in ("r_name", "r_birthp", "r_pos", "r_job", "m_gender", "d_birthy")
)

# The civil-status pair is a string, so it is hashed into 100 buckets.
civilstand = tf.feature_column.categorical_column_with_hash_bucket("c_civil", 100)

In [None]:
# Linear classifier over the seven pairwise comparison features.
# NOTE(review): model_dir is a fixed relative directory; presumably state from
# an earlier run found there is reused — clear it for a clean retrain.
model = tf.estimator.LinearClassifier(
    model_dir="model",
    feature_columns=[
        navn,
        fødested,
        civilstand,
        position,
        erhverv,
        køn,
        fødeår,
    ],
)

In [None]:
# Shuffled input function used for training.
# Only the feature columns are passed as `x`: handing over the whole frame
# would also expose the `label` target as an input feature, which becomes
# target leakage as soon as a model consumes every key it is given.
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=df_train[score_columns],
    y=df_train.label,
    shuffle=True)

In [None]:
# Display the assembled training frame as a visual sanity check.
df_train

In [None]:
# Fit the classifier using the current `input_fn`.
# NOTE(review): no `steps`/`max_steps` is given, so the amount of training is
# governed entirely by how `input_fn` was configured — confirm that is enough.
model.train(input_fn=input_fn)

In [None]:
# Evaluate with the same `input_fn` (built from df_train) that was used for
# training, so these metrics describe fit on the training data, not
# performance on held-out pairs.
results = model.evaluate(input_fn=input_fn, steps=None)

In [None]:
# Show the evaluation metrics computed in the previous cell.
results

In [None]:
# Take the first item yielded by model.predict to look at what one prediction
# contains. With a shuffling `input_fn` this is not tied to a specific row.
ex = next(model.predict(input_fn=input_fn))

In [None]:
# Unshuffled input function: rows are fed in df_train order so each
# prediction can be matched back to its row by position.
# Only the feature columns are passed as `x`: handing over the whole frame
# would also expose the `label` target as an input feature.
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=df_train[score_columns],
    y=df_train.label,
    shuffle=False)

In [None]:
# Print, for each row of df_train, whether the predicted class equals its label.
# Relies on predictions arriving in df_train order (unshuffled input_fn).
for i, prediction in enumerate(model.predict(input_fn)):
    predicted_class = int(prediction["classes"][0])
    print(i, predicted_class == df_train.at[i, "label"])