In [None]:
import pathlib
import re
import collections
import functools
import difflib
import operator
import json
import requests
import time
import editdistance
import random

# Root directory of the census data; the lc_*.csv files are read from here and
# the links file is resolved relative to it ("../links/linkede_personer.csv").
# NOTE(review): hard-coded absolute path -- must be adjusted to run elsewhere.
datadir = pathlib.Path("/home/david/pro/scc/data")

%run utils.py
import similarity

In [None]:
# Census years for which link records are collected (used as keys of `links`).
relevant = (1845, 1850, 1860, 1880, 1885)

In [None]:
# Read the pipe-separated link file and index it per census year.
links_file = datadir / ".." / "links" / "linkede_personer.csv"
with links_file.open("r", encoding="latin1") as fd:
    # Echo the first line (presumably the column header) for reference.
    print(next(fd))
    links = {census_year: {} for census_year in relevant} # year -> (kip,løb) -> linkID
    for raw_line in fd:
        fields = [field.lower() for field in raw_line.strip().split("|")]
        # A record is filed under the first relevant year whose digits occur in
        # field 1; records matching none of the years are ignored.
        census_year = next((y for y in relevant if str(y) in str(fields[1])), None)
        if census_year is not None:
            # Fields 2-3 form the (kip, løb) key, field 0 is the link ID.
            links[census_year][tuple(fields[2:4])] = fields[0]

In [None]:
# Show how many (kip, løb) keys were loaded for each census year.
for census_year in links:
    print(census_year, len(links[census_year]))

In [None]:
# Import tensorflow
import tensorflow as tf
import numpy as np

def get_score(a_row, b_row):
    """Compute the similarity feature vector for a pair of census rows.

    Both arguments are sequences of string fields indexed by column number.
    The returned 1-D numpy array holds nine scores, in this order:

    * columns 3, 4, 5, 6 (first, middle, last name, initials) compared with
      ``similarity.string_similarity_linear``;
    * column 8 compared twice: once with ``string_similarity_linear`` and
      once with ``similarity.geo_similarity``;
    * columns 11, 12 (position, erhverv) compared with
      ``string_similarity_linear``;
    * column 9 of each row converted with ``int`` and compared with
      ``similarity.year_similarity``.

    Raises whatever ``int`` raises if column 9 is not an integer string.
    """
    compare_text = similarity.string_similarity_linear

    features = [compare_text(a_row[col], b_row[col]) for col in (3, 4, 5, 6)]
    features.append(compare_text(a_row[8], b_row[8]))
    features.append(similarity.geo_similarity(a_row[8], b_row[8]))
    features.extend(compare_text(a_row[col], b_row[col]) for col in (11, 12))
    features.append(similarity.year_similarity(int(a_row[9]), int(b_row[9])))
    return np.array(features)

In [None]:
# Read every census file and collect two things:
#   linked_data: linkID -> [(year, row)] for rows whose (kip, løb) key appears
#                in `links` for that census year;
#   all_data:    year -> rows whose column 6 equals "ah" (used later as the
#                pool for random pairs; presumably a cheap way to subsample --
#                TODO confirm).
linked_data = {} # linkID -> [(year, row)]
all_data = {}
for fn in sorted(datadir.glob("lc_*.csv")):
    # The census year is the first four-digit number in the file name.
    year = re.search(r"\d{4}", fn.name).group(0)
    year_num = int(year)
    print(year_num)
    all_data[year_num] = []
    # A census file whose year is not in `relevant` has no entry in `links`;
    # fall back to an empty mapping instead of raising KeyError on lookup.
    year_links = links.get(year_num, {})
    with fn.open("r", encoding="UTF-8") as fd:
        # Echo the first line (presumably the column header) for reference.
        print(next(fd))
        for line in fd:
            row = line.strip().split("|")
            if row[6] == "ah":
                all_data[year_num].append(row)
            # The last two columns are kip and løb; only the part of løb
            # before the first comma is used as the key.
            kip, løb = row[-2:]
            løb = løb.split(",")[0]
            pair = (kip,løb)
            if pair in year_links:
                linkID = year_links[pair]
                linked_data.setdefault(linkID, []).append((year_num, row))

In [None]:
# Discard link IDs with exactly one record: they cannot form a pair.
useless = []
for link_id, records in linked_data.items():
    if len(records) == 1:
        useless.append(link_id)
for link_id in useless:
    linked_data.pop(link_id)

In [None]:
# Spot-check the records collected for one hard-coded link ID.
# NOTE(review): raises KeyError if this ID is not present in linked_data.
linked_data["353726"]

In [None]:
# Length of the feature vector built by get_score:
# 4 name scores + 2 geo scores + 2 position scores + 1 age score.
input_size = 9
# Two classes, one-hot encoded: (1, 0) for linked pairs, (0, 1) for random pairs.
output_size = 2

In [None]:
# Single-layer softmax classifier (TensorFlow 1.x graph API).

# Placeholders: feature vectors in, one-hot labels out; the batch size is left open.
x = tf.placeholder(tf.float32, shape=[None, input_size], name="x-in-data")
y_ = tf.placeholder(tf.float32, shape=[None, output_size], name="y-out-data")

# Trainable parameters, both initialised to zero.
initial_weights = tf.zeros([input_size, output_size])
initial_bias = tf.zeros([output_size])
W = tf.Variable(initial_weights)
b = tf.Variable(initial_bias)

# Predicted class probabilities.
y = tf.nn.softmax(tf.matmul(x, W) + b)

In [None]:
# Mean cross-entropy between the one-hot labels y_ and the model prediction.
# The loss is computed from the unnormalised logits with the fused TensorFlow
# op instead of tf.log(tf.nn.softmax(...)): taking the log of a saturated
# softmax output yields log(0) and turns the loss (and the gradients) into NaN.
logits = tf.matmul(x, W) + b
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
# Plain gradient descent with learning rate 0.5.
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
# The interactive session installs itself as the default session, which is
# what allows the bare .run() call on the initializer below.
s = tf.InteractiveSession()
tf.global_variables_initializer().run()

In [None]:
# Fraction of examples whose most probable predicted class equals the labelled class.
predicted_class = tf.argmax(y, 1)
expected_class = tf.argmax(y_, 1)
correct_prediction = tf.equal(predicted_class, expected_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
# Build a labelled training set of pairwise feature vectors and take one
# gradient step on it.
data = []
labels = []

# Positive examples, label (1, 0): consecutive records of the same linked
# person. At most 1000 link IDs are drawn; the sample size is capped at the
# number of available IDs because random.sample raises ValueError when asked
# for more items than the population holds.
n_links = min(1000, len(linked_data))
for linkID in random.sample(list(linked_data), n_links):
    rows = linked_data[linkID]
    for (a_year, a_row), (b_year, b_row) in zip(rows[:-1], rows[1:]):
        data.append(get_score(a_row, b_row))
        labels.append((1, 0))

# Negative examples, label (0, 1): 200 random pairs of distinct rows drawn
# from the same census year.
for year, rows in all_data.items():
    if len(rows) < 2:
        # random.sample(rows, 2) would raise ValueError; no pair can be drawn.
        continue
    for i in range(200):
        a_row, b_row = random.sample(rows, 2)
        data.append(get_score(a_row, b_row))
        labels.append((0, 1))

x_train = np.vstack(data)

y_train = np.vstack(labels)

# A single gradient-descent step on the whole batch; re-running this cell
# draws a fresh sample and takes another step.
s.run(train_step, feed_dict={x: x_train,
                             y_: y_train})

In [None]:
# Accuracy on the same batch that was used for the training step.
train_feed = {x: x_train, y_: y_train}
s.run(accuracy, feed_dict=train_feed)

In [None]:
# Class balance: number of (1, 0) linked pairs versus (0, 1) random pairs.
collections.Counter(labels)

In [None]:
# Current weight matrix (input_size x output_size): one row per similarity feature.
s.run(W)

In [None]:
# Predicted class probabilities for the first ten feature vectors; linked
# pairs are appended to `data` first, so these are normally positive examples.
s.run(y, feed_dict={x: data[:10]})

In [None]:
# Predicted class probabilities for the last ten feature vectors; random
# pairs are appended to `data` last, so these are normally negative examples.
s.run(y, feed_dict={x: data[-10:]})

In [None]:
# Display the stacked one-hot label matrix, one row per training pair.
y_train