In [None]:
import pathlib
import re
import collections
import difflib
import csv
import pandas
import utils

# Prep for pandas

Headers seem to be one of:

    'Amt|Herred|Sogn|aarfra|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|kipnr|løbenr',
    'Amt|Herred|Sogn|aarfra|navn|køn|Fødested|Fødeaar|Civilstand|Stilling_i_husstanden|Erhverv|kipnr|løbenr',
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|husstnr|kipnr|løbenr',
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|kipnr|løbenr',
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|stilling_i_husstanden|Erhverv|husstnr|kipnr|løbenr'

Read all the files, normalize columns and dump into complete.csv with years marked (for pandas).
**DONE**

In [None]:
# Normalize every raw census file into one pipe-separated file (complete.csv) with the
# census year prepended as the FT column. Rows that share (kipnr, løbenr) with an earlier
# row of the same file but differ in content are written to problematic.csv.

# Index of the column to delete for each known raw header, so that every row ends up as
# Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|kipnr|løbenr.
# None means the row already has that layout.
DROP_COLUMN_BY_HEADER = {
    # aarfra is not needed
    'Amt|Herred|Sogn|aarfra|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|kipnr|løbenr': 3,
    # aarfra is not needed; Stilling_i_husstanden is treated as Position
    'Amt|Herred|Sogn|aarfra|navn|køn|Fødested|Fødeaar|Civilstand|Stilling_i_husstanden|Erhverv|kipnr|løbenr': 3,
    # husstnr only appears rarely, so it is dropped
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|husstnr|kipnr|løbenr': 10,
    # ideal layout, nothing to delete
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|Position|Erhverv|kipnr|løbenr': None,
    # husstnr only appears rarely; stilling_i_husstanden is treated as Position
    'Amt|Herred|Sogn|navn|køn|Fødested|Fødeaar|Civilstand|stilling_i_husstanden|Erhverv|husstnr|kipnr|løbenr': 10,
}

# newline="" is what the csv module asks for on files handed to csv.writer; it keeps the
# explicit "\n" line terminator from being translated by the text layer.
with (utils.datadir / "clean" / "complete.csv").open("w", encoding="utf-8", newline="") as fout, \
     (utils.datadir / "clean" / "problematic.csv").open("w", encoding="utf-8", newline="") as pout:
    writer = csv.writer(fout, delimiter="|", lineterminator="\n")
    bad = csv.writer(pout, delimiter="|", lineterminator="\n")
    header = "FT|Amt|Herred|Sogn|Navn|Køn|Fødested|Fødeår|Civilstand|Position|Erhverv|Kipnr|Løbenr".split("|")
    writer.writerow(header)
    bad.writerow(header)
    for fn in sorted((utils.datadir / "raw").glob("UTF8_*.csv")):
        seen = {}
        dupes = 0
        # Reset per file: a file whose data starts before any header line is reported as
        # "Unknown header!" instead of raising NameError (first file on a fresh kernel)
        # or silently reusing the header of the previous file.
        prevHeader = None
        year = utils.extractYear(str(fn))
        print(fn, year)
        with fn.open("r", encoding="utf-8") as fd:
            for line in fd:
                line = line.strip()
                if not line:
                    # a blank line carries no record
                    continue
                split = line.split("|")

                # detect new header inside file
                if split[0] == "Amt" or split[-1] == "løbenr":
                    prevHeader = line
                    continue

                if prevHeader not in DROP_COLUMN_BY_HEADER:
                    print("Unknown header!")
                    print(prevHeader)
                    # only the rest of this file is skipped; the next file is still processed
                    break

                # delete the column that is not part of the common layout, if any
                drop_index = DROP_COLUMN_BY_HEADER[prevHeader]
                if drop_index is not None:
                    del split[drop_index]
                split.insert(0, year)

                key = tuple(split[-2:])  # (kipnr, løbenr)
                if key in seen:
                    if seen[key] != split:
                        # same key but different content: record both versions for inspection
                        bad.writerow(seen[key])
                        bad.writerow(split)
                    dupes += 1
                    continue
                seen[key] = split
                writer.writerow(split)
        print(dupes, "duplicate lines")
        del seen

# Initial
## Read stuff

In [None]:
# Load the normalized dump. FT is parsed as int, while Navn and Fødeår are kept as
# plain strings (empty Fødeår values are removed in a later cell).
df = pandas.read_csv(
    str(utils.datadir.joinpath("clean", "complete.csv")),
    sep="|",
    converters={"FT": int, "Navn": str, "Fødeår": str},
    low_memory=False,
)

## Discard bad rows

### Fødeår
Empty fødeår is not useful...

In [None]:
# Remove rows whose Fødeår is the empty string, then turn the column into numbers.
print("Before dropping Fødeår==\"\":", len(df))
df.drop(index=df.loc[df.Fødeår == ""].index, inplace=True)
print("After:", len(df))
df.Fødeår = pandas.to_numeric(df.Fødeår)

People weren't that old back then, so discard anyone whose apparent age (census year minus birth year) is over 100, and, obviously, anyone whose age comes out negative.

In [None]:
# Apparent age at census time: census year (FT) minus birth year (Fødeår).
ages = df.FT.sub(df.Fødeår)

In [None]:
# `ages` was computed before any row was removed in this cell. Indexing the shrunken df
# with the stale boolean Series relies on pandas reindexing the mask implicitly (with a
# UserWarning), so the ages are aligned with the rows still present before each drop.
print("Before dropping age > 100:", len(df))
current_ages = ages.reindex(df.index)
df.drop(index=current_ages[current_ages > 100].index, inplace=True)
print("After:", len(df))
print("Before dropping age < 0:", len(df))
current_ages = ages.reindex(df.index)
df.drop(index=current_ages[current_ages < 0].index, inplace=True)
print("After:", len(df))

In [None]:
# The age series is no longer needed.
del ages

### Navn

In [None]:
# Remove rows whose Navn contains no letter from a-z/æ/ø/å (case-insensitive),
# which includes completely empty names.
print("Before dropping empty names with no letters:", len(df))
df.drop(index=df.loc[df.Navn.str.match(r"^[^a-zæøå]*$", case=False)].index, inplace=True)
print("After:", len(df))

Get rid of children without names.

In [None]:
def isProbablyChild(s):
    """Return True when a name field looks like a placeholder for an unnamed child.

    The text must contain "barn" and at least one of "navn", "døbt", "dreng",
    "pige" or "nyfødt"; the comparison is case-insensitive.
    """
    lowered = s.lower()
    if "barn" not in lowered:
        return False
    return any(hint in lowered for hint in ("navn", "døbt", "dreng", "pige", "nyfødt"))

In [None]:
# Rows whose Navn is flagged by isProbablyChild.
maybe_children = df.loc[df.Navn.map(isProbablyChild).astype(bool, copy=False)]

**TODO**: extract names where possible; some are like `Dorthea Kirstine Hansen (Udøbt Pigebarn)` or `1 udøbt drengebarn [Iflg.KB.28/1-1845: Carl Christian Sørensen]` where there is actually a name even though they claim not to have one.

In [None]:
# Display how often each distinct Navn value occurs in maybe_children.
maybe_children.Navn.value_counts()

In [None]:
# Remove the rows collected in maybe_children from df, then release the selection.
print("Before dropping unchristened children:", len(df))
df.drop(index=maybe_children.index, inplace=True)
print("After:", len(df))
del maybe_children

Look at remaining rows containing "barn":

In [None]:
# Remaining rows whose Navn still contains "barn" (case-insensitive).
maybe_children = df.loc[df.Navn.str.contains("barn", case=False)]

In [None]:
# Display how often each distinct Navn value occurs in maybe_children.
maybe_children.Navn.value_counts()

## Køn
Look primarily for "K" or "M".  Values that do not seem to be gender-related are discarded for now.

In [None]:
def guessGender(s):
    """Guess a gender code from a free-text gender field.

    Returns "K" for female, "M" for male and "?" when nothing gender-like is found.
    The comparison is case-insensitive and ignores surrounding whitespace.
    """
    text = s.lower().strip()
    # Decide on the leading letter first. A plain substring test classifies male terms
    # that happen to contain "k" or "f" (e.g. "Mandkøn", "Mandfolk") as female.
    first = text[:1]
    if first in ("k", "f"):
        return "K"
    if first == "m":
        return "M"
    # Otherwise fall back to looking for the letters anywhere in the text.
    if "k" in text or "f" in text:
        return "K"
    if "m" in text:
        return "M"
    return "?"

In [None]:
# Replace every Køn value by the code guessed from its string form.
df.Køn = df.Køn.astype(str).map(guessGender)

In [None]:
# Display how many rows carry each Køn value.
df.Køn.value_counts()

In [None]:
# Remove rows whose Køn is "?".
print("Before dropping rows lacking gender:", len(df))
df.drop(index=df.loc[df.Køn == "?"].index, inplace=True)
print("After:", len(df))

We can just replace the field with Boolean values now.

In [None]:
# Boolean encoding used for the Køn column.
Male = False
Female = True

In [None]:
# "M" becomes Male; every other value becomes Female.
df.Køn = df.Køn.map(lambda code: Male if code == "M" else Female)

# Check løbenr

There is one guy with ",50000"; let's remove him.  Løbenr seems to be "something,subnumber", and sometimes only the first "something". But when only the subnumber is present, what can be done?

In [None]:
# Remove rows whose Løbenr starts with a comma.
print("Before dropping invalid løbenr:", len(df))
df.drop(index=df.loc[df.Løbenr.str.startswith(",")].index, inplace=True)
print("After:", len(df))

def toNumbers(s):
    """Split a comma-separated string and return its parts as a tuple of ints.

    Raises ValueError when a part cannot be parsed as an integer.
    """
    return tuple(map(int, s.split(",")))

# Per census year: print the number of rows. The code after `continue` would check
# that, within the same Kipnr, the parsed Løbenr never decreases from one row to the next.
for year, group in df.groupby("FT"):
    print(year, len(group))
    # NOTE(review): this `continue` skips everything below, so the ordering check
    # never runs. It looks like it was switched off on purpose; confirm.
    continue
    prev = None
    for row in group.itertuples():
        # (Kipnr, Løbenr as a tuple of ints) for the current row
        curr = (row.Kipnr, toNumbers(row.Løbenr))
        if prev is not None:
            # same Kipnr but a smaller Løbenr than the previous row: report and stop this year
            if curr[0] == prev[0] and curr[1] < prev[1]:
                print("Uh oh...")
                print(prev)
                print(row)
                break
        prev = curr

# Drop dårlige fødesteder

In [None]:
# Remove rows whose Fødested is a float instead of text.
# NOTE(review): presumably these are the NaN values pandas uses for empty fields; confirm.
df.drop(index=df.loc[df.Fødested.map(lambda place: isinstance(place, float))].index, inplace=True)

# Checkpoint!

In [None]:
# Checkpoint: save the cleaned frame next to the notebook.
df.to_pickle("tmp.pickled")

In [None]:
# Restore the frame from the checkpoint file written by the previous cell.
# Unpickling executes arbitrary code, so only load a tmp.pickled you created yourself.
df = pandas.read_pickle("tmp.pickled")

In [None]:
# Name frequencies among the rows where Køn is True.
g = df.loc[df.Køn, "Navn"].value_counts()

In [None]:
# Same counts as a plain dict keyed by name.
d = g.to_dict()

In [None]:
# Distinct first tokens (whitespace-separated) of every Navn in df.
names = {record.Navn.split()[0] for record in df.itertuples()}

In [None]:
# Display the collected set (this can be a very large output).
names

**Anything below is old**

# Uhhh, lowercase for realsies

In [None]:
# Rewrite every data file in place with all rows lower-cased, below the common header.
# NOTE(review): AllEntries and commonHeader are not defined in the visible notebook.
with AllEntries() as dataSet:
    for fn, year, entries in dataSet.getEntries():
        print(year)
        # build the complete output before the file is opened for writing
        lowered_rows = [entry.toRow().lower() + "\n" for entry in entries]
        with fn.open("w", encoding="UTF-8") as fd:
            fd.writelines([commonHeader + "\n"] + lowered_rows)

In [None]:
# Lower-case and strip the first field of every row in gps-coords.csv, sort the rows
# and write the file back with its header line first.
# NOTE(review): `datadir` is not defined in the visible notebook (elsewhere it is utils.datadir).
with (datadir / "gps-coords.csv").open("r", encoding="utf-8") as fd:
    header = next(fd)
    lines = sorted(
        "|".join([first.lower().strip(), *rest])
        for first, *rest in (row.split("|") for row in fd)
    )
lines.insert(0, header)
with (datadir / "gps-coords.csv").open("w", encoding="utf-8") as fd:
    fd.writelines(lines)

# Fix the birthplace field

## Part one: replace "i sognet"

In [None]:
# Replace birthplaces of the form "(her) i sognet" ("in this parish") by a concrete place
# and rewrite every data file in place.
# NOTE(review): AllEntries and commonHeader are not defined in the visible notebook.

# "[her ]i sognet" followed by a place in (), [] or {}
placeInBrackets = re.compile(r"(her ?)?i sognet (\{|\[|\()(?P<place>.*?)(\]|\)|\})")
# "[her ]i sognet, <place>". The named group was previously written "(<P<place>.*)",
# which defines no group called "place", and the pattern began with a stray space that
# kept it from matching "i sognet, x" and "heri sognet, x".
placeAfterComma = re.compile(r"(her ?)?i sognet, (?P<place>.*)")

with AllEntries() as dataSet:
    for fn, year, entries in dataSet.getEntries():
        print(year)
        lines = [commonHeader + "\n"]
        for entry in entries:
            # try to fix the birthplace
            if "her i sognet" in entry.fødested or \
               "heri sognet" in entry.fødested or \
                entry.fødested.replace(".", "") in ("i sognet", "her i s"):
                # the location may be given in brackets; use it
                match1 = placeInBrackets.search(entry.fødested)
                # or it may follow a comma
                match2 = placeAfterComma.search(entry.fødested)
                if match1:
                    new = match1.group("place")
                elif match2:
                    new = match2.group("place")
                else:
                    # nothing more specific given: the parish itself is the birthplace
                    new = entry.sogn
                entry = entry._replace(fødested=new)
            lines.append(entry.toRow() + "\n")
        with fn.open("w", encoding="UTF-8") as fd:
            fd.writelines(lines)

In [None]:
# Quick check that the bracket pattern also captures a place whose brackets are
# mismatched ("[" closed by ")").
re.search(r"(her ?)?i sognet (\[|\()(?P<place>.*?)(\]|\))", "her i sognet [tommerup)").group("place")

In [None]:
# for 1845 som eksempel
sorted((a,b) for (a,b) in changes if a not in ("her i sognet", "heri sognet", "i sognet"))

## Part two: more of the same

Let's see what's missing; the GPS lookup efforts showed a lot of crud.

**TODO**

Change all the simple "dito", "ditto", "do", "do." etc. to the previous birthplace...

In [None]:
# Replace plain ditto markers ("do", "dito", "ditto", with or without dots) by the
# birthplace of the preceding row and rewrite every data file in place.
# NOTE(review): AllEntries and commonHeader are not defined in the visible notebook.
with AllEntries() as dataSet:
    for fn, year, entries in dataSet.getEntries():
        print(year)
        lines = [commonHeader + "\n"]
        prev = None  # most recent birthplace that is not a ditto marker
        for entry in entries:
            place = entry.fødested.replace(".", "")
            if place in ("do", "dito", "ditto"):
                # A marker on the first row has nothing to refer to; leave it untouched
                # instead of writing None as the birthplace.
                if prev is not None:
                    entry = entry._replace(fødested=prev)
            else:
                prev = entry.fødested
            lines.append(entry.toRow() + "\n")
        with fn.open("w", encoding="UTF-8") as fd:
            fd.writelines(lines)

In [None]:
_remove_first = str.maketrans({
    c: "" for c in ",.-:?/!"
})
_remove_last = str.maketrans({
    c: "" for c in "[]{}()"
})

In [None]:
def extractPlace(name):
    """Return a cleaned-up place name from a birthplace field.

    Punctuation is removed first. If the remainder is a ditto marker ("do", "dito",
    "ditto", ...) followed by a bracketed place, only the bracketed place is
    returned; otherwise the whole text is used. Brackets and surrounding whitespace
    are removed from the result.
    """
    stripped = name.translate(_remove_first)
    # Allow optional whitespace around the marker instead of deleting every space up
    # front, which used to glue multi-word places together
    # ("dito [store heddinge]" -> "storeheddinge").
    match = re.match(r"\s*di?t?t?o\s*(\(|\[)(.*)(\)|\])", stripped)
    if match:
        return match.group(2).translate(_remove_last).strip()
    else:
        return stripped.translate(_remove_last).strip()

In [None]:
# Count how often each cleaned-up birthplace occurs across all files.
# NOTE(review): AllEntries is not defined in the visible notebook.
places = collections.Counter()
with AllEntries() as dataSet:
    for fn, year, entries in dataSet.getEntries():
        print(year)
        # collect one file completely, then add it to the running totals
        places.update([extractPlace(entry.fødested) for entry in entries])

In [None]:
# Display every place with its count, most frequent first (this can be a very large output).
places.most_common()

In [None]:
# Reusable matcher for comparing one fixed string against many candidates.
differ = difflib.SequenceMatcher()

In [None]:
# Fix the first sequence; candidates are supplied later through set_seq2.
differ.set_seq1("københavn")

In [None]:
# Ditto marker with a bracketed place: should give "rønne".
extractPlace("do. [rønne]")

In [None]:
# Another ditto spelling: should give "kjøbenhavn".
extractPlace("dito [kjøbenhavn]")

In [None]:
# No ditto marker: punctuation and brackets are removed, giving "kjøbenhavn".
extractPlace("kjøbenha[vn] !!")

In [None]:
# Collect every place whose similarity ratio against the matcher's first sequence
# exceeds 0.8, printing each hit with its count.
copenhagen_names = set()
for candidate, count in places.most_common():
    differ.set_seq2(candidate)
    if differ.ratio() <= 0.8:
        continue
    print(candidate, count)
    copenhagen_names.add(candidate)

In [None]:
# Number of distinct cleaned-up places.
len(places)

In [None]:
# Rewrite every data file in place, replacing each birthplace whose cleaned-up form is
# in copenhagen_names by "københavn".
# NOTE(review): AllEntries and commonHeader are not defined in the visible notebook.
with AllEntries() as dataSet:
    for fn, year, entries in dataSet.getEntries():
        print(year)
        lines = [commonHeader + "\n"]
        # (the unused `prev` variable left over from the ditto cell has been removed)
        for entry in entries:
            if extractPlace(entry.fødested) in copenhagen_names:
                entry = entry._replace(fødested="københavn")
            lines.append(entry.toRow() + "\n")
        with fn.open("w", encoding="UTF-8") as fd:
            fd.writelines(lines)

# Hmm

In [None]:
import sklearn.cluster

In [None]:
# Creates a clusterer with default settings; the object is only displayed, not kept or fitted.
sklearn.cluster.AgglomerativeClustering()

In [None]:
# FIXME(review): this line is a syntax error. The expression `df[df` is never finished,
# so the column and filter that were intended cannot be told from the notebook.
places = set(s.split(" ")[0] for s in df[df)

In [None]:
# Number of distinct first name tokens collected in `names`.
len(names)

In [None]:
# Display the collected set (this can be a very large output).
names