In [13]:
import numpy as np

Some first names and surnames are given, based on the English (lowercase) alphabet.

In [14]:
# Candidate first names and surnames (lowercase English letters only).
fnames = ("david", "anton", "fred", "jim", "barry")
snames = (
    "barber",
    "ilsung",
    "fox",
    "chain",
    "fitzwilliam",
    "quinceadams",
    "grafvonunterhosen",
)

In [15]:
# The 26 lowercase letters, one single-character string per tuple entry.
alphabet = tuple("abcdefghijklmnopqrstuvwxyz")

In [16]:
# Read the observed character sequence as a tuple of single characters.
# The context manager closes the file handle as soon as the contents have
# been read, instead of leaving it open for the lifetime of the kernel.
with open("sequence.txt", "r") as sequence_file:
    sequence = tuple(sequence_file.read())
T = len(sequence)  # number of observations

In [17]:
# Encode every observed character as its 1-based position in `alphabet`,
# each number followed by a comma (so the string ends with a trailing comma).
tp = "".join("{},".format(alphabet.index(symbol) + 1) for symbol in sequence)

In [18]:
# Hidden states and the character each one is rendered as when decoding.
#   "r-0" / "r-1" / "r-2": random-character states, rendered as a space.
#   "f-<i>-<j>": j-th character of the i-th first name.
#   "s-<i>-<j>": j-th character of the i-th surname.
# `states` is ordered: r-0, all first-name states, r-1, all surname states, r-2.
states_to_char = {"r-0": " ", "r-1": " ", "r-2": " "}
states = ["r-0"]
for prefix, name_list, closing_state in (("f", fnames, "r-1"), ("s", snames, "r-2")):
    for name_idx, name in enumerate(name_list):
        for char_idx, letter in enumerate(name):
            key = "{}-{}-{}".format(prefix, name_idx, char_idx)
            states_to_char[key] = letter
            states.append(key)
    states.append(closing_state)

The transitions operate as follows:
* A state of either random character generator switches to another state of the same random character generator with probability $0.8 \times \frac{1}{26}$.
* A state of either random character generator switches to the state of the first character of a name with probability $0.2 \times \frac{1}{|\mathcal{N}|}$, where $\mathcal{N}$ is the set of names available to that generator (first names for the first random generator and surnames for the second).
* A state of a character of a name switches to the state of the next character of that name with probability $1.0$, unless it is the last character, in which case it switches to each state of the following random generator with probability $\frac{1}{26}$.

In [19]:
# Transition probabilities: A[a][b] is the probability of moving from state a
# to state b.
#
# Each row is first described sparsely (only its non-zero targets) and then
# expanded into a dense row over all states.  This way the parsing of a name
# state's label and the lookup of its successor happen once per source state
# instead of once per (source, target) pair.
known_states = set(states)  # O(1) membership tests instead of scanning the list
A = {}
for a in states:
    if a == "r-0":
        # Random generator before a first name: stay, or start any first name.
        nonzero = {
            b: 0.2 / len(fnames)
            for b in states
            if b.startswith("f-") and b.endswith("-0")
        }
        nonzero["r-0"] = 0.8
    elif a == "r-1":
        # Random generator between first name and surname: stay, or start any surname.
        nonzero = {
            b: 0.2 / len(snames)
            for b in states
            if b.startswith("s-") and b.endswith("-0")
        }
        nonzero["r-1"] = 0.8
    elif a == "r-2":
        # State after a surname: always hands over to r-0.
        nonzero = {"r-0": 1.0}
    elif a.startswith("f-") or a.startswith("s-"):
        parts = a.split("-")
        kind, name_i, char_i = parts[0], int(parts[1]), int(parts[2])
        successor = "{}-{}-{}".format(kind, name_i, char_i + 1)
        if successor not in known_states:
            # Last character of the name: first names lead to r-1, surnames to r-2.
            successor = "r-1" if kind == "f" else "r-2"
        nonzero = {successor: 1.0}
    else:
        # Unrecognised state label: leave its row empty.
        A[a] = {}
        continue
    A[a] = {b: nonzero.get(b, 0.0) for b in states}

In [20]:
# Emission probabilities: B[state][char] is the probability that `state`
# emits `char`.
B = {}
for hidden in states:
    if hidden.startswith("r-"):
        # Random-character states emit every letter with equal probability.
        row = {symbol: 1 / 26 for symbol in alphabet}
    else:
        # Name states emit their own letter with probability 0.3 and share
        # the remaining 0.7 equally among the other 25 letters.
        intended = states_to_char[hidden]
        row = {symbol: 0.3 if symbol == intended else 0.7 / 25 for symbol in alphabet}
    B[hidden] = row

In [21]:
# Initial state distribution: all probability mass starts on "r-0".
I = {label: (1 if label == "r-0" else 0) for label in states}

In [22]:
from decimal import Decimal

def viterbi():
    """Decode the most likely hidden-state path for ``sequence``.

    Runs the Viterbi algorithm over the module-level model: ``states``
    (hidden state labels), ``I`` (initial probabilities), ``A`` (transition
    probabilities, indexed ``A[src][dst]``), ``B`` (emission probabilities,
    indexed ``B[state][symbol]``) and the observed ``sequence``.  Path
    probabilities are multiplied as ``Decimal`` values.

    Ties are resolved in favour of the state that comes first in ``states``.

    Returns:
        list: one entry per observation, ``states_to_char[state]`` for each
        state on the best path.  An empty ``sequence`` yields ``[]``.

    Raises:
        ValueError: if no state path has a non-zero probability.
    """
    n_obs = len(sequence)
    if n_obs == 0:
        return []

    # For every destination state, the predecessors with a non-zero
    # transition probability (kept in `states` order so tie-breaking matches
    # a scan over all states).  Zero-probability transitions can never win
    # the strict `>` comparison below, so they are skipped up front, and each
    # probability is converted to Decimal only once.
    incoming = {
        dst: [(src, Decimal(A[src][dst])) for src in states if A[src][dst] != 0]
        for dst in states
    }

    # score[state][t]: probability of the best path that ends in `state` at time t.
    # back[state][t]: predecessor of `state` on that path (None if there is none).
    score = {s: [Decimal(I[s] * B[s][sequence[0]])] for s in states}
    back = {s: [None] for s in states}

    for t in range(1, n_obs):
        symbol = sequence[t]
        for dst in states:
            emit = Decimal(B[dst][symbol])
            best, best_src = Decimal(0), None
            for src, trans in incoming[dst]:
                candidate = score[src][t - 1] * trans * emit
                if candidate > best:
                    best, best_src = candidate, src
            score[dst].append(best)
            back[dst].append(best_src)

    # Pick the most probable final state.
    final_state, final_score = None, Decimal(0)
    for s in states:
        if score[s][n_obs - 1] > final_score:
            final_state, final_score = s, score[s][n_obs - 1]
    if final_state is None:
        raise ValueError("no state path with non-zero probability for the observed sequence")

    # Follow the back-pointers from the end, collecting the path in reverse,
    # then flip it once (avoids repeated insertion at the front of a list).
    path = [final_state]
    for t in range(n_obs - 1, 0, -1):
        path.append(back[path[-1]][t])
    path.reverse()
    return [states_to_char[s] for s in path]

# Decode the observed sequence; v is the list of per-position characters
# returned by viterbi().
v = viterbi()

In [23]:
# Display the decoded characters as a single string.
''.join(v)

'       david        quinceadams    david     quinceadams      david          fitzwilliam            '

In [35]:
# Decoded characters joined into one string, used for the name extraction
# and counting that follow.
clean_sequence = ''.join(v)

In [36]:
# Split the decoded string on single spaces; runs of spaces yield empty
# strings, which are dropped so that only the decoded words remain.
elements = clean_sequence.split(" ")
names = [token for token in elements if token != ""]

In [37]:
# Report how many times each known name occurs as a substring of the
# decoded string.
for person in fnames + snames:
    print("{}: {}".format(person, clean_sequence.count(person)))

david: 66
anton: 69
fred: 67
jim: 92
barry: 70
barber: 39
ilsung: 28
fox: 39
chain: 57
fitzwilliam: 73
quinceadams: 49
grafvonunterhosen: 78


In [38]:
# Co-occurrence counts: occ[first_name][surname] is how often the pair was
# decoded together.  Entries of `names` are paired by position (0 with 1,
# 2 with 3, ...); a trailing unpaired entry is ignored.
# NOTE(review): this assumes `names` strictly alternates first name, surname.
occ = {}
for first_name, surname in zip(names[0::2], names[1::2]):
    per_first = occ.setdefault(first_name, {})
    per_first[surname] = per_first.get(surname, 0) + 1
occ
    

{'david': {'quinceadams': 9,
  'fitzwilliam': 16,
  'chain': 11,
  'ilsung': 4,
  'grafvonunterhosen': 10,
  'fox': 8,
  'barber': 7},
 'jim': {'fitzwilliam': 20,
  'ilsung': 9,
  'grafvonunterhosen': 19,
  'fox': 8,
  'barber': 8,
  'chain': 16,
  'quinceadams': 12},
 'barry': {'quinceadams': 9,
  'fitzwilliam': 8,
  'grafvonunterhosen': 19,
  'chain': 10,
  'ilsung': 5,
  'fox': 10,
  'barber': 9},
 'anton': {'barber': 9,
  'ilsung': 5,
  'grafvonunterhosen': 11,
  'quinceadams': 11,
  'fox': 4,
  'fitzwilliam': 19,
  'chain': 10},
 'fred': {'grafvonunterhosen': 19,
  'chain': 10,
  'barber': 6,
  'quinceadams': 8,
  'fitzwilliam': 10,
  'fox': 9,
  'ilsung': 5}}