In [1]:
import numpy as np

Some first names and surnames are given, based on the English (lowercase) alphabet.

In [2]:
fnames = "david", "anton", "fred", "jim", "barry"
snames = "barber", "ilsung", "fox", "chain", "fitzwilliam", "quinceadams", "grafvonunterhosen"

In [3]:
alphabet = "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"

From a high-level perspective, there exist four possible states for the machine generating the string: one which generates random characters before a first name, one that generates a first name, one which generates random characters before a surname and one that generates a last name. However, modelling this is not practical since each state can generate a string of arbitrary length. Another approach is to consider the atomic states that the four above mentioned states have. In other words, the two random character generators have 26 states each (one for each character) and the names have a state for each character in each name.

In [4]:
states = {}
for i, letter in enumerate(alphabet):
    states["r0-{}".format(i)] = letter
for i, name in enumerate(fnames):
    for j, letter in enumerate(name):
        states["f-{}-{}".format(i, j)] = letter
for i, letter in enumerate(alphabet):
    states["r1-{}".format(i)] = letter
for i, name in enumerate(snames):
    for j, letter in enumerate(name):
        states["s-{}-{}".format(i, j)] = letter

The transitions operate as follows:
* A state of either random character generator switches to another state of the same random character generator with probability $0.8 \times \frac{1}{26}$.
* A state of either random character generator switches to the state of the first character of a name with probability $0.2 \times \frac{1}{|\mathcal{N}|}$, where $\mathcal{N}$ is the set of names (first names for the first random generator and surname for the second generator)
* A state of a character of a name switches to the state of the next character for that name with probability $1.0$, unless it's the last character in which case it would switch to the state of a character of the following random generator with probability $\frac{1}{26}$

In [8]:
transition = {}
for a in states:
    transition[a] = {}
    for b in states:
        if a.startswith("r0"):
            if b.startswith("r0"):
                transition[a][b] = 0.8 * 1 / 26
            elif b.startswith("f") and b.endswith("-0"):
                transition[a][b] = 0.2 * 1 / len(fnames)
            else:
                transition[a][b] = 0
        elif a.startswith("r1"):
            if b.startswith("r1"):
                transition[a][b] = 0.8 * 1 / 26
            elif b.startswith("s") and b.endswith("-0"):
                transition[a][b] = 0.2 * 1 / len(snames)
            else:
                transition[a][b] = 0
        elif a.startswith("f"):
            name_i, char_i = int((a.split("-"))[1]), int((a.split("-"))[2])
            next_name_state = "f-{}-{}".format(str(name_i), str(char_i + 1))
            if next_name_state in states:
                transition[a][b] = 1 if b == next_name_state else 0
            else:
                transition[a][b] = 1 / 26 if b.startswith("r1") else 0
        elif a.startswith("s"):
            name_i, char_i = int((a.split("-"))[1]), int((a.split("-"))[2])
            next_name_state = "s-{}-{}".format(str(name_i), str(char_i + 1))
            if next_name_state in states:
                transition[a][b] = 1 if b == next_name_state else 0
            else:
                transition[a][b] = 1 / 26 if b.startswith("r0") else 0

In [30]:
emission = {}
for state, state_char in states.items():
    emission[state] = {}
    for char in alphabet:
        emission[state][char] = (char == state_char) * 0.3 + (char != state_char) * 0.7 / 25

In [122]:
# from decimal import Decimal

# y = tuple((open("sequence.txt", "r")).read())
# T = len(y)
# s = ["r0-0"] * T

# x = {}
# for state in states:
#     x[state] = [Decimal(1 / 26)] if state.startswith("r0") else [Decimal(0)]
    
# for t in range(1, T):
#     if t % 100 == 0:
#         print(t)
#     for b in states:
#         v = 0
#         for a in states:
#             v += x[a][t - 1] * Decimal(transition[a][b]) * Decimal(emission[b][y[t]])
#         x[b] += [v]

In [123]:
from decimal import Decimal

y = tuple((open("sequence.txt", "r")).read())
T = len(y)

x = {}
for state in states:
    x[state] = [Decimal(1 / 26)] if state.startswith("r0") else [Decimal(0)]
    
for t in range(1, T):
    for k in states:
        

KeyboardInterrupt: 

In [110]:
def argmax(x, t, s):
    r = []
    for state, values in x.items():
        r.append((values[t], state))
    r.sort(key = lambda t: t[0], reverse=True)
    return r[s]
    

a, b = argmax(x, T - 1, 0)
z = [b]

for t in range(T - 1, 0, -1):
    c = 0
    a, b = argmax(x, t, c)
    while transition[b][z[0]] == 0:
        c += 1
        a, b = argmax(x, t, c)
    z.insert(0, b)
    
clean_sequence = []
    
for s in z:
    a, b = 0, 0
    for char, prob in (emission[s]).items():
        if prob > a:
            a = prob
            b = char
    #print(b, end="")
    clean_sequence += [b]

clean_sequence = ''.join(clean_sequence)

In [121]:
import pickle
pickle.dump(clean_sequence, open("clean_sequence.cp", "wb"))

In [115]:
i = 0
f = True
while i < T:
    names = fnames if f else snames
    for name in names:
        if (clean_sequence[i:]).startswith(name):
            print(name)
            i = i + len(name)
            f = not f
            break
    else:
        i += 1

david
quinceadams
david
quinceadams
david
fitzwilliam
fred
fitzwilliam
barry
quinceadams
jim
fitzwilliam
barry
fitzwilliam
jim
fox
anton
barber
david
chain
david
fitzwilliam
david
quinceadams
fred
ilsung
anton
grafvonunterhosen
david
ilsung
barry
grafvonunterhosen
barry
fitzwilliam
david
chain
anton
quinceadams
barry
fitzwilliam
fred
grafvonunterhosen
fred
chain
fred
grafvonunterhosen
fred
grafvonunterhosen
anton
quinceadams
david
barber
anton
chain
barry
grafvonunterhosen
jim
fitzwilliam
anton
fitzwilliam
anton
ilsung
barry
grafvonunterhosen
david
grafvonunterhosen
fred
grafvonunterhosen
david
barber
anton
barber
jim
fox
jim
ilsung
jim
barber
david
grafvonunterhosen
fred
quinceadams
fred
quinceadams
jim
grafvonunterhosen
jim
fitzwilliam
fred
barber
david
chain
jim
grafvonunterhosen
david
barber
fred
grafvonunterhosen
fred
ilsung
anton
fitzwilliam
david
quinceadams
david
fitzwilliam
fred
chain
jim
fox
anton
fitzwilliam
anton
fitzwilliam
fred
barber
barry
fox
barry
grafvonunterhosen
fre

In [20]:
# sequence = tuple((open("sequence.txt", "r")).read())[0:100]
# T = len(sequence)

# def mlog(n):
#     return np.log(0.0000001 + n)

# def viterbi():
#     t1, t2 = {}, {}
#     for i, state in enumerate(states):
#         t1[state], t2[state] = [], []
#         t1[state] += [initial[state] * emission[state][sequence[0]]]
#         t2[state] += [0]
#     for i_, o in enumerate(sequence[1:]):
#         i = i_ + 1
#         for j, state in enumerate(states):
#             maxk, argmaxk = 0, 0
#             for k, s in enumerate(states):
#                 x = t1[s][i - 1] * transition[s][state] * emission[state][sequence[i]]
#                 if x > maxk:
#                     maxk = x
#                     argmaxk = s
#             t1[state] += [maxk]
#             t2[state] += [argmaxk]
#     z_max, z_argmax = 0, 0
#     for i, state in enumerate(states):
#         if t1[state][T - 1] > z_max:
#             z_max = t1[state][T - 1]
#             z_argmax = state
#     z = [z_argmax]
#     x = [states[z_argmax]]
#     for t in range(T - 1, 0, -1):
#         v = t2[z[0]][t]
#         z.insert(0, v)
#         x.insert(0, states[v])
#     return x

# v = viterbi()