In [1]:
import numpy as np

Some first names and surnames are given, based on the English (lowercase) alphabet.

In [2]:
# Vocabulary of known names; every entry uses only lowercase English letters.
fnames = np.array([
    "david",
    "anton",
    "fred",
    "jim",
    "barry",
])
snames = np.array([
    "barber",
    "ilsung",
    "fox",
    "chain",
    "fitzwilliam",
    "quinceadams",
    "grafvonunterhosen",
])

In [102]:
# Load the first 100 characters of the observed string, one array element per
# character. The context manager closes the file handle as soon as the read
# completes (the bare `open(...).read()` left it open).
with open("sequence.txt", "r") as sequence_file:
    sequence = np.array(list(sequence_file.read(100)))
T = np.size(sequence)  # number of observed characters

In [103]:
# Observation alphabet: the lowercase English letters "a" through "z", in order.
alphabet = np.array([chr(code) for code in range(ord("a"), ord("z") + 1)])
N = np.size(alphabet)  # number of distinct observable symbols

From a high-level perspective, the machine generating the string has four possible states: one that generates random characters before a first name, one that generates a first name, one that generates random characters before a surname, and one that generates a surname. However, modelling it this way is not practical, since each of these states can generate a string of arbitrary length. Instead, we consider the atomic states that make up the four states above: each of the two random character generators has 26 states (one per character of the alphabet), and each name has one state per character.

In [104]:
# Each hidden state is a (label, symbol) pair. Labels are "<kind>-<a>-<b>":
#   r-0-<i> : random-character generator before a first name, letter i
#   f-<n>-<p>: character p of first name n
#   r-1-<i> : random-character generator before a surname, letter i
#   s-<n>-<p>: character p of surname n
# The symbol is the letter that the state is associated with.
random_before_fname = [
    ("r-0-{}".format(idx), letter) for idx, letter in enumerate(alphabet)
]
fname_chars = [
    ("f-{}-{}".format(name_idx, pos), letter)
    for name_idx, word in enumerate(fnames)
    for pos, letter in enumerate(word)
]
random_before_sname = [
    ("r-1-{}".format(idx), letter) for idx, letter in enumerate(alphabet)
]
sname_chars = [
    ("s-{}-{}".format(name_idx, pos), letter)
    for name_idx, word in enumerate(snames)
    for pos, letter in enumerate(word)
]

# The concatenation order fixes the row/column order of every model matrix.
labelled_states = random_before_fname + fname_chars + random_before_sname + sname_chars
states = np.array([label for label, symbol in labelled_states])
state_symbols = np.array([symbol for label, symbol in labelled_states])

K = np.size(states)

The transitions operate as follows:
* A state of either random character generator switches to each state of the same random character generator (including itself) with probability $0.8 \times \frac{1}{26}$.
* A state of either random character generator switches to the state of the first character of each name with probability $0.2 \times \frac{1}{|\mathcal{N}|}$, where $\mathcal{N}$ is the set of names (first names for the first random generator and surnames for the second generator).
* A state of a character of a name switches to the state of the next character of that name with probability $1.0$, unless it is the last character, in which case it switches to each state of the following random generator with probability $\frac{1}{26}$ (after a first name this is the second generator; after a surname it is the first generator again).

In [111]:
# Transition matrix: A[i][j] is the probability of moving from state i to state j.
A = np.zeros((K, K))

# Column masks and the label -> index lookup are built once, instead of
# re-parsing labels and rescanning `states` for every (i, j) pair.
in_r0 = np.array([s.startswith("r-0") for s in states], dtype=bool)
in_r1 = np.array([s.startswith("r-1") for s in states], dtype=bool)
starts_fname = np.array([s.startswith("f") and s.endswith("-0") for s in states], dtype=bool)
starts_sname = np.array([s.startswith("s") and s.endswith("-0") for s in states], dtype=bool)
state_index = {s: idx for idx, s in enumerate(states)}

for i, a in enumerate(states):
    if a.startswith("r-0"):
        # Stay inside the first random generator, or begin one of the first names.
        A[i, in_r0] = 0.8 / 26
        if starts_fname.any():
            A[i, starts_fname] = 0.2 / len(fnames)
    elif a.startswith("r-1"):
        # Stay inside the second random generator, or begin one of the surnames.
        A[i, in_r1] = 0.8 / 26
        if starts_sname.any():
            A[i, starts_sname] = 0.2 / len(snames)
    elif a.startswith("f") or a.startswith("s"):
        kind, name_i, char_i = a.split("-")
        next_name_state = "{}-{}-{}".format(kind, int(name_i), int(char_i) + 1)
        if next_name_state in state_index:
            # Not the last character: the next character of the name follows.
            A[i, state_index[next_name_state]] = 1.0
        else:
            # Last character: a first name hands over to the second random
            # generator, a surname hands back to the first one.
            A[i, in_r1 if kind == "f" else in_r0] = 1 / 26

# Every row must be a probability distribution over the next state.
assert np.allclose(A.sum(axis=1), 1.0), "rows of A must sum to 1"

In [112]:
# Emission matrix: B[i][j] is the probability that state i emits alphabet[j].
# A state emits its own symbol with probability 0.3; the remaining 0.7 is
# shared equally by the other 25 letters.
emits_own_symbol = state_symbols[:, None] == alphabet[None, :]
B = np.where(emits_own_symbol, 0.3, 0.7 / 25)

B

array([[0.3  , 0.028, 0.028, ..., 0.028, 0.028, 0.028],
       [0.028, 0.3  , 0.028, ..., 0.028, 0.028, 0.028],
       [0.028, 0.028, 0.3  , ..., 0.028, 0.028, 0.028],
       ...,
       [0.028, 0.028, 0.028, ..., 0.028, 0.028, 0.028],
       [0.028, 0.028, 0.028, ..., 0.028, 0.028, 0.028],
       [0.028, 0.028, 0.028, ..., 0.028, 0.028, 0.028]])

In [99]:
# Initial state distribution: probability 1/26 on every state of the first
# random-character generator ("r-0-*") and zero on every other state.
initial = np.array(
    [1 / 26 if label.startswith("r-0") else 0 for label in states],
    dtype=float,
)

In [121]:
# Encode each observed character as its position in `alphabet`.
alphabet_letters = list(alphabet)
y = np.array([alphabet_letters.index(observed) for observed in sequence])

def viterbi(obs=None, trans=None, emit=None, prior=None, symbols=None):
    """Return the symbols along the most likely hidden-state path (Viterbi).

    Path scores are accumulated as log-probabilities, so long observation
    sequences do not underflow to zero the way a running product of
    probabilities does.

    Every argument is optional: calling ``viterbi()`` with no arguments uses
    the notebook-level ``y``, ``A``, ``B``, ``initial`` and ``state_symbols``.

    Parameters
    ----------
    obs : sequence of int, optional
        Observed symbol indices (column indices into ``emit``), one per step.
    trans : array-like of shape (K, K), optional
        ``trans[k][j]`` is the probability of moving from state k to state j.
    emit : array-like of shape (K, N), optional
        ``emit[j][o]`` is the probability that state j emits symbol index o.
    prior : array-like of shape (K,), optional
        Probability of each state at the first step.
    symbols : array-like of shape (K,), optional
        Symbol reported for each state in the returned path.

    Returns
    -------
    numpy.ndarray of dtype ``<U6``
        One entry per observation: the symbol of the decoded state at that
        step. Empty when there are no observations.
    """
    obs = np.asarray(y if obs is None else obs, dtype=int)
    trans = np.asarray(A if trans is None else trans, dtype=float)
    emit = np.asarray(B if emit is None else emit, dtype=float)
    prior = np.asarray(initial if prior is None else prior, dtype=float)
    symbols = np.asarray(state_symbols if symbols is None else symbols)

    n_steps = obs.size
    if n_steps == 0:
        return np.zeros(0, dtype="<U6")
    n_states = trans.shape[0]

    # log(0) = -inf is the intended score of an impossible move, so the
    # divide-by-zero warning is silenced rather than treated as an error.
    with np.errstate(divide="ignore"):
        log_trans = np.log(trans)
        log_emit = np.log(emit)
        log_prior = np.log(prior)

    # score[j, t]: best log-probability of any path ending in state j at step t.
    # back[j, t]: predecessor state on that best path (integer, used as index).
    score = np.empty((n_states, n_steps))
    back = np.zeros((n_states, n_steps), dtype=int)
    score[:, 0] = log_prior + log_emit[:, obs[0]]

    for t in range(1, n_steps):
        previous = score[:, t - 1][:, None]        # shape (K, 1): source state k
        emission = log_emit[:, obs[t]][None, :]    # shape (1, K): target state j
        candidates = previous + log_trans + emission
        # argmax returns the first maximiser, i.e. the lowest-index predecessor
        # on ties (and index 0 when every candidate is impossible).
        back[:, t] = np.argmax(candidates, axis=0)
        score[:, t] = np.max(candidates, axis=0)

    # Trace the best final state back to the first step.
    path = np.zeros(n_steps, dtype=int)
    path[-1] = np.argmax(score[:, -1])
    for t in range(n_steps - 1, 0, -1):
        path[t - 1] = back[path[t], t]
    return symbols[path].astype("<U6")

# Decode the observed sequence: one state symbol per observed character.
x = viterbi()

In [122]:
# Concatenate the decoded per-step symbols into a single string for display.
x = "".join(x)
x

'kjimegrafvonunterhosenjjimgfitzwilliamidavidgquinceadamsxantondquinceadamshdavidzwgrafvonunterhosenw'