In [1]:
import numpy as np

In [2]:
# Known first names and surnames used throughout the notebook.
fnames = ("david", "anton", "fred", "jim", "barry")
snames = (
    "barber",
    "ilsung",
    "fox",
    "chain",
    "fitzwilliam",
    "quinceadams",
    "grafvonunterhosen",
)

# Symbols that can be emitted: the 26 lowercase letters, one string per letter.
alphabet = tuple("abcdefghijklmnopqrstuvwxyz")
# Number of distinct symbols.
N = len(alphabet)

## States
The states in this scenario are the states generating each character:
* $R_1$ is the state generating random characters before writing a first name
* $F_{1:X}$ are the states of the characters of first names being written, with $X$ being the length of the longest first name
* $R_2$ is the state generating random characters before writing a surname
* $S_{1:Y}$ are the states of the characters of surnames being written, with $Y$ being the length of the longest surname

In [9]:
def max_length(names):
    """Return the length of the longest name in ``names``.

    :param names: a non-empty iterable of strings
    :return: the largest ``len(name)`` found
    :raises ValueError: if ``names`` is empty
    """
    return max(len(name) for name in names)


# State order: "r1", then one state per first-name character position
# ("f1".."fX"), then "r2", then one state per surname character position
# ("s1".."sY"). Both name-state ranges are 1-based; starting the surname
# range at 0 would add an extra "s0" state that no other state leads to.
states = (
    ["r1"]
    + ["f{}".format(i) for i in range(1, max_length(fnames) + 1)]
    + ["r2"]
    + ["s{}".format(i) for i in range(1, max_length(snames) + 1)]
)
# Total number of hidden states.
K = len(states)
# for state in states:
#     print("\"{}\",".format(state), end=" ")

## Transition Matrix
The transition matrix holds the probabilities of moving from one state to another. In particular, entry $(i, j)$ is the probability of moving from the $i$-th state to the $j$-th state. In this context:
* $R_1 : R_1 = 0.8$ 
* $R_1 : F_1 = 0.2$
* $F_i : F_{i+1} = f(i)$
* $F_i : R_2 = 1 - f(i)$
* $R_2 : R_2 = 0.8$ 
* $R_2 : S_1 = 0.2$
* $S_i : S_{i+1} = s(i)$
* $S_i : R_1 = 1 - s(i)$

where $f(i)$ (and, analogously, $s(i)$ for surnames) is the probability that a first name which is at least $i$ characters long has at least one more character.

In [4]:
def p(name_type, i):
    """ Probability that a name with at least i characters has more than i.

    The names considered are ``fnames`` when ``name_type`` is "f" and
    ``snames`` for any other value.

    :param name_type: "f" for first names and "s" for surnames
    :param i: the minimum number of characters the name must have
    :return: (number of names with at least i + 1 characters) divided by
        (number of names with at least i characters)
    :raises ZeroDivisionError: if no name has at least i characters """
    pool = fnames if name_type == "f" else snames
    lengths = [len(candidate) for candidate in pool]
    # Names long enough to reach position i at all.
    eligible = sum(1 for length in lengths if length >= i)
    # Of those, the ones that continue past position i.
    continuing = sum(1 for length in lengths if length >= i + 1)
    return continuing / eligible

# transition[a][b] is the probability of moving from state a to state b.
# Every pair of states gets an entry; pairs not mentioned below stay at 0.0.
transition = {}
for state1 in states:
    row = {state2: 0.0 for state2 in states}
    if state1 in ("r1", "r2"):
        # Random-character states: stay with 0.8, or start the matching name
        # ("r1" -> "f1", "r2" -> "s1") with 0.2.
        name_start = "f1" if state1 == "r1" else "s1"
        row[state1] = 0.8
        if name_start in row:
            row[name_start] = 0.2
    elif state1[0] in ("f", "s"):
        name_type = state1[0]
        # Parse the whole numeric suffix: names such as "s10" have a
        # multi-digit index, so reading only state1[1] would yield 1.
        i = int(state1[1:])
        keep_going = p(name_type, i)
        next_char_state = "{}{}".format(name_type, i + 1)
        # A first name is left for "r2", a surname for "r1".
        exit_state = "r2" if name_type == "f" else "r1"
        if next_char_state in row:
            row[next_char_state] = keep_going
        row[exit_state] = 1 - keep_going
    transition[state1] = row

# Sanity check: each row must be a probability distribution.
for state1, row in transition.items():
    assert abs(sum(row.values()) - 1.0) < 1e-9, "row {} does not sum to 1".format(state1)

## Emission Matrix
The emission matrix represents the probability of a given state to generate a certain value.

In [34]:
def q(name_type, i, ch):
    """ Returns the fraction of names of the given type whose character at
    index i is ch, among the names longer than i characters.

    The names considered are ``fnames`` when ``name_type`` is "f" and
    ``snames`` for any other value.

    :param name_type: "f" for first names and "s" for surnames
    :param i: the index into the name (as used by ``name[i]``)
    :param ch: the character to look for
    :return: the fraction described above
    :raises ZeroDivisionError: if no name is longer than i characters """
    pool = fnames if name_type == "f" else snames
    # Only names that actually have a character at index i take part.
    long_enough = [candidate for candidate in pool if len(candidate) > i]
    matches = sum(1 for candidate in long_enough if candidate[i] == ch)
    return matches / len(long_enough)
            

# emission[state][letter] is the probability that `state` emits `letter`.
emission = {}
for state in states:
    if state.startswith("r"):
        # Random-character states emit every letter with equal probability.
        emission[state] = {letter: 1 / len(alphabet) for letter in alphabet}
    elif state[0] in ("f", "s"):
        name_type = state[0]
        # Parse the whole numeric suffix: names such as "s10" have a
        # multi-digit index, so reading only state[1] would yield 1.
        position = int(state[1:])
        # State indices are 1-based while q() takes a 0-based string index.
        emission[state] = {
            letter: q(name_type, position - 1, letter) for letter in alphabet
        }
    else:
        emission[state] = {}

# Alias for the same dictionary object (not a copy).
emission1 = emission

In [35]:
# emission2[a][b] is 0.3 when the two letters are equal; otherwise it is 0.7
# split evenly over 25 (the number of other letters in a 26-letter alphabet).
emission2 = {
    a: {b: (0.3 if a == b else 0.7 / 25) for b in alphabet}
    for a in alphabet
}

{'a': {'a': 0.3,
  'b': 0.027999999999999997,
  'c': 0.027999999999999997,
  'd': 0.027999999999999997,
  'e': 0.027999999999999997,
  'f': 0.027999999999999997,
  'g': 0.027999999999999997,
  'h': 0.027999999999999997,
  'i': 0.027999999999999997,
  'j': 0.027999999999999997,
  'k': 0.027999999999999997,
  'l': 0.027999999999999997,
  'm': 0.027999999999999997,
  'n': 0.027999999999999997,
  'o': 0.027999999999999997,
  'p': 0.027999999999999997,
  'q': 0.027999999999999997,
  'r': 0.027999999999999997,
  's': 0.027999999999999997,
  't': 0.027999999999999997,
  'u': 0.027999999999999997,
  'v': 0.027999999999999997,
  'w': 0.027999999999999997,
  'x': 0.027999999999999997,
  'y': 0.027999999999999997,
  'z': 0.027999999999999997},
 'b': {'a': 0.027999999999999997,
  'b': 0.3,
  'c': 0.027999999999999997,
  'd': 0.027999999999999997,
  'e': 0.027999999999999997,
  'f': 0.027999999999999997,
  'g': 0.027999999999999997,
  'h': 0.027999999999999997,
  'i': 0.027999999999999997,
  'j': 0