In [2]:
import numpy as np
import pandas as pd

#### Read in files

In [11]:
lines = []
with open('lm.txt') as f:
    lines = f.readlines()

count = 0
for line in lines:
    count += 1
#     print(f'line {count}: {line}')  

In [16]:
# lines

In [17]:
lines_tidy = [x.replace("\t", " ") for x in lines]
lines_final = [x.replace("\n", " ") for x in lines_tidy]

In [21]:
lines_final[:2]

['<start> <start> 2.1782695826435478e-05 ', '<start> T 0.20767622200923586 ']

In [31]:
new_list = []
for each in lines_final:
    new_list.append(each.split(" "))

In [33]:
new_list[:2]

[['<start>', '<start>', '2.1782695826435478e-05', ''],
 ['<start>', 'T', '0.20767622200923586', '']]

In [42]:
df_tran_pred = pd.DataFrame(new_list, columns = ['char_1', 'char_2', 'prob', 'n'])  

In [43]:
df_tran = df_tran_pred.iloc[:, :-1]

In [44]:
df_tran.head()

Unnamed: 0,char_1,char_2,prob
0,<start>,<start>,2.178269582643548e-05
1,<start>,T,0.2076762220092358
2,<start>,h,0.0030277947198745
3,<start>,e,0.0008277424414045
4,<start>,<s>,2.178269582643548e-05


#### Get transition matrix

In [51]:
all_chars = set(df_tran['char_2'].unique()).union(set(df_tran['char_1'].unique()))

In [52]:
all_chars

{'!',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<eos>',
 '<s>',
 '<start>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '}'}

In [56]:
char_trans_array = np.zeros((len(all_chars), len(all_chars)))
for i, row_char in enumerate(all_chars):
    for j, column_char in enumerate(all_chars):
        char_trans_array[i, j] = df_tran[
            (df_tran['char_1'] == row_char) &\
            (df_tran['char_2'] == column_char)
        ]['prob']  

In [61]:
df_char_trans = pd.DataFrame(
    char_trans_array,
    index=all_chars,
    columns=all_chars
)

In [57]:
char_trans_array.shape

(86, 86)

In [70]:
df_char_trans[df_char_trans.index=='t']['%'].values

array([2.95666417e-06])

In [71]:
df_char_trans[df_char_trans.index=='%']['t'].values

array([0.00485437])

In [68]:
df_tran[(df_tran['char_1'] == 't') & (df_tran['char_2'] == '%')]

Unnamed: 0,char_1,char_2,prob
766,t,%,2.956664173213214e-06


In [69]:
df_tran[(df_tran['char_1'] == '%') & (df_tran['char_2'] == 't')]

Unnamed: 0,char_1,char_2,prob
6716,%,t,0.0048543689320388


In [73]:
char_trans_array

array([[1.02040816e-02, 1.22448980e-01, 1.02040816e-02, ...,
        1.02040816e-02, 1.02040816e-02, 1.02040816e-02],
       [2.95666417e-06, 1.75478019e-02, 2.95666417e-06, ...,
        2.95666417e-06, 2.95666417e-06, 2.95666417e-06],
       [3.95100751e-04, 3.95100751e-04, 3.95100751e-04, ...,
        3.95100751e-04, 3.95100751e-04, 3.95100751e-04],
       ...,
       [4.84966052e-04, 9.69932105e-04, 4.84966052e-04, ...,
        4.84966052e-04, 4.84966052e-04, 4.84966052e-04],
       [1.04166667e-03, 2.18750000e-02, 1.04166667e-03, ...,
        1.04166667e-03, 2.81250000e-02, 2.70833333e-02],
       [5.56173526e-04, 1.16796440e-02, 5.56173526e-04, ...,
        5.56173526e-04, 1.11234705e-02, 1.11234705e-02]])

#### Get observation characters

In [253]:
sentences = []
with open('15pctmasked.txt') as f:
    sentences = f.readlines()

sentences = [s.replace('\n', '').split() for s in sentences]
    

In [255]:
unique_observation_words_list = list(set(df_char_trans.index.unique()))

#### Implement Viterbi

In [308]:
def Viterbi(
    characters, # sentence
    unique_observation_words_list,
    df_char_trans
):
    state = []
    output_characters = []
    T = unique_observation_words_list
    for key, char in enumerate(characters):
        #initialise list of probability column for a given observation
        p = [] 
        if char == '<mask>': # only compute when run into a mask token
            prev_char = characters[key-1]
            if prev_char == '<mask>':
                prev_char = state[-1]
            else:
                prev_char = characters[key-1]
            for character in T:
                transition_p = df_char_trans.loc[prev_char, character]
                state_probability = 1 * transition_p    
                p.append(state_probability)

            pmax = max(p)
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)] 
            state.append(state_max)
            output_characters.append(state_max)
        else:
            output_characters.append(char)
    return characters, output_characters

## Check against a known sentence prediction from discussion board

In [309]:
pred_first_sentence = Viterbi(
    sentences[0],
    unique_observation_words_list,
    df_char_trans
)

In [311]:
correct_first_sentence = "<start> I <s> p e <s> m a n t a t i o n <s> o f <s> G e o r g i a ' ' <s> a u r o m o b i l e <s> t i t l e <s> l a w <s> w a s <s> a l <s> , <s> h e c o m m e n d e d <s> b e <s> t h e <s> o u t g o i n g <s> j u r y <s> . <eos>".split(' ')
correct_first_sentence = ''.join(correct_first_sentence).replace('<s>', ' ')
print(f"Correct first sentence is:\n{correct_first_sentence}")
print(f"My prediction for the first sentence is:\n{''.join(pred_first_sentence[1]).replace('<s>', ' ')}")

Correct first sentence is:
<start>I pe mantation of Georgia'' auromobile title law was al , hecommended be the outgoing jury .<eos>
My prediction for the first sentence is:
<start>I pe mentation of Georgia'  auromobile title law was al t tecommended be the outgoing jury .<eos>


In [312]:
pred_sentence_list = []
for i in range(len(sentences)):
    pred_sentence_list.append(Viterbi(
    sentences[i],
    unique_observation_words_list,
    df_char_trans
))

In [317]:
final_pred_list = [' '.join(s[1]) for s in pred_sentence_list]

In [318]:
final_pred_list[:2]

["<start> I <s> p e <s> m e n t a t i o n <s> o f <s> G e o r g i a ' <s> <s> a u r o m o b i l e <s> t i t l e <s> l a w <s> w a s <s> a l <s> t <s> t e c o m m e n d e d <s> b e <s> t h e <s> o u t g o i n g <s> j u r y <s> . <eos>",
 '<start> T h e <s> g r e n <s> t j u r e <s> t h o n <s> a <s> s w i p e <s> a t <s> t h e <s> S t a t e <s> W e l f a r e <s> D e p a r t m e n t h s <s> h a n d <s> i n g <s> o f <s> f e <s> e r a l <s> f u r d s <s> g r a n t e d <s> f <s> r <s> c h i n d <s> w e l <s> a r e <s> t h e v i c e s <s> i n <s> t o n t h r <s> h o m e s <s> t <eos>']

In [322]:
with open('output.txt', 'wt') as f:
    for s in final_pred_list:
        f.write(s + '\n')