In [1]:
# read in data

readin_cn = []
readin_en = []
readin_align = []

# read in chinese sentences
with open('data/chinese', encoding='utf-8') as f:
    readin_cn=f.readlines()

# read in english sentences 
with open('data/english', encoding='utf-8') as f:
    readin_en=f.readlines()

# read in word-to-word alignment info for each sentences

with open('data/alignment', encoding='utf-8') as f:
    readin_align = f.readlines()

In [2]:
# remove '\n'
readin_cn_raw = [x.strip('\n') for x in readin_cn]
readin_en_raw = [x.strip('\n') for x in readin_en]
readin_align_raw = [x.strip('\n') for x in readin_align]
len(readin_en_raw),len(readin_cn_raw),len(readin_align_raw)

(40715, 40715, 40715)

In [3]:
# split out training and test set
# the readin_xx is for training and test_xx is for test

readin_cn = readin_cn_raw[:1000]
readin_en = readin_en_raw[:1000]
readin_align = readin_align_raw[:1000]

test_cn = readin_cn_raw[1000:1100]
test_en = readin_en_raw[1000:1100]
test_align = readin_align_raw[1000:1100]
len(test_en),len(test_cn),len(test_align)

(100, 100, 100)

In [4]:
len(readin_cn),len(readin_en),len(readin_align)

(1000, 1000, 1000)

Let's explore the data

In [5]:
readin_cn[-1]

'科学家 们 的 新 发现 --- 羊羊 们 可以 清理 爆炸物 的 残渣'

In [6]:
readin_en[-1]

'Sheep Help Scientists Clean Up Explosives Residue'

In [7]:
readin_align[-1]

'1:3/1 2:3/0 7:1/1 8:1/0 10:4/1 10:5/1 11:6/1 12:6/0 13:7/1'

In [8]:
# test the alignment info
t_align = readin_align[-1].split(' ')
t_cn = readin_cn[-1].split(' ')
t_en = readin_en[-1].split(' ')

# remove the '/0' or '/1'
t_align = [x[:-2] for x in t_align]

# match up according to alignment info(e.g. "1:2" means that the first Chinese word is aligned to the second English word )
for al in t_align:
    p = al.split(':')
    print(t_cn[int(p[0])-1]+"-->"+t_en[int(p[1])-1])

科学家-->Scientists
们-->Scientists
羊羊-->Sheep
们-->Sheep
清理-->Clean
清理-->Up
爆炸物-->Explosives
的-->Explosives
残渣-->Residue


In [9]:
# we notice that some chinese word can be translated to two or more english words
# also some chinese words do not have corresponding words

def matchWords(ch, en, align):
    """Given the chinese and english sentences, make a word-to-word alignment with provided alignment info
    :param ch: chinese sentence in a string(words splited by space)
    :param en: English sentence
    :param align: alignment info(e.g. '1:4/1 2:4/0 3:1/1 4:2/1 5:2/0 6:5/1')
    :return tuple(chinese words),tuple(english words(aligned to chinese words))
    """
    ch_list = ch.split(' ')
    en_list = en.split(' ')
    align_list = align.split(' ')
    
    # a list for aligned english words, should have same length as chinese word list
    en_align = ['None'] * len(ch_list)
    
    for pair in align_list:
        # remove '/', split by ":"
        p = pair[:-2].split(':')
        # check if one chinese word can be translated to two or more english words
        if 0<int(p[0])<=len(en_align) and 0<int(p[1])<=len(en_list):
            if en_align[int(p[0])-1] != 'None':
                en_align[int(p[0])-1] = en_align[int(p[0])-1]+' '+en_list[int(p[1])-1]
            else:
                en_align[int(p[0])-1] = en_list[int(p[1])-1]
    
    return tuple(ch_list), tuple(en_align)
    
# function that check the data   
def checker(ch, en, align):
    """check if the alignment data is OK(can not index out of range )
    :param ch: chinese sentence in a string(words splited by space)
    :param en: English sentence
    :param align: alignment info(e.g. '1:4/1 2:4/0 3:1/1 4:2/1 5:2/0 6:5/1')
    :return boolean, if this parallel text is ok
    """
    if align == '' or en == '' or ch == '':
        return False
    ch_list = ch.split(' ')
    en_list = en.split(' ')
    align_list = align.split(' ')
    
    ch_max = 0
    en_max = 0
    for pair in align_list:
        # remove '/', split by ":"
        p = pair[:-2].split(':')
        if int(p[0]) > ch_max:
            ch_max = int(p[0])
        if int(p[1]) > en_max:
            en_max = int(p[1])
    
    # print(str(ch_max)+' '+str(en_max))
    if ch_max <= len(ch_list) and en_max <= len(en_list):
        return True
    else:
        return False

In [10]:
# test alignment again
t_align = readin_align[-1]
t_cn = readin_cn[-1]
t_en = readin_en[-1]
c1, e1 = matchWords(t_cn, t_en, t_align)
for c, e in zip(c1,e1):
    print(c+'->'+e)
print()
print('original data')
print(t_cn,t_en)

科学家->Scientists
们->Scientists
的->None
新->None
发现->None
---->None
羊羊->Sheep
们->Sheep
可以->None
清理->Clean Up
爆炸物->Explosives
的->Explosives
残渣->Residue

original data
科学家 们 的 新 发现 --- 羊羊 们 可以 清理 爆炸物 的 残渣 Sheep Help Scientists Clean Up Explosives Residue


In [11]:
checker(t_cn, t_en, t_align)

True

In [12]:
# now we are good to go, turn the read in data into dataset(list) that contain words in tuples, 
# each tuple represent one sentence

# training set

DataSet_ch = []
DataSet_en = []

for ch, en, al in zip(readin_cn, readin_en, readin_align):
    if checker(ch, en, al):
        words_ch, aligned_en_words = matchWords(ch, en, al)
        DataSet_ch.append(words_ch)
        DataSet_en.append(aligned_en_words)

In [13]:
# test set
TestSet_ch = []
TestSet_en = []

for ch, en, al in zip(test_cn, test_en, test_align):
    if checker(ch, en, al):
        words_ch, aligned_en_words = matchWords(ch, en, al)
        TestSet_ch.append(words_ch)
        TestSet_en.append(aligned_en_words)

In [14]:
# how many parallel sentences after clean up the data
len(DataSet_ch),len(DataSet_en)

(999, 999)

In [15]:
len(TestSet_ch),len(TestSet_en)

(99, 99)

In [16]:
# validate that in every sentence, each chinese word mapping to one english word 
# (use None for some chinese words do not have corresponding words)
import random
rand_p = random.randint(0,len(DataSet_ch)-1)
print('random sentence index='+str(rand_p))
len(DataSet_ch[rand_p]) == len(DataSet_en[rand_p])

random sentence index=404


True

In [17]:
# sample the dataset and check
for i in range(4):
    print(DataSet_ch[i])
    print(DataSet_en[i])

('教区', '财务', '委员会', '认为', ',', '这项', '协议', '可能', '使得', '波士顿', '天主教会', '破产', ',', '因为', '声称', '受到', '性', '侵害', '的', '案子', '越来越', '多', ',', '而且', '「', '在', '促成', '协议', '的', '程序', '开始', '后', '情况', '已经', '有', '很', '大', '变化', '」', '。')
('archdiocese', 'finance', 'council', 'thinks', 'that', 'this', 'agreement', 'may', 'force', 'boston', 'catholic archdiocese', 'bankruptcy', 'None', 'as', 'alleged', 'None', 'sexual', 'abuse', 'of', 'cases', 'keep surfacing', 'keep surfacing', ',', 'and', '"', 'since', 'leading', 'agreement', 'None', 'process', 'started', 'since', 'situation', 'has', 'None', 'dramatically', 'dramatically', 'changed', '"', '.')
('六十六', '岁', '的', '吉欧根', '因为', '在', '一九九一年', '性', '侵害', '一', '名', '十', '岁', '男童', ',', '最近', '被', '判处', '十', '年', '徒刑', '。')
('66-year-old', '66-year-old', 'None', 'geoghan', 'for', 'in', '1991', 'sexual', 'molestation', 'a', 'None', '10-year-old', '10-year-old', 'boy', 'None', 'recently', 'was', 'sentenced', '10', 'years', 'imprisonment', '.')
('已', '被

In [18]:
# build up the vocabulary
DataSet_vocab = []
for sentence in DataSet_ch:
    for word in sentence:
        DataSet_vocab.append(word)
DataSet_vocab_frozen = frozenset(DataSet_vocab)
DataSet_vocab_frozen

frozenset({'城',
           '云路',
           '显现',
           '或',
           '设计师',
           '尖峰',
           '腾格里',
           '子女',
           '有人',
           '十分',
           '祈祷',
           '锦标赛',
           '十',
           '纪录',
           '名誉',
           '任天堂',
           '适龄前',
           '占',
           '攀登',
           '乡镇',
           '当局',
           '外交',
           '大将',
           '五十',
           '辩论',
           '一度',
           '55',
           '特区',
           '上帝',
           '抵达',
           '二000年',
           '售价',
           '维持',
           '攻占',
           '机密',
           '智库',
           '利润',
           '东',
           '职务',
           '差',
           '立方米',
           '淘汰',
           '本周',
           '伊丽莎白',
           '回顾',
           '队长',
           '维芳迪',
           '激进分子',
           '店',
           '摆脱',
           '循',
           '洛美',
           '百分之八',
           '延长',
           '各方',
           '情形',
           '随着',
           '力克',
      

In [19]:
len(DataSet_vocab),len(DataSet_vocab_frozen)

(25206, 5259)

# BOW

In [20]:
# create two list for all chinese words and enligh words (one to one map)

words_ch = []
words_en = []

for s_ch, s_en in zip(DataSet_ch, DataSet_en):
    for w_ch, w_en in zip(s_ch, s_en):
        words_ch.append(w_ch)
        words_en.append(w_en)
        
for i in range(10):
    print(words_ch[i]+'->'+words_en[i])

教区->archdiocese
财务->finance
委员会->council
认为->thinks
,->that
这项->this
协议->agreement
可能->may
使得->force
波士顿->boston


In [21]:
from collections import Counter, defaultdict, namedtuple, OrderedDict
# build a dict of chinese-english
def ch_en_dict(ch, en):
    d = defaultdict(lambda: defaultdict(int))
    for c, e in zip(ch, en):
        d[c][e] += 1
    return d

word_counts = ch_en_dict(words_ch, words_en)
word_counts

defaultdict(<function __main__.ch_en_dict.<locals>.<lambda>()>,
            {'教区': defaultdict(int, {'archdiocese': 7}),
             '财务': defaultdict(int, {'finance': 2, 'financial': 2}),
             '委员会': defaultdict(int,
                         {'council': 2,
                          'committee': 1,
                          'committees': 1,
                          'None': 1,
                          'commission': 6}),
             '认为': defaultdict(int,
                         {'thinks': 3,
                          'think': 5,
                          'opinion': 1,
                          'considers': 1,
                          'thought': 3,
                          'agreed': 1,
                          'held': 1,
                          'holds': 1,
                          'found': 1,
                          'was believed that': 1,
                          'it may be': 1,
                          'gain a sense of': 1}),
             ',': defaultdict(int,
  

In [22]:
# now we pick up most most frequent english words for each chinese word
mfc_table = dict((ch_word, max(en_word.keys(), key=lambda key: en_word[key])) for ch_word, en_word in word_counts.items())
print(type(mfc_table))
mfc_table

<class 'dict'>


{'教区': 'archdiocese',
 '财务': 'finance',
 '委员会': 'commission',
 '认为': 'think',
 ',': 'None',
 '这项': 'the',
 '协议': 'agreement',
 '可能': 'may',
 '使得': 'None',
 '波士顿': 'boston',
 '天主教会': 'catholic archdiocese',
 '破产': 'bankruptcy',
 '因为': 'because',
 '声称': 'alleged',
 '受到': 'None',
 '性': 'sexual',
 '侵害': 'abuse',
 '的': 'None',
 '案子': 'cases',
 '越来越': 'keep surfacing',
 '多': 'over',
 '而且': 'and',
 '「': '"',
 '在': 'in',
 '促成': 'leading',
 '程序': 'process',
 '开始': 'started',
 '后': 'after',
 '情况': 'situation',
 '已经': 'has',
 '有': 'None',
 '很': 'None',
 '大': 'None',
 '变化': 'changes',
 '」': '"',
 '。': '.',
 '六十六': '66-year-old',
 '岁': 'None',
 '吉欧根': 'geoghan',
 '一九九一年': '1991',
 '一': 'a',
 '名': 'None',
 '十': '10',
 '男童': 'boy',
 '最近': 'recently',
 '被': 'None',
 '判处': 'sentenced',
 '年': 'years',
 '徒刑': 'imprisonment',
 '已': 'has',
 '削去': 'defrocked',
 '教士': 'priest',
 '职务': 'None',
 '他': 'he',
 '还': 'also',
 '面临': 'faces',
 '其他': 'other',
 '刑事': 'criminal',
 '审判': 'trials',
 '和': 'and',
 '数十': 'sc

In [23]:
FakeState = namedtuple('FakeState', 'name')

class MFCTagger:
    missing = FakeState(name = '<MISSING>')
    
    def __init__(self, table):
        self.table = defaultdict(lambda: MFCTagger.missing)
        self.table.update({word: FakeState(name=tag) for word, tag in table.items()})
        
    def viterbi(self, seq):
        """This method simplifies predictions by matching the Pomegranate viterbi() interface"""
        return 0., list(enumerate(["<start>"] + [self.table[w] for w in seq] + ["<end>"]))

In [24]:
mfc_model = MFCTagger(mfc_table)

In [25]:
def replace_unknown(sequence):
    return [w if w in DataSet_vocab_frozen else 'nan' for w in sequence]
def simplify_decoding(X, model):    
    _, state_path = model.viterbi(replace_unknown(X))
    return [state[1].name for state in state_path[1:-1]]

In [26]:
for sentence,es in zip(TestSet_ch,TestSet_en):
    #print("Sentence Key: {}\n".format(sentence))
    print("Sentence: {}\n".format(sentence))
    print("Predicted labels:\n-----------------")
    print(simplify_decoding(sentence, mfc_model))
    print()
    print("Actual labels:\n--------------")
    print(es)
    print("\n")

Sentence: ('目前', '，', '科学家', '们', '正在', '用', '三硝基', '甲苯', '（', '黄色炸药', '）', '饲养', '一', '群', '羊', '。')

Predicted labels:
-----------------
['at present', ',', 'scientists', 'None', 'is', 'learn', '<MISSING>', '<MISSING>', '(', '<MISSING>', ')', '<MISSING>', 'a', 'crowd', '<MISSING>', '.']

Actual labels:
--------------
('None', 'None', 'scientists', 'scientists', 'is feeding to', 'is feeding to', 'TNT', 'TNT', 'None', 'TNT', 'None', 'is feeding to', 'a flock of', 'a flock of', 'sheep', '.')


Sentence: ('美国', '俄勒冈', '大学', '的', '兽医学', '教授', '莫里', '克雷格', '发现', '，', '反刍', '的', '食物', '的', '哺乳类', '动物', '能够', '有效', '的', '清理', '爆炸物', '污染', '的', '土壤', '。')

Predicted labels:
-----------------
['us', '<MISSING>', 'university', 'None', '<MISSING>', 'professor', '<MISSING>', '<MISSING>', 'found', ',', '<MISSING>', 'None', 'foods', 'None', '<MISSING>', 'None', 'None', 'effective', 'None', 'Clean Up', 'explosives', 'polluted', 'None', '<MISSING>', '.']

Actual labels:
--------------
('None', 'Orego

In [28]:
def accuracy(X, Y, model):
    correct = total_predictions = 0
    # The model.viterbi call in simplify_decoding will return None if the HMM
    # raises an error (for example, if a test sentence contains a word that
    # is out of vocabulary for the training set). Any exception counts the
    # full sentence as an error (which makes this a conservative estimate).
    for ch_sentence, en_sentence in zip(X, Y):
        try:
            most_likely_tags = simplify_decoding(ch_sentence, model)
            for p, t in zip(most_likely_tags, en_sentence):
                if(p.find(t)!=-1):
                    correct += 1
        except:
            pass
        total_predictions += len(ch_sentence)    
    return correct / total_predictions

In [29]:
mfc_training_acc = accuracy(DataSet_ch, DataSet_en, mfc_model)
print("training accuracy mfc_model: {:.2f}%".format(100 * mfc_training_acc))

training accuracy mfc_model: 68.37%


In [30]:
mfc_testing_acc = accuracy(TestSet_ch, TestSet_en, mfc_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * mfc_testing_acc))

testing accuracy mfc_model: 22.48%


# HMM

In [31]:
from collections import Counter, defaultdict, namedtuple, OrderedDict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

# creat Hidden Markov Model for tanslator
hmm_model = HiddenMarkovModel(name="base-hmm-translator")

In [32]:
# create two list for all chinese words and enligh words (one to one map)

words_ch = []
words_en = []

for s_ch, s_en in zip(DataSet_ch, DataSet_en):
    for w_ch, w_en in zip(s_ch, s_en):
        words_ch.append(w_ch)
        words_en.append(w_en)
        
for i in range(10):
    print(words_ch[i]+'->'+words_en[i])

教区->archdiocese
财务->finance
委员会->council
认为->thinks
,->that
这项->this
协议->agreement
可能->may
使得->force
波士顿->boston


In [33]:
len(words_ch),len(words_en)

(25206, 25206)

In [34]:
# First, let's see how many English words we have in our corpus
def unigram_counts(sequences):
    return Counter(sequences)

en_unigram = unigram_counts(words_en)
en_unigram

Counter({'archdiocese': 7,
         'finance': 2,
         'council': 3,
         'thinks': 3,
         'that': 158,
         'this': 86,
         'agreement': 14,
         'may': 42,
         'force': 2,
         'boston': 6,
         'catholic archdiocese': 1,
         'bankruptcy': 3,
         'None': 4259,
         'as': 28,
         'alleged': 5,
         'sexual': 3,
         'abuse': 3,
         'of': 129,
         'cases': 5,
         'keep surfacing': 2,
         ',': 668,
         'and': 365,
         '"': 310,
         'since': 40,
         'leading': 2,
         'process': 7,
         'started': 7,
         'situation': 13,
         'has': 36,
         'dramatically': 6,
         'changed': 1,
         '.': 1009,
         '66-year-old': 2,
         'geoghan': 3,
         'for': 36,
         'in': 270,
         '1991': 1,
         'molestation': 1,
         'a': 117,
         '10-year-old': 2,
         'boy': 2,
         'recently': 14,
         'was': 24,
         'sentence

In [35]:
# see which english word followed by other english words
def bigram_counts(sequences):
    return Counter(sequences)
# !!
words_en_bi = [(words_en[i], words_en[i+1]) for i in range(0,len(words_en)-2,1)]
en_bigram = bigram_counts(words_en_bi)
en_bigram

Counter({('archdiocese', 'finance'): 2,
         ('finance', 'council'): 2,
         ('council', 'thinks'): 1,
         ('thinks', 'that'): 3,
         ('that', 'this'): 2,
         ('this', 'agreement'): 1,
         ('agreement', 'may'): 1,
         ('may', 'force'): 1,
         ('force', 'boston'): 1,
         ('boston', 'catholic archdiocese'): 1,
         ('catholic archdiocese', 'bankruptcy'): 1,
         ('bankruptcy', 'None'): 2,
         ('None', 'as'): 3,
         ('as', 'alleged'): 1,
         ('alleged', 'None'): 3,
         ('None', 'sexual'): 1,
         ('sexual', 'abuse'): 2,
         ('abuse', 'of'): 1,
         ('of', 'cases'): 1,
         ('cases', 'keep surfacing'): 1,
         ('keep surfacing', 'keep surfacing'): 1,
         ('keep surfacing', ','): 1,
         (',', 'and'): 6,
         ('and', '"'): 1,
         ('"', 'since'): 1,
         ('since', 'leading'): 1,
         ('leading', 'agreement'): 1,
         ('agreement', 'None'): 2,
         ('None', 'process'):

In [36]:
# which english words we use in the beginnig of the sentence?
def starting_counts(sequences):
    return Counter(sequences)

start_en_words = [i[0] for i in DataSet_en]
en_words_starts = starting_counts(start_en_words)
en_words_starts

Counter({'archdiocese': 2,
         '66-year-old': 1,
         'None': 48,
         'us': 22,
         '(': 50,
         'ministry of national defense': 2,
         'two': 5,
         'united states': 1,
         'apart from': 2,
         'india': 3,
         'last time': 1,
         'since': 7,
         'after': 8,
         'diplomat': 1,
         'south african': 1,
         'shuttleworth': 3,
         'he': 41,
         'with': 1,
         'spacecraft': 1,
         'this year': 4,
         'human history': 1,
         'russian': 1,
         'castro': 1,
         'white house': 1,
         'state department': 1,
         'her': 1,
         'carter': 1,
         'powell': 1,
         'our': 1,
         'napster': 3,
         'hilbers': 1,
         'california-based': 1,
         'it': 3,
         'once': 1,
         'oecd': 1,
         'economic': 1,
         'however': 23,
         'european': 3,
         'eu': 4,
         'belgian': 1,
         'mid-term': 1,
         'washington': 

In [37]:
# which english word we end with?

def ending_counts(sequences):    
    return Counter(sequences)

end_en_words = [i[len(i)-1] for i in DataSet_en]
en_words_ends = ending_counts(end_en_words)
en_words_ends

Counter({'.': 790,
         'exercise': 1,
         '"': 71,
         'earth': 1,
         'None': 18,
         'stance': 1,
         'resigns': 1,
         'economy': 1,
         'slow down': 1,
         'defend': 1,
         'loss': 1,
         'embargo': 1,
         'rose': 1,
         'retire': 1,
         'summit': 1,
         'missile': 1,
         'bankrupt': 1,
         'speech': 1,
         'protests': 1,
         'police force': 1,
         'loss lives': 1,
         'apology': 1,
         'future hangs in the air': 1,
         'progress': 1,
         'actions': 2,
         'superhero': 1,
         'golden jubilee': 1,
         'fined': 1,
         'slow': 1,
         'key': 1,
         'claims crown': 1,
         'weapons': 1,
         'situation': 1,
         'show': 1,
         'benefit enormously': 1,
         'obstacles': 1,
         'talks': 1,
         'hotels': 1,
         'arrested': 1,
         'shaking': 1,
         'towns': 1,
         'dead': 2,
         'escalate

In [38]:
# count chinese words that associated to each english word, and we can get emission probabilities 
# (given english words, the probability of observing certain chinese words)
def pair_counts(en_w, ch_w):
    d = defaultdict(lambda: defaultdict(int))
    for en, ch in zip(en_w, ch_w):
        d[en][ch] += 1
    return d

en_ch_count = pair_counts(words_en, words_ch)
en_ch_count

defaultdict(<function __main__.pair_counts.<locals>.<lambda>()>,
            {'archdiocese': defaultdict(int, {'教区': 7}),
             'finance': defaultdict(int, {'财务': 2}),
             'council': defaultdict(int, {'委员会': 2, '理事会': 1}),
             'thinks': defaultdict(int, {'认为': 3}),
             'that': defaultdict(int,
                         {',': 153, ':': 1, '这': 2, '款': 1, '，': 1}),
             'this': defaultdict(int,
                         {'这项': 7,
                          '这': 25,
                          '这次': 6,
                          '这个': 14,
                          '此': 17,
                          '这种': 7,
                          '这部': 1,
                          '这家': 3,
                          '今天': 2,
                          '本': 1,
                          '此次': 1,
                          '它': 2}),
             'agreement': defaultdict(int, {'协议': 13, '协定': 1}),
             'may': defaultdict(int,
                         {'可能': 14,
    

In [39]:
# yielding the distribution of chinese words (per english word), calculate emission probability
  
to_pass_states = []
for en, ch in en_ch_count.items():
    total = float(sum(ch.values()))
    distribution = {word: count/total for word, count in ch.items()}
    en_emissions = DiscreteDistribution(distribution)
    en_state = State(en_emissions, name=en)
    to_pass_states.append(en_state)

In [40]:
distribution

{'残渣': 1.0}

In [41]:
to_pass_states

[{
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "\u6559\u533a" : 1.0
             }
         ],
         "frozen" : false
     },
     "name" : "archdiocese",
     "weight" : 1.0
 }, {
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "\u8d22\u52a1" : 1.0
             }
         ],
         "frozen" : false
     },
     "name" : "finance",
     "weight" : 1.0
 }, {
     "class" : "State",
     "distribution" : {
         "class" : "Distribution",
         "dtype" : "str",
         "name" : "DiscreteDistribution",
         "parameters" : [
             {
                 "\u59d4\u5458\u4f1a" : 0.6666666666666666,
                 "\u7406\u4e8b\u4f1a" : 0.333333333333333

In [42]:
# validation-----we should have state for each english word, the number should be same
len(to_pass_states),len(en_unigram)

(5677, 5677)

In [43]:
type(to_pass_states[0])

pomegranate.base.State

In [44]:
# add states to our model
hmm_model.add_states(to_pass_states)

In [45]:
# add start probability
start_prob={}

# for tag in tags:
#     start_prob[tag] = starting_tag_count[tag] / tags_count[tag]

# for tag_state in to_pass_states :
#     hmm_model.add_transition(hmm_model.start, tag_state, start_prob[tag_state.name])  
    

for en in words_en:
    start_prob[en] = en_words_starts[en]/ sum(en_words_starts.values()) 

#for en in words_en:
#    start_prob[en] = en_words_starts[en]/ en_unigram[en]

for en_state in to_pass_states:
    hmm_model.add_transition(hmm_model.start, en_state, start_prob[en_state.name])

In [44]:
# !!! try not to add end probability !!!!
# add end probability

end_prob={}

# for tag in tags:
#     end_prob[tag] = ending_tag_count[tag]/tags_count[tag]
    
# for tag_state in to_pass_states :
#     hmm_model.add_transition(tag_state, hmm_model.end, end_prob[tag_state.name])
    
for en in words_en:
     end_prob[en] = en_words_ends[en]/ sum(en_words_ends.values()) 

#for en in words_en:
#    end_prob[en] = en_words_ends[en]/ en_unigram[en]

for en_state in to_pass_states:
    hmm_model.add_transition(en_state, hmm_model.end, end_prob[en_state.name])

In [46]:
# add transition probability
transition_prob_pair={}

# for key in tag_bigrams.keys():
#     transition_prob_pair[key] = tag_bigrams.get(key)/tags_count[key[0]]
    
# for tag_state in to_pass_states:
#     for next_tag_state in to_pass_states:
#         hmm_model.add_transition(tag_state, next_tag_state, transition_prob_pair[(tag_state.name, next_tag_state.name)])


for en in en_bigram.keys():
    transition_prob_pair[en] = en_bigram.get(en)/ en_unigram[en[0]]

for en_state in to_pass_states:
    for next_en_state in to_pass_states:
        if (en_state.name, next_en_state.name) in transition_prob_pair.keys():
            hmm_model.add_transition(en_state, next_en_state, transition_prob_pair[(en_state.name, next_en_state.name)])
#         else:
#             #!! if path no exits(there is no one english word to another english word due the the limited data size)
#             hmm_model.add_transition(en_state, next_en_state, 0)

In [47]:
transition_prob_pair

{('archdiocese', 'finance'): 0.2857142857142857,
 ('finance', 'council'): 1.0,
 ('council', 'thinks'): 0.3333333333333333,
 ('thinks', 'that'): 1.0,
 ('that', 'this'): 0.012658227848101266,
 ('this', 'agreement'): 0.011627906976744186,
 ('agreement', 'may'): 0.07142857142857142,
 ('may', 'force'): 0.023809523809523808,
 ('force', 'boston'): 0.5,
 ('boston', 'catholic archdiocese'): 0.16666666666666666,
 ('catholic archdiocese', 'bankruptcy'): 1.0,
 ('bankruptcy', 'None'): 0.6666666666666666,
 ('None', 'as'): 0.000704390702042733,
 ('as', 'alleged'): 0.03571428571428571,
 ('alleged', 'None'): 0.6,
 ('None', 'sexual'): 0.00023479690068091102,
 ('sexual', 'abuse'): 0.6666666666666666,
 ('abuse', 'of'): 0.3333333333333333,
 ('of', 'cases'): 0.007751937984496124,
 ('cases', 'keep surfacing'): 0.2,
 ('keep surfacing', 'keep surfacing'): 0.5,
 ('keep surfacing', ','): 0.5,
 (',', 'and'): 0.008982035928143712,
 ('and', '"'): 0.0027397260273972603,
 ('"', 'since'): 0.0032258064516129032,
 ('sin

In [48]:
hmm_model.bake()

In [49]:
# sample a state-path(observation)
hmm_model.sample(length= 6)

array(['这', '一', '了', '暴雨', '原因', '人民'], dtype='<U2')

In [50]:
def replace_unknown(sequence):
    return [w for w in sequence if w in DataSet_vocab_frozen]

# re-define the decoding--- state path might no exist!
def simplify_decoding(X, model):    
    _, state_path = model.viterbi(replace_unknown(X))
    if state_path is not None:
        return [state[1].name for state in state_path[1:]]

see if this model works!

In [51]:
hmm_model.node_count()

5678

In [52]:
# test the model
a = ['则', '警告', '：', '巴勒斯坦', '全国', '安全', '。']
hmm_model.log_probability(a)

-26.84114220862837

In [53]:
print(", ".join(state.name for i, state in hmm_model.viterbi(['则', '警告', '：', '巴勒斯坦', '全国', '安全', '。'])[1]))

base-hmm-translator-start, on the other hand, warned, :, palestinian, national, security, .


In [54]:
simplify_decoding(a, hmm_model)

['on the other hand',
 'warned',
 ':',
 'palestinian',
 'national',
 'security',
 '.']

In [55]:
# a function that validate that there is a state path for input sentence
def testpath(X,model):
    _, state_path = model.viterbi(replace_unknown(X))
    if state_path != None:
        return True
    else:
        return False

# a more complex decoding, try to translate as much as possible words together
# there might be some case that we can not find a path for whole sentence, 
# but we can find a path for a part of sentence.
def decoding(X,model):
    X = replace_unknown(X)
    results = []
    start = 0
    end = 1
    tmp = []
    while len(results)!= len(X):
        while testpath(X[start:end],model) and end < len(X):
            if end <= len(X):
                end += 1
        # case 1: even can not translate a single word
        if end == start+1:
            results.append('None')
            start = end
            end += 1
        # case 2: can translate whole sentence
        elif start == 0 and end == len(X):
            results = simplify_decoding(X, model)
            end += 1
        # case 3: can translate part of sentence
        else:
            tmp = simplify_decoding(X[start:end-1], model)
            start = end
            end += 1
            results += tmp
    return results
            

In [56]:
# test the new decoding function
test = ['鼓励', '孩子', '发挥', '想象力', '，', '每天', '和', '孩子', '一', '起', '阅读', '。']
print(testpath(test,hmm_model))
decoding(test,hmm_model)

True


['To encourage',
 "your youngster 's",
 'None',
 'imagination',
 ',',
 'every day',
 'to',
 'him',
 'None',
 'to',
 'read',
 '.']

In [57]:
for sentence, result in zip(TestSet_ch, TestSet_en):
    print("Sentence: {}\n".format(sentence))
    print("Predicted labels:\n-----------------")
#     print(simplify_decoding(sentence, hmm_model))
#     print()
#     print("decoding:")
    print(decoding(sentence, hmm_model))
    print()
    print("Actual labels:\n--------------")
    print(result)
    print("\n")

Sentence: ('目前', '，', '科学家', '们', '正在', '用', '三硝基', '甲苯', '（', '黄色炸药', '）', '饲养', '一', '群', '羊', '。')

Predicted labels:
-----------------
['None', '.', 'Scientists', 'Scientists', 'None', 'None', '(', 'a', 'crowd', 'None', 'None']

Actual labels:
--------------
('None', 'None', 'scientists', 'scientists', 'is feeding to', 'is feeding to', 'TNT', 'TNT', 'None', 'TNT', 'None', 'is feeding to', 'a flock of', 'a flock of', 'sheep', '.')


Sentence: ('美国', '俄勒冈', '大学', '的', '兽医学', '教授', '莫里', '克雷格', '发现', '，', '反刍', '的', '食物', '的', '哺乳类', '动物', '能够', '有效', '的', '清理', '爆炸物', '污染', '的', '土壤', '。')

Predicted labels:
-----------------
['us', 'None', 'None', ',', 'None', 'None', 'None', 'None', 'effective', 'None', 'Clean Up', 'Explosives', 'None', 'None', 'None', 'None', 'None', 'None']

Actual labels:
--------------
('None', 'Oregon', 'State University', 'None', 'veterinary', 'scientist', 'Morrie', 'Craig', 'has found that', ',', 'cud-chewing', 'cud-chewing', 'cud-chewing', 'cud-chewing', 'm

['we', 'None', 'None', 'set', 'None', 'None', 'us', 'navy', 'in', 'nbc', 'None', 'None', 'the', 'None', 'None', 'vice', 'governor', 'None', 'said', 'None', 'None', 'None']

Actual labels:
--------------
('"', 'Such events are history in the making', 'Such events are history in the making', 'Such events are history in the making', 'Such events are history in the making', 'Such events are history in the making', '"', ',', 'for', 'the U.S.', 'Navy', 'built', 'the X-47B program', 'None', 'Northrop', 'Grumman', 'Grumman', 'Aerospace', 'Systems', 'None', 'None', 'None', 'None', 'None', 'vice', 'president', 'and', 'program', 'manager', 'Janis', 'Pamiljans', 'said', '.')


Sentence: ('“', '我们', '已经', '超越', '了', '想', '在', '飞行器', '上', '所', '达到', '的', '目标', '！', '”')

Predicted labels:
-----------------
['we', 'already', 'None', 'None', 'over', 'previous', 'None', 'reached', 'None', 'goal']

Actual labels:
--------------
('"', 'We', 'surpassed', 'surpassed', 'surpassed', 'wanted to', 'with', 'the

['in addition', 'None', 'this', 'is', 'None', 'Some', 'None', 'first', 'None', 'flight', 'None', 'it', 'may', 'be', 'None', ',', 'yet', 'still', 'since', 'the', 'None', 'project', 'None', 'started', 'None', 'None', 'None', 'None', 'None']

Actual labels:
--------------
('Engdahl', 'None', 'said', '"', 'None', 'None', 'Navy', 'unmanned', 'unmanned', 'X-plane', 'a X-plane', 'of', 'First', 'First', 'flight', 'None', 'that', "'s", "'s", 'a huge deal', 'a huge deal', ',', 'but', 'None', 'just', "'s", 'this', 'this', 'program', 'to', 'one of the many firsts', 'just', '"', '.')


Sentence: ('我们', '将', '见证', '更', '多', '的', '新', '面孔', '现身', '全明星', '舞台', '，', '尤其', '考虑', '到', '甜瓜', '马上', '即将', '东', '渡', '而', '姚', '明', '又', '再次', '报销', ',', '西部', '必', '将', '天下大乱', '。')

Predicted labels:
-----------------
['we', 'None', 'more', 'more', 'None', 'new', ',', 'especially', 'at', 'will', 'however', 'None', 'further', 'None', 'western', 'None', 'None', 'None', 'None', 'None']

Actual labels:
----------

['in', 'in', 'None', 'so far', 'None', 'more', 'more', 'None', 'None', 'projects', '.', 'None', 'all', 'in', 'a number of', 'None', 'seven', 'None', 'None', 'None', 'None']

Actual labels:
--------------
('in', 'Utah', 'Utah', 'in', 'than', 'ever before', 'shouldering', "'s shouldering", 'more', 'more', 'of', 'scoring', 'load', ', and', 'per game', 'per', 'getting', "'s getting", 'line', 'the line', 'more than', 'more than', 'seven', 'times', '.')


Sentence: ('身体', '和', '技术', '都', '能', '摇摆', '到', '分位', '，', '防守', '上', '的', '出众', '表现', '使', '其', '在', '这', '张', '名单', '中', '干掉', '纳什', '。')

Predicted labels:
-----------------
['None', 'None', 'technology', 'None', 'None', 'None', 'None', 'previous', 'None', 'performance', 'None', 'them', 'in', 'None', 'zhang', 'in', 'None', 'None']

Actual labels:
--------------
('the size', 'and', 'flexibility', 'None', 'to', 'shift', 'to', 'shooting guard', ', and', 'defensively', 'None', 'None', 'enough', 'None', 'to', 'None', 'on', 'my', 'my', 'ballo

['None', 'as', 'now', 'None', 'still', 'None', 'some', 'however', 'None', 'preparation', 'None', 'participants', 'None', 'None', 'while', 'his', 'None', 'performance', 'None', 'it', 'None', 'can', 'None', 'None', 'experiment with the variables', 'experiment with the variables', 'None', 'None']

Actual labels:
--------------
('None', 'as', 'a rookie', 'make it', 'None', 'None', 'None', 'tough', 'None', 'tough', ',', 'but', "Griffin's", 'already', 'None', 'None', 'for', 'dunk', 'contest', 'None', ',', 'and', "he's", 'None', 'None', 'None', 'most folks', 'realize', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '.')


Sentence: ('即使', '没有', '开发', '出', '什么', '低位', '和', '左手', '技术', '，', '他', '依然', '能', '取得', '52%', '的', '命中率', '，', '平均', '一', '场', '比赛', '8', '次', '走', '上', '罚球', '线', '的', '同时', '他', '也', '成为', '了', '联盟', '最', '顶尖', '的', '篮板', '手', '之一', '（', '无论', '是', '进攻', '还', '是', '防守', '篮板', '）', '。')

Predicted labels:
-----------------
['even', 'None', 'development', 'None',

['with', 'a', 'correspondingly', ',', 'brazil', 'None', 'in', 'None', 'take', 'None', 'more', 'more', 'None', 'responsibility', 'None', 'while', 'None', 'None', 'aimed', 'None', 'None', 'None', 'None']

Actual labels:
--------------
('as', 'injuries', 'have', 'have', 'decimated', 'Denver', 'None', 'front', 'line', ',', 'None', 'None', 'None', 'None', 'both ends', 'ends', 'carry', 'None', 'a huge', 'a huge', 'huge', 'burden', ',', 'and', "'s 's 's", "'s 's 's", 'in', '63', 'percent', 'percent', 'field-goal percentage', 'leading', 'leading', 'the league', '.')


Sentence: ('哦', '对了', '，', '他', '在', '这', '份', '名单', '中', '可能', '以', '中锋', '身份', '出现', '哦', '。')

Predicted labels:
-----------------
['None', 'he', 'during', 'this', 'None', 'during', 'can', 'None', 'posted', 'None', 'None']

Actual labels:
--------------
('None', 'A small plus', ':', 'He', 'None', 'the', 'the', 'ballot', 'None', 'actually', 'as', 'a center', 'center', "'s listed", 'None', '.')


Sentence: ('扎克', '兰道夫', '(', 'PE

In [58]:
def accuracy(X, Y, model):
    correct = total_predictions = 0
    # The model.viterbi call in simplify_decoding will return None if the HMM
    # raises an error (for example, if a test sentence contains a word that
    # is out of vocabulary for the training set). Any exception counts the
    # full sentence as an error (which makes this a conservative estimate).
    for ch_sentence, en_sentence in zip(X, Y):
        try:
            most_likely_tags = simplify_decoding(ch_sentence, model)
            for word in en_sentence:
                if word in most_likely_tags:
                    correct += 1
        except:
            pass
        total_predictions += len(ch_sentence)    
    return correct / total_predictions

In [59]:
hmm_training_acc = accuracy(DataSet_ch, DataSet_en, hmm_model)
print("training accuracy hmm_model: {:.2f}%".format(100 * hmm_training_acc))

training accuracy hmm_model: 98.42%


In [60]:
hmm_testing_acc = accuracy(TestSet_ch, TestSet_en, hmm_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * hmm_testing_acc))

testing accuracy mfc_model: 2.73%


HMM model goes pretty well at what it has learned(training data). 
However, the testing accuray is very low, note that we only have 1000 training data, as the training data become bigger, we can have better result.