In [1]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# project packages imported in the next cell can be resolved.
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guarded append: re-running this cell must not pile up duplicate entries.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [2]:
import LM.lm as lm
import TM.ibm1 as tm
import TM.phrase_based as pbtm
import Decoder.ibm1 as decoder
import Decoder.phrase_based as pb_decoder
import utils as utils
from TM.phrase_based import PhraseBasedModel

In [5]:
# Read the two sides of the parallel corpus from the small JSON sample and
# echo them, one corpus per output line.
# NOTE(review): the trailing 20 is presumably a cap on the number of sentence
# pairs loaded -- confirm against utils.load_data.
direction = 'forward'
f_corpus, e_corpus = utils.load_data('../data/small.json', direction, 20)
print(f_corpus, e_corpus, sep='\n')

['the date is not a fetish', 'the results speak for themselves', 'it should mean that everyone is given an equal opportunity', 'europe has done a very great deal of work in this field which is to be welcomed', 'we simply have to build it stage by stage', 'it is an issue of proportionality', 'we are talking about rights plurality and freedom', 'that attempt did not succeed but it does represent a precedent for that kind of gathering', 'this green paper is important seeing as it concerns a matter that needs to be dealt with', 'mr audy asked the following question are your samples of sufficient size', 'it is up to you to consider and choose the outcome you want to see', 'what can we expect from china', 'we must pay by giving financial support to developing countries', 'i will however refer your request to the conference of presidents which will meet later this afternoon', 'it says that the act also complies with international standards and pursues a legitimate goal', 'dates of forthcoming

# IBM 1 Translation Model

In [6]:
# Build the IBM Model 1 translation model from the two corpora, train it, and
# compute the word alignments.
# NOTE(review): the two train() arguments look like an iteration cap and a
# convergence tolerance -- confirm against TM.ibm1.
ibm1 = tm.IBMModel1(f_corpus, e_corpus)
ibm1.preprocess()
ibm1.train(100_000_000, 1e-7)
ibm1.align()

# Print each learned table / alignment with its label.
for _label, _value in (
    ('ibm1.translation_table: ', ibm1.translation_table),
    ('ibm1.translation_tuple: ', ibm1.translation_tuple),
    ('ibm1.f_e_alignment: ', ibm1.f_e_alignment),
    ('ibm1.e_f_alignment: ', ibm1.e_f_alignment),
):
    print(_label, _value)

# Optional, more readable inspection (left disabled):
# for f_e_pair, prob in sorted(ibm1.translation_tuple.items()):
#     print(f_e_pair, prob)

# for i in range(len(f_corpus)):
#     print(f_corpus[i])
#     print(e_corpus[i])
#     print('e_f_alignment', ibm1.e_f_alignment[i])
#     print('f_e_alignment', ibm1.f_e_alignment[i])

ibm1.translation_table:  defaultdict(<function IBMModel1.train.<locals>.<lambda> at 0x000001AD48D96320>, {'the': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x000001AD48D964D0>, {'date': 5e-324, 'be': 5e-324, 'not': 5e-324, 'fetish': 5e-324, 'result': 5e-324, 'speak': 5e-324, 'for': 3.5783705174576454e-275, 'mselves': 5e-324, 'mr': 5e-324, 'audy': 5e-324, 'ask': 5e-324, 'follow': 5e-324, 'question': 5e-324, 'you': 1.0, 'sample': 5e-324, 'sufficient': 5e-324, 'size': 5e-324, 'it': 0.0, 'up': 5e-324, 'to': 0.0, 'consider': 5e-324, 'and': 0.0, 'choose': 5e-324, 'outcome': 5e-324, 'want': 5e-324, 'see': 0.0, 'i': 5e-324, 'will': 0.0, 'however': 5e-324, 'refer': 5e-324, 'request': 5e-324, 'conference': 5e-324, 'president': 5e-324, 'which': 0.0, 'meet': 5e-324, 'later': 5e-324, 'this': 0.0, 'afternoon': 5e-324, 'say': 5e-324, 'that': 0.0, 'act': 5e-324, 'also': 5e-324, 'comply': 5e-324, 'with': 0.0, 'international': 5e-324, 'standard': 5e-324, 'pursue': 5e-32

## The Language Models

In [7]:
# Unigram language model built from e_corpus (the same corpus handed to the
# IBM Model 1 constructor as its second argument).
unigram_lm = lm.Unigram(e_corpus)
# Both calls are made for their effect on the model object; their return
# values are not used.
unigram_lm.preprocess()
unigram_lm.train()

In [8]:
# Bigram language model built from e_corpus, prepared with the same
# preprocess() / train() sequence as the unigram model.
bigram_lm = lm.Bigram(e_corpus)
# Both calls are made for their effect on the model object; their return
# values are not used.
bigram_lm.preprocess()
bigram_lm.train()

In [9]:
# Trigram language model built from e_corpus, prepared with the same
# preprocess() / train() sequence as the unigram and bigram models.
trigram_lm = lm.Trigram(e_corpus)
# Both calls are made for their effect on the model object; their return
# values are not used.
trigram_lm.preprocess()
trigram_lm.train()

## The Decoder (Translation)

In [11]:
# Baseline decoder: constructed from the IBM Model 1 object only, with no
# language model argument.
translator = decoder.Decoder(ibm1)
# Last expression of the cell, so the returned translation is displayed.
translator.translate("the date is not a fetish")

'you date be not be date'

### With Unigram

In [13]:
# Rebind `translator` to a decoder that is also given the unigram language model.
translator = decoder.Decoder(ibm1, unigram_lm)
# Same input sentence as the other decoder cells, so the outputs can be compared.
translator.translate("the date is not a fetish")

'you date be not be date'

### With Bigram

In [14]:
# Rebind `translator` to a decoder that is also given the bigram language model.
translator = decoder.Decoder(ibm1, bigram_lm)
# Same input sentence as the other decoder cells, so the outputs can be compared.
translator.translate("the date is not a fetish")

'you fetish be not with fetish'

### With Trigram

In [15]:
# Rebind `translator` to a decoder that is also given the trigram language model.
translator = decoder.Decoder(ibm1, trigram_lm)
# Same input sentence as the other decoder cells, so the outputs can be compared.
translator.translate("the date is not a fetish")

'you date be not be date'

# Phrase-Based Model

In [16]:
# Extract the phrase pairs from the corpus, using the IBM Model 1 f->e
# alignments as input.
# The model is constructed through the directly imported class rather than
# `pbtm.PhraseBasedModel`: this cell rebinds the name `pbtm` from the
# TM.phrase_based module to the model instance, so going through `pbtm` would
# raise AttributeError the second time the cell is run. Later cells still
# receive the model under the name `pbtm`.
pbtm = PhraseBasedModel(f_corpus, e_corpus, ibm1.f_e_alignment)
# (sic) the method name is spelled this way in the call that produced the
# recorded output.
pbtm.extrat_phrase_pairs()
print('pbmt.phrase_table: ', pbtm.phrase_table)

pbmt.phrase_table:  [((3, 5), (2, 3), 'not a', 'not'), ((2, 4), (1, 3), 'is not', 'be not'), ((0, 6), (0, 4), 'the date is not a fetish', 'date be not fetish'), ((3, 6), (2, 3), 'not a fetish', 'not'), ((2, 5), (1, 3), 'is not a', 'be not'), ((2, 3), (1, 2), 'is', 'be'), ((0, 4), (0, 4), 'the date is not', 'date be not fetish'), ((0, 5), (0, 4), 'the date is not a', 'date be not fetish'), ((1, 4), (0, 4), 'date is not', 'date be not fetish'), ((2, 6), (1, 3), 'is not a fetish', 'be not'), ((1, 6), (0, 4), 'date is not a fetish', 'date be not fetish'), ((1, 5), (0, 4), 'date is not a', 'date be not fetish'), ((3, 4), (2, 3), 'not', 'not'), ((0, 4), (0, 4), 'the results speak for', 'result speak for mselves'), ((3, 5), (2, 3), 'for themselves', 'for'), ((1, 4), (0, 4), 'results speak for', 'result speak for mselves'), ((2, 4), (2, 3), 'speak for', 'for'), ((1, 5), (0, 4), 'results speak for themselves', 'result speak for mselves'), ((3, 4), (2, 3), 'for', 'for'), ((0, 5), (0, 4), 'the re

In [17]:
# Score the extracted phrase pairs, then print the (score, (f_phrase, e_phrase))
# entries in descending sort order.
pbtm.score_phrase_pairs()
print('pbmt.score_to_f_e_phrase', list(reversed(sorted(pbtm.score_to_f_e_phrase))))

pbmt.score_to_f_e_phrase [(1.0, ('we must', 'we must')), (1.0, ('this green paper is important seeing as it concerns a matter that needs to be dealt with', 'this green paper be important see as it concern matter that need to be deal with')), (1.0, ('this green paper is important seeing as it concerns a matter that needs to', 'this green paper be important see as it concern matter that need to be')), (1.0, ('needs', 'need')), (1.0, ('must', 'must')), (1.0, ('it is up to you to consider and choose the outcome you want to see', 'it be up to you to consider and choose outcome you want to see')), (1.0, ('is up to you to consider and choose the outcome you want to see', 'be up to you to consider and choose outcome you want to see')), (1.0, ('is to', 'be to be')), (1.0, ('important for', 'important for')), (1.0, ('green paper is important seeing as it concerns a matter that needs to be dealt with', 'green paper be important see as it concern matter that need to be deal with')), (1.0, ('green 

In [19]:
# Phrase-based decoder built from the phrase model and the unigram language
# model. Here `pbtm` is the PhraseBasedModel instance created in the
# phrase-extraction cell, not the TM.phrase_based module.
translator = pb_decoder.Decoder(pbtm, unigram_lm)
# Same input sentence as the IBM Model 1 decoder cells, for comparison.
translator.translate("the date is not a fetish")

'date be not fetish'