In [17]:
import sys
import os

# Make the parent of the current working directory importable (the original note calls this
# the Integration directory), so that modules and packages located there can be imported.
# The membership check keeps the cell idempotent: re-running it does not pile up duplicate
# entries in sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import LM.lm as lm
import TM.ibm1 as ibm1
import TM.phrase_based as phrase_based
import Decoder.ibm1 as decoder
import Decoder.phrase_based as pb_decoder
import utils as utils

In [3]:
# Load the two corpora for the selected direction and echo each one as a quick sanity check.
direction = 'forward'
f_corpus, e_corpus = utils.load_data('../data/data.json', direction)
for sentences in (f_corpus, e_corpus):
    print(sentences)

['the girl is in france', 'paris is a city in france', 'the girl is beautiful', 'paris is a beautiful city', 'the girl is in church', 'a church is in paris', 'the church is beautiful', 'france has a beautiful church']
['girl be in france', 'paris be city in france', 'girl be beautiful', 'paris be beautiful city', 'girl be in church', 'church be in paris', 'church be beautiful', 'france have beautiful church']


# IBM 1 Translation Model

In [4]:
# Build the IBM Model 1 translation model from the two corpora, then preprocess, train and align.
tm = ibm1.IBMModel1(f_corpus, e_corpus)
tm.preprocess()
# NOTE(review): the two positional arguments look like an iteration cap and a convergence
# threshold — confirm against IBMModel1.train before tuning them.
tm.train(100_000_000, 1e-7)
tm.align()

# Dump the model's tables and both alignment attributes for inspection.
for label, value in (
    ('tm.translation_table: ', tm.translation_table),
    ('tm.translation_tuple: ', tm.translation_tuple),
    ('tm.f_e_alignment: ', tm.f_e_alignment),
    ('tm.e_f_alignment: ', tm.e_f_alignment),
):
    print(label, value)

# for f_e_pair, prob in sorted(tm.translation_tuple.items()):
#     print(f_e_pair, prob)

# for i in range(len(f_corpus)):
#     print(f_corpus[i])
#     print(e_corpus[i])
#     print('e_f_alignment', tm.e_f_alignment[i])
#     print('f_e_alignment', tm.f_e_alignment[i])

tm.translation_table:  defaultdict(<function IBMModel1.train.<locals>.<lambda> at 0x00000267C1CF3E20>, {'the': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x00000267E1FA2290>, {'girl': 0.48389928480146044, 'be': 0.38397971962691213, 'in': 0.01893506411683187, 'france': 6.273033751106227e-60, 'beautiful': 0.05659296572741154, 'church': 0.05659296572738387, 'city': 0.0, 'have': 0.0, 'paris': 0.0}), 'girl': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x00000267E1FA2170>, {'girl': 0.9623433360159952, 'be': 1.2401777814299425e-24, 'in': 0.03765666398400472, 'france': 1.247534852104058e-59, 'beautiful': 4.215313480557721e-73, 'church': 3.9738375795389604e-73, 'city': 0.0, 'have': 0.0, 'paris': 0.0}), 'is': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x00000267E1FA2320>, {'girl': 1.2736734644110412e-116, 'be': 0.9999993465829052, 'in': 6.534170946600426e-07, 'france': 2.2439676909151936e-76, 'pa

## The Language Models

In [5]:
# Unigram language model built from e_corpus (the second corpus returned by utils.load_data).
# preprocess() is called before train(), the same order used for the other models in this notebook.
unigram_lm = lm.Unigram(e_corpus)
unigram_lm.preprocess()
unigram_lm.train()

In [6]:
# Bigram language model built from e_corpus (the second corpus returned by utils.load_data).
# preprocess() is called before train(), the same order used for the other models in this notebook.
bigram_lm = lm.Bigram(e_corpus)
bigram_lm.preprocess()
bigram_lm.train()

In [7]:
# Trigram language model built from e_corpus (the second corpus returned by utils.load_data).
# preprocess() is called before train(), the same order used for the other models in this notebook.
trigram_lm = lm.Trigram(e_corpus)
trigram_lm.preprocess()
trigram_lm.train()

## The Decoder (Translation)

In [8]:
# Baseline: decode with the IBM Model 1 translation model only (no language model is passed).
# The bare last expression is what the notebook displays as the cell's result.
translator = decoder.Decoder(tm)
translator.translate("the date is not a fetish")

'girl date be not paris fetish'

### With Unigram

In [9]:
# Same test sentence, with unigram_lm passed as the decoder's second argument.
# NOTE(review): presumably the language model is used to choose among candidate words —
# confirm in Decoder.ibm1.
translator = decoder.Decoder(tm, unigram_lm)
translator.translate("the date is not a fetish")

'be date be not paris fetish'

### With Bigram

In [10]:
# Same test sentence, with bigram_lm passed as the decoder's second argument.
# NOTE(review): presumably the language model is used to choose among candidate words —
# confirm in Decoder.ibm1.
translator = decoder.Decoder(tm, bigram_lm)
translator.translate("the date is not a fetish")

'girl date be not paris fetish'

### With Trigram

In [11]:
# Same test sentence, with trigram_lm passed as the decoder's second argument.
# NOTE(review): presumably the language model is used to choose among candidate words —
# confirm in Decoder.ibm1.
translator = decoder.Decoder(tm, trigram_lm)
translator.translate("the date is not a fetish")

'girl date be not paris fetish'

# Phrase-Based Model

In [12]:
# Extract the phrase pairs from the corpus, feeding the phrase-based model the
# tm.f_e_alignment attribute of the IBM Model 1 object built earlier in this notebook.
pbtm = phrase_based.PhraseBasedModel(f_corpus, e_corpus, tm.f_e_alignment)
# NOTE(review): `extrat_phrase_pairs` looks like a misspelling of "extract"; the method is
# defined in TM.phrase_based, so it can only be renamed there (together with this call).
pbtm.extrat_phrase_pairs()
# NOTE(review): the printed label says "pbmt" while the variable is `pbtm` — likely a typo.
print('pbmt.phrase_table: ', pbtm.phrase_table)

pbmt.phrase_table:  [((1, 3), (0, 2), 'girl is', 'girl be'), ((0, 2), (0, 1), 'the girl', 'girl'), ((1, 2), (0, 1), 'girl', 'girl'), ((2, 5), (1, 4), 'is in france', 'be in france'), ((0, 3), (0, 2), 'the girl is', 'girl be'), ((0, 4), (0, 3), 'the girl is in', 'girl be in'), ((3, 5), (2, 4), 'in france', 'in france'), ((3, 4), (2, 3), 'in', 'in'), ((1, 4), (0, 3), 'girl is in', 'girl be in'), ((2, 4), (1, 3), 'is in', 'be in'), ((0, 5), (0, 4), 'the girl is in france', 'girl be in france'), ((1, 5), (0, 4), 'girl is in france', 'girl be in france'), ((2, 3), (1, 2), 'is', 'be'), ((4, 5), (3, 4), 'france', 'france'), ((1, 3), (1, 2), 'is a', 'be'), ((0, 3), (0, 2), 'paris is a', 'paris be'), ((0, 2), (0, 2), 'paris is', 'paris be'), ((0, 5), (0, 4), 'paris is a city in', 'paris be city in'), ((0, 1), (0, 1), 'paris', 'paris'), ((2, 4), (2, 3), 'a city', 'city'), ((1, 2), (1, 2), 'is', 'be'), ((1, 5), (1, 4), 'is a city in', 'be city in'), ((0, 4), (0, 3), 'paris is a city', 'paris be c

In [13]:
# Score the extracted phrase pairs, then list the scored entries in descending sort order.
pbtm.score_phrase_pairs()
ranked_phrases = sorted(pbtm.score_to_f_e_phrase)
ranked_phrases.reverse()
# NOTE(review): the printed label says "pbmt" while the variable is `pbtm` — likely a typo.
print('pbmt.score_to_f_e_phrase', ranked_phrases)

pbmt.score_to_f_e_phrase [(1.0, ('paris is a city in france', 'paris be city in france')), (1.0, ('paris is a city in', 'paris be city in')), (1.0, ('paris is a city', 'paris be city')), (1.0, ('paris is a beautiful city', 'paris be beautiful city')), (1.0, ('paris is a beautiful', 'paris be beautiful')), (1.0, ('paris', 'paris')), (1.0, ('is in paris', 'be in paris')), (1.0, ('is in france', 'be in france')), (1.0, ('is in church', 'be in church')), (1.0, ('is in', 'be in')), (1.0, ('is a city in france', 'be city in france')), (1.0, ('is a city in', 'be city in')), (1.0, ('is a city', 'be city')), (1.0, ('is a beautiful city', 'be beautiful city')), (1.0, ('in paris', 'in paris')), (1.0, ('in france', 'in france')), (1.0, ('in church', 'in church')), (1.0, ('in', 'in')), (1.0, ('has a beautiful church', 'have beautiful church')), (1.0, ('has a beautiful', 'have beautiful')), (1.0, ('france has a beautiful church', 'france have beautiful church')), (1.0, ('france has a beautiful', 'fr

In [14]:
# Decode the same test sentence with the phrase-based decoder, given the phrase-based model
# and the unigram language model.
# NOTE(review): the recorded result for this cell is an empty string, whereas the IBM Model 1
# decoder cells returned a full sentence for the same input. The input contains words that do
# not occur in f_corpus ("date", "not", "fetish") — check how Decoder.phrase_based handles them.
translator = pb_decoder.Decoder(pbtm, unigram_lm)
translator.translate("the date is not a fetish")

''