In [1]:
import sys
import os

# Resolve the parent of the current working directory to an absolute path so
# that modules located there can be imported by later cells.
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [2]:
import LM.lm as lm
import TM.ibm1 as tm
import Decoder.ibm1 as decoder
import utils as utils

In [3]:
# Pick the direction and load the two sides of the corpus from the JSON file.
# NOTE(review): what 'forward' selects is decided inside utils.load_data,
# which is not visible in this notebook.
direction = 'forward'
f_corpus, e_corpus = utils.load_data('../data/data.json', direction)
# Show both corpora, one list per line.
print(f_corpus, e_corpus, sep='\n')

['the girl is in france', 'paris is a city in france', 'the girl is beautiful', 'paris is a beautiful city', 'the girl is in church', 'a church is in paris', 'the church is beautiful', 'france has a beautiful church']
['girl be in france', 'paris be city in france', 'girl be beautiful', 'paris be beautiful city', 'girl be in church', 'church be in paris', 'church be beautiful', 'france have beautiful church']


## IBM Model 1 Translation Model

In [4]:
# Build the IBM Model 1 object from the two corpora, then preprocess and train.
ibm1 = tm.IBMModel1(f_corpus, e_corpus)
ibm1.preprocess()
# NOTE(review): the two positional arguments look like an iteration cap and a
# convergence tolerance — confirm against the train() signature in TM/ibm1.py.
ibm1.train(100_000_000, 1e-7)

# Dump both learned structures in full.
print('ibm1.translation_table: ', ibm1.translation_table)
print('ibm1.translation_tuple: ', ibm1.translation_tuple)

# Alternative, more readable listing (one pair per line, sorted by key):
# for f_e_pair, prob in sorted(ibm1.translation_tuple.items()):
#     print(f_e_pair, prob)

ibm1.translation_table:  defaultdict(<function IBMModel1.train.<locals>.<lambda> at 0x000002A11A2337F0>, {'the': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x000002A13A481990>, {'girl': 0.48389928480146044, 'be': 0.38397971962691213, 'in': 0.01893506411683187, 'france': 6.273033751106227e-60, 'beautiful': 0.05659296572741154, 'church': 0.05659296572738387, 'have': 0.0, 'city': 0.0, 'paris': 0.0}), 'girl': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x000002A13A481A20>, {'girl': 0.9623433360159952, 'be': 1.2401777814299425e-24, 'in': 0.03765666398400472, 'france': 1.247534852104058e-59, 'beautiful': 4.215313480557721e-73, 'church': 3.9738375795389604e-73, 'have': 0.0, 'city': 0.0, 'paris': 0.0}), 'is': defaultdict(<function IBMModel1.train.<locals>.<lambda>.<locals>.<lambda> at 0x000002A13A481AB0>, {'girl': 1.2736734644110412e-116, 'be': 0.9999993465829052, 'in': 6.534170946600426e-07, 'france': 2.2439676909151936e-76, '

## The Language Models

In [5]:
# Build the unigram language model from e_corpus, then run its preprocessing
# and training steps. The trained object is passed to the decoder later on.
unigram_lm = lm.Unigram(e_corpus)
unigram_lm.preprocess()
unigram_lm.train()

In [6]:
# Build the bigram language model from e_corpus, then run its preprocessing
# and training steps. The trained object is passed to the decoder later on.
bigram_lm = lm.Bigram(e_corpus)
bigram_lm.preprocess()
bigram_lm.train()

In [7]:
# Build the trigram language model from e_corpus, then run its preprocessing
# and training steps. The trained object is passed to the decoder later on.
trigram_lm = lm.Trigram(e_corpus)
trigram_lm.preprocess()
trigram_lm.train()

## The Decoder (Translation)

In [8]:
# Baseline: construct the decoder with only the trained IBM Model 1 object
# (no language model argument is passed).
translator = decoder.Decoder(ibm1)
# Bare last expression: the notebook displays the value returned by translate().
translator.translate("the girl is in france")

'girl girl be in france'

### With Unigram

In [10]:
# Rebind `translator` to a decoder that also receives the unigram language model.
# NOTE(review): how Decoder uses its second argument is not visible here.
translator = decoder.Decoder(ibm1, unigram_lm)
# Same input sentence as the other decoder cells, so outputs can be compared.
translator.translate("the girl is in france")

'be girl be in france'

### With Bigram

In [11]:
# Rebind `translator` to a decoder that also receives the bigram language model.
# NOTE(review): how Decoder uses its second argument is not visible here.
translator = decoder.Decoder(ibm1, bigram_lm)
# Same input sentence as the other decoder cells, so outputs can be compared.
translator.translate("the girl is in france")

'girl girl be in france'

### With Trigram

In [12]:
# Rebind `translator` to a decoder that also receives the trigram language model.
# NOTE(review): how Decoder uses its second argument is not visible here.
translator = decoder.Decoder(ibm1, trigram_lm)
# Same input sentence as the other decoder cells, so outputs can be compared.
translator.translate("the girl is in france")

'girl girl be in france'