# Testing the Translation Model classes

In [1]:
import tm as tm
import utils as utils
import nltk

## Preparing the Data

### Load the Training Data

In [2]:
# Load the two sides of the parallel corpus in a fixed order:
# first the English sentences (aslg.en), then the processed ASL gloss file.
f_corpus, e_corpus = map(
    utils.load_data,
    ("data/small/aslg.en", "data/small/aslg.processed.gloss.asl"),
)

# Show both raw corpora so the sentence pairing can be checked by eye.
print('f_corpus', f_corpus)
print('e_corpus', e_corpus)

f_corpus ['the date is not a fetish .', 'the results speak for themselves .', 'it should mean that everyone is given an equal opportunity .', 'europe has done a very great deal of work in this field , which is to be welcomed .', 'we simply have to build it stage by stage .', 'it is an issue of proportionality .', 'we are talking about rights , plurality and freedom .', 'that attempt did not succeed , but it does represent a precedent for that kind of gathering .', 'this green paper is important , seeing as it concerns a matter that needs to be dealt with .', 'mr audy asked the following question are your samples of sufficient size ?', 'it is up to you to consider and choose the outcome you want to see .', 'what can we expect from china ?']
e_corpus ['date be not fetish .', 'result speak for mselves .', 'it should mean that everyone be give equal opportunity .', 'europe have do very great deal work in this field , which be to be welcome .', 'we simply have to build it stage by stage .',

### Preprocessing the Data

In [5]:
# Preprocess each corpus. utils.preprocess returns two values; only the second
# is kept here (per the printed output: a list of token lists wrapped in
# '<s>' ... '</s>' markers). The first value is discarded.
# NOTE(review): f_corpus / e_corpus are reassigned in place, so re-running this
# cell feeds the already-preprocessed output back into utils.preprocess.
# Re-run the loading cell first if this cell needs to be executed again.
_, f_corpus = utils.preprocess(f_corpus)
print('f_corpus',f_corpus)
_, e_corpus = utils.preprocess(e_corpus)
print('e_corpus',e_corpus)

f_corpus [['<s>', 'the', 'date', 'is', 'not', 'a', 'fetish', '.', '</s>'], ['<s>', 'the', 'results', 'speak', 'for', 'themselves', '.', '</s>'], ['<s>', 'it', 'should', 'mean', 'that', 'everyone', 'is', 'given', 'an', 'equal', 'opportunity', '.', '</s>'], ['<s>', 'europe', 'has', 'done', 'a', 'very', 'great', 'deal', 'of', 'work', 'in', 'this', 'field', ',', 'which', 'is', 'to', 'be', 'welcomed', '.', '</s>'], ['<s>', 'we', 'simply', 'have', 'to', 'build', 'it', 'stage', 'by', 'stage', '.', '</s>'], ['<s>', 'it', 'is', 'an', 'issue', 'of', 'proportionality', '.', '</s>'], ['<s>', 'we', 'are', 'talking', 'about', 'rights', ',', 'plurality', 'and', 'freedom', '.', '</s>'], ['<s>', 'that', 'attempt', 'did', 'not', 'succeed', ',', 'but', 'it', 'does', 'represent', 'a', 'precedent', 'for', 'that', 'kind', 'of', 'gathering', '.', '</s>'], ['<s>', 'this', 'green', 'paper', 'is', 'important', ',', 'seeing', 'as', 'it', 'concerns', 'a', 'matter', 'that', 'needs', 'to', 'be', 'dealt', 'with', '.

## The Translation Models

### IBM Model 1

In [6]:
# Build an IBM Model 1 instance from the preprocessed parallel corpus.
# NOTE(review): num_iters=5 is presumably the number of EM training
# iterations -- confirm against tm.IBMModel1.
ibm1 = tm.IBMModel1(f_corpus, e_corpus, num_iters=5)
# Dump the learned table. As printed, it is a nested defaultdict mapping a
# token to a dict of {token: score}; the output is large for real corpora.
print('ibm1.translation_table',ibm1.translation_table)

ibm1.translation_table defaultdict(<function IBMModel1.__init__.<locals>.<lambda> at 0x000001B640B4AC10>, {'<s>': defaultdict(<function IBMModel1.__init__.<locals>.<lambda>.<locals>.<lambda> at 0x000001B640B4AD30>, {'<s>': 0.2558245766178133, 'date': 0.002566566874374593, 'be': 0.13146569504072217, 'not': 0.008719061483301711, 'fetish': 0.002566566874374593, '.': 0.2094591303781147, '</s>': 0.2558245766178133, 'result': 0.0010901084524122158, 'speak': 0.0010901084524122158, 'for': 0.005269638782883964, 'mselves': 0.0010901084524122158, 'it': 0.040068295458899256, 'should': 0.0005764997439362683, 'mean': 0.0005764997439362683, 'that': 0.007642033469329306, 'everyone': 0.0005764997439362683, 'give': 0.0005764997439362683, 'equal': 0.0005764997439362683, 'opportunity': 0.0005764997439362683, 'europe': 0.00027081568535866835, 'have': 0.003849323551886079, 'do': 0.003174565280712972, 'very': 0.00027081568535866835, 'great': 0.00027081568535866835, 'deal': 0.0018899401145721832, 'work': 0.00

#### Test the Model

In [13]:
# Word-by-word translation smoke test: each source token is replaced by the
# highest-scoring entry in its row of the translation table; tokens the model
# has never seen are copied through unchanged.
f_sent = "this is yousef ."
f_sent_tokens = nltk.word_tokenize(f_sent.lower())

e_sent_tokens = []
for f_sent_token in f_sent_tokens:
    # Use .get() rather than [...]: translation_table is a defaultdict, so
    # indexing an unseen token would silently insert an empty row into the
    # model, and that mutated model is what gets saved later in the notebook.
    translation_probs = ibm1.translation_table.get(f_sent_token)
    if translation_probs:
        # Pick the target token with the largest score in this row.
        best_e_sent_token = max(translation_probs, key=translation_probs.get)
        e_sent_tokens.append(best_e_sent_token)
    else:
        # Unknown (or empty-row) source token: fall back to the token itself.
        e_sent_tokens.append(f_sent_token)

print('e_sent_tokens before removing the None tokens', e_sent_tokens)
# Safeguard only: neither branch above appends None unless the table itself
# contains a None key, so this filter normally leaves the list unchanged.
e_sent_tokens = [token for token in e_sent_tokens if token is not None]
print('e_sent_tokens after removing the None tokens', e_sent_tokens)
e_sent_sent = ' '.join(e_sent_tokens)
print(e_sent_sent)

e_sent_tokens before removing the None tokens ['deal', 'be', 'yousef', '.']
e_sent_tokens after removing the None tokens ['deal', 'be', 'yousef', '.']
deal be yousef .


## Saving the Models

In [None]:
# Save the trained IBM Model 1 translation model to disk.
# NOTE(review): the .pkl extension suggests utils.save_model pickles the
# object -- confirm; only load such files from trusted sources.
utils.save_model(ibm1, "models/ibm1_model.pkl")