# Testing the LM classes

### TODO
- Handle the unseen and unkown problems.
- Vocabs can be derived from the training data with cutoff (unk_cutoff).

In [9]:
import utils as utils
import lm as lm

## Preparing the Data

### Load the Training Data

In [10]:
e_corpus = utils.load_data("data/small/aslg.processed.gloss.asl")
print('e_corpus',e_corpus)

e_corpus ['date be not fetish .', 'result speak for mselves .', 'it should mean that everyone be give equal opportunity .', 'europe have do very great deal work in this field , which be to be welcome .', 'we simply have to build it stage by stage .', 'it be issue proportionality .', 'we be talk about rights , plurality and freedom .', 'that attempt do not succeed , but it do represent precedent for that kind garing .', 'this green paper be important , see as it concern matter that need to be deal with .', 'mr audy ask follow question be you sample sufficient size ?', 'it be up to you to consider and choose outcome you want to see .', 'what can we expect from china ?']


### Preprocessing the Data

In [11]:
e_vocabs, e_corpus = utils.preprocess(e_corpus)
print('vocabs',e_vocabs)
print('e_corpus',e_corpus)

vocabs ['expect', 'with', 'opportunity', 'work', 'you', 'very', 'but', 'outcome', 'mean', 'about', 'garing', 'stage', 'everyone', 'kind', 'and', 'freedom', 'it', 'sufficient', '.', 'by', 'concern', 'paper', 'from', '?', 'follow', '<s>', 'audy', 'great', 'size', 'simply', 'important', 'green', '</s>', 'ask', 'represent', 'result', 'not', 'we', 'this', 'be', 'proportionality', 'fetish', 'up', 'mselves', 'issue', 'date', 'equal', 'have', 'do', 'that', 'should', 'rights', 'mr', 'china', ',', 'need', 'which', 'europe', 'can', 'want', 'in', 'attempt', 'precedent', 'matter', 'choose', 'see', 'talk', 'welcome', 'field', 'speak', 'consider', 'to', 'question', 'plurality', 'as', 'what', 'sample', 'for', 'give', 'succeed', 'deal', 'build']
e_corpus [['<s>', 'date', 'be', 'not', 'fetish', '.', '</s>'], ['<s>', 'result', 'speak', 'for', 'mselves', '.', '</s>'], ['<s>', 'it', 'should', 'mean', 'that', 'everyone', 'be', 'give', 'equal', 'opportunity', '.', '</s>'], ['<s>', 'europe', 'have', 'do', 've

## The Language Models

### Unigram

In [12]:
unigram_lm = lm.Unigram(e_vocabs, e_corpus)
unigram_lm.train()
probability = unigram_lm.calc_probability('be')
print('probability',probability)
sentence_probability = unigram_lm.calc_sentence_probability('date be not fetish .')
print('sentence_probability',sentence_probability)
log_probability = unigram_lm.calc_log_probability('be')
print('log_probability',log_probability)
log_sentence_probability = unigram_lm.calc_log_sentence_probability('date be not fetish .')
print('log_sentence_probability',log_sentence_probability)

probability 0.046610169491525424
sentence_probability 6.0182371912008816e-12
log_probability -3.0659365322272394
log_sentence_probability -25.83622672487146


#### Handling the unseen words

In [13]:
probability = unigram_lm.calc_probability('cmp')
print('probability',probability)

probability 0.00423728813559322


### Bigram

In [14]:
bigram_lm = lm.Bigram(e_vocabs, e_corpus)
bigram_lm.train()
probability = bigram_lm.calc_probability('be','not')
print('probability',probability)
sentence_probability = bigram_lm.calc_sentence_probability('date be not fetish .')
print('sentence_probability',sentence_probability)
log_probability = bigram_lm.calc_log_probability('be','not')
print('log_probability',log_probability)
log_sentence_probability = bigram_lm.calc_log_sentence_probability('date be not fetish .')
print('log_sentence_probability',log_sentence_probability)

probability 0.021505376344086023
sentence_probability 7.142917979414252e-10
log_probability -3.8394523125933104
log_sentence_probability -21.059729556485898


### Trigram

In [15]:
trigram_lm = lm.Trigram(e_vocabs, e_corpus)
trigram_lm.train()
probability = trigram_lm.calc_probability('be','not','fetish')
print('probability',probability)
sentence_probability = trigram_lm.calc_sentence_probability('date be not fetish .')
print('sentence_probability',sentence_probability)
log_probability = trigram_lm.calc_log_probability('be','not','fetish')
print('log_probability',log_probability)
log_sentence_probability = trigram_lm.calc_log_sentence_probability('date be not fetish .')
print('log_sentence_probability',log_sentence_probability)

probability 0.023809523809523808
sentence_probability 7.651622719418543e-09
log_probability -3.7376696182833684
log_sentence_probability -18.68834809141684


###  Interpolation

In [16]:
ilm = lm.LM(e_vocabs, e_corpus)
ilm.train()
sentence_probability = ilm.calc_sentence_probability('date be not fetish .')
print('sentence_probability',probability)
log_sentence_probability = ilm.calc_log_sentence_probability('date be not fetish .')
print('log_sentence_probability',log_sentence_probability)

sentence_probability 0.023809523809523808
log_sentence_probability -17.597517215733067


## Saving the Models

In [None]:
# save the language models
utils.save_model(unigram_lm, "models/unigram_model.pkl")
utils.save_model(bigram_lm, "models/bigram_model.pkl")
utils.save_model(trigram_lm, "models/trigram_model.pkl")