# Testing the Language Model

In [2]:
import numpy as np
import LM.lm as lm
import utils as utils

### Load the Training Data

In [3]:
f_corpus, e_corpus = utils.load_data("../data/small.json", 100)
print('f_corpus len', len(f_corpus))
print('e_corpus len', len(e_corpus))
print('f_corpus', f_corpus)
print('e_corpus', e_corpus)

f_corpus len 100
e_corpus len 100
f_corpus ['the date is not a fetish', 'the results speak for themselves', 'it should mean that everyone is given an equal opportunity', 'europe has done a very great deal of work in this field which is to be welcomed', 'we simply have to build it stage by stage', 'it is an issue of proportionality', 'we are talking about rights plurality and freedom', 'that attempt did not succeed but it does represent a precedent for that kind of gathering', 'this green paper is important seeing as it concerns a matter that needs to be dealt with', 'mr audy asked the following question are your samples of sufficient size', 'it is up to you to consider and choose the outcome you want to see', 'what can we expect from china', 'we must pay by giving financial support to developing countries', 'i will however refer your request to the conference of presidents which will meet later this afternoon', 'it says that the act also complies with international standards and pursue

## Unigram

In [4]:
unigram_lm = lm.Unigram(e_corpus)
unigram_lm.preprocess()
unigram_lm.train()
# unigram_lm.print_ds()

### Words Probability

In [8]:
probability = unigram_lm.calc_probability('be')
print('probability',probability)
log_probability = unigram_lm.calc_log_probability('be')
print('log_probability',log_probability)

probability 0.03997715591090805
log_probability -3.219447090236493


In [9]:
probability = unigram_lm.calc_probability('is')
print('probability',probability)
log_probability = unigram_lm.calc_log_probability('is')
print('log_probability',log_probability)

probability 0.0005711022272986865
log_probability -7.467942332285852


### Sentence Probability

In [10]:
sentence_probability = unigram_lm.calc_sentence_probability('date be not fetish')
print('sentence_probability',sentence_probability)
log_sentence_probability = unigram_lm.calc_log_sentence_probability('date be not fetish')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 1.9324956000029484e-12
log_sentence_probability -26.972208891293644


-26.97220889129364

In [11]:
sentence_probability = unigram_lm.calc_sentence_probability('Unseen sentence Unseen sentence')
print('sentence_probability',sentence_probability)
log_sentence_probability = unigram_lm.calc_log_sentence_probability('Unseen sentence Unseen sentence')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 3.539369230774631e-16
log_sentence_probability -35.57741296003259


-35.57741296003259

### Probability Change

In [8]:
print(unigram_lm.calc_log_sentence_probability('My name'))
print(unigram_lm.calc_log_sentence_probability('My name is John'))
print(unigram_lm.calc_log_sentence_probability('My name called Ali'))
print(unigram_lm.probability_change('My name','is John' ))
print(unigram_lm.probability_change('My name','called Ali'))

-11.059519441353883
-16.994744270138987
-19.07418581181883
-5.935224828785104
-8.014666370464946


### Perplexity

In [9]:
training_corpus = ["The quick brown fox jumps over the lazy dog", "The quick brown fox jumps over the lazy dog"]
test_corpus = ["The lazy brown dog jumps over the quick fox"]

model = lm.Unigram(training_corpus)
model.preprocess()
model.train()

perplexity = model.perplexity(test_corpus)
print("Perplexity:", perplexity)

Perplexity: 10.024357553087649


## Bigram

In [10]:
bigram_lm = lm.Bigram(e_corpus)
bigram_lm.preprocess()
bigram_lm.train()
# bigram_lm.print_ds()

### Words Probability

In [11]:
probability = bigram_lm.calc_probability('name','is')
print('probability',probability)
log_probability = bigram_lm.calc_log_probability('name','is')
print('log_probability',log_probability)

probability 0.08695652173913043
log_probability -2.4423470353692043


In [12]:
probability = bigram_lm.calc_probability('unseen','unseen')
print('probability',probability)
log_probability = bigram_lm.calc_log_probability('unseen','unseen')
print('log_probability',log_probability)

probability 0.045454545454545456
log_probability -3.0910424533583156


### Sentence Probability

In [13]:
sentence_probability = bigram_lm.calc_sentence_probability('My name is John')
print('sentence_probability',sentence_probability)
log_sentence_probability = bigram_lm.calc_log_sentence_probability('My name is John')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 3.896392488485855e-06
log_sentence_probability -12.455459435860252


-12.455459435860252

In [14]:
sentence_probability = bigram_lm.calc_sentence_probability('Unseen sentence Unseen sentence')
print('sentence_probability',sentence_probability)
log_sentence_probability = bigram_lm.calc_log_sentence_probability('Unseen sentence Unseen sentence')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 1.5810496651969228e-07
log_sentence_probability -15.660006679437593


-15.660006679437593

### Probability Change

In [15]:
print(bigram_lm.calc_log_sentence_probability('My name'))
print(bigram_lm.calc_log_sentence_probability('My name is John'))
print(bigram_lm.calc_log_sentence_probability('My name called Ali'))
print(bigram_lm.probability_change('My name','is John' ))
print(bigram_lm.probability_change('My name','called Ali'))

-8.180530936742738
-12.455459435860252
-14.36261584345937
-4.2749284991175145
-6.182084906716632


### Perplexity

In [16]:
training_corpus = ["The quick brown fox jumps over the lazy dog", "The quick brown fox jumps over the lazy dog"]
test_corpus = ["The lazy brown dog jumps over the quick fox"]

model = lm.Bigram(training_corpus)
model.preprocess()
model.train()

perplexity = model.perplexity(test_corpus)
print("Perplexity:", perplexity)

Perplexity: 7.723467195920273


## Trigram

In [17]:
trigram_lm = lm.Trigram(e_corpus)
trigram_lm.preprocess()
trigram_lm.train()
# trigram_lm.print_ds()

### Words Probability

In [18]:
probability = trigram_lm.calc_probability('name','is','john')
print('probability',probability)
log_probability = trigram_lm.calc_log_probability('name','is','john')
print('log_probability',log_probability)

probability 0.08695652173913043
log_probability -2.4423470353692043


In [19]:
probability = trigram_lm.calc_probability('unseen','unseen','unseen')
print('probability',probability)
log_probability = trigram_lm.calc_log_probability('unseen','unseen','unseen')
print('log_probability',log_probability)

probability 0.045454545454545456
log_probability -3.0910424533583156


### Sentence Probability

In [20]:
sentence_probability = trigram_lm.calc_sentence_probability('My name is John')
print('sentence_probability',sentence_probability)
log_sentence_probability = trigram_lm.calc_log_sentence_probability('My name is John')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 5.717532455930331e-05
log_sentence_probability -9.769388141476817


-9.769388141476817

In [21]:
sentence_probability = trigram_lm.calc_sentence_probability('Unseen sentence Unseen sentence')
print('sentence_probability',sentence_probability)
log_sentence_probability = trigram_lm.calc_log_sentence_probability('Unseen sentence Unseen sentence')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 4.268834096031692e-06
log_sentence_probability -12.364169813433262


-12.364169813433262

### Probability Change

In [22]:
print(trigram_lm.calc_log_sentence_probability('My name'))
print(trigram_lm.calc_log_sentence_probability('My name is John'))
print(trigram_lm.calc_log_sentence_probability('My name called Ali'))
print(trigram_lm.probability_change('My name','is John' ))
print(trigram_lm.probability_change('My name','called Ali'))

-5.577841251298354
-9.769388141476817
-11.759926158014986
-4.191546890178463
-6.182084906716632


### Perplexity

In [23]:
training_corpus = ["The quick brown fox jumps over the lazy dog", "The quick brown fox jumps over the lazy dog"]
test_corpus = ["The lazy brown dog jumps over the quick fox"]

model = lm.Trigram(training_corpus)
model.preprocess()
model.train()

perplexity = model.perplexity(test_corpus)
print("Perplexity:", perplexity)

Perplexity: 10.68280100505348


##  Interpolation

In [24]:
ilm = lm.LM(e_corpus)
ilm.preprocess()
ilm.train()

In [25]:
sentence_probability = ilm.calc_sentence_probability('My name is John')
print('sentence_probability', sentence_probability)
log_sentence_probability = ilm.calc_log_sentence_probability('My name is John')
print('log_sentence_probability', log_sentence_probability)
np.log(sentence_probability)

sentence_probability 2.8748583392406415e-05
log_sentence_probability -10.456922065242487


-10.456922065242487

In [26]:
sentence_probability = ilm.calc_sentence_probability('Unseen sentence Unseen sentence')
print('sentence_probability',sentence_probability)
log_sentence_probability = ilm.calc_log_sentence_probability('Unseen sentence Unseen sentence')
print('log_sentence_probability',log_sentence_probability)
np.log(sentence_probability)

sentence_probability 2.4891571613960787e-06
log_sentence_probability -12.903566394187292


-12.90356639418729

### Probability Change

In [27]:
print(ilm.calc_log_sentence_probability('My name'))
print(ilm.calc_log_sentence_probability('My name is John'))
print(ilm.calc_log_sentence_probability('My name called Ali'))
print(ilm.probability_change('My name','is John' ))
print(ilm.probability_change('My name','called Ali'))

-5.359402967288709
-10.456922065242487
-12.273515929097197
-5.0975190979537786
-6.914112961808488


## Saving the Models

In [28]:
# save the language models
utils.save_model(unigram_lm, "models/unigram_model.pkl")
utils.save_model(bigram_lm, "models/bigram_model.pkl")
utils.save_model(trigram_lm, "models/trigram_model.pkl")