# Prior Language Models

*Computing $p(x)$*

`config`

In [1]:
from os import chdir, getcwd

# When the kernel starts inside a directory whose path ends with 'notebooks',
# move one level up — presumably so the project-local imports in the cells
# below resolve from the repository root (confirm against the repo layout).
startedInNotebooksDir = getcwd().endswith('notebooks')
if startedInNotebooksDir:
    chdir('..')

# Data loading

### Vocabulary initialisation

In [2]:
from data.vocab import vocabulary
from models.models import EOS_TOKEN, SOS_TOKEN, PADDING_TOKEN

# Special (non-IPA) tokens stored in the vocabulary; listed once so that the
# count subtracted below always matches the tokens printed by the loop.
specialTokens = (EOS_TOKEN, SOS_TOKEN, PADDING_TOKEN)

# Number of IPA symbols = vocabulary size minus the special tokens.
print('|IPA| =', len(vocabulary) - len(specialTokens))
for token in specialTokens:
    # Show each special token together with the value the vocabulary maps it to.
    print(token, vocabulary[token])

|IPA| = 57
) 57
( 58
- 59


### Evaluation load

In [3]:
from data.getDataset import getIteration
from data.vocab import computeInferenceData

# Fetch the data of iteration 3 and keep only the first 24 entries along
# axis 1 (presumably the first 24 words of the batch — TODO confirm).
iterationData = getIteration(3)[:, :24]

# Convert the selected data into the structure consumed by the language
# models' `inference` methods.
sources = computeInferenceData(iterationData, vocabulary)

In [4]:
from data.vocab import oneHotsToWords

# Index (along axis 1 of `sources[0]`) of the word used as a running example.
testIndexInBatch = 20

# A length-1 slice keeps the indexed axis, so the helper still receives a
# 2-D tensor holding a single column.
testColumn = slice(testIndexInBatch, testIndexInBatch + 1)
word = oneHotsToWords(sources[0][:, testColumn], False, vocabulary)[0]

print("word for test =", word)
# Header and raw integer encoding of the same word, one per line.
print("\n word IntTensor:", sources[0][:, testIndexInBatch], sep="\n")

word for test = (absyrdʊ)---

 word IntTensor:
tensor([58,  0,  1, 15, 21, 14,  2, 43, 57, 59, 59, 59], dtype=torch.int32)


### Training load

In [8]:
from torch.utils.data.backward_compatibility import worker_init_fn
from torch.utils.data import DataLoader
from torchtext.datasets import CC100

# CC100 corpus for language code 'la', stored under ./out/cache.
dp = CC100(root='./out/cache', language_code='la')

# Bind the loader to a name so that later (training) cells can iterate over
# it; previously the object was created and immediately discarded.
trainLoader = DataLoader(dp, shuffle=True, num_workers=4, worker_init_fn=worker_init_fn, drop_last=True)

# Last expression of the cell: keeps the loader's repr as the cell output.
trainLoader

<torch.utils.data.dataloader.DataLoader at 0x232a517c700>

## RNN LM

### Inference

In [9]:
from lm.PriorLM import CharLM

# Hyper-parameters of the character-level RNN language model.
EMBEDDING_SIZE = 1024
HIDDEN_SIZE = 100
NUMBER_OF_LAYER = 2

# Pass the named constants instead of repeating their literal values, so that
# editing a constant above actually changes the model that is built.
# NOTE(review): the 4th positional argument (0.1) is presumably the dropout
# rate — confirm against CharLM's signature.
lm = CharLM(EMBEDDING_SIZE, HIDDEN_SIZE, NUMBER_OF_LAYER, 0.1, vocabulary)

In [10]:
# Score every word of the evaluation batch with the RNN language model.
probs = lm.inference(sources)

# NOTE(review): the printed value is negative, so `inference` presumably
# returns log-probabilities rather than probabilities — confirm.
testWordScore = probs[testIndexInBatch].item()
print(f"p('{word}') =", testWordScore)

p('(absyrdʊ)---') = -32.569068908691406


### Training

***`TODO`***

## $n$-gram LM

* If $L < n$:

If the sequence's length is lower than $n$ (the order of the $n$-gram), the sequence is padded with closing boundaries `')'`.<br>
For instance, the string `'(b)'` will be processed by the 4-gram model as `'(b))'`, so that at least one inference can be done in the language model.

In training, the probabilities of this kind of $n$-gram will be computed by lower-order $(n-k)$-gram models.

* If $L < L_\textrm{max}$:

The inference on a batch of strings forces the algorithm to process $n$-grams containing empty padding characters. To neutralize their useless probabilities, the empty characters are represented with a "full one-hot" vector, i.e. a vector whose values all equal `1`. The computed transition probabilities can then be higher than 1, so they are afterwards limited to 1 with the max operator.

***`TODO`: review this section with new NGram implementation***

### Inference

In [None]:
import torch
from torch.nn.utils.rnn import pack_sequence
from torch.nn.functional import one_hot

# Toy vocabulary: three letters plus the opening '(' and closing ')' boundaries.
V = {'a':0, 'b':1, 'c':2, "(":3, ')':4}
# Inverse mapping (index -> character), derived from V so the two cannot
# drift apart; evaluates to ['a', 'b', 'c', '(', ')'].
V_inv = sorted(V, key=V.get)
raw_batch = ['(ab)', '(abcb)', '(cba)', '(b)']

# One-hot encode each word character by character. `num_classes` is given
# explicitly: without it `one_hot` infers the width from the largest index
# present in each word, which only matched len(V) here because every word
# happens to contain ')' (the highest index).
batch = pack_sequence(
    [one_hot(torch.LongTensor([V[c] for c in w]), num_classes=len(V)) for w in raw_batch],
    enforce_sorted=False,  # words are not sorted by decreasing length
)

from lm.PriorLM import NGramLM

# NOTE(review): 4 is presumably the n-gram order (the text above discusses a
# 4-gram) and [] an empty vocabulary/data argument — confirm against NGramLM.
model = NGramLM([], 4)
# The result of padDataToNgram is exponentiated, so it is presumably expressed
# in log space — confirm.
paddedData = torch.exp(model.padDataToNgram((batch, None, None)))

# Iterate over the batch itself rather than a hard-coded range(4), so the
# display stays in sync if `raw_batch` is edited.
for w, raw_word in enumerate(raw_batch):
    print(f'\"{raw_word}\"')
    print(paddedData[:,w,:]) # dim = (L, V)



### Training