# Prior Language Models

*Computing of $p(x)$*

Run this before continuing so that the imports work.

In [1]:
from os import chdir, getcwd

if getcwd().endswith('notebooks'):
    chdir('..')

## Data loading

### Vocabulary initialisation

In [2]:
from data.vocab import vocabulary
from models.models import EOS_TOKEN, SOS_TOKEN, PADDING_TOKEN

print('|IPA| =', len(vocabulary)-3) # '-3' because `vocabulary` contains the IPA characters plus the special tokens listed below
for token in (SOS_TOKEN, EOS_TOKEN, PADDING_TOKEN):
    print(token, vocabulary[token])

|IPA| = 57
( 57
) 58
- 59


### Dataset initialisation

In [3]:
from data.getDataset import getLMTrainingSet

# Following 'Article Scientifique' there are three db of different sizes.
DB_SIZE = [20_000, 10_000, 5_000]

# Generate the three db of different sizes.
tokens_20k, tokens_10k, tokens_5k = getLMTrainingSet(DB_SIZE)

### Training load

In [4]:
from torch.utils.data.backward_compatibility import worker_init_fn
from torch.utils.data import DataLoader
from torchtext.datasets import CC100

dp = CC100(root='./out/cache', language_code='la')
DataLoader(dp, shuffle=True, num_workers=4, worker_init_fn=worker_init_fn, drop_last=True)

<torch.utils.data.dataloader.DataLoader at 0x1fdd7b32dc0>

### Evaluation load

In [5]:
from data.getDataset import getIteration
from data.vocab import computeInferenceData

sources = computeInferenceData(getIteration(3)[:, :24], vocabulary)
print(sources)

(tensor([[57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57,
         57, 57, 57, 57, 57, 57],
        [32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0],
        [ 1,  1,  1,  1, 55,  1,  1,  1,  1,  1,  1,  1, 43, 43,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1],
        [ 0,  0,  2, 32,  7,  9,  9,  9, 11, 17, 17, 30, 15, 15, 15, 15, 15, 15,
         15, 15, 15, 30, 30, 21],
        [11,  2,  6, 14, 32,  0,  0, 43, 36,  9, 10, 14, 37, 32, 37, 30, 16, 16,
         16, 16, 21, 11, 11, 15],
        [43, 32,  8,  0,  8, 56, 16, 56, 33,  6,  6, 16, 11, 11, 11, 14,  6,  6,
         14, 14, 14,  2,  2, 43],
        [58, 15,  0, 56, 16, 27,  6, 27,  0,  3, 11,  6, 16, 15, 16,  1, 11, 11,
          0,  0,  2,  0,  0, 58],
        [59,  0, 14, 15, 43, 30, 18, 30, 15, 58,  0, 18, 58,  0, 43, 14, 32, 37,
          8,  8, 43, 14, 11, 59],
        [59, 58, 58, 30, 58, 58, 43, 58, 30, 59,  1, 43, 59, 58, 58,  3, 11, 11

In [6]:
from data.vocab import oneHotsToWords

testIndexInBatch = 20
word = oneHotsToWords(sources[0][:,testIndexInBatch:testIndexInBatch+1], False, vocabulary)[0]
print(f"word for test: {word}\n")
print(f"word IntTensor: {sources[0][:, testIndexInBatch]}")

word for test: (absyrdʊ)---

word IntTensor: tensor([57,  0,  1, 15, 21, 14,  2, 43, 58, 59, 59, 59], device='cuda:0',
       dtype=torch.int32)


## RNN LM

### Initialisation

In [7]:
from torch.cuda import is_available
from lm.PriorLM import CharLM

device = 'cuda' if is_available() else 'cpu'

# Init the character level LSTM language model 
LSTM_lm = CharLM(embedding_size=1024, hidden_size=100, num_layers=2, dropout_rate=0.1, vocab=vocabulary).to(device)

  from .autonotebook import tqdm as notebook_tqdm


### Training

***`TODO`***

### Inference

In [8]:
probs = LSTM_lm.inference(sources)
print(f"p('{word}') =", probs[testIndexInBatch].item())

p('(absyrdʊ)---') = -28.352100372314453


## $n$-gram LM

### Initialisation

In [9]:
from lm.PriorLM import NGramLM

bigram_20k = NGramLM(n=2, vocab=vocabulary)
bigram_10k = NGramLM(n=2, vocab=vocabulary)
bigram_5k = NGramLM(n=2, vocab=vocabulary)

trigram_20k = NGramLM(n=3, vocab=vocabulary)
trigram_10k = NGramLM(n=3, vocab=vocabulary)
trigram_5k = NGramLM(n=3, vocab=vocabulary)

In [10]:
sentence_test = "absyrdʊ ifikare"
batch = bigram_20k.batch_ngram(sentence_test)
print(batch)

tensor([[[57,  0],
         [57,  6]],

        [[ 0,  1],
         [ 6,  4]],

        [[ 1, 15],
         [ 4,  6]],

        [[15, 21],
         [ 6,  8]],

        [[21, 14],
         [ 8,  0]],

        [[14,  2],
         [ 0, 14]],

        [[ 2, 43],
         [14,  3]],

        [[43, 58],
         [ 3, 58]]], device='cuda:0', dtype=torch.int32)


### Training

In [11]:
# TODO: Torch MP
bigram_20k.train(tokens_20k)
bigram_10k.train(tokens_10k)
bigram_5k.train(tokens_5k)

trigram_20k.train(tokens_20k)
trigram_10k.train(tokens_10k)
trigram_5k.train(tokens_5k)

tensor([[[ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           1.0000e-05,  1.0000e-05],
         [-4.5163e+00,  1.0000e-05, -5.2095e+00,  ...,  1.0000e-05,
          -2.8223e-01,  1.0000e-05],
         [-2.4849e+00,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
          -9.5343e-01,  1.0000e-05],
         ...,
         [ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           1.0000e-05,  1.0000e-05],
         [ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           0.0000e+00,  1.0000e-05],
         [ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           1.0000e-05,  1.0000e-05]],

        [[ 1.0000e-05, -2.6741e+00,  1.0000e-05,  ...,  1.0000e-05,
          -3.3673e+00,  1.0000e-05],
         [ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           1.0000e-05,  1.0000e-05],
         [ 1.0000e-05,  1.0000e-05,  1.0000e-05,  ...,  1.0000e-05,
           1.0000e-05,  1.0000e-05],
         ...,
         [ 1.0000e-05,  1

### Inference

In [14]:
bigram_20k.inference(sources)
bigram_10k.inference(sources)
bigram_5k.inference(sources)

trigram_20k.inference(sources)
trigram_10k.inference(sources)
trigram_5k.inference(sources)

tensor([-12.8235, -15.4207, -13.8246, -11.5589,  -9.7588,  -3.5543,  -5.3734,
         -3.5543,  -8.4310,  -3.5543,  -6.7731, -12.7137, -20.8093, -15.4825,
        -14.3738, -12.9782, -17.1908, -18.7406, -17.7046, -18.8678,  -8.4200,
        -18.5905, -25.0964,  -3.5543], device='cuda:0', dtype=torch.float64)