# Prior Language Models

*Computing $p(x)$*

Run this before continuing so that the imports work.

In [None]:
from os import chdir, getcwd

# When the kernel is started from inside a directory whose name ends with
# 'notebooks', step up one level so the project-relative imports below resolve.
launched_from_notebooks = getcwd().endswith('notebooks')
if launched_from_notebooks:
    chdir('..')

## Data loading

### Vocabulary initialisation

In [None]:
from data.vocab import vocabulary
from models.types import EOS_TOKEN, SOS_TOKEN, PADDING_TOKEN

# `vocabulary` contains the IPA characters plus the special tokens listed
# here, so the IPA inventory size is the vocabulary size minus the number of
# special tokens. Deriving the offset from the tuple keeps the count and the
# loop below in sync if a special token is ever added or removed.
special_tokens = (SOS_TOKEN, EOS_TOKEN, PADDING_TOKEN)

print('|IPA| =', len(vocabulary) - len(special_tokens))
for token in special_tokens:
    # Show each special token next to the entry the vocabulary maps it to.
    print(token, vocabulary[token])

### Dataset initialisation

In [None]:
from data.getDataset import getLMTrainingSet

# Three training-set sizes, following the reference article ('Article Scientifique').
DB_SIZE = [20_000, 10_000, 5_000]

# Build the three training sets in a single call. The result is unpacked into
# three names, presumably in the same order as DB_SIZE (20k, 10k, 5k) —
# TODO confirm against data.getDataset.getLMTrainingSet.
tokens_20k, tokens_10k, tokens_5k = getLMTrainingSet(DB_SIZE)

### Training load

In [None]:
from torch.utils.data.backward_compatibility import worker_init_fn
from torch.utils.data import DataLoader
from torchtext.datasets import CC100

# CC-100 corpus for language code 'la', cached under ./out/cache.
dp = CC100(root='./out/cache', language_code='la')

# Bind the loader to a name so that the (still to be written) training section
# can iterate over it; an unbound DataLoader is garbage as soon as the cell ends.
# NOTE(review): batch_size is left at its default of 1, so drop_last=True never
# drops anything — set batch_size explicitly once training is implemented.
# NOTE(review): recent PyTorch releases warn that
# backward_compatibility.worker_init_fn is deprecated — confirm against the
# pinned torch version before removing it.
train_loader = DataLoader(
    dp,
    shuffle=True,
    num_workers=4,
    worker_init_fn=worker_init_fn,
    drop_last=True,
)
train_loader  # keep echoing the loader as the cell output

### Evaluation load

In [None]:
from data.getDataset import getIteration
from data.vocab import computeInferenceData_Cognates

# Keep only the first 24 items of what getIteration(3) returns, then turn them
# into inference data using the shared vocabulary.
cognates_sample = getIteration(3)[:24]
sources = computeInferenceData_Cognates(cognates_sample, vocabulary)
print(sources)

In [None]:
from data.vocab import oneHotsToWords

# Position, inside the evaluation batch, of the word used as a running example.
testIndexInBatch = 20

# A width-one slice (rather than a plain integer index) is used on the second
# axis so that axis is kept when the data is handed to oneHotsToWords.
# Assumes sources[0] is indexed as (position in word, index in batch) — TODO confirm.
test_column = sources[0][:, testIndexInBatch:testIndexInBatch + 1]
word = oneHotsToWords(test_column, False, vocabulary)[0]

print(f"word for test: {word}\n")
print(f"word IntTensor: {sources[0][:, testIndexInBatch]}")

## RNN LM

### Initialisation

In [None]:
from torch.cuda import is_available
from lm.PriorLM import CharLM

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Character-level LSTM language model, moved to the selected device.
# NOTE(review): embedding_size (1024) is much larger than hidden_size (100);
# confirm the two values are not swapped.
LSTM_lm = CharLM(
    embedding_size=1024,
    hidden_size=100,
    num_layers=2,
    dropout_rate=0.1,
    vocab=vocabulary,
).to(device)

### Training

***`TODO`***

### Inference

In [None]:
# Score the whole evaluation batch with the LSTM LM, then report the value
# obtained for the test word.
# NOTE(review): the Training section of this notebook is still a TODO, so
# unless weights are loaded elsewhere this comes from an untrained model.
probs = LSTM_lm.inference(sources)
test_word_prob = probs[testIndexInBatch].item()
print(f"p('{word}') =", test_word_prob)

## $n$-gram LM

### Initialisation

In [None]:
from lm.PriorLM import NGramLM

# One independent bigram model and one independent trigram model per
# training-set size (20k, 10k, 5k). Each generator yields three fresh
# NGramLM instances, which are unpacked in the same order as before.
bigram_20k, bigram_10k, bigram_5k = (
    NGramLM(n=2, vocab=vocabulary) for _ in range(3)
)

trigram_20k, trigram_10k, trigram_5k = (
    NGramLM(n=3, vocab=vocabulary) for _ in range(3)
)

In [None]:
# Run the bigram model's batch_ngram on a sample IPA string and print what it
# returns (presumably the string's n-gram batch — confirm in lm.PriorLM).
sentence_test = "absyrdʊ ifikare"
batch = bigram_20k.batch_ngram(sentence_test)
print(batch)

### Training

In [None]:
# TODO: Torch MP
# (presumably torch.multiprocessing, to run the six trainings in parallel —
# confirm with the author; for now they run one after another.)

# Bigram models: each one is trained on the training set matching its name.
bigram_20k.train(tokens_20k)
bigram_10k.train(tokens_10k)
bigram_5k.train(tokens_5k)

# Trigram models: same three training sets as above.
trigram_20k.train(tokens_20k)
trigram_10k.train(tokens_10k)
trigram_5k.train(tokens_5k)

### Inference

In [None]:
# Run every n-gram model on the same evaluation batch and keep each result
# under the model's name. Bare expression statements would only let the
# notebook echo the last call and would throw the other five results away.
ngram_probs = {
    'bigram_20k': bigram_20k.inference(sources),
    'bigram_10k': bigram_10k.inference(sources),
    'bigram_5k': bigram_5k.inference(sources),
    'trigram_20k': trigram_20k.inference(sources),
    'trigram_10k': trigram_10k.inference(sources),
    'trigram_5k': trigram_5k.inference(sources),
}

# Show all six results so the models can be compared side by side.
for model_name, model_probs in ngram_probs.items():
    print(f"{model_name}:", model_probs)