# Sampling

In [1]:
from os import getcwd, chdir

# When the kernel was started from a directory whose path ends with
# 'notebooks', step one level up (presumably so that the project-relative
# imports and the './Tests/...' paths used below resolve -- confirm).
_startedInNotebooksDir = getcwd().endswith('notebooks')
if _startedInNotebooksDir:
    chdir('..')

## Get Data

In [2]:
import torch
from data.getDataset import getCognatesSet, getIteration
from data.vocab import computeInferenceData, PADDING_TOKEN, vocabulary, SOS_TOKEN, EOS_TOKEN
from torch.nn.utils.rnn import pad_sequence
from Types.articleModels import ModernLanguages
from Types.models import InferenceData
from Source.reconstructionModel import ReconstructionModel

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Inference data computed from each modern language's cognate set,
# keyed by language.
cognates: dict[ModernLanguages, InferenceData] = {
    language: computeInferenceData(cognateSet)
    for language, cognateSet in getCognatesSet().items()
}
# Inference data computed from the reconstructions of iteration 1.
currentReconstructions = computeInferenceData(getIteration(1))

# Dimensions handed to the reconstruction model's constructor.
LSTM_INPUT_DIM = 50
LSTM_HIDDEN_DIM = 50

randomEditModel = ReconstructionModel(
    cognates,
    vocabulary,
    LSTM_INPUT_DIM,
    LSTM_HIDDEN_DIM,
)

# Size of the first dimension of the first inference-data element, minus 2
# (presumably discounting the SOS/EOS boundary tokens -- confirm).
x_maxLength = currentReconstructions[0].size(0) - 2
print('|x| max =', x_maxLength)

|x| max = 16


Mock language model that returns a neutral (constant) probability, used for testing

In [None]:
from torch import Tensor
from Types.models import InferenceData
from lm.PriorLM import PriorLM
from numpy import ndarray

class LM(PriorLM):
    """Stand-in language model for tests: every reconstruction scores 1."""

    def __init__(self):
        # Deliberately empty: PriorLM.__init__ is not invoked here.
        pass

    def inference(self, reconstructions: InferenceData) -> ndarray:
        """Return an array of ones with len(reconstructions[1]) entries."""
        entryCount = len(reconstructions[1])
        return torch.ones(entryCount).numpy()


lm = LM()

## Generation

In [None]:
from Source.generateProposals import generateProposalsFromCurrentReconstructions

# Generate the proposals from the iteration-1 reconstructions and the cognate
# sets, then store them on disk so the following cells can reload them.
proposalsList = generateProposalsFromCurrentReconstructions(getIteration(1), getCognatesSet())
torch.save(proposalsList, './Tests/proposalsSet.pt')

In [None]:
!ls Tests

In [None]:
# Reload the saved proposals as NumPy arrays. map_location='cpu' remaps every
# stored tensor onto the CPU while loading, so a file that was saved from CUDA
# tensors can still be read on a machine without a GPU (and no GPU memory is
# allocated just to copy the data back to the host).
proposalsList = [p.numpy() for p in torch.load('./Tests/proposalsSet.pt', map_location='cpu')]

## Forward dynamic program

In [None]:
import numpy as np

# Draw b random proposals for each of the B entries of proposalsList and pad
# them into a single tensor.
# NOTE(review): the draw is repeated 100 times and only the last `samples` is
# kept -- this looks like a timing loop; confirm before relying on it.
# NOTE(review): a later cell loads './Tests/samples.pt', but this cell does
# not write that file.
B = len(proposalsList)
b = 40

# Number of available proposals for each entry.
proposalNumbers = np.array([len(p) for p in proposalsList], dtype=np.uint)
for it in range(100):
    # Uniform integer indexes in [0, proposalNumbers[p]) for every entry p.
    randomIndexes = np.floor(np.random.random((B, b))*proposalNumbers[:,np.newaxis]).astype(np.uint)
    # Use the notebook-wide `device` (falls back to the CPU) instead of a
    # hard-coded 'cuda', so the cell also runs on machines without a GPU.
    samples = pad_sequence([
        torch.as_tensor(proposals[indexes], dtype=torch.int, device=device).T
        for proposals, indexes in zip(proposalsList, randomIndexes)])
print(samples.size())

In [7]:
from data.vocab import computeInferenceData

# Load the stored samples onto the CPU, keep only the first slice along the
# third dimension, and turn the result into inference data.
sources = computeInferenceData(
    torch.load('./Tests/samples.pt', map_location=torch.device('cpu'))[:, :, :1]
)
print(sources[0].shape)
# Refresh the model's modern-form context before the dynamic program is run.
randomEditModel.update_modernForm_context()

torch.Size([26, 3213, 1])


In [8]:
# Run the forward dynamic program on the sources and show the size of the
# result stored under the 'french' key.
print(randomEditModel.forward_dynProg(sources)['french'].shape)

torch.Size([3213, 1])


## Sampling from proposals

In [None]:
from Source.sampling import metropolisHasting

# Run the Metropolis-Hastings routine on the loaded proposals, using the
# reconstruction model and the constant test language model defined above.
# NOTE(review): the structure of the returned `samples` is not visible here.
samples = metropolisHasting(proposalsList, randomEditModel, lm)