# Sampling

In [None]:
from os import getcwd, chdir

# When the kernel is started from inside a directory whose path ends with
# `notebooks`, move one level up (presumably to the project root, so that the
# `uneurecon` package and relative paths resolve — TODO confirm).
_started_in_notebooks_dir = getcwd().endswith('notebooks')
if _started_in_notebooks_dir:
    chdir('..')

## Get cognate Data

For the generation, the cognates must already be converted to the ByteTensor format. Each ByteTensor in the list of dictionaries has the elementary shape $\left( |y_{c, l}| \right)$.

In [None]:
import torch
from torch import tensor, Tensor, uint8
from uneurecon.data.vocab import get_vocabulary
from uneurecon.data.getDataset import getCognatesSet
from uneurecon.Source.utils import dl_to_ld
from uneurecon.models.types import ModernLanguages

# Only the first element returned by `get_vocabulary()` is used as the vocabulary.
vocabulary = get_vocabulary()[0]

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("On CUDA device:", device == "cuda")

# One dictionary per cognate group, mapping each language to its word encoded
# through the vocabulary as a uint8 tensor placed on `device`.
cognates: list[dict[ModernLanguages, Tensor]] = [
    {
        language: tensor(data=vocabulary(list(group[language])), dtype=uint8, device=device)
        for language in group
    }
    for group in dl_to_ld(getCognatesSet())
]

## Sampling hyperparameters
Please choose the hyperparameters for this sampling run:

In [None]:
# Selects the sample generation method used below:
# False -> random prototypes, True -> proposals from the generation algorithm.
GENERATE_WITH_ALGORITHM = False

# Number of samples generated for each cognate group.
SAMPLES_NUMBER_PER_COGNATE_GROUP = 2000

# Shape of one mini-batch handed to the sampling dataloader
# (presumably (cognate groups, samples per group) — TODO confirm).
# The first dimension is clamped to at least 1: with fewer than 8 cognate
# groups `len(cognates) // 8` would be 0 and yield an empty mini-batch.
MINI_BATCH_SHAPE = (max(1, len(cognates) // 8), 50)

## Samples generation

*Only one of the two generation methods proposed for the test is executed, depending on the value of the `GENERATE_WITH_ALGORITHM` constant.*

In [None]:
# Start from an empty list; one of the two generation cells below replaces it.
samples: list[Tensor] = list()

### 1. Random prototype

In [None]:
from uneurecon.Tests.createSamples import createSamplesBatch
# Method 1: runs only when the generation algorithm is NOT selected.
if not GENERATE_WITH_ALGORITHM:
    samples = createSamplesBatch(
        len(cognates),
        SAMPLES_NUMBER_PER_COGNATE_GROUP,
        device,
        vocabulary,
    )

### 2. With the generation algorithm 

The proposals are generated from the reconstructions produced by the first iteration of the Bouchard-Côté model.

In [None]:
from uneurecon.data.getDataset import getIteration
from uneurecon.Source.generateProposals import generateProposalsFromCurrentReconstructions

# Method 2: runs only when the generation algorithm is selected.
if GENERATE_WITH_ALGORITHM:
    # Encode every word returned by `getIteration(1)` through the vocabulary
    # as a uint8 tensor on `device`.
    currentReconstructions: list[Tensor] = [
        tensor(data=vocabulary(list(word)), dtype=uint8, device=device)
        for word in getIteration(1)
    ]
    samples = generateProposalsFromCurrentReconstructions(
        currentReconstructions,
        cognates,
        SAMPLES_NUMBER_PER_COGNATE_GROUP,
    )

## Init models

In [None]:
from uneurecon.Source.reconstructionModel import ReconstructionModel
from uneurecon.models.types import MODERN_LANGUAGES

# Dimensions passed to the reconstruction model (input and hidden size are equal here).
LSTM_INPUT_DIM, LSTM_HIDDEN_DIM = 50, 50

# Reconstruction model built over all modern languages and the shared vocabulary
# (presumably left untrained, as the name suggests — TODO confirm).
randomEditModel = ReconstructionModel(
    MODERN_LANGUAGES,
    vocabulary,
    LSTM_INPUT_DIM,
    LSTM_HIDDEN_DIM,
)

Dummy language model that returns a neutral probability, used for testing.

In [None]:
from uneurecon.models.types import InferenceData_Samples
from uneurecon.lm.PriorLM import PriorLM
from torch import zeros, float32

class LM(PriorLM):
    """Placeholder prior language model used for testing.

    `inference` ignores the content of its input and returns an all-zero
    float32 tensor (presumably neutral log-probabilities — TODO confirm).
    """

    def __init__(self):
        """Create the stub; `PriorLM.__init__` is intentionally not called."""

    def inference(self, reconstructions: InferenceData_Samples):
        """Return zeros with the same size as `reconstructions[1]`, on the notebook-level `device`."""
        output_shape = reconstructions[1].size()
        return zeros(size=output_shape, dtype=float32, device=device)


random_lm = LM()

## Compute unnormalized probs

### Init Dataloader

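Dataset of roughly $\frac{B \cdot C}{b \cdot c}$ mini-batches, where $B$ is the number of samples per cognate group, $C$ the number of cognate groups, and $(c, b)$ the mini-batch shape `MINI_BATCH_SHAPE`.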

In [None]:
from uneurecon.data.reconstruction_datasets import samplingDataLoader

# Dataloader over the generated samples and their cognates, cut into
# mini-batches of shape `MINI_BATCH_SHAPE`.
dataloader = samplingDataLoader(
    samples,
    cognates,
    vocabulary,
    MINI_BATCH_SHAPE,
)

In [None]:
# Quick check: prints True when the notebook is running on the GPU.
print("cuda" == device)

### Run Inference

The unnormalized probabilities are computed by running inference with the prior language model and the forward dynamic program of each edit model.

<u>Example:</u>

In [None]:
from torch.profiler import profile, ProfilerActivity, record_function

# Profile the processing of a single mini-batch: data loading, the forward
# dynamic program of the edit models, and the prior language model inference.
acts = [ProfilerActivity.CPU]
if device == "cuda":
    acts.append(ProfilerActivity.CUDA)

# `activities` alone selects what is traced; the deprecated `use_cuda` flag was
# redundant with it (and only triggers a deprecation warning), so it is dropped.
with profile(activities=acts) as prof:
    with record_function("data loading"):
        elt = next(iter(dataloader))
    with record_function("reconstruction_model_inference"):
        edit_models_results = randomEditModel.forward_dynProg(*elt[0])
    with record_function("prior_lm_inference"):
        prior_lm_results = random_lm.inference(elt[0][0])

# Show the ten most expensive operations by total CPU time.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

In [None]:
from time import time

# Time the processing of one mini-batch and extrapolate to the whole dataset.
# CUDA kernels are launched asynchronously, so the device is synchronised
# before reading the clock; otherwise the measurement would only cover the
# kernel launches and not the actual GPU work.
if device == "cuda":
    torch.cuda.synchronize()
t1 = time()
elt = next(iter(dataloader))
edit_models_results = randomEditModel.forward_dynProg(*elt[0])
prior_lm_results = random_lm.inference(elt[0][0])
if device == "cuda":
    torch.cuda.synchronize()
t2 = time()
dt = t2 - t1

# Number of elements in one mini-batch, taken from the configured shape
# instead of repeating the literals used to define it.
mini_batch_size = MINI_BATCH_SHAPE[0] * MINI_BATCH_SHAPE[1]
total_time = dt * (SAMPLES_NUMBER_PER_COGNATE_GROUP * len(cognates)) / mini_batch_size

# Split the estimate into whole hours and the remaining minutes
# (the minutes must not include the time already counted as hours).
hours, remaining_seconds = divmod(total_time, 3600)
minutes = remaining_seconds // 60
print(f"Pour une itération : {dt}s ; Au total sur le dataset : {int(hours)}h{int(minutes):02d}m")

for lang in edit_models_results:
    print(f'Probs tensor shape from {lang}\'s edit model: {edit_models_results[lang].size()}')
print('Probs tensor shape from prior language model:', prior_lm_results.size())

# Accumulate into a copy so that `prior_lm_results` is not modified by the
# in-place additions below.
unnormalized_probs = prior_lm_results.clone()
for lang in edit_models_results:
    unnormalized_probs += edit_models_results[lang]
print('Unnormalized probs tensor shape:', unnormalized_probs.shape)

__Complete iteration:__

In [None]:
# Full pass over the dataloader. The summed result of each mini-batch is not
# stored anywhere: this cell only exercises the complete computation.
for data in dataloader:
    __edit_models_results = randomEditModel.forward_dynProg(*data[0])
    __prior_lm_results = random_lm.inference(data[0][0])
    # Work on a copy: accumulating in place directly on the tensor returned by
    # the language model would silently modify that tensor.
    __results = __prior_lm_results.clone()
    for lang in __edit_models_results:
        __results += __edit_models_results[lang]