# Sampling

In [1]:
from os import getcwd, chdir

# When the kernel was launched from a `notebooks` directory, move up one level
# (presumably to the project root so the project-local imports resolve; confirm).
_launch_dir = getcwd()
if _launch_dir.endswith('notebooks'):
    chdir('..')

## Get Data

### Transform the cognates

Before generation, the cognates must be converted to the ByteTensor format. Each ByteTensor in the list of dictionaries has the elementary shape $\left( |y_{c, l}| \right)$.

In [2]:
import torch
from torch import tensor, Tensor, uint8
from data.vocab import vocabulary
from data.getDataset import getCognatesSet
from Source.utils import dl_to_ld
from models.types import ModernLanguages

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# One dict per entry returned by dl_to_ld(getCognatesSet()); every value is the
# word's character list passed through `vocabulary` and stored as a uint8
# tensor on `device`.
cognates: list[dict[ModernLanguages, Tensor]] = [
    {
        language: tensor(data=vocabulary(list(group[language])), dtype=uint8, device=device)
        for language in group
    }
    for group in dl_to_ld(getCognatesSet())
]

### Samples generation

*Choose one of the two proposed generation methods for the test by setting the constant below.*

In [3]:
# Generation switch: False -> samples come from createSamplesBatch (method 1);
# True -> samples come from generateProposalsFromCurrentReconstructions (method 2).
GENERATE_WITH_ALGORITHM = False

In [4]:
# Sample count handed to whichever generation method is selected
# (per cognate group, as the name states).
SAMPLES_NUMBER_PER_COGNATE_GROUP = 2000

In [5]:
# Empty placeholder; reassigned by whichever of the two generation cells is enabled.
samples: list[Tensor] = []

#### 1. Random prototype

In [6]:
from Tests.createSamples import createSamplesBatch
# Method 1 runs only when the algorithmic generator is switched off.
if not GENERATE_WITH_ALGORITHM:
    samples = createSamplesBatch(
        len(cognates),
        SAMPLES_NUMBER_PER_COGNATE_GROUP,
    )

#### 2. With the generation algorithm 

We use the reconstructions from the first iteration of the Bouchard-Côté model as the starting point from which the proposals are generated.

In [7]:
from data.getDataset import getIteration
from Source.generateProposals import generateProposalsFromCurrentReconstructions

# Method 2 runs only when the algorithmic generator is switched on.
if GENERATE_WITH_ALGORITHM:
    # Encode every word returned by getIteration(1) the same way as the cognates:
    # character list -> `vocabulary` -> uint8 tensor on `device`.
    currentReconstructions: list[Tensor] = [
        tensor(data=vocabulary(list(reconstruction)), dtype=uint8, device=device)
        for reconstruction in getIteration(1)
    ]
    samples = generateProposalsFromCurrentReconstructions(
        currentReconstructions,
        cognates,
        SAMPLES_NUMBER_PER_COGNATE_GROUP,
    )

## Init models

In [8]:
from Source.reconstructionModel import ReconstructionModel
from models.types import MODERN_LANGUAGES

# Sizes passed to ReconstructionModel as its third and fourth arguments.
LSTM_INPUT_DIM = 50
LSTM_HIDDEN_DIM = 50

# Freshly constructed model; nothing in this notebook trains it or loads weights.
randomEditModel = ReconstructionModel(
    MODERN_LANGUAGES,
    vocabulary,
    LSTM_INPUT_DIM,
    LSTM_HIDDEN_DIM,
)

Dummy language model for testing: it returns a neutral score (zero) for every sample.

In [9]:
from models.types import InferenceData_Samples
from lm.PriorLM import PriorLM
from torch import zeros, float32

class LM(PriorLM):
    """Stand-in prior language model that scores every sample with zero."""

    def __init__(self):
        # No-op: PriorLM.__init__ is not called, so no parent state is set up.
        pass

    def inference(self, reconstructions: InferenceData_Samples) -> Tensor:
        """Return a float32 zero tensor with the size of ``reconstructions[1]``, on `device`."""
        target_shape = reconstructions[1].size()
        return zeros(size=target_shape, dtype=float32, device=device)


random_lm = LM()

## Compute unnormalized probs

### Init Dataloader

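The dataloader yields roughly $\frac{B \cdot C}{b \cdot c}$ mini-batches, where $B$ is the number of cognate groups, $C$ the number of samples per cognate group, and $(b, c)$ the mini-batch shape.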

In [10]:
from data.reconstruction_datasets import samplingDataLoader

# Mini-batch shape: an eighth of the cognate list along the first axis and 50
# along the second (presumably samples per cognate group; confirm).
MINI_BATCH_SHAPE = (
    len(cognates) // 8,
    50,
)
dataloader = samplingDataLoader(samples, cognates, MINI_BATCH_SHAPE)

### Run Inference

The unnormalized probabilities are computed by running inference in the prior language model and the forward dynamic program of each edit model.

<u>Example:</u>

In [11]:
from time import time

# Time one mini-batch end to end: edit-model dynamic program + prior LM inference.
# Only forward values are used below, so autograd bookkeeping is switched off.
t1 = time()
elt = next(iter(dataloader))
with torch.no_grad():
    edit_models_results = randomEditModel.forward_dynProg(*elt[0])
    prior_lm_results = random_lm.inference(elt[0][0])
if device == 'cuda':
    # CUDA kernels are launched asynchronously; wait for them before reading the clock.
    torch.cuda.synchronize()
t2 = time()
dt = t2 - t1

# Extrapolate to the whole dataset: about (B * C) / (b * c) mini-batches of duration dt.
# The mini-batch size is read from MINI_BATCH_SHAPE so it cannot drift from the dataloader.
mini_batch_size = MINI_BATCH_SHAPE[0] * MINI_BATCH_SHAPE[1]
total_time = dt*(SAMPLES_NUMBER_PER_COGNATE_GROUP*len(cognates))/mini_batch_size
# Split into whole hours and the minutes left over (not the total minute count).
hours, remaining_seconds = divmod(int(total_time), 3600)
minutes = remaining_seconds // 60
print(f"Pour une itération : {dt}s ; Au total sur le dataset : {hours}h{minutes}m")

for lang in edit_models_results:
    print(f'Probs tensor shape from {lang}\'s edit model: {edit_models_results[lang].size()}')
print('Probs tensor shape from prior language model:', prior_lm_results.size())

# Sum into a fresh tensor so that prior_lm_results is not modified in place.
unnormalized_probs = prior_lm_results.clone()
for lang in edit_models_results:
    unnormalized_probs += edit_models_results[lang]
print('Unnormalized probs tensor shape:', unnormalized_probs.shape)

Pour une itération : 2.4593262672424316s ; Au total sur le dataset : 0.0h13.0m
Probs tensor shape from french's edit model: torch.Size([401, 50])
Probs tensor shape from spanish's edit model: torch.Size([401, 50])
Probs tensor shape from italian's edit model: torch.Size([401, 50])
Probs tensor shape from portuguese's edit model: torch.Size([401, 50])
Probs tensor shape from romanian's edit model: torch.Size([401, 50])
Probs tensor shape from prior edit model: torch.Size([401, 50])
Unnormalized probs tensor shape: torch.Size([401, 50])


__Complete iteration:__

In [12]:
# Score every mini-batch of the dataloader. Only forward values are needed, so
# autograd is turned off to avoid keeping intermediate activations alive
# (the recorded run with autograd enabled ended in a CUDA out-of-memory error).
# NOTE(review): if memory is still exhausted, shrink MINI_BATCH_SHAPE.
# NOTE(review): __results is overwritten on each iteration and never stored.
with torch.no_grad():
    for data in dataloader:
        __edit_models_results = randomEditModel.forward_dynProg(*data[0])
        __prior_lm_results = random_lm.inference(data[0][0])
        # Add each language's edit-model result onto the prior result (in place).
        __results = __prior_lm_results
        for lang in __edit_models_results:
            __results += __edit_models_results[lang]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.13 GiB (GPU 0; 11.76 GiB total capacity; 6.51 GiB already allocated; 2.11 GiB free; 8.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF