# Data encoding

In [None]:
from os import getcwd, chdir

# When the current working directory ends with `notebooks`, move one level up.
# Presumably this lets the project packages imported by the next cells
# resolve from the repository root -- confirm against the repo layout.
started_in_notebooks = getcwd().endswith('notebooks')
if started_in_notebooks:
    chdir('..')

In [None]:
from data.getDataset import getCognatesSet, getIteration
from data.vocab import computeInferenceData_Samples, computeInferenceData_Cognates, wordsToOneHots
from random import randint

# Load the cognate sets and pick a random window of five consecutive entries;
# the very same window is reused below for the samples.
# NOTE(review): `randint` bounds are inclusive, so with `- 6` the last entry
# of the French list can never be part of the window -- confirm this is intended.
raw_cognates = getCognatesSet()
__random_start_index = randint(0, len(raw_cognates['french']) - 6)
__window = slice(__random_start_index, __random_start_index + 5)

# Encode every language, keep the French entry, then restrict it to the
# window: second axis of the token tensor, only axis of the length tensor.
__encoded_words = {lang: wordsToOneHots(raw_cognates[lang]) for lang in raw_cognates}
__french_encoding = computeInferenceData_Cognates(__encoded_words)['french']
__cognate_tokens = __french_encoding[0][:, __window]
__cognate_lengths = __french_encoding[1][__window]

# Cut the first axis of the token tensor down to the longest length found
# inside the window, and rebuild the (tokens, lengths, max length) triple.
__max_cognate_length = __cognate_lengths.max().item()
cognates = (
    __cognate_tokens[:__max_cognate_length],
    __cognate_lengths,
    __max_cognate_length,
)

# Raw French words and raw samples for the same window.
raw_cognates = raw_cognates['french'][__window]
raw_samples = getIteration(1)[__window]
# TODO: simplify the data loading
samples = computeInferenceData_Samples(wordsToOneHots(raw_samples))

In [3]:
import torch
# Report whether PyTorch detects a usable CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


## Encoding

In [4]:
from data.vocab import vocabulary
from models.types import SOS_TOKEN, EOS_TOKEN, PADDING_TOKEN
# Show each special token next to the value the vocabulary maps it to,
# one token per line.
print(
    f"SOS_TOKEN = \"{SOS_TOKEN}\" = {vocabulary[SOS_TOKEN]}",
    f"EOS_TOKEN = \"{EOS_TOKEN}\" = {vocabulary[EOS_TOKEN]}",
    f"PADDING_TOKEN = \"{PADDING_TOKEN}\" = {vocabulary[PADDING_TOKEN]}",
    sep="\n",
)

SOS_TOKEN = "(" = 58
EOS_TOKEN = ")" = 57
PADDING_TOKEN = "-" = 59


## Samples encoding: `InferenceData_Source` type

This type refers to a tuple of three elements:
- an IntTensor `S` of shape $\left(\max \{|x|, x \in \textrm{batch}\} + 2, c, b\right)$. For all $0 \leq i < c$ and $0 \leq j < b$, `S[:, i, j]` represents one of the $b$ samples linked to the $i$-th cognate pair. Along the first axis, it is a sequence of tokens encoded by their one-hot indices, opened by the `SOS_TOKEN` and closed by the `EOS_TOKEN`.
- a cpu ByteTensor `L` of shape $\left( c, b \right)$ containing the length of each sample, boundary tokens included. It is defined such that `S[L[i, j]:, i, j]` only contains the one-hot index of the `PADDING_TOKEN`. Therefore, `L[i, j]` = $|x_{(i,j)}| + 2$, where $x_{(i,j)}$ denotes the raw sample (without the boundary tokens) represented at position $(i, j)$ in `S`.
- `n`: the max of `L` (if the tuple is correctly defined, then `n = S.size()[0]`)

In [None]:
# Name the three elements of the encoded samples tuple by position.
sample_tokens = samples[0]
sample_lengths = samples[1]
longest_sample = samples[2]

print(raw_samples)
# Take index 0 on the last axis and transpose, so that each printed row is
# one token sequence.
print(sample_tokens[..., 0].T)
print("Sample tensor's shape:", sample_tokens.size())
print('\n' + "Samples' length (without boundaries):", str(list(map(len, raw_samples))))
print("Samples' length (with boundaries):", sample_lengths[:, 0])
print("Max sample length with boundaries:", longest_sample)

['pyblikaθjɔ', 'pɔlʊ', 'pldr', 'pulmɔ', 'plpa']
tensor([[58, 13, 21,  1,  9,  6,  8,  0, 56,  7, 30, 57],
        [58, 13, 30,  9, 43, 57, 59, 59, 59, 59, 59, 59],
        [58, 13,  9,  2, 14, 57, 59, 59, 59, 59, 59, 59],
        [58, 13, 17,  9, 10, 30, 57, 59, 59, 59, 59, 59],
        [58, 13,  9, 13,  0, 57, 59, 59, 59, 59, 59, 59]], device='cuda:0',
       dtype=torch.int32)
Sample tensor's shape: torch.Size([12, 5, 1])

Samples' length (without boundaries): [10, 4, 4, 5, 4]
Samples' length (with boundaries): tensor([12,  6,  6,  7,  6])
Max sample length with boundaries: 12


## Cognates encoding: `InferenceData_Targets` type

This type of data is defined by a tuple similar to `InferenceData_Source`, except that the `EOS_TOKEN` is removed from the first IntTensor, which means that the sequence lengths are reduced by one compared to the sequences of the previous type. Its three elements can therefore be summed up as follows:
- `S`: an IntTensor of shape $\left( \max \{ |y_l|, y_l\in \textrm{batch}_l \} + 1, c \right)$
- `L`: a cpu ByteTensor of shape $(c)$ such that `L[i]` = $|y_{l, i}| + 1$
- `n`: the max of `L` (if the tuple is correctly defined, then `n = S.size()[0]` )

In [None]:
# Name the three elements of the encoded cognates tuple by position.
cognate_tokens = cognates[0]
cognate_lengths = cognates[1]
longest_cognate = cognates[2]

print(raw_cognates)
# Transpose so that each printed row is one token sequence.
print(cognate_tokens.T)
print("Cognate tensor's shape:", cognate_tokens.size())
print('\n' + "Cognates' length (without SOS token):", str(list(map(len, raw_cognates))))
print("Cognates' length (with SOS token):", cognate_lengths)
print("Max cognate length with SOS token:", longest_cognate)

['pyblikasjˈɔ̃', 'pˈɔ̃dʁ', 'pˈudʁ', 'pumˈɔ̃', 'pˈup']
tensor([[58, 13, 21,  1,  9,  6,  8,  0, 15,  7, 51, 30, 54],
        [58, 13, 51, 30, 54,  2, 41, 59, 59, 59, 59, 59, 59],
        [58, 13, 51, 17,  2, 41, 59, 59, 59, 59, 59, 59, 59],
        [58, 13, 17, 10, 51, 30, 54, 59, 59, 59, 59, 59, 59],
        [58, 13, 51, 17, 13, 59, 59, 59, 59, 59, 59, 59, 59]], device='cuda:0',
       dtype=torch.int32)
Cognate tensor's shape: torch.Size([13, 5])

Cognates' length (without SOS token): [12, 6, 5, 6, 4]
Cognates' length (with SOS token): tensor([13,  7,  6,  7,  5])
Max cognate length with SOS token: 13
