# Maximisation-expectation

Notebook configuration

In [1]:
from os import getcwd, chdir

# When the current working directory path ends with 'notebooks', step one level up
# so that the working directory becomes the repository's root.
_cwd_is_notebooks_dir = getcwd().endswith('notebooks')
if _cwd_is_notebooks_dir:
    chdir('./../')

In [2]:
!python -m pip install tabulate
from IPython.display import display
import pandas as pd
# Render every float shown by pandas with exactly four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

def display_horizontaly_two_tables(df:pd.DataFrame,df2:pd.DataFrame, title1:str, title2:str):
    """Print two tables side by side, as Markdown tables separated by a tab.

    Each table is rendered with ``to_markdown()``; row *i* of the output is row *i* of
    the left rendering, a tab, then row *i* of the right rendering. A title line is
    printed first, with ``title1`` padded to the width of the left table's header row.

    The two tables may have different numbers of rows: missing left rows are replaced
    by blanks of the left table's width, and missing right rows are simply omitted.

    Args:
        df: table displayed on the left.
        df2: table displayed on the right.
        title1: title printed above the left table.
        title2: title printed above the right table.

    Returns:
        None. The result is written to standard output.
    """
    left_rows = df.to_markdown().split('\n')
    right_rows = df2.to_markdown().split('\n')
    left_width = len(left_rows[0])

    # Pad the first title so that the second one starts after the left table.
    lines = [title1.ljust(left_width) + "\t" + title2]

    for i in range(max(len(left_rows), len(right_rows))):
        # Keep the right table aligned once the left one has run out of rows.
        left = left_rows[i] if i < len(left_rows) else " " * left_width
        if i < len(right_rows):
            lines.append(left + '\t' + right_rows[i])
        else:
            lines.append(left)

    print("\n".join(lines))




[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Initial Dataset

In [3]:
from data.getDataset import getCognatesSet, getIteration
from data.datapipes import formatTargets
from data.vocab import computeInferenceData, wordsToOneHots, vocabulary, IPA_charsNumber
from models.articleModels import ModernLanguages, MODERN_LANGUAGES
from models.models import InferenceData
from source.reconstructionModel import ReconstructionModel

# Load the raw cognates and format them into one InferenceData per modern language.
raw_cognates = getCognatesSet()
cognates:dict[ModernLanguages, InferenceData] = formatTargets(raw_cognates)
# NOTE(review): presumably the samples of iteration 1 — confirm against getIteration.
raw_samples = getIteration(1)
# One-hot encode the samples, add a trailing dimension, then build the inference data.
currentReconstructions = computeInferenceData(wordsToOneHots(raw_samples).unsqueeze(-1)) #TODO: simplify the data loading

# Dimensions handed to ReconstructionModel below.
LSTM_INPUT_DIM = 50
LSTM_HIDDEN_DIM = 50

randomEditModel = ReconstructionModel(MODERN_LANGUAGES, vocabulary, LSTM_INPUT_DIM, LSTM_HIDDEN_DIM)

# Language whose tables are inspected in the rest of the notebook.
TEST_LANGUAGE:ModernLanguages = "french"
# Size of the first dimension of the first element, minus 2.
# NOTE(review): the 2 presumably accounts for two boundary tokens — confirm.
x_maxLength = currentReconstructions[0].size()[0] - 2
y_maxLength = cognates[TEST_LANGUAGE][0].size()[0] - 2
print('|y| max =', y_maxLength)
print('|x| max =', x_maxLength)

|y| max = 17
|x| max = 16


## Backward dynamic Program

Backward dynamic Program

In [4]:
# Run the model's backward dynamic program on the current reconstructions and the cognates.
# The result is indexed by language in the next cells and exposes `dlt`, `end`, `sub` and `ins` fields.
targets_prob = randomEditModel.backward_dynProg(currentReconstructions, cognates)

### Analysis

Test on $x :=$`"absyrdʊ"` and $y :=$`"absˈyʁd"` ($|x|=7$ and $|y|=7$)

In [5]:
# Index in the batch of the sample to be studied, and the lengths of its two strings.
IDX = 20
x_length = 7
y_length = 7

def _cached_backward_probs(operation: str):
    """Return the cached table of `operation` for sample IDX of TEST_LANGUAGE as a NumPy array."""
    per_operation = getattr(targets_prob[TEST_LANGUAGE], operation)
    return per_operation[:,:,IDX].cpu().numpy().squeeze(-1)

backward_prob_dlt = _cached_backward_probs('dlt')
backward_prob_end = _cached_backward_probs('end')
backward_prob_sub = _cached_backward_probs('sub')
backward_prob_ins = _cached_backward_probs('ins')

#### Display of cached probabilities for the substitution and the deletion operations

Note that the log-probabilities that do not need to be defined for these operations are automatically set to $-\infty$ by the recurrence relations of the backward dynamic program. The same holds for the undefined probabilities in padding positions, thanks to the neutrality of the edit model's cached inference probabilities (which equal $0$ in log space).

In [6]:
# Keep rows 0..x_length+1 and columns 0..y_length+1 of each cached table.
df = pd.DataFrame(backward_prob_sub[:x_length+2, :y_length+2])
df2 = pd.DataFrame(backward_prob_dlt[:x_length+2, :y_length+2])

# Title spelling fixed ("Substitution").
display_horizontaly_two_tables(df, df2, "Substitution Operation", "Deletion Operation")

Subsitution Operation                                                                                	Deletion Operation
|    |        0 |        1 |        2 |        3 |        4 |        5 |        6 |      7 |      8 |	|    |        0 |        1 |        2 |        3 |        4 |        5 |        6 |         7 |      8 |
|---:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|-------:|-------:|	|---:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|----------:|-------:|
|  0 | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -2e+09 | -2e+09 |	|  0 | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09    | -2e+09 |
|  1 | -1.35371 | -1.86939 | -2.35208 | -3.49419 | -4.72537 | -5.81715 | -7.59627 | -1e+09 | -1e+09 |	|  1 | -1.80048 | -2.08853 | -2.44592 | -3.08607 | -4.10483 | -4.71093 | -6.2416  | -7.18322  | -1e+09 |
|  2 | -2.28378 | -1.84261 | -1.96672 | -2.45948 | 

## Rendering of the target and the logits before the loss computation

In [7]:
# NOTE: this cell rebinds `targets_prob` from a per-language dict to a per-sample list,
# so it cannot be run twice without first re-running the cell that computes `targets_prob`.
for lang, cached_probs in targets_prob.items():
    targets_prob[lang] = cached_probs.toTargetsProbs()

# list of C dict[ModernLanguages, dict[Operations, Tensor(shape=*)]]
_languages = list(targets_prob)
targets_prob = [dict(zip(_languages, per_sample)) for per_sample in zip(*targets_prob.values())]

  return torch.logical_and(A.unsqueeze(1), B.unsqueeze(0)).to(device)
  return torch.logical_and(A.unsqueeze(1), B.unsqueeze(0)).to(device)
  return torch.logical_and(A.unsqueeze(1), B.unsqueeze(0)).to(device)
  return torch.logical_and(A.unsqueeze(1), B.unsqueeze(0)).to(device)


In [8]:
# Rendered targets of the studied sample for the test language, keyed by operation.
renderedTargets = targets_prob[IDX][TEST_LANGUAGE]
# Drop the two trailing singleton dimensions, then keep rows 0..x_length+1 and columns 0..y_length+1.
df_renderedSub = pd.DataFrame(renderedTargets['sub'].squeeze(-1).squeeze(-1)[:x_length+2, :y_length+2])
df2_renderedDlt = pd.DataFrame(renderedTargets['dlt'].squeeze(-1).squeeze(-1)[:x_length+2, :y_length+2])
# Title spelling fixed ("Substitution").
display_horizontaly_two_tables(df_renderedSub, df2_renderedDlt, "Substitution Operation", "Deletion Operation")

Subsitution Operation                                                                             	Deletion Operation
|    |        0 |        1 |        2 |        3 |        4 |        5 |        6 |      7 |   8 |	|    |        0 |        1 |        2 |        3 |        4 |        5 |        6 |         7 |   8 |
|---:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|-------:|----:|	|---:|---------:|---------:|---------:|---------:|---------:|---------:|---------:|----------:|----:|
|  0 | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -2e+09 |   0 |	|  0 | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09   | -1e+09    |   0 |
|  1 | -1.35371 | -1.86939 | -2.35208 | -3.49419 | -4.72537 | -5.81715 | -7.59627 | -1e+09 |   0 |	|  1 | -1.80048 | -2.08853 | -2.44592 | -3.08607 | -4.10483 | -4.71093 | -6.2416  | -7.18322  |   0 |
|  2 | -2.28378 | -1.84261 | -1.96672 | -2.45948 | -3.33074 | -4.20092 | -5.86

In [10]:
from torchdata.datapipes.iter import IterableWrapper
# Wrap the per-sample list of rendered targets in a datapipe and pull its first element.
dp3 = IterableWrapper(targets_prob)
iterator = iter(dp3)
firstElt = next(iterator)
# Prints an empty line only; `firstElt` itself is not displayed.
print()




In [11]:
from torchdata.dataloader2 import DataLoader2

# Build a DataLoader2 directly on the rendered-targets datapipe.
# The recorded run of this cell ended with a KeyboardInterrupt.
dl = DataLoader2(dp3)

KeyboardInterrupt: 

In [9]:
MINI_BATCH_SIZE = 30
from data.datapipes import get_training_datapipe

# One dict per cognate set, mapping each modern language to its raw word.
_raw_cognates_list: list[dict[ModernLanguages, str]] = [{lang:raw_cognates[lang][i] for lang in MODERN_LANGUAGES} for i in range(len(raw_cognates["french"]))]

# One datapipe per data source: samples, raw cognates, rendered target probabilities.
_dp1 = IterableWrapper(raw_samples)
_dp2 = IterableWrapper(_raw_cognates_list)
# `_dp3` was used below without being defined, which raises NameError on a fresh kernel.
_dp3 = IterableWrapper(targets_prob)

# Zip the three sources element-wise, then apply the training pipeline with the mini-batch size.
dp = _dp1.zip(_dp2, _dp3)
dp = get_training_datapipe(dp, MINI_BATCH_SIZE)

In [10]:
from torchdata.dataloader2 import DataLoader2

dl = DataLoader2(dataset=dp) # FIXME: issue with DataLoader2 and to_graph, caused by the tensors in targets_prob (_dp3)

In [None]:
# NOTE(review): `miniBatchDataForLogitsComputation` and `renderedLogits` are not defined in the
# visible cells, and `renderedTargets` is expected here to expose `.split` — confirm where they come from.
maxModernSequenceLength_inMiniBatch: dict[ModernLanguages, int] = {lang:data[2] for (lang, data) in miniBatchDataForLogitsComputation[1].items()} #type:ignore
maxSampleSequenceLength_inMiniBatch: int = miniBatchDataForLogitsComputation[0][2]+1

def _test_language_block(flat):
    """Split `flat` into one chunk per language, keep the TEST_LANGUAGE chunk and view it as a 3-D NumPy array."""
    chunk_sizes = [maxModernSequenceLength_inMiniBatch[lang]*maxSampleSequenceLength_inMiniBatch for lang in randomEditModel.languages]
    language_position = randomEditModel.languages.index(TEST_LANGUAGE)
    chunk = flat.split(chunk_sizes)[language_position]
    reshaped = chunk.view(maxSampleSequenceLength_inMiniBatch, maxModernSequenceLength_inMiniBatch[TEST_LANGUAGE], IPA_charsNumber*2+2)
    return reshaped.cpu().numpy()

renderedTargets = _test_language_block(renderedTargets)
renderedLogits = _test_language_block(renderedLogits)

print(renderedTargets.shape)
print(renderedLogits.shape)

#### Comparison of the rendered target probs format and the cached probs format

For display, `renderedTargets` and `renderedLogits` are reduced: the insertion entries are summed over the last axis and the ending entry is taken on its own. The comparison covers the insertion and the ending operations.

In [None]:
def _window(table):
    """Keep rows 0..x_length+1 and columns 0..y_length+1 of `table`."""
    return table[:x_length+2, :y_length+2]

def _insertion_total(rendered):
    """Sum, over the last axis, the IPA_charsNumber entries that precede the final one."""
    return _window(rendered)[:, :, -IPA_charsNumber-1:-1].sum(axis=2)

def _ending_entry(rendered):
    """Take the final entry of the last axis."""
    return _window(rendered)[:, :, -1]

# Cached tables from the backward dynamic program
df_insBackward = pd.DataFrame(_window(backward_prob_ins))
df_endBackward = pd.DataFrame(_window(backward_prob_end))
# Rendered targets
df_insRenderedTargets = pd.DataFrame(_insertion_total(renderedTargets))
df_endRenderedTargets = pd.DataFrame(_ending_entry(renderedTargets))
# Rendered logits
df_insRenderedLogits = pd.DataFrame(_insertion_total(renderedLogits))
df_endRenderedLogits = pd.DataFrame(_ending_entry(renderedLogits))

_table_pairs = [
    (df_insBackward, df_endBackward, "Insertion operation - cached posterior probs", "Ending operation - cached posterior probs"),
    (df_insRenderedTargets, df_endRenderedTargets, "Insertion operation - rendered target probs", "Ending operation - rendered target probs"),
    (df_insRenderedLogits, df_endRenderedLogits, "Insertion operation - rendered logits", "Ending operation - rendered logits"),
]
for _ins_table, _end_table, _ins_title, _end_title in _table_pairs:
    display_horizontaly_two_tables(_ins_table, _end_table, _ins_title, _end_title)

# display(pd.merge(df_insBackward, df_endBackward, left_index=True, right_index=True, suffixes=(' ins', ' end')),
#         pd.merge(df_insRenderedTargets, df_endRenderedTargets, left_index=True, right_index=True, suffixes=(' ins', ' end'))
#         )

## Training round

In [12]:
# Training hyper-parameters passed to the model's training routine.
EPOCHS = 5
LEARNING_RATE = 0.01

randomEditModel.train_models(dl, EPOCHS, LEARNING_RATE)

Epoch 1
------------------------------------------------------------


KeyboardInterrupt: 