# Aligning manuscript verses to target -- Greek

## Prerequisites

Clone the following repositories side by side in the same parent directory, then copy the syntax trees as noted.

```
git clone https://github.com/Clear-Bible/ClearEngine
git clone https://github.com/sillsdev/machine
[copy syntax trees directory ..\test\TestSandbox1\SyntaxTrees\* to .\SyntaxTrees\*]
```

In [1]:
#r "nuget:SIL.Scripture,10.1.0"
#r "nuget:Thot,3.3.5"
#r "../src/ClearBible.Engine/bin/Debug/net7.0/ClearBible.Engine.dll"
#r "../src/ClearBible.Engine.SyntaxTree/bin/Debug/net7.0/ClearBible.Engine.SyntaxTree.dll"
#r "../src/ClearBible.Engine.SyntaxTree.Aligner/bin/Debug/net7.0/ClearBible.Engine.SyntaxTree.Aligner.dll"    
#r "../../machine/src/SIL.Machine/bin/Debug/netstandard2.0/Clear.SIL.Machine.dll"
#r "../../machine/src/SIL.Machine.Translation.Thot/bin/Debug/netstandard2.0/SIL.Machine.Translation.Thot.dll"    

using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation;
using SIL.Machine.Translation.Thot;
using static SIL.Machine.Corpora.TokenProcessors;
using SIL.Machine.Utils;

using ClearBible.Engine.Corpora;
using ClearBible.Engine.Exceptions;
using ClearBible.Engine.SyntaxTree.Aligner.Persistence;
using ClearBible.Engine.SyntaxTree.Aligner.Translation;
using ClearBible.Engine.SyntaxTree.Corpora;
using ClearBible.Engine.Tokenization;
using ClearBible.Engine.Translation;
using ClearBible.Engine.Persistence;

## Load corpora and build parallel verse segments

In [2]:
// Source side: a corpus backed by the syntax trees, restricted to LanguageCodeEnum.G,
// then passed through the SetTrainingByTrainingLowercase transform.
var syntaxTree = new SyntaxTrees();
var sourceCorpus = new SyntaxTreeFileTextCorpus(syntaxTree, FileGetBookIds.LanguageCodeEnum.G);
var transformedSourceCorpus = sourceCorpus.Transform<SetTrainingByTrainingLowercase>();

// Target side: the Paratext project under data/WEB-PT, word-tokenized, converted into
// token rows, then passed through the SetTrainingBySurfaceLowercase transform.
var targetCorpus = new ParatextTextCorpus("data/WEB-PT");
var transformedTargetCorpus = targetCorpus
    .Tokenize<LatinWordTokenizer>()
    .Transform<IntoTokensTextRowProcessor>()
    .Transform<SetTrainingBySurfaceLowercase>();

// Verse mappings are derived from the two corpora's versifications; the aligned rows are
// materialized into a list so the cells below can enumerate them repeatedly.
var allVerseMappings = EngineParallelTextCorpus.VerseMappingsForAllVerses(
    sourceCorpus.Versification,
    targetCorpus.Versification);
var parallelTextCorpus = transformedSourceCorpus
    .EngineAlignRows(
        transformedTargetCorpus,
        new SourceTextIdToVerseMappingsFromVerseMappings(allVerseMappings))
    .ToList();

// Helper that writes one parallel row to the console: first its reference, then for the
// source side and the target side in turn, the training view (token ids, segment, training
// text) followed by the display view (base-token ids, surface text, detokenized text).
public static void WriteTokensEngineParallelTextRow(
    EngineParallelTextRow engineParallelTextRow,
    IDetokenizer<IEnumerable<(Token token, string paddingBefore, string paddingAfter)>, Token> sourceDetokenizer,
    IDetokenizer<IEnumerable<(Token token, string paddingBefore, string paddingAfter)>, Token> targetDetokenizer)
{
    Console.WriteLine(engineParallelTextRow.Ref.ToString());

    // ---------- SOURCE ----------
    var sourceTokens = engineParallelTextRow.SourceTokens!;
    // Base tokens, i.e. the tokens pulled out of any composite tokens, in positional order.
    var sourceBaseTokens = sourceTokens.GetPositionalSortedBaseTokens();

    // Training view: ids of the tokens behind the segment, the segment itself, and the
    // training text the segment is built from.
    var sourceSegmentTokenIds = sourceTokens.Select(token => token.TokenId);
    Console.WriteLine($"Source segments tokenIds      : {string.Join(" ", sourceSegmentTokenIds)}");
    Console.WriteLine($"Source segments spaced        : {string.Join(" ", engineParallelTextRow.SourceSegment)}");
    var sourceTrainingTexts = sourceTokens.Select(token => token.TrainingText);
    Console.WriteLine($"Source trainingText spaced    : {string.Join(" ", sourceTrainingTexts)}");

    // Display view: base-token ids, surface text still tokenized, then surface text detokenized.
    var sourceBaseTokenIds = sourceBaseTokens.Select(token => token.TokenId);
    Console.WriteLine($"Source tokens tokenIds        : {string.Join(" ", sourceBaseTokenIds)}");
    var sourceSurfaceTexts = sourceBaseTokens.Select(token => token.SurfaceText);
    Console.WriteLine($"Source surfaceTexts spaced    : {string.Join(" ", sourceSurfaceTexts)}");
    var sourcePadded = sourceDetokenizer.Detokenize(sourceBaseTokens);
    var sourceDetokenizedText = string.Concat(
        sourcePadded.Select(padded => $"{padded.paddingBefore}{padded.token}{padded.paddingAfter}"));
    Console.WriteLine($"Source detokenized surfaceText: {sourceDetokenizedText}");
    Console.WriteLine("");

    // ---------- TARGET ----------
    var targetTokens = engineParallelTextRow.TargetTokens!;
    // Base tokens, i.e. the tokens pulled out of any composite tokens, in positional order.
    var targetBaseTokens = targetTokens.GetPositionalSortedBaseTokens();

    // Training view: ids of the tokens behind the segment, the segment itself, and the
    // training text the segment is built from.
    var targetSegmentTokenIds = targetTokens.Select(token => token.TokenId);
    Console.WriteLine($"Target segments tokenIds      : {string.Join(" ", targetSegmentTokenIds)}");
    Console.WriteLine($"Target segments spaced        : {string.Join(" ", engineParallelTextRow.TargetSegment)}");
    var targetTrainingTexts = targetTokens.Select(token => token.TrainingText);
    Console.WriteLine($"Target trainingText spaced    : {string.Join(" ", targetTrainingTexts)}");

    // Display view: base-token ids, surface text still tokenized, then surface text detokenized.
    var targetBaseTokenIds = targetBaseTokens.Select(token => token.TokenId);
    Console.WriteLine($"Target tokens tokenIds        : {string.Join(" ", targetBaseTokenIds)}");
    var targetSurfaceTexts = targetBaseTokens.Select(token => token.SurfaceText);
    Console.WriteLine($"Target surfaceTexts spaced    : {string.Join(" ", targetSurfaceTexts)}");
    var targetPadded = targetDetokenizer.Detokenize(targetBaseTokens);
    var targetDetokenizedText = string.Concat(
        targetPadded.Select(padded => $"{padded.paddingBefore}{padded.token}{padded.paddingAfter}"));
    Console.WriteLine($"Target detokenized surfaceText: {targetDetokenizedText}");
    Console.WriteLine("");
}


## Approach 1: train the SMT model(s) all at once

In [3]:
{
    // A symmetrized model combines a source->target and a target->source model, for
    // increased accuracy and many-to-many alignments.
    using var forwardModel = new ThotFastAlignWordAlignmentModel();   // source->target
    using var backwardModel = new ThotFastAlignWordAlignmentModel();  // target->source
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(forwardModel, backwardModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // Train the symmetrized SMT model in a single pass through one trainer, reporting progress.
    var smtTrainingCorpus = parallelTextCorpus.Lowercase();
    using var symmetrizedTrainer = symmetrizedModel.CreateTrainer(smtTrainingCorpus);
    var smtProgress = new DelegateProgress(
        status => Console.WriteLine($"Training Fastalign model: {status.PercentCompleted:P}"));
    symmetrizedTrainer.Train(smtProgress);
    await symmetrizedTrainer.SaveAsync();

    // Hyperparameters for the syntax tree aligner, read from the "hyperparametersfiles" location.
    var alignerHyperparameters = await FileGetSyntaxTreeWordAlignerHyperparams
        .Get()
        .SetLocation("hyperparametersfiles")
        .GetAsync();

    var trees = new SyntaxTrees();

    // The manuscript word aligner; Engine's main implementation is specifically tree-based.
    var smtModels = new List<IWordAlignmentModel>() { symmetrizedModel };
    ISyntaxTreeTrainableWordAligner treeAligner = new SyntaxTreeWordAligner(
        smtModels,
        0,
        alignerHyperparameters,
        trees);

    // Wrap the aligner in a manuscript word alignment model, which is untrained at this
    // point, then train it on top of the already-trained SMT model(s) and save.
    using var treeModel = new SyntaxTreeWordAlignmentModel(treeAligner);
    using var treeModelTrainer = treeModel.CreateTrainer(parallelTextCorpus);
    treeModelTrainer.Train();
    await treeModelTrainer.SaveAsync();

    // Show the first five rows with both the SMT-only and the syntax tree alignments.
    foreach (EngineParallelTextRow row in parallelTextCorpus.Take(5))
    {
        var sourceDetokenizer = new EngineStringDetokenizer(new WhitespaceDetokenizer());
        var targetDetokenizer = new EngineStringDetokenizer(new LatinWordDetokenizer());
        WriteTokensEngineParallelTextRow(row, sourceDetokenizer, targetDetokenizer);

        // SMT-only prediction, shown ONLY FOR COMPARISON.
        var smtAlignment = symmetrizedModel.GetBestAlignment(row.SourceSegment, row.TargetSegment);
        IEnumerable<AlignedTokenPairs> smtTokenPairs = row.GetAlignedTokenPairs(smtAlignment);
        // (Legacy) alignments as ordinal positions.
        Console.WriteLine($"SMT Alignment        : {smtAlignment}");
        // Alignments as source token id -> target token id pairs.
        var smtPairTexts = smtTokenPairs.Select(pair => $"{pair.SourceToken.TokenId}->{pair.TargetToken.TokenId}");
        Console.WriteLine($"SMT Alignment        : {string.Join(" ", smtPairTexts)}");

        // Syntax tree aligner prediction.
        var treeWordPairs = treeModel.GetBestAlignmentAlignedWordPairs(row);
        // (Legacy) alignments as ordinal positions, shown ONLY FOR COMPARISON.
        var treeWordPairTexts = treeWordPairs.Select(wordPair => wordPair.ToString());
        Console.WriteLine($"Syntax tree Alignment: {string.Join(" ", treeWordPairTexts)}");
        // ALIGNMENTS as source token id -> target token id pairs.
        var treeTokenPairs = row.GetAlignedTokenPairs(treeWordPairs);
        var treePairTexts = treeTokenPairs.Select(pair => $"{pair.SourceToken.TokenId}->{pair.TargetToken.TokenId}");
        Console.WriteLine($"Syntax tree Alignment: {string.Join(" ", treePairTexts)}");
    }
}

Training Fastalign model: 0.00%
Training Fastalign model: 0.00%
Training Fastalign model: 8.33%
Training Fastalign model: 16.67%
Training Fastalign model: 25.00%
Training Fastalign model: 33.33%
Training Fastalign model: 41.67%
Training Fastalign model: 50.00%
Training Fastalign model: 50.00%
Training Fastalign model: 50.00%
Training Fastalign model: 58.33%
Training Fastalign model: 66.67%
Training Fastalign model: 75.00%
Training Fastalign model: 83.33%
Training Fastalign model: 91.67%
Training Fastalign model: 100.00%
1JN 1:1
Source segments tokenIds      : 062001001001001 062001001002001 062001001003001 062001001004001 062001001005001 062001001006001 062001001007001 062001001008001 062001001009001 062001001010001 062001001011001 062001001012001 062001001013001 062001001014001 062001001015001 062001001016001 062001001017001 062001001018001 062001001019001 062001001020001 062001001021001 062001001022001 062001001023001
Source segments spaced        : ὅς εἰμί ἀπό ἀρχή ὅς ἀκούω ὅς ὁράω 

Target tokens tokenIds        : 062001002001001 062001002002001 062001002003001 062001002004001 062001002005001 062001002006001 062001002007001 062001002008001 062001002009001 062001002010001 062001002011001 062001002012001 062001002013001 062001002014001 062001002015001 062001002016001 062001002017001 062001002018001 062001002019001 062001002020001 062001002021001 062001002022001 062001002023001 062001002024001 062001002025001 062001002026001 062001002027001 062001002028001 062001002029001 062001002030001 062001002031001 062001002032001 062001002033001 062001002034001 062001002035001 062001002036001 062001002037001 062001002038001 062001002039001
Target surfaceTexts spaced    : ( and the life was revealed , and we have seen , and testify , and declare to you the life , the eternal life , which was with the Father , and was revealed to us ) ;
Target detokenized surfaceText: (and the life was revealed, and we have seen, and testify, and declare to you the life, the eternal life, which w

1JN 1:4
Source segments tokenIds      : 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001
Source segments spaced        : καί οὗτος γράφω ἐγώ ἵνα ὁ χαρά ἐγώ εἰμί πληρόω
Source trainingText spaced    : καί οὗτος γράφω ἐγώ ἵνα ὁ χαρά ἐγώ εἰμί πληρόω
Source tokens tokenIds        : 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001
Source surfaceTexts spaced    : καὶ ταῦτα γράφομεν ἡμεῖς, ἵνα ἡ χαρὰ ἡμῶν ᾖ πεπληρωμένη.
Source detokenized surfaceText: καὶ ταῦτα γράφομεν ἡμεῖς, ἵνα ἡ χαρὰ ἡμῶν ᾖ πεπληρωμένη.

Target segments tokenIds      : 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001 062001004015001

## Approach 2: train the SMT models individually

In [4]:
{
    // Approach 2 trains each directional SMT model through its own trainer and only then
    // combines the two in a symmetrized model. Every trainer is saved with
    // `await ...SaveAsync()` so the cell never blocks on a synchronous Save(), matching the
    // way the manuscript trainer below is saved.

    // Create the source->target SMT model, train it.
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();
    using var trainerSrcTrg = srcTrgModel.CreateTrainer(parallelTextCorpus.Lowercase());
    trainerSrcTrg.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign model: {status.PercentCompleted:P}")));
    await trainerSrcTrg.SaveAsync();

    // Create the target->source SMT model, train it on the inverted corpus.
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();
    using var trainerTrgSrc = trgSrcModel.CreateTrainer(parallelTextCorpus.Invert().Lowercase());
    
    trainerTrgSrc.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign inverted model: {status.PercentCompleted:P}")));
    await trainerTrgSrc.SaveAsync();
    
    // Put the already-trained source->target and target->source models into the symmetrized
    // SMT model; no further SMT training is run on the combined model in this cell.
    using var smtWordAlignmentModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };


    // Set the manuscript tree aligner hyperparameters, read from "hyperparametersfiles".
    var hyperparameters = await FileGetSyntaxTreeWordAlignerHyperparams.Get().SetLocation("hyperparametersfiles").GetAsync();

    var syntaxTrees = new SyntaxTrees();

    // Create the manuscript word aligner. Engine's main implementation is specifically a tree-based aligner.
    ISyntaxTreeTrainableWordAligner syntaxTreeTrainableWordAligner = new SyntaxTreeWordAligner(
        new List<IWordAlignmentModel>() { smtWordAlignmentModel },
        0,
        hyperparameters,
        syntaxTrees);

    // Initialize a manuscript word alignment model. At this point it has not yet been trained.
    using var syntaxTreeWordAlignmentModel = new SyntaxTreeWordAlignmentModel(syntaxTreeTrainableWordAligner);
    using var manuscriptTrainer = syntaxTreeWordAlignmentModel.CreateTrainer(parallelTextCorpus);

    // Train the manuscript model using the pre-trained SMT model(s).
    manuscriptTrainer.Train();
    await manuscriptTrainer.SaveAsync();
    
    // Show the first five rows with both the SMT-only and the syntax tree alignments.
    foreach (EngineParallelTextRow engineParallelTextRow in parallelTextCorpus.Take(5))
    {
        WriteTokensEngineParallelTextRow(engineParallelTextRow, new EngineStringDetokenizer(new WhitespaceDetokenizer()), new EngineStringDetokenizer(new LatinWordDetokenizer()));

        // Predict primary SMT aligner alignments only, then display - ONLY FOR COMPARISON.
        var smtOrdinalAlignments = smtWordAlignmentModel.GetBestAlignment(engineParallelTextRow.SourceSegment, engineParallelTextRow.TargetSegment);
        IEnumerable<AlignedTokenPairs> smtSourceTargetTokenIdPairs = engineParallelTextRow.GetAlignedTokenPairs(smtOrdinalAlignments);
        // (Legacy): alignments as ordinal positions.
        Console.WriteLine($"SMT Alignment        : {smtOrdinalAlignments}");
        // Alignments as source token to target token pairs.
        Console.WriteLine($"SMT Alignment        : {string.Join(" ", smtSourceTargetTokenIdPairs.Select(t => $"{t.SourceToken.TokenId}->{t.TargetToken.TokenId}"))}");

        // Predict syntax tree aligner alignments, then display.
        var syntaxTreeOrdinalAlignedWordPairs = syntaxTreeWordAlignmentModel.GetBestAlignmentAlignedWordPairs(engineParallelTextRow);
        // (Legacy): alignments as ordinal positions - ONLY FOR COMPARISON.
        Console.WriteLine($"Syntax tree Alignment: {string.Join(" ", syntaxTreeOrdinalAlignedWordPairs.Select(a => a.ToString()))}");
        // ALIGNMENTS as source token to target token pairs.
        var syntaxTreeAlignments = engineParallelTextRow.GetAlignedTokenPairs(syntaxTreeOrdinalAlignedWordPairs);

        Console.WriteLine($"Syntax tree Alignment: {string.Join(" ", syntaxTreeAlignments.Select(t => $"{t.SourceToken.TokenId}->{t.TargetToken.TokenId}"))}");
    }
}

Training SMT FastAlign model: 0.00%
Training SMT FastAlign model: 16.67%
Training SMT FastAlign model: 33.33%
Training SMT FastAlign model: 50.00%
Training SMT FastAlign model: 66.67%
Training SMT FastAlign model: 83.33%
Training SMT FastAlign model: 100.00%
Training SMT FastAlign inverted model: 0.00%
Training SMT FastAlign inverted model: 16.67%
Training SMT FastAlign inverted model: 33.33%
Training SMT FastAlign inverted model: 50.00%
Training SMT FastAlign inverted model: 66.67%
Training SMT FastAlign inverted model: 83.33%
Training SMT FastAlign inverted model: 100.00%
1JN 1:1
Source segments tokenIds      : 062001001001001 062001001002001 062001001003001 062001001004001 062001001005001 062001001006001 062001001007001 062001001008001 062001001009001 062001001010001 062001001011001 062001001012001 062001001013001 062001001014001 062001001015001 062001001016001 062001001017001 062001001018001 062001001019001 062001001020001 062001001021001 062001001022001 062001001023001
Source segm

Target trainingText spaced    : ( and the life was revealed , and we have seen , and testify , and declare to you the life , the eternal life , which was with the father , and was revealed to us ) ;
Target tokens tokenIds        : 062001002001001 062001002002001 062001002003001 062001002004001 062001002005001 062001002006001 062001002007001 062001002008001 062001002009001 062001002010001 062001002011001 062001002012001 062001002013001 062001002014001 062001002015001 062001002016001 062001002017001 062001002018001 062001002019001 062001002020001 062001002021001 062001002022001 062001002023001 062001002024001 062001002025001 062001002026001 062001002027001 062001002028001 062001002029001 062001002030001 062001002031001 062001002032001 062001002033001 062001002034001 062001002035001 062001002036001 062001002037001 062001002038001 062001002039001
Target surfaceTexts spaced    : ( and the life was revealed , and we have seen , and testify , and declare to you the life , the eternal life , w

Syntax tree Alignment: 0-1 1-4 3-6 4-8 5-5 6-10 7-9 8-14 9-13 10-17 11-16 13-19 14-18 19-23 20-27 22-29 23-30 24-31 26-33 27-32 28-35 29-36
Syntax tree Alignment: 062001003001001->062001003002001 062001003002001->062001003005001 062001003004001->062001003007001 062001003005001->062001003009001 062001003006001->062001003006001 062001003007001->062001003011001 062001003008001->062001003010001 062001003009001->062001003015001 062001003010001->062001003014001 062001003011001->062001003018001 062001003012001->062001003017001 062001003014001->062001003020001 062001003015001->062001003019001 062001003020001->062001003024001 062001003021001->062001003028001 062001003023001->062001003030001 062001003024001->062001003031001 062001003025001->062001003032001 062001003027001->062001003034001 062001003028001->062001003033001 062001003029001->062001003036001 062001003030001->062001003037001
1JN 1:4
Source segments tokenIds      : 062001004001001 062001004002001 062001004003001 062001004004001 0620010