# Aligning manuscript verses to target

## Prerequisites

Clone the following to the same directory.

```
git clone https://github.com/Clear-Bible/ClearEngine
git clone https://github.com/sillsdev/machine
[copy syntax trees directory ..\test\TestSandbox1\SyntaxTrees\* to .\SyntaxTrees\*]
```

In [1]:
#r "nuget:SIL.Scripture,7.0.0"
#r "nuget:Thot,3.3.5"
#r "../src/ClearBible.Engine/bin/Debug/net6.0/ClearBible.Engine.dll"
#r "../src/ClearBible.Engine.TreeAligner/bin/Debug/net6.0/ClearBible.Engine.TreeAligner.dll"
#r "../../machine/src/SIL.Machine/bin/Debug/netstandard2.0/SIL.Machine.dll"
#r "../../machine/src/SIL.Machine.Translation.Thot/bin/Debug/netstandard2.0/SIL.Machine.Translation.Thot.dll"

using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation;
using SIL.Machine.Translation.Thot;
using static SIL.Machine.Corpora.TokenProcessors;
using SIL.Machine.Utils;

using ClearBible.Engine.Translation;
using ClearBible.Engine.Corpora;
using ClearBible.Engine.Persistence;
using ClearBible.Engine.Tokenization;
using ClearBible.Engine.TreeAligner.Persistence;
using ClearBible.Engine.TreeAligner.Translation;

## Load corpora and build parallel verse segments

In [2]:
var manuscriptText = new ManuscriptFileTree("SyntaxTrees");
var sourceTextCorpus = new EngineManuscriptFileTextCorpus(manuscriptText);

var tokenizer = new LatinWordTokenizer();
var targetTextCorpus = new EngineParatextTextCorpus(tokenizer, "data/WEB-PT");

// formulate a parallel corpus based on versification. If Engine versification (sourceTargetParallelVersesList parameter) not provided,
// build it using SIL Scripture / Machine versification.
var parallelTextCorpus = new EngineParallelTextCorpus(sourceTextCorpus, targetTextCorpus);

## Approach 1: train smt model(s) at once.

In [3]:
{
    // Create the source->target SMT model
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();

    // Create the target->source SMT model
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();
     
    //put the source->target and target->source models into the symmetrized SMT model
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // obtain the manuscript word aligner configuration
    var manuscriptTreeAlignmentConfig = await FileGetManuscriptTreeAligmentConfig.Get().SetLocation("InputCommon").GetAsync();
    
    // create the manuscript word aligner. Engine's main implementation is specifically a tree-based aligner.
    IManuscriptTrainableWordAligner manuscriptTrainableWordAligner = new ManuscriptTreeWordAlginer(
        symmetrizedModel,
        manuscriptTreeAlignmentConfig);

    // initialize a manuscript word alignment model. At this point it has not yet been trained.
    using var manuscriptModel = new ManuscriptWordAlignmentModel(manuscriptTrainableWordAligner);
    using var manuscriptTrainer = manuscriptModel.CreateTrainer(parallelTextCorpus, targetPreprocessor: TokenProcessors.Lowercase);

    // train the source->target and target->source models within the symmetrized word alignment model.
    // NOTE: this only needs to be done if they aren't trained already, otherwise this step can be skipped.
    manuscriptTrainer.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training ManuscriptWordAlignmentModel: {status.PercentCompleted:P}")));
    manuscriptTrainer.Save();

    // now iterate through the best alignments in the model.
    foreach (ParallelTextSegment textSegment in parallelTextCorpus.GetSegments().Take(5))
    {
        var alignment = manuscriptModel.GetBestAlignment(TokenProcessors.Lowercase.Process(textSegment.SourceSegment),
            TokenProcessors.Lowercase.Process(textSegment.TargetSegment));

        var verseRefStr = textSegment.SegmentRef.ToString();
        var sourceVerseText = string.Join(" ", textSegment.SourceSegment);
        var targetVerseText = string.Join(" ", textSegment.TargetSegment);
        Console.WriteLine(verseRefStr);
        Console.WriteLine($"Source: {sourceVerseText}");
        Console.WriteLine($"Target: {targetVerseText}");
        Console.WriteLine($"Alignment: {alignment}");


        if (textSegment != null && textSegment is EngineParallelTextSegment)
        {
            var sourceTokenIds = string.Join(" ", ((EngineParallelTextSegment)textSegment).SourceTokens?
                .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
            Console.WriteLine($"SourceTokenIds: {sourceTokenIds}");

            var targetTokenIds = string.Join(" ", ((EngineParallelTextSegment)textSegment).TargetTokens?
                .Select(token => token.TokenId.ToString()) ?? new string[] {"NONE"});
            Console.WriteLine($"TargetTokenIds: {targetTokenIds}");

            IEnumerable<(Token, Token)> sourceTargetTokenIdPairs = ((EngineParallelTextSegment)textSegment).GetAlignedTokenIdPairs(alignment);
            var alignments = string.Join(" ", sourceTargetTokenIdPairs.Select(t => $"{t.Item1.TokenId}->{t.Item2.TokenId}"));
            Console.WriteLine($"SourceTokenId->TargetTokenId: {alignments}");
        }
    }
}

Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 4.17%
Training ManuscriptWordAlignmentModel: 8.33%
Training ManuscriptWordAlignmentModel: 12.50%
Training ManuscriptWordAlignmentModel: 16.67%
Training ManuscriptWordAlignmentModel: 20.83%
Training ManuscriptWordAlignmentModel: 25.00%
Training ManuscriptWordAlignmentModel: 25.00%
Training ManuscriptWordAlignmentModel: 25.00%
Training ManuscriptWordAlignmentModel: 29.17%
Training ManuscriptWordAlignmentModel: 33.33%
Training ManuscriptWordAlignmentModel: 37.50%
Training ManuscriptWordAlignmentModel: 41.67%
Training ManuscriptWordAlignmentModel: 45.83%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 100.00%
1JN 1:1
Source: ὅς εἰμί ἀπό ἀρχή ὅς ἀκούω ὅς ὁράω ὁ ὀφθαλμός ἐγώ ὅς θεάομαι καί ὁ χείρ ἐγώ ψηλαφάω περί ὁ λόγος ὁ ζωή
Target: That

1JN 1:4
Source: καί οὗτος γράφω ἐγώ ἵνα ὁ χαρά ἐγώ εἰμί πληρόω
Target: And we write these things to you , that our joy may be fulfilled .
Alignment: 0-0 1-1 2-2 1-3 2-4 2-5 3-5 2-6 4-8 5-9 7-9 6-10 6-11 8-12 9-13 8-14
SourceTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001
TargetTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001 062001004015001
SourceTokenId->TargetTokenId: 062001004001001->062001004001001 062001004002001->062001004002001 062001004003001->062001004003001 062001004002001->062001004004001 062001004003001->062001004005001 062001004003001->062001004006001 062001004004001->062001004006001 062001004003001->062001004007001 062001004005001->062001004009001 062001004006001->062001

To get book, chapter, verse, word, subword numbering (BBBCCCVVVWWWSSS) for corpora and alignments:

## Approach 2: train smt models individually.

In [4]:
{
    // Create the source->target SMT model, train it.
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();
    using var trainerSrcTrg = srcTrgModel.CreateTrainer(parallelTextCorpus,
                                                        targetPreprocessor: TokenProcessors.Lowercase);
    trainerSrcTrg.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign model: {status.PercentCompleted:P}")));
    trainerSrcTrg.Save();

    // Create the target->source SMT model, train it.
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();
    using var trainerTrgSrc = trgSrcModel.CreateTrainer(parallelTextCorpus.Invert(),
                                                        targetPreprocessor: TokenProcessors.Lowercase);
    trainerTrgSrc.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign inverted model: {status.PercentCompleted:P}")));
    trainerTrgSrc.Save();
    
    //put the source->target and target->source models into the symmetrized SMT model
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // obtain the manuscript word aligner configuration
    var manuscriptTreeAlignmentConfig = await FileGetManuscriptTreeAligmentConfig.Get().SetLocation("InputCommon").GetAsync();
    
    // create the manuscript word aligner. Engine's main implementation is specifically a tree-based aligner.
    IManuscriptTrainableWordAligner manuscriptTrainableWordAligner = new ManuscriptTreeWordAlginer(
        symmetrizedModel,
        manuscriptTreeAlignmentConfig);

    // initialize a manuscript word alignment model. At this point it has already been trained.
    using var manuscriptModel = new ManuscriptWordAlignmentModel(manuscriptTrainableWordAligner);

    // now iterate through the best alignments in the model.
    foreach (ParallelTextSegment textSegment in parallelTextCorpus.GetSegments().Take(5))
    {
        var alignment = manuscriptModel.GetBestAlignment(TokenProcessors.Lowercase.Process(textSegment.SourceSegment),
            TokenProcessors.Lowercase.Process(textSegment.TargetSegment));

        var verseRefStr = textSegment.SegmentRef.ToString();
        var sourceVerseText = string.Join(" ", textSegment.SourceSegment);
        var targetVerseText = string.Join(" ", textSegment.TargetSegment);
        Console.WriteLine(verseRefStr);
        Console.WriteLine($"Source: {sourceVerseText}");
        Console.WriteLine($"Target: {targetVerseText}");
        Console.WriteLine($"Alignment: {alignment}");


        if (textSegment != null && textSegment is EngineParallelTextSegment)
        {
            var sourceTokenIds = string.Join(" ", ((EngineParallelTextSegment)textSegment).SourceTokens?
                .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
            Console.WriteLine($"SourceTokenIds: {sourceTokenIds}");

            var targetTokenIds = string.Join(" ", ((EngineParallelTextSegment)textSegment).TargetTokens?
                .Select(token => token.TokenId.ToString()) ?? new string[] {"NONE"});
            Console.WriteLine($"TargetTokenIds: {targetTokenIds}");

            IEnumerable<(Token, Token)> sourceTargetTokenIdPairs = ((EngineParallelTextSegment)textSegment).GetAlignedTokenIdPairs(alignment);
            var alignments = string.Join(" ", sourceTargetTokenIdPairs.Select(t => $"{t.Item1.TokenId}->{t.Item2.TokenId}"));
            Console.WriteLine($"SourceTokenId->TargetTokenId: {alignments}");
        }
    }
}

Training SMT FastAlign model: 0.00%
Training SMT FastAlign model: 16.67%
Training SMT FastAlign model: 33.33%
Training SMT FastAlign model: 50.00%
Training SMT FastAlign model: 66.67%
Training SMT FastAlign model: 83.33%
Training SMT FastAlign model: 100.00%
Training SMT FastAlign inverted model: 0.00%
Training SMT FastAlign inverted model: 16.67%
Training SMT FastAlign inverted model: 33.33%
Training SMT FastAlign inverted model: 50.00%
Training SMT FastAlign inverted model: 66.67%
Training SMT FastAlign inverted model: 83.33%
Training SMT FastAlign inverted model: 100.00%
1JN 1:1
Source: ὅς εἰμί ἀπό ἀρχή ὅς ἀκούω ὅς ὁράω ὁ ὀφθαλμός ἐγώ ὅς θεάομαι καί ὁ χείρ ἐγώ ψηλαφάω περί ὁ λόγος ὁ ζωή
Target: That which was from the beginning , that which we have heard , that which we have seen with our eyes , that which we saw , and our hands touched , concerning the Word of life
Alignment: 0-0 0-1 8-2 2-3 1-4 3-5 4-7 4-8 6-9 6-10 5-11 11-14 10-15 11-16 12-17 13-18 10-19 9-20 15-25 16-28 17-29 17

1JN 1:4
Source: καί οὗτος γράφω ἐγώ ἵνα ὁ χαρά ἐγώ εἰμί πληρόω
Target: And we write these things to you , that our joy may be fulfilled .
Alignment: 0-0 1-1 2-2 2-3 2-4 2-5 3-5 2-6 4-8 5-9 7-9 6-10 6-11 8-12 9-13 8-14
SourceTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001
TargetTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001 062001004015001
SourceTokenId->TargetTokenId: 062001004001001->062001004001001 062001004002001->062001004002001 062001004003001->062001004003001 062001004003001->062001004004001 062001004003001->062001004005001 062001004003001->062001004006001 062001004004001->062001004006001 062001004003001->062001004007001 062001004005001->062001004009001 062001004006001->062001