# Aligning manuscript verses to target

## Prerequisites

Clone the following to the same directory.

```
git clone https://github.com/Clear-Bible/ClearEngine
git clone https://github.com/sillsdev/machine
[copy syntax trees directory ..\test\TestSandbox1\SyntaxTrees\* to .\SyntaxTrees\*]
```

In [1]:
#r "nuget:SIL.Scripture,7.0.0"
#r "nuget:Thot,3.3.5"
#r "../src/ClearBible.Engine/bin/Debug/net6.0/ClearBible.Engine.dll"
#r "../src/ClearBible.Engine.TreeAligner/bin/Debug/net6.0/ClearBible.Engine.TreeAligner.dll"
#r "../../machine/src/SIL.Machine/bin/Debug/netstandard2.0/SIL.Machine.dll"
#r "../../machine/src/SIL.Machine.Translation.Thot/bin/Debug/netstandard2.0/SIL.Machine.Translation.Thot.dll"

using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation;
using SIL.Machine.Translation.Thot;
using static SIL.Machine.Corpora.TokenProcessors;
using SIL.Machine.Utils;

using ClearBible.Engine.Translation;
using ClearBible.Engine.Corpora;
using ClearBible.Engine.Persistence;
using ClearBible.Engine.Tokenization;
using ClearBible.Engine.TreeAligner.Persistence;
using ClearBible.Engine.TreeAligner.Translation;

## Load corpora and build parallel verse segments

In [2]:
// Manuscript syntax trees are read from the local "SyntaxTrees" directory. The same tree object
// is handed to the tree-based word aligner in the later cells.
var manuscriptTree = new ManuscriptFileTree("SyntaxTrees");

// Source side: the manuscript text built on top of the syntax trees, tokenized with
// LatinWordTokenizer and then passed through IntoTokensTextRowProcessor.
var manuscriptText = new ManuscriptFileTextCorpus(manuscriptTree);
var sourceCorpus = manuscriptText
    .Tokenize<LatinWordTokenizer>()
    .Transform<IntoTokensTextRowProcessor>();

// Target side: the Paratext project under "data/WEB-PT", processed with the same
// tokenizer and row processor as the source.
var paratextText = new ParatextTextCorpus("data/WEB-PT");
var targetCorpus = paratextText
    .Tokenize<LatinWordTokenizer>()
    .Transform<IntoTokensTextRowProcessor>();

// Pair up source and target rows; the second argument is a default-constructed options/argument object.
var parallelTextCorpus = sourceCorpus.EngineAlignRows(targetCorpus, new());

// FunctionWordTextRowProcessor is trained on the parallel corpus first, and only then applied
// to the source side.
// NOTE(review): presumably Train() has to run before the Transform below; confirm against the processor's implementation.
FunctionWordTextRowProcessor.Train(parallelTextCorpus);

var functionWordProcessedSource = parallelTextCorpus.SourceCorpus
    .Transform<FunctionWordTextRowProcessor>();
parallelTextCorpus.SourceCorpus = functionWordProcessedSource;

## Approach 1: Train both SMT models at once through the symmetrized model

In [3]:
{
    // Approach 1: the two directional FastAlign models are wrapped in a symmetrized model and
    // trained through a single trainer created from that symmetrized model.
    // A symmetrized model is used for increased accuracy and many-to-many alignments.

    // source->target direction
    using var forwardModel = new ThotFastAlignWordAlignmentModel();

    // target->source direction
    using var inverseModel = new ThotFastAlignWordAlignmentModel();

    // Combine both directions into the symmetrized SMT model.
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(forwardModel, inverseModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // Train on the lowercased parallel corpus, printing progress as it goes, then save.
    using var symmetrizedTrainer = symmetrizedModel.CreateTrainer(parallelTextCorpus.Lowercase());
    var fastAlignProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training Fastalign model: {status.PercentCompleted:P}"));
    symmetrizedTrainer.Train(fastAlignProgress);
    await symmetrizedTrainer.SaveAsync();

    // Load the manuscript tree aligner hyperparameters from "InputCommon", then override a few of them.
    var alignerParams = await FileGetManuscriptTreeAlignerParams
        .Get()
        .SetLocation("InputCommon")
        .GetAsync();
    alignerParams.useAlignModel = true;
    alignerParams.maxPaths = 1000000;
    alignerParams.goodLinkMinCount = 3;
    alignerParams.badLinkMinCount = 3;
    alignerParams.contentWordsOnly = true;

    // Create the manuscript word aligner. Engine's main implementation is specifically a tree-based aligner.
    // NOTE(review): the literal 0 is presumably the index of the SMT model to use from the list; confirm
    // against the ManuscriptTreeWordAligner constructor.
    var smtModels = new List<IWordAlignmentModel> { symmetrizedModel };
    IManuscriptTrainableWordAligner treeAligner = new ManuscriptTreeWordAligner(
        smtModels,
        0,
        alignerParams,
        manuscriptTree);

    // Wrap the aligner in a manuscript word alignment model. At this point it has not yet been trained.
    using var manuscriptModel = new ManuscriptWordAlignmentModel(treeAligner);
    using var manuscriptTrainer = manuscriptModel.CreateTrainer(parallelTextCorpus);

    // Train the manuscript word alignment model using the pre-trained SMT model(s), then save.
    var manuscriptProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training ManuscriptWordAlignmentModel: {status.PercentCompleted:P}"));
    manuscriptTrainer.Train(manuscriptProgress);
    manuscriptTrainer.Save();

    // Renders aligned token pairs as space-separated "sourceTokenId->targetTokenId" entries.
    string FormatTokenIdPairs(IEnumerable<(Token, Token)> pairs) =>
        string.Join(" ", pairs.Select(pair => $"{pair.Item1.TokenId}->{pair.Item2.TokenId}"));

    // Print the first five parallel rows together with both kinds of alignment.
    foreach (EngineParallelTextRow row in parallelTextCorpus.Take(5))
    {
        // Gather the display text for the row before printing anything.
        var verseRef = row.Ref.ToString();
        var sourceText = string.Join(" ", row.SourceSegment);
        var targetText = string.Join(" ", row.TargetSegment);
        Console.WriteLine(verseRef);

        // Source text and token ids ("NONE" when the row carries no source tokens).
        Console.WriteLine($"Source: {sourceText}");
        var sourceIds = row.SourceTokens?.Select(tok => tok.TokenId.ToString());
        Console.WriteLine($"SourceTokenIds: {string.Join(" ", sourceIds ?? new string[] { "NONE" })}");

        // Target text and token ids ("NONE" when the row carries no target tokens).
        Console.WriteLine($"Target: {targetText}");
        var targetIds = row.TargetTokens?.Select(tok => tok.TokenId.ToString());
        Console.WriteLine($"TargetTokenIds: {string.Join(" ", targetIds ?? new string[] { "NONE" })}");

        // SMT alignment: ordinal form from the model, then mapped to token pairs by the row.
        var smtAlignment = symmetrizedModel.GetBestAlignment(row.SourceSegment, row.TargetSegment);
        IEnumerable<(Token, Token)> smtTokenPairs = row.GetAlignedTokenIdPairs(smtAlignment);

        // Manuscript tree aligner alignment: ordinal word pairs, then mapped to token pairs by the row.
        var manuscriptWordPairs = manuscriptModel.GetBestAlignmentAlignedWordPairs(row);
        IEnumerable<(Token, Token)> manuscriptTokenPairs = row.GetAlignedTokenIdPairs(manuscriptWordPairs);

        // SMT alignments, ordinally and by token ids.
        Console.WriteLine($"SMT Alignment        : {smtAlignment}");
        Console.WriteLine($"SMT Alignemnt        : {FormatTokenIdPairs(smtTokenPairs)}");

        // Manuscript alignments, ordinally and by token ids.
        var manuscriptOrdinals = string.Join(" ", manuscriptWordPairs.Select(a => a.ToString()));
        Console.WriteLine($"Manuscript Alignment : {manuscriptOrdinals}");
        Console.WriteLine($"Manuscript Alignemnt : {FormatTokenIdPairs(manuscriptTokenPairs)}");
    }
}

Training Fastalign model: 0.00%
Training Fastalign model: 0.00%
Training Fastalign model: 8.33%
Training Fastalign model: 16.67%
Training Fastalign model: 25.00%
Training Fastalign model: 33.33%
Training Fastalign model: 41.67%
Training Fastalign model: 50.00%
Training Fastalign model: 50.00%
Training Fastalign model: 50.00%
Training Fastalign model: 58.33%
Training Fastalign model: 66.67%
Training Fastalign model: 75.00%
Training Fastalign model: 83.33%
Training Fastalign model: 91.67%
Training Fastalign model: 100.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 100.00%
1JN 1:1
Source: ὅς εἰμί ἀπό ἀρχή ὅς ἀκούω ὅς ὁράω ὁ ὀφθαλμός ἐγώ ὅς θεάομαι καί ὁ χείρ ἐγώ ψηλαφάω περί ὁ λόγος ὁ ζωή
SourceTokenIds: 062001001001001 062001001002001 062001001003001 062001001004001 062001001005001 062001001006001 062001001007001

SMT Alignment        : 0-1 0-2 1-4 2-5 5-5 3-6 4-8 7-9 6-10 9-13 8-14 7-15 11-16 10-17 12-18 13-19 16-20 19-22 15-23 18-24 17-25 21-26 20-27 22-28 23-30 24-31 25-32 27-32
SMT Alignemnt        : 062001003001001->062001003002001 062001003001001->062001003003001 062001003002001->062001003005001 062001003003001->062001003006001 062001003006001->062001003006001 062001003004001->062001003007001 062001003005001->062001003009001 062001003008001->062001003010001 062001003007001->062001003011001 062001003010001->062001003014001 062001003009001->062001003015001 062001003008001->062001003016001 062001003012001->062001003017001 062001003011001->062001003018001 062001003013001->062001003019001 062001003014001->062001003020001 062001003017001->062001003021001 062001003020001->062001003023001 062001003016001->062001003024001 062001003019001->062001003025001 062001003018001->062001003026001 062001003022001->062001003027001 062001003021001->062001003028001 062001003023001->062001003029001 06200100302400

## Approach 2: Train each directional SMT model individually

In [4]:
{
    // Approach 2: each directional FastAlign model gets its own trainer and is trained and saved
    // separately, before the two are combined into a symmetrized model.

    // source->target model, trained on the lowercased parallel corpus.
    using var forwardModel = new ThotFastAlignWordAlignmentModel();
    using var forwardTrainer = forwardModel.CreateTrainer(parallelTextCorpus.Lowercase());
    var forwardProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign model: {status.PercentCompleted:P}"));
    forwardTrainer.Train(forwardProgress);
    forwardTrainer.Save();

    // target->source model, trained on the inverted, lowercased parallel corpus.
    using var inverseModel = new ThotFastAlignWordAlignmentModel();
    using var inverseTrainer = inverseModel.CreateTrainer(parallelTextCorpus.Invert().Lowercase());
    var inverseProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training SMT FastAlign inverted model: {status.PercentCompleted:P}"));
    inverseTrainer.Train(inverseProgress);
    inverseTrainer.Save();

    // Combine the two already-trained directions into the symmetrized SMT model.
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(forwardModel, inverseModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // Load the manuscript tree aligner hyperparameters from "InputCommon", then override a few of them.
    var alignerParams = await FileGetManuscriptTreeAlignerParams
        .Get()
        .SetLocation("InputCommon")
        .GetAsync();
    alignerParams.useAlignModel = true;
    alignerParams.maxPaths = 1000000;
    alignerParams.goodLinkMinCount = 3;
    alignerParams.badLinkMinCount = 3;
    alignerParams.contentWordsOnly = true;

    // Create the manuscript word aligner. Engine's main implementation is specifically a tree-based aligner.
    // NOTE(review): the literal 0 is presumably the index of the SMT model to use from the list; confirm
    // against the ManuscriptTreeWordAligner constructor.
    var smtModels = new List<IWordAlignmentModel> { symmetrizedModel };
    IManuscriptTrainableWordAligner treeAligner = new ManuscriptTreeWordAligner(
        smtModels,
        0,
        alignerParams,
        manuscriptTree);

    // Wrap the aligner in a manuscript word alignment model. At this point it has not yet been trained.
    using var manuscriptModel = new ManuscriptWordAlignmentModel(treeAligner);
    using var manuscriptTrainer = manuscriptModel.CreateTrainer(parallelTextCorpus);

    // Train the manuscript word alignment model using the pre-trained SMT model(s), then save.
    var manuscriptProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training ManuscriptWordAlignmentModel: {status.PercentCompleted:P}"));
    manuscriptTrainer.Train(manuscriptProgress);
    manuscriptTrainer.Save();

    // Renders aligned token pairs as space-separated "sourceTokenId->targetTokenId" entries.
    string FormatTokenIdPairs(IEnumerable<(Token, Token)> pairs) =>
        string.Join(" ", pairs.Select(pair => $"{pair.Item1.TokenId}->{pair.Item2.TokenId}"));

    // Print the first five parallel rows together with both kinds of alignment.
    foreach (EngineParallelTextRow row in parallelTextCorpus.Take(5))
    {
        // Gather the display text for the row before printing anything.
        var verseRef = row.Ref.ToString();
        var sourceText = string.Join(" ", row.SourceSegment);
        var targetText = string.Join(" ", row.TargetSegment);
        Console.WriteLine(verseRef);

        // Source text and token ids ("NONE" when the row carries no source tokens).
        Console.WriteLine($"Source: {sourceText}");
        var sourceIds = row.SourceTokens?.Select(tok => tok.TokenId.ToString());
        Console.WriteLine($"SourceTokenIds: {string.Join(" ", sourceIds ?? new string[] { "NONE" })}");

        // Target text and token ids ("NONE" when the row carries no target tokens).
        Console.WriteLine($"Target: {targetText}");
        var targetIds = row.TargetTokens?.Select(tok => tok.TokenId.ToString());
        Console.WriteLine($"TargetTokenIds: {string.Join(" ", targetIds ?? new string[] { "NONE" })}");

        // SMT alignment: ordinal form from the model, then mapped to token pairs by the row.
        var smtAlignment = symmetrizedModel.GetBestAlignment(row.SourceSegment, row.TargetSegment);
        IEnumerable<(Token, Token)> smtTokenPairs = row.GetAlignedTokenIdPairs(smtAlignment);

        // Manuscript tree aligner alignment: ordinal word pairs, then mapped to token pairs by the row.
        var manuscriptWordPairs = manuscriptModel.GetBestAlignmentAlignedWordPairs(row);
        IEnumerable<(Token, Token)> manuscriptTokenPairs = row.GetAlignedTokenIdPairs(manuscriptWordPairs);

        // SMT alignments, ordinally and by token ids.
        Console.WriteLine($"SMT Alignment        : {smtAlignment}");
        Console.WriteLine($"SMT Alignemnt        : {FormatTokenIdPairs(smtTokenPairs)}");

        // Manuscript alignments, ordinally and by token ids.
        var manuscriptOrdinals = string.Join(" ", manuscriptWordPairs.Select(a => a.ToString()));
        Console.WriteLine($"Manuscript Alignment : {manuscriptOrdinals}");
        Console.WriteLine($"Manuscript Alignemnt : {FormatTokenIdPairs(manuscriptTokenPairs)}");
    }
}

Training SMT FastAlign model: 0.00%
Training SMT FastAlign model: 16.67%
Training SMT FastAlign model: 33.33%
Training SMT FastAlign model: 50.00%
Training SMT FastAlign model: 66.67%
Training SMT FastAlign model: 83.33%
Training SMT FastAlign model: 100.00%
Training SMT FastAlign inverted model: 0.00%
Training SMT FastAlign inverted model: 16.67%
Training SMT FastAlign inverted model: 33.33%
Training SMT FastAlign inverted model: 50.00%
Training SMT FastAlign inverted model: 66.67%
Training SMT FastAlign inverted model: 83.33%
Training SMT FastAlign inverted model: 100.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 0.00%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 50.00%
Training ManuscriptWordAlignmentModel: 100.00%
1JN 1:1
Source: ὅς εἰμί ἀπό ἀρχή ὅς ἀκούω ὅς ὁράω ὁ ὀφθαλμός ἐγώ ὅς θεάομαι καί ὁ χείρ ἐγώ ψηλαφάω περί ὁ λόγος ὁ ζωή
SourceTokenIds: 062001001001001 062001001002001 062001001003001 06200100

TargetTokenIds: 062001003001001 062001003002001 062001003003001 062001003004001 062001003005001 062001003006001 062001003007001 062001003008001 062001003009001 062001003010001 062001003011001 062001003012001 062001003013001 062001003014001 062001003015001 062001003016001 062001003017001 062001003018001 062001003019001 062001003020001 062001003021001 062001003022001 062001003023001 062001003024001 062001003025001 062001003026001 062001003027001 062001003028001 062001003029001 062001003030001 062001003031001 062001003032001 062001003033001 062001003034001 062001003035001 062001003036001 062001003037001 062001003038001
SMT Alignment        : 0-1 0-2 1-4 2-5 5-5 3-6 4-8 7-9 6-10 9-13 8-14 7-15 11-16 10-17 12-18 13-19 16-20 19-22 15-23 18-24 17-25 21-26 20-27 22-28 23-30 24-31 25-32 27-32
SMT Alignemnt        : 062001003001001->062001003002001 062001003001001->062001003003001 062001003002001->062001003005001 062001003003001->062001003006001 062001003006001->062001003006001 062001003004001->