# SUR to Hebrew: top 30 alignments for 'mo' using FastAlign



## Prerequisites

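Clone the following repositories side by side into the same parent directory, then build both in the Debug configuration; the `#r` directives below load assemblies from their `bin/Debug` folders via relative paths.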

```
git clone https://github.com/Clear-Bible/ClearEngine
git clone https://github.com/russellmorley/machine.git
```

In [1]:
#r "nuget:SIL.Scripture,10.1.0"
#r "nuget:Thot,3.3.5"
#r "../src/ClearBible.Engine/bin/Debug/net7.0/ClearBible.Engine.dll"
#r "../src/ClearBible.Engine.SyntaxTree/bin/Debug/net7.0/ClearBible.Engine.SyntaxTree.dll"
#r "../../machine/src/SIL.Machine/bin/Debug/netstandard2.0/Clear.SIL.Machine.dll"
#r "../../machine/src/SIL.Machine.Translation.Thot/bin/Debug/netstandard2.0/SIL.Machine.Translation.Thot.dll"

using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation;
using SIL.Machine.Translation.Thot;
using static SIL.Machine.Corpora.TokenProcessors;
using SIL.Machine.Utils;

using ClearBible.Engine.Translation;
using ClearBible.Engine.Corpora;
using ClearBible.Engine.SyntaxTree.Corpora;
using ClearBible.Engine.Persistence;
using ClearBible.Engine.Tokenization;

## Prepare corpora

In [2]:
// Source side: the zz_SUR Paratext project, tokenized with LatinWordTokenizer and then
// passed through the IntoTokensTextRowProcessor and SetTrainingBySurfaceLowercase transforms.
// NOTE(review): the transform names suggest each token's training text becomes its
// lowercased surface text; confirm against ClearBible.Engine.
var sourceCorpus = new ParatextTextCorpus("C:\\My Paratext 9 Projects\\zz_SUR");
var transformedSourceCorpus =
    sourceCorpus.Tokenize<LatinWordTokenizer>()
                .Transform<IntoTokensTextRowProcessor>()
                .Transform<SetTrainingBySurfaceLowercase>();

// Target side: syntax-tree file corpus selected with language code H
// (presumably Hebrew, per this notebook's title), with one training-text transform applied.
var syntaxTree = new SyntaxTrees();
var targetCorpus = new SyntaxTreeFileTextCorpus(syntaxTree, FileGetBookIds.LanguageCodeEnum.H);
var transformedTargetCorpus = targetCorpus.Transform<SetTrainingByTrainingLowercase>();

// Verse mappings are built from the two untransformed corpora's versifications,
// then used to align the transformed source rows with the transformed target rows.
var allVerseMappings = EngineParallelTextCorpus.VerseMappingsForAllVerses(
    sourceCorpus.Versification,
    targetCorpus.Versification);

var parallelTextCorpus = transformedSourceCorpus.EngineAlignRows(
    transformedTargetCorpus,
    new SourceTextIdToVerseMappingsFromVerseMappings(allVerseMappings));

## SMT alignment using FastAlign: top 30 target alignment counts for the source training text "mo"

In [3]:
{
    // Train a FastAlign word alignment model on the lowercased parallel corpus,
    // reporting progress to the console, then save the trained model.
    using var smtWordAlignmentModel = new ThotFastAlignWordAlignmentModel();

    using var trainer = smtWordAlignmentModel.CreateTrainer(parallelTextCorpus.Lowercase());
    // NOTE(review): the message says "symmetrized", but only a plain
    // ThotFastAlignWordAlignmentModel is constructed in this cell. The text is left
    // unchanged so it still matches the recorded notebook output.
    trainer.Train(new DelegateProgress(status =>
            Console.WriteLine($"Training symmetrized Fastalign model: {status.PercentCompleted:P}")));
    await trainer.SaveAsync();

    // For every row, take the model's best alignment and expand it into aligned token
    // pairs; keep the pairs whose source training text is "mo", group them by target
    // training text, and count each group exactly once. The 30 largest groups are kept.
    var topTargetCounts = parallelTextCorpus
        .Select(row => (EngineParallelTextRow)row)
        .SelectMany(engineRow => engineRow.GetAlignedTokenPairs(smtWordAlignmentModel.GetBestAlignment(engineRow)))
        .Where(pair => pair.SourceToken.TrainingText.Equals("mo"))
        .GroupBy(pair => pair.TargetToken.TrainingText)
        .Select(group => (TargetTrainingText: group.Key, Count: group.Count()))
        .OrderByDescending(entry => entry.Count)
        .Take(30)
        .ToList();

    foreach (var (targetTrainingText, count) in topTargetCounts)
    {
        Console.WriteLine($"Target.TrainingText: '{targetTrainingText}'; Count: {count}");
    }
}

Training symmetrized Fastalign model: 0.00%
Training symmetrized Fastalign model: 16.67%
Training symmetrized Fastalign model: 33.33%
Training symmetrized Fastalign model: 50.00%
Training symmetrized Fastalign model: 66.67%
Training symmetrized Fastalign model: 83.33%
Training symmetrized Fastalign model: 100.00%
Target.TrainingText: 'הַ'; Count: 3193
Target.TrainingText: 'וְ'; Count: 3118
Target.TrainingText: 'ָמ'; Count: 2581
Target.TrainingText: 'בְּ'; Count: 1080
Target.TrainingText: 'אֵת_1'; Count: 1010
Target.TrainingText: 'לְ'; Count: 903
Target.TrainingText: 'כֹּל'; Count: 657
Target.TrainingText: 'מִן'; Count: 371
Target.TrainingText: 'הֵם'; Count: 321
Target.TrainingText: 'עַל_2'; Count: 270
Target.TrainingText: 'עַם'; Count: 235
Target.TrainingText: 'אֲשֶׁר'; Count: 190
Target.TrainingText: 'אִישׁ'; Count: 148
Target.TrainingText: 'בֵּן_1'; Count: 141
Target.TrainingText: 'אֵלֶּה'; Count: 104
Target.TrainingText: 'ָנ'; Count: 67
Target.TrainingText: 'עִיר_1'; Count: 67
Targe