# Aligning non-manuscript source to target using Engine versification



In [1]:
#r "nuget:SIL.Scripture,7.0.0"
#r "nuget:Thot,3.3.5"
#r "../src/ClearBible.Engine/bin/Debug/net6.0/ClearBible.Engine.dll"
#r "../../machine/src/SIL.Machine/bin/Debug/netstandard2.0/SIL.Machine.dll"
#r "../../machine/src/SIL.Machine.Translation.Thot/bin/Debug/netstandard2.0/SIL.Machine.Translation.Thot.dll"

using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation;
using SIL.Machine.Translation.Thot;
using static SIL.Machine.Corpora.TokenProcessors;
using SIL.Machine.Utils;

using ClearBible.Engine.Translation;
using ClearBible.Engine.Corpora;
using ClearBible.Engine.Persistence;
using ClearBible.Engine.Tokenization;

## Prepare corpora

In [2]:
var sourceCorpus = new ParatextTextCorpus("data/VBL-PT")
    .Tokenize<LatinWordTokenizer>()
    .Transform<IntoTokensTextRowProcessor>();

var targetCorpus = new ParatextTextCorpus("data/WEB-PT")
    .Tokenize<LatinWordTokenizer>()
    .Transform<IntoTokensTextRowProcessor>();

var parallelTextCorpus = sourceCorpus.EngineAlignRows(targetCorpus, new());

## Approach 1: save model then load it to get alignments

In [3]:
using System.IO;

Directory.CreateDirectory("out/VBL-WEB-FASTALIGN");
{
    using var srcTrgTrainer = new ThotWordAlignmentModelTrainer(ThotWordAlignmentModelType.FastAlign, parallelTextCorpus.Lowercase(),
        "out/VBL-WEB-FASTALIGN/src_trg");

    using var trgSrcTrainer = new ThotWordAlignmentModelTrainer(ThotWordAlignmentModelType.FastAlign,
        parallelTextCorpus.Invert().Lowercase(), "out/VBL-WEB-FASTALIGN/trg_src");

    using var symmetrizedTrainer = new SymmetrizedWordAlignmentModelTrainer(srcTrgTrainer, trgSrcTrainer);

    symmetrizedTrainer.Train(new DelegateProgress(status =>
        Console.WriteLine($"{status.Message}: {status.PercentCompleted:P}")));
    symmetrizedTrainer.Save();
 
    Console.WriteLine("Symmetrized FastAlign model saved");
}

Training direct alignment model: 0.00%
Training direct alignment model: 0.00%
Training direct alignment model: 8.33%
Training direct alignment model: 16.67%
Training direct alignment model: 25.00%
Training direct alignment model: 33.33%
Training direct alignment model: 41.67%
Training direct alignment model: 50.00%
Training inverse alignment model: 50.00%
Training inverse alignment model: 50.00%
Training inverse alignment model: 58.33%
Training inverse alignment model: 66.67%
Training inverse alignment model: 75.00%
Training inverse alignment model: 83.33%
Training inverse alignment model: 91.67%
Training inverse alignment model: 100.00%
Symmetrized FastAlign model saved


In [4]:
{
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel("out/VBL-WEB-FASTALIGN/src_trg");
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel("out/VBL-WEB-FASTALIGN/trg_src");
    using var smtWordAlignmentModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    foreach (EngineParallelTextRow engineParallelTextRow in parallelTextCorpus.Take(5))
    {
        //Display corpora
        var verseRefStr = engineParallelTextRow.Ref.ToString();
        var sourceVerseText = string.Join(" ", engineParallelTextRow.SourceSegment);
        var targetVerseText = string.Join(" ", engineParallelTextRow.TargetSegment);
        Console.WriteLine(verseRefStr);

        //source
        Console.WriteLine($"Source: {sourceVerseText}");
        var sourceTokenIds = string.Join(" ", engineParallelTextRow.SourceTokens?
            .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
        Console.WriteLine($"SourceTokenIds: {sourceTokenIds}");

        //target
        Console.WriteLine($"Target: {targetVerseText}");
        var targetTokenIds = string.Join(" ", engineParallelTextRow.TargetTokens?
            .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
        Console.WriteLine($"TargetTokenIds: {targetTokenIds}");

        //get smt alignments
        var smtOrdinalAlignments = smtWordAlignmentModel.GetBestAlignment(engineParallelTextRow.SourceSegment, engineParallelTextRow.TargetSegment);
        IEnumerable<(Token, Token)> smtSourceTargetTokenIdPairs = engineParallelTextRow.GetAlignedTokenIdPairs(smtOrdinalAlignments);

        //display smt alignments ordinally and by tokenIds
        Console.WriteLine($"SMT Alignment        : {smtOrdinalAlignments}");
        Console.WriteLine($"SMT Alignemnt        : {string.Join(" ", smtSourceTargetTokenIdPairs.Select(t => $"{t.Item1.TokenId}->{t.Item2.TokenId}"))}");
    }
}

1JN 1:1
Source: Esta carta trata sobre la Palabra de vida que existía desde el principio , que hemos escuchado , que hemos visto con nuestros propios ojos y le hemos contemplado , y que hemos tocado con nuestras manos .
SourceTokenIds: 062001001001001 062001001002001 062001001003001 062001001004001 062001001005001 062001001006001 062001001007001 062001001008001 062001001009001 062001001010001 062001001011001 062001001012001 062001001013001 062001001014001 062001001015001 062001001016001 062001001017001 062001001018001 062001001019001 062001001020001 062001001021001 062001001022001 062001001023001 062001001024001 062001001025001 062001001026001 062001001027001 062001001028001 062001001029001 062001001030001 062001001031001 062001001032001 062001001033001 062001001034001 062001001035001 062001001036001 062001001037001 062001001038001
Target: ( and the life was revealed , and we have seen , and testify , and declare to you the life , the eternal life , which was with the Father , and was 

1JN 1:4
Source: Escribimos para decirles esto , a fin de que nuestra felicidad sea completa .
SourceTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001
Target: And we write these things to you , that our joy may be fulfilled .
TargetTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001 062001004015001
SMT Alignment        : 1-1 2-2 3-3 3-4 5-5 5-6 6-6 4-7 7-7 8-8 8-9 9-10 10-10 10-11 11-12 12-13 13-14
SMT Alignemnt        : 062001004002001->062001004002001 062001004003001->062001004003001 062001004004001->062001004004001 062001004004001->062001004005001 062001004006001->062001004006001 062001004006001->062001004007001 062001004007

## Approach 2: don't save model and get alignments

In [5]:
{
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();
    //using var trainerSrcTrg = srcTrgModel.CreateTrainer(parallelTextCorpus, TokenProcessors.Lowercase, TokenProcessors.Lowercase);

    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();
    //using var trainerTrgSrc = trgSrcModel.CreateTrainer(parallelTextCorpus.Invert(), TokenProcessors.Lowercase, TokenProcessors.Lowercase);

    
    using var smtWordAlignmentModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };
    using var smtWordAlignmentModeltrainer = smtWordAlignmentModel.CreateTrainer(parallelTextCorpus.Lowercase());
    smtWordAlignmentModeltrainer.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training symmetrized fast align model: {status.PercentCompleted:P}")));
    smtWordAlignmentModeltrainer.Save();
    
    foreach (EngineParallelTextRow engineParallelTextRow in parallelTextCorpus.Take(5))
    {
        //Display corpora
        var verseRefStr = engineParallelTextRow.Ref.ToString();
        var sourceVerseText = string.Join(" ", engineParallelTextRow.SourceSegment);
        var targetVerseText = string.Join(" ", engineParallelTextRow.TargetSegment);
        Console.WriteLine(verseRefStr);

        //source
        Console.WriteLine($"Source: {sourceVerseText}");
        var sourceTokenIds = string.Join(" ", engineParallelTextRow.SourceTokens?
            .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
        Console.WriteLine($"SourceTokenIds: {sourceTokenIds}");

        //target
        Console.WriteLine($"Target: {targetVerseText}");
        var targetTokenIds = string.Join(" ", engineParallelTextRow.TargetTokens?
            .Select(token => token.TokenId.ToString()) ?? new string[] { "NONE" });
        Console.WriteLine($"TargetTokenIds: {targetTokenIds}");

        //get smt alignments
        var smtOrdinalAlignments = smtWordAlignmentModel.GetBestAlignment(engineParallelTextRow.SourceSegment, engineParallelTextRow.TargetSegment);
        IEnumerable<(Token, Token)> smtSourceTargetTokenIdPairs = engineParallelTextRow.GetAlignedTokenIdPairs(smtOrdinalAlignments);

        //display smt alignments ordinally and by tokenIds
        Console.WriteLine($"SMT Alignment        : {smtOrdinalAlignments}");
        Console.WriteLine($"SMT Alignemnt        : {string.Join(" ", smtSourceTargetTokenIdPairs.Select(t => $"{t.Item1.TokenId}->{t.Item2.TokenId}"))}");
    }
}


Training symmetrized fast align model: 0.00%
Training symmetrized fast align model: 0.00%
Training symmetrized fast align model: 8.33%
Training symmetrized fast align model: 16.67%
Training symmetrized fast align model: 25.00%
Training symmetrized fast align model: 33.33%
Training symmetrized fast align model: 41.67%
Training symmetrized fast align model: 50.00%
Training symmetrized fast align model: 50.00%
Training symmetrized fast align model: 50.00%
Training symmetrized fast align model: 58.33%
Training symmetrized fast align model: 66.67%
Training symmetrized fast align model: 75.00%
Training symmetrized fast align model: 83.33%
Training symmetrized fast align model: 91.67%
Training symmetrized fast align model: 100.00%
1JN 1:1
Source: Esta carta trata sobre la Palabra de vida que existía desde el principio , que hemos escuchado , que hemos visto con nuestros propios ojos y le hemos contemplado , y que hemos tocado con nuestras manos .
SourceTokenIds: 062001001001001 06200100100200

1JN 1:4
Source: Escribimos para decirles esto , a fin de que nuestra felicidad sea completa .
SourceTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001
Target: And we write these things to you , that our joy may be fulfilled .
TargetTokenIds: 062001004001001 062001004002001 062001004003001 062001004004001 062001004005001 062001004006001 062001004007001 062001004008001 062001004009001 062001004010001 062001004011001 062001004012001 062001004013001 062001004014001 062001004015001
SMT Alignment        : 1-1 2-2 3-3 3-4 5-5 5-6 6-6 4-7 7-7 8-8 8-9 9-10 10-10 10-11 11-12 12-13 13-14
SMT Alignemnt        : 062001004002001->062001004002001 062001004003001->062001004003001 062001004004001->062001004004001 062001004004001->062001004005001 062001004006001->062001004006001 062001004006001->062001004007001 062001004007