# Aligning non-manuscript source to target using Engine versification



In [1]:
#r "nuget:SIL.Machine,2.5.11"
#r "nuget:SIL.Machine.Translation.Thot,2.5.11"
#r "../src/ClearBible.Engine/bin/Debug/net6.0/ClearBible.Engine.dll"

using ClearBible.Engine.Utils;
using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation.Thot;
using SIL.Machine.Utils;
using SIL.Machine.Translation;

## Prepare corpora

In [2]:
// A single Latin-script word tokenizer instance is handed to both corpora.
LatinWordTokenizer tokenizer = new LatinWordTokenizer();

// Source side is read from "data/VBL-PT", target side from "data/WEB-PT".
EngineParatextTextCorpus sourceCorpus = new EngineParatextTextCorpus(tokenizer, "data/VBL-PT");
EngineParatextTextCorpus targetCorpus = new EngineParatextTextCorpus(tokenizer, "data/WEB-PT");

// Pair the two corpora; later cells train on and iterate over this object.
EngineParallelTextCorpus parallelCorpus = new EngineParallelTextCorpus(sourceCorpus, targetCorpus);

When no Engine versification is supplied, `EngineParallelTextCorpus` generates one based on Machine / SIL's
versification. The generated mapping can be retrieved as follows, and then modified if needed:

In [None]:
// Read the source-to-target verse mapping exposed by the parallel corpus built above.
// NOTE(review): the property's element type is not visible in this notebook — inspect it before editing entries.
var engineVersificationMapping = parallelCorpus.SourceTargetParallelVersesList;


## Approach 1: save model then load it to get alignments

In [3]:
using System.IO;

{
    // All paths used by this cell, named once. The two model paths sit under the output directory.
    const string outputDirectory = "out/VBL-WEB-FASTALIGN";
    const string directModelPath = "out/VBL-WEB-FASTALIGN/src_trg";
    const string inverseModelPath = "out/VBL-WEB-FASTALIGN/trg_src";

    // Make sure the output directory exists before any trainer is constructed.
    Directory.CreateDirectory(outputDirectory);

    // Direct trainer: FastAlign over the parallel corpus as-is, lowercasing both sides.
    using var directTrainer = new ThotWordAlignmentModelTrainer(
        ThotWordAlignmentModelType.FastAlign,
        parallelCorpus,
        directModelPath,
        sourcePreprocessor: TokenProcessors.Lowercase,
        targetPreprocessor: TokenProcessors.Lowercase);

    // Inverse trainer: same settings, but over the inverted corpus.
    using var inverseTrainer = new ThotWordAlignmentModelTrainer(
        ThotWordAlignmentModelType.FastAlign,
        parallelCorpus.Invert(),
        inverseModelPath,
        sourcePreprocessor: TokenProcessors.Lowercase,
        targetPreprocessor: TokenProcessors.Lowercase);

    // The symmetrized trainer drives both directional trainers.
    using var combinedTrainer = new SymmetrizedWordAlignmentModelTrainer(directTrainer, inverseTrainer);

    // Echo every progress update as "<message>: <percent>".
    var progressReporter = new DelegateProgress(status =>
        Console.WriteLine($"{status.Message}: {status.PercentCompleted:P}"));

    combinedTrainer.Train(progressReporter);
    combinedTrainer.Save();
    Console.WriteLine("Symmetrized FastAlign model saved");
}

Training direct alignment model: 0.00%
Training direct alignment model: 0.00%
Training direct alignment model: 8.33%
Training direct alignment model: 16.67%
Training direct alignment model: 25.00%
Training direct alignment model: 33.33%
Training direct alignment model: 41.67%
Training direct alignment model: 50.00%
Training inverse alignment model: 50.00%
Training inverse alignment model: 50.00%
Training inverse alignment model: 58.33%
Training inverse alignment model: 66.67%
Training inverse alignment model: 75.00%
Training inverse alignment model: 83.33%
Training inverse alignment model: 91.67%
Training inverse alignment model: 100.00%
Symmetrized FastAlign model saved


In [4]:
{
    // Re-open the two directional FastAlign models from the paths used by the training cell.
    using var directModel = new ThotFastAlignWordAlignmentModel("out/VBL-WEB-FASTALIGN/src_trg");
    using var inverseModel = new ThotFastAlignWordAlignmentModel("out/VBL-WEB-FASTALIGN/trg_src");

    // Combine both directions; the grow-diag-final-and heuristic is selected for symmetrization.
    using var combinedModel = new SymmetrizedWordAlignmentModel(directModel, inverseModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // Tokens are lowercased before alignment, matching the preprocessors used at training time.
    var lowercase = TokenProcessors.Lowercase;

    // Only the first five segments are aligned and printed.
    foreach (ParallelTextSegment segment in parallelCorpus.GetSegments().Take(5))
    {
        var loweredSource = lowercase.Process(segment.SourceSegment);
        var loweredTarget = lowercase.Process(segment.TargetSegment);
        var bestAlignment = combinedModel.GetBestAlignment(loweredSource, loweredTarget);

        // The printed text uses the original (not lowercased) tokens, joined by single spaces.
        var reference = segment.SegmentRef.ToString();
        var sourceLine = string.Join(" ", segment.SourceSegment);
        var targetLine = string.Join(" ", segment.TargetSegment);

        Console.WriteLine(reference);
        Console.WriteLine($"Source: {sourceLine}");
        Console.WriteLine($"Target: {targetLine}");
        Console.WriteLine($"Alignment: {bestAlignment}");
    }
}

1JN 1:1
Source: Esta carta trata sobre la Palabra de vida que existía desde el principio , que hemos escuchado , que hemos visto con nuestros propios ojos y le hemos contemplado , y que hemos tocado con nuestras manos .
Target: That which was from the beginning , that which we have heard , that which we have seen with our eyes , that which we saw , and our hands touched , concerning the Word of life
Alignment: 0-0 2-1 4-4 12-5 6-6 8-7 9-8 10-8 15-9 15-10 16-11 13-12 14-13 15-14 15-15 15-16 20-17 21-18 22-19 19-20 18-22 27-23 26-24 27-25 28-26 29-26 30-27 31-28 35-28 32-29 32-30 33-31 37-32 5-34 7-36
1JN 1:2
Source: Esta Vida nos fue revelada . La vimos y damos testimonio de ella . Estamos hablándoles de Aquél que es la Vida Eterna , que estaba con el Padre , y que nos fue revelado .
Target: ( and the life was revealed , and we have seen , and testify , and declare to you the life , the eternal life , which was with the Father , and was revealed to us ) ;
Alignment: 0-2 1-3 3-4 3-5 6-6 

## Approach 2: don't save model and get alignments

In [5]:
{
    // Tokens are lowercased both for training and for the alignment queries below.
    var lowercase = TokenProcessors.Lowercase;

    // Direct model: constructed without a path and trained on the parallel corpus as-is.
    using var directModel = new ThotFastAlignWordAlignmentModel();
    using var directTrainer = directModel.CreateTrainer(parallelCorpus, lowercase, lowercase);
    var directProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training FastAlign model: {status.PercentCompleted:P}"));
    directTrainer.Train(directProgress);
    // NOTE(review): Save() is called although no file path is given anywhere in this cell —
    // presumably it commits the trained parameters into directModel; confirm against SIL.Machine.
    directTrainer.Save();

    // Inverse model: same steps over the inverted corpus.
    using var inverseModel = new ThotFastAlignWordAlignmentModel();
    using var inverseTrainer = inverseModel.CreateTrainer(parallelCorpus.Invert(), lowercase, lowercase);
    var inverseProgress = new DelegateProgress(status =>
        Console.WriteLine($"Training FastAlign model: {status.PercentCompleted:P}"));
    inverseTrainer.Train(inverseProgress);
    inverseTrainer.Save();

    // Combine both directions; the grow-diag-final-and heuristic is selected for symmetrization.
    using var combinedModel = new SymmetrizedWordAlignmentModel(directModel, inverseModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };

    // Only the first five segments are aligned and printed.
    foreach (ParallelTextSegment segment in parallelCorpus.GetSegments().Take(5))
    {
        var loweredSource = lowercase.Process(segment.SourceSegment);
        var loweredTarget = lowercase.Process(segment.TargetSegment);
        var bestAlignment = combinedModel.GetBestAlignment(loweredSource, loweredTarget);

        // The printed text uses the original (not lowercased) tokens, joined by single spaces.
        var reference = segment.SegmentRef.ToString();
        var sourceLine = string.Join(" ", segment.SourceSegment);
        var targetLine = string.Join(" ", segment.TargetSegment);

        Console.WriteLine(reference);
        Console.WriteLine($"Source: {sourceLine}");
        Console.WriteLine($"Target: {targetLine}");
        Console.WriteLine($"Alignment: {bestAlignment}");
    }
}


Training FastAlign model: 0.00%
Training FastAlign model: 16.67%
Training FastAlign model: 33.33%
Training FastAlign model: 50.00%
Training FastAlign model: 66.67%
Training FastAlign model: 83.33%
Training FastAlign model: 100.00%
Training FastAlign model: 0.00%
Training FastAlign model: 16.67%
Training FastAlign model: 33.33%
Training FastAlign model: 50.00%
Training FastAlign model: 66.67%
Training FastAlign model: 83.33%
Training FastAlign model: 100.00%
1JN 1:1
Source: Esta carta trata sobre la Palabra de vida que existía desde el principio , que hemos escuchado , que hemos visto con nuestros propios ojos y le hemos contemplado , y que hemos tocado con nuestras manos .
Target: That which was from the beginning , that which we have heard , that which we have seen with our eyes , that which we saw , and our hands touched , concerning the Word of life
Alignment: 0-0 2-1 4-4 12-5 6-6 8-7 9-8 10-8 15-9 15-10 16-11 13-12 14-13 15-14 15-15 15-16 20-17 21-18 22-19 19-20 18-22 27-23 26-24 2