# Aligning manuscript verses to target

In [1]:
#r "nuget:SIL.Machine,2.5.11"
#r "nuget:SIL.Machine.Translation.Thot,2.5.11"
#r "../src/ClearBible.Engine/bin/Debug/net6.0/ClearBible.Engine.dll"
    
using SIL.Machine.Corpora;
using SIL.Machine.Tokenization;
using SIL.Machine.Translation.Thot;

using ClearBible.Engine.Translation;
using ClearBible.Engine.Corpora;
using ClearBible.Engine.Utils; //FIXME: use SIL.Util once in current version
using ClearBible.Engine.Persistence;

## Prepare corpora

In [3]:
// set up the parallel corpra
var tokenizer = new LatinWordTokenizer();

// Obtain the target corpra from paratext directly
var targetCorpus = new ParatextTextCorpus(tokenizer, "data/WEB-PT");

// Obtain the target corpra from usfm file
// var targetCorpus = new UsxFileTextCorpus(tokenizer, "path/to/usx", ScrVers.Original);

var manuscriptTree = await FileGetManuscriptTree.Get().SetLocation("syntaxTree").GetAsync();

var manuscriptTextCorpus = new ManuscriptTextCorpus(manuscriptTree);

var manuscriptParallelTextCorpus = new ManuscriptParallelTextCorpus(manuscriptTextCorpus, targetCorpus);

Unhandled exception: System.ArgumentException: An item with the same key has already been added. Key: SIL.Scripture.Versification+Table+VersificationKey
   at System.Collections.Generic.Dictionary`2.TryInsert(TKey key, TValue value, InsertionBehavior behavior)
   at System.Collections.Generic.Dictionary`2.Add(TKey key, TValue value)
   at SIL.Scripture.Versification.Table.Load(TextReader stream, String fullPath, ScrVers baseVers, String name)
   at SIL.Machine.Corpora.ParatextTextCorpus..ctor(ITokenizer`3 wordTokenizer, String projectDir, Boolean includeMarkers)
   at Submission#5.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

## Approach #1

In [5]:
{
    // Create the source->target SMT model and train it
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();
    using var trainerSrcTrg = srcTrgModel.CreateTrainer(manuscriptParallelTextCorpus, 
                                                        targetPreprocessor: TokenProcessors.Lowercase);
    trainerSrcTrg.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training FastAlign model: {status.PercentCompleted:P}")));
    trainerSrcTrg.Save();
    
    // Create the target->source SMT model and train it
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();
    using var trainerTrgSrc = trgSrcModel.CreateTrainer(manuscriptParallelTextCorpus.Invert(), 
                                                        targetPreprocessor: TokenProcessors.Lowercase);
    trainerTrgSrc.Train(new DelegateProgress(status =>
        Console.WriteLine($"Training FastAlign model: {status.PercentCompleted:P}")));
    trainerTrgSrc.Save();

    //put the trained source->target and trained target->source models into the symmetrized SMT model
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };
    
    /*
    foreach (ParallelTextSegment textSegment in parallelCorpus.GetSegments().Take(5))
    {
        var alignment = symmetrizedModel2.GetBestAlignment(TokenProcessors.Lowercase.Process(textSegment.SourceSegment),
            TokenProcessors.Lowercase.Process(textSegment.TargetSegment));

        var verseRefStr = textSegment.SegmentRef.ToString();
        var sourceVerseText = string.Join(" ", textSegment.SourceSegment);
        var targetVerseText = string.Join(" ", textSegment.TargetSegment);
        Console.WriteLine(verseRefStr);
        Console.WriteLine($"Source: {sourceVerseText}");
        Console.WriteLine($"Target: {targetVerseText}");
        Console.WriteLine($"Alignment: {alignment}");
    }
    */


    using var manuscriptModel = new ManuscriptWordAlignmentModel();
    using var manuscriptTrainer = manuscriptModel.CreateTrainer(
        symmetrizedModel,
        true, // symmetrizedModel is trained
        manuscriptTree,
        manuscriptParallelTextCorpus,
        await FileGetManuscriptTreeAligmentConfig.Get().SetLocation("InputCommon").GetAsync());
    
    await SqlLitePersistManuscriptInfoAlignments.Get().SetLocation("connection string")
    .PutAsync(new ManuscriptInfoAlignments(manuscriptModel, manuscriptTree));
}

Unhandled exception: System.NotImplementedException: The method or operation is not implemented.
   at ClearBible.Engine.Translation.ManuscriptTreeWordAlignmentTrainer.Dispose() in C:\Users\rm\source\repos\clear\ClearEngine\src\ClearBible.Engine\Translation\ManuscriptWordAlignmentTrainer.cs:line 32
   at Submission#7.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

## Approach #2

In [None]:
{
    // Create the source->target SMT model but don't train it
    using var srcTrgModel = new ThotFastAlignWordAlignmentModel();
    
    // Create the target->source SMT model but don't train it
    using var trgSrcModel = new ThotFastAlignWordAlignmentModel();

    //put the untrained source->target and untrained target->source models into the symmetrized SMT model
    using var symmetrizedModel = new SymmetrizedWordAlignmentModel(srcTrgModel, trgSrcModel)
    {
        Heuristic = SymmetrizationHeuristic.GrowDiagFinalAnd
    };
    
    using var manuscriptModel = new ManuscriptWordAlignmentModel();
    using var manuscriptTrainer = manuscriptModel.CreateTrainer(
        symmetrizedModel,
        false, // symmetrizedModel is not trained
        manuscriptTree,
        manuscriptParallelTextCorpus,
        await FileGetManuscriptTreeAligmentConfig.Get().SetLocation("InputCommon").GetAsync(),
        targetPreprocessor: TokenProcessors.Lowercase);
    
    await SqlLitePersistManuscriptInfoAlignments.Get().SetLocation("connection string")
    .PutAsync(new ManuscriptInfoAlignments(manuscriptModel, manuscriptTree));

In [1]:
string foo = "1c";
int.Parse(foo);

Unhandled exception: System.FormatException: Input string was not in a correct format.
   at System.Number.ThrowOverflowOrFormatException(ParsingStatus status, TypeCode type)
   at System.Int32.Parse(String s)
   at Submission#3.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

In [8]:
public class PowersOf2
{


    public static System.Collections.Generic.IEnumerable<int> Power(int number, int exponent)
    {
        int result = 1;

        for (int i = 0; i < exponent; i++)
        {
            result = result * number;
            yield return result;
        }
    }

    // Output: 2 4 8 16 32 64 128 256
}

var foo = PowersOf2.Power(2,8);


var first = foo.Where(num => num == 4).ToList();
var second = foo.Where(num => num == 2).ToList();

foreach (var x in first)
{
    Console.WriteLine(x);
}

foreach (var x in second)
{
    Console.WriteLine(x);
}


4
2
