In [None]:
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from collections import defaultdict

import stanza
import jieba.posseg as pseg
import re


# Translog to Workspace

In [None]:

# Candidate Translog-II sessions. Every assignment overwrites the previous one,
# so only the LAST uncommented `fn` below is actually converted.
fn = "/data/critt/tprdb/TPRDB/AR22/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/SG12/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Translog-II/P01_T1.xml"
#fn = "/data/critt/tprdb/TPRDB/RUC17/Translog-II/P01_T1.xml"
#fn = "/data/critt/tprdb/TPRDB/STC17bolt/Translog-II/P01_T1.xml"
#fn = "/data/critt/tprdb/TPRDB/ENJA15/Translog-II/P01_T1.xml"

# NOTE(review): Translog2WKS and its helpers are defined in cells further down
# the notebook; on a fresh kernel those cells must be executed before this one.
wks_root = Translog2WKS(fn)

# pretty-print WorkSpace_root
ET.indent(wks_root, space='  ')  # 2 spaces
print(ET.tostring(wks_root, encoding='unicode'))

## to do
# Realign and substitute in WKS
# Table extraction

In [None]:
def Translog2WKS(fn) :
    """
    Convert a Translog-II XML log into a WorkSpace XML tree.

    The returned <WorkSpace> element receives, in this order, the children
    <SourceToken>, <FinalToken>, <SntAlign> and <TokAlign>.

    Parameters:
    -----------
    fn : str
        Path of the Translog-II XML file; it must contain a <Languages>
        element carrying 'source' and 'target' attributes.

    Returns:
    --------
    xml.etree.ElementTree.Element
        Root element of the WorkSpace tree.
    """

    # parse the Translog log and create the (still empty) WorkSpace container
    log_root = ET.parse(fn).getroot()
    workspace = ET.Element("WorkSpace")

    # source / target language codes
    languages = log_root.find('.//Languages')
    src_lang = languages.get('source')
    tgt_lang = languages.get('target')

    ##################################################################
    ### Source text: segmentation and tokenization
    # <SourceToken language="en" >
    #    <Token cur="0" tokId="1" pos="NNP" sntId="1" tok="Killer" />
    src_sents, _src_tokens, src_xml = Tokenize(getSourceText(log_root), src_lang, 'SourceToken')
    workspace.append(src_xml)

    ##################################################################
    ### Final target text (translation): segmentation and tokenization
    #  <FinalToken language="ar" >
    #    <Token  cur="0" tokId="1" sntId="1" tok="..." />
    tgt_sents, _tgt_tokens, tgt_xml = Tokenize(getFinalText(log_root), tgt_lang, 'FinalToken')
    workspace.append(tgt_xml)

    ##################################################################
    ### Segment alignment (very preliminary)
    #  <SntAlign>
    #     <Snt src="1" tgt="1" />
    snt_links = sntAignment(src_sents, tgt_sents)
    workspace.append(list_of_dicts_to_xml(snt_links, root_tag='SntAlign', item_tag='Snt'))

    ##################################################################
    ### Token alignment inside the aligned segments
    #  <TokAlign>
    #     <Tok sid="1" tid="1" />
    tok_links = tokAlignment(src_sents, tgt_sents, snt_links)
    workspace.append(list_of_dicts_to_xml(tok_links, root_tag='TokAlign', item_tag='Tok'))

    ##################################################################
    ### Not produced yet (planned sections):
    #  Keystroke-Token mapping
    #  <Modifications>
    #    <Mod time="72953" type="Mins" cur="0" chr="..." X="0" Y="0" sntId="1" sid="2" tid="1"  />
    #  Fixation-Token mapping
    #  <Fixations>
    #      <Fix time="30" win="1" cur="227" dur="175" X="502" Y="228" sntId="3" sid="41" tid="39" />
    #  Segment open-closing
    #  <Segments>
    #    <Seg sntId="1" open="72952" close="89436" />

    return workspace

## Linguistic processing
- sentence segmentation (NLTK)
- tokenization (NLTK)
- lexical features (Stanza)
- cursor offset of words in text
                    

In [None]:
def Tokenize(text, lng, tag):
    """
    Segment, tokenize and annotate a text, and wrap the tokens in XML.

    Parameters:
    -----------
    text : str
        Raw input text to process
    lng : str
        Language code (e.g., 'en', 'es', 'de')
    tag : str
        Tag name of the XML container (e.g., 'SourceToken', 'FinalToken')

    Returns:
    --------
    tuple: (sentences, features, xml_root)
        - sentences: list of sentences, each a list of 2-tuples. They are
          created as (token, pos) pairs, but tokenCurOffset replaces the
          second element in place, so on return they are (token, tokId).
        - features: list of token dictionaries with all features
        - xml_root: XML element named `tag` with one <Token> per token and
          a 'language' attribute
    """

    # sentence segmentation + tokenization (+ POS tags, flag=1)
    sentences = segmentText(text, lng=lng, flag=1)

    # one dictionary per token: tokId, sntId, cursor offset, ...
    # (must run before stanzaFeatures, which looks tokens up by tokId)
    offsets = tokenCurOffset(text, sentences)

    # enrich the token dictionaries with Stanza annotations
    features = stanzaFeatures(sentences, lng, offsets)

    # serialise the token dictionaries and tag the container with the language
    xml_root = list_of_dicts_to_xml(features, root_tag=tag, item_tag='Token')
    xml_root.set('language', str(lng))

    return (sentences, features, xml_root)


# get ST from the Translog file
def getSourceText(root):
    """
    Return the source text stored in a Translog XML tree.

    The text of the <SourceTextUTF8> element is preferred. When that element
    is absent or empty (older files have no UTF8 version), the text is
    rebuilt from the 'Value' attributes of <SourceTextChar>/<CharPos>.

    Parameters:
    -----------
    root : xml.etree.ElementTree.Element
        Root (or any ancestor) of the Translog elements.

    Returns:
    --------
    str
        The source text; '' when neither container provides any text.
        A string is always returned so callers can apply regexes to it.
    """

    # get text from UTF8 container in the xml file
    utf8_node = root.find('.//SourceTextUTF8')
    if utf8_node is not None and utf8_node.text:
        return utf8_node.text

    # fall back to the per-character container; a CharPos without a
    # 'Value' attribute contributes nothing instead of raising TypeError
    return ''.join(ch.get('Value', '') for ch in root.findall('.//SourceTextChar/CharPos'))

# get FT from the Translog file
def getFinalText(root):
    """
    Return the final (translated) text stored in a Translog XML tree.

    The text of the <FinalText> element is preferred. When that element is
    absent or empty, the text is rebuilt from the 'Value' attributes of
    <FinalTextChar>/<CharPos>.

    Parameters:
    -----------
    root : xml.etree.ElementTree.Element
        Root (or any ancestor) of the Translog elements.

    Returns:
    --------
    str
        The final text; '' when neither container provides any text.
        A string is always returned so callers can apply regexes to it.
    """

    # FinalText in UTF8 should usually be there
    final_node = root.find('.//FinalText')
    if final_node is not None and final_node.text:
        return final_node.text

    # fall back to the per-character container; a CharPos without a
    # 'Value' attribute contributes nothing instead of raising TypeError
    return ''.join(ch.get('Value', '') for ch in root.findall('.//FinalTextChar/CharPos'))

# segment a text into sentences and tokens (optionally POS-tagged)
def segmentText(text, lng='en', flag = 1):
    """
    Split a text into sentences and tokens.

    Parameters:
    -----------
    text : str
        Raw text.
    lng : str
        Language code; 'zh' and 'ja' are delegated to segmentChinese
        (which receives the unmodified text).
    flag : int
        1: sentences are lists of (token, pos) pairs
        2: sentences are strings of space-joined tokens
        otherwise: sentences are lists of token strings

    Returns:
    --------
    list
        One entry per segment, in the shape selected by `flag`.
    """

    # runs of newlines count as a single break
    collapsed = re.sub(r'\n+', '\n', text)

    if lng in ('zh', 'ja'):
        return segmentChinese(text, flag)

    # sentence split first, then break every sentence at the remaining newlines
    segments = [piece
                for sentence in sent_tokenize(collapsed)
                for piece in sentence.split('\n')]

    tokenized = [word_tokenize(segment) for segment in segments]

    if flag == 1:
        # POS tagging only works properly for English
        return [pos_tag(tokens) for tokens in tokenized]
    if flag == 2:
        # collapse every segment back into one string
        return [" ".join(tokens) for tokens in tokenized]
    return tokenized

def segmentChinese(T, flag) :
    """
    Segment a text into sentences and tokens with jieba.posseg.

    A sentence ends after a token containing one of the full-width
    terminators (。！？) or at a whitespace token containing a newline.
    Whitespace tokens are never emitted as tokens.

    Parameters:
    -----------
    T : str
        Raw text.
    flag : int
        1: tokens are (token, pos) pairs; otherwise plain token strings.

    Returns:
    --------
    list
        List of non-empty sentences, each a list of tokens. Tokens that
        follow the last terminator are returned as a final sentence.
    """
    sentence_end = re.compile(r'[。！？]')

    sentences = []
    current = []
    for tok, pos in pseg.cut(T):
        # whitespace is not a token; a newline additionally closes the sentence
        if re.search(r'\s', tok):
            # only flush when something was collected, otherwise a newline
            # right after a terminator would add an empty sentence
            if '\n' in tok and current:
                sentences.append(current)
                current = []
            continue

        current.append((tok, pos) if flag == 1 else tok)

        # end of sentence
        if sentence_end.search(tok):
            sentences.append(current)
            current = []

    # text that does not end with a terminator: keep the trailing tokens
    if current:
        sentences.append(current)

    return sentences
 
# additional features from Stanza
# Initialised Stanza pipelines, keyed by their configuration. Building a
# pipeline loads its models, so it is done once per configuration instead of
# on every call (stanzaFeatures is called for each text of each file).
_STANZA_PIPELINES = {}

def stanzaFeatures(sntList, lng, tokList, processor='tokenize,mwt,pos,lemma', tokenize_no_ssplit=False, verbose=False):
    """
    Annotate pre-tokenized sentences with Stanza and merge the result with
    the token dictionaries in tokList.

    Parameters:
    -----------
    sntList : list
        Sentences, each a list of 2-tuples whose first element is the
        token string (the second element is ignored here).
    lng : str
        Stanza language code.
    tokList : list of dict
        One dictionary per token with at least 'tokId', 'tok' and 'cur';
        'space' and 'pos' are copied when present. tokIds are expected to
        number the tokens 1, 2, ... in text order.
    processor : str
        Stanza processors; replaced by 'tokenize,pos,lemma' for
        'ja', 'zh', 'ko' and 'nl' (no mwt for those languages).
    tokenize_no_ssplit, verbose :
        Passed on to stanza.Pipeline.

    Returns:
    --------
    list of dict
        One dictionary per token: the Stanza annotations (without 'misc',
        'id', 'start_char', 'end_char'; 'text' renamed to 'tok') plus
        'sntId', 'tokId', 'cur', 'space' and, when available, 'pos'.
        A message is printed when the Stanza token differs from 'tok'.
    """

    # mwt not available for those languages
    if lng in ('ja', 'zh', 'ko', 'nl'):
        processor = 'tokenize,pos,lemma'

    # initialize the stanza pipeline once per configuration
    key = (lng, processor, tokenize_no_ssplit, verbose)
    nlp = _STANZA_PIPELINES.get(key)
    if nlp is None:
        nlp = stanza.Pipeline(lang=lng, processors=processor, tokenize_pretokenized=True, tokenize_no_ssplit=tokenize_no_ssplit, verbose=verbose)
        _STANZA_PIPELINES[key] = nlp

    # keep only the token strings of every sentence
    sntList = [[w for w, p in s] for s in sntList]
    doc = nlp(sntList)

    # stanza document to list of list of dictionaries
    stza_list = doc.to_dict()

    # map list of tokens into a dictionary for faster lookup
    TD = {d['tokId']: d for d in tokList}

    tokId = 0
    Token = []

    for sntId, snt in enumerate(stza_list, start=1):
        for tok in snt:
            tokId += 1
            known = TD[tokId]
            tok['sntId'] = sntId
            tok['tokId'] = tokId

            # these features must be identical
            if(tok['text'] != known['tok']) :
                print(f"stanzaFeatures Error: snt:{sntId} tokId:{tokId} stanzaWord:{tok['text']} NLTKWord:{TD[tokId]['tok']}")

            # rename into tok
            tok['tok'] = tok.pop('text')

            # copy from tokList
            tok['cur'] = known['cur']
            tok['space'] = known.get('space', '')

            # pos tag from tokList overrides nothing in Stanza's own keys
            if 'pos' in known:
                tok['pos'] = known['pos']

            # delete Stanza features that are not wanted in the output
            for unwanted in ('misc', 'id', 'start_char', 'end_char'):
                tok.pop(unwanted, None)
            Token.append(tok)
    return Token


# Find cursor offset for tokens in text
def tokenCurOffset(text, sntList):
    """
    Locate every token in the text and return one dictionary per token.

    Tokens are searched left to right, each one starting at the end of the
    previously found token.

    Parameters:
    -----------
    text : str
        The text the sentences were produced from.
    sntList : list
        Sentences, each a list of (token, pos) pairs. Side effect: every
        pair is replaced in place by (token, tokId) so that callers can
        map sentence tokens back to token ids.

    Returns:
    --------
    list of dict
        Keys: 'tokId' (1-based, running over the whole text), 'sntId'
        (1-based), 'cur' (character offset of the token in text), 'tok',
        'space' (text between the previous token and this one) and 'pos'.
        A token that cannot be found in the remaining text (its surface
        form differs from the text) gets the current scan position as
        'cur' and an empty 'space'; the scan position is left unchanged.
    """

    tokens = []  # list of dictionaries that contain the token information
    end = 0      # position just behind the previously found token
    tokId = 0    # running token ID

    for sntId, snt in enumerate(sntList, start=1):
        for i, (tok, pos) in enumerate(snt):
            tokId += 1

            # search up to the very end of the text: slicing with text[end:-1]
            # would hide the last character and misplace a final token
            cur = text.find(tok, end)
            if cur < 0:
                # not found: do not let the -1 move the scan position backwards
                cur = end
                space = ''
            else:
                space = text[end:cur]
                end = cur + len(tok)

            tokens.append({'tokId': tokId,
                           'sntId': sntId,
                           'cur': cur,
                           'tok': tok,
                           'space': space,
                           'pos': pos
                          })

            # memorize tokId in the sentence list
            snt[i] = (tok, tokId)

    return tokens


### Testing

In [None]:
# Scratch cell: checks that MeCab can be imported and a Tagger created.
# NOTE(review): `m` is not used by any other visible cell; Japanese text is
# currently routed through segmentChinese (jieba) in segmentText.
import MeCab

m = MeCab.Tagger('')

In [None]:
## example for merging 
# Pairwise links; links sharing a 'src' or a 'tgt' index end up in one group.
# NOTE(review): merge_alignments_graph is defined in a later cell and must be
# executed before this one on a fresh kernel.
L = [
 {'src': 1, 'tgt': 1},
 {'src': 2, 'tgt': 2},
 {'src': 3, 'tgt': 2},
 {'src': 2, 'tgt': 3},
 {'src': 3, 'tgt': 3},
 {'src': 3, 'tgt': 4},
 {'src': 4, 'tgt': 5},
 {'src': 5, 'tgt': 6},
 {'src': 6, 'tgt': 6},
 {'src': 7, 'tgt': 7},
 {'src': 8, 'tgt': 8},
 {'src': 9, 'tgt': 9},
 {'src': 10, 'tgt': 10},
 {'src': 11, 'tgt': 10}]


# expected groups: 1-1, 2+3-2+3+4, 4-5, 5+6-6, 7-7, 8-8, 9-9, 10+11-10
merge_alignments_graph(L)


In [None]:
import difflib
# test whether texts are identical
# NOTE(review): `text1` and `text2` are only local variables of functions in
# the visible cells; they must be assigned at notebook level before this cell
# can run on a fresh kernel.

# Split the strings into lists of lines
lines1 = text1.splitlines()
lines2 = text2.splitlines()

# Use ndiff to find the differences
diff_result = difflib.ndiff(lines1, lines2)

# Print the differences (ndiff yields the lines lazily, one string per line)
for line in diff_result:
    print(line)

## Segment Word Alignment
- segment: sentence by sentence
- word alignment
- merged groups 

In [None]:
import random

# random word alignment per bilingual segment (placeholder for a real aligner)
def tokAlignment(STnt, FTnt, SntAlign):
    """
    Create a random token alignment inside every aligned segment group.

    This is a placeholder aligner: for each sentence-alignment group it
    draws int(n_src / 1.5) random (source token, target token) links and
    finally merges links that share a token into groups.

    Parameters:
    -----------
    STnt, FTnt : list
        Source / target sentences, each a list of (token, tokId) pairs.
    SntAlign : list of dict
        Sentence alignment groups with string values 'src' and 'tgt';
        several 1-based sentence ids are joined with '+', e.g. '2+3'.

    Returns:
    --------
    list of dict
        Result of merge_alignments_graph for the drawn links. Groups in
        which the source or the target side has no tokens contribute no
        links (random.randint would raise ValueError on an empty range).
    """
    L = []
    for aln in SntAlign:
        # 1-based sentence ids in the alignment -> 0-based list indices
        sIds = [int(s)-1 for s in aln['src'].split('+')]
        tIds = [int(s)-1 for s in aln['tgt'].split('+')]

        # token ids of all sentences on either side of the group
        sid = [tokId for k in sIds for _tok, tokId in STnt[k]]
        tid = [tokId for k in tIds for _tok, tokId in FTnt[k]]

        # nothing to align when one side is empty
        if not sid or not tid:
            continue

        # random word alignment
        for _ in range(int(len(sid) / 1.5)):
            rs = random.randint(0, len(sid) - 1)
            rt = random.randint(0, len(tid) - 1)
            L.append({'src': sid[rs], 'tgt': tid[rt]})

    return merge_alignments_graph(L)


def sntAignment(STnt, FTnt):
    """
    Very preliminary sentence alignment by position.

    Sentence i of the source is linked to sentence i of the target. When
    one side has more sentences, the surplus sentences are all linked to
    the last sentence of the shorter side. The pairwise links are then
    grouped with merge_alignments_graph.

    Parameters:
    -----------
    STnt, FTnt : list
        Source / target sentences (only their number is used).

    Returns:
    --------
    list of dict
        Grouped alignment as returned by merge_alignments_graph; [] when
        either side has no sentences (sentence ids are 1-based, so there
        is no valid sentence to attach the other side to).
    """
    s = len(STnt)
    t = len(FTnt)

    if s == 0 or t == 0:
        return []

    # one-to-one links for the common prefix
    L = [{'src': i+1, 'tgt': i+1} for i in range(min(s, t))]

    # surplus source sentences -> last target sentence (empty range if s <= t)
    L.extend({'src': i+1, 'tgt': t} for i in range(t, s))

    # surplus target sentences -> last source sentence (empty range if t <= s)
    L.extend({'src': s, 'tgt': i+1} for i in range(s, t))

    # bring into a grouped format
    return merge_alignments_graph(L)


def merge_alignments_graph(alignments):
    """
    Group pairwise alignments into connected components.

    Alignments that share a 'src' index or a 'tgt' index (directly or
    through a chain of other alignments) belong to the same group. The
    src and tgt indices live in separate namespaces: src 2 and tgt 2 are
    unrelated.

    Parameters:
    -----------
    alignments : list of dict
        Dictionaries with the keys 'src' and 'tgt' (other keys are
        ignored). Values may be ints or strings such as XML attributes.

    Returns:
    --------
    list of dict
        One {'src': str, 'tgt': str} per group, the indices of each side
        joined with '+', e.g. {'src': '2+3', 'tgt': '2+3+4'}. Indices are
        ordered numerically when they are numbers or numeric strings
        (so '2' comes before '10'), and the groups are ordered by their
        smallest src index.
    """

    if not alignments:
        return []

    # Union-find over the nodes ('src', index) and ('tgt', index); every
    # alignment joins its two nodes. This is iterative and linear in the
    # number of alignments, so large components cannot exhaust the
    # recursion limit.
    parent = {}

    def find(node):
        root = node
        while parent[root] != root:
            root = parent[root]
        # path compression
        while parent[node] != root:
            following = parent[node]
            parent[node] = root
            node = following
        return root

    for align in alignments:
        s_node = ('src', align['src'])
        t_node = ('tgt', align['tgt'])
        parent.setdefault(s_node, s_node)
        parent.setdefault(t_node, t_node)
        s_root = find(s_node)
        t_root = find(t_node)
        if s_root != t_root:
            parent[t_root] = s_root

    # collect the indices of every component
    groups = {}
    for align in alignments:
        group = groups.setdefault(find(('src', align['src'])), {'src': set(), 'tgt': set()})
        group['src'].add(align['src'])
        group['tgt'].add(align['tgt'])

    def order(value):
        # numbers and numeric strings sort by value, anything else by text
        if isinstance(value, (int, float)):
            return (0, value, '')
        try:
            return (0, int(value), '')
        except (TypeError, ValueError):
            return (1, 0, str(value))

    merged = [{'src': sorted(group['src'], key=order),
               'tgt': sorted(group['tgt'], key=order)}
              for group in groups.values()]

    # Sort by first (smallest) src index
    merged.sort(key=lambda item: order(item['src'][0]))

    return [{'src': '+'.join(map(str, item['src'])),
             'tgt': '+'.join(map(str, item['tgt']))}
            for item in merged]



## XML 

In [None]:


# Convert list of dictionary into xml
def list_of_dicts_to_xml(data_list, root_tag='root', item_tag='item'):
    """
    Build an XML element with one child per dictionary.

    Every key/value pair of a dictionary becomes an attribute of its
    child element; values are converted with str().

    Args:
        data_list (list): Dictionaries to convert, in output order.
        root_tag (str): Tag name of the returned container element.
        item_tag (str): Tag name of each child element.

    Returns:
        xml.etree.ElementTree.Element: The container element.
    """
    container = ET.Element(root_tag)
    for record in data_list:
        # XML attribute values have to be strings
        attributes = {name: str(content) for name, content in record.items()}
        ET.SubElement(container, item_tag, attributes)
    return container




### Events XML -> WKS XML

In [None]:

# Events Token container -> list of token dictionaries
def tokens_xml_to_dict(xml_root):
    """
    Turn the direct <Token> children of an element into dictionaries.

    Each dictionary holds the attributes of one <Token> as strings; when
    an 'id' attribute exists, its integer value is added as 'tokId'.
    """

    def as_record(node):
        record = {**node.attrib}
        if 'id' in record:
            # numeric copy of the id, used as lookup key downstream
            record['tokId'] = int(record['id'])
        return record

    return [as_record(node) for node in xml_root.findall('Token')]


def tokens2snt(tokList):
    """
    Rebuild sentences from a flat list of token dictionaries.

    Consecutive tokens with the same 'segId' form one sentence; every
    token is represented as the pair (token['tok'], token['id']).
    Returns the list of sentences (no empty sentence is appended at the end).
    """

    sentences = []
    current = []
    previous_seg = 0      # 0 marks "no segment seen yet"

    for entry in tokList:
        seg = entry.get('segId')
        if seg != previous_seg:
            # close the running sentence, except before the very first segment
            if previous_seg != 0:
                sentences.append(current)
                current = []
            previous_seg = seg
        current.append((entry.get('tok'), entry.get('id')))

    if current:
        sentences.append(current)

    return sentences

## Simalign

In [None]:
# `sys` is only imported in a later cell; import it here so that this cell
# also runs on a fresh kernel (Restart & Run All).
import sys

sys.path.append('/data/critt/tprdb/bin/')
import TPRDB
import importlib
# pick up edits made to the TPRDB module since it was first imported
importlib.reload(TPRDB)


# read a study from the YAWAT folder (German)
study = TPRDB.readYawatStudy("/data/critt/yawat/TPRDB/BML12/P01_T1", verbose=0)
#print(f"SimAlign sessions:{len(study.keys())}")

ref, tst = TPRDB.yawat2Alignment(study)
#print(f"SimAlign segments:{len(ref)} {len(tst)}\n{ref}\n{tst}")

# show the `words` side of the first test segment
tst[0].words

In [None]:
# load Simaligner
#!/usr/bin/env python
# coding: utf-8

import os
import sys
import os.path
import glob
import numpy as np
from nltk.translate import AlignedSent, Alignment
from nltk.translate import alignment_error_rate

# load TPRDB library
sys.path.append('/data/critt/tprdb/bin/')
import TPRDB
import importlib
importlib.reload(TPRDB)


# load Simaligner
from simalign import SentenceAligner


# "/data/critt/yawat/TPRDB/AR19/"
def SimAlign(inStudy, outStudy="", method="a", verbose=0) :
    """
    Word-align a YAWAT study with SimAlign and write the result back.

    Parameters:
    -----------
    inStudy : str
        Path of the YAWAT study to read,
        e.g. "/data/critt/yawat/TPRDB/AR19/".
    outStudy : str
        Path the aligned study is written to; when empty (the default)
        the study is written back to inStudy.
    method : str
        Passed to SentenceAligner as matching_methods (m, a, i).
    verbose : int
        Passed on to simAlignment for per-segment progress output.

    Returns:
    --------
    None. The alignments are stored in the study under the feature name
    "SimAT" and written with TPRDB.writeYawatStudy.
    """

    print(f"SimAlign initiate AimAlign\n")
    myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods=method)

    print(f"SimAlign in:{inStudy} out:{outStudy} method:{method}\n")

    # read a study from the YAWAT folder
    study = TPRDB.readYawatStudy(inStudy, verbose=0)
    print(f"SimAlign sessions:{len(study.keys())}")

    # separate study into list of NLTK alignments with reference and test set
    ref, tst = TPRDB.yawat2Alignment(study)
    print(f"SimAlign segments:{len(ref)}")

    # run alignment
    SimA = simAlignment(tst, method, myaligner, verbose=verbose)

    print("SimAlign: transitiveAlignment")
    # transitive mapping
    SimAT = TPRDB.transitiveAlignment(SimA, verbose = 0)

    # add alignments to study under feature name "SimAT"
    TPRDB.alignment2Yawat(study, SimAT, feat="SimAT")

    # write alignments to study; without an explicit output path the input
    # study is updated (the test must be on outStudy, not on inStudy)
    if(outStudy == '') : outStudy = inStudy
    print(f"SimAlign: writeYawatStudy {outStudy}")
    TPRDB.writeYawatStudy(outStudy, study, feat="SimAT")


# simalign
# simalign

def simAlignment(ALN, method, myaligner, verbose = 0):
    """
    Align every segment of ALN with the given aligner.

    ALN is a sequence of objects with the attributes `words` and `mots`.
    A segment with an empty side gets an empty alignment; otherwise the
    first entry of the dictionary returned by
    myaligner.get_word_aligns(words, mots) is used. `method` is not used
    here. Progress is printed for every tenth segment (every segment when
    verbose). Returns the list of AlignedSent objects.
    """
    aligned = []

    for idx, segment in enumerate(ALN):
        tgt_side = segment.mots
        src_side = segment.words

        if verbose:
            print(f"simAlignment: {idx} from {len(ALN)}")
        if idx % 10 == 0:
            print(f"simAlignment: {idx} from {len(ALN)}")

        if len(tgt_side) == 0 or len(src_side) == 0:
            # nothing to align on at least one side
            if verbose:
                print(f"Unaligned: {src_side}\n{tgt_side}")
            aligned.append(AlignedSent(src_side, tgt_side, Alignment([])))
            continue

        by_method = myaligner.get_word_aligns(src_side, tgt_side)
        # keep only the links of the first matching method in the result
        methods = list(by_method)
        if methods:
            aligned.append(AlignedSent(src_side, tgt_side, Alignment(by_method[methods[0]])))

    return aligned



# Events to Workspace

In [None]:
import glob
import os
import io

# Batch conversion: every <study>/Events/*Event.xml file is converted to
# <study>/WKS/<session>.xml; sessions whose output already exists are skipped.
path = "/data/critt/tprdb/TPRDB"
studies = ['BML12', 'SG12', 'RUC17']
verbose = 2
    
for study in studies:
    files = glob.glob(f"{path}/{study}/Events/*Event.xml")
    if(verbose): print(f"Reading:{study}\twith {len(files)} files")
    try:
        os.mkdir(f"{path}/{study}/WKS/")
    except FileExistsError:
        print(f"\tDirectory WKS already exists.")

    for n, fn in enumerate(sorted(files), start=1):
        root, extension = os.path.splitext(fn)
        wks = os.path.basename(root).removesuffix(".Event")
        out = f"{path}/{study}/WKS/{wks}.xml"
        
        if os.path.exists(out):
            if(verbose): print(f"  {n}\tExists: {out}")
            continue
            
        if(verbose):  print(f"  {n}\tOutput: {out}")
            
        # A malformed Events file raises ET.ParseError (FileExistsError can
        # never occur here); report it and carry on with the next session.
        try:
            WorkSpace_root = Events2WKS(fn) 
        except ET.ParseError as err:
            print(f"\tError in XML File: {fn}: {err}")
            continue

        ET.indent(WorkSpace_root, space='  ')  # 2 spaces
        # the texts are not ASCII, so do not rely on the platform's default encoding
        with open(out, "w", encoding="utf-8") as f:
            print(ET.tostring(WorkSpace_root, encoding='unicode'), file=f)


In [None]:
# Candidate Events files; only the LAST assignment to `fn` takes effect.
fn = "/data/critt/tprdb/TPRDB/ENJA15/Events/P01_T1.Event.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Events/P22_P5.Event.xml"
fn = "/data/critt/tprdb/TPRDB//RUC17/Events/P02_P3.Event.xml"


# NOTE(review): Events2WKS is defined in the next cell and must be executed
# before this one on a fresh kernel. Verbose=1 prints progress messages.
WorkSpace_root = Events2WKS(fn, Verbose=1) 

##################################################################
# pretty-print WorkSpace_root
ET.indent(WorkSpace_root, space='  ')  # 2 spaces
print(ET.tostring(WorkSpace_root, encoding='unicode'))

In [None]:
def Events2WKS(fn, Verbose = 0) :
    """
    Convert a TPR-DB Events XML file into a WorkSpace XML tree.

    The returned <WorkSpace> element receives, in this order:
      <SourceToken>, <FinalToken>  tokens re-annotated with stanzaFeatures
      <TokAlign>                   grouped <Alignment>/<Align> links
      <SntAlign>                   grouped <Salignment>/<Salign> links
      <Modifications>              <Mod> items, sid/tid renamed to src/tgt
      <Fixations>                  <Fix> items, sid/tid renamed to src/tgt
      <SntEdits>                   <Seg> items of <Segments>, written as <Snt>

    Parameters:
    -----------
    fn : str
        Path of the Events XML file; it must contain <Languages>,
        <SourceToken> and <FinalToken>.
    Verbose : int
        When true, progress and item counts are printed.

    Returns:
    --------
    xml.etree.ElementTree.Element
        Root element of the WorkSpace tree. An alignment, modification,
        fixation or segment section that is missing from the input yields
        an empty container instead of an AttributeError.
    """

    ### Root of the WorkSpace XML output file
    WorkSpace_root = ET.Element("WorkSpace")

    if(Verbose) : print(f"Events2WKS 1")

    ### Read Events-XML file
    events = ET.parse(fn)
    if(Verbose) : print(f"Events2WKS 2")
    # Root of the Events XML input file
    events_root = events.getroot()
    if(Verbose) : print(f"Events2WKS 2")

    # get Source and Target Languages
    e = events_root.find('.//Languages')
    SL = e.get('source')
    TL = e.get('target')

    if(Verbose) : print(f"Events2WKS: SL:{SL} TL:{TL}")

    ##################################################################

    def EventsToken(tag, lng, Verbose=0) :
        """Re-annotate the tokens of container `tag` and return them as XML."""
        e = events_root.find(f".//{tag}")
        token = tokens_xml_to_dict(e)

        if(Verbose) : print(f"EventsToken: lng:{lng} No:{len(token)}")

        # Reconstruct Sentence from Tokens
        snt = tokens2snt(token)

        # get additional features from Stanza and add them to the tokens
        tokFeats = stanzaFeatures(snt, lng, token)

        # convert token Dictionary to xml
        token_root = list_of_dicts_to_xml(tokFeats, root_tag=tag, item_tag='Token')

        # assign language
        token_root.set('language', str(lng))
        return token_root

    def _items(container, item, rename=None):
        """Attribute dictionaries of all <item> children of <container>.

        `rename` maps old attribute names to new ones (the old key is
        removed, the new one appended). A missing container gives [].
        """
        node = events_root.find(f".//{container}")
        if node is None:
            if(Verbose) : print(f"Events2WKS: no <{container}> section in {fn}")
            return []
        rows = []
        for child in node.findall(item):
            d = dict(child.attrib)
            for old, new in (rename or {}).items():
                d[new] = d.pop(old)
            rows.append(d)
        return rows

    # sid/tid of the Events file are called src/tgt in the WorkSpace file
    SID_TID = {'sid': 'src', 'tid': 'tgt'}

    ######################################
    # append ST tokenization to WorkSpace root
    WorkSpace_root.append(EventsToken('SourceToken', SL, Verbose=Verbose))

    # append FT tokenization to WorkSpace root
    WorkSpace_root.append(EventsToken('FinalToken', TL, Verbose=Verbose))

    ######################################
    # token alignment: group links that share a source or target token
    M = merge_alignments_graph(_items('Alignment', 'Align', SID_TID))
    if(Verbose) : print(f"Alignment: {len(M)}")
    WorkSpace_root.append(list_of_dicts_to_xml(M, root_tag='TokAlign', item_tag='Tok'))

    ######################################
    # sentence alignment (Salign items already carry src/tgt)
    M = merge_alignments_graph(_items('Salignment', 'Salign'))
    if(Verbose) : print(f"Salignment: {len(M)}")
    WorkSpace_root.append(list_of_dicts_to_xml(M, root_tag='SntAlign', item_tag='Snt'))

    ##################################################################
    # keystrokes
    L = _items('Modifications', 'Mod', SID_TID)
    if(Verbose) : print(f"Modifications: {len(L)}")
    WorkSpace_root.append(list_of_dicts_to_xml(L, root_tag='Modifications', item_tag='Mod'))

    ##################################################################
    # gaze fixations
    L = _items('Fixations', 'Fix', SID_TID)
    if(Verbose) : print(f"Fixations: {len(L)}")
    WorkSpace_root.append(list_of_dicts_to_xml(L, root_tag='Fixations', item_tag='Fix'))

    ##################################################################
    ### Segment open-closing
    #  input:  <Segments> <Seg sntId="1" open="72952" close="89436" />
    #  output: <SntEdits> <Snt ... />
    L = _items('Segments', 'Seg')
    if(Verbose) : print(f"Segments: {len(L)}")
    WorkSpace_root.append(list_of_dicts_to_xml(L, root_tag='SntEdits', item_tag='Snt'))

    return WorkSpace_root