In [None]:
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
from collections import defaultdict

import stanza
import jieba
import jieba.posseg as pseg
import re


In [None]:
# load Simaligner
from simalign import SentenceAligner

# Module-level aligner instance; simAlignment() looks this global up by name,
# so this cell has to be executed before any token alignment is computed.
myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods='a')


# Translog to Workspace

In [None]:

# Candidate input files; every assignment overwrites the previous one,
# so only the last uncommented `fn` is actually processed.
fn = "/data/critt/tprdb/TPRDB/AR22/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/SG12/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Translog-II/P01_T2.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Translog-II/P01_T1.xml"
fn = "/data/critt/tprdb/TPRDB/RUC17/Translog-II/P01_T1.xml"
#fn = "/data/critt/tprdb/TPRDB/STC17bolt/Translog-II/P01_T1.xml"
#fn = "/data/critt/tprdb/TPRDB/ENJA15/Translog-II/P01_T1.xml"

# NOTE(review): Translog2WKS and its helpers are defined in cells further down;
# on a fresh kernel those cells must be run before this one.
wks_root = Translog2WKS(fn)

# pretty-print WorkSpace_root
ET.indent(wks_root, space='  ')  # 2 spaces
print(ET.tostring(wks_root, encoding='unicode'))

## to do
# Realign and substitute in WKS
# Table extraction

In [None]:
def Translog2WKS(fn) :
    """
    Build a WorkSpace XML tree from one Translog-II XML file.

    The returned <WorkSpace> element receives, in this order:
      1. <SourceToken>  - tokenized source text
      2. <FinalToken>   - tokenized final (target) text
      3. <SntAlignment> - sentence alignment groups
      4. <TokAlignment> - token alignment groups (computed with simAlignment)

    Keystroke mapping, fixation mapping and segment open/close times are
    not produced here yet.

    Parameters:
    -----------
    fn : str
        Path of the Translog-II XML file.

    Returns:
    --------
    xml.etree.ElementTree.Element
        The <WorkSpace> root element.
    """

    # output container
    WorkSpace_root = ET.Element("WorkSpace")

    # input document and its language pair
    translog_root = ET.parse(fn).getroot()
    languages = translog_root.find('.//Languages')
    SL = languages.get('source')
    TL = languages.get('target')

    # --- source text: segmentation + tokenization -> <SourceToken>
    SText = getSourceText(translog_root)
    STsnt, _, SToken_root = Tokenize(SText, SL, 'SourceToken')
    WorkSpace_root.append(SToken_root)

    # --- final target text: segmentation + tokenization -> <FinalToken>
    FText = getFinalText(translog_root)
    FTsnt, _, FToken_root = Tokenize(FText, TL, 'FinalToken')
    WorkSpace_root.append(FToken_root)

    # --- sentence alignment (very preliminary) -> <SntAlignment><Snt src= tgt= />
    SntAlignList = sntAlignment(STsnt, FTsnt)
    WorkSpace_root.append(
        list_of_dicts_to_xml(SntAlignList, root_tag='SntAlignment', item_tag='Snt'))

    # --- token alignment inside each aligned segment -> <TokAlignment><Tok src= tgt= />
    segAlign = snt2segAlign(STsnt, FTsnt, SntAlignList)
    tokAlign = simAlignment(segAlign)
    WorkSpace_root.append(
        list_of_dicts_to_xml(tokAlign, root_tag='TokAlignment', item_tag='Tok'))

    return WorkSpace_root

## Linguistic processing
- sentence segmentation (NLTK)
- tokenization (NLTK)
- lexical features (Stanza)
- cursor offset of words in text
                    

In [None]:
def Tokenize(text, lng, tag, form=1, stanzaFeats=1):
    """
    Segment, tokenize and annotate a text, and wrap the tokens in XML.

    Parameters:
    -----------
    text : str
        Raw input text to process
    lng : str
        Language code handed to segmentText / stanzaFeatures
    tag : str
        Name of the XML root element (callers in this notebook use
        'SourceToken' and 'FinalToken')
    form : int
        Output format selector passed through to segmentText
    stanzaFeats : int or bool
        If truthy, the token list is enriched by stanzaFeatures;
        otherwise the plain list from tokenCurOffset is used

    Returns:
    --------
    tuple: (snt, tokens, token_root)
        - snt: list of sentences as returned by segmentText; tokenCurOffset
          rewrites its entries in place to (token, tokId) pairs
        - tokens: list of token dictionaries
        - token_root: XML element <tag language=lng> with one <Tok> per token
    """

    # sentences as lists of (token, pos) pairs
    snt = segmentText(text, lng=lng, form=form)

    # one dictionary per token: tokId, sntId, cursor offset, ...
    base_tokens = tokenCurOffset(text, snt)

    # optionally add the Stanza features on top of the basic ones
    tokens = stanzaFeatures(snt, lng, base_tokens) if stanzaFeats else base_tokens

    # serialize the dictionaries as attributes of <Tok> elements
    token_root = list_of_dicts_to_xml(tokens, root_tag=tag, item_tag='Tok')
    token_root.set('language', str(lng))

    return (snt, tokens, token_root)

#########################################################################
# get ST from the Translog file
def getSourceText(root):
    """
    Return the source text stored in a Translog XML tree.

    The text of the <SourceTextUTF8> element is preferred. If that element is
    missing or has no text, the text is rebuilt from the 'Value' attributes of
    the <SourceTextChar>/<CharPos> elements (older file versions).

    Parameters:
    -----------
    root : xml.etree.ElementTree.Element
        Root of the Translog XML tree.

    Returns:
    --------
    str
        The source text; '' if neither container provides any characters.
    """

    # get text from UTF8 container in the xml file
    ST = root.find('.//SourceTextUTF8')
    # an empty element has text None, which must not be handed to the tokenizers
    if ST is not None and ST.text is not None:
        return ST.text

    # in older versions there is no UTF8 version in the xml file:
    # concatenate the per-character entries; a missing 'Value' counts as ''
    return ''.join(chars.get('Value') or ''
                   for chars in root.findall('.//SourceTextChar/CharPos'))

# get FT from the Translog file
def getFinalText(root):
    """
    Return the final (target) text stored in a Translog XML tree.

    The text of the <FinalText> element is preferred. If that element is
    missing or has no text, the text is rebuilt from the 'Value' attributes of
    the <FinalTextChar>/<CharPos> elements.

    Parameters:
    -----------
    root : xml.etree.ElementTree.Element
        Root of the Translog XML tree.

    Returns:
    --------
    str
        The final text; '' if neither container provides any characters.
    """

    # FinalText in UTF8 should usually always be there
    FT = root.find('.//FinalText')
    # an empty element has text None, which must not be handed to the tokenizers
    if FT is not None and FT.text is not None:
        return FT.text

    # fall back to the per-character container; a missing 'Value' counts as ''
    return ''.join(chars.get('Value') or ''
                   for chars in root.findall('.//FinalTextChar/CharPos'))

#########################################################################
# segment text FT from the Translog file
def segmentText(text, lng='en', form = 1):
    '''
    Split a text into sentences and tokens.

    form 0: Output Format: [['token', ...], ...] : Use Case: Just tokenized sentences
    form 1: Output Format: [[('token', 'POS'), ...], ...]: Use Case: With POS tags (default)
    form 2: Output Format: ['token token ...', ...]: Use Case: Sentences as strings
    form 4: Uses Stanza for all languages: Use Case: High-quality annotations

    Japanese ('ja') is always delegated to segmentStanza and Chinese ('zh')
    to segmentChineseJieba; both receive the unmodified text. All other
    languages are processed with NLTK: sentences are split with
    sent_tokenize and additionally at every newline.

    Parameters:
    -----------
    text : str
        Raw text.
    lng : str
        Language code.
    form : int
        Output format selector, see above.

    Returns:
    --------
    list
        Sentences in the requested format; sentences without tokens are dropped.
    '''

    # replace multiple \n by one (no impact on NLTK segmentation)
    text1 = re.sub(r'\n+', '\n', text)

    if(form == 4) : return segmentStanza(text, lng)
    if(lng == 'ja') : return segmentStanza(text, lng)
    if(lng == 'zh') : return segmentChineseJieba(text, form)

    # Segment text into sentences, then split each one at newlines
    # (newline boundaries are not covered by NLTK)
    lines = [line for s in sent_tokenize(text1) for line in s.split('\n')]

    # Tokenize each sentence and drop ALL empty ones (e.g. produced by \n);
    # list.remove([]) would only have dropped the first of them
    snt1 = [toks for toks in (word_tokenize(line) for line in lines) if toks]

    # Part-of-speech tagging each sentences: works only properly for English
    if(form == 1) : snt1 = [pos_tag(s) for s in snt1]

    # collapse back into list of sentences
    if(form == 2) : snt1 = [" ".join(s) for s in snt1]

    return snt1


def segmentStanza(text, lng) :
    """
    Segment and POS-tag a text with a Stanza 'tokenize,pos' pipeline.

    Parameters:
    -----------
    text : str
        Raw text.
    lng : str
        Language code used to build the Stanza pipeline.

    Returns:
    --------
    list
        [[(token_text, upos), ...], ...]; sentences without tokens are omitted.
    """

    # a new pipeline is built on every call
    pipeline = stanza.Pipeline(lang=lng, processors='tokenize,pos')

    # the document as a list (sentences) of lists of token dictionaries
    sentences = pipeline(text).to_dict()

    tagged = ([(word['text'], word['upos']) for word in sentence]
              for sentence in sentences)
    return [pairs for pairs in tagged if len(pairs) > 0]

def segmentChineseJieba(text, form) :
    """
    Tokenize and POS-tag a text with jieba.posseg and group the
    result into sentences via words2snt.

    Parameters:
    -----------
    text : str
        Raw text.
    form : int
        Output format selector, passed on to words2snt.

    Returns:
    --------
    list
        List of sentences as produced by words2snt.
    """

    tagged_words = list(pseg.cut(text))
    return words2snt(tagged_words, form)

def words2snt(words, form) :
    """
    Group a flat sequence of (token, pos) pairs into sentences.

    A sentence is closed after a token containing one of 。！？.!? and
    at a whitespace token containing a newline. Whitespace tokens are
    never part of the output.

    Parameters:
    -----------
    words : iterable of (token, pos) pairs
    form : int
        1 keeps (token, pos) tuples; any other value keeps the token only.

    Returns:
    --------
    list
        List of non-empty sentences.
    """

    sentence_end = re.compile(r'[。！？.!?]')

    sentences = []
    current = []
    for tok, pos in words:

        if re.search(r'\s', tok):
            # whitespace is no token, but a newline terminates the sentence
            if re.search(r'\n', tok):
                if current:
                    sentences.append(current)
                current = []
            continue

        # token with pos for form 1, bare token otherwise
        current.append((tok, pos) if form == 1 else tok)

        # sentence-final punctuation closes the running sentence
        if sentence_end.search(tok):
            if current:
                sentences.append(current)
            current = []

    # flush whatever is left after the last token
    if current:
        sentences.append(current)

    return sentences


# additional features from Stanza
def stanzaFeatures(snt, lng, token, processor='tokenize,mwt,pos,lemma', tokenize_no_ssplit=False, verbose=False):
    """
    Run Stanza on already tokenized sentences and merge its features
    with the basic token dictionaries.

    Parameters:
    -----------
    snt : list of sentences, each a list of (token, x) pairs
        Only the first item of each pair is used.
    lng : str
        Language code for the Stanza pipeline.
    token : list of dict
        Token dictionaries with at least 'tokId', 'cur' and 'tok'
        (as produced by tokenCurOffset).
    processor : str
        Stanza processors; replaced by 'tokenize,pos,lemma' for
        ja, zh, ko, hi and nl (no mwt for those languages).
    tokenize_no_ssplit, verbose :
        Passed through to stanza.Pipeline.

    Returns:
    --------
    list of dict
        One Stanza token dictionary per token, extended with sntId, tokId,
        cur, tok, space (and pos if available); the Stanza keys text, misc,
        id, start_char and end_char are removed.
    """

    # mwt not available for those languages
    if lng in ('ja', 'zh', 'ko', 'hi', 'nl'):
        processor = 'tokenize,pos,lemma'

    # initialize stanza pipeline on pre-tokenized input
    nlp = stanza.Pipeline(lang=lng, processors=processor, tokenize_pretokenized=True, tokenize_no_ssplit=tokenize_no_ssplit, verbose=verbose)

    # bare tokens per sentence; an empty token '' is replaced by '.'
    sntList = [['.' if word == '' else word for word, _ in sentence] for sentence in snt]

    # stanza document as list (sentences) of lists of token dictionaries
    stza_list = nlp(sntList).to_dict()

    # basic token dictionaries by token id for direct lookup
    by_id = {d['tokId']: d for d in token}

    Token = []
    tokId = 0
    for sntId, sentence in enumerate(stza_list, start=1):
        for tok in sentence:
            tokId += 1
            tok['sntId'] = sntId
            tok['tokId'] = tokId
            basic = by_id[tokId]

            # both tokenizations are expected to agree on the surface form
            if tok['text'] != basic['tok']:
                print(f"stanzaFeatures Warning: snt:{sntId} tokId:{tokId} stanzaWord:>{tok['text']}< NLTKWord:>{basic['tok']}<")

            # take over cursor offset, surface form and preceding space
            tok['cur'] = basic['cur']
            tok['tok'] = basic['tok']
            tok['space'] = basic.get('space', '')

            # pos tag of the basic tokenization, if there is one
            if 'pos' in basic:
                tok['pos'] = basic['pos']

            # drop the Stanza bookkeeping keys
            for obsolete in ('text', 'misc', 'id', 'start_char', 'end_char'):
                tok.pop(obsolete, None)

            Token.append(tok)
    return Token


# Find cursor offset for tokens in text
def tokenCurOffset(text, snt):
    """
    Compute the character offset of every token in the original text.

    Tokens are searched left to right, each search starting where the
    previous token ended. The characters between two tokens are stored as
    'space' of the following token.

    A token whose surface form does not occur in the remaining text (for
    example because the tokenizer rewrote it) is anchored at the end of the
    previous token with an empty 'space', and the search position is not
    advanced, so the following tokens are still located correctly.

    Side effect: every entry of `snt` is replaced in place by the pair
    (token, tokId).

    Parameters:
    -----------
    text : str
        The text the sentences were produced from.
    snt : list of lists of (token, pos) pairs
        The inner sequences must support item assignment.

    Returns:
    --------
    list of dict
        One dictionary per token with the keys
        tokId, sntId, cur, tok, space, pos (ids start at 1).
    """

    L = []     # list of dictionaries that contain the token information
    end = 0    # position of the end of the previous token in text
    tokId = 0  # running token id over all sentences

    for sntId, s in enumerate(snt, start=1):
        for i, (tok, pos) in enumerate(s):
            tokId += 1

            # search from `end` without copying the rest of the text
            cur = text.find(tok, end)
            found = cur >= 0
            if not found:
                # never turn find()'s -1 into an offset
                cur = end
            space = text[end:cur]

            L.append({'tokId': tokId,
                      'sntId': sntId,
                      'cur': cur,
                      'tok': tok,
                      'space': space,
                      'pos': pos
                      })

            # memorize (tok, tokId)
            s[i] = (tok, tokId)

            if found:
                end = cur + len(tok)
    return L


### Testing

In [None]:
# Japanese sample text with line breaks and blank lines (not used further in this cell)
text = """麩菓子は、
麩を主材料とした日本の菓子。

麩菓子は、麩を主材料とした日本の菓子。麩菓子は、麩を主材料とした日本の菓子。

麩菓子は、麩を主材料とした日本の菓子。"""

# Chinese sample text: a title line followed by one long paragraph
RUC17T = """  杀人犯护士受到了四次终身监禁处罚
今日，医院护士柯林·诺瑞思被投入监狱终身监禁，因为他杀死了四位病人。2002年，来自格拉斯哥的32岁的诺瑞思通过给病人服用大量的安眠药杀死了四位女病人。昨日，经过一段长时间的审判，诺瑞思被指控犯有四起谋杀案。对于每起谋杀案，他至少需要服刑三十年。警方人员克里斯·格雷格谈到，此前诺瑞思一直在医院附近行动诡异。直到一些其他医院工作人员发现时才停止了他的谋杀行为。警方获悉，诺瑞思杀人的动机是不喜欢同老年人一起工作。此案所有受害者均是年老患有心脏病的妇女。这些人都是医院工作人员的负担。
"""

# 'zh' is routed to segmentChineseJieba; form 1 yields (token, pos) pairs per sentence
segmentText(RUC17T, 'zh', 1)


In [None]:
# Scratch cell: try out MeCab for Japanese segmentation.
# NOTE(review): MeCab is imported here rather than in the import cell at the top.
import MeCab

# Initialize Tagger for "wakati" (word segmentation) output
wakati = MeCab.Tagger("-Owakati")
print(wakati.parse("pythonが大好きです").split())

# Initialize Tagger for detailed morphological analysis
tagger = MeCab.Tagger()
print(tagger.parse("これはペンです"))

In [None]:
## example for merging 
# Six pairwise links between src {2, 3} and tgt {2, 3, 4}; every link shares
# a src or a tgt with another one, so they all belong to one alignment group.
L = [
 {'src': 2, 'tgt': 2},
 {'src': 3, 'tgt': 2},
 {'src': 2, 'tgt': 3},
 {'src': 3, 'tgt': 3},
 {'src': 2, 'tgt': 4},
 {'src': 3, 'tgt': 4},
]


# NOTE(review): merge_alignments_graph is defined in a later cell.
merge_alignments_graph(L)


In [None]:
# Scratch cell: MeCab tagger with default options.
# NOTE(review): repeats the MeCab import of an earlier cell.
import MeCab

m = MeCab.Tagger('')

In [None]:
import difflib
# test whether texts are identical
# NOTE(review): `text1` and `text2` are not assigned at module level in any
# visible cell (they only appear as locals inside functions), so this cell
# presumably relies on leftover kernel state - confirm before running.

# Split the strings into lists of lines
lines1 = text1.splitlines()
lines2 = text2.splitlines()

# Use ndiff to find the differences
diff_result = difflib.ndiff(lines1, lines2)

# Print the differences
for line in diff_result:
    print(line)

## Segment / Word Alignment
- segment: sentence by sentence
- word alignment
- merged groups 

In [None]:
import random

# build alignment segments with m *n sentence alignment groups 
def snt2segAlign(STnt, FTnt, SntAlign):
    """
    Turn sentence alignment groups into bilingual token segments.

    Parameters:
    -----------
    STnt : list of lists of (token, tokId)
        Source sentences, e.g. [[('Killer', 1), ('nurse', 2), ...], ...]

    FTnt : list of lists of (token, tokId)
        Target sentences, e.g. [[('El', 1), ('enfermero', 2), ...], ...]

    SntAlign : list of dict
        Sentence alignment groups with string values,
        e.g. [{'src': '1', 'tgt': '1'}, {'src': '2+3', 'tgt': '2+3'}, ...];
        the 1-based sentence ids of a group are joined by '+'.

    Returns:
    --------
    dict
        {group number (from 0): {'src': [tokens], 'tgt': [tokens],
                                 'sid': [source tokIds], 'tid': [target tokIds]}}
        where the lists concatenate all sentences of the group.
    """

    # separate the surface forms from the token ids, sentence by sentence
    src_tokens = [[tok for tok, _ in sentence] for sentence in STnt]
    tgt_tokens = [[tok for tok, _ in sentence] for sentence in FTnt]
    src_ids = [[tid for _, tid in sentence] for sentence in STnt]
    tgt_ids = [[tid for _, tid in sentence] for sentence in FTnt]

    def sentence_indexes(group):
        # '2+3' -> [1, 2] (0-based sentence indexes)
        return [int(part) - 1 for part in group.split('+')]

    SEGS = {}
    for number, aln in enumerate(SntAlign):
        s_idx = sentence_indexes(aln['src'])
        t_idx = sentence_indexes(aln['tgt'])

        segment = SEGS.setdefault(number, {})
        # tokens of all sentences in the group ...
        segment['src'] = [tok for i in s_idx for tok in src_tokens[i]]
        segment['tgt'] = [tok for i in t_idx for tok in tgt_tokens[i]]
        # ... and the matching token ids
        segment['sid'] = [tid for i in s_idx for tid in src_ids[i]]
        segment['tid'] = [tid for i in t_idx for tid in tgt_ids[i]]

    return SEGS

# random word alignmet per bilingual segment 
def rndAlignment(SEGS):
    """
    Create random token alignments inside every bilingual segment.

    For each segment, int(number of source tokens / 1.5) random
    (source tokId, target tokId) links are drawn. The links of all segments
    are then grouped by merge_alignments_graph.

    Side effect: SEGS[...]['aln'] is set to an empty list for every segment.

    Parameters:
    -----------
    SEGS : dict
        Segments as produced by snt2segAlign.

    Returns:
    --------
    list of dict
        Grouped alignments as returned by merge_alignments_graph.
    """

    links = []
    for segment in SEGS.values():
        segment['aln'] = []
        source_ids = segment['sid']
        target_ids = segment['tid']

        for _ in range(int(len(source_ids) / 1.5)):
            # random position on each side of the segment
            rs = random.randint(0, len(source_ids) - 1)
            rt = random.randint(0, len(target_ids) - 1)
            links.append({'src': source_ids[rs], 'tgt': target_ids[rt]})

    return merge_alignments_graph(links)

# simAlign word alignmet per bilingual segment 
def simAlignment(SEGS):
    """
    Word-align every bilingual segment with the global `myaligner`.

    get_word_aligns returns, per matching method, pairs of segment-relative
    token positions; these are translated into the text-relative token ids
    stored in the segment ('sid' / 'tid') and grouped by
    merge_alignments_graph.

    Parameters:
    -----------
    SEGS : dict
        Segments as produced by snt2segAlign.

    Returns:
    --------
    list of dict
        Grouped alignments as returned by merge_alignments_graph.
    """

    links = []
    for segment in SEGS.values():
        # the aligner must have been created at module level beforehand
        # result: {method: [(s, t), (s, t), ...]}
        word_aligns = myaligner.get_word_aligns(segment['src'], segment['tgt'])

        # segment-relative positions -> text-relative token ids
        for pairs in word_aligns.values():
            links.extend({'src': segment['sid'][s], 'tgt': segment['tid'][t]}
                         for s, t in pairs)

    return merge_alignments_graph(links)
                

def sntAlignment(STnt, FTnt):
    """
    Preliminary sentence alignment by position.

    Sentence i of the source is linked to sentence i of the target. If one
    side has more sentences, all surplus sentences are linked to the last
    sentence of the shorter side. The links are grouped by
    merge_alignments_graph.

    Parameters:
    -----------
    STnt, FTnt : list
        Source and target sentences; only their lengths are used.

    Returns:
    --------
    list of dict
        Grouped alignments as returned by merge_alignments_graph.
    """

    n_src = len(STnt)
    n_tgt = len(FTnt)

    # one-to-one links for the common prefix (ids are 1-based)
    links = [{'src': k, 'tgt': k} for k in range(1, min(n_src, n_tgt) + 1)]

    if n_src > n_tgt:
        # surplus source sentences go to the last target sentence
        links += [{'src': k, 'tgt': n_tgt} for k in range(n_tgt + 1, n_src + 1)]
    elif n_tgt > n_src:
        # surplus target sentences go to the last source sentence
        links += [{'src': n_src, 'tgt': k} for k in range(n_src + 1, n_tgt + 1)]

    # bring into a grouped format
    return merge_alignments_graph(links)


def merge_alignments_graph(alignments):
    """
    Merge pairwise alignment links into alignment groups.

    Two links belong to the same group if they share their 'src' value or
    their 'tgt' value, directly or through a chain of other links (connected
    components). The components are found with an iterative union-find, so
    the function needs no recursion and only one pass over the links.

    Parameters:
    -----------
    alignments : list of dict
        Links of the form {'src': id, 'tgt': id}. The ids must be hashable,
        sortable within one group and convertible with int().

    Returns:
    --------
    list of dict
        One {'src': str, 'tgt': str} entry per group, where the sorted ids
        of a group are joined by '+' (e.g. {'src': '2+3', 'tgt': '2+3+4'}).
        The groups are ordered by their smallest src id.
        An empty input gives [].
    """

    if not alignments:
        return []

    # parent[i] is the representative link of the component of link i
    parent = list(range(len(alignments)))

    def find(i):
        # locate the representative, then shorten the path to it
        root = i
        while parent[root] != root:
            root = parent[root]
        while parent[i] != root:
            parent[i], i = root, parent[i]
        return root

    def union(i, j):
        ri, rj = find(i), find(j)
        if ri != rj:
            # the smaller link index stays representative, so components keep
            # the order in which their first link appears in the input
            if rj < ri:
                ri, rj = rj, ri
            parent[rj] = ri

    # connect every link with the first link seen for the same src / tgt id
    first_with_src = {}
    first_with_tgt = {}
    for i, align in enumerate(alignments):
        j = first_with_src.setdefault(align['src'], i)
        if j != i:
            union(i, j)
        j = first_with_tgt.setdefault(align['tgt'], i)
        if j != i:
            union(i, j)

    # collect the src and tgt ids of each component
    groups = {}
    for i, align in enumerate(alignments):
        src_ids, tgt_ids = groups.setdefault(find(i), (set(), set()))
        src_ids.add(align['src'])
        tgt_ids.add(align['tgt'])

    merged = [{'src': sorted(src_ids), 'tgt': sorted(tgt_ids)}
              for src_ids, tgt_ids in groups.values()]

    # Sort by first src index
    merged.sort(key=lambda x: int(x['src'][0]))

    return [{'src': '+'.join(map(str, item['src'])),
             'tgt': '+'.join(map(str, item['tgt']))}
            for item in merged]



## Keystroke mapping

In [None]:
# Batch run of the keystroke mapping over all Translog-II files of the selected studies.
# NOTE(review): this cell uses `glob`, `os` and `verbose`, which are only
# imported / assigned in a later cell, and `OnlyKeyMapping`, which is defined
# further down; on a fresh kernel those cells have to be run first.
path = "/data/critt/tprdb/TPRDB"
# every assignment overwrites the previous one; only the last `studies` is used
studies = ['BML12', 'SG12', 'RUC17']
studies = ['BML12','SG12']
studies = ['AR22']
studies = ['RUC17']
studies = ['SG12']
studies = ['NJ12']

for study in studies:
    files = glob.glob(f"{path}/{study}/Translog-II/*.xml")
    if(verbose): print(f"Reading:{study}\twith {len(files)} files")

    for fn in sorted(files):
        base = os.path.basename(fn)
        # result file, relative to the current working directory
        out = f"TESTED/{study}-{base}"
        print(f"{fn} --> {out}")
        # files that were processed before are skipped
        if os.path.exists(out):
            if(verbose): print(f"  \tExists: {out}")
            continue

        Modifs = OnlyKeyMapping(fn, verbose=0, stanzaFeats=0) 
        # the printed representation of the keystroke dictionary is stored
        with open(out, "w") as f:
            print(Modifs, file=f)

        
        

In [None]:
# Candidate input files; every assignment overwrites the previous one,
# so only the last `fn` is actually processed.
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P01_T2.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P04_T2.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P03_T5.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P03_T6.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P05_T3.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P06_T6.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P09_T5.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P12_T3.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P13_T5.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P28_T6.xml'
fn = '/data/critt/tprdb/TPRDB/BML12/Translog-II/P29_T5.xml'

fn = '/data/critt/tprdb/TPRDB/SG12/Translog-II/P01_T1.xml'
fn = '/data/critt/tprdb/TPRDB/SG12/Translog-II/P02_T3.xml'
fn = '/data/critt/tprdb/TPRDB/NJ12/Translog-II/P01_P3.xml'


### Root of the WorkSpace XML output file
# (created here but not filled or used further in this cell)
WorkSpace_root = ET.Element("WorkSpace")

### Read Translog-XML file
translog = ET.parse(fn)
# Root of the Translog XML input file
translog_root = translog.getroot()

# get Source and Target Languages
e = translog_root.find('.//Languages')
SL = e.get('source') 
TL = e.get('target')

# get final text from Translog file 
FText = getFinalText(translog_root)

# segment and tokenize the target text
(FTsnt, FToken, FToken_root) = Tokenize(FText, TL, 'FinalToken')

# map every keystroke to a token of the final text, with trace output
# NOTE(review): KeyMapping is defined in a later cell.
Keys = KeyMapping(FText, FToken, translog_root, Verbose = 1)


In [None]:
# Re-run the keystroke mapping on FText / FToken / translog_root,
# which must already exist in the kernel from a previously executed cell.
Keys = KeyMapping(FText, FToken, translog_root, Verbose = 1)

In [None]:
def OnlyKeyMapping(fn, verbose=0, stanzaFeats=0) :
    """
    Compute the keystroke-to-token mapping for one Translog-II XML file.

    Only the steps needed for the mapping are executed: the final text is
    read and tokenized, and the keystrokes are mapped with KeyMapping.
    The full WorkSpace conversion (source tokenization, sentence and token
    alignment) is not run, since none of its results are used here.

    Parameters:
    -----------
    fn : str
        Path of the Translog-II XML file.
    verbose : int
        Passed to KeyMapping as `Verbose`.
    stanzaFeats : int or bool
        Passed to Tokenize; if truthy the tokens are enriched with
        Stanza features.

    Returns:
    --------
    dict
        The keystroke dictionary returned by KeyMapping.
    """

    # root of the Translog XML input file
    translog_root = ET.parse(fn).getroot()

    # only the target language is needed for the final text
    TL = translog_root.find('.//Languages').get('target')

    # get final text from Translog file
    FText = getFinalText(translog_root)

    # segment and tokenize the target text
    (FTsnt, FToken, FToken_root) = Tokenize(FText, TL, 'FinalToken', stanzaFeats=stanzaFeats)

    return KeyMapping(FText, FToken, translog_root, Verbose = verbose)


In [None]:
# NOTE(review): no visible cell assigns a module-level `Token` (the name only
# occurs as a local / parameter inside functions), so this inspection cell
# presumably depends on leftover kernel state - confirm or remove.
Token

In [None]:
def KeyMapping(Text, Token, translog_root, Verbose = 1):
    """
    Assign to every logged keystroke the id of the final-text token it
    belongs to.

    The final text is held as a list of characters together with a parallel
    list holding one token id per character. The keystrokes are then replayed
    in reverse time order, each one being undone on both lists: an insertion
    removes characters, a deletion puts the deleted characters back. The token
    id found at the cursor position is stored in the keystroke as 'tokId'.

    Parameters:
    -----------
    Text : str
        The final text.
    Token : list of dict
        Tokens of the final text with 'tokId', 'cur', 'tok' and
        optionally 'space' (the characters preceding the token).
    translog_root : xml.etree.ElementTree.Element
        Root of the Translog XML tree; the <Key> children of <Events> are read.
    Verbose : int
        0: only warnings and the summary line are printed;
        >0: one trace line per keystroke; >2: also the remaining text and index.

    Returns:
    --------
    dict
        {time: attribute dictionary of the <Key> element}, with 'Cursor' and
        'Time' converted to int and 'tokId' added for every keystroke that is
        not of type 'navi'. Keystrokes sharing a time stamp are stored under
        the next free time value.
    """

    text = list(Text)  # list of characters of the final text
    index = [0] * len(text) # token id per character of the final text

    pm = 10 # margin for character visualization, left and right

    def esc(chars):
        # join characters and show newlines as the two characters \n
        return ''.join(chars).replace('\n', '\\n')

    # test whether tokens fit text, print warning
    for tok in Token:
        cur =  tok['cur']
        if(not Text.startswith(tok['tok'],  cur)):
            # the offset may lie outside the text, so the lookup is guarded
            here = text[cur] if -len(text) <= cur < len(text) else ''
            left = esc(text[max(cur-pm, 0): cur])
            right = esc(text[cur+1: min(cur+pm,len(text))])
            print(f"\tWARNING cur:{tok['cur']} token:{tok['tokId']}:>{tok['tok']}< ~~ {left}>{here}<{right}")

    # token id per character of the final text
    if(index) : index[0] = 1
    for tok in Token:
        cur = tok['cur']
        end = cur + len(tok['tok'])

        # space before is part of the following token
        if('space' in tok) : cur -= len(tok['space'])
        index[cur:end] = [tok['tokId']]*(end-cur)

    # characters not covered by any token inherit the id of their left neighbour
    # (the loop has to run over positions, not over the stored ids)
    for i in range(1, len(index)):
        if(index[i] == 0): index[i] = index[i-1]

    # find keystrokes in the xml file
    Keys = {}
    events = translog_root.find('.//Events')
    key_elements = events.findall('Key') if events is not None else []
    for key_element in key_elements:
        d = dict(key_element.attrib)
        d['Cursor'] = int(d['Cursor'])
        d['Time'] = int(d['Time'])

        time = d['Time']
        # two keystrokes at the same time
        if(time in Keys) :
            print(f"\tKeystrokes same time {time}\t{d['Type']}:{d['Value']} -- {Keys[time]['Type']}:{Keys[time]['Value']}")
            # move on to the next free slot so that no earlier keystroke is overwritten
            while(time in Keys) : time += 1
        Keys[time] = d

    # undo a deletion: put the deleted characters back at cur
    # (reads `time` of the main loop below)
    def DeleteString(cur, cut):
        cutId = index[-1] if index else 1
        Keys[time]['tokId'] = cutId
        if(cur < len(index)) :
            cutId = index[cur]
            Keys[time]['tokId'] = index[cur]
            text[cur:cur] = cut
            index[cur:cur] = [cutId] * len(cut)
        else :
            text.extend(cut)
            index.extend([cutId] * len(cut))

    # undo an insertion: remove the inserted characters at cur
    # (reads `time` of the main loop below); returns a warning text or ''
    def InsertString(cur, ins):
        w = ''
        # beyond the end of the text the id set by the main loop is kept
        if(cur < len(index)) : Keys[time]['tokId'] = index[cur]

        # remove the inserted / pasted characters one by one
        for ch in ins :
            if(cur >= len(text)):
                # nothing left to remove at this position
                w = f"\tWARNING InsertString: {cur} length text {len(text)}"
                break
            if((ch != text[cur]) and  (text[cur] != '#') and  (text[cur] != '')):
                left = ''.join(text[max(cur-pm, 0): cur])
                right = esc(text[cur+1: min(cur+pm,len(text))])
                w = f"\tWARNING InsertString: {cur} key:>{ch}< text:>{text[cur]}<\t\t{left}>{text[cur]}<{right}"

            del text[cur]
            if(cur < len(index)) : del index[cur]
        return w

    Warn = 0

    ############################################################
    # main loop over keystrokes in reversed time
    for time in  sorted(Keys.keys(), reverse=True) :
        tpe = Keys[time]['Type']    # e.g. 'insert', 'delete', 'edit', 'return', 'navi'

        # navigation not interesting for modifications
        if(tpe == 'navi') : continue

        key = Keys[time]['Value']   # the value of the keystroke
        cur = Keys[time]['Cursor']  # cursor offset in text

        # text that is marked and deleted
        cut = ''
        if('Text' in Keys[time]) : cut = Keys[time]['Text']
        if('Paste' in Keys[time]) : cut = Keys[time]['Paste']

        warning = ''

        # initialize tokId
        if(len(index) > 0) :
            if(cur >= len(index)) : Keys[time]['tokId'] = index[-1]
            else : Keys[time]['tokId'] = index[cur]
        else : Keys[time]['tokId'] = 1

        # plot text changes
        if(Verbose) :
            shown_key = key.replace('\n', '\\n')
            shown_cut = cut.replace('\n', '\\n')
            context = esc(text[max(cur-pm, 0):min(cur+pm,len(text))])
            print(f"{time}\t{tpe} i:{shown_key}< d:{shown_cut}< \tc:{cur}\t{context}", end='')

        #################################################################
        # [Ctrl+V] and [Ctrl+X]
        if(tpe == 'edit') :

            # insertion: [Ctrl+V]
            if('Paste' in Keys[time]) :
                ins = list(Keys[time]['Paste'])
                warning = InsertString(cur, ins)

            # deletion: [Ctrl+X]
            if('Text' in Keys[time]) :
                cut = (list(Keys[time]['Text']))
                DeleteString(cur+1, cut)

        #################################################################
        if(tpe == 'insert' or tpe == 'return') :
            cut = ''

            # marked text that is deleted with the insertion
            if('Text' in Keys[time]) :
                cut = (list(Keys[time]['Text']))
                DeleteString(cur+1, cut)

            # insert keystroke
            if(tpe == 'return') : key ="\n"
            if(key == '' and cut == '') :
                key = ' '
                print(f"\tInsert with no value: {cur}")
            # remove the inserted character(s) again
            warning = InsertString(cur, list(key))

        #################################################################
        if(tpe == 'delete') :
            # without logged text one placeholder character '#' is restored
            if('Text' in Keys[time]) : cut = (list(Keys[time]['Text']))
            else: cut = list('#')

            # could be: [Back], [Ctrl+Back], [Shift+Back]
            if('Back' in key) :
                cutId = 1
                if(cur >= len(index)) :
                    if(len(index) > 0) : cutId = index[-1]
                    Keys[time]['tokId'] = cutId
                    text.extend(cut)
                    index.extend([cutId] * len(cut))

                elif(cur > 0) :
                    # the deleted characters belong to the token left of the cursor
                    cutId = index[cur-1]
                    Keys[time]['tokId'] = index[cur-1]
                    text[cur:cur] = cut
                    index[cur:cur] = [cutId] * len(cut)

                # cursor at first position. index / text might be empty
                else :
                    Keys[time]['tokId'] = 1

                    # insert deletion in the first position
                    text[0:0] = cut
                    index[0:0] = [cutId] * len(cut)

            elif('Delete' in key) : DeleteString(cur, cut)
            else:
                print(f"Delete key not covered: {key}")

        if(Verbose) :
            before = esc(text[max(cur-pm, 0): cur])
            after = esc(text[cur: min(cur+pm,len(text))])
            print(f"\t\t{before}@{after}\tToken:{Keys[time]['tokId']}\tlen:{len(text)}-{len(index)}")
            if(warning): print(warning)

        if(len(warning) > 1) : Warn += 1

    print(f"\tRemaining Text length: {len(index)}, Warnings:{Warn}")
    if(Verbose >2) :
        print(f"Remaining Text: {''.join(text)}")
        print(f"Remaining Text:{index}")

    return Keys



## XML 

In [None]:


# Convert list of dictionary into xml
def list_of_dicts_to_xml(data_list, root_tag='root', item_tag='item'):
    """
    Build an XML element from a list of dictionaries.

    Every dictionary becomes one child element whose attributes are the
    dictionary's key/value pairs; the values are converted with str().

    Args:
        data_list (list): Dictionaries to convert, one per child element.
        root_tag (str): Tag name of the returned root element.
        item_tag (str): Tag name of each child element.

    Returns:
        xml.etree.ElementTree.Element: The root element with its children.
    """
    root = ET.Element(root_tag)
    for record in data_list:
        # XML attributes are strings, so every value is stringified
        attributes = {name: str(value) for name, value in record.items()}
        ET.SubElement(root, item_tag, attributes)

    return root




### Events XML -> WKS XML

In [None]:

# Events Token container -> list of token dictionaries
def tokens2dict(xml_root):
    """
    Convert the <Token> children of an XML element into a list of dictionaries.

    Each dictionary holds the attributes of one <Token>. If an 'id' attribute
    exists, its integer value is additionally stored as 'tokId'. A token
    without 'cur' triggers a warning and receives the (integer) 'cur' of the
    preceding token, or 0 if there is none.

    Parameters:
    -----------
    xml_root : xml.etree.ElementTree.Element
        Element whose direct <Token> children are read.

    Returns:
    --------
    list of dict
        One dictionary per <Token>, in document order.
    """

    tokens = []
    previous = {}
    for element in xml_root.findall('Token'):
        current = dict(element.attrib)

        # numeric token id next to the original string attribute
        if 'id' in current:
            current['tokId'] = int(current['id'])

        # fall back to the cursor offset of the preceding token
        if 'cur' not in current:
            print(f"tokens2dict Warning: no cur snt:{current['segId']} tokId:{current['id']} >{current['tok']}<")
            current['cur'] = int(previous['cur']) if 'cur' in previous else 0

        tokens.append(current)
        previous = current

    return tokens


def tokens2snt(tokList):
    """
    Group a flat list of token dictionaries into sentences.

    A new sentence starts whenever the 'segId' of a token differs from the
    'segId' of the token before it.

    Parameters:
    -----------
    tokList : list of dict
        Token dictionaries with the keys 'segId', 'tok' and 'id'.

    Returns:
    --------
    list of lists
        [[(tok, id), ...], ...], one inner list per sentence.
    """

    sentences = []
    current = []
    last_seg = 0   # 0 marks "no sentence seen yet"
    for entry in tokList:
        seg = entry.get('segId')
        if seg != last_seg:
            # close the running sentence, except before the very first token
            if last_seg != 0:
                sentences.append(current)
                current = []
            last_seg = seg
        current.append((entry.get('tok'), entry.get('id')))

    # flush the last sentence
    if current:
        sentences.append(current)

    return sentences

# Events to Workspace

In [None]:
import glob
import os
import io

# Batch conversion: every <study>/Events/*Event.xml is converted with
# Events2WKS() and written to <study>/WKS/<session>.xml.
# Sessions whose output file already exists are skipped.
path = "/data/critt/tprdb/TPRDB"
studies = ['BML12', 'SG12', 'RUC17']
# the next assignment overrides the list above (restricts the run to one study)
studies = ['RUC17']
verbose = 2
    
for study in studies:
    files = glob.glob(f"{path}/{study}/Events/*Event.xml")
    if(verbose): print(f"Reading:{study}\twith {len(files)} files")
    try:
        os.mkdir(f"{path}/{study}/WKS/")
    except FileExistsError:
        print(f"\tDirectory WKS already exists.")

    n = 0
    for fn in sorted(files):
        # ".../P01_T1.Event.xml" -> session name "P01_T1"
        root, extension = os.path.splitext(fn)
        wks = os.path.basename(root).removesuffix(".Event")
        out = f"{path}/{study}/WKS/{wks}.xml"
        n += 1
        
        if os.path.exists(out):
            if(verbose): print(f"  {n}\tExists: {out}")
            continue
            
        if(verbose):  print(f"  {n}\tOutput: {out}")
            
        try:
            WorkSpace_root = Events2WKS(fn) 
        except ET.ParseError as err:
            # malformed Events file: report it and go on with the next session
            print(f"\tError in XML File: {err}")
            continue
        
        ET.indent(WorkSpace_root, space='  ')  # 2 spaces
        # explicit UTF-8: the token text is not ASCII for most target languages
        with open(out, "w", encoding="utf-8") as f:
            print(ET.tostring(WorkSpace_root, encoding='unicode'), file=f)



In [None]:
# Ad-hoc conversion of a single Events file.
# Each assignment overwrites the previous one, so only the last `fn` is
# converted; the earlier paths are kept as alternative inputs.
fn = "/data/critt/tprdb/TPRDB/ENJA15/Events/P01_T1.Event.xml"
fn = "/data/critt/tprdb/TPRDB/BML12/Events/P22_P5.Event.xml"
fn = "/data/critt/tprdb/TPRDB//RUC17/Events/P02_P3.Event.xml"
fn = "/data/critt/tprdb/TPRDB//RUC17/Events/P02_T1.Event.xml"
fn = "/data/critt/tprdb/TPRDB//RUC17/Events/P03_T1.Event.xml"


# Verbose=1 switches on the progress / count messages of Events2WKS
WorkSpace_root = Events2WKS(fn, Verbose=1) 

##################################################################
# pretty-print WorkSpace_root
ET.indent(WorkSpace_root, space='  ')  # 2 spaces
print(ET.tostring(WorkSpace_root, encoding='unicode'))

In [None]:
import stanza

# Step-by-step version of the first part of Events2WKS for one file:
# read the tokens and the sentence alignment of an Events file and
# compute a token alignment with simAlignment().
Verbose = 1
fn = "/data/critt/tprdb/TPRDB//RUC17/Events/P02_T1.Event.xml"

### Root of the WorkSpace XML output file
WorkSpace_root = ET.Element("WorkSpace")

if(Verbose) : print(f"Events2WKS 1")

### Read Events-XML file
events = ET.parse(fn)
if(Verbose) : print(f"Events2WKS 2")
# Root of the Events XML input file
events_root = events.getroot()
if(Verbose) : print(f"Events2WKS 2")

# get Source and Target Languages
e = events_root.find('.//Languages')
SL = e.get('source') 
TL = e.get('target')

if(Verbose) : print(f"Events2WKS: SL:{SL} TL:{TL}")
    
##################################################################
# Source tokens -> list of sentences, each a list of (tok, id) tuples
e = events_root.find(f".//SourceToken")
token = tokens2dict(e)
# Reconstruct Sentence from Tokens 
STsnt = tokens2snt(token)

# Final (target) tokens -> list of sentences
e = events_root.find(f".//FinalToken")
token = tokens2dict(e)
# Reconstruct Sentence from Tokens 
FTsnt = tokens2snt(token)

######################################
# Sentence alignment: attribute dictionaries of all <Salign> elements
e = events_root.find('.//Salignment')
L = []
for token in e.findall('Salign'):
    d = dict(token.attrib)
    L.append(d)
 
SntAlign = merge_alignments_graph(L)

segAlign = snt2segAlign(STsnt, FTsnt, SntAlign)
#tokAlign = rndAlignment(segAlign)
tokAlign = simAlignment(segAlign)


# Display the alignment computed above. The cell used to end with `rndAlign`,
# a name that is not assigned anywhere in this cell.
tokAlign

In [None]:
# Display the sentence alignment returned by merge_alignments_graph() in the cell above
SntAlign

In [None]:
def Events2WKS(fn, Verbose = 0) :
    """
    Build a WorkSpace XML tree from an Events XML file.

    The returned <WorkSpace> element receives, in this order:
      SourceToken, FinalToken : the tokens of the Events file after they went
                                through stanzaFeatures() (presumably adds
                                Stanza annotations - see that function)
      SntAlignment / Snt      : <Salignment>/<Salign> entries, merged by
                                merge_alignments_graph()
      TokAlignment / Tok      : <Alignment>/<Align> entries (sid/tid copied to
                                src/tgt), merged by merge_alignments_graph()
      Modifications / Mod     : <Modifications>/<Mod>, sid/tid renamed to src/tgt
      Fixations / Fix         : <Fixations>/<Fix>, sid/tid renamed to src/tgt
      SntEdits / Snt          : <Segments>/<Seg> attributes, unchanged

    An event section that is missing from the file is reported and treated
    as empty, so the corresponding output container has no children.

    Args:
        fn (str): path of the Events XML file.
        Verbose (int): if non-zero, print progress messages and item counts.

    Returns:
        xml.etree.ElementTree.Element: root of the WorkSpace tree.

    Raises:
        xml.etree.ElementTree.ParseError: if fn is not well-formed XML.
    """
    
    ### Root of the WorkSpace XML output file
    WorkSpace_root = ET.Element("WorkSpace")
    
    if(Verbose) : print(f"Events2WKS 1")
    
    ### Read Events-XML file
    events = ET.parse(fn)
    if(Verbose) : print(f"Events2WKS 2")
    # Root of the Events XML input file
    events_root = events.getroot()
    if(Verbose) : print(f"Events2WKS 2")
    
    # get Source and Target Languages
    e = events_root.find('.//Languages')
    SL = e.get('source') 
    TL = e.get('target')

    if(Verbose) : print(f"Events2WKS: SL:{SL} TL:{TL}")
        
    ##################################################################
    
    def EventsToken(tag, lng, Verbose=0) :
        """Return the token container `tag` of the Events file as an XML
        element with stanzaFeatures() applied and 'language' set to `lng`."""
        
        e = events_root.find(f".//{tag}")
        token = tokens2dict(e)
        
        if(Verbose) : print(f"EventsToken: lng:{lng} No:{len(token)}")

        # Reconstruct Sentence from Tokens 
        snt = tokens2snt(token)
        
        # get additional features from Stanza 
        # add features to the list of tokens 
        tokFeats = stanzaFeatures(snt, lng, token)
        
        # convert token Dictionary to xml
        token_root = list_of_dicts_to_xml(tokFeats, root_tag=tag, item_tag='Token')
        
        # assign the language of this token container 
        token_root.set('language', str(lng))
        return token_root
    
    def EventsItems(container, item, rename=None, keep=False) :
        """Return the attribute dictionaries of all `item` children of the
        first `container` element; an absent container yields an empty list.

        `rename` maps old attribute names to new ones; with keep=True the old
        key is kept next to the new one, otherwise it is removed."""
        
        section = events_root.find(f".//{container}")
        if section is None :
            print(f"Events2WKS Warning: no <{container}> section in {fn}")
            return []
        
        items = []
        for elem in section.findall(item):
            d = dict(elem.attrib)
            for old, new in (rename or {}).items():
                d[new] = d[old] if keep else d.pop(old)
            items.append(d)
        return items
    
    # Events files name the token references sid/tid, the WorkSpace src/tgt
    sid_tid = {'sid': 'src', 'tid': 'tgt'}
    
    ######################################
    # append ST tokenization to WorkSpace root 
    Root = EventsToken('SourceToken', SL, Verbose=Verbose)
    WorkSpace_root.append(Root)
    
    # append FT tokenization to WorkSpace root 
    Root = EventsToken('FinalToken', TL, Verbose=Verbose)
    WorkSpace_root.append(Root)
    
    ######################################
    # Sentence alignment
    L = EventsItems('Salignment', 'Salign')
    M = merge_alignments_graph(L)
    if(Verbose) : print(f"Salignment: {len(M)}")
    
    # map into xml format 
    Root = list_of_dicts_to_xml(M, root_tag='SntAlignment', item_tag='Snt')
    WorkSpace_root.append(Root)
    
    ######################################
    # Token alignment: sid/tid are copied (not moved) to src/tgt, as before
    # NOTE(review): confirm whether merge_alignments_graph needs sid/tid at all
    L = EventsItems('Alignment', 'Align', rename=sid_tid, keep=True)
    M = merge_alignments_graph(L)
    if(Verbose) : print(f"Alignment: {len(M)}")
    
    # map into xml format 
    Root = list_of_dicts_to_xml(M, root_tag='TokAlignment', item_tag='Tok')
    WorkSpace_root.append(Root)
    
    ##################################################################
    # Modifications: rename sid and tid
    L = EventsItems('Modifications', 'Mod', rename=sid_tid)
    if(Verbose) : print(f"Modifications: {len(L)}")

    Root = list_of_dicts_to_xml(L, root_tag='Modifications', item_tag='Mod')
    WorkSpace_root.append(Root)
    
    ##################################################################
    # Fixations: rename sid and tid
    L = EventsItems('Fixations', 'Fix', rename=sid_tid)
    if(Verbose) : print(f"Fixations: {len(L)}")
        
    Root = list_of_dicts_to_xml(L, root_tag='Fixations', item_tag='Fix')
    WorkSpace_root.append(Root)    
    
    ##################################################################
    ### Segment open-closing
    #  <Segments>
    #    <Seg sntId="1" open="72952" close="89436" />
    L = EventsItems('Segments', 'Seg')
    if(Verbose) : print(f"Segments: {len(L)}")

    Root = list_of_dicts_to_xml(L, root_tag='SntEdits', item_tag='Snt')
    WorkSpace_root.append(Root)

    return WorkSpace_root