# Collating lemmatised texts in XML with Collatex

Let's start by defining an import function that converts the XML witnesses to the JSON input expected by CollateX.

In [None]:
from lxml import etree
import json

def XMLtoJson(id, xmlInput):
    """Convert a tokenised, annotated TEI witness into a CollateX JSON witness.

    Every ``tei:w`` element found in ``xmlInput`` (in document order) becomes
    one token. The token text ``t`` is the text content of the element; the
    ``xml:id`` and ``lemma`` attributes are copied as-is; the ``type``
    attribute is split at its first ``|`` into ``POS`` (before) and ``morph``
    (after).

    Tokens are built as Python dicts rather than by concatenating JSON text,
    so quotes, backslashes or line breaks in the source cannot produce
    invalid JSON.

    :param id: siglum of the witness
    :param xmlInput: parsed XML (element or tree) exposing the ElementTree
        API (``iter``, ``get``, ``itertext``), e.g. the result of
        ``etree.XML``
    :return: dict with the keys ``id`` and ``tokens``, ready for collation
    """
    # Qualified names in Clark notation, as used by the ElementTree API.
    tei_w = '{http://www.tei-c.org/ns/1.0}w'
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'

    tokens = []
    for word in xmlInput.iter(tei_w):
        pos, separator, morph = word.get('type', '').partition('|')
        if not separator:
            # Same result as XPath substring-before(): empty when there is
            # no '|' in the annotation.
            pos = ''
        tokens.append({
            't': ''.join(word.itertext()),
            'xml:id': word.get(xml_id, ''),
            'lemma': word.get('lemma', ''),
            'POS': pos,
            'morph': morph,
        })

    return {'id': id, 'tokens': tokens}

And now, let's load the witnesses, convert them, and collate them, displaying the result as a table.

In [None]:
import collatex
from lxml import etree

# Create the witnesses
# Sigla of the witnesses to import: A F G H M P R S V
sigla = ['A', 'F', 'G', 'H', 'M', 'P', 'R', 'S', 'V']

# Parse every witness file; the `with` block closes each file handle as soon
# as its content has been read.
parsed = {}
for siglum in sigla:
    with open('data/xml/' + siglum + '.xml') as source:
        parsed[siglum] = etree.XML(source.read())

# Keep one variable per witness available under its siglum.
A, F, G, H, M, P, R, S, V = (parsed[siglum] for siglum in sigla)

# Add A F G H M P R S V to the input, in that order
json_input = {'witnesses': [XMLtoJson(siglum, parsed[siglum]) for siglum in sigla]}

# For reference, the keyword arguments accepted by collate():
#collate(json_input, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False,
#            detect_transpositions=False, debug_scores=False, properties_filter=None, svg_output=True, indent=False)

result = collatex.collate(json_input, output="table", layout="vertical", segmentation=False, near_match=True)#,svg_output=True)

print(result)

Or, as TEI or as generic XML:

In [None]:
# Collate the same input again, this time asking for TEI output.
result = collatex.collate(
    json_input,
    output="tei",
    segmentation=False,
    near_match=True,
)
print(result)
# Uncomment the following lines to write the output to a file instead:
# f = open('results2_tei.xml', 'w')
#print(result, file=f)
#f.close()

In [None]:
# Collate the same input again, this time asking for XML output.
result = collatex.collate(
    json_input,
    output="xml",
    segmentation=False,
    near_match=True,
)
print(result)
# Uncomment the following lines to write the output to a file instead:
#f = open('results2.xml', 'w')
#print(result, file=f)
#f.close()


Or as JSON:

In [None]:
# Collate the same input again, this time asking for JSON output.
result = collatex.collate(
    json_input,
    output="json",
    segmentation=False,
    near_match=True,
)
print(result)
# Uncomment the following lines to write the output to a file instead:
#f = open('results2.json', 'w')
#print(result, file=f)
#f.close()

## Going further

This is all well and good, but, preferably, we would want to:

- collate using lemmata instead of form;
- keep linguistic information in the output.

We'll need to define some functions again, and even redefine core functions of Collatex.



In [None]:
def XMLtoJson(id, xmlInput):
    """Convert a tokenised, annotated TEI witness into a CollateX JSON witness.

    Uses the lemma as ``t`` instead of the token text, so that the lemma is
    the base of the collation; the text content of the element is kept under
    ``form``.

    Every ``tei:w`` element found in ``xmlInput`` (in document order) becomes
    one token. The ``xml:id`` attribute is copied as-is and the ``type``
    attribute is split at its first ``|`` into ``POS`` (before) and ``morph``
    (after).

    Tokens are built as Python dicts rather than by concatenating JSON text,
    so quotes, backslashes or line breaks in the source cannot produce
    invalid JSON.

    :param id: siglum of the witness
    :param xmlInput: parsed XML (element or tree) exposing the ElementTree
        API (``iter``, ``get``, ``itertext``), e.g. the result of
        ``etree.XML``
    :return: dict with the keys ``id`` and ``tokens``, ready for collation
    """
    # Qualified names in Clark notation, as used by the ElementTree API.
    tei_w = '{http://www.tei-c.org/ns/1.0}w'
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'

    tokens = []
    for word in xmlInput.iter(tei_w):
        pos, separator, morph = word.get('type', '').partition('|')
        if not separator:
            # Same result as XPath substring-before(): empty when there is
            # no '|' in the annotation.
            pos = ''
        tokens.append({
            'form': ''.join(word.itertext()),
            'xml:id': word.get(xml_id, ''),
            't': word.get('lemma', ''),
            'POS': pos,
            'morph': morph,
        })

    return {'id': id, 'tokens': tokens}

## And now we need to redefine a collatex function to allow for various attributes
def table_to_xml(table):
    """Convert a CollateX collation table to XML, keeping token annotations.

    Each column of ``table`` becomes an ``<app>`` element containing one
    ``<rdg>`` per witness of that column, sorted by siglum. The text of a
    reading is the concatenated ``form`` of its tokens; the attributes
    ``id``, ``lemma``, ``POS`` and ``morph`` hold the concatenated
    ``xml:id``, ``t``, ``POS`` and ``morph`` values of the same tokens.

    :param table: collation table exposing ``columns``, each of which has a
        ``tokens_per_witness`` mapping from siglum to a list of tokens
    :return: the serialised ``<app>`` elements wrapped in a ``<root>`` element
    """
    # (attribute written on <rdg>, key read from token_data), in output order.
    # TODO: redefine this to accept an arbitrary number of elements, and do it properly
    # TODO: apparently, it also does not accept xml:id as an attribute name
    annotations = (
        ('id', 'xml:id'),
        ('lemma', 't'),
        ('POS', 'POS'),
        ('morph', 'morph'),
    )

    def joined(tokens, key):
        # Concatenate one token_data field over all tokens of a reading.
        return "".join(str(token.token_data[key]) for token in tokens)

    serialised = []
    for column in table.columns:
        app = etree.Element('app')
        for siglum, tokens in sorted(column.tokens_per_witness.items()):
            rdg = etree.SubElement(app, 'rdg')
            rdg.set('wit', "#" + siglum)
            rdg.text = joined(tokens, "form")
            for attribute, key in annotations:
                rdg.set(attribute, joined(tokens, key))
        # Without the encoding specification, tostring returns bytes instead of a string
        serialised.append(etree.tostring(app, encoding="unicode"))
    return "<root>" + "".join(serialised) + "</root>"

In [None]:
# Create the witnesses
# Sigla of the witnesses to import: A F G H M P R S V
sigla = ['A', 'F', 'G', 'H', 'M', 'P', 'R', 'S', 'V']

# Parse every witness file; the `with` block closes each file handle as soon
# as its content has been read.
parsed = {}
for siglum in sigla:
    with open('data/xml/' + siglum + '.xml') as source:
        parsed[siglum] = etree.XML(source.read())

# Keep one variable per witness available under its siglum.
A, F, G, H, M, P, R, S, V = (parsed[siglum] for siglum in sigla)


# Add A F G H M P R S V to the input, in that order
json_input = {'witnesses': [XMLtoJson(siglum, parsed[siglum]) for siglum in sigla]}

# For reference, the keyword arguments accepted by collate():
#collate(json_input, output="table", layout="horizontal", segmentation=True, near_match=False, astar=False,
#            detect_transpositions=False, debug_scores=False, properties_filter=None, svg_output=True, indent=False)

#layout="horizontal"
#scheduler=collatex.near_matching.Scheduler()

result = collatex.collate(json_input, output="table", layout="vertical", segmentation=False, near_match=True)#,svg_output=True)

print(result)

But we would like an output that preserves the additional information we provided as input, so that we can use it to categorise variants.

In [None]:
# Serialise the collation table with table_to_xml and display the result.
xml_output = table_to_xml(result)
print(xml_output)

# Uncomment the following lines to write the output to a file instead:
#f = open('results.xml', 'w')
#print(xml_output, file=f)
#f.close()