In [1]:
# import modules
import os, anparse

In [2]:
# head function to limit output to first 25 lines
def head(sequence, num=25):
    """Yield at most the first `num` items of `sequence`.

    A final '...' marker is yielded only when the sequence actually
    contains more than `num` items, so that short sequences are not
    shown as if they had been truncated.

    Parameters:
        sequence: any iterable (list, generator, ...).
        num: maximum number of items to pass through (default 25).

    Yields:
        The first `num` items of `sequence`, followed by the string
        '...' if at least one further item was available.

    Note: to find out whether anything was cut off, one extra item is
    pulled from the underlying iterator and discarded.
    """
    iterator = iter(sequence)
    # range() comes first in zip(), so zip stops *before* pulling an
    # extra item from the iterator once `num` items have been yielded.
    for _, line in zip(range(num), iterator):
        yield line
    # Peek for one more item; only then was the output truncated.
    exhausted = object()
    if next(iterator, exhausted) is not exhausted:
        yield '...'

In [3]:
# We take the Book of the Laws of the Countries (BLC) as an example.
# Parsing an an_file requires the an_file itself plus some auxiliary files:
# an alphabet, a lexicon, a psdef(*) and a word_grammar file. If no
# auxiliary files are provided, an 'at2ps.conf' file is looked for in the
# directory of the an_file. For the BLC such an at2ps.conf file is present,
# so the auxiliary files need not be provided explicitly.
# (*) In fact the psdef file is not needed, since no ps files are generated;
#     it is ignored, but for now the functions still expect some value.

an_file = 'blc/Laws.an'

In [4]:
# The function parse_anfile() returns tuples with all analyzed data.
# The result is kept in `laws` and consumed item by item with next().

laws = anparse.parse_anfile(an_file)

In [5]:
# Example of analyzed data (one item of `laws`):
# - tuple with verse label, surface form, analyzed form, and a tuple of word elements.
#   - word element: tuple with analyzed form, morphemes, functions, lex.
#     - morphemes: tuples with morpheme type and a tuple of morpheme forms
#       (paradigmatic, surface, analyzed form).
#     - functions: tuples with function identifier and value.
#     - lex: lemma as found in the lexicon; tuple with lex-id and
#       descriptions as id-value tuples.

next(laws)

('0,1',
 'TWB',
 'TWB',
 (('TWB',
   ((('lex', ('TWB', 'TWB', 'TWB')),),
    (('nu', False),
     ('gn', False),
     ('st', False),
     ('vt', False),
     ('vs', False),
     ('ps', False),
     ('sp', 'advb')),
    ('11299', (('sp', 'advb'), ('gl', 'again, back'))))),))

In [6]:
# The function print_anfile() yields the same data with some labels added
# ('morphemes:', 'functions:', 'lex      :').

prnt = anparse.print_anfile(an_file)

# Only show the beginning of the output (head() defaults to 25 lines).
for line in head(prnt):
    print(line)

0,1	TWB	TWB
    TWB
	morphemes: (('lex', ('TWB', 'TWB', 'TWB')),)
	functions: (('nu', False), ('gn', False), ('st', False), ('vt', False), ('vs', False), ('ps', False), ('sp', 'advb'))
	lex      : ('11299', (('sp', 'advb'), ('gl', 'again, back')))
0,1	KTB>	KTB=/~>
    KTB=/~>
	morphemes: (('lex', ('KTB=', 'KTB', 'KTB=')), ('nme', ('', '', '')), ('emf', ('>', '>', '>')))
	functions: (('vt', False), ('vs', False), ('ps', False), ('sp', 'subs'), ('nu', None), ('gn', 'm'), ('st', 'emph'))
	lex      : ('8929', (('sp', 'subs'), ('gn', 'm'), ('gl', 'writing, book')))
0,1	DNM"WS>	D-NMWS/(J~>
    D
	morphemes: (('lex', ('D', 'D', 'D')),)
	functions: (('nu', False), ('gn', False), ('st', False), ('vt', False), ('vs', False), ('ps', False), ('sp', 'prep'), ('ls', 'pcon'))
	lex      : ('7789', (('sp', 'prep'), ('ls', 'pcon'), ('gl', '(relative)')))
    NMWS/(J~>
	morphemes: (('lex', ('NMWS', 'NMWS', 'NMWS')), ('nme', ('J', '', '(J')), ('emf', ('>', '>', '>')))
	functions: (('vt', False), ('vs', Fa

In [7]:
# dump_anfile() mimics the output of ETCBC .dmp files. The annotations
# do not appear in the same order, however, so the output cannot be
# compared with diff or similar tools.

dmp = anparse.dump_anfile('BLC', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

BLC 0,1	TWB	TWB	TWB	-	sp=advb
BLC 0,1	KTB=/~>	KTB>	KTB=	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
BLC 0,1	D	D	D	-	sp=prep,ls=pcon
BLC 0,1	NMWS/(J~>	NMWS>	NMWS	nme="J",emf=">"	sp=subs,nu=pl,gn=m,st=emph
BLC 0,1	D	D	D	-	sp=prep,ls=pcon
BLC 0,1	>TR/&WT=~>	>TRWT>	>TR	nme="T=",emf=">"	sp=subs,nu=pl,gn=m,st=emph
BLC 1,1	MN	MN	MN	-	sp=prep
BLC 1,1	QDM	QDM	QDM	-	sp=prep
BLC 1,1	JWM/T=~>	JWMT>	JWM	nme="T=",emf=">"	sp=subs,nu=pl,gn=m,st=emph
BLC 1,1	<L=[/JN	<LJN	<L=	vbe="",nme="JN"	sp=verb,nu=pl,gn=m,st=abs,vo=act,vs=pe,vt=ptc
...


In [8]:
# Next, check the other an_files, starting with P_EpBarA.

an_file = 'turgama/ApBar/P_EpBarA.an'
dmp = anparse.dump_anfile('ApBarA', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

ApBarA 78,0	TWB	TWB	TWB	-	sp=advb
ApBarA 78,0	>GR(>/T~>	>GRT>	>GR>	nme="T",emf=">"	sp=subs,nu=sg,gn=f,st=emph
ApBarA 78,0	QDMJ/T~>	QDMJT>	QDMJ	nme="T",emf=">"	sp=adjv,nu=sg,gn=f,st=emph
ApBarA 78,0	D	D	D	-	sp=prep,ls=pcon
ApBarA 78,0	BRWK=/	BRWK	BRWK=	nme=""	sp=subs,+nu,+gn,st=abs,ls=prop
ApBarA 78,0	SPR=/~>	SPR>	SPR=	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
ApBarA 78,0	D	D	D	-	sp=prep,ls=pcon
ApBarA 78,0	CDR[:d	CDR	CDR	vbe="",vpm=d	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third
ApBarA 78,0	MN	MN	MN	-	sp=prep
ApBarA 78,0	GW/	GW	GW	nme=""	sp=subs,+nu,gn=m,+st,ls=ppre
...


In [9]:
# Same check for P_EpBarB.
an_file = 'turgama/ApBar/P_EpBarB.an'
dmp = anparse.dump_anfile('ApBarB', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

ApBarB 78,0	>GR(>/T~>	>GRT>	>GR>	nme="T",emf=">"	sp=subs,nu=sg,gn=f,st=emph
ApBarB 78,0	D	D	D	-	sp=prep,ls=pcon
ApBarB 78,0	BRWK=/	BRWK	BRWK=	nme=""	sp=subs,+nu,+gn,st=abs,ls=prop
ApBarB 78,0	BR/	BR	BR	nme=""	sp=subs,+nu,gn=m,+st
ApBarB 78,0	NRJ>/	NRJ>	NRJ>	nme=""	sp=subs,+nu,+gn,st=abs,ls=prop
ApBarB 78,0	D	D	D	-	sp=prep,ls=pcon
ApBarB 78,0	KTB[	KTB	KTB	vbe=""	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third
ApBarB 78,0	L	L	L	-	sp=prep
ApBarB 78,0	TC</>	TC<>	TC<	nme=">"	sp=subs,nu=sg,gn=f,st=abs,ls=card
ApBarB 78,0	CBV/JN	CBVJN	CBV	nme="JN"	sp=subs,nu=pl,gn=m,st=abs
...


In [10]:
# Same check for P_Judices.
an_file = 'turgama/Judges_Syr/P_Judices.an'
dmp = anparse.dump_anfile('Judices', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

Judices 1,1	W	W	W	-	sp=conj
Judices 1,1	HW(J&>[	HW>	HWJ	vbe=""	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third,ls=vbex
Judices 1,1	MN	MN	MN	-	sp=prep
Judices 1,1	BTR	BTR	BTR	-	sp=prep
Judices 1,1	D	D	D	-	sp=prep,ls=pcon
Judices 1,1	M(W&JT[	MJT	MWT	vbe=""	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third
Judices 1,1	JCW</	JCW<	JCW<	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
Judices 1,1	BR/	BR	BR	nme=""	sp=subs,+nu,gn=m,+st
Judices 1,1	NWN=/	NWN	NWN=	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
Judices 1,1	<BD=/	<BD	<BD=	nme=""	sp=subs,+nu,gn=m,+st
...


In [11]:
# The directory with the Prayer of Manasse (originally from
# /home/eep/synheb/Turgama/) does contain an 'at2ps.conf' file, but
# the syrlex, syrpsd and syrwgr files mentioned in that conf file are
# missing. Other files therefore have to be provided in their place,
# and it still needs to be checked whether they are correct.
# Path conventions: the path to the an_file is relative to the data
# directory; the paths of the other files are relative to the directory
# containing the an_file.
# For now the auxiliary files are taken from 'turgama/Judges_Syr'.

an_file = 'turgama/OrMan/P_OrManA.an'
conf_file = '../Judges_Syr/at2ps.conf'
dmp = anparse.dump_anfile('OrManA', an_file, conf_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

OrManA 1,0	YLW/T~>	YLWT>	YLW	nme="T",emf=">"	sp=subs,nu=sg,gn=f,st=emph
OrManA 1,0	D	D	D	-	sp=prep,ls=pcon
OrManA 1,0	MNC>/	MNC>	MNC>	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
OrManA 1,1	MRJ>/	MRJ>	MRJ>	nme=""	sp=subs,+nu,+gn,st=abs,ls=prop
OrManA 1,1	>LH/~>	>LH>	>LH	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
OrManA 1,1	D	D	D	-	sp=prep,ls=pcon
OrManA 1,1	>B/&HJ	>BHJ	>B	nme="J"	sp=subs,nu=pl,gn=m,st=cst
OrManA 1,1	N	N	N	-	nu=pl,ps=first,sp=pron,ls=pers
OrManA 1,1	>LH/	>LH	>LH	nme=""	sp=subs,+nu,gn=m,+st
OrManA 1,1	H	H	H	-	nu=sg,gn=m,ps=third,sp=pron,ls=pers
...


In [12]:
# Auxiliary files: see the note on P_OrManA above; the same
# `conf_file` is reused here.

an_file = 'turgama/OrMan/P_OrManB.an'
dmp = anparse.dump_anfile('OrManB', an_file, conf_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): no errors.
for line in head(dmp, num=10):
    print(line)

OrManB 1,0	YLW/T~>	YLWT>	YLW	nme="T",emf=">"	sp=subs,nu=sg,gn=f,st=emph
OrManB 1,0	D	D	D	-	sp=prep,ls=pcon
OrManB 1,0	MNC>/	MNC>	MNC>	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
OrManB 1,0	MLK/~>	MLK>	MLK	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
OrManB 1,0	D	D	D	-	sp=prep,ls=pcon
OrManB 1,0	BN/J	BNJ	BN	nme="J"	sp=subs,nu=pl,gn=m,st=cst
OrManB 1,0	>JSR&>JL/	>JSR>JL	>JSRJL	nme=""	sp=subs,+nu,+gn,st=abs,ls=prop
OrManB 1,0	KD	KD	KD	-	sp=conj
OrManB 1,0	@>(T&C@(C&TBJ==[	>CTBJ	CBJ==	pfx=">T",vbe=""	nu=sg,gn=m,sp=verb,vo=pas,vs=pe,vt=pf,ps=third
OrManB 1,0	L	L	L	-	sp=prep
...


In [13]:
# TODO: WRONG LEXICON - find the right lexicon to use for this an_file.
an_file = 'calap/BenSira.an'
dmp = anparse.dump_anfile('BenSira', an_file, 'data/at2ps.conf')

# Print the first 10 entries only.
# Full run (`for line in dmp:`): lexicon error.
for line in head(dmp, num=10):
    print(line)

BenSira 1,0	XKM(>/T~>	XKMT>	XKM>	nme="T",art=">"	sp=subs,nu=sg,gn=f,st=det
BenSira 1,0	D	D	D	-	sp=conj
BenSira 1,0	BR/	BR	BR	nme=""	sp=subs,+nu,gn=m,+st
BenSira 1,0	SJR>/	SJR>	SJR>	nme=""	sp=subs,+nu,+gn,+st
BenSira 1,1	KL/	KL	KL	nme=""	sp=subs,+nu,+gn,+st
BenSira 1,1	XKM>/	XKM>	XKM>	nme=""	sp=subs,+nu,gn=f,+st
BenSira 1,1	MN	MN	MN	-	sp=prep
BenSira 1,1	QDM	QDM	QDM	-	sp=prep
BenSira 1,1	MRJ>/	MRJ>	MRJ>	nme=""	sp=nmpr,+nu,+gn,st=abs
BenSira 1,1	HJ	HJ	HJ	-	nu=sg,gn=f,ps=third,sp=prps,ls=ppde
...


In [14]:
# TODO: WRONG LEXICON - find the right lexicon to use for this an_file.
an_file = 'calap/Reges.an'
dmp = anparse.dump_anfile('Reges', an_file, 'data/at2ps.conf')

# Print the first 10 entries only.
# Full run (`for line in dmp:`): error.
for line in head(dmp, num=10):
    print(line)

Reges 1,1	W	W	W	-	sp=conj
Reges 1,1	MLK/~>	MLK>	MLK	nme="",art=">"	sp=subs,+nu,gn=m,st=det
Reges 1,1	DWJD/	DWJD	DWJD	nme=""	sp=nmpr,+nu,gn=m,st=abs
Reges 1,1	S>B[	S>B	S>B	vbe=""	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third
Reges 1,1	W	W	W	-	sp=conj
Reges 1,1	<L=[	<L	<L=	vbe=""	nu=sg,gn=m,sp=verb,vo=act,vs=pe,vt=pf,ps=third
Reges 1,1	B	B	B	-	sp=prep
Reges 1,1	CN(>/J~>	CNJ>	CN>	nme="J",art=">"	sp=subs,nu=pl,gn=f,st=det
Reges 1,1	W	W	W	-	sp=conj
Reges 1,1	!M!KS(>|[/JN	MKSJN	KS>	pfm="M",frv="",vbe="",nme="JN"	sp=verb,st=abs,vo=act,vs=pa,vt=inf
...


In [15]:
# Same check for P_Psalmi.
an_file = 'calap/P_Psalmi/P_Psalmi.an'
dmp = anparse.dump_anfile('Ps', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): lexicon error in 3,2: word SGJ=
for line in head(dmp, num=10):
    print(line)

Ps 1,1	VWB/(J	VWB	VWB	nme="J"	sp=subs,nu=pl,gn=m,st=cst
Ps 1,1	&WHJ==	WHJ	HJ==	-	nu=sg,gn=m,ps=third,sp=pron,ls=pers
Ps 1,1	L	L	L	-	sp=prep
Ps 1,1	GBR/~>	GBR>	GBR	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
Ps 1,1	D	D	D	-	sp=prep,ls=pcon
Ps 1,1	B	B	B	-	sp=prep
Ps 1,1	>WRX/~>	>WRX>	>WRX	nme="",emf=">"	sp=subs,+nu,gn=f,st=emph
Ps 1,1	D	D	D	-	sp=prep,ls=pcon
Ps 1,1	<WL/(J~>	<WL>	<WL	nme="J",emf=">"	sp=adjv,nu=pl,gn=m,st=emph
Ps 1,1	L>	L>	L>	-	sp=nega
...


In [16]:
# Same check for preek.an.
an_file = 'efrem/preekjona/preek.an'
dmp = anparse.dump_anfile('Preek', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): lexicon error in 1,353:
# word XJ> (root XJ, there: de=XJ>)
for line in head(dmp, num=10):
    print(line)

Preek 1,0	M>MR/J~>	M>MRJ>	M>MR	nme="J",emf=">"	nu=pl,gn=m,sp=subs,st=emph
Preek 1,0	D	D	D	-	sp=prep,ls=pcon
Preek 1,0	MR(>/J	MRJ	MR>	nme="J"	nu=pl,gn=m,sp=subs,st=cst
Preek 1,0	>PRJM/	>PRJM	>PRJM	nme=""	nu=sg,+gn,sp=subs,st=abs,ls=prop
Preek 1,0	VWB/	VWB	VWB	nme=""	nu=sg,gn=m,sp=subs,+st
Preek 1,0	N>	N>	N>	-	sp=intj
Preek 1,0	QDMJ/~>	QDMJ>	QDMJ	nme="",emf=">"	nu=sg,+gn,sp=adjv,st=emph
Preek 1,0	D	D	D	-	sp=prep,ls=pcon
Preek 1,0	<L	<L	<L	-	sp=prep,ls=pcon
Preek 1,0	NJNW>/	NJNW>	NJNW>	nme=""	nu=sg,+gn,sp=subs,st=abs,ls=prop
...


In [17]:
# Same check for eprx.an.
an_file = 'efrem/eprx/eprx.an'
dmp = anparse.dump_anfile('PrRef', an_file)

# Print the first 10 entries only.
# Full run (`for line in dmp:`): lexicon error in 4,21:
# word BY not in lexicon, it is in eprx4.gloss though
for line in head(dmp, num=10):
    print(line)

PrRef 1,1	TWB	TWB	TWB	-	sp=advb
PrRef 1,1	M>MR/~>	M>MR>	M>MR	nme="",emf=">"	sp=subs,+nu,gn=m,st=emph
PrRef 1,1	D	D	D	-	sp=prep,ls=pcon
PrRef 1,1	LWQBL	LWQBL	LWQBL	-	sp=prep
PrRef 1,1	MNJ==/	MNJ	MNJ==	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
PrRef 1,1	!N!@(>(T&C@(C&T>L[	NCT>L	C>L	pfm="N",pfx=">T",vbe=""	nu=sg,gn=m,sp=verb,vo=pas,vs=pe,vt=ipf,ps=third
PrRef 1,1	MNJ==/	MNJ	MNJ==	nme=""	sp=subs,+nu,gn=m,st=abs,ls=prop
PrRef 1,1	<L	<L	<L	-	sp=prep,ls=pcon
PrRef 1,1	HW	HW	HW	-	nu=sg,gn=m,ps=third,sp=pron,ls=pers
PrRef 1,1	>RKWNV/~>	>RKWNV>	>RKWNV	nme="",emf=">"	sp=subs,+nu,+gn,st=emph
...
