In [1]:
import collections
import re

# One parsed text. get_texts() fills the `text` field with the list of
# (verse_no, verse_text) tuples returned by split_verses().
Text = collections.namedtuple('Text', 'id dialect title informant place text')

# NOTE: all patterns are raw strings, so that regex escapes such as \s, \( and
# \. are not (mis)interpreted as string escapes by the Python parser.

# verse number in parentheses, e.g. '(12)', including surrounding whitespace
re_verse_no = re.compile(r'\s*(\([0-9]+\))\s*')
# footnote: a line starting with a space, up to and including the first period
re_footnote = re.compile(r'^ [^.]*\.')

# regexes for Barwar texts
# title line: text id (e.g. 'A12'), optional spaces, tab(s), then the title
re_title = re.compile(r'^([ABCD][0-9]+) *[\t]+(.*\S)\s*$')
# informant line: 'Informant: <name> (<place>)'
re_info = re.compile(r'^Informant: (.*) \((.*)\)\s*$')

# regexes for Urmi texts
# heading line: text id on its own line, possibly with a space, e.g. 'A 1'
re_heading = re.compile(r'^([AB]\s*[0-9]+)\s*$')
# title line with informant and place: '<title> (<informant>, <place>)'
re_title_info = re.compile(r'^(.*\S)\s*\(([^,]*), (.*)\)\s*$')
# lines consisting only of capitals, digits and spaces (e.g. section headers)
re_ignore = re.compile(r'^[A-Z][A-Z0-9 ]+\s*$')
# version regex is for Urmi text A35: 'Version <n>: <informant> (<place>)'
re_version = re.compile(r'^(Version [0-9]+): (.*) \((.*)\)\s*$')

# module-level log of every line or fragment that the parser discarded
ignored = []

def get_texts(filename, dialect, encoding='utf-8'):
    """Read and structure NENA texts.

    Reads from filename, and structures the texts to fit in the fields
    of the namedtuple Text.
    Optimized for text files extracted from MS-Word files with Barwar
    and Christian Urmi texts.

    Parameters:
        filename: path of the plain-text file to read.
        dialect: value stored unchanged in the `dialect` field of every
            returned Text.
        encoding: text encoding used to open the file (default 'utf-8').

    Returns:
        A list of Text tuples, one per heading/title line found in the
        file, in file order. The `text` field of each tuple is the
        result of split_verses(). If the file contains no heading or
        title line at all, an empty list is returned.

    Side effects:
        Every discarded line or fragment is appended to the module-level
        list `ignored`.
    """

    t_id = None
    title = None
    informant = None
    place = None
    text = []
    texts = []

    with open(filename, 'r', encoding=encoding) as text_file:
        for line in text_file:
            # ignore 'empty' lines of at most 1 non-whitespace character
            empty = len(line.strip()) < 2

            # A title (Barwar) or heading (Urmi) line starts a new text
            title_line = re_title.match(line)
            heading_line = re_heading.match(line)
            if title_line or heading_line:
                if t_id is not None:
                    # store the text that was being collected so far
                    texts.append(Text(t_id, dialect, title, informant, place,
                                      split_verses(text)))
                if title_line:
                    t_id, title = title_line.groups()
                else:
                    # remove space in 'A 1'
                    t_id = ''.join(heading_line.group(1).split())
                    title = None
                informant = None
                place = None
                text = []
                continue

            # if no heading/title has yet been encountered,
            # line is part of front matter, so ignore.
            # This check comes before the version check, so that a
            # 'Version' line in the front matter cannot touch the state
            # of a text that does not exist yet.
            if t_id is None:
                continue

            # Version is special case for Urmi A35
            # Version line also matches re_title_info, so must be checked before
            # Title has been added to text list
            version_line = re_version.match(line)
            if version_line:
                version, v_informant, v_place = version_line.groups()
                # if the only non-blank line collected so far is a single
                # line, it is the title rather than body text
                if len([e for e in text if e.strip()]) == 1:
                    title = text[0].strip()
                    text = []
                text.append(version)

                # accumulate informant/place of all versions, separated by '; '
                if informant is not None:
                    informant += '; {}: {}'.format(version, v_informant)
                else:
                    informant = '{}: {}'.format(version, v_informant)
                if place is not None:
                    place += '; {}: {}'.format(version, v_place)
                else:
                    place = '{}: {}'.format(version, v_place)
                continue

            # Informant information is only accepted once per text
            if informant is None:
                informant_line = re_info.match(line)
                title_info_line = re_title_info.match(line)
                if informant_line:
                    informant, place = informant_line.groups()
                    continue
                if title_info_line:
                    title, informant, place = title_info_line.groups()
                    continue

            # line does not match heading/title, but is all capitals/digits
            if re_ignore.match(line):
                ignored.append(line)
                continue

            # footnotes are replaced by newline(s), followed by
            # the footnote text preceded by a space.
            # delete footnote and preceding newlines
            # TODO store footnote and location somewhere?
            footnote = re_footnote.match(line)
            if footnote:
                ignored.append(footnote.group())
                line = re_footnote.sub('', line)
                while text and text[-1].strip() == '':
                    ignored.append(text.pop())

            if text or not empty:
                text.append(line)
            else:
                # empty line after title or informant lines
                ignored.append(line)

    # add last text, but only if the file contained any text at all
    if t_id is not None:
        texts.append(Text(t_id, dialect, title, informant, place,
                          split_verses(text)))

    return texts
    
def split_verses(text):
    """Split text into verses.
    
    Verses are marked by a verse number in parentheses.
    Returns a list of tuples: (verse_no, verse_text)
    """
    
    # strip empty lines and lines with only one character from end of text
    # (single characters are sometimes appended by abiword to end of file)
    while len(text[-1].strip()) < 2:
        ignored.append(text.pop())

    verses = []
    cur_verse = []
    verse_no = ''
    
    for string in text:
        for e in re_verse_no.split(string):
            if cur_verse and cur_verse[-1].startswith('Version'):
                if not e.strip():
                    ignored.append(e)
                    continue # ignore newlines after 'Version', it will be added
            if re_verse_no.match(e):
                if cur_verse and cur_verse[-1].startswith('Version'):
                    e = '{}:\n\n{}'.format(cur_verse.pop(), e)
                if verse_no and cur_verse:
                    verses.append((verse_no, ''.join(cur_verse)))
                    cur_verse = []
                verse_no = e
            elif e:
                cur_verse.append(e)
    
    # from last verse in text, strip trailing whitespace
    verses.append((verse_no, ''.join(cur_verse).rstrip()))
    
    return verses

In [2]:
# Parse both corpus files and combine the results into one list,
# Barwar texts first, followed by the Christian Urmi texts.
barwar_texts = get_texts('barwar.txt', 'Barwar')
urmi_texts = get_texts('urmi_c.txt', 'Urmi_C')
alltexts = [*barwar_texts, *urmi_texts]

In [3]:
# set Unicode range for combining characters (U+0300 .. U+036F)
combining_characters = range(0x300, 0x370)

# maps each character, including any combining characters that follow it,
# to the number of times it occurs in the verse texts
cnt = collections.Counter()

# character (plus combining characters) collected so far, not yet counted
cur_c = None
for t in alltexts:
    for v, verse in t.text:
        for c in verse:
            # combine combining characters with the preceding character;
            # a combining character with nothing before it starts its own
            # entry instead of raising a TypeError on `None += c`
            if ord(c) in combining_characters and cur_c is not None:
                cur_c += c
            else:
                if cur_c is not None:
                    cnt[cur_c] += 1
                cur_c = c

# the very last character is still pending after the loop: count it too
if cur_c is not None:
    cnt[cur_c] += 1

In [4]:
import numpy as np
import pandas as pd

# allow the full character table to be displayed without truncation
pd.set_option('display.max_rows', 300)

# one record per counted character, sorted by character; 'hex codes'
# lists the code point of every element of the (possibly combined) character
data = [
    {'character': char,
     'count': cnt[char],
     'hex codes': ' '.join(hex(ord(code_point)) for code_point in char)}
    for char in sorted(cnt)
]

df = pd.DataFrame(data)
df

Unnamed: 0,character,count,hex codes
0,\n,397,0xa
1,,7248,0x1e
2,̭,8,0x1e 0x32d
3,,91764,0x20
4,!,560,0x21
5,(,5,0x28
6,),5,0x29
7,+,11633,0x2b
8,",",7761,0x2c
9,-,18023,0x2d


In [5]:
# report every verse that contains a parenthesis or a square bracket
chars = '()[]'
wanted = set(chars)

for i, t in enumerate(alltexts):
    for v, verse in t.text:
        if wanted.isdisjoint(verse):
            continue
        print(t.dialect, i, t.id, t.title)
        print(v, verse)

Urmi_C 54 A3 Axiqar
(28) +ʾAxík̭ar lišánu bədvàk̭ələ.| +naràhat váyələ,| lišánu bədvàk̭ələ,| lélə +bašúrə hàmzəm.| (interruption)

Urmi_C 109 B2 Village Life
(18) (GK: bətvátə dàxiva?|) bətvàtə| sáma zòda| … ɟu-matvátə ʾə́tvalan bátə prìšə| clítəva +ʾal-nàšə| mújjurra másalan cmá ʾá-ʾiva +cásəb yán dolàtman.| ʾə́tvalan bátət tré +ṱlá tabák̭əzə b-cárpəc smùk̭ta| cárpəč k̭ə̀tta| muk̭ə̀tta| ʾə́tvalan bàtə| ʾína cullanaʾít sáma zódət bátət màta| ci-+ṱarsívalun mə́n … ci-+palṱíva +ʾal-váddar mən-màta| cimànə ʾə́tvalan.| ci-+k̭aṱṱíva cìmə,| ʾáxnan ci-tanàxlun xína,| ci-+k̭aṱṱíva cìmə| ʾḗn címə b-+ṱìna ci-mayyíva| mattíva +ʾàl +ʾuydálə| ʾátxa mask̭ívalə ɟùyda| bèta +ṱarsíva biyyé.| ʾíta xína +ʾulluyléda cúllə c-avíva k̭èsə,| pardùvvə,| +k̭aryàtə.| stùyna ci-mattíva.
Urmi_C 111 B4 Hunting
(6) [GK: ʾə́tva xzùyrə?] là,| ʾé +dána ʾàxnan| ɟá māt-díyyan lǝ̀tva.| xzúyrǝ c-azíva +ʾal-+ṱurànǝ.| ʾánnə náš

In [6]:
# show everything the parser discarded, except bare newlines
list(filter(lambda entry: entry != '\n', ignored))

[' The name Čuxo means ‘one who wears the woolen čuxa garment’.',
 '4\n',
 '3\n',
 '?\n',
 '1\n',
 '',
 '',
 ' In the original recording of the story the speaker used the word camra ‘animal droppings’ here, but subsequently corrected this to calla.',
 'HISTORY AND CULTURE\n',
 '\x0c\n',
 '\x0c\n',
 ' Mistake for šənnə +xarayə.',
 'PAGE  190\n',
 'PAGE  189\n']