# Punctuation 

Now we will work out the allowable punctuation characters.

In [2]:
import re
import json
import collections
import unicodedata
import pandas as pd
from pprint import pprint
from tf.app import use

# Let pandas display up to 200 rows before truncating a frame.
pd.options.display.max_rows = 200

# Load version 0.02 of the NENA Text-Fabric dataset from the local clone.
# hoist=globals() hands the notebook namespace to `use`; presumably this is
# what makes the `F` and `T` names used in later cells available -- confirm
# against the Text-Fabric documentation.
nena = use(
    'nena:clone',
    hoist=globals(),
    checkout='clone',
    version='0.02',
)

Using TF-app in /Users/cody/github/annotation/app-nena/code:
	repo clone offline under ~/github (local github)
Using data in /Users/cody/github/CambridgeSemiticsLab/nena_tf/tf/0.02:
	repo clone offline under ~/github (local github)
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API


In [3]:
def normalize_string(string):
    """Return `string` in Unicode NFD (decomposed) form, lowercased.

    Decomposition separates base characters from their combining marks;
    the decomposed text is then lowercased.
    """
    decomposed = unicodedata.normalize('NFD', string)
    return decomposed.lower()

def tokenize_string(string):
    """Split `string` into single characters with their combining marks.

    The input is first passed through `normalize_string`. Each returned
    token is one character followed by any run of characters in the
    U+0300-U+036F range that directly follows it.

    Note: the pattern's `.` does not match a newline (no DOTALL flag is
    given), so newline characters never appear in the result.
    """
    char_with_marks = '.[\u0300-\u036F]*'
    normalized = normalize_string(string)
    return [hit.group(0) for hit in re.finditer(char_with_marks, normalized)]

In [4]:
# Count every token whose first character is not one of the recognised
# letters; whatever remains is treated as a punctuation candidate.
letters = re.compile('[a-zðɟəɛʾʿθıɑɉʸ][\u0300-\u033d]*')
puncts = collections.Counter()

for dialect in F.otype.s('dialect'):
    puncts.update(
        token
        for token in tokenize_string(T.text(dialect))
        if not letters.match(token)
    )

# add new/undetected puncts
# (the tokenizer emits one character per token, so the three-dot
# sequence can never be counted by the loop above)
puncts['...'] = 0

len(puncts)

13

In [5]:
# Show the collected non-letter tokens ordered from most to least frequent.
puncts.most_common()

[(' ', 93795),
 ('ˈ', 35964),
 ('-', 25281),
 ('.', 15191),
 ('⁺', 11633),
 (',', 7761),
 ('?', 1686),
 ('=', 1100),
 ('!', 560),
 (':', 42),
 ('—', 27),
 (';', 1),
 ('...', 0)]

In [6]:
# Hand-written description of each punctuation mark.
#
# Each key serves as the default for both the 'regex' and the 'string'
# field of the resulting record; an entry may override either one by
# supplying its own 'regex' or 'string' value. Regex overrides are written
# as raw strings so that backslashes reach the regex engine unchanged and
# do not trigger Python's invalid-escape-sequence warning.
punct_data = {
    '⁺': {'class': 'phonetic', 'position': 'begin', 'modifies': 'stress group'},
    '(?<= )"': {'string': '"', 'class': 'separator', 'modifies': 'text', 'position': 'begin'},
    ' ': {'class': 'separator', 'modifies': 'word', 'position': 'end'},
    '-': {'class': 'connector', 'modifies': 'stress group', 'position': 'end'},
    '=': {'class': 'connector', 'modifies': 'stress group', 'position': 'end'},
    'ˈ': {'class': 'separator', 'modifies': 'intonation group', 'position': 'end'},
    ',': {'class': 'separator', 'modifies': 'subsentence', 'position': 'end'},
    '...': {'class': 'separator', 'modifies': 'subsentence', 'position': 'end', 'regex': r'\.\.\.'},
    ':': {'class': 'separator', 'modifies': 'subsentence', 'position': 'end'},
    '—': {'class': 'separator', 'modifies': 'subsentence', 'position': 'end'},
    ';': {'class': 'separator', 'modifies': 'subsentence', 'position': 'end'},
    # a single full stop only: the lookarounds reject a dot adjacent to another dot
    '.': {'class': 'separator', 'modifies': 'sentence', 'regex': r'(?<!\.)\.(?!\.)', 'position': 'end'},
    '?': {'class': 'separator', 'modifies': 'sentence', 'position': 'end', 'regex': r'\?'},
    '!': {'class': 'separator', 'modifies': 'sentence', 'position': 'end'},
    '(?<! )"': {'string': '"', 'class': 'separator', 'modifies': 'text', 'position': 'end'},
}

# Expand every entry into a full record, preserving the order of punct_data.
punctuation = []

for punct in punct_data:
    # 'regex' and 'string' come first and default to the key itself;
    # values from the entry override them without changing key order.
    data = {
        'regex': punct,
        'string': punct,
    }
    data.update(punct_data[punct])
    # Unicode code point of each character in the literal string
    data['codepoints'] = tuple(ord(c) for c in data['string'])
    punctuation.append(data)
        

In [7]:
# Pretty-print the records; sort_dicts=False keeps each record's keys in insertion order.
pprint(punctuation, sort_dicts=False)

[{'regex': '⁺',
  'string': '⁺',
  'class': 'phonetic',
  'position': 'begin',
  'modifies': 'stress group',
  'codepoints': (8314,)},
 {'regex': '(?<= )"',
  'string': '"',
  'class': 'separator',
  'modifies': 'text',
  'position': 'begin',
  'codepoints': (34,)},
 {'regex': ' ',
  'string': ' ',
  'class': 'separator',
  'modifies': 'word',
  'position': 'end',
  'codepoints': (32,)},
 {'regex': '-',
  'string': '-',
  'class': 'connector',
  'modifies': 'stress group',
  'position': 'end',
  'codepoints': (45,)},
 {'regex': '=',
  'string': '=',
  'class': 'connector',
  'modifies': 'stress group',
  'position': 'end',
  'codepoints': (61,)},
 {'regex': 'ˈ',
  'string': 'ˈ',
  'class': 'separator',
  'modifies': 'intonation group',
  'position': 'end',
  'codepoints': (712,)},
 {'regex': ',',
  'string': ',',
  'class': 'separator',
  'modifies': 'subsentence',
  'position': 'end',
  'codepoints': (44,)},
 {'regex': '\\.\\.\\.',
  'string': '...',
  'class': 'separator',
  'modifie

In [8]:
# Write the punctuation records to a JSON file one directory up.
# ensure_ascii=False emits the non-ASCII marks (e.g. 'ˈ', '⁺', '—') as
# literal characters, so the file encoding is pinned to UTF-8 rather than
# left to the platform's locale default.
with open('../punctuation.json', 'w', encoding='utf-8') as outfile:
    json.dump(punctuation, outfile, indent=4, ensure_ascii=False)