In [3]:
# Referenced from https://pandey.github.io/posts/tokenize-indic-syllables-python.html

# suggested

# vowels = '\u0904-\u0914\u0960-\u0961\u0972-\u0977'
# consonants = '\u0915-\u0939\u0958-\u095F\u0978-\u097C\u097E-\u097F' 
# glottal = '\u097D' 
# vowel_signs = '\u093E-\u094C\u093A-\u093B\u094E-\u094F\u0955-\u0957\u1CF8-\u1CF9' 
# nasals = '\u0900-\u0902\u1CF2-\u1CF6' 
# visarga = '\u0903' 
# nukta = '\u093C' 
# avagraha = '\u093D' 
# virama = '\u094D' 
# vedic_signs = '\u0951-\u0952\u1CD0-\u1CE1\u1CED' 
# visarga_modifiers = '\u1CE2-\u1CE8' 
# combining = '\uA8E0-\uA8F1' 
# om = '\u0950' 
# accents = '\u0953-\u0954' 
# dandas = '\u0964-\u0965' 
# digits = ['\u0966-\u096F'] 
# abbreviation = '\u0970' 
# spacing = '\u0971' 
# vedic_nasals = '\uA8F2-\uA8F7\u1CE9-\u1CEC\u1CEE-\u1CF1' 
# fillers = '\uA8F8-\uA8F9' 
# caret = '\uA8FA' 
# headstroke = '\uA8FB' 
# space = '\u0020' 
# joiners = '\u200C-\u200D'

![devanagari Table](DevanagariTable.png)

In [4]:
import re

In [5]:
len('देवनागरी')

8

In [6]:
list('देवनागरी')
# but we want it to read as
# ['दे', 'व', 'ना', 'ग', 'री']

['द', 'े', 'व', 'न', 'ा', 'ग', 'र', 'ी']

In [7]:
vowels = '\u0904-\u0914\u0972-\u0977'
consonants = '\u0915-\u0939\u0958-\u095F' 
matras = '\u093E-\u094C' 
nasals = '\u0900-\u0902' 
visarga = '\u0903'
nukta = '\u093C'
virama = '\u094D'
om = '\u0950'
dandas = '\u0964-\u0965' 
numbers = '\u0966-\u096F' 
space = '\u0020' 
joiners = '\u200C-\u200D'

In [9]:
def get_words(text):
    pass

def get_syllables(word):
    # 
    syllables = []
    # buffer 
    buffer = ""
    
    for char in word:
        print(char, buffer)
        # match any words that either begin with vowels or om
        if re.match('[' + vowels + om + ']', char): 
            # if the buffer string is empty, add the string, otherwise the first character
            # is complete so add it to the syllables list
            if buffer != "":
                syllables.append(char)
            else:
                buffer = char
        # match any words that begin with a consonant
        elif re.match('[' + consonants + ']', char): 
            # if buffer exists and last character is not virama (्), append buffer string as syllable
            # otherwise add present consonant to buffer string 
            if len(buffer) > 0 and buffer[-1] != virama: 
                syllables.append(buffer) 
                buffer = char 
            else: 
                buffer = buffer + char 
        elif re.match('[' + nukta + ']', char): 
            buffer = buffer + char 
        elif re.match('[' + virama + ']', char): 
            buffer = buffer + char 
        elif re.match('[' + joiners + ']', char): 
            buffer = buffer + char 
        else:
            pass
        if buffer != '': 
            syllables.append(buffer) 
            buffer = '' 
    return syllables
        
get_syllables('हलन्त')

ह 
ल 
न 
् 
त 


['ह', 'ल', 'न', '्', 'त']

In [10]:
for char in "हलन्त":
    print(char)

ह
ल
न
्
त


In [11]:
def syllabify(inputtext): 
    syllables = [] 
    curr = '' 
    # iterate over each character in the input. if a char belongs to a 
    # class that can be part of a syllable, then add it to the curr 
    # buffer. otherwise, output it to syllables[] right away. 
    for char in inputtext: 
        if re.match('[' + vowels + om + ']', char): 
            # need to handle non-initial independent vowel letters, 
            # avagraha, and om 
            if curr != '': 
                syllables.append(curr) 
                curr = char 
            else: 
                curr = curr + char 
        elif re.match('[' + consonants + ']', char): 
            # if last in curr is not virama, output curr as syllable 
            # # else add present consonant to curr 
            if len(curr) > 0 and curr[-1] != virama: 
                syllables.append(curr) 
                curr = char 
            else: 
                curr = curr + char 
        elif re.match('[' + vowel_signs + visarga + vedic_signs + ']', char): 
            curr = curr + char
        elif re.match('[' + visarga_modifiers + ']', char): 
            if len(curr) > 0 and curr[-1] == visarga: 
                curr = curr + char 
                syllables.append(curr) 
                curr = '' 
            else: 
                syllables.append(curr) 
                curr = '' 
        elif re.match('[' + nasals + vedic_nasals + ']', char): 
            # if last in curr is a vowel sign, output curr as syllable 
            # else add present vowel modifier to curr and output as syllable 
            vowelsign = re.match('[' + vowel_signs + ']$', curr) 
            if vowelsign: 
                syllables.append(curr) 
                curr = '' 
            else: 
                curr = curr + char 
                syllables.append(curr) 
                curr = '' 
        elif re.match('[' + nukta + ']', char): 
            curr = curr + char 
        elif re.match('[' + virama + ']', char): 
            curr = curr + char 
        elif re.match('[' + digits + ']', char): 
            curr = curr + char 
        elif re.match('[' + fillers + headstroke + ']', char): 
            syllables.append(char) 
        elif re.match('[' + joiners + ']', char): 
            curr = curr + char 
        else: 
            pass 
        #print ("unhandled: " + char + " ", char.encode('unicode_escape')) 
        # handle remaining curr 
        if curr != '': 
            syllables.append(curr) 
            curr = '' 
        # return each syllable as item in a list 
        return syllables
syllabify('हलन्त')

['ह']