Skip to content
This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Commit

Permalink
1) Add method to UTF8 module for classifying letters
Browse files Browse the repository at this point in the history
  • Loading branch information
Muthu Annamalai committed Dec 17, 2017
1 parent 11c7062 commit bc530ac
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 1 deletion.
30 changes: 29 additions & 1 deletion tamil/utf8.py
Expand Up @@ -533,7 +533,7 @@ def splitMeiUyir(uyirmei_char):

if uyirmei_char in mei_letters or uyirmei_char in uyir_letters or uyirmei_char in ayudha_letter:
return uyirmei_char

if uyirmei_char not in grantha_uyirmei_letters:
if not is_normalized( uyirmei_char ):
norm_char = unicode_normalize(uyirmei_char)
Expand Down Expand Up @@ -577,6 +577,34 @@ def joinMeiUyir(mei_char, uyir_char):
uyirmeiidx = meiidx*12 + uyiridx
return grantha_uyirmei_letters[uyirmeiidx]

def classify_letter(letter):
    """Return a short label describing what kind of letter `letter` is.

    The checks run in a fixed order and the first match wins:

    * ``'kuril'`` / ``'nedil'`` -- member of ``uyir_letters`` that is also in
      ``kuril_letters`` / ``nedil_letters``
    * ``'ayudham'`` -- equal to ``ayudha_letter``
    * ``'mellinam'`` / ``'vallinam'`` / ``'idayinam'`` -- member of
      ``mei_letters`` that is also in the matching sub-table
    * ``'uyirmei'`` -- member of ``uyirmei_letters``
    * ``'tamil_or_grantham'`` -- any other member of ``tamil_letters``
    * ``'english'`` -- anything else for which ``isalpha()`` is true
    * ``'digit'`` -- anything else for which ``isdigit()`` is true

    :param letter: a unicode string (``str`` on Python 3, ``unicode`` on
        Python 2) holding the letter to classify.
    :return: one of the labels listed above.
    :raises TypeError: if `letter` is not a unicode string.
    :raises ValueError: if `letter` matches none of the categories.
    """
    # ``unicode`` is only evaluated when PYTHON3 is false, so this is safe on
    # Python 3 where that name does not exist.
    text_type = str if PYTHON3 else unicode
    if not isinstance(letter, text_type):
        raise TypeError("Input'%s' must be unicode, not just string" % letter)

    if letter in uyir_letters:
        if letter in kuril_letters:
            return u'kuril'
        if letter in nedil_letters:
            return u'nedil'

    # Tested on its own rather than as a branch of the vowel check above:
    # nested there it could only fire if the aytham were also listed in
    # uyir_letters, otherwise it fell through to 'tamil_or_grantham'.
    # NOTE(review): assumes ayudha_letter is not in kuril/nedil_letters --
    # confirm against the table definitions.
    if letter == ayudha_letter:
        return 'ayudham'

    if letter in mei_letters:
        if letter in mellinam_letters:
            return 'mellinam'
        if letter in vallinam_letters:
            return 'vallinam'
        if letter in idayinam_letters:
            return 'idayinam'

    if letter in uyirmei_letters:
        return 'uyirmei'
    if letter in tamil_letters:
        return 'tamil_or_grantham'

    # Not in any Tamil table: fall back to the generic string predicates.
    if letter.isalpha():
        return 'english'
    if letter.isdigit():
        return 'digit'
    raise ValueError("Unknown letter '%s' neither Tamil nor English or number"%letter)

# Tamil Letters
# அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ ஃ
# க் ச் ட் த் ப் ற் ஞ் ங் ண் ந் ம் ன் ய் ர் ல் வ் ழ் ள் ஜ் ஷ் ஸ் ஹ்
Expand Down
12 changes: 12 additions & 0 deletions tests/letter_tests2.py
Expand Up @@ -23,5 +23,17 @@ def test_uyir_mei_split(self):
il,ee = utf8.splitMeiUyir(u"லி")
self.assertEqual((il,ee),(u"ல்",u"இ"))

def test_classifier(self):
    """classify_letter labels each letter of a mixed English/digit/Tamil string."""
    tamil_kinds = ['kuril', 'nedil', 'uyirmei', 'vallinam', 'uyirmei']
    # Three Latin letters, four digits, then the five Tamil letters.
    wanted = ['english'] * 3 + ['digit'] * 4 + tamil_kinds
    actual = [utf8.classify_letter(ch)
              for ch in utf8.get_letters(u"abc1230அஆரெட்டை")]
    self.assertEqual(actual, wanted)

def test_classified_except(self):
    """classify_letter raises ValueError for a punctuation character."""
    self.assertRaises(ValueError, utf8.classify_letter, u'.')

if __name__ == '__main__':
unittest.main()
2 changes: 2 additions & 0 deletions tests/python2_localtest.sh
@@ -0,0 +1,2 @@
#!/bin/bash -x
# Run `python` with the local open-tamil checkout on PYTHONPATH, forwarding
# all command-line arguments. "$@" is quoted so that arguments containing
# spaces or glob characters reach python unchanged.
PYTHONPATH=~/devel/open-tamil python "$@"
2 changes: 2 additions & 0 deletions tests/python3_localtest.sh
@@ -0,0 +1,2 @@
#!/bin/bash -x
# Run `python3` with the local open-tamil checkout on PYTHONPATH, forwarding
# all command-line arguments. "$@" is quoted so that arguments containing
# spaces or glob characters reach python3 unchanged.
PYTHONPATH=~/devel/open-tamil python3 "$@"

0 comments on commit bc530ac

Please sign in to comment.