Skip to content
Permalink
Browse files

Add function print_tamil_words to the UTF8 module;

Install the demos with open-tamil as Python scripts on the default install path;
This provides the following command-line programs:
tamilphonetic - convert EN input to Tamil text
tamilwordfilter - filter Tamil input only from all input text data
tamilurlfilter - filter Tamil text from the input website data
tamiltscii2utf8 - convert encoding from TSCII to UTF-8 for input file
tamilwordgrid - generate a crossword from Tamil input text and write to output.html file
tamilwordcount - like UNIX wc program but for Tamil
  • Loading branch information...
Muthu Annamalai
Muthu Annamalai committed Mar 14, 2018
1 parent a880fe6 commit 02810461bef216df56b10ebf09818b94dfc75574

This file was deleted.

Oops, something went wrong.
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!python
# -*- coding: utf-8 -*-
# (C) 2013 Muthiah Annamalai

@@ -1,13 +1,14 @@
#!/usr/bin/python
#!python
# -*- coding: utf-8 -*-
# (C) 2013 Muthiah Annamalai
# (C) 2013-2018 Muthiah Annamalai

from sys import argv, exit
import tamil
import sys
import codecs

def usage():
    """Return the one-line command-line usage string for this script.

    The script is installed under the name ``tamiltscii2utf8.py``, so the
    usage text refers to it by that name. Only one return statement is kept;
    a second one after it could never execute.
    """
    return u"tamiltscii2utf8.py <source-file> <destination-file> "

if __name__ == u"__main__":
if not argv[1:]:
@@ -17,13 +18,13 @@ def usage():
try:
source_file = argv[1]
destination_file = argv[2]
with open(source_file) as fileHandle:
with open(source_file,"rb") as fileHandle:
print("working on " + source_file + "\n")
output = tamil.tscii.convert_to_unicode( fileHandle.read() )
#print( output )
fi = open(destination_file,"w")
fi = codecs.open(destination_file,"w","UTF-8")
fi.write(output.encode('utf-8'))
fi.close()
print("TSCII to UTF8 conversion completed. Check the file " + destination_file)
except Exception as fileOrConvException:
print(u"tscii2utf8 error - file %s could not be processed due to - %s"%(source_file,str(fileOrConvException)))
print(u"tamiltscii2utf8 error - file %s could not be processed due to - %s"%(source_file,str(fileOrConvException)))
@@ -0,0 +1,36 @@
#!python
# -*- coding: utf-8 -*-
# (C) 2013-2018 Muthiah Annamalai
#
# This file is part of 'open-tamil' package tests
#

import tamil
import sys
from transliterate import *
from tamil.utf8 import print_tamil_words
# BeautifulSoup is a third-party dependency: prefer version 4 (bs4) and fall
# back to the legacy version 3 module when bs4 cannot be imported.
try:
    import bs4  # requires beautiful soup 4
except ImportError:
    # work with BS3
    try:
        import BeautifulSoup
    except ImportError:
        print("Module BeautifulSoup required for successful execution; please install module via pip.\n")
        sys.exit(-1)

    class bs4:
        """Stand-in that exposes the BS3 parser class as bs4.BeautifulSoup."""
        BeautifulSoup = BeautifulSoup.BeautifulSoup

# urlopen lives in urllib2 on Python 2 and in urllib.request on Python 3;
# try the Python 2 location first and fall back to the Python 3 one.
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
import operator

def url_tamil_text_filter( url ):
    """Fetch *url*, parse the response as HTML and print its Tamil words.

    The text of the page's ``<body>`` element is handed to
    print_tamil_words(), which does the printing.

    :param url: address of the page to download, passed to urlopen().
    :return: None.
    """
    response = urlopen(url)
    try:
        # The document is parsed while the connection is still open.
        tapage = bs4.BeautifulSoup(response)
    finally:
        # Release the network connection even when parsing fails.
        response.close()
    tatext = tapage.body.text
    print_tamil_words( tatext )

if __name__ == u"__main__":
    # Every command-line argument after the script name is treated as a URL.
    urls = sys.argv[1:]
    if not urls:
        print(u"Usage: tamilurlfilter.py <url-1> <url-2> ...\n")
    for page_url in urls:
        url_tamil_text_filter(page_url)
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!python
# -*- coding: utf-8 -*-
# MIT License
#
@@ -0,0 +1,19 @@
#!python
# -*- coding: utf-8 -*-
# (C) 2013-2018 Muthiah Annamalai
#
# This file is part of 'open-tamil' package tests
#
from __future__ import print_function
import sys
import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

if __name__ == u"__main__":
    # Every command-line argument after the script name is a file to filter.
    filenames = sys.argv[1:]
    if not filenames:
        print(u"Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for path in filenames:
        # Input files are decoded as UTF-8 text.
        with codecs.open(path, "r", "UTF-8") as handle:
            contents = handle.read()
            print_tamil_words(contents)
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!python
# -*- coding: utf-8 -*-
# (C) 2015 Muthiah Annamalai

@@ -17,6 +17,8 @@
package_dir={'solthiruthi': 'solthiruthi'},
package_data={'solthiruthi': ['data/*.txt']},
license='MIT',
scripts=['examples/tamilwordcount.py','examples/tamilwordfilter.py','examples/tamilurlfilter.py',
'examples/tamilphonetic.py','examples/tamiltscii2utf8.py','examples/tamilwordgrid.py'],
platforms='PC,Linux,Mac',
classifiers=['Natural Language :: Tamil',
'Programming Language :: Python :: 2.6',
@@ -11,6 +11,7 @@
from sys import version
from copy import copy
import re
import operator

# PYTHON3 is True when the interpreter's version string compares greater than '3'.
# NOTE(review): this is a lexical string comparison, so a version string whose
# major number has two digits (e.g. '10.0') would compare as less than '3' -
# confirm this is acceptable.
PYTHON3 = version > '3'
# Remove the imported name so it does not stay in the module namespace.
del version
@@ -607,6 +608,23 @@ def classify_letter(letter):
return 'digit'
raise ValueError("Unknown letter '%s' neither Tamil nor English or number"%letter)

def print_tamil_words( tatext, use_frequencies = False ):
    """Print each distinct Tamil word of *tatext*, ordered by occurrence count.

    :param tatext: text to scan; it is split with get_letters() and the Tamil
        words are then extracted with get_tamil_words().
    :param use_frequencies: when true, print ``<count> -> <word>`` lines;
        otherwise print only the word itself.
    :return: None; the words are printed, one per line.
    """
    taletters = get_letters(tatext)
    # tamil words only
    frequency = {}
    for word in get_tamil_words(taletters):
        frequency[word] = 1 + frequency.get(word,0)
    # Sort words by ascending number of occurrences (least frequent first).
    # dict.items() is used rather than iteritems() so that this also runs on
    # Python 3, where iteritems() does not exist.
    # NOTE(review): pass reverse=True to sorted() if most-frequent-first
    # output is what callers want.
    for word, count in sorted(frequency.items(), key=operator.itemgetter(1)):
        if use_frequencies:
            print(u"%d -> %s"%(count,word))
        else:
            print(u"%s"%word)

# Tamil Letters
# அ ஆ இ ஈ உ ஊ எ ஏ ஐ ஒ ஓ ஔ ஃ
# க் ச் ட் த் ப் ற் ஞ் ங் ண் ந் ம் ன் ய் ர் ல் வ் ழ் ள் ஜ் ஷ் ஸ் ஹ்

0 comments on commit 0281046

Please sign in to comment.
You can’t perform that action at this time.