In [5]:
import re
import unicodedata
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from queue import Queue

In [118]:
class Preprocess:
        
        # --------------------------------------- Constructor --------------------------------------- 
        
        def __init__(self,stopword_list):
            self.data_path = ''
            self.stopword_list = stopword_list
                

        # --------------------------------------- Preprocess --------------------------------------- 
        
        def expand_concatenations(self, word):
            
            
            if not re.match('[a-zA-Z]+', word) or re.match('/d+',word):
                for i in range(len(word)):
                    if not('DEVANAGARI ' in unicodedata.name(word[i])):
                        word = word[:i] if( len(word[i:]) < 2 and not word[i:].isnumeric()) else word[:i] + " " + word[i:]
                        break
            else:
                for i in range(len(word)):
                    if ('DEVANAGARI ' in unicodedata.name(word[i])):
                        word = word[i:] if( len(word[:i]) < 2 and not word[:i].isnumeric() ) else word[:i] + " " + word[i:]
                        break

            return(word)
    
        
        def clean_text(self,text: str) -> str:
            try:
                special_chars = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
                stemmer = PorterStemmer()
                lemmatizer = WordNetLemmatizer()

                if not(isinstance(text, str)): text = str(text)

                #Removing unprintable characters
                text = ''.join(x for x in text if x.isprintable())

                # Cleaning the urls
                text = re.sub(r'https?://\S+|www\.\S+', '', text)

                # Cleaning the html elements
                text = re.sub(r'<.*?>', '', text)

                # Removing the punctuations
                text = re.sub('[!#?,.:";-@#$%^&*_~<>()-]', '', text)


                # Removing stop words
                text = ' '.join([word for word in text.split() if word not in self.stopword_list])

                # Expanding noisy concatenations (Eg: algorithmआणि  -> algorithm आणि ) 
                text = ' '.join([self.expand_concatenations(word) for word in text.split()])

#                 preprocessed_text = ""

#                 for word in text.split(): 
#                     if (re.match('\d+', word)):
#                         if(word.isnumeric()):
#                             preprocessed_text = preprocessed_text + '#N' + " "
#                         else:
#                             preprocessed_text = preprocessed_text + word.lower() + " "

#                     else:
#                         if(re.match('[a-zA-Z]+', word)):
#                             if not len(word) < 2:
#                                 word = word.lower()
#     #                             word = lemmatizer.lemmatize(word, pos='v')
#                                 preprocessed_text = preprocessed_text + word + " "

#                         else:
#                             preprocessed_text = preprocessed_text + word + " "

#                 return preprocessed_text
                return text
            
            except ValueError as ve:
                print('Error processing:\t',text)
                return ''
    
        def preprocess_text(self,text: str) -> str:

            try:
                if not(isinstance(text, str)): text = str(text)
                preprocessed_text = ""

                for word in text.split(): 
                    if (re.match('\d+', word)):
                        if(word.isnumeric()):
                            preprocessed_text = preprocessed_text + '#N' + " "
                        else:
                            preprocessed_text = preprocessed_text + word.lower() + " "

                    else:
                        if(re.match('[a-zA-Z]+', word)):
                            if not len(word) < 2:
                                word = word.lower()
    #                             word = lemmatizer.lemmatize(word, pos='v')
                                preprocessed_text = preprocessed_text + word + " "

                        else:
                            preprocessed_text = preprocessed_text + word + " "

                return preprocessed_text

            except ValueError as ve:
                print('Error processing:\t',text)
                return ''
            
        def split_devanagri_word(self,word: str, punctuations = True) -> str:
            try:
                q = Queue()
                if not(isinstance(word, str)): word = str(word)
                tokens = []
                
                for char in word:
#                     print(char, '--->', unicodedata.name(char))

                    if 'letter' in unicodedata.name(char).lower():
                        if q.empty():
                            tokens.append(char)
                        else:
                            while not q.empty():
                                tokens[len(tokens)-1] += q.get() 
                            tokens.append(char)   
                    else:
                        if punctuations == True:
                            q.put(char)

                while not q.empty():
                    tokens[len(tokens)-1] += q.get() 
                
                return tokens
                
            except ValueError as ve:
                print('Error processing:\t',text)
                return ''
        
        def text2characters(self,text:str, punctuations = True)->str:
            try:
                if not(isinstance(text, str)): text = str(text)
                char_sequence = ""
                char_list = []
                
                for word in text.split():
                    seq = ' '.join([char for char in self.split_devanagri_word(word, punctuations)])                
                    char_sequence = char_sequence + seq + ' '
    
#                     print(word,'--->',seq)
                    
                return char_sequence
            
            except ValueError as ve:
                print('Error processing:\t',text)
                return ''
            
            
        def tokenize_characters(self, document):
            vocab = set()
            cnt = 0
            token_dict = {}
            
            if isinstance(document, list):
                for text in document:
                    char_sequence = self.text2characters(text)
                    tokens_indic = pd.Series(trivial_tokenize_indic(char_sequence))
                    word_counts = tokens_indic.value_counts()
                    
                    vocab = vocab.union(set(word_counts.keys()))

                print('Total Unique Tokens (Characters): {}'.format(len(vocab)))

                for char in vocab:
                    cnt += 1
                    token_dict[char] = cnt
            
            else:
                char_sequence = self.text2characters(document)
                tokens_indic = pd.Series(trivial_tokenize_indic(char_sequence))
                word_counts = tokens_indic.value_counts()  
                vocab = vocab.union(set(word_counts.keys()))

                print('Total Unique Tokens (Characters): {}'.format(len(vocab)))

                for char in vocab:
                    cnt += 1
                    token_dict[char] = cnt
                
            return token_dict

        
        def text_to_sequence(self,document,token_dict):
            
            sequence_doc = []
            
            if isinstance(document, list):
                print('Total records: ',len(document))
                cnt = 0
                for text in document:
                    char_array = self.text2characters(text).split()
                    text_sequence = [token_dict[x] for x in char_array]
                    sequence_doc.append(text_sequence)
                    cnt+=1
                print('Records converted: ',cnt)
                
            else:
                char_array = self.text2characters(document).split()
                text_sequence = [token_dict[x] for x in char_array]
                sequence_doc.append(text_sequence)
                print('Records converted: 1')
                
            return sequence_doc
            

In [80]:
if __name__ == '__main__':
    
    import pandas as pd
    df = pd.read_csv('../Technodifacation/Data/training_data_marathi.csv')
    
    sampletext1 = df['text'].sample().values
    print(sampletext1)
    pp = Preprocess([])
    sampletext2 = 'त्यांना जनतेला पटवून द्यावे लागेल99'

    test_list1 = ['त्यांना','H20', '2H20','Animal2Animal' ,'सी२ओ२', 'लागेल99', 'Animalत्यांना',
                 'त्यांनाAnimal', 'Analogy_त्यांना', 'Science२१', '१२Number', '!@)$&%!#)&$!&$!$B Bo ', '११.२२','I', '१','1','11.22','a','B','सी']

    test_list2 = ['त्यांना CO2 2H20 सीओ२ लागेल99 , Animalत्यांना त्यांनाAnimal Analogy_त्यांना Science२१ १२Number',
                 '!@)$&%!#)&$!&$!$I am Atharva ११.२२ Kulkarni 11.22 a B 1 सी']

    for text in test_list2:
        print(text, '\t--->\t', pp.clean_text(text),'\n')   

['तर , इंटरप्ट डिस्क्रिप्टर टेबलची सामग्री काय आहे ?']
त्यांना CO2 2H20 सीओ२ लागेल99 , Animalत्यांना त्यांनाAnimal Analogy_त्यांना Science२१ १२Number 	--->	 त्यांना CO2  2H20 सीओ२ लागेल 99 Animal त्यांना त्यांना Animal Analogy त्यांना Science २१ १२ Number 

!@)$&%!#)&$!&$!$I am Atharva ११.२२ Kulkarni 11.22 a B 1 सी 	--->	 I am Atharva ११२२ Kulkarni  1122 a B  1 सी 



In [119]:
pp = Preprocess([])

In [95]:
sample_word =  "हिरड्यांच्या"
tokens = pp.split_devanagri_word(sample_word, punctuations=True)
tokens

['हि', 'र', 'ड्', 'यां', 'च्', 'या']

In [104]:
text = 'पहिला,  स्तंभ आपल्याला अंदाज देतो.'

clean_text = pp.clean_text(text)

char_sequence_1 = pp.text2characters(clean_text)
char_sequence_2 = pp.text2characters(clean_text, punctuations=False)
print('\nText: ',clean_text,'\n\nWith Punctuations: ',char_sequence_1,'\n\nOnly Letters: ',char_sequence_2)

पहिला ---> प हि ला
स्तंभ ---> स् तं भ
आपल्याला ---> आ प ल् या ला
अंदाज ---> अं दा ज
देतो ---> दे तो
पहिला ---> प ह ल
स्तंभ ---> स त भ
आपल्याला ---> आ प ल य ल
अंदाज ---> अ द ज
देतो ---> द त

Text:  पहिला स्तंभ आपल्याला अंदाज देतो 

With Punctuations:  प हि ला स् तं भ आ प ल् या ला अं दा ज दे तो  

Only Letters:  प ह ल स त भ आ प ल य ल अ द ज द त 


<h4>Idiotic Keras<h4>

In [117]:
import tensorflow as tf

tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = False, split = " ")

tokenizer.fit_on_texts(char_sequence_1)

print(tokenizer.word_counts)

OrderedDict([('प', 2), ('ह', 1), ('ि', 1), ('ल', 3), ('ा', 4), ('स', 1), ('्', 2), ('त', 2), ('ं', 2), ('भ', 1), ('आ', 1), ('य', 1), ('अ', 1), ('द', 2), ('ज', 1), ('े', 1), ('ो', 1)])


<h4>Max Jugaad<h4>

In [91]:
from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic

tokens_indic = trivial_tokenize_indic(char_sequence_1)

tokens_indic = pd.Series(tokens_indic)

word_counts = tokens_indic.value_counts()
print(word_counts)

प     2
ला    2
अं    1
भ     1
तो    1
दे    1
दा    1
स्    1
ज     1
तं    1
या    1
ल्    1
हि    1
आ     1
dtype: int64


<h4>Converting Devanagri Text to Char array<h4>

In [123]:
token_dict = pp.tokenize_characters(clean_text)
token_dict

Total Unique Tokens (Characters): 14


{'ला': 1,
 'स्': 2,
 'हि': 3,
 'ल्': 4,
 'या': 5,
 'तं': 6,
 'ज': 7,
 'दा': 8,
 'दे': 9,
 'आ': 10,
 'तो': 11,
 'भ': 12,
 'अं': 13,
 'प': 14}

In [122]:
text_seq = pp.text_to_sequence(clean_text, token_dict)
print(len(text_seq))
text_seq

Records converted: 1
1


[[14, 3, 1, 2, 6, 12, 10, 14, 4, 5, 1, 13, 8, 7, 9, 11]]