In [1]:
import pandas as pd
import numpy as np
import os, sys

sys.path.insert(0, '../../src/')

## Data Preprocessing

In [2]:
from banglanlp.text.WordProcessor import *

In [3]:
bnp = BanglaWordProcessor()
enp = EnglishWordProcessor()

In [4]:
df1 = pd.read_csv('../../data/transliteration/train.tsv', sep='\t', header=None)
df2 = pd.read_csv('../../data/transliteration/validation.tsv', sep='\t', header=None)
df3 = pd.read_csv('../../data/transliteration/dictionary_data/Dictionary_data_cleaned.tsv', sep='\t', header=None)

In [5]:
df1.shape, df2.shape, df3.shape

((3496, 2), (504, 2), (16209, 2))

In [6]:
columns = ['beng','eng']
df1.columns = columns
df2.columns = columns
df3.columns = columns

In [7]:
def clean(word, lang):
    if lang is 'en':
        return enp.process(word)
    else:
        return bnp.process(word)

In [8]:
df1['clean_beng'] = df1['beng'].apply(clean, lang='bn')
df1['clean_eng'] = df1['eng'].apply(clean, lang='en')
df2['clean_beng'] = df2['beng'].apply(clean, lang='bn')
df2['clean_eng'] = df2['eng'].apply(clean, lang='en')
df3['clean_beng'] = df3['beng'].apply(clean, lang='bn')
df3['clean_eng'] = df3['eng'].apply(clean, lang='en')

In [9]:
df1.drop_duplicates(inplace=True)
df2.drop_duplicates(inplace=True)
df3.drop_duplicates(inplace=True)

In [10]:
df1.dropna(inplace=True)
df2.dropna(inplace=True)
df3.dropna(inplace=True)

In [11]:
df1[df1['clean_beng'].isna() == True]
df1[df1['clean_eng'].isna() == True]
df2[df2['clean_beng'].isna() == True]
df2[df2['clean_eng'].isna() == True]
df3[df3['clean_beng'].isna() == True]
df3[df3['clean_eng'].isna() == True]

Unnamed: 0,beng,eng,clean_beng,clean_eng


In [12]:
df1.shape, df2.shape, df3.shape

((3464, 4), (504, 4), (11489, 4))

## Data Preparation For Task

In [13]:
df_train = pd.concat([df1[['clean_beng','clean_eng']], df3[['clean_beng','clean_eng']]])

In [14]:
df_train.shape

(14953, 2)

#### TODO : move everything to library

In [15]:
from abc import ABCMeta, abstractmethod
import re

In [16]:
vowels = 'aeiou'
semivowels = 'wy'

In [17]:
class Tokenizer():
    
    def __init__(self):
        pass
    
    @abstractmethod
    def tokenize(self, word):
        tokens = []
        for char in word:
            tokens.append(char)
        return tokens

class BanglaCharTokenizer(Tokenizer):
    
    def __init__(self):
        super().__init__()
    
    def tokenize(self, word):
        return super().tokenize(word)
    
class EnglishCharTokenizer(Tokenizer):
    
    def __init__(self):
        super().__init__()
    
    def tokenize(self, word):
        return super().tokenize(word)
    
class EnglishBanglaCharTokenizer(Tokenizer):
    
    def __init__(self):
        super().__init__()
    
    def tokenize(self, word):
        tokens = []
        current_token = ''
        for char in word:
            if char is 'h' and (current_token != '' and current_token[-1] not in vowels and current_token[-1] not in semivowels):
                tokens.append(current_token + 'h')
                current_token = ''
            else:
                if current_token != '':
                    tokens.append(current_token)
                current_token = char
                
        if current_token != '':
            tokens.append(current_token)
                
        return tokens

In [18]:
bntok = BanglaCharTokenizer()
entok = EnglishCharTokenizer()
ebtok = EnglishBanglaCharTokenizer()

In [25]:
class VocabularyBuilder():
    '''
    This class builds a vocabulary with the tokenizer provided
    '''
    
    def __init__(self, tokenizer):
        # Don't use 0 as it can be used as a default value in vector for length normalization
        self.tokenToIndex = {'<S>':1, '<E>': 2, '<U>' : 3}
        self.indexToToken = {1 : '<S>', 2 : '<E>', 3 : '<U>'}
        self.__index__  = 4
        
        if not isinstance(tokenizer, Tokenizer):
            raise AttributeError("input param tokenizer is not a Tokenizer instance")
        
        self.tokenizer = tokenizer

    def add_to_vocab(self, tokenlist):
        '''
        Adds a list of tokens into the vocabulary
        '''
        for token in tokenlist:
            if token not in self.tokenToIndex:
                self.tokenToIndex[token] = self.__index__ 
                self.indexToToken[self.__index__] = token
                self.__index__ += 1
    
    def build_vocab(self, series):
        '''
        Build a vocabulary with series as input
        '''
        
        if self.tokenizer is None:
            raise AttributeError("Tokenizer is not set.")
            
        for word in series.values:
            self.add_to_vocab(self.tokenizer.tokenize(word))
    
    def get_len(self):
        return self.__index__
    
    def text_to_vector(self, text):
        vec = [self.tokenToIndex['<S>']]
        tokens = self.tokenizer.tokenize(text)
        for token in tokens:
            if token in self.tokenToIndex:
                vec.append(self.tokenToIndex[token])
            else:
                vec.append(self.tokensToIndex['<U>'])
        vec.append(self.tokenToIndex['<E>'])
        return vec
    
    def vector_to_tokens(self, vector):
        tokens = []
        for elem in vector:
            tokens.append(self.indexToToken[elem])
        return tokens

In [26]:
bnvocab = VocabularyBuilder(bntok)
envocab = VocabularyBuilder(ebtok)

In [27]:
bnvocab.build_vocab(df_train['clean_beng'])
envocab.build_vocab(df_train['clean_eng'])

In [30]:
v = envocab.text_to_vector('bihan')
t = envocab.vector_to_tokens(v)
print(v,t)

[1, 4, 11, 37, 6, 10, 2] ['<S>', 'b', 'i', 'h', 'a', 'n', '<E>']


## train and test Vectors

In [61]:
import pandas as pd

class Vectorizer():
    
    def __init__(self, vocab_builder):
        self.vocab = vocab_builder
        self.max_len = 0
    
    def vectorize(self, data):
        
        if type(data) == str:
            return np.array(self.vocab.text_to_vector(data), dtype=np.int32)
        
        if type(data) == pd.core.series.Series:
            vecs = data.apply(self.vocab.text_to_vector)
            self.max_len = get_max_len(vecs)
            vec = np.zeros((data.shape[0],self.max_len), dtype=np.int32)
            print(vec.shape)
            for i in range(data.shape[0]):
                vec[i][0:len(vecs[i])] = vecs[i]
            
            return vec

    
# add to utils
def get_max_len(vectors):
    '''vectors is a np array/list of list'''
    max_len = 0
    for v in vectors:
        max_len = max(len(v), max_len)
    return max_len

In [62]:
vecs = np.array(df_train['clean_eng'].apply(envocab.text_to_vector))

In [63]:
max_len = get_max_len(vecs)
max_len

27

In [64]:
vecs

array([list([1, 4, 5, 6, 7, 8, 9, 8, 10, 4, 11, 12, 11, 2]),
       list([1, 13, 14, 6, 10, 15, 16, 6, 2]),
       list([1, 13, 9, 11, 17, 11, 14, 2]), ...,
       list([1, 37, 5, 6, 18, 8, 11, 10, 6, 2]),
       list([1, 37, 9, 6, 13, 6, 7, 6, 9, 6, 10, 6, 2]),
       list([1, 37, 9, 16, 13, 6, 12, 4, 6, 10, 11, 2])], dtype=object)

In [65]:
data = np.zeros((df_train['clean_eng'].shape[0],max_len), dtype=np.int32)

In [66]:
data[0][0:len(vecs[0])] = vecs[0]

In [67]:
data[0]

array([ 1,  4,  5,  6,  7,  8,  9,  8, 10,  4, 11, 12, 11,  2,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [68]:
vr = Vectorizer(envocab)

In [69]:
vr.vectorize(df_train['clean_eng'])

(14953, 27)


ValueError: setting an array element with a sequence.