In [3]:
import pandas as pd
import numpy as np
import os, sys

# Put the sibling `src` directory first on the module search path; the
# `banglanlp` imports in the following cells are presumably resolved from
# there. The path is relative, so it depends on the kernel's working directory.
sys.path.insert(0, '../src/')

## Data Preprocessing

In [4]:
from banglanlp.text.WordProcessor import *
from banglanlp.text.Tokenizer import *
from banglanlp.text.VocabularyBuilder import *

In [5]:
# One word-level processor per language; `clean` picks between the two.
bnp, enp = BanglaWordProcessor(), EnglishWordProcessor()

In [7]:
# Read the three tab-separated, header-less files (train split, validation
# split, cleaned dictionary data) with identical parser options.
df1, df2, df3 = (
    pd.read_csv(path, sep='\t', header=None)
    for path in (
        '../data/transliteration/train.tsv',
        '../data/transliteration/validation.tsv',
        '../data/transliteration/dictionary_data/Dictionary_data_cleaned.tsv',
    )
)

In [8]:
# (rows, columns) of each frame straight after loading.
tuple(frame.shape for frame in (df1, df2, df3))

((3496, 2), (504, 2), (16209, 2))

In [9]:
# The files were read without a header row, so give the two columns of every
# frame the same explicit names.
columns = ['beng', 'eng']
df1.columns = df2.columns = df3.columns = columns

In [10]:
def clean(word, lang):
    """Process a single word with the processor that matches its language.

    Parameters
    ----------
    word
        The raw value to process (one cell of a word column).
    lang
        Language code. ``'en'`` selects the English processor ``enp``; any
        other value falls back to the Bangla processor ``bnp``.

    Returns
    -------
    Whatever the selected processor's ``process`` method returns for ``word``.
    """
    # Compare by value, not identity: `is` only happens to work for interned
    # string literals and silently picks the Bangla branch for an equal string
    # built at runtime (it also raises a SyntaxWarning on Python 3.8+).
    processor = enp if lang == 'en' else bnp
    return processor.process(word)

In [11]:
# Add processed copies of both word columns to every frame, routing each
# column through `clean` with its language code.
for frame in (df1, df2, df3):
    frame['clean_beng'] = frame['beng'].apply(clean, lang='bn')
    frame['clean_eng'] = frame['eng'].apply(clean, lang='en')

In [12]:
# Remove fully duplicated rows from each frame, modifying it in place.
for frame in (df1, df2, df3):
    frame.drop_duplicates(inplace=True)

In [13]:
# Discard every row that has a missing value in any column, in place.
for frame in (df1, df2, df3):
    frame.dropna(inplace=True)

In [14]:
# Look for rows whose processed text is still missing. Only the value of the
# last expression in a cell is rendered, so just the final lookup is shown.
df1[df1['clean_beng'].isna()]
df1[df1['clean_eng'].isna()]
df2[df2['clean_beng'].isna()]
df2[df2['clean_eng'].isna()]
df3[df3['clean_beng'].isna()]
df3[df3['clean_eng'].isna()]

Unnamed: 0,beng,eng,clean_beng,clean_eng


In [15]:
# (rows, columns) of each frame after de-duplication and NaN removal.
tuple(frame.shape for frame in (df1, df2, df3))

((3464, 4), (504, 4), (11489, 4))

## Data Preparation For Task

In [16]:
# Training data = processed word pairs from the train split stacked on top of
# those from the dictionary data (the validation frame is left out here).
df_train = pd.concat(
    [frame[['clean_beng', 'clean_eng']] for frame in (df1, df3)]
)

In [17]:
# Renumber the rows 0..n-1 after the concat. `drop=True` discards the old row
# labels instead of keeping them as a stray `index` column; those labels repeat
# across the two source frames and carry no meaning for training.
df_train = df_train.reset_index(drop=True)

In [18]:
# Character-level tokenizers: Bangla-only, English-only, and the combined
# English/Bangla variant.
bntok, entok, ebtok = (
    BanglaCharTokenizer(),
    EnglishCharTokenizer(),
    EnglishBanglaCharTokenizer(),
)

In [19]:
# One vocabulary builder per side of the word pair.
# NOTE(review): the English-side vocabulary uses the combined tokenizer
# `ebtok`, not `entok` (which is never used in the visible cells) - confirm
# this is intentional.
bnvocab, envocab = VocabularyBuilder(bntok), VocabularyBuilder(ebtok)

In [20]:
# Populate each vocabulary from its own processed training column.
bnvocab.build_vocab(df_train.loc[:, 'clean_beng'])
envocab.build_vocab(df_train.loc[:, 'clean_eng'])

In [21]:
# Round-trip sanity check on the English-side vocabulary: encode one word to
# integer ids, then decode the ids back to tokens. In the recorded output the
# characters are wrapped in '<S>' (id 1) and '<E>' (id 2) markers.
v = envocab.text_to_vector('bihan')
t = envocab.vector_to_tokens(v)
print(v,t)

[1, 4, 11, 37, 6, 10, 2] ['<S>', 'b', 'i', 'h', 'a', 'n', '<E>']


## Train and Test Vectors

In [23]:
from banglanlp.text.Vectorizer import *

In [24]:
# Vectorizer backed by the English-side vocabulary built above.
vr = Vectorizer(envocab)

In [25]:
# Fit the vectorizer on the processed English training words and display the
# result. The recorded output is a 2-D int32 array whose rows start with id 1
# and end in zeros (presumably padding to a common length - TODO confirm).
vr.fit_transform(df_train['clean_eng'])

array([[ 1,  4,  5, ...,  0,  0,  0],
       [ 1, 13, 14, ...,  0,  0,  0],
       [ 1, 13,  9, ...,  0,  0,  0],
       ...,
       [ 1, 37,  5, ...,  0,  0,  0],
       [ 1, 37,  9, ...,  0,  0,  0],
       [ 1, 37,  9, ...,  0,  0,  0]], dtype=int32)

In [27]:
# Encode a single word with the fitted vectorizer. The recorded output is a
# 1-D int32 array that starts with 1 and ends with 2, the same ids the
# vocabulary demo above shows for '<S>' and '<E>'.
vr.transform('ami')

array([ 1,  6, 28, 11,  2], dtype=int32)