In [1]:
import os, sys
import re
import string

import numpy as np
import pandas as pd
sys.path.insert(0, '../')

In [2]:
from banglanlp.text.WordProcessor import *

In [3]:
# Load the tab-separated dictionary dump; header=None because the file has no
# header row, so the columns are numbered until they are renamed.
df = pd.read_csv(
    '../../data/transliteration/dictionary_data/dictionary_extracted_processed.tsv',
    sep='\t',
    header=None,
)

In [4]:
# (rows, columns) of the table exactly as loaded, before any clean-up.
df.shape

(15657, 2)

In [5]:
# Remove rows that are exact duplicates of an earlier row (all columns compared),
# modifying `df` in place. The index labels of the surviving rows are kept as
# they were, so they are no longer guaranteed to be contiguous 0..n-1.
df.drop_duplicates(inplace=True)
# Shape after de-duplication, for comparison with the raw shape.
df.shape

(12614, 2)

In [6]:
# Give the two unnamed columns readable names: 'ben' holds the Bengali text and
# 'eng' its Latin-script counterpart.
df.columns = ['ben','eng']
# Preview only the first rows; rendering the whole frame adds nothing here.
df.head(10)

Unnamed: 0,ben,eng
0,' ই.এম.,' I.ema.
1,", পাশে",", Pashe"
2,- আছে হয়েছে,- che hayeche
3,- খোলা শেষ,- Khola shesa
4,- গভীর উপবিষ্ট,- Gabhira upabisa
5,- চিম্টি আঘাত,- Cimi aghata
6,- নরম সেদ্ধ,- Narama sed'dha
7,- নিথর শুকনো,- Nithara shukano
8,BRIC একটি ব্র্যাক,BRIC ekai bryaka
9,NFL এবং,NFL ebang


In [10]:
# Tokenise both text columns on single spaces; every cell of the new columns
# holds the list of space-separated pieces of the corresponding text.
for token_col, text_col in (('ben_split', 'ben'), ('en_split', 'eng')):
    df[token_col] = df[text_col].str.split(' ')

In [11]:
# Preview the first rows with the new token columns instead of the whole frame.
df.head(10)

Unnamed: 0,ben,eng,ben_split,en_split
0,' ই.এম.,' I.ema.,"[', ই.এম.]","[', I.ema.]"
1,", পাশে",", Pashe","[,, পাশে]","[,, Pashe]"
2,- আছে হয়েছে,- che hayeche,"[-, আছে, হয়েছে]","[-, che, hayeche]"
3,- খোলা শেষ,- Khola shesa,"[-, খোলা, শেষ]","[-, Khola, shesa]"
4,- গভীর উপবিষ্ট,- Gabhira upabisa,"[-, গভীর, উপবিষ্ট]","[-, Gabhira, upabisa]"
5,- চিম্টি আঘাত,- Cimi aghata,"[-, চিম্টি, আঘাত]","[-, Cimi, aghata]"
6,- নরম সেদ্ধ,- Narama sed'dha,"[-, নরম, সেদ্ধ]","[-, Narama, sed'dha]"
7,- নিথর শুকনো,- Nithara shukano,"[-, নিথর, শুকনো]","[-, Nithara, shukano]"
8,BRIC একটি ব্র্যাক,BRIC ekai bryaka,"[BRIC, একটি, ব্র্যাক]","[BRIC, ekai, bryaka]"
9,NFL এবং,NFL ebang,"[NFL, এবং]","[NFL, ebang]"


In [12]:
# Print the positional row number of every row whose Bengali and English token
# lists differ in length. Each row is compared with itself (the previous
# version compared row 0 on every iteration, so it could never report anything).
# Iterating over the values also avoids label-based lookups, which fail once
# rows have been dropped and the index labels are no longer 0..n-1.
for i, (ben_tokens, en_tokens) in enumerate(zip(df['ben_split'], df['en_split'])):
    # A missing text value stays missing after .str.split and has no length;
    # such rows are not token-count mismatches, so skip them here.
    if not (isinstance(ben_tokens, list) and isinstance(en_tokens, list)):
        continue
    if len(ben_tokens) != len(en_tokens):
        print(i)

In [20]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)
# Shape after removing incomplete rows.
df.shape

(12613, 5)

In [13]:
# One word processor per language, built with default settings; both are used
# as globals by the cleaning step (Bengali first, then English).
bnp, enp = BanglaWordProcessor(), EnglishWordProcessor()

In [15]:
# Append the ASCII punctuation characters to the Bengali processor's
# punctuation string.
# NOTE(review): `+=` makes this cell non-idempotent: every re-run appends the
# same characters again. Presumably harmless if `puncstr` is only used as a
# set of characters to strip; confirm in BanglaWordProcessor.
bnp.puncstr += string.punctuation

In [21]:
def clean(words, lang):
    """Process each word with the processor for `lang` and drop empty results.

    Parameters
    ----------
    words : iterable of str
        Tokens to process, in order.
    lang : str
        'en' routes every word through the global `enp`; any other value
        routes it through the global `bnp`.

    Returns
    -------
    list
        The processed words, in their original order, without the ones whose
        processed form is the empty string.
    """
    def _process(word):
        # Look the processor up per word so nothing is touched for empty input.
        processor = enp if lang == 'en' else bnp
        return processor.process(word)

    processed = (_process(word) for word in words)
    return [token for token in processed if token != '']
    

# Clean every token list with the processor of its language and store the
# result next to the raw tokens.
for clean_col, token_col, lang_code in (
    ('ben_split_clean', 'ben_split', 'bn'),
    ('en_split_clean', 'en_split', 'en'),
):
    df[clean_col] = df[token_col].apply(clean, lang=lang_code)

In [22]:
# Preview the first rows with the cleaned token columns instead of the whole frame.
df.head(10)

Unnamed: 0,ben,eng,ben_split,en_split,ben_split_clean,en_split_clean
0,' ই.এম.,' I.ema.,"[', ই.এম.]","[', I.ema.]",[ইএম],[iema]
1,", পাশে",", Pashe","[,, পাশে]","[,, Pashe]",[পাশে],[pashe]
2,- আছে হয়েছে,- che hayeche,"[-, আছে, হয়েছে]","[-, che, hayeche]","[আছে, হয়েছে]","[che, hayeche]"
3,- খোলা শেষ,- Khola shesa,"[-, খোলা, শেষ]","[-, Khola, shesa]","[খোলা, শেষ]","[khola, shesa]"
4,- গভীর উপবিষ্ট,- Gabhira upabisa,"[-, গভীর, উপবিষ্ট]","[-, Gabhira, upabisa]","[গভীর, উপবিষ্ট]","[gabhira, upabisa]"
5,- চিম্টি আঘাত,- Cimi aghata,"[-, চিম্টি, আঘাত]","[-, Cimi, aghata]","[চিম্টি, আঘাত]","[cimi, aghata]"
6,- নরম সেদ্ধ,- Narama sed'dha,"[-, নরম, সেদ্ধ]","[-, Narama, sed'dha]","[নরম, সেদ্ধ]","[narama, seddha]"
7,- নিথর শুকনো,- Nithara shukano,"[-, নিথর, শুকনো]","[-, Nithara, shukano]","[নিথর, শুকনো]","[nithara, shukano]"
8,BRIC একটি ব্র্যাক,BRIC ekai bryaka,"[BRIC, একটি, ব্র্যাক]","[BRIC, ekai, bryaka]","[BRIC, একটি, ব্র্যাক]","[BRIC, ekai, bryaka]"
9,NFL এবং,NFL ebang,"[NFL, এবং]","[NFL, ebang]","[NFL, এবং]","[NFL, ebang]"


In [24]:
# Print the positional row number of every row whose cleaned Bengali and
# English token lists differ in length. Each row is compared with itself (the
# previous version compared row 0 on every iteration, so it could never report
# a mismatch), and values are iterated directly so the check does not depend
# on the index labels being 0..n-1.
for i, (ben_tokens, en_tokens) in enumerate(
    zip(df['ben_split_clean'], df['en_split_clean'])
):
    if len(ben_tokens) != len(en_tokens):
        print(i)

In [37]:
# Build [Bengali word, English word] pairs by position within each row.
data = []
for ben_tokens, en_tokens in df[['ben_split_clean','en_split_clean']].values:
    # Pairing by position is only meaningful when both sides have the same
    # number of tokens. A row where the counts differ would either raise
    # IndexError (English side shorter) or pair the wrong words, so skip it.
    if len(ben_tokens) != len(en_tokens):
        continue
    for ben_word, en_word in zip(ben_tokens, en_tokens):
        # Keep a pair only when the Bengali token is NOT purely alphanumeric.
        # str.isalnum() is False as soon as the token contains a combining
        # mark (e.g. a Bengali vowel sign), and True for plain ASCII tokens
        # such as 'BRIC', which are therefore left out.
        # NOTE(review): Bengali words made only of base letters are also
        # alphanumeric and get dropped too; confirm this is intended.
        if not ben_word.isalnum():
            data.append([ben_word, en_word])

In [39]:
# Turn the collected word pairs into a two-column frame: 'ben' is the Bengali
# word and 'eng' the English-side word paired with it.
df_2 = pd.DataFrame(data, columns=['ben','eng'])

In [41]:
# Write the word pairs as a tab-separated file with neither header row nor
# index column.
df_2.to_csv(
    '../../data/transliteration/dictionary_data/Dictionary_data_cleaned.tsv',
    sep='\t',
    header=None,
    index=False,
)