In [1]:
#--------------------------------------
# imports
#--------------------------------------
import pandas as pd 
from tqdm import tqdm
from glob import glob
from bnunicodenormalizer import Normalizer
from indicparser.langs import bangla
# Register anusvara and visarga as consonant diacritics. The list lives on the
# imported module and survives for the whole kernel session, so only add the
# characters that are missing: re-running this cell must not append duplicates.
# Kept ahead of the parser construction below, as in the original ordering
# (presumably so the parser sees the extended list -- confirm against indicparser).
bangla.consonant_diacritics+=[d for d in ['ং','ঃ'] if d not in bangla.consonant_diacritics]
from indicparser import graphemeParser
import random

gp=graphemeParser("bangla")   # grapheme splitter; used below as gp.process(word)
bnorm=Normalizer()
tqdm.pandas()                 # enables Series.progress_apply used in a later cell
#--------------------------------------
# globals
#--------------------------------------
# Bangla digits 0-9.
numbers                =    ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
# ASCII punctuation marks plus the Bangla danda.
punctuations           =    ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '।']


# Normalized Oscar Corpus
* get unique graphemes

In [2]:
# Find every Oscar CSV, read each into a frame, and stack them.
# pd.concat is called without ignore_index, so each file's own row labels are kept.
csvs=list(tqdm(glob("oscar/*.csv")))
dfs=list(map(pd.read_csv,tqdm(csvs)))
df=pd.concat(dfs)
df

100%|██████████| 288/288 [00:00<00:00, 3671609.58it/s]
100%|██████████| 288/288 [00:03<00:00, 86.68it/s] 


Unnamed: 0,word,count
0,আয়াছ,184
1,মধুরিমা,184
2,ঝিমলি,184
3,জেটের,184
4,পুলিসকে,184
...,...,...
9995,কর্মসূচীরপ্রকল্পরাস্তা,1
9996,বাঙ্কারিয়া,1
9997,কর্মসূচীরা,1
9998,বাঙ্কোসদের,1


In [3]:

# Split every word into graphemes and store the result next to the word.
df["graphemes"]=df.word.progress_apply(gp.process)
graphemes=df.graphemes.tolist()

# Collect the distinct graphemes with a set: membership tests on a growing list
# made the original loop scan the whole list for every grapheme of every word.
# The extra grapheme is added by hand (presumably the parser never emits it --
# verify); seeding it into the set means it cannot end up listed twice.
_unique_graphemes={"র‍্যা"}
for _graphemes in tqdm(graphemes):
    _unique_graphemes.update(_graphemes)
dict_graphemes=sorted(_unique_graphemes)
df

100%|██████████| 2879809/2879809 [00:26<00:00, 110385.41it/s]
100%|██████████| 2879809/2879809 [00:25<00:00, 111844.72it/s]


Unnamed: 0,word,count,graphemes
0,আয়াছ,184,"[আ, য়া, ছ]"
1,মধুরিমা,184,"[ম, ধু, রি, মা]"
2,ঝিমলি,184,"[ঝি, ম, লি]"
3,জেটের,184,"[জে, টে, র]"
4,পুলিসকে,184,"[পু, লি, স, কে]"
...,...,...,...
9995,কর্মসূচীরপ্রকল্পরাস্তা,1,"[ক, র্ম, সূ, চী, র, প্র, ক, ল্প, রা, স্তা]"
9996,বাঙ্কারিয়া,1,"[বা, ঙ্কা, রি, য়া]"
9997,কর্মসূচীরা,1,"[ক, র্ম, সূ, চী, রা]"
9998,বাঙ্কোসদের,1,"[বা, ঙ্কো, স, দে, র]"


In [4]:
# Remove every consonant diacritic from each grapheme and keep the distinct
# remainders in first-seen order, leaving out anything that is a Bangla digit.
# A set tracks what has been taken (pre-loaded with the digits) so membership
# is a hash lookup instead of rebuilding `graphemes+numbers` on every iteration.
_seen=set(numbers)
graphemes=[]
for g in tqdm(dict_graphemes):
    for cd in bangla.consonant_diacritics:
        g=g.replace(cd,'')
    if g not in _seen:
        _seen.add(g)
        graphemes.append(g)
len(graphemes)

100%|██████████| 5863/5863 [00:00<00:00, 34291.38it/s]


3279

# Random generation functions

In [5]:

def random_exec(poplutation=[0,1],weights=[0.5,0.5],match=0):
    """Draw one weighted sample and report whether it equals ``match``.

    Args:
        poplutation: candidate values to draw from (spelling kept because it
            is part of the keyword interface).
        weights: relative weights, aligned position-by-position with
            ``poplutation``.
        match: value the drawn sample is compared against.

    Returns:
        The result of ``drawn == match`` for the single drawn value.
    """
    (drawn,) = random.choices(population=poplutation, weights=weights, k=1)
    return drawn == match

def create_words(graphemes,
                min_len=1,
                max_len=10,
                mods=['ঁ', 'ং', 'ঃ'],
                mod_weights=[0.3,0.7]):
    """Cut a shuffled copy of ``graphemes`` into consecutive random-length words.

    Every grapheme is used exactly once: the shuffled list is consumed from
    left to right in chunks, so the last chunk may be shorter than drawn.

    Args:
        graphemes: list of grapheme strings; it is copied, not modified.
        min_len: smallest chunk length drawn (inclusive).
        max_len: largest chunk length drawn (inclusive).
        mods: modifier characters, one of which may be appended to a single
            randomly chosen grapheme of a word.
        mod_weights: weights handed to ``random_exec``; the first entry is the
            weight of "append a modifier", the second of "leave as is".

    Returns:
        list of str: the generated words.
    """
    pool = graphemes.copy()
    random.shuffle(pool)
    total = len(pool)
    generated = []
    cursor = 0
    while cursor < total:
        span = random.randint(min_len, max_len)
        pieces = pool[cursor:cursor + span]
        if random_exec(weights=mod_weights):
            # Attach one modifier to one grapheme of this word.
            target = random.randint(0, len(pieces) - 1)
            pieces[target] = pieces[target] + random.choice(mods)
        generated.append("".join(pieces))
        cursor += span
    return generated
def create_numbers(numbers,
                min_len=1,
                max_len=10,
                num_samples=100000):
    """Generate random digit strings, some of them carrying a single ".".

    Args:
        numbers: sequence of digit characters to draw from.
        min_len: smallest number of digits per sample (inclusive).
        max_len: largest number of digits per sample (inclusive).
        num_samples: how many strings to generate.

    Returns:
        list of str: ``num_samples`` strings. When ``random_exec()`` (default
        weights) returns True, a "." is placed directly after one randomly
        chosen digit of that sample.
    """
    samples = []
    for _ in range(num_samples):
        n_digits = random.randint(min_len, max_len)
        digits = [random.choice(numbers) for _ in range(n_digits)]
        if random_exec():
            dot_at = random.randint(0, n_digits - 1)
            digits[dot_at] += "."
        samples.append("".join(digits))
    return samples

def create_mixed_data(numbers,
                    graphemes,
                    punctuations,    
                    num_samples=100000,
                    lens= [1,2,3,4,5,6,7,8,9,10],
                    weights= [0.05,0.05,0.1,0.15,0.15,0.15,0.15,0.1,0.05,0.05],
                    comp_weights= [0.33,0.34,0.33]):
    """Generate words whose components are a random mix of graphemes, digits and punctuation.

    Args:
        numbers: digit characters to draw from.
        graphemes: grapheme strings to draw from.
        punctuations: punctuation characters to draw from.
        num_samples: how many words to generate.
        lens: candidate word lengths (in components).
        weights: sampling weights aligned with ``lens``.
        comp_weights: sampling weights for the component type, in the order
            grapheme, number, punctuation.

    Returns:
        list of str: ``num_samples`` generated words.
    """
    # Component-type tag -> pool that the component is drawn from.
    kinds = ["g","n","p"]
    pools = {"g": graphemes, "n": numbers, "p": punctuations}

    words = []
    for _ in tqdm(range(num_samples)):
        word_len = random.choices(population=lens, weights=weights, k=1)[0]
        parts = []
        for _ in range(word_len):
            kind = random.choices(population=kinds, weights=comp_weights, k=1)[0]
            parts.append(random.choice(pools[kind]))
        words.append("".join(parts))
    return words

# Combine: write the test and train word lists

In [6]:
_hf=1      # number of copies of hw.csv in the test list
_sf=1      # number of copies of sc.csv in the test list
_gf=1000   # number of create_words passes over the grapheme list
SEED=42    # seeds `random` and the final shuffle so a re-run regenerates the same train list


def _write_words(path, frame):
    """Write the first column of ``frame`` to ``path``, one value per line."""
    # The words are Bangla text, so the encoding is fixed to UTF-8 rather than
    # left to the platform default. Plain "w" is enough: nothing is read back.
    with open(path,"w",encoding="utf-8") as f:
        for word in tqdm(frame.iloc[:,0],total=len(frame)):
            f.write(f"{word}\n")


# Test list: the first column of hw.csv and sc.csv, each repeated _hf / _sf times.
df=pd.read_csv("hw.csv")
dfs=[df for _ in range(_hf)]
df=pd.read_csv("sc.csv")
dfs+=[df for _ in range(_sf)]
df=pd.concat(dfs,ignore_index=True)
_write_words("bn_test.txt",df)

# Train list: synthetic words only (mixed, numeric, grapheme-based).
random.seed(SEED)
words=create_mixed_data(numbers,graphemes,punctuations)
dfm=pd.DataFrame({"word":words})
words=create_numbers(numbers)
dfn=pd.DataFrame({"word":words})
gwords=[]
for i in tqdm(range(_gf)):
    gwords+=create_words(graphemes)
dfg=pd.DataFrame({"word":gwords})

df=pd.concat([dfm,dfn,dfg],ignore_index=True)
df=df.sample(frac=1,random_state=SEED)   # shuffle the rows reproducibly
_write_words("bn_train.txt",df)


100%|██████████| 26741/26741 [00:00<00:00, 52170.47it/s]
100%|██████████| 100000/100000 [00:00<00:00, 100593.40it/s]
100%|██████████| 1000/1000 [00:02<00:00, 446.83it/s]
100%|██████████| 796779/796779 [00:15<00:00, 51631.89it/s]
