In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import random

import sys
# Extend the module search path so code under this directory can be imported.
# NOTE(review): hard-coded absolute path; none of the visible cells import
# anything that obviously lives there — confirm it is still required.
sys.path.append("/home/ubuntu/ibas_project")

# Download the NLTK resources used by the cells below.
# Presumably 'punkt' backs word_tokenize and 'wordnet' backs wordnet.synsets —
# TODO confirm against the installed NLTK version (newer releases may need
# additional resources for word_tokenize).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/zubair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/zubair/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct alphabetic words with a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym.
    Words for which ``get_synonyms`` returns nothing are skipped and do not
    count towards ``n``.

    Args:
        words: Sequence of string tokens. It is not modified.
        n: Maximum number of distinct words to replace. Values ``<= 0``
            leave the tokens untouched.

    Returns:
        The (possibly modified) tokens joined with single spaces.
    """
    new_words = list(words)

    # Check the budget *before* doing any work: the previous version only
    # checked after a replacement, so n == 0 still replaced one word.
    if n <= 0:
        return ' '.join(new_words)

    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # iterate in an order that depends on string hash randomisation, which
    # makes the shuffle irreproducible even when `random` is seeded.
    candidates = list(dict.fromkeys(word for word in new_words if word.isalpha()))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        if num_replaced >= n:
            break
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1

    return ' '.join(new_words)

In [3]:
def get_synonyms(word):
    """Return the distinct WordNet lemma names that can stand in for ``word``.

    Each lemma name from every synset of ``word`` is normalised: underscores
    and hyphens become spaces, the text is lower-cased, and every character
    that is neither alphabetic nor a space is dropped.

    Args:
        word: The token to look up with ``wordnet.synsets``.

    Returns:
        A sorted list of unique, non-empty synonyms that differ from ``word``
        (compared case-insensitively). Empty when nothing is found.
    """
    # Lemma names are lower-cased below, so compare against the lower-cased
    # input; otherwise "Dog" would receive "dog" as a synonym of itself.
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            # Keep spaces: filtering on isalpha() alone removed the separators
            # that were just inserted and fused multi-word lemmas together
            # (e.g. "ice_cream" -> "icecream").
            synonym = "".join(char for char in synonym if char.isalpha() or char == " ")
            # Collapse whitespace left behind by dropped characters.
            synonym = " ".join(synonym.split())
            if synonym and synonym != target:
                synonyms.add(synonym)
    # Sorted so that a seeded random.choice over the result is reproducible.
    return sorted(synonyms)

In [4]:
def augment_sentence(sentence, n=10):
    """Build a shuffled list holding ``sentence`` plus synonym-replaced variants.

    The list always contains ``max(n, 1)`` items: the original sentence and
    ``n - 1`` variants, each produced by ``synonym_replacement(words, 1)`` on
    the tokenized sentence. Variants are re-joined with single spaces, so
    their spacing around punctuation can differ from the original, and
    variants may be identical to each other.

    Args:
        sentence: Text to augment. Non-string values (for example ``None`` or
            a NaN read from an empty spreadsheet cell) are not tokenized;
            they are repeated unchanged so callers still get the usual
            number of items.
        n: Total number of sentences wanted, including the original.

    Returns:
        A list of ``max(n, 1)`` items in random order.
    """
    total = max(n, 1)

    # The tokenizer only accepts text; pass anything else through untouched
    # instead of failing in the middle of a batch.
    if not isinstance(sentence, str):
        return [sentence] * total

    augmented_sentences = [sentence]
    if total > 1:
        # Tokenize once and reuse the tokens for every variant.
        words = word_tokenize(sentence)
        augmented_sentences.extend(
            synonym_replacement(words, 1) for _ in range(total - 1)
        )

    # Shuffle so the original sentence is not always first.
    random.shuffle(augmented_sentences)
    return augmented_sentences

In [5]:

def augment_data(input_file, output_file, n=10):
    """Augment every question in an Excel sheet and write the result to Excel.

    The input sheet must contain the columns ``bangla_ques``,
    ``transliterated_ques`` and ``english_ques``. Each cell is expanded with
    ``augment_sentence(cell, n)`` and the expanded lists are concatenated per
    column, so one input row becomes ``max(n, 1)`` output rows.

    NOTE(review): all three columns go through the same WordNet-based synonym
    replacement. That lookup is presumably English-only, so Bangla text is
    likely to come back unchanged apart from tokenization spacing, and
    transliterated words that happen to match English words may be replaced
    incorrectly — confirm this is intended.

    Args:
        input_file: Path of the Excel file to read with ``pd.read_excel``.
        output_file: Path of the Excel file to write (index column omitted).
        n: Number of sentences generated per cell, original included.
            Defaults to 10, the value that was previously hard-coded.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    columns = ('bangla_ques', 'transliterated_ques', 'english_ques')
    df = pd.read_excel(input_file)

    # Fail early with a clear message rather than midway through the first row.
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {missing}")

    copies = max(n, 1)
    augmented_data = {column: [] for column in columns}

    for values in df[list(columns)].itertuples(index=False, name=None):
        for column, value in zip(columns, values):
            if isinstance(value, str):
                augmented = augment_sentence(value, n)
            else:
                # Empty (NaN) or non-text cell: nothing to tokenize. Repeat the
                # raw value so all three columns stay the same length.
                augmented = [value] * copies
            augmented_data[column].extend(augmented)

    augmented_df = pd.DataFrame(augmented_data)
    augmented_df.to_excel(output_file, index=False)

In [None]:
if __name__ == "__main__":
    # Workbook to read and workbook to write for this augmentation run.
    # Change the first path to your input file and the second to the desired
    # output file name.
    input_file, output_file = (
        "/home/zubair/workstation_2/source/Final-updated-dataset.xlsx",
        "/home/zubair/workstation_2/source/Final-updated-augmented-dataset.xlsx",
    )
    augment_data(input_file=input_file, output_file=output_file)


In [7]:
!pip install bnaug

Collecting bnaug
  Downloading bnaug-1.1.2-py3-none-any.whl (4.8 kB)
Collecting bnlp-toolkit==3.3.2 (from bnaug)
  Downloading bnlp_toolkit-3.3.2-py3-none-any.whl (23 kB)
Collecting transformers==4.24.0 (from bnaug)
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting ftfy (from bnlp-toolkit==3.3.2->bnaug)
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m336.0 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
[?25hCollecting emoji==1.7.0 (from bnlp-toolkit==3.3.2->bnaug)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h  Prepari

In [9]:
from bnaug.sentence import TokenReplacement

# Demo of bnaug's masking-based token replacement on one Bangla sentence.
# The recorded run printed a flat list of sentence variants, several of them
# duplicates of each other.
# sen_n presumably limits how many variants are generated per masked token —
# TODO confirm against the bnaug documentation.
tokr = TokenReplacement()
text = "আমি ঢাকায় বাস করি।"
output = tokr.masking_based(text, sen_n=5)
print(output)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['আমরা ঢাকায বাস করি ।', 'আমি ঢাকায বাস করি ।', 'এখানে ঢাকায বাস করি ।', 'সেখানে ঢাকায বাস করি ।', 'বাসে ঢাকায বাস করি ।', 'আমি ঢাকায বাস করি ।', 'আমি ঢাকায বাস করি?', 'আমি ঢাকায বাস করি!', 'আমি ঢাকায বাস করি না', 'আমি ঢাকায বাস করি', 'আমি ঢাকায বাস করি ।', 'আমি ঢাকায বাস করছি ।', 'আমি ঢাকায বাস করেছি ।', 'আমি ঢাকায বাস করতাম ।', 'আমি ঢাকায বাস করিনি ।', 'আমি এখানে বাস করি ।', 'আমি সেখানে বাস করি ।', 'আমি বাস করি ।', 'আমি বাংলাদেশে বাস করি ।', 'আমি ওখানে বাস করি ।', 'আমি ঢাকায বাস করি ।', 'আমি ঢাকায বাস করি?', 'আমি ঢাকায বাস করি!', 'আমি ঢাকায বাস করি না', 'আমি ঢাকায বাস করি']


In [13]:
from bnaug.sentence import TokenReplacement

# Demo of bnaug's word2vec-based token replacement.
# NOTE(review): in the recorded run the model path below did not exist; the
# call printed "No such file or directory" five times and returned an empty
# list. Point `model` at a real word2vec model file before relying on this.
# NOTE(review): the path is absolute from the filesystem root — confirm that
# is intended rather than a path relative to the notebook.
tokr = TokenReplacement()
text = "আমি ঢাকায় বাস করি।"
model = "/bangla_word2vec/bnwiki_word2vec.model"
output = tokr.word2vec_based(text, model=model, sen_n=5, word_n=5)
print(output)

[Errno 2] No such file or directory: '/bangla_word2vec/bnwiki_word2vec.model'
[Errno 2] No such file or directory: '/bangla_word2vec/bnwiki_word2vec.model'
[Errno 2] No such file or directory: '/bangla_word2vec/bnwiki_word2vec.model'
[Errno 2] No such file or directory: '/bangla_word2vec/bnwiki_word2vec.model'
[Errno 2] No such file or directory: '/bangla_word2vec/bnwiki_word2vec.model'
[]


In [14]:
from bnaug.sentence import TokenReplacement

# Demo of bnaug's GloVe-based token replacement.
# NOTE(review): in the recorded run the vector file below did not exist; the
# call printed "No such file or directory" five times and returned an empty
# list. The path is relative, so it depends on the kernel's working directory.
tokr = TokenReplacement()
text = "আমি ঢাকায় বাস করি।"
vector = "msc/bn_glove.300d.txt"
output = tokr.glove_based(text, vector_path=vector, sen_n=5, word_n=5)
print(output)

[Errno 2] No such file or directory: 'msc/bn_glove.300d.txt'
[Errno 2] No such file or directory: 'msc/bn_glove.300d.txt'
[Errno 2] No such file or directory: 'msc/bn_glove.300d.txt'
[Errno 2] No such file or directory: 'msc/bn_glove.300d.txt'
[Errno 2] No such file or directory: 'msc/bn_glove.300d.txt'
[]


In [None]:
from bnaug.sentence import BackTranslation

# Demo of bnaug's back-translation augmentation on one Bangla sentence.
# This cell has no recorded output (it was not executed in the saved notebook).
# NOTE(review): the sample text ends with a trailing space inside the string
# literal — confirm whether that is intentional.
bt = BackTranslation()
text = "বাংলা ভাষা আন্দোলন তদানীন্তন পূর্ব পাকিস্তানে সংঘটিত একটি সাংস্কৃতিক ও রাজনৈতিক আন্দোলন। "
output = bt.get_augmented_sentences(text)
print(output)


In [None]:
from bnaug.sentence import TextGeneration

# Demo of bnaug's paraphrase generation on one Bangla sentence.
# This cell has no recorded output (it was not executed in the saved notebook).
# NOTE(review): the method name is spelled `parapharse_generation`; verify it
# against the installed bnaug API before "correcting" the spelling.
tg = TextGeneration()
text = "বিমানটি যখন মাটিতে নামার জন্য এয়ারপোর্টের কাছাকাছি আসছে, তখন ল্যান্ডিং গিয়ারের খোপের ঢাকনাটি খুলে যায়।"
output = tg.parapharse_generation(text)
print(output)

In [None]:
from bnaug import randaug

# Demo of bnaug's rule-based helpers: each one is applied to a short sample
# and its result is printed.
# The result is assigned to `output` before every print. Previously the last
# three calls discarded their return value, so print(output) re-printed the
# stale result of remove_punctuations.
# NOTE(review): assumes remove_stopwords / remove_random_word /
# remove_random_char return the transformed text like the first two helpers —
# confirm against the bnaug documentation.

text = "১০০ বাকি দিলাম"
output = randaug.remove_digits(text)
print(output)

text = "১০০! বাকি দিলাম?"
output = randaug.remove_punctuations(text)
print(output)

text = "আমি ১০০ বাকি দিলাম"
output = randaug.remove_stopwords(text)
print(output)

text = "আমি ১০০ বাকি দিলাম"
output = randaug.remove_random_word(text)
print(output)

text = "আমি ১০০ বাকি দিলাম"
output = randaug.remove_random_char(text)
print(output)