<a href="https://colab.research.google.com/github/FaisalAhmedBijoy/Chattogram-language-to-standard-bangla-language-conversion/blob/master/chittagong_Bn_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries

In [1]:
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import io
import time
import pickle
import numpy as np
import pandas as pd
import sentencepiece as spm

from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
print('device: ',device)

device:  cuda


## 2. Access Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 3. Analysis the Dataset

In [4]:
dataset_path='/content/drive/MyDrive/chittagong_language/data/chittagong_en_bn.csv'
df=pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,bangali sentence,English Sentense,English Maning Sentense
0,অনেরা,onera,আপনারা,APNARA,YOU
1,অলপল,olpol,আগোছালো,AGOSALO,TIDY UP
2,ফ্ইর,foir,ভিক্ষুক,BIKKUK,CADGER
3,আজিয়া,ajiya,আজকে,AJKE,TODAY
4,কালিয়া,kaliya,কালকে,KALKE,TOMORROW


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 5 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Original Chattogram Sentense          1079 non-null   object
 1   Original Chattogram language English  109 non-null    object
 2   bangali sentence                      873 non-null    object
 3   English Sentense                      39 non-null     object
 4   English Maning Sentense               39 non-null     object
dtypes: object(5)
memory usage: 42.3+ KB


### Check unique values in dataframe

In [6]:
def report_data_types_uniques_check(df):
    col = []
    d_type = []
    uniques = []
    n_uniques = []

    for i in df.columns:
        col.append(i)
        d_type.append(df[i].dtypes)
        uniques.append(df[i].unique()[:5])
        n_uniques.append(df[i].nunique())

    return pd.DataFrame({'Column': col, 'd_type': d_type, 'unique_sample': uniques, 'n_uniques': n_uniques})

In [7]:
report_data_types_uniques_check(df)

Unnamed: 0,Column,d_type,unique_sample,n_uniques
0,Original Chattogram Sentense,object,"[অনেরা, অলপল, ফ্ইর, আজিয়া, কালিয়া]",1068
1,Original Chattogram language English,object,"[onera, olpol, foir, ajiya, kaliya]",109
2,bangali sentence,object,"[আপনারা, আগোছালো, ভিক্ষুক, আজকে, কালকে]",855
3,English Sentense,object,"[APNARA, AGOSALO, BIKKUK, AJKE, KALKE]",38
4,English Maning Sentense,object,"[YOU, TIDY UP, CADGER, TODAY, TOMORROW]",36


In [8]:
# check missing values
df.isnull().sum()

Original Chattogram Sentense               1
Original Chattogram language English     971
bangali sentence                         207
English Sentense                        1041
English Maning Sentense                 1041
dtype: int64

### Select top 800 data for the project

In [9]:
top_800_df=df.head(800)
top_800_df.head()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,bangali sentence,English Sentense,English Maning Sentense
0,অনেরা,onera,আপনারা,APNARA,YOU
1,অলপল,olpol,আগোছালো,AGOSALO,TIDY UP
2,ফ্ইর,foir,ভিক্ষুক,BIKKUK,CADGER
3,আজিয়া,ajiya,আজকে,AJKE,TODAY
4,কালিয়া,kaliya,কালকে,KALKE,TOMORROW


In [10]:
top_800_df.describe()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,bangali sentence,English Sentense,English Maning Sentense
count,800,109,800,39,39
unique,792,109,786,38,36
top,ডাক চিক্কুরে পাড়া ফাডের,ajeye roja kaliye eid tor ma,মাইরের নাম বাবাজি,ASBEN,YOU
freq,3,1,2,2,3


### Select only the chattogram sentence and bangali sentence

In [11]:
original_chattogram_Sentense=list(top_800_df['Original Chattogram Sentense'])
bangali_sentence=list(top_800_df['bangali sentence '])

## 4. Data Processing
- Generate train and val data
- Save chattogram and stanadrd bangla text into local disk

In [12]:
data=[]
# print(Original_Chattogram_Sentense)
for i in range(len(original_chattogram_Sentense)):
  data.append([original_chattogram_Sentense[i],bangali_sentence[i]])


In [13]:
data[:10]

[['অনেরা', 'আপনারা'],
 ['অলপল', 'আগোছালো'],
 ['ফ্ইর', 'ভিক্ষুক'],
 ['আজিয়া', 'আজকে'],
 ['কালিয়া', 'কালকে'],
 ['যন নাই', 'যেতে নাই'],
 ['গত খাইল', 'গত কালকে'],
 ['ইতারে', 'তাকে'],
 ['অনেরে', 'আপনাকে'],
 ['আ্যাই', 'আমি']]

In [14]:
data[-10:]

[['মাচ এবে হাঠন পরিব', 'মাছ এটা কাটতে হবে'],
 ['তুই আজিয়ে আইবানা', 'তুমি আজকে আসবা'],
 ['তুর লাই হথা আছে ', 'তুর সাথে কথা আছে '],
 ['ওগ্গ মাছ লাগিব ', 'একটা মাছ লাগবে'],
 ['হালিয়ে দেহা গরবে', 'কালকে দেখা করবেন '],
 ['তোর হবর আছে ', 'তোমার বিপদ হবে'],
 ['হালিয়ে আইস্স', 'কালকে আসিয়েন'],
 ['তুই হন্ডে যর', 'তুমি কোথায় যাচ্ছ'],
 ['হানা ন হাবি', 'খাবার খাবি না'],
 ['পুটলা এবে হার', 'পলিথিন টা কার']]

In [15]:
train_chattogram = [i[0] for i in data[:600]]
val_chattogram = [i[0] for i in data[600:]]
print('train len: ',len(train_chattogram))
print(' val len: ',len(val_chattogram))

train len:  600
 val len:  200


In [16]:
val_chattogram[:10]

['বালা অতে টিয়া ন লাগে',
 'তোর মা তোরে ডাহের',
 'কিল্লায় ডাহের দে',
 'আই ন জনি',
 'কী হর ওডোএগিন',
 'কী হদ্দে এগিম',
 'কিল্লায় হছোনা',
 'আরে মামু তুই থাম',
 'হনে হয়য়ি দে',
 'বেশি মাতস']

In [17]:
train_bangla = [i[1] for i in data[:600]]
val_bangla = [i[1] for i in data[600:]]
print(len(train_bangla))
print(len(val_bangla))

600
200


In [18]:
val_bangla[:10]

['ভালো হতে টাকা পয়সা লাগে না',
 'তোর আম্মু ডাকতেছে ',
 'কেন ডাকতেছে ',
 'আমি জানি না',
 'কী বলছ এইসব ',
 'কী বলছ এইসব ',
 'কেন বলনা',
 'আরেহ মামা থামেন',
 'কে বলেছে?',
 'বেশি কথা বলছ']

### Saved Chattogram and standard bangla text into file

In [19]:
import os
process_data_path = "/content/drive/MyDrive/chittagong_language/data/process_data"
# os.makedirs(process_data_path, exits_ok= True)
os.makedirs(process_data_path, exist_ok = True)

In [20]:
def write_txt_file(file_path, data, encoding="utf-8"):
    with open(file_path, 'w') as f:
        for key in data:
            if isinstance(key, list):
                key = key[0]
            f.write(key+"\n")

In [21]:
write_txt_file(os.path.join(process_data_path, "chattogram_data.txt"), train_chattogram)

In [22]:
write_txt_file(os.path.join(process_data_path,"bangla_data.txt"), train_bangla)

In [23]:
def merge_data_write_txt_file(file_path, chattogram_data, bangla_data, encoding="utf-8"):
    with open(file_path, 'w') as f:
        for chattogram, bangla in zip(chattogram_data, bangla_data):
#             if isinstance(key, list):
#                 key = key[0]
            f.write(chattogram+"\t"+bangla+"\n")

In [24]:
merge_data_write_txt_file(os.path.join(process_data_path, "merge_data.txt"), train_chattogram, train_bangla)

In [25]:
model_path = "/content/drive/MyDrive/chittagong_language/model"
os.makedirs(model_path, exist_ok = True)

## 5. Generate Tokenizer and Vocab File

In [26]:
import sentencepiece as spm

def train_tokenizer(text_path="text.txt", model_prefix="model/chattogram_model", vocab_size=30000):
    spm.SentencePieceTrainer.train(f'--input={text_path} --model_prefix={model_prefix} --user_defined_symbols=<sep>,<cls> --vocab_size={vocab_size}')
    bn_sp = spm.SentencePieceProcessor()
    bn_sp.load(os.path.join(model_path, 'chattogram_model.model'))


In [27]:
chattogram_data_path = "/content/drive/MyDrive/chittagong_language/data/process_data/chattogram_data.txt"
bangla_data_path = "/content/drive/MyDrive/chittagong_language/data/process_data/bangla_data.txt"

### Generate Chattogram and standard bangla tokenizer

In [28]:
train_tokenizer(
    text_path = chattogram_data_path,
    model_prefix = "/content/drive/MyDrive/chittagong_language/model/chattogram_model",
    vocab_size = 961
)

In [29]:
train_tokenizer(
    text_path = bangla_data_path,
    model_prefix = "/content/drive/MyDrive/chittagong_language/model/bangla_model",
    vocab_size = 840
)

In [30]:
chattogram_tokenizer = spm.SentencePieceProcessor(model_file='/content/drive/MyDrive/chittagong_language/model/chattogram_model.model')
bangla_tokenizer = spm.SentencePieceProcessor(model_file='/content/drive/MyDrive/chittagong_language/model/bangla_model.model')

In [31]:
chattogram_tokenizer.encode("এত টিয়া হডে পাইয়ুম ")

[87, 23, 163, 49, 229]

In [32]:
bangla_tokenizer.encode("আমি আবার বিয়ে করেছি।")

[16, 72, 92, 90, 25, 274, 6]

In [33]:
print(chattogram_tokenizer.encode_as_pieces('এত টিয়া হডে পাইয়ুম '))
print(chattogram_tokenizer.encode_as_ids('এত টিয়া হডে পাইয়ুম '))

print(bangla_tokenizer.encode_as_pieces('এত টিয়া হডে পাইয়ুম '))
print(bangla_tokenizer.encode_as_ids('এত টিয়া হডে পাইয়ুম '))

['▁এত', '▁টিয়া', '▁হডে', '▁প', 'াইয়ুম']
[87, 23, 163, 49, 229]
['▁এত', '▁', 'টি', 'য়া', '▁হ', 'ড', 'ে', '▁পাই', 'য়', 'ু', 'ম']
[368, 10, 84, 645, 60, 603, 7, 507, 35, 105, 177]


### Generate and save vocab files

In [34]:
from torchtext.vocab import vocab
def build_vocab(sentences, tokenizer):
    counter = Counter()
    for sentence in sentences:
        # print(sentence)
        if isinstance(sentence, list):
            sentence = sentence[0]
        counter.update(tokenizer.encode(sentence, out_type=str))
    print(counter)
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], special_first=True)


In [35]:
torchtext.__version__

'0.16.0+cpu'

In [36]:
print('input sentence: ',train_chattogram[0])
chattogram_tokenizer.encode(train_chattogram[0])

input sentence:  অনেরা


[529]

### Merge train and val to generate a vocab

In [37]:
full_chattogram=train_chattogram+val_chattogram
full_bangla=train_bangla+val_bangla
print(len(full_chattogram))
print(len(full_bangla))

800
800


In [38]:
full_chattogram_vocab = build_vocab(full_chattogram, chattogram_tokenizer)
print('len of full chattogram vocab: ',len(full_chattogram_vocab))

Counter({'র': 163, '?': 130, '▁ন': 127, 'ত': 120, 'ে': 89, '।': 88, '▁আই': 71, '▁না': 66, 'ন': 54, 'া': 51, 'ি': 51, '▁আর': 51, 'ম': 50, '▁তুই': 50, 'রে': 49, ',': 49, '▁কী': 43, '▁দে': 42, '▁হ': 42, '▁টিয়া': 37, '▁এ': 36, '▁': 35, 'য়': 32, '▁আ': 32, 'ার': 30, 'ই': 29, '▁গ': 29, '▁কি': 29, 'ো': 29, '▁মা': 28, '▁নাই': 27, 'স': 27, '▁হন': 27, '▁তোয়ার': 25, '▁ব': 24, '▁হথা': 24, '▁গর': 23, '▁গরি': 22, '▁দ': 22, '▁কা': 22, '▁কিছু': 22, 'ুন': 21, 'রি': 21, '▁আরা': 21, '▁ক': 20, '▁ত': 20, 'ের': 20, 'ক': 20, '▁হই': 20, '▁প': 18, '▁ফ': 18, '▁অ': 18, '▁দি': 17, '▁আছে': 17, 'ল': 17, '▁আরে': 17, '▁ভা': 16, '▁বা': 16, '▁চ': 16, '▁ল': 16, '▁হা': 16, 'দে': 16, '▁চাই': 15, 'লা': 15, 'ু': 15, '▁এবে': 15, '▁ম': 14, '▁স': 14, 'লে': 14, '▁দাম': 14, '▁লা': 14, '▁যা': 14, '▁য': 13, '্যা': 13, 'ী': 13, 'নে': 13, '▁খ': 13, '▁এত': 13, '▁সা': 13, '▁ওয়া': 13, 'াই': 13, 'গিন': 13, 'না': 13, '▁কেন': 13, 'উ': 12, '▁তুয়ার': 12, 'দ্দে': 12, '▁ইয়ান': 12, '▁ও': 12, 'ব': 12, '▁মন': 12, '▁তুর': 12, 'হ': 12, '▁আছো'

In [39]:
full_bangla_vocab = build_vocab(full_bangla, bangla_tokenizer)
print('len of full chattogram vocab: ',len(full_bangla_vocab))

Counter({'?': 136, '।': 121, 'ে': 117, 'র': 112, '▁না': 98, '▁কী': 69, '▁আমি': 65, 'কে': 60, 'ি': 59, '▁আমা': 57, 'ন': 52, '▁': 51, 'ত': 51, 'ের': 50, 'তে': 48, '▁তুমি': 46, '▁টাকা': 42, '▁কেন': 41, 'রা': 40, '▁কথা': 39, 'ো': 37, 'া': 31, '▁ব': 31, '▁বল': 30, 'ই': 30, '▁ভালো': 27, '▁কোথায়': 27, '▁কিছু': 27, '▁করে': 26, 'ক': 26, ',': 26, 'স': 26, 'য়': 24, '▁ভা': 23, '▁মা': 23, '▁অ': 23, '▁ম': 23, '▁এটা': 22, '▁এ': 22, '▁কো': 22, 'ী': 21, '▁এই': 21, '▁তোমার': 21, '▁কে': 21, '▁নেই': 21, '▁ছেলে': 20, 'ার': 20, '▁কাজ': 19, 'লে': 18, '▁চল': 18, '▁হবে': 18, '▁কর': 18, '▁আর': 18, '▁কত': 17, 'নে': 17, '▁বাড়ি': 17, 'দের': 17, '▁দাম': 17, '▁যে': 16, '▁নাই': 16, '▁এখন': 16, '▁আছে': 16, '▁সব': 16, '▁ও': 16, '▁আস': 16, '▁কেমন': 15, 'ছ': 15, '▁তো': 15, 'টা': 15, '▁মানুষ': 15, '▁ন': 15, '▁আম': 15, '▁এক': 15, 'ছে': 14, '▁হ': 14, '▁থ': 14, '▁দি': 14, '▁দিন': 14, '▁ক': 14, '▁একটা': 14, 'ম': 14, 'না': 14, '▁তু': 14, '▁স': 13, '▁ভাই': 13, '▁খ': 13, '▁দিয়ে': 13, 'েকে': 13, 'ছো': 13, 'টি': 13, '▁আমাদের':

## 6. Generate Train and Val Data Loader

In [40]:
def data_process(chattogram, bangla):
    data = []
    for (raw_chattogram, raw_bangla) in zip(chattogram, bangla):
        chattogram_tensor_ = torch.tensor([full_chattogram_vocab[token] for token in chattogram_tokenizer.encode(raw_chattogram, out_type=str)],dtype=torch.long)
        bangla_tensor_ = torch.tensor([full_bangla_vocab[token] for token in bangla_tokenizer.encode(raw_bangla, out_type=str)],dtype=torch.long)
        data.append((chattogram_tensor_, bangla_tensor_))
    return data

In [41]:
train_data = data_process(train_chattogram, train_bangla)

In [42]:
train_data[:10]

[(tensor([4]), tensor([4])),
 (tensor([5, 6]), tensor([5, 6])),
 (tensor([7, 8, 9]), tensor([7, 8, 9])),
 (tensor([10]), tensor([10])),
 (tensor([11, 12]), tensor([11])),
 (tensor([13, 14, 15]), tensor([12, 13, 14])),
 (tensor([16, 17, 18]), tensor([15, 16, 11])),
 (tensor([19, 20]), tensor([17, 18])),
 (tensor([21, 22]), tensor([19])),
 (tensor([23]), tensor([20]))]

In [43]:
val_data = data_process(val_chattogram, val_bangla)

In [44]:
val_data[:10]

[(tensor([506, 275, 269,  71,  30, 300]),
  tensor([ 39,  58,  13,  55,  56, 523,  63,  26])),
 (tensor([877, 299, 748, 444, 518, 113]),
  tensor([ 94, 121, 324, 198, 209, 425, 130, 745])),
 (tensor([ 85, 584, 223, 444, 518, 113, 131]), tensor([247, 425, 130, 745])),
 (tensor([ 37,  30, 185,  49]), tensor([ 20, 690,  26])),
 (tensor([438,  75, 686, 898, 317]), tensor([150, 160,  49, 145, 462])),
 (tensor([438, 251, 126, 289, 540,  49,  54]),
  tensor([150, 160,  49, 145, 462])),
 (tensor([ 85, 584, 223, 251,  66, 789]), tensor([247, 160, 706])),
 (tensor([428, 893, 199, 862]), tensor([274,  63, 794, 661, 771, 144])),
 (tensor([636,  20, 783, 784, 131]), tensor([221, 152,  46,  38])),
 (tensor([543,  49, 250, 178]), tensor([711, 208, 159, 160,  49]))]

In [46]:
BATCH_SIZE = 150
PAD_IDX = full_chattogram_vocab['<pad>']
BOS_IDX = full_chattogram_vocab['<bos>']
EOS_IDX = full_chattogram_vocab['<eos>']

def generate_batch(data_batch):
    chattogram_batch, bangla_batch = [], []
    for (chattogram_item, bangla_item) in data_batch:
        chattogram_batch.append(torch.cat([torch.tensor([BOS_IDX]), chattogram_item, torch.tensor([EOS_IDX])], dim=0))
        bangla_batch.append(torch.cat([torch.tensor([BOS_IDX]), bangla_item, torch.tensor([EOS_IDX])], dim=0))
    chattogram_batch = pad_sequence(chattogram_batch, padding_value=PAD_IDX)
    bangla_batch = pad_sequence(bangla_batch, padding_value=PAD_IDX)
    return chattogram_batch, bangla_batch

train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)
val_iter = DataLoader(val_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)

In [47]:
for batch in val_iter:
  print(batch[0][0])

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2])


## 7. Implement Sequence 2 Sequence (Seq2Seq) Model using Transformer

In [48]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)


class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward
            )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
            )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward
            )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
            )

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [49]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding +
                            self.pos_embedding[:token_embedding.size(0),:])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [50]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

## 8. Define Train Parameter for Seq2Seq Model

In [51]:
from tqdm import tqdm
SRC_VOCAB_SIZE = len(full_chattogram_vocab)
TGT_VOCAB_SIZE = len(full_bangla_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 150
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
NUM_EPOCHS = 300


transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def train_epoch(model, train_iter, optimizer):
    model.train()
    losses = 0
    for idx, (src, tgt) in enumerate(train_iter):
#         print("training iter : ", idx)
#     for idx in tqdm(range(len(train_iter))):
#         src, tgt = train_iter[idx]
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,
                                src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:,:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
    return losses / len(train_iter)




In [52]:
def evaluate(model, val_iter):
    model.eval()
    losses = 0
    for idx, (src, tgt) in (enumerate(val_iter)):
#         print(idx)
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,
                                  src_padding_mask, tgt_padding_mask, src_padding_mask)
        tgt_out = tgt[1:,:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(val_iter)

## 9. Train Seq2Seq Transformer Model

In [53]:
for epoch in range(1, NUM_EPOCHS+1):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_iter, optimizer)
#     if epoch % 5 == 0:
    val_loss = evaluate(transformer, val_iter)
    end_time = time.time()
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, val loss : {val_loss:.3f} "
          f"Epoch time = {(end_time - start_time):.3f}s"))


    # save model + checkpoint to resume training later
    torch.save({
      'epoch': NUM_EPOCHS,
      'model_state_dict': transformer.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': train_loss,
      }, '/content/drive/MyDrive/chittagong_language/model/model_checkpoint.tar')



Epoch: 1, Train loss: 6.480, val loss : 6.055 Epoch time = 2.070s
Epoch: 2, Train loss: 6.001, val loss : 5.772 Epoch time = 0.867s
Epoch: 3, Train loss: 5.848, val loss : 5.654 Epoch time = 0.897s
Epoch: 4, Train loss: 5.767, val loss : 5.605 Epoch time = 0.801s
Epoch: 5, Train loss: 5.702, val loss : 5.570 Epoch time = 0.911s
Epoch: 6, Train loss: 5.657, val loss : 5.532 Epoch time = 0.795s
Epoch: 7, Train loss: 5.615, val loss : 5.568 Epoch time = 0.790s
Epoch: 8, Train loss: 5.584, val loss : 5.468 Epoch time = 0.804s
Epoch: 9, Train loss: 5.545, val loss : 5.538 Epoch time = 0.908s
Epoch: 10, Train loss: 5.499, val loss : 5.435 Epoch time = 0.814s
Epoch: 11, Train loss: 5.404, val loss : 5.385 Epoch time = 0.835s
Epoch: 12, Train loss: 5.313, val loss : 5.254 Epoch time = 0.894s
Epoch: 13, Train loss: 5.184, val loss : 5.167 Epoch time = 0.805s
Epoch: 14, Train loss: 5.065, val loss : 5.279 Epoch time = 0.892s
Epoch: 15, Train loss: 4.958, val loss : 5.030 Epoch time = 0.834s
Epoc

## 10. Evaluate Model

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [56]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()
        ys = torch.cat([ys,torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    model.eval()
    tokens = [BOS_IDX] + [src_vocab.get_stoi()[tok] for tok in src_tokenizer.encode(src, out_type=str)]+ [EOS_IDX]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    p_text = " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
    pts = " ".join(list(map(lambda x : x , p_text.replace(" ", "").split("▁"))))
    return pts.strip()

In [57]:
# for i in data[:10]:
text = "বালা অতে টিয়া ন লাগে"
pre = translate(transformer, text, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
print(f"input : {text}")
print(f"prediction: {pre}")

input : বালা অতে টিয়া ন লাগে
prediction: টাকা পয়সার হিসেব কই


In [None]:
# itos = ja_vocab.itos()

## 11. Save Vocab and PyTorch model in Local Storage

In [58]:
import pickle
full_chattogram_vocab_pickle_file='/content/drive/MyDrive/chittagong_language/model/full_chattogram_vocab.pkl'
full_bangla_vocab_pickle_file='/content/drive/MyDrive/chittagong_language/model/full_bangla_vocab.pkl'
# open a file, where you want to store the data
file = open(full_chattogram_vocab_pickle_file, 'wb')
# dump information to that file
pickle.dump(full_chattogram_vocab, file)
file.close()
file = open(full_bangla_vocab_pickle_file, 'wb')
pickle.dump(full_bangla_vocab, file)
file.close()

In [59]:
# save model + checkpoint to resume training later
torch.save({
  'epoch': NUM_EPOCHS,
  'model_state_dict': transformer.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),
  'loss': train_loss,
  }, '/content/drive/MyDrive/chittagong_language/model/seq2seq_model_checkpoint.pt')

## 13. Inference
Here the inference script after load sentencepice train tokenizer model, vocal and train model

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [60]:
chattogram_model_path='/content/drive/MyDrive/chittagong_language/model/chattogram_model.model'
bangla_model_path='/content/drive/MyDrive/chittagong_language/model/bangla_model.model'
chattogram_tokenizer = spm.SentencePieceProcessor(model_file=chattogram_model_path)
bangla_tokenizer = spm.SentencePieceProcessor(model_file=bangla_model_path)

In [61]:
chattogram_vocab_file_path='/content/drive/MyDrive/chittagong_language/model/full_chattogram_vocab.pkl'
file = open(chattogram_vocab_file_path, 'rb')
chattogram_vocab = pickle.load(file)
file.close()

In [62]:
bangla_vocab_file_path='/content/drive/MyDrive/chittagong_language/model/full_bangla_vocab.pkl'
file = open(bangla_vocab_file_path, 'rb')
bangla_vocal = pickle.load(file)
file.close()

In [64]:
PATH = "/content/drive/MyDrive/chittagong_language/model/seq2seq_model_checkpoint.pt"

model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
model.to(device)
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()

Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )

In [66]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(device).type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()
        ys = torch.cat([ys,torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
#     model.eval()
    tokens = [BOS_IDX] + [src_vocab.get_stoi()[tok] for tok in src_tokenizer.encode(src, out_type=str)]+ [EOS_IDX]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1) )
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    p_text = " ".join([tgt_vocab.get_itos()[tok] for tok in tgt_tokens]).replace("<bos>", "").replace("<eos>", "")
    pts = " ".join(list(map(lambda x : x , p_text.replace(" ", "").split("▁"))))
    return pts.strip()

## 14. Generate Samples with Model

In [67]:
for i in data[-10:]:
    text = "বালা অতে টিয়া ন লাগে"
    pre = translate(model, i[0],full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(f"input : {i[0]}")
    print(f"Ground Truth : {i[1]}")
    print(f"prediction: {pre}")
    print("================================\n")

input : মাচ এবে হাঠন পরিব
Ground Truth : মাছ এটা কাটতে হবে
prediction: ওকে ধরতে হবে

input : তুই আজিয়ে আইবানা
Ground Truth : তুমি আজকে আসবা
prediction: আমি তুমি বল।

input : তুর লাই হথা আছে 
Ground Truth : তুর সাথে কথা আছে 
prediction: কথা আছে তুর সাথে

input : ওগ্গ মাছ লাগিব 
Ground Truth : একটা মাছ লাগবে
prediction: একটা বড় মাছ লাগবে

input : হালিয়ে দেহা গরবে
Ground Truth : কালকে দেখা করবেন 
prediction: এই যে, আপনাার হাট আসতে হবে

input : তোর হবর আছে 
Ground Truth : তোমার বিপদ হবে
prediction: তোমার খবর আছে

input : হালিয়ে আইস্স
Ground Truth : কালকে আসিয়েন
prediction: আমি যাই

input : তুই হন্ডে যর
Ground Truth : তুমি কোথায় যাচ্ছ
prediction: তুমি কোথায় যাচ্ছো?

input : হানা ন হাবি
Ground Truth : খাবার খাবি না
prediction: কাকা না

input : পুটলা এবে হার
Ground Truth : পলিথিন টা কার
prediction: এটা কার মেয়ে টা কী



In [68]:
for i in data[:20]:
    text = "বালা অতে টিয়া ন লাগে"
    pre = translate(model, i[0],full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(f"input : {i[0]}")
    print(f"Ground Truth : {i[1]}")
    print(f"prediction: {pre}")
    print("================================\n")

input : অনেরা
Ground Truth : আপনারা
prediction: আপনারা

input : অলপল
Ground Truth : আগোছালো
prediction: আগোছালো

input : ফ্ইর
Ground Truth : ভিক্ষুক
prediction: ভিক্ষুক

input : আজিয়া
Ground Truth : আজকে
prediction: আজকে

input : কালিয়া
Ground Truth : কালকে
prediction: কালকে

input : যন নাই
Ground Truth : যেতে নাই
prediction: যেতে নাই

input : গত খাইল
Ground Truth : গত কালকে
prediction: গত কালকে

input : ইতারে
Ground Truth : তাকে
prediction: তাকে

input : অনেরে
Ground Truth : আপনাকে
prediction: আপনাকে

input : আ্যাই
Ground Truth : আমি
prediction: আমি

input : তুউই
Ground Truth : তুমি
prediction: তুমি

input : গোসসা
Ground Truth : রাগ
prediction: রাগ

input : ঐইন্না
Ground Truth : এখন
prediction: এখন

input : আ্যাই ন ডরাই
Ground Truth : আমি ভয় পাই না
prediction: আমি ভয় পাই না

input : ডর
Ground Truth : ভয়
prediction: ভয়

input : উযু
Ground Truth : সোজা
prediction: সোজা

input : আইস্সুন
Ground Truth : আসবেন/আসছেন
prediction: আসবেন/আসছেন

input : অনেরা বিয়াকে আইবান
Ground Truth : আপনারা

## 15. Save the Inference Result into CSV File

In [69]:
import pandas as pd

# Create a list of dictionaries containing the data
data_list = []
for i in range(800):
    input_text = data[i][0]  # Input Bangla text
    ground_truth = data[i][1]  # Ground truth Standard Bangla text
    prediction = translate(model, input_text, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)  # Translate input to Standard Bangla

    data_list.append({
        'Input Text': input_text,
        'Ground Truth': ground_truth,
        'Prediction': prediction
    })

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data_list)

# Save DataFrame to a CSV file
csv_file_path='/content/drive/MyDrive/chittagong_language/logs/predictions.csv'
df.to_csv(csv_file_path, index=False)

print("CSV file saved successfully.")


CSV file saved successfully.


In [70]:
for i in data[:20]:
    text = "বালা অতে টিয়া ন লাগে"
    pre = translate(model, i[0],full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(f"input : {i[0]}")
    print(f"Ground Truth : {i[1]}")
    print(f"prediction: {pre}")
    print("================================\n")

input : অনেরা
Ground Truth : আপনারা
prediction: আপনারা

input : অলপল
Ground Truth : আগোছালো
prediction: আগোছালো

input : ফ্ইর
Ground Truth : ভিক্ষুক
prediction: ভিক্ষুক

input : আজিয়া
Ground Truth : আজকে
prediction: আজকে

input : কালিয়া
Ground Truth : কালকে
prediction: কালকে

input : যন নাই
Ground Truth : যেতে নাই
prediction: যেতে নাই

input : গত খাইল
Ground Truth : গত কালকে
prediction: গত কালকে

input : ইতারে
Ground Truth : তাকে
prediction: তাকে

input : অনেরে
Ground Truth : আপনাকে
prediction: আপনাকে

input : আ্যাই
Ground Truth : আমি
prediction: আমি

input : তুউই
Ground Truth : তুমি
prediction: তুমি

input : গোসসা
Ground Truth : রাগ
prediction: রাগ

input : ঐইন্না
Ground Truth : এখন
prediction: এখন

input : আ্যাই ন ডরাই
Ground Truth : আমি ভয় পাই না
prediction: আমি ভয় পাই না

input : ডর
Ground Truth : ভয়
prediction: ভয়

input : উযু
Ground Truth : সোজা
prediction: সোজা

input : আইস্সুন
Ground Truth : আসবেন/আসছেন
prediction: আসবেন/আসছেন

input : অনেরা বিয়াকে আইবান
Ground Truth : আপনারা

## 16. Calculate Model Performance

In [71]:
def calculate_accuracy(model, data, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer):
    total = 0
    correct = 0

    for i in data:
        input_text = i[0]  # Input Bangla text
        ground_truth = i[1]  # Ground truth Standard Bangla text
        prediction = translate(model, i[0], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)  # Translate input to Standard Bangla

        # Check if prediction matches ground truth
        if prediction.strip() == ground_truth.strip():
            correct += 1
        total += 1

    accuracy = correct / total
    return accuracy


In [72]:
# Calculate accuracy of full dataset
accuracy = calculate_accuracy(model, data, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
print("accuracy:", accuracy)

accuracy: 0.50375


In [73]:
# Calculate accuracy of full dataset
train_accuracy = calculate_accuracy(model, data[:600], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
print("train accuracy:", train_accuracy)

train accuracy: 0.6666666666666666


In [74]:
# Calculate accuracy of full dataset
val_accuracy = calculate_accuracy(model, data[600:], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
print("val accuracy:", val_accuracy)

val accuracy: 0.015
