<a href="https://colab.research.google.com/github/FaisalAhmedBijoy/Chattogram-language-to-standard-bangla-language-conversion/blob/master/chittagong_Bn_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Libraries

In [1]:
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import io
import time
import pickle
import numpy as np
import pandas as pd
import sentencepiece as spm

from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
print('device: ',device)

device:  cuda


## 2. Access Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 3. Analyze the Dataset

In [4]:
dataset_path='/content/drive/MyDrive/Chattogram_language/data/Chattogram_Sentense.csv'
df=pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,Bangali_Sentense,English Sentense,English Maning Sentense
0,অনেরা কডে,onera,আপনারা কোথায়,APNARA,YOU
1,অলপল গরি এইজ্জু,olpol,আগোছালো করে রাখসো,AGOSALO,TIDY UP
2,ফ্ইর বানাইবা ফানলার,foir,ভিক্ষুক বানাবা মনেহয়,BIKKUK,CADGER
3,আজিয়া বিইয়্যা,ajiya,আজকে বিয়ে,AJKE,TODAY
4,কালিয়া মেজ্জান,kaliya,কালকে মেজবান,KALKE,TOMORROW


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16625 entries, 0 to 16624
Data columns (total 5 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Original Chattogram Sentense          16625 non-null  object
 1   Original Chattogram language English  110 non-null    object
 2   Bangali_Sentense                      16620 non-null  object
 3   English Sentense                      39 non-null     object
 4   English Maning Sentense               39 non-null     object
dtypes: object(5)
memory usage: 649.5+ KB


### Check unique values in dataframe

In [6]:
def report_data_types_uniques_check(df):
    """Summarise every column of ``df`` in a small report frame.

    Returns a DataFrame with one row per column of ``df`` and the columns
    ``Column`` (name), ``d_type`` (dtype), ``unique_sample`` (the first five
    distinct values) and ``n_uniques`` (number of distinct non-null values).
    """
    names = list(df.columns)
    report = {
        'Column': names,
        'd_type': [df[name].dtypes for name in names],
        'unique_sample': [df[name].unique()[:5] for name in names],
        'n_uniques': [df[name].nunique() for name in names],
    }
    return pd.DataFrame(report)

In [7]:
report_data_types_uniques_check(df)

Unnamed: 0,Column,d_type,unique_sample,n_uniques
0,Original Chattogram Sentense,object,"[অনেরা কডে, অলপল গরি এইজ্জু, ফ্ইর বানাইবা ফানল...",11772
1,Original Chattogram language English,object,"[onera, olpol, foir, ajiya, kaliya]",110
2,Bangali_Sentense,object,"[আপনারা কোথায়, আগোছালো করে রাখসো, ভিক্ষুক বানা...",10538
3,English Sentense,object,"[APNARA, AGOSALO, BIKKUK, AJKE, KALKE]",38
4,English Maning Sentense,object,"[YOU, TIDY UP, CADGER, TODAY, TOMORROW]",36


In [8]:
# check missing values
df.isnull().sum()

Original Chattogram Sentense                0
Original Chattogram language English    16515
Bangali_Sentense                            5
English Sentense                        16586
English Maning Sentense                 16586
dtype: int64

### Select the top 1000 rows for the project (first 800 for training, remaining 200 for validation)

In [9]:
top_1000_df=df.head(1000)
top_1000_df.head()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,Bangali_Sentense,English Sentense,English Maning Sentense
0,অনেরা কডে,onera,আপনারা কোথায়,APNARA,YOU
1,অলপল গরি এইজ্জু,olpol,আগোছালো করে রাখসো,AGOSALO,TIDY UP
2,ফ্ইর বানাইবা ফানলার,foir,ভিক্ষুক বানাবা মনেহয়,BIKKUK,CADGER
3,আজিয়া বিইয়্যা,ajiya,আজকে বিয়ে,AJKE,TODAY
4,কালিয়া মেজ্জান,kaliya,কালকে মেজবান,KALKE,TOMORROW


In [10]:
top_1000_df.describe()

Unnamed: 0,Original Chattogram Sentense,Original Chattogram language English,Bangali_Sentense,English Sentense,English Maning Sentense
count,1000,109,1000,39,39
unique,990,109,982,38,36
top,ডাক চিক্কুরে পাড়া ফাডের,ajeye roja kaliye eid tor ma,কী হয়েছে,ASBEN,YOU
freq,3,1,3,2,3


### Select only the chattogram sentence and bangali sentence

In [12]:
original_chattogram_Sentense=list(top_1000_df['Original Chattogram Sentense'])
bangali_sentence=list(top_1000_df['Bangali_Sentense'])

## 4. Data Processing
- Generate train and val data
- Save Chattogram and standard Bangla text to disk

In [13]:
# Pair every Chattogram sentence with its standard-Bangla counterpart as
# [chattogram, bangla]; both lists come from the same frame, so they align.
data = [
    [chattogram_text, bangla_text]
    for chattogram_text, bangla_text in zip(original_chattogram_Sentense, bangali_sentence)
]


In [14]:
data[:10]

[['অনেরা কডে', 'আপনারা কোথায়'],
 ['অলপল গরি এইজ্জু', 'আগোছালো করে রাখসো'],
 ['ফ্ইর বানাইবা ফানলার', 'ভিক্ষুক বানাবা মনেহয়'],
 ['আজিয়া বিইয়্যা', 'আজকে বিয়ে'],
 ['কালিয়া মেজ্জান', 'কালকে মেজবান'],
 ['যন নাই', 'যেতে নাই'],
 ['গত খাইল', 'গত কালকে'],
 ['ইতারে ধর', 'তাকে ধরো'],
 ['অনেরে তোয়ারদে', 'আপনাকে খুঁজতেসে'],
 ['আ্যাই কিত্তাম', 'আমি কী করবো']]

In [15]:
data[-10:]

[['সুন্দর আর পইজ্জ ফরিষ্কার বাংলাদেশ চাই।', 'সুন্দর ও পরিষ্কার বাংলাদেশ চাই'],
 ['যেডে মাইনসে স্সথির ভাত খাইত তারিবু',
  'যেখানে মানুষ শান্তিতে ভাত খেতে পারবে'],
 ['খুদা আর দারিদ্রতা মুক্ত বাংলাদেশ চাইতাম চাই।',
  'ক্ষুদ দারিদ্রতামুক্ত বাংলাদেশ চাই'],
 ['ভালা মাইসর হাতে ভালা বাংলাদেশ চাইতাম চাই',
  'ভালো মানুষের হাতে বাংলাদেশ দেখতে চাই'],
 ['নিরাপদ ও শান্তি পূর্ণ বাংলাদেশ চাইতাম চাই ।',
  'নিরাপদ ও শান্তিপূন বাংলাদেশ দেখতে চাই'],
 ['আঁই এন ওগ্গো বাংলাদেশ দেইকতামচাই জেড়ে ধনী-গরীব, ক্ষমতাবান -নিচুবান হোন ভেদাভেদ থাইকতুনো',
  'আমি এমন একটা বাংলাদেশ দেখতে চাই যেখানে ধণী গরিব, ক্ষমতাবান নিচু কোন ভেদাভেদ থাকবে না'],
 ['উঁচুজাত-নিচুজাত কোন ভেদাভেদ থাইকতুনো;',
  'উচু নিচু জাত কোন ভেদাভেদ থাকবে না'],
 ['নামাযত জেইল্লে ধনী-গরীব, মালিক-চাকর একফুয়ারে কাতারবন্দি অই থিয়েই এল্লে বিয়াগ্গুনে এক সমান থাকিবু,',
  'নামাজে যেভাবে ধনী গরীব মালিক চাকর একসাথে এক কাতারে দাড়াই  সেভাবে সমানে থাকতে চাই'],
 ['দেশত শৃঙ্খলা থাকিবু কিয়র হক কিউ মারি মারি খাইতু নো,',
  'দেশে শৃঙ্খলা থাকবে কে্উ কারো হক মারি খেতে পা

In [16]:
train_chattogram = [i[0] for i in data[:800]]
val_chattogram = [i[0] for i in data[800:]]
print('train len: ',len(train_chattogram))
print(' val len: ',len(val_chattogram))

train len:  800
 val len:  200


In [17]:
val_chattogram[:10]

['আই যাইর গই',
 'হন্ডে যর দে',
 'আই বারিত যাইর',
 'তুই ন যাই য়',
 'তুয়ার কী অইয়ে',
 'আর মাথা ঘুরের',
 'এহন কেন লার',
 'এই হাম হনে গরিল',
 'আই গরগিদি',
 'অবা তুই কেন আছ']

In [18]:
train_bangla = [i[1] for i in data[:800]]
val_bangla = [i[1] for i in data[800:]]
print(len(train_bangla))
print(len(val_bangla))

800
200


In [19]:
val_bangla[:10]

['আমি চলে যাচ্ছি ',
 'কোথায় যাচ্ছ',
 'আমি বাড়িতে যাচ্ছি ',
 'তুমি যেয়োনা',
 'তোমার কী হয়েছে ',
 'আমার মাথা ঘুরতেছে',
 'এখন কেমন লাগতেছে ',
 'এই কাজ কে করছে',
 'আমি করছি ',
 'ও বাবা তুমি কেমন আছো ']

### Save Chattogram and standard Bangla text to files

In [20]:
import os
process_data_path = "/content/drive/MyDrive/Chattogram_language/data/process_data"
# os.makedirs(process_data_path, exits_ok= True)
os.makedirs(process_data_path, exist_ok = True)

In [21]:
def write_txt_file(file_path, data, encoding="utf-8"):
    """Write one sentence per line to ``file_path``, overwriting any existing file.

    Args:
        file_path: Destination path of the text file.
        data: Iterable of strings. An item that is a list contributes only
            its first element.
        encoding: Text encoding of the output file.
    """
    # The encoding must be passed through explicitly: without it ``open`` falls
    # back to the platform default, which is not guaranteed to handle Bengali.
    with open(file_path, 'w', encoding=encoding) as f:
        for key in data:
            if isinstance(key, list):
                key = key[0]
            f.write(key + "\n")

In [22]:
write_txt_file(os.path.join(process_data_path, "chattogram_data.txt"), train_chattogram)

In [23]:
write_txt_file(os.path.join(process_data_path,"bangla_data.txt"), train_bangla)

In [24]:
def merge_data_write_txt_file(file_path, chattogram_data, bangla_data, encoding="utf-8"):
    """Write aligned sentence pairs to ``file_path`` as ``chattogram<TAB>bangla`` lines.

    Args:
        file_path: Destination path; an existing file is overwritten.
        chattogram_data: Iterable of Chattogram sentences.
        bangla_data: Iterable of the matching standard-Bangla sentences.
            Pairs are formed with ``zip``, so extra items in the longer
            iterable are ignored.
        encoding: Text encoding of the output file.
    """
    # Honour the ``encoding`` argument instead of the platform default.
    with open(file_path, 'w', encoding=encoding) as f:
        for chattogram, bangla in zip(chattogram_data, bangla_data):
            f.write(chattogram + "\t" + bangla + "\n")

In [25]:
merge_data_write_txt_file(os.path.join(process_data_path, "merge_data.txt"), train_chattogram, train_bangla)

In [26]:
model_path = "/content/drive/MyDrive/Chattogram_language/model"
os.makedirs(model_path, exist_ok = True)

## 5. Generate Tokenizer and Vocab File

In [27]:
import sentencepiece as spm

def train_tokenizer(text_path="text.txt", model_prefix="model/chattogram_model", vocab_size=30000):
    """Train a SentencePiece model on ``text_path`` and load the trained model.

    Args:
        text_path: Plain-text training file, one sentence per line.
        model_prefix: Output prefix passed to the SentencePiece trainer.
        vocab_size: Target vocabulary size passed to the trainer.

    Returns:
        A ``SentencePieceProcessor`` loaded from ``<model_prefix>.model``.
    """
    spm.SentencePieceTrainer.train(f'--input={text_path} --model_prefix={model_prefix} --user_defined_symbols=<sep>,<cls> --vocab_size={vocab_size}')
    # Load the model that was just trained. The path used to be hard-coded to
    # ``chattogram_model.model`` under the global ``model_path``, so training
    # the Bangla tokenizer reloaded the Chattogram model instead.
    bn_sp = spm.SentencePieceProcessor()
    bn_sp.load(f'{model_prefix}.model')
    return bn_sp


In [28]:
chattogram_data_path = "/content/drive/MyDrive/Chattogram_language/data/process_data/chattogram_data.txt"
bangla_data_path = "/content/drive/MyDrive/Chattogram_language/data/process_data/bangla_data.txt"

### Generate Chattogram and standard bangla tokenizer

In [30]:
train_tokenizer(
    text_path = chattogram_data_path,
    model_prefix = "/content/drive/MyDrive/Chattogram_language/model/chattogram_model",
    vocab_size = 968
)

In [33]:
train_tokenizer(
    text_path = bangla_data_path,
    model_prefix = "/content/drive/MyDrive/Chattogram_language/model/bangla_model",
    vocab_size = 850
)

In [35]:
chattogram_tokenizer = spm.SentencePieceProcessor(model_file='/content/drive/MyDrive/Chattogram_language/model/chattogram_model.model')
bangla_tokenizer = spm.SentencePieceProcessor(model_file='/content/drive/MyDrive/Chattogram_language/model/bangla_model.model')

In [36]:
chattogram_tokenizer.encode("এত টিয়া হডে পাইয়ুম ")

[89, 26, 167, 43, 207]

In [37]:
bangla_tokenizer.encode("আমি আবার বিয়ে করেছি।")

[16, 77, 94, 83, 23, 251, 7]

In [38]:
print(chattogram_tokenizer.encode_as_pieces('এত টিয়া হডে পাইয়ুম '))
print(chattogram_tokenizer.encode_as_ids('এত টিয়া হডে পাইয়ুম '))

print(bangla_tokenizer.encode_as_pieces('এত টিয়া হডে পাইয়ুম '))
print(bangla_tokenizer.encode_as_ids('এত টিয়া হডে পাইয়ুম '))

['▁এত', '▁টিয়া', '▁হডে', '▁প', 'াইয়ুম']
[89, 26, 167, 43, 207]
['▁এত', '▁', 'টি', 'য়া', '▁হ', 'ড', 'ে', '▁পাই', 'য়', 'ু', 'ম']
[374, 10, 84, 662, 58, 620, 6, 521, 44, 99, 181]


### Generate and save vocab files

In [39]:
from torchtext.vocab import vocab
def build_vocab(sentences, tokenizer):
    """Build a torchtext vocabulary from the tokenizer pieces of ``sentences``.

    Args:
        sentences: Iterable of strings. An item that is a list contributes
            only its first element.
        tokenizer: Object whose ``encode(text, out_type=str)`` returns the
            list of string pieces for ``text``.

    Returns:
        A vocabulary with the specials ``<unk>``, ``<pad>``, ``<bos>``,
        ``<eos>`` placed first, followed by the counted pieces. Unknown
        pieces map to ``<unk>``.
    """
    counter = Counter()
    for sentence in sentences:
        if isinstance(sentence, list):
            sentence = sentence[0]
        counter.update(tokenizer.encode(sentence, out_type=str))
    print(counter)
    built = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], special_first=True)
    # Without a default index, looking up a piece that never occurred in
    # ``sentences`` raises; route such pieces to <unk> instead.
    built.set_default_index(built['<unk>'])
    return built


In [40]:
torchtext.__version__

'0.17.1+cpu'

In [41]:
print('input sentence: ',train_chattogram[0])
chattogram_tokenizer.encode(train_chattogram[0])

input sentence:  অনেরা কডে


[442, 11, 80]

### Merge train and val to generate a vocab

In [42]:
full_chattogram=train_chattogram+val_chattogram
full_bangla=train_bangla+val_bangla
print(len(full_chattogram))
print(len(full_bangla))

1000
1000


In [43]:
full_chattogram_vocab = build_vocab(full_chattogram, chattogram_tokenizer)
print('len of full chattogram vocab: ',len(full_chattogram_vocab))

Counter({'র': 274, 'ত': 211, '▁ন': 164, '।': 158, 'ে': 150, '?': 141, 'া': 130, 'ন': 111, '▁আই': 104, ',': 99, 'ি': 90, '▁আর': 85, 'ম': 84, '▁না': 75, '▁': 66, '▁আ': 61, 'রে': 60, 'স': 58, '▁তুই': 58, 'য়': 56, '▁ব': 52, '▁দে': 52, '▁গ': 51, '▁হ': 51, 'ই': 50, 'ের': 50, '▁ভা': 49, '▁কী': 48, '▁এ': 47, 'ার': 47, 'ো': 47, 'ষ': 46, '▁টিয়া': 42, '▁কি': 42, 'ং': 42, '▁বা': 41, '্': 41, '▁প': 40, 'ুন': 39, 'ক': 39, '▁মা': 39, '▁অ': 37, '▁নাই': 36, '▁স': 36, '▁আরা': 35, 'ল': 34, '▁কিছু': 34, '▁গরি': 33, '▁দ': 32, '▁ক': 32, 'ু': 32, 'দে': 31, '▁চাই': 31, 'লা': 31, 'শ': 31, '▁তোয়ার': 30, '▁ত': 30, '▁গর': 29, '▁হন': 29, '▁চ': 28, '▁হই': 28, 'রি': 27, 'ব': 27, '▁কা': 27, 'ত্ত': 26, '▁শ': 25, 'দ': 25, '▁হথা': 25, 'ী': 24, '▁দাম': 24, '▁খ': 23, '▁ইবা': 23, '▁ম': 23, '▁ল': 23, '▁আরার': 23, 'গ': 23, '▁ও': 22, '▁সা': 22, '▁কেন': 21, 'ট': 21, 'লে': 21, '▁য': 20, '▁আরে': 20, '▁দি': 20, '▁ফ': 20, '▁চিটা': 20, 'উ': 19, '▁আছে': 19, 'াই': 19, '▁ভালা': 19, '্র': 19, '▁মন': 18, '▁এই': 18, '্যা': 17, '▁নাম':

In [44]:
# Vocabulary over the standard-Bangla side (train + val sentences).
full_bangla_vocab = build_vocab(full_bangla, bangla_tokenizer)
# The label previously said "chattogram" although this is the Bangla vocab.
print('len of full bangla vocab: ',len(full_bangla_vocab))

Counter({'র': 170, 'ে': 161, '?': 144, '▁না': 124, '।': 118, 'ি': 112, 'ের': 95, 'ন': 95, '▁আমি': 94, 'তে': 87, '▁': 87, 'কে': 84, '▁কী': 75, 'া': 72, '▁আমা': 69, 'ত': 67, 'রা': 65, 'ো': 61, '▁ভা': 59, 'ক': 57, '▁অ': 55, 'স': 54, '▁তুমি': 53, '▁ব': 51, '▁কথা': 49, '▁ভালো': 48, '▁টাকা': 45, 'য়': 45, 'নে': 43, '▁কেন': 42, 'ই': 41, '▁মানুষ': 39, '▁হ': 38, 'ী': 36, ',': 36, '▁কিছু': 36, '▁বল': 35, 'ষা': 35, 'ার': 34, '▁মা': 33, '▁কোথায়': 32, '▁করে': 32, '▁শ': 32, '▁আমাদের': 32, 'ব': 32, '▁হবে': 31, '▁কো': 31, '▁স': 30, 'লে': 30, 'ম': 30, '▁এই': 28, '▁চট্টগ্রাম': 28, '▁যে': 27, 'লা': 27, '▁এটা': 27, 'ং': 27, 'তি': 27, '▁আর': 26, 'ল': 26, '▁এ': 25, '▁সব': 25, '▁ম': 25, '▁কে': 25, '▁আম': 25, '▁বা': 25, '▁ছেলে': 24, '▁তোমার': 24, '▁দাম': 24, 'ু': 24, '▁নেই': 24, '্': 23, '▁ও': 23, '▁ন': 23, 'টি': 23, 'দ': 23, '▁নাই': 22, '▁কেমন': 22, '▁কি': 22, '▁কাজ': 22, 'শ': 22, 'টা': 21, '▁পড়': 21, '▁কর': 21, '▁এক': 21, '▁এখন': 20, '▁বাড়ি': 20, 'ছ': 20, '▁সে': 20, '▁আ': 20, '▁আস': 20, '▁আছে': 19, '▁চল'

## 6. Generate Train and Val Data Loader

In [45]:
def data_process(chattogram, bangla):
    """Convert aligned sentence lists into ``(source_ids, target_ids)`` tensor pairs.

    Each sentence is split into pieces by its language's tokenizer and each
    piece is mapped to its index in the matching notebook-level vocabulary.
    Returns a list of tuples of 1-D ``torch.long`` tensors.
    """
    def _to_ids(text, tokenizer, vocabulary):
        # String pieces -> vocabulary indices -> 1-D long tensor.
        pieces = tokenizer.encode(text, out_type=str)
        return torch.tensor([vocabulary[piece] for piece in pieces], dtype=torch.long)

    return [
        (
            _to_ids(raw_chattogram, chattogram_tokenizer, full_chattogram_vocab),
            _to_ids(raw_bangla, bangla_tokenizer, full_bangla_vocab),
        )
        for raw_chattogram, raw_bangla in zip(chattogram, bangla)
    ]

In [46]:
train_data = data_process(train_chattogram, train_bangla)

In [47]:
train_data[:10]

[(tensor([4, 5, 6]), tensor([4, 5, 6])),
 (tensor([ 7,  8,  9, 10, 11]), tensor([ 7,  8,  9, 10, 11, 12])),
 (tensor([12, 13, 14, 15, 16, 17]), tensor([13, 14, 15, 16,  5, 17])),
 (tensor([18,  5, 19, 20,  5]), tensor([18, 19])),
 (tensor([21,  5, 22, 23]), tensor([20, 21])),
 (tensor([24, 23, 25]), tensor([22, 23, 24])),
 (tensor([26, 27, 28]), tensor([25, 26, 20])),
 (tensor([29, 30, 31]), tensor([27, 28, 29, 30])),
 (tensor([ 4, 30, 32, 33]), tensor([31, 28, 32, 23, 33])),
 (tensor([34, 35, 36]), tensor([34, 35, 36]))]

In [48]:
val_data = data_process(val_chattogram, val_bangla)

In [49]:
val_data[:10]

[(tensor([ 56, 374,  14,  78]), tensor([ 34, 106,  59, 670, 236])),
 (tensor([761,  24,  14,  40]), tensor([  6, 670])),
 (tensor([ 56, 380,  27, 374,  14]), tensor([ 34,  81,  23, 670, 236])),
 (tensor([218,  48, 374, 103, 243]), tensor([ 37,  22, 124, 440])),
 (tensor([121, 449, 297]), tensor([193,  35, 364])),
 (tensor([302, 397, 561,  60,  14]),
  tensor([394,  67,  40, 779, 534,  67, 762])),
 (tensor([863,  75, 722]), tensor([ 44,  72, 733])),
 (tensor([365, 723, 643,  30,   9, 168]), tensor([181, 408, 249, 604])),
 (tensor([ 56, 411, 548, 827]), tensor([ 34, 293, 248])),
 (tensor([445,  16, 218,  75, 760]), tensor([263, 307,  37,  72, 711]))]

In [50]:
BATCH_SIZE = 150
PAD_IDX = full_chattogram_vocab['<pad>']
BOS_IDX = full_chattogram_vocab['<bos>']
EOS_IDX = full_chattogram_vocab['<eos>']

def generate_batch(data_batch):
    """Collate ``(chattogram_ids, bangla_ids)`` pairs into two padded batches.

    Every sequence is wrapped as ``<bos> ... <eos>`` and the sequences of each
    side are padded with ``PAD_IDX`` via ``pad_sequence`` (default layout, so
    the sequence dimension comes first).
    """
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    def _wrap(ids):
        # torch.cat allocates a new tensor, so sharing bos/eos is safe.
        return torch.cat([bos, ids, eos], dim=0)

    wrapped = [(_wrap(source_ids), _wrap(target_ids)) for source_ids, target_ids in data_batch]
    chattogram_batch = pad_sequence([pair[0] for pair in wrapped], padding_value=PAD_IDX)
    bangla_batch = pad_sequence([pair[1] for pair in wrapped], padding_value=PAD_IDX)
    return chattogram_batch, bangla_batch

# Training batches are reshuffled every epoch.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True, collate_fn=generate_batch)
# Validation is not shuffled: `evaluate` averages per-batch mean losses, and the
# 200 validation pairs split into unequal batches (150 + 50), so a shuffled
# loader makes the reported validation loss vary for the same model.
val_iter = DataLoader(val_data, batch_size=BATCH_SIZE,shuffle=False, collate_fn=generate_batch)

In [51]:
for batch in val_iter:
  print(batch[0][0])

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2])


## 7. Implement Sequence 2 Sequence (Seq2Seq) Model using Transformer

In [52]:
from torch.nn import (TransformerEncoder, TransformerDecoder,
                      TransformerEncoderLayer, TransformerDecoderLayer)


class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target-vocabulary logits.

    The encoder/decoder layers are built without ``batch_first``, so all
    sequence tensors are sequence-first. The number of attention heads is read
    from the notebook-level constant ``NHEAD``, which must be defined before
    the model is instantiated.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        emb_size: Embedding / model dimension (``d_model``).
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        dim_feedforward: Hidden size of each layer's feed-forward sub-network.
        dropout: Dropout probability used by the positional encoding and by
            every encoder/decoder layer.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # `dropout` is forwarded to the layers as well; previously only the
        # positional encoding honoured it and the layers silently kept their
        # own default (which equals this parameter's default of 0.1).
        encoder_layer = TransformerEncoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_encoder = TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
            )
        decoder_layer = TransformerDecoderLayer(
            d_model=emb_size,
            nhead=NHEAD,
            dim_feedforward=dim_feedforward,
            dropout=dropout
            )
        self.transformer_decoder = TransformerDecoder(
            decoder_layer,
            num_layers=num_decoder_layers
            )

        # Projects decoder states onto target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Encode ``src``, decode ``trg`` against the encoder memory, return logits.

        ``src_mask`` / ``tgt_mask`` are the attention masks of the encoder and
        the decoder self-attention; the ``*_padding_mask`` arguments are the
        key-padding masks. No mask is applied to the decoder's attention over
        the encoder memory other than ``memory_key_padding_mask``.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on ``src`` and return its output (the memory)."""
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on ``tgt`` against ``memory``.

        Returns decoder states; apply ``self.generator`` to obtain logits.
        """
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [53]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position signal to token embeddings, then applies dropout.

    The table of shape ``(maxlen, 1, emb_size)`` is stored as the buffer
    ``pos_embedding``; even feature columns hold sines, odd columns cosines.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super().__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        # One frequency per (sin, cos) column pair.
        inv_freq = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # The singleton middle axis lets the table broadcast over that
        # dimension of the incoming embeddings.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Add the first ``token_embedding.size(0)`` position rows and apply dropout."""
        seq_len = token_embedding.size(0)
        with_positions = token_embedding + self.pos_embedding[:seq_len, :]
        return self.dropout(with_positions)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings of ``tokens`` (cast to long first)."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale

In [54]:
def generate_square_subsequent_mask(sz):
    """Return the ``(sz, sz)`` float causal mask on the notebook-level ``device``.

    Entries on and below the diagonal are ``0.0``; entries strictly above the
    diagonal are ``-inf``.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Build attention and padding masks for one sequence-first batch.

    Returns ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
    an all-False boolean source mask, the causal target mask, and two boolean
    padding masks (True where the token equals ``PAD_IDX``) transposed so the
    first axis of ``src`` / ``tgt`` becomes the second.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # All-False source mask; causal mask for the target side.
    src_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    def _padding_mask(tokens):
        return (tokens == PAD_IDX).transpose(0, 1)

    return src_mask, tgt_mask, _padding_mask(src), _padding_mask(tgt)

## 8. Define Train Parameter for Seq2Seq Model

In [55]:
from tqdm import tqdm
# Vocabulary sizes determine the embedding tables and the output projection.
SRC_VOCAB_SIZE = len(full_chattogram_vocab)
TGT_VOCAB_SIZE = len(full_bangla_vocab)
EMB_SIZE = 512
# NHEAD is read as a global inside Seq2SeqTransformer.__init__, so it must be
# defined before the model is constructed below.
NHEAD = 8
FFN_HID_DIM = 512
# Same value as the BATCH_SIZE used when the DataLoaders were created; this
# reassignment does not affect loaders that already exist.
BATCH_SIZE = 150
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
NUM_EPOCHS = 300


transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

# Xavier-initialise every parameter with more than one dimension;
# one-dimensional parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Targets equal to PAD_IDX are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def train_epoch(model, train_iter, optimizer):
    """Run one optimisation pass over ``train_iter``.

    Uses the notebook-level ``device``, ``create_mask`` and ``loss_fn``.
    Returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for source, target in train_iter:
        source, target = source.to(device), target.to(device)

        # Teacher forcing: the decoder is fed every target token except the
        # last and is scored against every target token except the first.
        decoder_in, decoder_gold = target[:-1, :], target[1:, :]

        src_mask, tgt_mask, src_pad, tgt_pad = create_mask(source, decoder_in)
        logits = model(source, decoder_in, src_mask, tgt_mask,
                       src_pad, tgt_pad, src_pad)

        optimizer.zero_grad()
        batch_loss = loss_fn(logits.reshape(-1, logits.shape[-1]),
                             decoder_gold.reshape(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_iter)




In [56]:
def evaluate(model, val_iter):
    """Compute the mean per-batch loss of ``model`` over ``val_iter``.

    Puts the model in eval mode and uses the notebook-level ``device``,
    ``create_mask`` and ``loss_fn``. No parameters are updated.
    """
    model.eval()
    losses = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every validation batch.
    with torch.no_grad():
        for src, tgt in val_iter:
            src = src.to(device)
            tgt = tgt.to(device)

            # Decoder input drops the last token; the gold output drops the first.
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
    return losses / len(val_iter)

## 9. Train Seq2Seq Transformer Model

In [57]:
for epoch in range(1, NUM_EPOCHS+1):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_iter, optimizer)
    val_loss = evaluate(transformer, val_iter)
    end_time = time.time()
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, val loss : {val_loss:.3f} "
          f"Epoch time = {(end_time - start_time):.3f}s"))


    # Overwrite the rolling checkpoint after every epoch so that an interrupted
    # run can be resumed. Record the epoch that actually finished: storing
    # NUM_EPOCHS here made every intermediate checkpoint claim to be the final one.
    torch.save({
      'epoch': epoch,
      'model_state_dict': transformer.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'loss': train_loss,
      }, '/content/drive/MyDrive/Chattogram_language/model/model_checkpoint.tar')



Epoch: 1, Train loss: 6.408, val loss : 6.293 Epoch time = 2.869s
Epoch: 2, Train loss: 5.893, val loss : 6.114 Epoch time = 1.190s
Epoch: 3, Train loss: 5.751, val loss : 6.099 Epoch time = 1.143s
Epoch: 4, Train loss: 5.675, val loss : 5.997 Epoch time = 1.215s
Epoch: 5, Train loss: 5.634, val loss : 5.995 Epoch time = 1.186s
Epoch: 6, Train loss: 5.566, val loss : 5.932 Epoch time = 1.105s
Epoch: 7, Train loss: 5.507, val loss : 5.912 Epoch time = 1.120s
Epoch: 8, Train loss: 5.378, val loss : 5.813 Epoch time = 1.196s
Epoch: 9, Train loss: 5.277, val loss : 5.768 Epoch time = 1.102s
Epoch: 10, Train loss: 5.149, val loss : 5.817 Epoch time = 1.179s
Epoch: 11, Train loss: 5.007, val loss : 5.783 Epoch time = 1.146s
Epoch: 12, Train loss: 4.890, val loss : 5.756 Epoch time = 1.220s
Epoch: 13, Train loss: 4.752, val loss : 5.857 Epoch time = 1.222s
Epoch: 14, Train loss: 4.648, val loss : 5.752 Epoch time = 1.108s
Epoch: 15, Train loss: 4.507, val loss : 5.724 Epoch time = 1.211s
Epoc

## 10. Evaluate Model

In [58]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [59]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one source sequence.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to the notebook-level ``device``.
        src_mask: Encoder attention mask for ``src``.
        max_len: Upper bound on the output length, start symbol included.
        start_symbol: Token id used to seed the output.

    Returns:
        A long tensor of shape ``(length, 1)`` that starts with
        ``start_symbol`` and stops after ``EOS_IDX`` is produced or once
        ``max_len`` tokens exist.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Pure inference: skip autograd bookkeeping for the encoder pass and every
    # decoding step.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for _ in range(max_len - 1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position and take the arg-max token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate the sentence ``src`` with greedy decoding and return the text.

    Args:
        model: Trained model; it is switched to eval mode here.
        src: Source sentence as a string.
        src_vocab: Source vocabulary providing ``get_stoi()``.
        tgt_vocab: Target vocabulary providing ``get_itos()``.
        src_tokenizer: Tokenizer whose ``encode(text, out_type=str)`` returns
            string pieces.

    Returns:
        The decoded sentence with ``<bos>`` / ``<eos>`` removed and pieces
        re-joined into space-separated words.

    Raises:
        KeyError: If a source piece is missing from ``src_vocab`` and the
            vocabulary has no ``<unk>`` entry to fall back on.
    """
    model.eval()
    # Fetch the lookup tables once instead of once per token.
    stoi = src_vocab.get_stoi()
    itos = tgt_vocab.get_itos()
    unk_idx = stoi.get('<unk>')

    def _lookup(piece):
        # Pieces unseen when the vocabulary was built fall back to <unk>.
        if piece in stoi:
            return stoi[piece]
        if unk_idx is None:
            raise KeyError(piece)
        return unk_idx

    pieces = src_tokenizer.encode(src, out_type=str)
    tokens = [BOS_IDX] + [_lookup(piece) for piece in pieces] + [EOS_IDX]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    p_text = " ".join([itos[tok] for tok in tgt_tokens.tolist()]).replace("<bos>", "").replace("<eos>", "")
    # Glue the pieces together, then split on the word-boundary marker "▁".
    pts = " ".join(p_text.replace(" ", "").split("▁"))
    return pts.strip()

In [60]:
# for i in data[:10]:
text = "বালা অতে টিয়া ন লাগে"
pre = translate(transformer, text, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
print(f"input : {text}")
print(f"prediction: {pre}")

input : বালা অতে টিয়া ন লাগে
prediction: ভালো হতে টাকা পয়সা লাগে না


In [None]:
# itos = ja_vocab.itos()

## 11. Save Vocab and PyTorch model in Local Storage

In [61]:
import pickle
full_chattogram_vocab_pickle_file='/content/drive/MyDrive/Chattogram_language/model/full_chattogram_vocab.pkl'
full_bangla_vocab_pickle_file='/content/drive/MyDrive/Chattogram_language/model/full_bangla_vocab.pkl'
# Persist both vocabularies so inference can map pieces to the same indices the
# model was trained with. The context managers close each file even if
# pickle.dump raises.
with open(full_chattogram_vocab_pickle_file, 'wb') as file:
    pickle.dump(full_chattogram_vocab, file)
with open(full_bangla_vocab_pickle_file, 'wb') as file:
    pickle.dump(full_bangla_vocab, file)

In [62]:
# save model + checkpoint to resume training later
torch.save({
  'epoch': NUM_EPOCHS,
  'model_state_dict': transformer.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),
  'loss': train_loss,
  }, '/content/drive/MyDrive/Chattogram_language/model/seq2seq_chattogram_to_standard_bangla_model_1000_data.pt')

## 13. Inference
This section is the inference script: it loads the trained SentencePiece tokenizer models, the pickled vocabularies, and the trained model.

In [63]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [64]:
chattogram_model_path='/content/drive/MyDrive/Chattogram_language/model/chattogram_model.model'
bangla_model_path='/content/drive/MyDrive/Chattogram_language/model/bangla_model.model'
chattogram_tokenizer = spm.SentencePieceProcessor(model_file=chattogram_model_path)
bangla_tokenizer = spm.SentencePieceProcessor(model_file=bangla_model_path)

In [65]:
chattogram_vocab_file_path='/content/drive/MyDrive/Chattogram_language/model/full_chattogram_vocab.pkl'
# NOTE: unpickling can execute arbitrary code; only load vocabulary files that
# this notebook produced. The context manager closes the file even on error.
with open(chattogram_vocab_file_path, 'rb') as file:
    chattogram_vocab = pickle.load(file)

In [66]:
bangla_vocab_file_path='/content/drive/MyDrive/Chattogram_language/model/full_bangla_vocab.pkl'
# NOTE: unpickling can execute arbitrary code; only load vocabulary files that
# this notebook produced. The context manager closes the file even on error.
with open(bangla_vocab_file_path, 'rb') as file:
    bangla_vocab = pickle.load(file)
# `bangla_vocal` is the original (misspelled) name; keep it bound so any cell
# that still refers to it continues to work.
bangla_vocal = bangla_vocab

In [67]:
PATH = "/content/drive/MyDrive/Chattogram_language/model/seq2seq_chattogram_to_standard_bangla_model_1000_data.pt"

# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the saved weights into it.
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
model.to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only runtime.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# NOTE(review): `optimizer` was created over `transformer.parameters()` in the
# training section, not over this `model`; confirm before resuming training here.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()



Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )

In [68]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one source sequence.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to the notebook-level ``device``.
        src_mask: Encoder attention mask for ``src``.
        max_len: Upper bound on the output length, start symbol included.
        start_symbol: Token id used to seed the output.

    Returns:
        A long tensor of shape ``(length, 1)`` that starts with
        ``start_symbol`` and stops after ``EOS_IDX`` is produced or once
        ``max_len`` tokens exist.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Pure inference: skip autograd bookkeeping for the encoder pass and every
    # decoding step.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for _ in range(max_len - 1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position and take the arg-max token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate one source sentence using greedy decoding.

    The sentence is split into pieces by ``src_tokenizer``, mapped to ids
    with ``src_vocab``, wrapped in ``BOS_IDX``/``EOS_IDX`` and decoded with
    ``greedy_decode`` (at most ``num_tokens + 5`` output tokens). The output
    ids are mapped back through ``tgt_vocab`` and the pieces are merged into
    plain text.

    The model's train/eval mode is not changed here; call ``model.eval()``
    beforehand so that dropout is disabled during decoding.

    Args:
        model: Model handed to ``greedy_decode``.
        src: Source sentence (string) passed to ``src_tokenizer.encode``.
        src_vocab: Vocabulary whose ``get_stoi()`` maps source pieces to ids.
        tgt_vocab: Vocabulary whose ``get_itos()`` maps ids to target pieces.
        src_tokenizer: Tokenizer providing ``encode(text, out_type=str)``.

    Returns:
        The decoded sentence as a string with ``<bos>``/``<eos>`` removed,
        "▁" markers turned into spaces, and surrounding whitespace stripped.

    Raises:
        KeyError: If a source piece is missing from ``src_vocab.get_stoi()``.
    """
    # Fetch each lookup table once instead of once per token.
    stoi = src_vocab.get_stoi()
    pieces = src_tokenizer.encode(src, out_type=str)
    tokens = [BOS_IDX] + [stoi[tok] for tok in pieces] + [EOS_IDX]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    itos = tgt_vocab.get_itos()
    p_text = " ".join(itos[tok] for tok in tgt_tokens.tolist()).replace("<bos>", "").replace("<eos>", "")
    # Remove the joining spaces first, then turn every "▁" marker into a space.
    pts = p_text.replace(" ", "").replace("▁", " ")
    return pts.strip()

## 14. Generate Samples with Model

In [70]:
# Print input, reference and model prediction for the last 10 samples of `data`.
for sample in data[-10:]:
    predicted = translate(model, sample[0], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(
        f"input : {sample[0]}",
        f"Ground Truth : {sample[1]}",
        f"prediction: {predicted}",
        "================================\n",
        sep="\n",
    )

input : সুন্দর আর পইজ্জ ফরিষ্কার বাংলাদেশ চাই।
Ground Truth : সুন্দর ও পরিষ্কার বাংলাদেশ চাই
prediction: যদি এক কোন আক্লেল জ্ঞান নেই

input : যেডে মাইনসে স্সথির ভাত খাইত তারিবু
Ground Truth : যেখানে মানুষ শান্তিতে ভাত খেতে পারবে
prediction: রিয়াজ উদ্দিন বাজোরে গিয়ে লাল জামা কিনতে হবে।

input : খুদা আর দারিদ্রতা মুক্ত বাংলাদেশ চাইতাম চাই।
Ground Truth : ক্ষুদ দারিদ্রতামুক্ত বাংলাদেশ চাই
prediction: দুই শত টাকা দিয়ে আর কত রকময় নিও

input : ভালা মাইসর হাতে ভালা বাংলাদেশ চাইতাম চাই
Ground Truth : ভালো মানুষের হাতে বাংলাদেশ দেখতে চাই
prediction: ভালোর দাম নেই

input : নিরাপদ ও শান্তি পূর্ণ বাংলাদেশ চাইতাম চাই ।
Ground Truth : নিরাপদ ও শান্তিপূন বাংলাদেশ দেখতে চাই
prediction: ড্রাইভারে কোন আক্লেল জ্ঞান নেই

input : আঁই এন ওগ্গো বাংলাদেশ দেইকতামচাই জেড়ে ধনী-গরীব, ক্ষমতাবান -নিচুবান হোন ভেদাভেদ থাইকতুনো
Ground Truth : আমি এমন একটা বাংলাদেশ দেখতে চাই যেখানে ধণী গরিব, ক্ষমতাবান নিচু কোন ভেদাভেদ থাকবে না
prediction: আশাা ছিলো আপনাকে নিয়ে সুখের ঘর বাধঁবো

input : উঁচুজাত-নিচুজাত কোন ভেদাভেদ 

In [71]:
# Print input, reference and model prediction for the first 20 samples of `data`.
for sample in data[:20]:
    predicted = translate(model, sample[0], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(
        f"input : {sample[0]}",
        f"Ground Truth : {sample[1]}",
        f"prediction: {predicted}",
        "================================\n",
        sep="\n",
    )

input : অনেরা কডে
Ground Truth : আপনারা কোথায়
prediction: আপনারা কোথায়

input : অলপল গরি এইজ্জু
Ground Truth : আগোছালো করে রাখসো
prediction: আগোছালো করে রাখসো

input : ফ্ইর বানাইবা ফানলার
Ground Truth : ভিক্ষুক বানাবা মনেহয়
prediction: ভিক্ষুক বানাবা মনেহয়

input : আজিয়া বিইয়্যা
Ground Truth : আজকে বিয়ে
prediction: আজকে বিয়ে

input : কালিয়া মেজ্জান
Ground Truth : কালকে মেজবান
prediction: কালকে মেজবান

input : যন নাই
Ground Truth : যেতে নাই
prediction: যেতে নাই

input : গত খাইল
Ground Truth : গত কালকে
prediction: গত কালকে

input : ইতারে ধর
Ground Truth : তাকে ধরো
prediction: তাকে ধরো

input : অনেরে তোয়ারদে
Ground Truth : আপনাকে খুঁজতেসে
prediction: আপনাকে খুঁজতেসে

input : আ্যাই কিত্তাম
Ground Truth : আমি কী করবো
prediction: আমি কী করবো

input : তুউই এন্ডে দে
Ground Truth : তুমি এখানে যে
prediction: তুমি এখানে যে

input : গোসসা গরি ফারিবানা
Ground Truth : রাগ করে পারবেন কী
prediction: রাগ করে পারবেন কী

input : ঐইন্না ন খাইয়ুম
Ground Truth : এখন খাবো না
prediction: এখন খাবো না

input

## 15. Save the Inference Result into CSV File

In [74]:
# Translate up to the first 1000 samples of `data` and collect the source
# sentence, the reference and the model prediction for each one.
# (pandas is already imported as `pd` in the notebook's import cell.)
data_list = []
for sample in data[:1000]:  # slicing avoids an IndexError when `data` has fewer than 1000 samples
    input_text = sample[0]  # Chattogram-language source sentence
    ground_truth = sample[1]  # Reference Standard Bangla sentence
    prediction = translate(model, input_text, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)  # Model output for the source sentence

    data_list.append({
        'Input Text': input_text,
        'Ground Truth': ground_truth,
        'Prediction': prediction
    })

# Build one DataFrame from the collected records.
# NOTE: this rebinds `df`, replacing the dataset frame loaded at the top of the notebook.
df = pd.DataFrame(data_list)

# Write the predictions to Google Drive without the index column.
csv_file_path='/content/drive/MyDrive/Chattogram_language/logs/predictions.csv'
df.to_csv(csv_file_path, index=False)

print("CSV file saved successfully.")


CSV file saved successfully.


In [75]:
# Print input, reference and model prediction for the first 20 samples of `data`.
for sample in data[:20]:
    predicted = translate(model, sample[0], full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)
    print(
        f"input : {sample[0]}",
        f"Ground Truth : {sample[1]}",
        f"prediction: {predicted}",
        "================================\n",
        sep="\n",
    )

input : অনেরা কডে
Ground Truth : আপনারা কোথায়
prediction: আপনারা কোথায়

input : অলপল গরি এইজ্জু
Ground Truth : আগোছালো করে রাখসো
prediction: আগোছালো করে রাখসো

input : ফ্ইর বানাইবা ফানলার
Ground Truth : ভিক্ষুক বানাবা মনেহয়
prediction: ভিক্ষুক বানাবা মনেহয়

input : আজিয়া বিইয়্যা
Ground Truth : আজকে বিয়ে
prediction: আজকে বিয়ে

input : কালিয়া মেজ্জান
Ground Truth : কালকে মেজবান
prediction: কালকে মেজবান

input : যন নাই
Ground Truth : যেতে নাই
prediction: যেতে নাই

input : গত খাইল
Ground Truth : গত কালকে
prediction: গত কালকে

input : ইতারে ধর
Ground Truth : তাকে ধরো
prediction: তাকে ধরো

input : অনেরে তোয়ারদে
Ground Truth : আপনাকে খুঁজতেসে
prediction: আপনাকে খুঁজতেসে

input : আ্যাই কিত্তাম
Ground Truth : আমি কী করবো
prediction: আমি কী করবো

input : তুউই এন্ডে দে
Ground Truth : তুমি এখানে যে
prediction: তুমি এখানে যে

input : গোসসা গরি ফারিবানা
Ground Truth : রাগ করে পারবেন কী
prediction: রাগ করে পারবেন কী

input : ঐইন্না ন খাইয়ুম
Ground Truth : এখন খাবো না
prediction: এখন খাবো না

input

## 16. Calculate Model Performance

In [76]:
def calculate_accuracy(model, data, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer):
    """Compute the exact-match accuracy of ``translate`` over ``data``.

    A sample counts as correct only when the whitespace-stripped prediction
    is identical to the whitespace-stripped reference sentence.

    Args:
        model: Model passed through to ``translate``.
        data: Iterable of samples where ``sample[0]`` is the source sentence
            and ``sample[1]`` is the reference sentence.
        full_chattogram_vocab: Source vocabulary passed to ``translate``.
        full_bangla_vocab: Target vocabulary passed to ``translate``.
        chattogram_tokenizer: Source tokenizer passed to ``translate``.

    Returns:
        Fraction of samples translated exactly, as a float in ``[0, 1]``.
        Returns ``0.0`` when ``data`` is empty.
    """
    total = 0
    correct = 0

    for sample in data:
        input_text = sample[0]  # Source sentence to translate
        ground_truth = sample[1]  # Reference sentence
        prediction = translate(model, input_text, full_chattogram_vocab, full_bangla_vocab, chattogram_tokenizer)

        # Exact string match after trimming surrounding whitespace
        if prediction.strip() == ground_truth.strip():
            correct += 1
        total += 1

    # Guard against division by zero when there is nothing to score.
    if total == 0:
        return 0.0
    return correct / total


In [77]:
# Exact-match accuracy over every sample in `data`
accuracy = calculate_accuracy(
    model=model,
    data=data,
    full_chattogram_vocab=full_chattogram_vocab,
    full_bangla_vocab=full_bangla_vocab,
    chattogram_tokenizer=chattogram_tokenizer,
)
print("accuracy:", accuracy)

accuracy: 0.556


In [78]:
# Exact-match accuracy over the first 800 samples of `data` (reported as the train split)
train_accuracy = calculate_accuracy(
    model=model,
    data=data[:800],
    full_chattogram_vocab=full_chattogram_vocab,
    full_bangla_vocab=full_bangla_vocab,
    chattogram_tokenizer=chattogram_tokenizer,
)
print("train accuracy:", train_accuracy)

train accuracy: 0.69


In [79]:
# Exact-match accuracy over the samples from index 800 onward (reported as the validation split)
val_accuracy = calculate_accuracy(
    model=model,
    data=data[800:],
    full_chattogram_vocab=full_chattogram_vocab,
    full_bangla_vocab=full_bangla_vocab,
    chattogram_tokenizer=chattogram_tokenizer,
)
print("val accuracy:", val_accuracy)

val accuracy: 0.02
