# Initialization Steps




In [None]:
!nvidia-smi

Sun Jul  4 10:13:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#@title Installing Required Packages {form-width: "15%"}
!pip install pyonmttok
!pip install --upgrade nltk

Collecting pyonmttok
[?25l  Downloading https://files.pythonhosted.org/packages/90/73/034c3e0584322e3f3f03c0965c8a83df0ab7ae5ca65172203cd606ffe8ce/pyonmttok-1.26.4-cp37-cp37m-manylinux1_x86_64.whl (14.3MB)
[K     |████████████████████████████████| 14.3MB 7.6MB/s 
[?25hInstalling collected packages: pyonmttok
Successfully installed pyonmttok-1.26.4
Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/5e/37/9532ddd4b1bbb619333d5708aaad9bf1742f051a664c3c6fa6632a105fd8/nltk-3.6.2-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 7.7MB/s 
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2


In [None]:
#@title Unzipping the data {form-width: "15%"}
!unzip -qo "/content/drive/MyDrive/Academics/Courses/Deep Learning/CA4/Data/AFEC-merged-all.zip"
!unzip -qo "/content/drive/MyDrive/Academics/Courses/Deep Learning/CA4/Data/Test.zip"


In [None]:
with open("AFEC-merged.en") as orgfile:
    with open("AFEC-merged_uncased.en", "w") as wfile:
        wfile.write(
            orgfile.read().lower()
        )

with open("AFEC-merged.fa") as orgfile:
    with open("AFEC-merged_uncased.fa", "w") as wfile:
        wfile.write(
            orgfile.read().lower()
        )

In [None]:
#@title Importing Libraries { form-width: "15%" }
import gc
import sys
import os
import shutil
from pathlib import Path

import collections
import itertools
import h5py
import json 


import re

from tqdm.auto import tqdm, trange
from nltk import wordpunct_tokenize

import math
import numpy as np
import pandas as pd

from nltk.translate.nist_score import corpus_nist
from nltk.translate.bleu_score import corpus_bleu



from gensim.parsing.preprocessing import (strip_tags,
                                          strip_punctuation,
                                          strip_multiple_whitespaces,
                                          strip_numeric,
                                          remove_stopwords,
                                          strip_short)

from gensim.parsing.preprocessing import preprocess_string as gensim_preprocess

import pyonmttok


import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split


# Defining run time constants

In [None]:
# CUDA PARAMS
AMP_FLAG = False
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.backends.cudnn.benchmark = True

# DATA PARAMS
DATA_LOCATION = "/content/drive/MyDrive/Academics/Courses/Deep Learning/CA4/Data"
OUTPUT_LOCATION = "/content/drive/MyDrive/Academics/Courses/Deep Learning/CA4/Output"
MODEL_SAVE_LOCATION = "/content/drive/MyDrive/Academics/Courses/Deep Learning/CA4/models"

Path(OUTPUT_LOCATION).mkdir(exist_ok=True)
Path(MODEL_SAVE_LOCATION).mkdir(exist_ok=True)

TRAIN_TEST_SEED = 42

# DICTIONARY TOKENS
UNK_TOKEN = "<UNK>"
PAD_TOKEN = "<PAD>"
START_TOKEN = "<START>"
END_TOKEN = "<END>"

# TOKENIZER PARAMS
MAX_VOCAB_LENGTH = int(2 ** 14) - 4 # PAD, SOS, EOS, UNK
MAX_SEQ_LENGTH = 128
MIN_TEXT_LENGTH = 10
MIN_WORD_FREQ = 10
POST_TOKENIZATION_LENGTH_MIN = 3

# TRAINING PARAMS
TRAIN_PARAM_COEF = 1
TRAIN_BATCH_SIZE = 128 * TRAIN_PARAM_COEF if device.type == "cuda" else 2
TEST_BATCH_SIZE = 128 * TRAIN_PARAM_COEF if device.type == "cuda" else 2
LR = 1e-4
BETAS = (0.9, 0.98)
OPT_EPS = 1e-9
LABEL_SMOOTHING_COEF = 0.1
MAX_GRADIENT_NORM = 0.5
PREFETCH_FACTOR = 2
MAX_ITERS = 100_000 // TRAIN_PARAM_COEF
REPORT_STEP = 5_000 // TRAIN_PARAM_COEF
SAVE_INTERVAL = REPORT_STEP * 2


# MODEL PARAMS

EMBED_DIM = 256
NUM_HEADS = 8
KEY_DIM = 64
VALUE_DIM = 64
FFN_INNER_DIM = 1024
NUM_SUBLAYERS = 3
LAYER_NORM_EPS = 1e-5

# Setting Debug environment

In [None]:
DEBUG = False

# Definitions

## Data Definitions

In [None]:
def read_train_data():
    english_sents = []
    farsi_sents = []
    with open("AFEC-merged.en") as en_file:
        for l in en_file:
            english_sents.append(l)
    with open("AFEC-merged.fa") as fa_file:
        for l in fa_file:
            farsi_sents.append(l)
            
    train_data = [
            {
                "en": en,
                "fa": fa,
            } for en, fa in zip(english_sents, farsi_sents) \
                 if len(en) > MIN_TEXT_LENGTH and len(fa) > MIN_TEXT_LENGTH
    ]
    return train_data

def read_test_data():
    english_sents = []
    farsi_sents = []
    with open("/content/Test/test.en") as en_file:
        for l in en_file:
            english_sents.append(l)
    for fnum in range(4):
        with open(f"/content/Test/test.fa{fnum}") as fa_file:
            for idx, l in enumerate(fa_file):
                if len(farsi_sents) > idx:
                    farsi_sents[idx].append(l)
                else:
                    farsi_sents.append([l])
    test_data = [
        {
            "en": en,
            "fa": fa
        } for en, fa in zip(english_sents, farsi_sents)
    ]
    return test_data

#### Defining Vocabulary and Text Stuff

In [None]:
class Vocabulary:
    def __init__(self, tokenized, filter):
        counter = collections.Counter(
            itertools.chain(*tokenized)
        )
        if filter:
            words = [x[0] for x in counter.most_common(MAX_VOCAB_LENGTH) if x[1] > MIN_WORD_FREQ]
        else:
            words = list(counter.keys())

        words = [PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN] + words

        self.word2id = {w: idx for idx , w in enumerate(words)}
        self.id2word = words
        del counter


    def __len__(self):
        return len(self.word2id)
    
    @property
    def size(self):
        return len(self)

    @property
    def unk_token_idx(self):
        return self.word2id[UNK_TOKEN]

    @property
    def padding_token_idx(self):
        return self.word2id[PAD_TOKEN]

    @property
    def start_token_idx(self):
        return self.word2id[START_TOKEN]

    @property
    def end_token_idx(self):
        return self.word2id[END_TOKEN]

In [None]:
def train_vocabulary(train_data, english_tokenizer, farsi_tokenizer, filter=True):
    en_tokenized = iter(english_tokenizer(x["en"]) for x in train_data)
    fa_tokenized = iter(farsi_tokenizer(x["fa"]) for x in train_data)
    english_vocab = Vocabulary(en_tokenized, filter=filter)
    farsi_vocab = Vocabulary(fa_tokenized, filter=filter)
    return english_vocab, farsi_vocab

#### Defining Dataset

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, 
                 data,
                 english_transforms,
                 english_vocab,
                 farsi_transforms,
                 farsi_vocab,
                 train=True,
                 do_transform=True,
               ):
        # English 
        self.english_transforms = english_transforms
        self.english_vocab = english_vocab
        # Farsi
        self.farsi_transforms = farsi_transforms
        self.farsi_vocab = farsi_vocab
        # Dataset type
        self.train = train
        self.do_transform = do_transform
        if train:
            self._init_database(data)
        else:
            self.data = data
            self._len = len(data)

    def _init_database(self, data):
        self.h5py_root = "train_dataset.h5"
        if os.path.exists(self.h5py_root):
            os.remove(self.h5py_root)
        rdcc_params = dict(rdcc_nbytes=1024**2*500, #500 MB of cache
                            rdcc_nslots=5e3)
        hf = h5py.File(self.h5py_root, 'w', **rdcc_params)
        # sents = iter((item["en"], item["fa"]) for item in data)
        dt = h5py.special_dtype(vlen=np.int)
        # creating groups
        english_ids = []
        farsi_ids = []
        skipped_count = 0
        for item in tqdm(data, desc="creating dataset"):
            en, fa = self.apply_transforms(item["en"], item["fa"])
            en = np.asarray(en)
            fa = np.asarray(fa)
            if any([len(x) < POST_TOKENIZATION_LENGTH_MIN for x in [en, fa]]):
                skipped_count += 1
                continue
            english_ids.append(en)
            farsi_ids.append(fa)

        self._len = len(english_ids)
        
        print(f"Skipped {skipped_count}")
        english_ids = np.asarray(english_ids, dtype=dt)
        farsi_ids = np.asarray(farsi_ids, dtype=dt)
        hf.create_dataset("en", data=english_ids, dtype=dt, chunks=True)
        hf.create_dataset("fa", data=farsi_ids, dtype=dt, chunks=True)
        hf.close()
        self.hf = h5py.File(self.h5py_root, 'r', **rdcc_params)

    def __len__(self):
        return self._len
    
    def apply_transforms(self, unprocessed_en, unprocessed_fa):
        # English sentences
        en = self.english_transforms(unprocessed_en)
        # Farsi sentences
        if self.train:
            fa = self.farsi_transforms(unprocessed_fa)
        else:
            assert isinstance(unprocessed_fa, list)
            fa = []
            for sent in unprocessed_fa:
                fa.append(self.farsi_transforms(sent, disable_token2id=True))
        return en, fa

    def __getitem__(self, idx):
        if self.train:
            en = self.hf["en"][idx].tolist()
            fa = self.hf["fa"][idx].tolist()
        else:
            unprocessed_en, unprocessed_fa = self.data[idx]["en"], self.data[idx]["fa"]
            en, fa = self.apply_transforms(unprocessed_en, unprocessed_fa)    
        return {
            "en": en,
            "fa": fa,
        }

## Transforms

In [None]:
#@title Data Transforms { form-width: "15%"}
class AbstractTransformer:
    def __call__(self, *args, **kwargs):
        assert all([k in ["inverse", "disable_token2id", "disable_joining"] for k in kwargs.keys()])
        if kwargs.get("inverse", False) is True:
            return self.inverse_transform(*args, **kwargs)
        else:
            return self.transform(*args, **kwargs)
    
    def transform(self, x, **kwargs):
        raise NotImplementedError()
    
    def inverse_transform(self, x, **kwargs):
        raise NotImplementedError()


class AddSpecialTokens(AbstractTransformer):
    def __init__(self, add_start):
        super().__init__()
        self.add_start = add_start
    
    def transform(self, tokenized, **kwargs):
        result = tokenized + [END_TOKEN]
        if self.add_start:
            result = [START_TOKEN] + result
        return result

    def inverse_transform(self, tokenized, **kwargs):
        if self.add_start:
            return tokenized[1:-1]
        else:
            return tokenized[:-1]


class Token2IdTransformer(AbstractTransformer):
    def __init__(self, vocab):
        super().__init__()
        self.vocab = vocab

    def transform(self, tokens, **kwargs):
        if kwargs.get("disable_token2id", False):
            return tokens
        else:
            return [self.vocab.word2id.get(t, self.vocab.unk_token_idx) for t in tokens]

    def inverse_transform(self, ids, **kwargs):
        if kwargs.get("disable_token2id", False):
            tokens = ids # we don't have any token
            return tokens
        else:
            return [self.vocab.id2word[i] for i in ids]


class TransformList:
    def __init__(self, 
                 transforms):
        self.transforms = transforms
    
    def __call__(self, x, **kwargs):
        if kwargs.get("inverse", False):
            transforms = self.transforms[::-1]
        else:
            transforms = self.transforms
        for trns in transforms:
            x = trns(x, **kwargs)
        return x

### Tokenizers

In [None]:
# from here: https://github.com/SamLynnEvans/Transformer/blob/master/Tokenize.py
def tokenize(sentence):
    # sentence = re.sub(
    #     r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", sentence, flags=re.UNICODE)
    # sentence = re.sub(r"[ ]+", " ", sentence, flags=re.UNICODE)
    # sentence = re.sub(r"\!+", "!", sentence, flags=re.UNICODE)
    # sentence = re.sub(r"\,+", ",", sentence, flags=re.UNICODE)
    # sentence = re.sub(r"\?+", "?", sentence, flags=re.UNICODE)
    sentence = sentence.strip().lower()
    return wordpunct_tokenize(sentence)

class NaiveTokenizer(AbstractTransformer):
    def __init__(self, max_seq_length=MAX_SEQ_LENGTH-2):
        self.max_seq_length = max_seq_length
    
    def transform(self, text, **kwargs):
        tokens = tokenize(text)
        return tokens[:self.max_seq_length]
    
    def inverse_transform(self, tokens, **kwargs):
        if kwargs.get("disable_joining", False):
            return tokens
        else:
            return " ".join(tokens)

def RTLtagify(persian):
    return f"\u202B<{persian}\u202B>"
    # return "\u202B<" + "\u202C \u202B" + persian + "\u202C" + "> \u202B"



class NaiveFarsiTokenizer(NaiveTokenizer):
    EN_TO_FA_CONST_MAP = {
        PAD_TOKEN: "",
        UNK_TOKEN: RTLtagify("غیرمشخص"),
        START_TOKEN: RTLtagify("شروع"),
        END_TOKEN: RTLtagify("پایان")
    }
    pattern = re.compile("(\\u200c|\\xad)", re.UNICODE)
    def transform(self, text, **kwargs):
        # Halfspace, joiner, LTR and RTL or poper
        # text = NaiveFarsiTokenizer.pattern.sub(" ", text)
        tokens = tokenize(text)
        # Farsi is RTL, better change it to LTR for consistency
        return tokens[::-1][:self.max_seq_length]
    
    def inverse_transform(self, tokens, **kwargs):
        if kwargs.get("disable_joining", False):
            return tokens
        else:
            tokens = tokens[::-1]
            return " ".join(
                [
                    NaiveFarsiTokenizer.EN_TO_FA_CONST_MAP.get(t, t) for t in tokens
                ]   
            )

        
class BPETokenizer(NaiveTokenizer):
    def __init__(self, lang, max_seq_length=MAX_SEQ_LENGTH-2):
        super().__init__(max_seq_length)
        self.model_path = f"/content/bpe_{lang}"
        self.lang = lang
        self._tokenizer = pyonmttok.Tokenizer(
             mode="conservative",
                lang=lang, 
                joiner_annotate=True, 
                joiner_new=True,
                # preserve_segmented_tokens=True)
                segment_alphabet_change=True,
                segment_numbers=True,
        )
        self.learner = pyonmttok.BPELearner(
            tokenizer=self._tokenizer,
            min_frequency=MIN_WORD_FREQ, 
            symbols=MAX_VOCAB_LENGTH,
            total_symbols=False,
            )
        
        self.learner.ingest_file(f"/content/AFEC-merged_uncased.{self.lang}")
        self._tokenizer = self.learner.learn(self.model_path, verbose=True)
    
    def transform(self, text, **kwargs):
        result =  self._tokenizer.tokenize(text.lower(),
                                           training=False)[0][:self.max_seq_length]
        if self.lang == "fa":
            result = result[::-1]
        return result

    def inverse_transform(self, tokens, **kwargs):
        if kwargs.get("disable_joining", False):
            return tokens
        else:
            if self.lang == "fa":
                tokens = [NaiveFarsiTokenizer.EN_TO_FA_CONST_MAP.get(t, t) for t in tokens]
                tokens = tokens[::-1]
            return self._tokenizer.detokenize(tokens)

In [None]:
#@title Batch Transforms { form-width: "15%"}
class BatchProcessor:
    def __init__(self, english_padding_idx, farsi_padding_idx, train):
        self.english_padding_idx = english_padding_idx
        self.farsi_padding_idx = farsi_padding_idx
        self.train = train

    def __call__(self, data):
        items = {
            k: [] for k in ["en", "fa"]
        }
        for d in data:
            items["en"].append(
                torch.LongTensor(d["en"])
            )
            if self.train:
                t = torch.LongTensor(d["fa"])
            else:
                t = d["fa"]
            items["fa"].append(t)
            
        
        items["en"] = torch.nn.utils.rnn.pad_sequence(items["en"], 
                                                      batch_first=True, 
                                                      padding_value=self.english_padding_idx)

        if self.train:
            items["fa"] = torch.nn.utils.rnn.pad_sequence(items["fa"], 
                                                          batch_first=True, 
                                                          padding_value=self.farsi_padding_idx)
        

        return items


In [None]:
def get_transforms(english_tokenizer,
                         farsi_tokenizer,
                         english_vocab,
                         farsi_vocab):
    english_transforms = TransformList([
                                        english_tokenizer,
                                        AddSpecialTokens(add_start=False),
                                        Token2IdTransformer(english_vocab)])


    farsi_transforms = TransformList(
        [
        farsi_tokenizer,
        AddSpecialTokens(add_start=True),
        Token2IdTransformer(farsi_vocab)
        ]
    )
    return english_transforms, farsi_transforms

## Network Definitions

In [None]:
class ScaledDotProductAttention(nn.Module):
        def __init__(self, key_dim, value_dim):
            super().__init__()
            # projection layers
            self.dropout = nn.Dropout(p=0.1)
            self.register_buffer("scale", torch.Tensor([1 / math.sqrt(key_dim)]).float())
        
        def forward(self, q, k, v, padding_mask=None):
            '''
                q: (batch_size, num_heads, seq_len, key_dim)
                k: (batch_size, num_heads, seq_len, key_dim)
                v: (batch_size, num_heads, seq_len, value_dim)
                padding_mask: (batch_size, seq_len)
            '''
            batch_size, num_heads, seq_len, _ = q.size()

            s = torch.matmul(q, k.transpose(-1, -2))
            s = s * self.scale
            if padding_mask is not None:
                mask = padding_mask.view(batch_size, 1, 1, seq_len)
                s.masked_fill(mask, float("-inf"))
            attn = F.softmax(s, dim=-1)
            attn = self.dropout(attn)
            # if padding_mask is not None:
                # attn.masked_fill(mask, 0)
            x = torch.matmul(attn, v) # TODO: Check this for shape mismatch
            return x, attn

In [None]:
class MultiheadAttention(nn.Module):
    def __init__(self,
                 embed_dim,
                 key_dim,
                 value_dim,
                 num_heads,
                 residual):
        super().__init__()

        self.key_dim = key_dim
        self.value_dim = value_dim
        self.num_heads = num_heads
        

        self.Q = nn.Linear(embed_dim, key_dim * num_heads, bias=False)
        self.K = nn.Linear(embed_dim, key_dim * num_heads, bias=False)
        self.V = nn.Linear(embed_dim, value_dim * num_heads, bias=False)

        self.attention = ScaledDotProductAttention(key_dim, value_dim)
        self.dropout = nn.Dropout(p=0.1)

        self.norm = nn.LayerNorm(embed_dim, eps=LAYER_NORM_EPS)
        self.linear = nn.Linear(value_dim * num_heads, embed_dim, bias=False)

        self.register_buffer("residual", torch.Tensor([residual]).bool())

    def forward(self, embed, padding_mask=None):
        '''
            embed: (batch_size, seq_len, embed_dim)
            attention_masks: (batch_size, seq_len)
        '''
        batch_size, seq_len, embed_dim = embed.size()

        identity = embed 

        # Creating query, key and value matrices
        q = self.Q(embed)
        k = self.K(embed)
        v = self.V(embed)

        #### change shape to (batch_size, num_heads, seq_len, key_dim or value_dim)
        q = q.view(batch_size, seq_len, self.num_heads, self.key_dim).transpose(2, 1)#.contiguous()
        k = k.view(batch_size, seq_len, self.num_heads, self.key_dim).transpose(2, 1)#.contiguous()
        v = v.view(batch_size, seq_len, self.num_heads, self.value_dim).transpose(2, 1)#.contiguous()

        x, attn = self.attention(q, k, v, padding_mask)

        # Concatenating
        #### change to (batch_size, seq_len, num_heads, value_dim)
        x = x.transpose(2, 1)#.contiguous()
        #### concatenate along last dimension
        x = x.reshape(batch_size, seq_len, self.num_heads * self.value_dim)
        x = self.linear(x) # TODO: Check if this is residual?
        x = self.dropout(x) # TODO: Check if dropout should be here
        if self.residual:
            x = x + identity
        x = self.norm(x)
        return x

class PositionWiseFeedForward(nn.Module):
    def __init__(self, embed_dim, inner_dim):
        super().__init__()
        # Encoder-decoder architecture
        self.compressor = nn.Linear(embed_dim, inner_dim)
        self.decompressor = nn.Linear(inner_dim, embed_dim)

        self.norm = nn.LayerNorm(embed_dim, eps=LAYER_NORM_EPS)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        identity = x
        x = F.relu(self.compressor(x))
        x = self.decompressor(x)
        x = self.dropout(x) # TODO: Check if dropout should be here
        x = x + identity
        x = self.norm(x)
        return x


class PositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_dim):
        super().__init__()
        encoding = torch.zeros(max_len, embed_dim)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-float(np.log(10_000.0)) / embed_dim))
        encoding[..., 0::2] = torch.sin(pos * div_term)
        encoding[..., 1::2] = torch.cos(pos * div_term)
        encoding = encoding.unsqueeze(0)
        self.register_buffer("encoding", encoding)
        self.norm = nn.LayerNorm(embed_dim, eps=LAYER_NORM_EPS)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        batch_size, seq_len, embed_size = x.size()
        identity = x
        x = self.encoding[..., :seq_len, :] + identity
        x = self.dropout(x)
        x = self.norm(x)
        return x

In [None]:
  class TranslatorNet(nn.Module):
    class TokenEmbedding(nn.Module):
        def __init__(self, vocab_size, embed_dim, padding_idx):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim, 
                                          padding_idx=padding_idx)
            self.embed_dim = embed_dim

            self.register_buffer("scale", torch.Tensor([math.sqrt(self.embed_dim)]).float())
        
        def forward(self, tokens):
            return self.embedding(tokens) * self.scale

    class Encoder(nn.Module):
        def __init__(self,
                     embed_dim,
                     key_dim,
                     value_dim,
                     num_heads,
                     ffn_inner_dim,
                     residual):
            super().__init__()
            self.self_attention = MultiheadAttention(embed_dim,
                                                    key_dim,
                                                    value_dim,
                                                    num_heads,
                                                    residual=residual)
            
            self.ffn = PositionWiseFeedForward(embed_dim, ffn_inner_dim)


        
        def forward(self, input):
            x, padding_mask = input
            x = self.self_attention(x, padding_mask)
            x = self.ffn(x)
            return x, padding_mask
    
    def __init__(self,
                 source_vocab,
                 target_vocab,
                 max_len,
                 residual,
                 num_sublayers=3,
                 embed_dim=256,
                 key_dim=64,
                 value_dim=64,
                 num_heads=8,
                 ffn_inner_dim=1024,
                 smooth_loss_factor=0,
                 device=device):
        super().__init__()

        self.max_len = max_len
        
        self.pos_encoder = PositionalEncoding(max_len, embed_dim)

        # Encoder section
        self.source_vocab = source_vocab

        self.source_embedding = TranslatorNet.TokenEmbedding(
            source_vocab.size,
            embed_dim,
            padding_idx = source_vocab.padding_token_idx,
        )
        
        self.encoder = nn.Sequential(*[TranslatorNet.Encoder(
                                          embed_dim,
                                          key_dim,
                                          value_dim,
                                          num_heads,
                                          ffn_inner_dim,
                                          residual=residual) for i in range(num_sublayers)])
        
        # Decoder section
        self.target_vocab = target_vocab
        self.target_embedding = TranslatorNet.TokenEmbedding(
            target_vocab.size,
            embed_dim,
            padding_idx = target_vocab.padding_token_idx
        )
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, 
                                                   nhead=num_heads, 
                                                   dim_feedforward=ffn_inner_dim,
                                                   layer_norm_eps=LAYER_NORM_EPS,
                                                   batch_first=True)
        self.decoder = nn.TransformerDecoder(
            decoder_layer,
            num_sublayers,
            # nn.LayerNorm(embed_dim, LAYER_NORM_EPS=1e-5) # TODO: Check whether we need this?
        )

        self.target_projector = nn.Linear(embed_dim, target_vocab.size)
        
        # Weight sharing:
        self.target_projector.weight = self.target_embedding.embedding.weight

        self.device = device
        self.to(device)

        self.smooth_loss_factor = smooth_loss_factor

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p) 
            if p.dim() == 0:
                # bias initialization
                nn.init.zeros_(p)

    def generate_no_peek_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
        mask = mask.to(self.device)
        return mask

    def generate_padding_mask(self, seq, pad_idx):
        return (seq == pad_idx).to(self.device)

    
    def patch_targets(self, targets):
        tgt, gold = targets[:, :-1].to(self.device), targets[:, 1:].to(self.device)
        gold = gold.contiguous()
        return tgt, gold

    def loss(self, logits, targets):
        batch_size, seq_len, n_class = logits.size()
        targets = targets.view(-1)
        logits = logits.view(-1, self.target_vocab.size)    
        non_pad_mask = targets.ne(self.target_vocab.padding_token_idx)
        if self.smooth_loss_factor == 0:
            loss = F.cross_entropy(logits, targets, 
                                   ignore_index=self.target_vocab.padding_token_idx,)
        else:
            # From here: https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/132907dd272e2cc92e3c10e6c4e783a87ff8893d/train.py
            one_hot = F.one_hot(targets, num_classes=n_class)
            one_hot = one_hot * (1 - self.smooth_loss_factor) + \
                      (1 - one_hot) * self.smooth_loss_factor / (n_class - 1)
            log_prb = F.log_softmax(logits, dim=-1)
            
            loss = -(one_hot * log_prb).mean(dim=-1)
            loss = loss.masked_select(non_pad_mask).sum()

        numel = non_pad_mask.sum().item()
        return loss, numel

    def forward(self, src_seq, tgt_seq):
        
        # right shifting targets

        _, src_seq_len = src_seq.size()
        _, tgt_seq_len = tgt_seq.size()

        
        src_padding_mask = self.generate_padding_mask(src_seq, self.source_vocab.padding_token_idx)
        
        x = self.source_embedding(src_seq)
        x = self.pos_encoder(x)
        x, src_padding_mask = self.encoder((x, src_padding_mask))
        # x = self.encoder(x, src_key_padding_mask = src_padding_mask)
        
        tgt_causal_mask = self.generate_no_peek_mask(tgt_seq_len)
        tgt_padding_mask = self.generate_padding_mask(tgt_seq, 
                                                      self.target_vocab.padding_token_idx)

        y = self.target_embedding(tgt_seq)
        y = self.pos_encoder(y)
        y = self.decoder(y, x,
                         tgt_mask=tgt_causal_mask,
                         memory_key_padding_mask=src_padding_mask,
                         tgt_key_padding_mask=tgt_padding_mask)
        logits = self.target_projector(y)
        return logits
    
    def infer(self, src_seq, max_len=MAX_SEQ_LENGTH):
        with torch.no_grad():
            assert src_seq.dim() == 2
            _, src_seq_len = src_seq.size()
            
            src_padding_mask = self.generate_padding_mask(src_seq, self.source_vocab.padding_token_idx)
            x = self.source_embedding(src_seq)
            x = self.pos_encoder(x)
            # encoders
            x, _ = self.encoder((x, src_padding_mask))
            # x = self.encoder(x, src_key_padding_mask = src_padding_mask)

            first_token = [[self.source_vocab.start_token_idx]]
            decoded_tokens = torch.LongTensor(first_token).to(self.device)
            for i in range(max_len):
                y = self.target_embedding(decoded_tokens)
                y = self.pos_encoder(y)
                tgt_causal_mask = self.generate_no_peek_mask(i + 1)
                y = self.decoder(y, x, 
                            tgt_mask=tgt_causal_mask,
                            memory_key_padding_mask=src_padding_mask)
                logits = self.target_projector(y)
                top_indices = torch.argmax(logits, dim=-1)
                top_indices_last_token = top_indices[...,[-1]]
                decoded_tokens = torch.cat(
                    [decoded_tokens, top_indices_last_token], dim=1
                )
                last_word = top_indices_last_token.squeeze().item()
                if last_word == self.target_vocab.end_token_idx:
                    break
            word_ids = decoded_tokens.squeeze().detach().cpu().tolist()
            return word_ids

In [None]:
# From here: https://stackoverflow.com/a/66773267

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.loss import _WeightedLoss


class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1, weight = None):
        """if smoothing == 0, it's one-hot method
           if 0 < smoothing < 1, it's smooth method
        """
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.weight = weight
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        assert 0 <= self.smoothing < 1
        pred = pred.log_softmax(dim=self.dim)

        if self.weight is not None:
            pred = pred * self.weight.unsqueeze(0)   

        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

# Preparing for Training

### Defining initialization params and methods

In [None]:
def init_datasets(train_data, 
                  test_data, 
                  english_vocab, 
                  farsi_vocab,
                  english_transforms, 
                  farsi_transforms):
    non_splitted_dataset = TranslationDataset(train_data, 
                                              english_transforms, english_vocab,
                                              farsi_transforms, farsi_vocab)

    test_dataset = TranslationDataset(test_data, 
                                      english_transforms, english_vocab,
                                      farsi_transforms, farsi_vocab,
                                      train=False)
    return non_splitted_dataset, test_dataset

In [None]:
def init_subsets(non_splitted_dataset):
    train_length = int(len(non_splitted_dataset) * 0.9)
    val_length = len(non_splitted_dataset) - train_length

    train_dataset, validation_dataset = random_split(non_splitted_dataset, 
                                                    [train_length, val_length], 
                                                    generator=torch.Generator(device="cpu")
                                                    .manual_seed(TRAIN_TEST_SEED))
    return train_dataset, validation_dataset

In [None]:
def init_loaders(train_dataset, 
                 validation_dataset, 
                 test_dataset,
                 english_vocab,
                 farsi_vocab):
    train_loader_params = {
        "dataset": train_dataset,
        "batch_size": TRAIN_BATCH_SIZE,
        "shuffle": True,
        "num_workers": 2,
        "prefetch_factor": PREFETCH_FACTOR,
        "collate_fn": BatchProcessor(english_vocab.padding_token_idx,
                                    farsi_vocab.padding_token_idx,
                                    train=True),
    }
    validation_loader_params = {
        "dataset": validation_dataset,
        "batch_size": TEST_BATCH_SIZE,
        "shuffle": False,
        "collate_fn": BatchProcessor(english_vocab.padding_token_idx,
                                    farsi_vocab.padding_token_idx,
                                    train=True),
    }

    test_loader_params = {
        "dataset": test_dataset,
        "batch_size": TEST_BATCH_SIZE,
        "shuffle": False,
        "collate_fn": BatchProcessor(english_vocab.padding_token_idx,
                                    farsi_vocab.padding_token_idx,
                                    train=False),
    }


    train_loader = DataLoader(**train_loader_params)
    validation_loader = DataLoader(**validation_loader_params)
    test_loader = DataLoader(**test_loader_params)

    return train_loader, validation_loader, test_loader

In [None]:
def init_pretraining_params(english_tokenizer, farsi_tokenizer, filter=True):
    train_data = read_train_data()
    test_data = read_test_data()
    english_vocab, farsi_vocab = train_vocabulary(train_data, 
                                              english_tokenizer, 
                                              farsi_tokenizer,
                                              filter=filter)

    print(f"English Vocab Size: {len(english_vocab)}")
    print(f"Farsi Vocab Size: {len(farsi_vocab)}")
    english_transforms, farsi_transforms = get_transforms(english_tokenizer,
                                                            farsi_tokenizer,
                                                            english_vocab,
                                                            farsi_vocab)
    
    non_splitted_dataset, test_dataset = init_datasets(train_data, 
                                                       test_data, 
                                                       english_vocab, 
                                                       farsi_vocab,
                                                       english_transforms, 
                                                       farsi_transforms)
    
    train_dataset, validation_dataset = init_subsets(non_splitted_dataset)
    train_loader, validation_loader, test_loader = init_loaders(train_dataset, 
                                                                validation_dataset, 
                                                                test_dataset,
                                                                english_vocab,
                                                                farsi_vocab)

    return dict(
        english_vocab=english_vocab,
        farsi_vocab=farsi_vocab,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset,
        train_loader=train_loader,
        validation_loader=validation_loader,
        test_loader=test_loader
    )

### Sample Generation  Class

In [None]:
class SampleGenerator:
    def __init__(self,
                 n,
                 dataset, 
                 dataset_type, save_loc=None):
        assert dataset_type in ["test", "val"]
        self.dataset = dataset
        self.dataset_type = dataset_type
        if dataset_type == "test":
            assert save_loc is not None
        self.save_loc = save_loc
        self.n = n

    def __call__(self, model, random_state=None, **kwargs):
        if random_state is None:
            inds = np.random.choice(np.arange(len(self.dataset)), size=self.n)
        else:
            random_state = np.random.RandomState(random_state)
            inds = random_state.choice(np.arange(len(self.dataset)), size=self.n)
        info = []

        if self.dataset_type == "val":
            farsi_transforms = self.dataset.dataset.farsi_transforms
            english_transforms = self.dataset.dataset.english_transforms
        else:
            farsi_transforms = self.dataset.farsi_transforms
            english_transforms = self.dataset.english_transforms

        for i in inds:
            item = self.dataset[i]
            en = torch.LongTensor(item["en"]).unsqueeze(0).to(device)
            fa = item["fa"]
            generated_ids = model.infer(en)
            generated = farsi_transforms(generated_ids, inverse=True)
            if self.dataset_type == "val":
                fa = farsi_transforms(fa, inverse=True)
                print(f"""
                
                ###### Original:
                
                {fa}
                
                ###### Translated:
                
                {generated}
                
                """)
              
            else:
                en = en.squeeze().detach().cpu().tolist()
                en = english_transforms(en, inverse=True)
                fa = farsi_transforms(fa[0], inverse=True, disable_token2id=True)
                
                info.append({
                    "en": en,
                    "fa": fa,
                    "gen_fa": generated
                })

        if self.dataset_type == "test":
           df = pd.DataFrame(info)
           df.to_csv(
               os.path.join(
                   self.save_loc,
                   "test_result.csv"
               ),
               index=False
           )
           from IPython.display import display, HTML
           display(df)
            
    
            

# Defining Trainer Class

In [None]:
def batch_cycle(loader):
    while True:
        for batch in loader:
            yield batch
            

class Trainer:
    def __init__(self, 
                 model, 
                 model_name,
                 train_loader,
                 val_loader,
                 save_interval=SAVE_INTERVAL,
                 iteration_report_loop=REPORT_STEP,
                 device=device,
                 max_iters=MAX_ITERS,
                 iteration_loop_callbacks=None,
                 amp=True):
        
        self.model = model
        self.model_name = model_name

        self.device = device                
        self.opt = torch.optim.Adam(self.model.parameters(), 
                                    lr=LR, 
                                    betas=BETAS, 
                                    eps=OPT_EPS)

        self.train_loader = train_loader
        self.val_loader = val_loader

        self.cur_iter = 1
        self.max_iters = max_iters

        self.save_interval = save_interval
        self.iteration_report_loop = iteration_report_loop
        self.training_info = []
        
        self.iteration_loop_callbacks = iteration_loop_callbacks

        self._debug = DEBUG
        self.amp = amp
        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)

    @property
    def model_name(self):
        return self._model_name
    
    @model_name.setter
    def model_name(self, name):
        self._model_name = name
        self.save_loc = os.path.join(
            MODEL_SAVE_LOCATION, name
        )
        Path(self.save_loc).mkdir(exist_ok=True)
        self.model_path = os.path.join(
            self.save_loc, f"{self.model_name}.model"
        )
        self.training_info_json_path = os.path.join(
            self.save_loc, "training_info.json"
        )

    @property
    def debug(self):
        return self._debug
    
    @debug.setter
    def debug(self, flag):
        self._debug = flag
        if flag is True:
            self.save_interval = 4
            self.iteration_report_loop =2
            self.max_iters = 16
            self.model_name = "debug"



    @property
    def gpu(self):
        return self.device.type == "cuda"

    def train(self):
        for iteration, batch in enumerate(batch_cycle(self.train_loader), start=self.cur_iter):
            if iteration > self.max_iters:
                break
            if iteration % self.iteration_report_loop == 1:
                model.train()
                running_loss = 0
                num_elements = 0
                pbar = tqdm(total=self.iteration_report_loop, desc="Loop #:"
                f"{iteration//self.iteration_report_loop + 1} out of "
                f"{self.max_iters//self.iteration_report_loop}", 
                                leave=False)
            
            self.opt.zero_grad()
            en = batch["en"] = batch["en"].to(self.device)
            fa = batch["fa"] = batch["fa"].to(self.device)

            tgt, gold = self.model.patch_targets(fa)
            with torch.cuda.amp.autocast(enabled=self.amp):
                logits = self.model(en, tgt)
                loss, numel = self.model.loss(logits, gold)
            self.scaler.scale(loss).backward()
            # fixing nans
            self.scaler.unscale_(self.opt)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                MAX_GRADIENT_NORM, 
                                                error_if_nonfinite=False)
            self.scaler.step(self.opt)
            self.scaler.update()
            
            if not torch.isnan(loss):
                batch_loss = loss.item()
                running_loss += batch_loss * numel
                num_elements += numel
                pbar.set_postfix(batch_loss=batch_loss)
           
 
            pbar.update()

            self.cur_iter = iteration

            del batch, loss, en, fa, tgt, gold, logits

            if iteration % self.iteration_report_loop == 0:
                model.eval()
                pbar.close()
                train_loss = running_loss / num_elements
                if self.debug:
                    val_loss = 0
                else:
                    val_loss = self._validate()
                # Updating model's history

                self.training_info.append(
                    dict(
                        iteration=iteration, train_loss=train_loss, val_loss=val_loss
                    )
                )
                print(
                    f"iteration: {iteration}\t Train Loss: {train_loss:.3f}\t Val Loss: {val_loss:.3f}"
                )
                # just for assurance
                model.train()

            if iteration % self.save_interval == 0:
                checkpoint = {
                                "model": self.model.state_dict(),
                                "opt": self.opt.state_dict(),
                                "scaler": self.scaler.state_dict()
                              }
                state_dict = self.model.state_dict()
                torch.save(
                    checkpoint,
                    self.model_path
                )
                with open(self.training_info_json_path, "w") as jfile:
                    json.dump(self.training_info, jfile)
                    
                for callback in self.iteration_loop_callbacks:
                    callback(self.model)

    
    def load_model(self):
        checkpoint = torch.load(self.model_path)
        self.model.load_state_dict(checkpoint["model"])
        self.opt.load_state_dict(checkpoint["opt"])
        self.scaler.load_state_dict(checkpoint["scaler"])
        with open(self.training_info_json_path) as jfile:
            self.training_info = json.load(jfile)
        self.cur_iter = self.training_info[-1]["iteration"] + 1


    def _validate(self):
        model.eval()
        running_loss = 0
        num_elements = 0
        with torch.no_grad():
            for batch in self.val_loader:
                en = batch["en"].to(self.device)
                fa = batch["fa"].to(self.device)
                tgt, gold = self.model.patch_targets(fa)
                with torch.cuda.amp.autocast(enabled=self.amp):
                    logits = self.model(en, tgt)
                    loss, numel = self.model.loss(logits, gold)
                if torch.isnan(loss):
                    continue
                loss = loss.item()
                running_loss += loss * numel
                num_elements += numel
        return running_loss / num_elements

    def test(self, 
             test_dataset, 
             test_loader,
             max_len=MAX_SEQ_LENGTH,
             random_state=5):
        model.eval()
        
        original_translations = []
        generated_translations = []
        
        original_tokenized = []
        generated_tokenized = []

        english_vocab = self.model.source_vocab
        farsi_vocab = self.model.target_vocab

        farsi_transforms = test_dataset.farsi_transforms

        with torch.no_grad():
            for idx, batch in enumerate(test_loader):
                if self.debug and idx >= 2:
                    break
                en = batch["en"].to(self.device)
                fa_translations = batch["fa"]
                for i in range(len(en)):
                    word_ids = self.model.infer(en[[i],...], max_len=max_len)
                    words = farsi_transforms(word_ids, inverse=True, disable_joining=True)
                    sentence = farsi_transforms(word_ids, inverse=True)
                    generated_translations.append(words)
                    fa_pack = fa_translations[i]
                    for j in range(len(fa_pack)):
                        fa_pack[j] = farsi_transforms(fa_pack[j], 
                                                      inverse=True, 
                                                      disable_token2id=True, 
                                                      disable_joining=True)
                    
                original_translations.extend(fa_translations)
        
        # print("##\n".join(" ".join(original_translations[0][::-1])), "\n", " ".join(generated_translations[0][::-1]))
        n = 50 if not self.debug else 2
        SampleGenerator(n=n, 
                        dataset=test_dataset, 
                        dataset_type="test",
                        save_loc=self.save_loc)(model, random_state=random_state)

        bleu = corpus_bleu(original_translations, generated_translations)
        nist = corpus_nist(original_translations, generated_translations)

        self.test_info = {
            "bleu": bleu,
            "nist": nist
        }

        with open(os.path.join(self.save_loc, "metrics.json"), "w") as jfile:
            json.dump(self.test_info, jfile)

        print(f"Bleu: {bleu:.3f}\t Nist: {nist:.3f}")    

    def __del__(self):
        del self.model, self.opt
        self.model = None
        self.opt = None

In [None]:
def compute_correct(func, references, hypotheses):
    correct_refs = []
    correct_hyps = []
    for h in hypotheses:
        joined = " ".join(h)
        correct_hyps.append(gensim_preprocess(joined, filters=[strip_tags,strip_punctuation,strip_multiple_whitespaces]))
    for ref in references:
        corr_ref = []
        for r in ref:
            joined = " ".join(r)
            corr_ref.append(gensim_preprocess(joined, filters=[strip_tags,strip_punctuation,strip_multiple_whitespaces]))
        correct_refs.append(corr_ref)
    return func(correct_refs, correct_hyps)

# Tqdm Setup

In [None]:
# Set the tqdm text color to white.

from IPython.display import HTML, display

def set_css_in_cell_output():
    display(HTML('''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''))

get_ipython().events.register('pre_run_cell', set_css_in_cell_output)

# Training

In [None]:
model_params = dict(
    max_len=MAX_SEQ_LENGTH,
    embed_dim=EMBED_DIM,
    key_dim=KEY_DIM,
    value_dim=VALUE_DIM,
    num_heads=NUM_HEADS,
    ffn_inner_dim=FFN_INNER_DIM,
    num_sublayers=NUM_SUBLAYERS,
    # smooth_loss_factor=LABEL_SMOOTHING_COEF,
    device=device,
)

## Part A

In [None]:
english_tokenizer = NaiveTokenizer()
farsi_tokenizer = NaiveFarsiTokenizer()

params = init_pretraining_params(english_tokenizer, farsi_tokenizer)

English Vocab Size: 16384
Farsi Vocab Size: 16384


HBox(children=(FloatProgress(value=0.0, description='creating dataset', max=679671.0, style=ProgressStyle(desc…


Skipped 333


In [None]:
model_name = "part_a_finally_fixed"

model = TranslatorNet(
    source_vocab=params["english_vocab"],
    target_vocab=params["farsi_vocab"],
    residual=True,
    **model_params
)

trainer = Trainer(
    model,
    model_name,
    params["train_loader"],
    params["validation_loader"], 
    device=device,
    save_interval=SAVE_INTERVAL,
    iteration_report_loop=REPORT_STEP,
    max_iters=MAX_ITERS,
    iteration_loop_callbacks=[SampleGenerator(2, params["validation_dataset"], "val")],
    amp=False
)
trainer.load_model()
trainer.amp

False

In [None]:
trainer.train()

In [None]:
trainer.test(
    params["test_dataset"], 
    params["test_loader"],
    max_len=MAX_SEQ_LENGTH
)

Unnamed: 0,en,fa,gen_fa
0,the <UNK> .,استدهتل .,"‫<غیرمشخص‫> ‫<غیرمشخص‫> , ‫<غیرمشخص‫> و ناراحت..."
1,we will leave at half past eight and will arri...,ما هشت و نیم حرکت خواهیم کرد و ساعت ده به هامب...,ما در ساعت هشت و نیم به هامبورگ خواهیم رفت و د...
2,what do you think about the hotel <UNK> ?,شما در مورد هتل گواندهاس چه فکر میکنید ؟,نظر شما در مورد هتل ‫<غیرمشخص‫> چیست ?
3,i have booked two single rooms . one for me an...,دو اتاق تکنفره رزرو کرده ­ ام . یکی برای من و ...,دو اتاق فقط یک اتاق را رزرو کرده - ام . یکی بر...
4,that is perfect . would you like to go out for...,این عالی است . آیا دوست دارید عصر برای غذا بیر...,این عالی است . آیا شما دوست دارید برای ناهار د...
5,great . i will reserve two single rooms .,خوب . دو اتاق تکنفره رزرو خواهم کرد .,عالی است . من دو اتاق یک اتاق را رزرو خواهم کرد .
6,a single room and it is important for you to h...,یک اتاق تکنفره و برای شما مهم است که یک بار دا...,یک اتاق واحد و برای شما مهم است که یک بار داشت...
7,"yes , that is a good idea .",بله ، این فکر خوبی است .,"بله , این ایده خوبی است ."
8,"what did you say , please ?",شما چه گفتید ، لطفا ؟,"شما چه گفتید , لطفا ?"
9,that is very good . at which hotel should we s...,این خیلی خوب است . پس در کدام هتل باید بمانیم ؟,این خیلی خوب است . آیا ما باید در آن هتل بمانی...


Bleu: 0.407	 Nist: 6.717


## PART B

### Tokenization config

In [None]:
farsi_tokenizer = BPETokenizer("fa")
english_tokenizer = BPETokenizer("en")

params = init_pretraining_params(english_tokenizer, farsi_tokenizer, filter=False)

English Vocab Size: 14994
Farsi Vocab Size: 14755


HBox(children=(FloatProgress(value=0.0, description='creating dataset', max=679671.0, style=ProgressStyle(desc…


Skipped 12


### Training

In [None]:
model_name = "part_b_finally_fixed"

model = TranslatorNet(
    source_vocab=params["english_vocab"],
    target_vocab=params["farsi_vocab"],
    residual=True,
    **model_params
)

trainer = Trainer(
    model,
    model_name,
    params["train_loader"],
    params["validation_loader"], 
    device=device,
    save_interval=SAVE_INTERVAL,
    iteration_report_loop=REPORT_STEP,
    max_iters=MAX_ITERS,
    iteration_loop_callbacks=[SampleGenerator(2, params["validation_dataset"], "val")],
    amp=False
)

trainer.load_model()
print(f"AMP is {trainer.amp}")

AMP is False


In [None]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Loop #:17 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 85000	 Train Loss: 1.903	 Val Loss: 1.819


HBox(children=(FloatProgress(value=0.0, description='Loop #:18 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 90000	 Train Loss: 1.885	 Val Loss: 1.804

                
                ###### Original:
                
                مادر باید به کودکش یا ژاکت ضخیم بپوشاند .
                
                ###### Translated:
                
                مادرش باید یک ژاکت با اندازه‌اش را بپوشد . مادرش ممکن است برای کودک یک ژاکت ضخیم‌اش را بپوشد .
                
                

                
                ###### Original:
                
                به گزارش کانیوز.ارگ , اداره مبارزه با مواد مخدر گفت که این مظنون در حال رانندگی با خودرویی دارای بیش از 20 کیلوگرم حشیش پنهان در صندوق عقبش بود .
                
                ###### Translated:
                
                به گزارش کانیوز.ارگ , اداره مبارزه با مواد مخدر گفته است که این مظنون با بیش از 20 کیلوگرم حشیش افغان که در کیسه مخفی شده بود یک ماشین رانندگی می‌کرده است .
                
                


HBox(children=(FloatProgress(value=0.0, description='Loop #:19 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 95000	 Train Loss: 1.868	 Val Loss: 1.795


HBox(children=(FloatProgress(value=0.0, description='Loop #:20 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 100000	 Train Loss: 1.855	 Val Loss: 1.788

                
                ###### Original:
                
                روز بعد , سومین جلسه شورای همکاری اروپا - آتلانتیک در آستانه گشایش یافت .
                
                ###### Translated:
                
                سومین مجمع شورای شراکت یورو - آتلانتیک در روز آینده در آستانه افتتاح شد .
                
                

                
                ###### Original:
                
                جزئیات این پرونده به ولز ارتباطی ندارند ولی , اصول آن ربط دارند و در نتیجه نخست وزیر مردان ما را برای حمایت از موضع اسکاتلند فرستاد .
                
                ###### Translated:
                
                جزئیات پرونده مربوط به ولز نیست اما اصلی است و بنابراین اولین مرد ما را فرستاده است تا از موقعیت اسکاتلند حمایت کند .
                
                


In [None]:
trainer.test(
    params["test_dataset"], 
    params["test_loader"],
    max_len=MAX_SEQ_LENGTH,
)

Unnamed: 0,en,fa,gen_fa
0,the stadthotel .,استدهتل .,استادیوم‌ها و استادیوم‌ها است .
1,we will leave at half past eight and will arri...,ما هشت و نیم حرکت خواهیم کرد و ساعت ده به هامب...,ما ساعت هشت و نیم به هامبورگ می‌رویم و در ساعت...
2,what do you think about the hotel gewandhaus ?,شما در مورد هتل گواندهاس چه فکر میکنید ؟,نظر شما در مورد هتل گووندهوس چیست ?
3,i have booked two single rooms . one for me an...,دو اتاق تکنفره رزرو کرده­ام . یکی برای من و یک...,من دو اتاق را رزرو کرده‌ام . یک برای من و یک ب...
4,that is perfect . would you like to go out for...,این عالی است . آیا دوست دارید عصر برای غذا بیر...,خیلی عالی است . آیا شما دوست دارید برای ناهار ...
5,great . i will reserve two single rooms .,خوب . دو اتاق تکنفره رزرو خواهم کرد .,عالی است . من دو اتاق هر دو اتاق را رزرو خواهم...
6,a single room and it is important for you to h...,یک اتاق تکنفره و برای شما مهم است که یک بار دا...,یک اتاق مهم است و برای شما مهم است که برای شما...
7,"yes , that is a good idea .",بله ، این فکر خوبی است .,"بله , این ایده خوبی است ."
8,"what did you say , please ?",شما چه گفتید ، لطفا ؟,"شما گفتید , شما چه گفتید , لطفا چه گفته‌اید ? ?"
9,that is very good . at which hotel should we s...,این خیلی خوب است . پس در کدام هتل باید بمانیم ؟,خیلی خوب است . پس در کدام هتل باید بمانیم ? گگ...


Bleu: 0.424	 Nist: 6.663


## PART C

### Training

In [None]:
model_name = "part_c_finally_fixed"

model = TranslatorNet(
    source_vocab=params["english_vocab"],
    target_vocab=params["farsi_vocab"],
    residual=False,
    **model_params
)

trainer = Trainer(
    model,
    model_name,
    params["train_loader"],
    params["validation_loader"], 
    device=device,
    save_interval=SAVE_INTERVAL,
    iteration_report_loop=REPORT_STEP,
    max_iters=MAX_ITERS,
    iteration_loop_callbacks=[SampleGenerator(2, params["validation_dataset"], "val")],
    amp=False
)
trainer.load_model()
print(f"AMP is {trainer.amp}")

AMP is False


In [None]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Loop #:17 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 85000	 Train Loss: 2.543	 Val Loss: 2.367


HBox(children=(FloatProgress(value=0.0, description='Loop #:18 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 90000	 Train Loss: 2.479	 Val Loss: 2.311

                
                ###### Original:
                
                از این پس , مشتریان خدمات میزبانی وب ما , از یک کنترل پنل فارسی کامل که بر مبنای طراحی شده است , بهره‌مند خواهند گردید .
                
                ###### Translated:
                
                اکنون از خارج از ایران هستند که متعلق به زبان فارسی ( عرفرسی ) است که زبان فارسی ( ایران ) است که می‌سی آن را در آن قرار داده شده است ,
                
                

                
                ###### Original:
                
                باید درست انجام شود .
                
                ###### Translated:
                
                باید انجام شود . کاری که باید انجام شود , در حال انجام است .
                
                


HBox(children=(FloatProgress(value=0.0, description='Loop #:19 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 95000	 Train Loss: 2.425	 Val Loss: 2.257


HBox(children=(FloatProgress(value=0.0, description='Loop #:20 out of 20', max=5000.0, style=ProgressStyle(des…

iteration: 100000	 Train Loss: 2.381	 Val Loss: 2.216

                
                ###### Original:
                
                نماز میت 19 مامور پلیس در اواخر روز 8 اوت در پلیس لاینز برگذار شد .
                
                ###### Translated:
                
                در روز 8 صبح روز 8 اوت .. . این آیین‌های پلیس , پلیس , اس..های پلیس را برگذار کردند . . اس . پی . . . . پی . . . پی . . اس . . پی . . اس . اس . اس . پی . اس . . اس . . پی . پی . پی . اس . پی . پی . اس . اس . اس . 8.پلیس در روز 8 ماه مارس [8.8.پلیس برگذار می‌کردند .
                
                

                
                ###### Original:
                
                " محمد فارغ عامر " اظهار داشت که طی بازدید هیات نمایندگان , گفتگو هایی در رابطه با ایجاد پروژه های مشترک بین طرفین قرارداد صورت خواهند گرفت تا بدین ترتیب همکاری های لازم صورت گیرد زیرا تقریبا بالغ بر 10 پروژه مشترک بین طرفین مصری و فرانسوی وجود دارند .
                
                ###### Translated:
                
  

In [None]:
trainer.test(
    params["test_dataset"], 
    params["test_loader"],
    max_len=MAX_SEQ_LENGTH
)

Unnamed: 0,en,fa,gen_fa
0,the stadthotel .,استدهتل .,این‌ها . استادیوم‌ها .
1,we will leave at half past eight and will arri...,ما هشت و نیم حرکت خواهیم کرد و ساعت ده به هامب...,ما به هامبورگ خواهد شد و تا ساعت ساعت دوازده س...
2,what do you think about the hotel gewandhaus ?,شما در مورد هتل گواندهاس چه فکر میکنید ؟,چی? و اوکلاهو ? چی ? ? ما ?
3,i have booked two single rooms . one for me an...,دو اتاق تکنفره رزرو کرده­ام . یکی برای من و یک...,. تو . تو . تو . تو . تو . تو . تو . تو . برای...
4,that is perfect . would you like to go out for...,این عالی است . آیا دوست دارید عصر برای غذا بیر...,خوب است ? برای شام ? شب ? ? ? ? ? ? ? ? ? ? ? ...
5,great . i will reserve two single rooms .,خوب . دو اتاق تکنفره رزرو خواهم کرد .,----------------------------------------------...
6,a single room and it is important for you to h...,یک اتاق تکنفره و برای شما مهم است که یک بار دا...,یک اتاق است . فقط یک اتاق دارید . فقط یک اتاق ...
7,"yes , that is a good idea .",بله ، این فکر خوبی است .,"یک ایده خوب است . بله , خوب است , بله , خوب است ."
8,"what did you say , please ?",شما چه گفتید ، لطفا ؟,"لطفا ? لطفا , لطفا ?"
9,that is very good . at which hotel should we s...,این خیلی خوب است . پس در کدام هتل باید بمانیم ؟,آن چقدر خوب است ? چقدر خوب است ?


Bleu: 0.235	 Nist: 4.506
