# **Modules**

In [1]:
!pip install fasttext

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random
import fasttext

import unicodedata as ud

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.3-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=4402260 sha256=378f684972055bad35443548f678027b495ea3a183cc9bec1b02597ff489c353
  Stored in directory: /root/.cache/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.3


# **Fastext embedder**

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
!gunzip /content/cc.fa.300.bin.gz
Embedder = fasttext.load_model("/content/cc.fa.300.bin")

--2023-02-05 11:11:42--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4502524724 (4.2G) [application/octet-stream]
Saving to: ‘cc.fa.300.bin.gz’


2023-02-05 11:13:19 (44.6 MB/s) - ‘cc.fa.300.bin.gz’ saved [4502524724/4502524724]





# **Data loading**

In [3]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [4]:
tmp = open("/content/gdrive/MyDrive/ferdousi.txt", encoding='utf-8')

data = tmp.readlines()
Data = []

for i in range(len(data)):
    if len(data[i]) <= 10:
        continue
    else:
        Data.append(data[i])

ferdousi = []
n = 0
for i in range(len(Data)):
    if i == (len(Data)-1)/2:
        break
    ferdousi.append(Data[n]+Data[n+1])
    n += 2

# **Data preproccesing**

In [6]:
in_sen = []
out_sen = []
for i in range(int(0.5*(len(ferdousi)))):
  in_sen.append(ferdousi[2*i])
  out_sen.append((ferdousi[2*i+1]))

In [5]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 20

#initialize Lang Class
class Lang:
   def __init__(self):
       #initialize containers to hold the words and corresponding index
       self.word2index = {}
       self.word2count = {}
       self.index2word = {0: "SOS", 1: "EOS"}
       self.n_words = 2  # Count SOS and EOS

#split a sentence into words and add it to the container
   def addSentence(self, sentence):
       for word in sentence.split(' '):
           self.addWord(word)

#If the word is not in the container, the word will be added to it, 
#else, update the word counter
   def addWord(self, word):
       if word not in self.word2index:
           self.word2index[word] = self.n_words
           self.word2count[word] = 1
           self.index2word[self.n_words] = word
           self.n_words += 1
       else:
           self.word2count[word] += 1

In [7]:
#Normalize every sentence

def normalize_sentence(df):
   sentence = df.lower()
   sentence = ud.normalize('NFC',sentence)
   return sentence

def read_sentence(sen1,sen2):
   sentence1 = []
   sentence2 = []
   for i in range(len(sen1)):
     sentence1.append(normalize_sentence(sen1[i]))
     sentence2.append(normalize_sentence(sen2[i]))
   return sentence1, sentence2

def process_data(sen1,sen2):
   sentence1, sentence2 = read_sentence(sen1,sen2)

   source = Lang()
   target = Lang()
   pairs = []
   for i in range(len(sen1)):
       if len(sentence1[i].split(' ')) < MAX_LENGTH and len(sentence2[i].split(' ')) < MAX_LENGTH:
           full = [sentence1[i], sentence2[i]]
           source.addSentence(sentence1[i])
           target.addSentence(sentence2[i])
           pairs.append(full)

   return source, target, pairs

In [8]:
def indexesFromSentence(lang, sentence):
   return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
   indexes = indexesFromSentence(lang, sentence)
   indexes.append(EOS_token)
   return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(input_lang, output_lang, pair):
   input_tensor = tensorFromSentence(input_lang, pair[0])
   target_tensor = tensorFromSentence(output_lang, pair[1])
   return (input_tensor, target_tensor)

# **Encoder and Decoder classes with LSTM**

In [20]:
class Encoder(nn.Module):
   def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
       super(Encoder, self).__init__()
      
       #set the encoder input dimesion , embbed dimesion, hidden dimesion, and number of layers 
       self.input_dim = input_dim
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.num_layers = num_layers

       #initialize the embedding layer with input and embbed dimention
       self.embedding = nn.Embedding(self.input_dim,  self.embbed_dim)
       #intialize the LSTM to take the input dimetion of embbed, and output dimention of hidden and
       #set the number of lstm layers
       self.rnn = nn.LSTM(self.embbed_dim,self.hidden_dim, self.num_layers)
              
   def forward(self, src):      
       embedded = self.embedding(src)

       outputs, (hidden, cell) = self.rnn(embedded)
        # outputs shape: (seq_length, N, hidden_size)

       return hidden, cell

class Decoder(nn.Module):
   def __init__(self, input_size, output_dim, hidden_dim, embbed_dim, num_layers):
       super(Decoder, self).__init__()

#set the encoder output dimension, embed dimension, hidden dimension, and number of layers 
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.output_dim = output_dim
       self.num_layers = num_layers

# initialize every layer with the appropriate dimension. For the decoder layer, it will consist of an embedding, GRU, a Linear layer and a Log softmax activation function.
       self.embedding = nn.Embedding(input_size,  self.embbed_dim)
       self.rnn = nn.LSTM(embbed_dim, hidden_size, num_layers)
       self.fc = nn.Linear(hidden_size, output_size)

   def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.embedding(x)
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(target_len, batch_size, vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
       


# **Model training class**

In [23]:
teacher_forcing_ratio = 0.5

def clacModel(model, input_tensor, target_tensor, model_optimizer, criterion):
   model_optimizer.zero_grad()

   input_length = input_tensor.size(0)
   loss = 0
   epoch_loss = 0
   # print(input_tensor.shape)

   output = model(input_tensor, target_tensor)

   num_iter = output.size(0)
   print(num_iter)

#calculate the loss from a predicted sentence with the expected result
   for ot in range(num_iter):
       loss += criterion(output[ot], target_tensor[ot])

   loss.backward()
   model_optimizer.step()
   epoch_loss = loss.item() / num_iter

   return epoch_loss

def trainModel(model, source, target, pairs, num_iteration=20000):
   model.train()

   optimizer = optim.Adam(model.parameters(), lr=0.02)
   criterion = nn.CrossEntropyLoss()
   total_loss_iterations = 0

   training_pairs = [tensorsFromPair(source, target, random.choice(pairs))
                     for i in range(num_iteration)]
  
   for iter in range(1, num_iteration+1):
       training_pair = training_pairs[iter - 1]
       input_tensor = training_pair[0]
       target_tensor = training_pair[1]

       loss = clacModel(model, input_tensor, target_tensor, optimizer, criterion)

       total_loss_iterations += loss

       if iter % 5000 == 0:
           avarage_loss= total_loss_iterations / 5000
           total_loss_iterations = 0
           print('%d %.4f' % (iter, avarage_loss))
          
   torch.save(model.state_dict(), 'mytraining.pt')
   return model

# **Evaluation functions**

In [24]:
def evaluate(model, input_lang, output_lang, sentences, max_length=MAX_LENGTH):
   with torch.no_grad():
       input_tensor = tensorFromSentence(input_lang, sentences[0])
       output_tensor = tensorFromSentence(output_lang, sentences[1])
  
       decoded_words = []
  
       output = model(input_tensor, output_tensor)
       # print(output_tensor)
  
       for ot in range(output.size(0)):
           topv, topi = output[ot].topk(1)
           # print(topi)

           if topi[0].item() == EOS_token:
               decoded_words.append('<EOS>')
               break
           else:
               decoded_words.append(output_lang.index2word[topi[0].item()])
   return decoded_words

def evaluateRandomly(model, source, target, pairs, n=10):
   for i in range(n):
       pair = random.choice(pairs)
       print('source {}'.format(pair[0]))
       print('target {}'.format(pair[1]))
       output_words = evaluate(model, source, target, pair)
       output_sentence = ' '.join(output_words)
       print('predicted {}'.format(output_sentence))

# **Training with LSTM**

In [26]:
source, target, pairs = process_data(in_sen, out_sen)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of words
input_size = source.n_words
output_size = target.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 128
hidden_size = 256
num_layers = 1
num_iteration = 250

#create encoder-decoder model
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(input_size, output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder).to(device)

#print model 
print(encoder)
print(decoder)

model = trainModel(model, source, target, pairs, num_iteration)
evaluateRandomly(model, source, target, pairs)

random sentence ['چو ضحاک دست اندر آورد و خورد\nشگفت آمدش زان هشیوار مرد\n', 'بدو گفت بنگر که از آرزوی\nچه خواهی بگو با من ای نیکخوی\n']
Input : 31534 Output : 31650
Encoder(
  (embedding): Embedding(31534, 128)
  (rnn): LSTM(128, 256)
)
Decoder(
  (embedding): Embedding(31534, 128)
  (rnn): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=31650, bias=True)
)
14
14
9
12
10
11
12
14
12
10
12
11
8
13
9
12
11
14
11
11
9
9
11
10
11
13
14
11
14
12
11
12
12
11
10
14
11
12
12
12
10
11
11
13
14
11
13
10
13
10
12
11
13
12
12
14
11
13
10
11
13
11
12
10
14
8
12
17
14
10
13
11
11
10
11
11
13
12
11
12
9
9
12
13
12
11
12
9
11
12
10
11
11
11
13
13
10
10
9
10
11
11
10
13
9
11
12
12
14
14
13
11
15
14
11
12
13
12
12
10
12
13
11
12
12
10
11
13
10
13
13
9
11
13
12
10
11
12
12
9
13
12
9
11
13
13
10
14
10
12
12
10
12
9
11
17
11
11
10
14
12
14
13
11
13
14
8
12
13
12
10
11
12
15
11
11
11
12
10
11
10
14
10
11
13
9
12
13
10
14
13
12
11
11
12
10
14
11
12
14
13
15
11
12
12
11
8
7
13
10
11
13
12
14
11
9

# **Encoder and Decoder classes with GRU**

In [28]:
class Encoder(nn.Module):
   def __init__(self, input_dim, hidden_dim, embbed_dim, num_layers):
       super(Encoder, self).__init__()
      
       #set the encoder input dimesion , embbed dimesion, hidden dimesion, and number of layers 
       self.input_dim = input_dim
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.num_layers = num_layers

       #initialize the embedding layer with input and embbed dimention
       self.embedding = nn.Embedding(input_dim, self.embbed_dim)
       #intialize the GRU to take the input dimetion of embbed, and output dimention of hidden and
       #set the number of gru layers
       self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
              
   def forward(self, src):
      
       embedded = self.embedding(src).view(1,1,-1)
       outputs, hidden = self.gru(embedded)
       return outputs, hidden

class Decoder(nn.Module):
   def __init__(self, output_dim, hidden_dim, embbed_dim, num_layers):
       super(Decoder, self).__init__()

#set the encoder output dimension, embed dimension, hidden dimension, and number of layers 
       self.embbed_dim = embbed_dim
       self.hidden_dim = hidden_dim
       self.output_dim = output_dim
       self.num_layers = num_layers

# initialize every layer with the appropriate dimension. For the decoder layer, it will consist of an embedding, GRU, a Linear layer and a Log softmax activation function.
       self.embedding = nn.Embedding(output_dim, self.embbed_dim)
       self.gru = nn.GRU(self.embbed_dim, self.hidden_dim, num_layers=self.num_layers)
       self.out = nn.Linear(self.hidden_dim, output_dim)
       self.softmax = nn.LogSoftmax(dim=1)
      
   def forward(self, input, hidden):

# reshape the input to (1, batch_size)
       input = input.view(1, -1)
       embedded = F.relu(self.embedding(input))
       output, hidden = self.gru(embedded, hidden)       
       prediction = self.softmax(self.out(output[0]))
      
       return prediction, hidden

class Seq2Seq(nn.Module):
   def __init__(self, encoder, decoder, device, MAX_LENGTH=MAX_LENGTH):
       super().__init__()
      
#initialize the encoder and decoder
       self.encoder = encoder
       self.decoder = decoder
       self.device = device
     
   def forward(self, source, target, teacher_forcing_ratio=0.5):

       input_length = source.size(0) #get the input length (number of words in sentence)
       batch_size = target.shape[1] 
       target_length = target.shape[0]
       vocab_size = self.decoder.output_dim
      
#initialize a variable to hold the predicted outputs
       outputs = torch.zeros(target_length, batch_size, vocab_size).to(self.device)

#encode every word in a sentence
       for i in range(input_length):
           encoder_output, encoder_hidden = self.encoder(source[i])

#use the encoder’s hidden layer as the decoder hidden
       decoder_hidden = encoder_hidden.to(device)
  
#add a token before the first predicted word
       decoder_input = torch.tensor([SOS_token], device=device)  # SOS

#topk is used to get the top K value over a list
#predict the output word from the current target word. If we enable the teaching force,  then the #next decoder input is the next word, else, use the decoder output highest value. 

       for t in range(target_length):   
           decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
           outputs[t] = decoder_output
           teacher_force = random.random() < teacher_forcing_ratio
           topv, topi = decoder_output.topk(1)
           input = (target[t] if teacher_force else topi)
           if(teacher_force == False and input.item() == EOS_token):
               break

       return outputs

# **Training with GRU**

In [29]:
randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of words
input_size = source.n_words
output_size = target.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 128
hidden_size = 256
num_layers = 1
num_iteration = 250

#create encoder-decoder model
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

#print model 
print(encoder)
print(decoder)

model = trainModel(model, source, target, pairs, num_iteration)
evaluateRandomly(model, source, target, pairs)

random sentence ['کنون لاجرم روی گیتی بمرد\nبیاراستم تا کی آید نبرد\n', 'مرا ساز و لشکر ز شاهان پیش\nفزونست و هم دولت و رای بیش\n']
Input : 31534 Output : 31650
Encoder(
  (embedding): Embedding(31534, 128)
  (gru): GRU(128, 256)
)
Decoder(
  (embedding): Embedding(31650, 128)
  (gru): GRU(128, 256)
  (out): Linear(in_features=256, out_features=31650, bias=True)
  (softmax): LogSoftmax(dim=1)
)
10
10
11
11
11
12
12
12
10
15
14
10
11
10
13
12
12
14
13
12
11
12
10
13
11
13
10
10
10
11
11
10
16
8
11
12
10
13
10
14
14
14
11
10
14
13
11
8
11
11
12
10
13
11
13
12
10
11
10
9
12
11
14
12
10
12
11
11
12
12
11
12
11
12
14
10
12
12
10
10
11
9
12
13
10
11
11
10
11
14
9
10
10
12
11
10
10
12
11
11
12
15
13
10
11
9
14
9
12
15
11
14
13
10
11
10
13
9
10
10
12
14
9
13
11
8
11
10
12
9
11
14
13
11
8
9
13
9
11
10
11
10
14
11
12
11
11
10
11
12
11
12
14
11
11
10
14
13
14
10
14
11
12
9
13
15
10
12
11
12
10
13
12
11
11
11
9
12
11
12
13
13
13
9
9
10
11
11
13
11
9
9
10
10
10
12
13
13
13
11
11
13
9
11
11
9
12
13
