# 1)-Importing Dependencies <a class="anchor" id="1-bullet"></a> 

In [None]:
!pip install utils

In [None]:
# Data handling and traditional algebraic operations
import math
import numpy as np
import pandas as pd
import random
import sys

# DL dependencies
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset as TorchDataset
from torch.utils.tensorboard import SummaryWriter

import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk import pos_tag
nltk.download('punkt')

# ML dependencies and scores
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from torchtext.data.metrics import bleu_score

# Text manipulation tools
import re
import string
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Device configuration: prefer the GPU when PyTorch can see a CUDA device,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Report whether PyTorch can see a CUDA device.
print("CUDA is available." if torch.cuda.is_available() else "CUDA is not available.")

## 1.1- Problem statement <a class="anchor" id="1-bullet"></a> 

The Sequence-to-Sequence (Seq2Seq) problem is a fundamental task in natural language processing (NLP) and machine translation, where the goal is to transform an input sequence into an output sequence of potentially different lengths. This problem is commonly encountered in tasks such as machine translation, summarization, text generation, Grammar Error Correction (GEC) and more. 

**Why Encoder-Decoder Models are Useful:**

Encoder-Decoder models are well-suited for the Seq2Seq problem because they provide an effective framework for handling input and output sequences of varying lengths. The key components of the Encoder-Decoder architecture are:

**1)-Encoder:** The Encoder takes the input sequence and compresses it into a fixed-size context vector, also known as the "thought vector" or "latent representation." This context vector aims to capture the essential information from the input sequence and serves as the foundation for generating the output.

**2)-Decoder:** The Decoder takes the context vector produced by the Encoder and generates the output sequence one element at a time. It uses the context vector and the previously generated elements of the output sequence (during training) to conditionally generate the next element in the sequence.

Using an Encoder-Decoder architecture allows the model to handle variable-length input and output sequences in a way that traditional models like bag-of-words or fixed-size input models cannot.

**Carrying out the Problem with RNN, LSTM, and LSTM Seq2Seq:**

**1)-RNN(Recurrent Neural Network):**
* RNNs can be used for Seq2Seq problems by feeding the input sequence step-by-step into the RNN cell and using the final hidden state as the context vector.
* *Limitation:* RNNs suffer from the vanishing gradient problem, which makes it difficult for them to capture long-range dependencies in sequences, leading to difficulties in handling long sequences.

**2)-LSTM (Long Short-Term Memory):**
* LSTMs are a variant of RNNs that mitigate the vanishing gradient problem by using memory cells and gating mechanisms.
* LSTMs can better capture long-term dependencies, making them more effective for Seq2Seq problems compared to simple RNNs.

**3)-LSTM Seq2Seq:**
* An LSTM-based Seq2Seq model combines two blocks of LSTM networks as both the Encoder and Decoder.
* The Encoder LSTM processes the input sequence, and the final hidden state becomes the context vector.
* The Decoder LSTM generates the output sequence by taking the context vector as input and predicting each element of the output sequence step-by-step.
* *Limitation:* While LSTM Seq2Seq models are an improvement over simple RNNs, they may encounter difficulties with extremely long sequences due to the limitations of LSTMs.

Encoder-Decoder models provide an elegant solution to Seq2Seq problems, enabling the handling of variable-length input and output sequences. RNNs and LSTMs are foundational components for these models, with LSTMs being preferred due to their ability to capture long-range dependencies. However, even LSTM-based Seq2Seq models have limitations in handling very long sequences, and for extremely challenging cases, more advanced architectures like Transformer-based models have been introduced to overcome these limitations.

## 1.2- Utilities <a class="anchor" id="1-bullet"></a> 

In [None]:
def correct_sentence(model, sentence, inp_vocabulary, out_vocabulary, device, max_length=50):
    """Greedy-decode an output sentence for ``sentence`` with an encoder/decoder model.

    Args:
        model: object exposing ``encoder(tensor) -> (hidden, cell)`` and
            ``decoder(token, hidden, cell) -> (output, hidden, cell)``.
        sentence: raw input text; it is lower-cased and tokenized with
            torchtext's ``basic_english`` tokenizer.
        inp_vocabulary: input-side vocabulary object; ``init_token``,
            ``eos_token`` and ``vocab.stoi`` are read from it.
        out_vocabulary: output-side vocabulary object; ``vocab.stoi`` and
            ``vocab.itos`` are read from it and must contain "<sos>" and "<eos>".
        device: device the index tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        list: the decoded tokens without the leading "<sos>". The trailing
        "<eos>" is included when the decoder emitted it before ``max_length``.

    Raises:
        ValueError: if ``sentence`` is not a string.
    """
    if not isinstance(sentence, str):
        raise ValueError("Input sentence must be a string")

    # Tokenize the lower-cased sentence and wrap it in the start/end markers
    tokenizer_eng = get_tokenizer('basic_english')
    tokens = tokenizer_eng(sentence.lower())
    tokens = [inp_vocabulary.init_token, *tokens, inp_vocabulary.eos_token]

    # Numericalize into a column tensor of shape (seq_len, 1)
    text_to_indices = [inp_vocabulary.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Resolve the special indices once instead of on every decoding step
    sos_index = out_vocabulary.vocab.stoi["<sos>"]
    eos_index = out_vocabulary.vocab.stoi["<eos>"]
    outputs = [sos_index]

    # A single no-grad scope covers both encoding and step-by-step decoding
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)

            # Greedy choice; computed once and reused for the stop test
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_index:
                break

    # Map indices back to tokens, skipping the leading start token
    return [out_vocabulary.vocab.itos[idx] for idx in outputs[1:]]

    
    

In [None]:
def bleu(data, model, input_text, corr_text, device):
    """Compute the corpus BLEU score of the model's corrections over ``data``.

    Args:
        data: iterable of examples whose attributes include ``src`` and ``trg``.
        model: model forwarded to ``correct_sentence``.
        input_text: input-side vocabulary forwarded to ``correct_sentence``.
        corr_text: output-side vocabulary forwarded to ``correct_sentence``.
        device: device forwarded to ``correct_sentence``.

    Returns:
        The value returned by ``bleu_score(outputs, targets)``, where each
        target is wrapped as a single-reference list.
    """
    targets = []
    outputs = []

    for example in data:
        fields = vars(example)
        src = fields["src"]
        trg = fields["trg"]

        prediction = correct_sentence(model, src, input_text, corr_text, device)
        # Drop only the trailing <eos>. The previous slice ([:1]) kept just the
        # first token; decoding may also stop at max_length without an <eos>,
        # in which case nothing must be removed.
        if prediction and prediction[-1] == "<eos>":
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [None]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then write ``state`` to ``filename`` via ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)
    
def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from the "state_dict" and "optimizer" entries of ``checkpoint``."""
    print("=> Loading checkpoint")
    # Model first, optimizer second
    for target, key in ((model, "state_dict"), (optimizer, "optimizer")):
        target.load_state_dict(checkpoint[key])

# 2)-Data Preprocessing <a class="anchor" id="1-bullet"></a> 


## 2.1-Data importing <a class="anchor" id="1-bullet"></a> 

The basic idea is to train the model to take a sentence with grammar errors as input and generate the corrected sentence as the output. However, there are some key considerations to keep in mind when adapting the model for this new task:
1. The dataset must consist of pairs of sentences, where one sentence contains grammar errors and the other is the same sentence with the errors corrected.
2. Tokenization and Vocabulary: Ensure that both the input and output sentences are properly tokenized and that you build separate vocabularies for the input and output languages (in this case, the original sentence with errors and the corrected sentence).
3. Loss Function: Consider using a loss function that is suited to sequence generation tasks, such as CrossEntropyLoss, set up so that it handles variable-length sequences.
4. Data Augmentation: In grammar error correction, you might not have a huge amount of labeled data. Data augmentation techniques like adding synthetic errors to the correct sentences can help improve the model's generalization and performance.
5. Preprocessing: Depending on the complexity of the grammar errors you are dealing with, you might need to perform additional preprocessing steps to handle specific error patterns. For example, if you're dealing with spelling mistakes, you might need to use techniques like lemmatization or stemming to handle word variations.
6. Encoder-Decoder Architecture is the best option to carry sequence to sequence tasks.
7. Post-processing: After the Seq2Seq model generates the corrected sentence, you might need to perform some post-processing to ensure that the output is in a grammatically correct and coherent form. This can include tasks like capitalization, punctuation, and word order adjustments.
8. Evaluation: You will need to establish appropriate evaluation metrics for grammar error correction, such as precision, recall, F1 score, or BLEU score, depending on your specific requirements.
9. Fine-tuning and Regularization: Fine-tuning the pre-trained Seq2Seq model on the grammar error correction task might yield better results. Additionally, regularization techniques like dropout can help prevent overfitting and improve generalization.

First, we are going to use the following standard dataset for grammatical error correction:

*   Lang-8



### Lang-8 <a class="anchor" id="1-bullet"></a> 

In [None]:
# Lang-8 loading and extraction of sentence pairs (erroneous sentence, corrected sentence)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#path = "/kaggle/working/Lang8_entries.train"
path = "/kaggle/input/entriestrain/entries.train"

# Read through a context manager so the handle is closed even if reading fails,
# and decode explicitly as UTF-8 instead of relying on the platform default.
with open(path, encoding="utf-8") as f1:
    lines1 = f1.readlines()

inp1 = []  # sentences containing errors
tgt1 = []  # corrected counterparts, aligned index-by-index with inp1

for i in lines1:
    lst = i.split("\t")

    # Only rows with more than 5 tab-separated fields are kept; rows with fewer
    # fields are treated as having no correction and are skipped.
    if len(lst) > 5:
        inp1.append(lst[-2])  # second-to-last field: used as the erroneous sentence
        tgt1.append(lst[-1])  # last field: used as the corrected sentence (still ends with "\n")

In [None]:
# Collect the sentence pairs in one table; 'y' is a constant '1' label for every
# row (it is used later only to stratify the splits).
df = pd.DataFrame({
    'y': ['1'] * len(inp1),
    'input': inp1,
    'output': tgt1,
})

In [None]:
df.head()

In [None]:
df.describe()

## 2.2-Data cleaning <a class="anchor" id="1-bullet"></a> 

### 2.2.1-Data formatting <a class="anchor" id="1-bullet"></a> 

In [None]:
def remove_spaces(text):
    """Re-attach tokens that were split off by a space and strip a few symbols.

    Removes the space before an apostrophe followed by a word character,
    before ``,``, and before runs of ``.``, ``!`` or ``?`` (each run collapses
    to a single character); joins `` n't`` back onto the preceding word; and
    deletes the characters ``( ) ; _ ^ ` /``.

    Args:
        text: the string to normalise.

    Returns:
        str: the normalised string.
    """
    # Raw strings throughout: the former non-raw character class relied on
    # invalid escape sequences such as "\(" and "\;", which Python warns about.
    text = re.sub(r" '(\w)", r"'\1", text)
    text = re.sub(r" ,", ",", text)
    text = re.sub(r" \.+", ".", text)
    text = re.sub(r" !+", "!", text)
    text = re.sub(r" \?+", "?", text)
    text = text.replace(" n't", "n't")
    text = re.sub(r"[();_^`/]", "", text)
    return text

def decontract(text):
    """Expand English contractions in ``text`` and return the expanded string."""
    # Applied strictly in this order: the specific forms ("won't", "can't")
    # must be rewritten before the generic "n't" / "'t" rules see them.
    substitutions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        text = re.sub(pattern, expansion, text)
    return text

def preprocess(text):
    """Normalise one raw sentence into lower-case alphanumeric text.

    Steps, in order: drop newlines, re-attach split punctuation
    (``remove_spaces``), collapse runs of ``.`` and ``!``, expand contractions
    (``decontract``), delete every character that is not a letter, digit or
    space, and lower-case the result.
    """
    text = re.sub("\n", "", text)
    text = remove_spaces(text)
    for pattern, single in ((r"\.+", "."), (r"\!+", "!")):
        text = re.sub(pattern, single, text)
    expanded = decontract(text)
    alnum_only = re.sub("[^A-Za-z0-9 ]+", "", expanded)
    return alnum_only.lower()

In [None]:
# Build the cleaned text columns.
# NOTE(review): dec_input is derived from the erroneous `input` column, exactly
# like enc_input — confirm this is intended rather than the corrected `output`.
df["enc_input"] = df["input"].apply(preprocess)
df["dec_input"] = df["input"].apply(preprocess)
df["dec_output"] = df["output"].apply(preprocess)

# Drop the raw columns, then rows with a missing value in any cleaned column,
# then exact duplicate rows.
df = (
    df.drop(columns=["input", "output"])
      .dropna(subset=["enc_input", "dec_input", "dec_output"])
      .drop_duplicates()
)

In [None]:
df.head()

### 2.2.2-Removing null values and duplicates <a class="anchor" id="1-bullet"></a> 

In [None]:
df.shape

In [None]:
# Remove duplicate rows and rows with a missing value in any text column,
# then show the resulting size and a preview.
df = df.drop_duplicates().dropna(subset=["enc_input", "dec_input", "dec_output"])
print(df.shape)
df.head()

## 2.2-Tokenization <a class="anchor" id="1-bullet"></a> 

We create a tokenizer that takes into account the special ``<sos>`` and ``<eos>`` tokens, so we first add these tokens to the data to ensure correct formatting.

In [None]:
# Prefix every decoder input with the start-of-sequence marker
df["dec_input"] = df["dec_input"].map(lambda sentence: "<sos> " + sentence)

# Suffix every decoder target with the end-of-sequence marker
df["dec_output"] = df["dec_output"].map(lambda sentence: sentence + " <eos>")
df.head()

In [None]:
print(df["dec_input"].iloc[1])
print(df["dec_output"].iloc[1])

In [None]:
# DON'T RUN THIS CELL
# Legacy variant kept for reference only: it uses "<start>"/"<end>" markers instead
# of the "<sos>"/"<eos>" markers added in the cell above, prefixes dec_input a
# second time, and appends "<end>" to the first row only.

# Adding <start> and <end> token
df["dec_input"] = "<start>"+ df["dec_input"]
df["dec_input"].iloc[0] = df["dec_input"].iloc[0] + "<end>"
df["dec_output"].iloc[0] = df["dec_output"].iloc[0] + "<end>"
df.head()

We then split the original dataset into training, validation and test sets.

In [None]:
# Train / validation / test split: 70% train, then the held-out 30% is halved
# into validation and test. Both splits are seeded and stratified on 'y'.
df_train, df_val_train = train_test_split(
    df,
    test_size=0.3,
    random_state=16,
    stratify=df["y"],
)
df_val, df_test = train_test_split(
    df_val_train,
    test_size=0.5,
    random_state=16,
    stratify=df_val_train["y"],
)

In [None]:
print("Train Shape =",df_train.shape)
print("Val Shape =",df_val.shape)
print("Test Shape =",df_test.shape)

Finally, we run the tokenizer obtained from ``get_tokenizer``, count word occurrences for future use with the ``Counter()`` constructor, and generate the input and output training vocabularies with ``build_vocab_from_iterator``.

In [None]:
from collections import Counter

def count_token_occurrences(tokens_list):
    """Return a ``Counter`` of token frequencies over a list of token lists."""
    return Counter(
        token
        for sentence_tokens in tokens_list
        for token in sentence_tokens
    )

# Word-level tokenizer (torchtext's basic_english)
tokenizer = get_tokenizer('basic_english')

# Training sentences as plain strings. The input vocabulary is built from the
# "<sos> ..." dec_input column, the output vocabulary from the "... <eos>"
# dec_output column.
traindata_in = [str(sentence) for sentence in df_train["dec_input"]]
traindata_out = [str(sentence) for sentence in df_train["dec_output"]]

# Tokenized sentences for each side
enc_input_tokens = list(map(tokenizer, traindata_in))
dec_input_tokens = list(map(tokenizer, traindata_out))

# Token frequency tables, kept for later inspection
counter_enc = count_token_occurrences(enc_input_tokens)
counter_dec = count_token_occurrences(dec_input_tokens)

# Vocabularies; the listed specials are registered explicitly
tk_inp = build_vocab_from_iterator(enc_input_tokens, specials=['<pad>'])
tk_out = build_vocab_from_iterator(dec_input_tokens, specials=['<pad>', '<sos>', '<eos>'])


In [None]:
print(type(tk_out.get_stoi()))

### 3.3-Text data into integer sequences <a class="anchor" id="1-bullet"></a> 

We now convert the text data into padded integer sequences. Padding is necessary to ensure that all sequences in a batch have the same length: special pad tokens are appended to the sequences so that every sequence contains the same number of tokens.

In [None]:
class conv_dataset(TorchDataset):
    """Map-style dataset turning sentence triples into fixed-length index tensors.

    Each item is ``(encoder_tensor, decoder_in_tensor, decoder_out_tensor)``.
    ``enc_input`` and ``dec_input`` are numericalized with ``tk_inp`` and
    ``dec_output`` with ``tk_out``. Every tensor is int64 with exactly
    ``max_len`` entries: shorter sentences are right-padded with 0 and longer
    ones are cut to their first ``max_len`` tokens.

    Args:
        data: DataFrame with ``enc_input``, ``dec_input`` and ``dec_output`` columns.
        tk_inp: input vocabulary; its ``get_stoi()`` mapping is used.
        tk_out: output vocabulary; its ``get_stoi()`` mapping is used.
        max_len: length of every returned tensor.
    """

    def __init__(self, data, tk_inp, tk_out, max_len):
        self.encoder_in = data["enc_input"].apply(str).values
        self.decoder_in = data["dec_input"].apply(str).values
        self.decoder_out = data["dec_output"].apply(str).values
        self.tk_inp = tk_inp.get_stoi()
        self.tk_out = tk_out.get_stoi()
        self.tokenizer = get_tokenizer('basic_english')
        self.max_len = max_len

    def _encode(self, text, stoi):
        """Tokenize ``text``, map tokens through ``stoi`` and fit to ``max_len``."""
        # Unknown tokens fall back to '<unk>' when the vocabulary defines it;
        # without such an entry the lookup still raises KeyError.
        unk_index = stoi.get('<unk>')
        indices = []
        for token in self.tokenizer(text):
            index = stoi.get(token, unk_index)
            if index is None:
                raise KeyError(token)
            indices.append(index)

        # Explicit crop: the padding amount below is then never negative
        indices = indices[:self.max_len]

        # Explicit dtype: an empty token list would otherwise give a float tensor
        tensor = torch.tensor(indices, dtype=torch.long)

        # Right-pad with 0 (presumably the '<pad>' index — confirm against the vocabularies)
        return F.pad(tensor, pad=(0, self.max_len - len(tensor)))

    def __getitem__(self, i):
        encoder_tensor = self._encode(self.encoder_in[i], self.tk_inp)
        decoder_in_tensor = self._encode(self.decoder_in[i], self.tk_inp)
        decoder_out_tensor = self._encode(self.decoder_out[i], self.tk_out)
        return encoder_tensor, decoder_in_tensor, decoder_out_tensor

    def __len__(self):
        return len(self.encoder_in)

### 3.4-Data batching <a class="anchor" id="1-bullet"></a> 

##### 3.4.1-DataLoader from scratch <a class="anchor" id="1-bullet"></a> 

In [None]:
class Dataloader(DataLoader):
    """Sequential hand-written batcher over an indexable dataset of tensor triples.

    Batches are returned as ``([batch_enc, batch_dec_input], batch_dec_out)``,
    each entry an int64 tensor with the samples stacked along dimension 0.
    The final batch may be smaller than ``batch_size``.

    Args:
        batch_size: number of samples per batch.
        dataset: indexable dataset with ``len()`` whose items are
            ``(encoder, decoder_in, decoder_out)`` tensors of equal shape.
    """

    def __init__(self, batch_size, dataset):
        # Initialise the parent so the inherited attributes (dataset,
        # batch_size, ...) are set consistently.
        super().__init__(dataset, batch_size=batch_size)
        self.total_points = len(dataset)

    def __iter__(self):
        # Yield whole batches (previously single samples were yielded)
        for i in range(len(self)):
            yield self[i]

    def __getitem__(self, i):
        if not 0 <= i < len(self):
            raise IndexError(f"batch index {i} out of range")

        start = i * self.batch_size
        # Clamp so the last, possibly partial, batch stays inside the dataset
        stop = min(start + self.batch_size, self.total_points)

        samples = [self.dataset[j] for j in range(start, stop)]
        # Stack the complete sequences (previously only element [0] was kept)
        batch_enc, batch_dec_input, batch_dec_out = (
            torch.stack(group).long() for group in zip(*samples)
        )

        return [batch_enc, batch_dec_input], batch_dec_out

    def __len__(self):
        return math.ceil(self.total_points / self.batch_size)

In [None]:
import torch.utils.data as data

class Dataloader(DataLoader):
    """Hand-written batcher over an indexable dataset of tensor triples.

    Batches are ``(batch_enc, batch_dec_input, batch_dec_out)`` int64 tensors
    with the samples stacked along dimension 0. Only full batches are
    produced: a trailing remainder smaller than ``batch_size`` is dropped.

    Args:
        batch_size: number of samples per batch.
        dataset: indexable dataset with ``len()`` whose items are
            ``(encoder, decoder_in, decoder_out)`` tensors of equal shape.
        shuffle: when true, every iteration visits the samples in a fresh
            random order; ``loader[i]`` always uses the original order.
    """

    def __init__(self, batch_size, dataset, shuffle=True):
        # Initialise the parent so the inherited attributes (dataset,
        # batch_size, ...) are set consistently.
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)
        self.shuffle = shuffle
        self.total_points = len(dataset)

    def _stack_samples(self, indices):
        """Fetch the samples at ``indices`` and stack them into three batch tensors."""
        samples = [self.dataset[j] for j in indices]
        encoders, decoder_inputs, decoder_outputs = zip(*samples)
        # torch.stack keeps whole sequences; torch.tensor() on a list of
        # multi-element tensors raises instead.
        return (
            torch.stack(encoders).long(),
            torch.stack(decoder_inputs).long(),
            torch.stack(decoder_outputs).long(),
        )

    def __iter__(self):
        # Permute indices instead of re-wrapping self.dataset, so repeated
        # epochs do not nest Subset objects or alter the dataset.
        if self.shuffle:
            order = torch.randperm(self.total_points).tolist()
        else:
            order = list(range(self.total_points))

        for batch_number in range(len(self)):
            start = batch_number * self.batch_size
            yield self._stack_samples(order[start:start + self.batch_size])

    def __getitem__(self, i):
        if not 0 <= i < len(self):
            raise IndexError(f"batch index {i} out of range")
        start = i * self.batch_size
        return self._stack_samples(range(start, start + self.batch_size))

    def __len__(self):
        return self.total_points // self.batch_size


##### 3.4.2-Built-in DataLoader  <a class="anchor" id="1-bullet"></a> 

In [None]:
import torch
import torch.utils.data as data

def custom_collate_fn(batch):
    """Collate ``(enc, dec_in, dec_out)`` samples into three stacked int64 batch tensors."""
    enc_inputs, dec_inputs, dec_outputs = zip(*batch)
    # Stack each field along a new leading batch dimension and cast to long
    return tuple(
        torch.stack(field, dim=0).long()
        for field in (enc_inputs, dec_inputs, dec_outputs)
    )

def custom_collate_fn(batch):
    """Collate ``(enc, dec_in, dec_out)`` samples, padding each field to its longest sequence in the batch."""
    pad_batch = torch.nn.utils.rnn.pad_sequence
    enc_inputs, dec_inputs, dec_outputs = zip(*batch)

    # batch_first=True puts the batch dimension first in each result
    return (
        pad_batch(enc_inputs, batch_first=True),
        pad_batch(dec_inputs, batch_first=True),
        pad_batch(dec_outputs, batch_first=True),
    )

# Wrap every split in a conv_dataset (tensors of length 35) and a shuffling
# PyTorch DataLoader that batches 256 samples through custom_collate_fn.

# Train processed data
train_data = conv_dataset(data=df_train, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
train_loader = DataLoader(
    train_data, batch_size=256, shuffle=True, collate_fn=custom_collate_fn
)

# Validation processed data
val_data = conv_dataset(data=df_val, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
val_loader = DataLoader(
    val_data, batch_size=256, shuffle=True, collate_fn=custom_collate_fn
)

# Test processed data
test_data = conv_dataset(data=df_test, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
test_loader = DataLoader(
    test_data, batch_size=256, shuffle=True, collate_fn=custom_collate_fn
)


In [None]:
# Rebuild the datasets and loaders with batches of 8. The training split uses
# PyTorch's DataLoader; validation and test use the hand-written Dataloader.

# Train processed data
train_data = conv_dataset(data=df_train, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)

# Validation processed data
val_data = conv_dataset(data=df_val, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
val_loader = Dataloader(8, val_data, shuffle=True)

# Test processed data
test_data = conv_dataset(data=df_test, tk_inp=tk_inp, tk_out=tk_out, max_len=35)
test_loader = Dataloader(8, test_data, shuffle=True)

In [None]:
# Sanity check: pull one batch from the training loader and print the shape of
# every tensor it contains.
train_DL = train_loader
data_iter = iter(train_DL)
batch_data = next(data_iter)  # first batch only

for tensor in batch_data:
    print("Tensor shape:", tensor.shape)

In [None]:
# Shuffling PyTorch loaders over the three splits, 512 samples per batch
batch_size = 512

# Train processed data
train_Dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation processed data
val_Dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True)

# Test processed data
test_Dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)

To check that the vocabulary is correctly generated, we tokenise a phrase from the dataset by hand and compare its tokens with the keys of the dictionary containing the vocabulary, to see whether they match the assignment made by the PyTorch vocabulary. Because this search does not return the keys in sentence order, we re-order the keys whose values (indices) appear in the sample so that they follow the order of the sampled phrase.

In [None]:
# vocabularies to test (token -> index mappings)
encoder_vocab = tk_inp.get_stoi()
decorer_in_vocab = tk_inp.get_stoi()
decoder_out_vocab = tk_out.get_stoi()
vocabs = [encoder_vocab, decorer_in_vocab, decoder_out_vocab]

# examples to test: which training row, and which of the three vocabularies /
# sample tensors (0 = encoder, 1 = decoder input, 2 = decoder output)
sample_index = 2
vocab_ref = 2

# manually constructed tokens to test the vocabulary
the_phrase = df_train["dec_input"].iloc[sample_index]
the_splited_phrase = the_phrase.split()

# manually extracted indices for the sampled phrase
# NOTE(review): a word missing from one of the vocabularies raises KeyError here
check_list_encoder = [encoder_vocab[word] for word in the_splited_phrase]
check_list_in_decoder = [decorer_in_vocab[word] for word in the_splited_phrase]
check_list_out_decoder = [decoder_out_vocab[word] for word in the_splited_phrase]

# sample from the generated vocabulary
sample = train_data[sample_index]

# Indices of the selected tensor, converted once (previously the tensor was
# converted and scanned again for every key of the vocabulary)
values_list = list(sample[vocab_ref].numpy())
present_indices = set(sample[vocab_ref].tolist())

# Extract the keys whose index occurs in the selected sample tensor
selected_vocab = vocabs[vocab_ref]
filtered_keys = [key for key, index in selected_vocab.items() if index in present_indices]

# Sort the filtered keys by the first position of their index in values_list
sorted_keys = sorted(filtered_keys, key=lambda x: values_list.index(selected_vocab[x]))

print("*"*60)
print("Phrase to check =", the_phrase)
print("*"*60)
print("Phrase according to the encoder_vocab, dec_in_vocab and dec_out_vocab =")
print("manual encoder_view = ",check_list_encoder)
print("manual in_decoder_view = ",check_list_in_decoder)
print("manual out_decoder_view =",check_list_out_decoder)
print("*"*60)
print("Reconstructed phrase according to the selected vocabulary")
print(sorted_keys)
print("*"*60)
print("torch tensor numerization of selected phrase")
sample

In [None]:
# Print the index of the end-of-sequence marker in each vocabulary.
# The markers added to the data are "<sos>"/"<eos>"; there is no "<end>" entry,
# so indexing with it raised KeyError. .get() prints None for a vocabulary that
# does not contain the marker instead of failing.
encoder_vocab = tk_inp.get_stoi()
decorer_in_vocab = tk_inp.get_stoi()
decoder_out_vocab = tk_out.get_stoi()

print(encoder_vocab.get('<eos>'))
print(decorer_in_vocab.get('<eos>'))
print(decoder_out_vocab.get('<eos>'))

# 3)-Model Architecture <a class="anchor" id="1-bullet"></a> 

1. Training: The model is trained on batches of sequences, where each sequence has a fixed length defined during data preprocessing (sentences are padded or truncated to the specified length). During the forward pass, the input sequences are processed by the RNN/LSTM/GRU, and the output sequences have the same sequence length as the input sequences.

2. Testing (inference): During testing, we can feed sequences of varying lengths to the trained model. However, the model requires fixed-length input sequences to process them in batches. If an input sequence is shorter than the fixed length, we need to pad it to reach the specified length. The output sequence has the same length as the input sequence, because the RNN/LSTM/GRU processes each input token and generates an output token at each time step.

## 3.1-RNN Model <a class="anchor" id="1-bullet"></a> 

### 3.1.1-Architecture <a class="anchor" id="1-bullet"></a> 

In [None]:
class RNN(nn.Module):
    """Per-token classifier: embedding -> stacked ``nn.RNN`` -> linear layer.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: embedding dimension.
        hidden_size: hidden units per recurrent layer.
        num_layers: number of stacked recurrent layers.
        vocab_size: number of output scores per time step.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, vocab_size):
        super().__init__()

        # Hyper-parameters kept on the instance
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Layers; the recurrent layer is batch-first
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return one score vector of size ``vocab_size`` for every position of ``x``."""
        batch_size = x.size(0)
        # Zero initial hidden state, created on the same device as the input
        initial_hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=x.device
        )
        hidden_sequence, _ = self.rnn(self.embedding(x), initial_hidden)
        # Project every time step, not only the last one
        return self.fc(hidden_sequence)


In [None]:
# Model hyper-parameters
num_layers = 3
learning_rate = 1e-3
num_epochs = 10

# Input hyper-parameters
batch_size = 256
input_size = sequence_length = output_size = 35
hidden_size = 256  # encoding units

# NOTE(review): the size is taken from the input vocabulary only — confirm it
# also covers the indices produced by the output vocabulary.
vocab_size = num_classes = len(tk_inp)
embedding_size = 150

# Adam coefficients: beta1 for the momentum term, beta2 for the squared-gradient term
beta1 = beta2 = 0.1

In [None]:
# Model RNN instance
import torch
torch.cuda.empty_cache()

# The vocabulary size serves both as embedding-table size and as output size
model_rnn = RNN(vocab_size, embedding_size, hidden_size, num_layers, vocab_size)
model_rnn = model_rnn.to(device)

In [None]:
# Loss and optimizer: cross-entropy over the per-token scores, optimised with
# Adam at its default betas.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_rnn.parameters(), lr=learning_rate)
                        

### 3.1.2-Training procedure <a class="anchor" id="1-bullet"></a> 

In [None]:
# Training procedure
import torch
torch.cuda.empty_cache()


train_DL = train_loader

n_total_steps = len(train_DL)
log_every = 100  # number of batches per logged loss window
loss_history_epochs = []   # mean batch loss of every epoch
loss_history_batches = []  # mean batch loss of every logged window

model_rnn.train()

for epoch in range(num_epochs):
    total_loss = 0.0  # running loss of the current logging window
    epoch_loss = 0.0  # running loss of the whole epoch
    for i, batch_data in enumerate(train_DL):

        enc_input, dec_input, dec_output = batch_data
        enc_input, dec_output = enc_input.to(device), dec_output.to(device)
        inputs, targets = enc_input, dec_output

        # Forward pass
        outputs = model_rnn(inputs)

        # Loss and backpropagation: flatten to (positions, vocab) vs (positions,)
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        epoch_loss += batch_loss

        if (i + 1) % log_every == 0:
            # Average over the batches accumulated in this window
            # (previously divided by input_size, which is unrelated)
            avg_loss = total_loss / log_every
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {avg_loss:.4f}')
            total_loss = 0.0
            loss_history_batches.append(avg_loss)

    # Epoch-level mean; defined even when an epoch has fewer than log_every batches
    loss_history_epochs.append(epoch_loss / max(n_total_steps, 1))


### 3.2-LSTM Models <a class="anchor" id="1-bullet"></a> 

### 3.2.1-Simple LSTM model <a class="anchor" id="1-bullet"></a> 

In [None]:
class Simp_LSTM(nn.Module):
    """Per-token classifier: embedding -> stacked ``nn.LSTM`` -> linear layer.

    Shapes inside ``forward`` (the LSTM is batch-first):
      -> x:          (batch_size, seq_length) token indices
      -> embeds:     (batch_size, seq_length, embedding_size)
      -> h0, c0:     (num_layers, batch_size, hidden_size)
      -> out:        (batch_size, seq_length, hidden_size)
      -> fc(out):    (batch_size, seq_length, vocab_size)

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: embedding dimension.
        hidden_size: hidden units per LSTM layer.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of output scores per time step.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, vocab_size):
        # Must name this class: super(RNN, self) raised TypeError because
        # Simp_LSTM does not derive from RNN.
        super(Simp_LSTM, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size  # LSTM input size
        self.hidden_size = hidden_size  # number of encoder units
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Layers of the model
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return one score vector of size ``vocab_size`` for every position of ``x``."""
        # Zero initial states on the input's own device, so the module does not
        # depend on a global `device` variable.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)
        embeds = self.embedding(x)
        out, _ = self.lstm(embeds, (h0, c0))
        # Use the output of every time step for correction
        return self.fc(out)

In [None]:
# Model Simple LSTM instance
import torch
torch.cuda.empty_cache()

# The vocabulary size serves both as embedding-table size and as output size
model_Simp_LSTM = Simp_LSTM(vocab_size, embedding_size, hidden_size, num_layers, vocab_size)
model_Simp_LSTM = model_Simp_LSTM.to(device)

### 3.2.2-Simple seq2seq LSTM model <a class="anchor" id="1-bullet"></a> 

<img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*6yI-Ecx36JYixgomc-inPg.png">


<img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*0aHodc667UfSyZj-UY8OQw.png">

**Encoder:**

*1)-Input:*
  * x.shape = (seq_length, N) where N is the batch size

*2)-Embedding Layer:*
  * embedding.shape = (seq_length, N, embedding_size)

*3)-LSTM Layer:*
  * outputs.shape = (seq_length, N, hidden_size)
  * hidden.shape = (num_layers, N, hidden_size)
  * cell.shape = (num_layers, N, hidden_size)

**Decoder:**

*1)- Input:*
  * x.shape = (N), where N is the batch size. We want it to be (1, N): the seq_length is 1 here because we send in a single word, not a whole sequence, at each time step. This corresponds to the Context vector.

*2)-Embedding Layer:*
  * embedding.shape = (1, N, embedding_size)

*3)-LSTM Layer:*
  * output.shape = (1, N, hidden_size)
  * hidden.shape = (num_layers, N, hidden_size)
  * cell.shape = (num_layers, N, hidden_size)
  
*4)-Fully Connected Layer:*
  * predictions.shape = (1, N, output_size). To send it to the loss function we want it to be (N, output_size), so we simply remove the first dimension.
  * predictions.shape (after squeeze) = (N, output_size).
  

In [None]:
# Encoder Block LSTM

class EncoderSS2S(nn.Module):
    """Encoder of the simple seq2seq model: embedding -> dropout -> LSTM.

    Only the final hidden and cell states of the LSTM are exposed; the
    per-step outputs are discarded.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        """
        Args:
            input_size: number of entries in the embedding table.
            embedding_size: width of each embedding vector.
            hidden_size: width of the LSTM hidden/cell state.
            num_layers: number of stacked LSTM layers.
            p: dropout probability, used on the embeddings and passed to nn.LSTM.
        """
        super().__init__()

        self.dropout = nn.Dropout(p)
        self.input_size, self.hidden_size = input_size, hidden_size
        self.num_layers = num_layers

        # Trainable layers of the encoding block
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            batch_first=False,
        )

    def forward(self, x):
        """Encode a batch of token indices.

        The LSTM is sequence-first (batch_first=False), so x is assumed to be
        (seq_length, batch_size) -- TODO confirm against the data loader.

        Returns:
            (hidden, cell), each of shape (num_layers, batch_size, hidden_size).
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell
    
# Decoder Block LSTM

class DecoderSS2S(nn.Module):
    """Single-step decoder of the simple seq2seq model.

    Each call consumes one token per batch element plus the previous LSTM
    state and produces one score vector per batch element.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size,num_layers, p):
        """
        Args:
            input_size: number of entries in the embedding table.
            embedding_size: width of each embedding vector.
            hidden_size: width of the LSTM hidden/cell state.
            output_size: number of scores emitted by the final linear layer.
            num_layers: number of stacked LSTM layers.
            p: dropout probability, used on the embeddings and passed to nn.LSTM.
        """
        super().__init__()

        self.dropout = nn.Dropout(p)
        self.input_size, self.hidden_size = input_size, hidden_size
        self.output_size, self.num_layers = output_size, num_layers

        # Trainable layers of the decoding block
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            batch_first=False,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: token indices, assumed to be of shape (batch_size,) -- TODO confirm.
            hidden, cell: previous LSTM state.

        Returns:
            (predictions, hidden, cell); predictions is the linear layer's
            output with the length-1 leading axis squeezed away.
        """
        # Add a leading axis of size 1 so the step looks like a length-1 sequence
        step_tokens = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))
        lstm_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Project to output scores and drop the length-1 sequence axis again
        predictions = self.fc(lstm_out).squeeze(0)
        return predictions, hidden, cell
        

In the forward function of the decoder, the decoder expects a single word at each time step instead of an entire sequence. In a Seq2Seq model, the decoder processes one token at a time during inference, generating the output sequence word by word. However, in practice, we still need to handle batch processing for efficiency. To make it compatible with the LSTM, which (with ``batch_first=False``) expects a 3-dimensional input with dimensions ``(seq_length, batch_size, input_size)``, the ``x`` tensor with shape ``(N)`` (where ``N`` is the batch size) needs to be reshaped to ``(1, N)`` so that it represents one token per batch element. The additional dimension of 1 indicates that the input is a sequence of length 1.

On the other hand, the ``predictions`` tensor holds the output scores for each word in the ``target`` vocabulary. Its shape is ``(1, N, tk_out_size)`` where ``1`` is the sequence length (we predict one word at a time), ``N`` is the batch size, and ``tk_out_size`` is the size of the target vocabulary.

However, to compute the loss during training, we need to compare the predictions with the actual targets. The loss function expects the predictions to have the shape ``(N, tk_out_size)`` and the targets to be a tensor of ``N`` class indices. To achieve this, the DecoderSS2S class removes the first dimension of the ``predictions`` tensor using ``squeeze(0)``. This operation converts the shape from ``(1, N, tk_out_size)`` to ``(N, tk_out_size)``.

The ``teacher_force_ratio`` is a hyperparameter that controls how much teacher forcing is used during training.  Teacher forcing is a technique commonly used in sequence-to-sequence models to stabilize and speed up the training.

When ``teacher_force_ratio`` is set to ``1.0``, the model uses teacher forcing for all time steps during training. Teacher forcing means that the decoder is fed the actual ground-truth target word at each time step, rather than its own prediction. This helps the model learn more quickly and effectively, especially when the predicted outputs are still incorrect during the early stages of training. On the other hand, when ``teacher_force_ratio`` is set to ``0.0`` the model does not use teacher forcing at all: the decoder's prediction from the previous time step is used as its input at the current time step. This forces the model to rely on its own predictions and can be useful for generating more diverse outputs during inference.

In [None]:
class SSeq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: callable mapping ``source`` to ``(hidden, cell)``.
        decoder: callable mapping ``(x, hidden, cell)`` to
            ``(scores, hidden, cell)`` for a single time step.
        out_vocab: sized container for the output vocabulary; its length is
            only needed when there is no step to decode.
    """

    def __init__(self, encoder, decoder, out_vocab):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.dec_out_vocab = out_vocab

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target.shape[0] - 1`` steps conditioned on ``source``.

        Args:
            source: encoder input; ``source.shape[1]`` is read as the batch size.
            target: decoder reference; ``target[0]`` is the first decoder input
                and ``target[t]`` may be fed at step ``t + 1`` (teacher forcing).
            teacher_force_ratio: probability of feeding ``target[t]`` instead of
                the decoder's own arg-max prediction.

        Returns:
            Tensor of shape ``(target_len, batch_size, n_scores)``. Row 0 is all
            zeros (nothing is predicted for the first position), and row ``t``
            holds the decoder scores produced at step ``t``. This is the layout
            expected by callers that drop the first row of both output and
            target before computing the loss.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        hidden, cell = self.encoder(source)

        # First decoder input: target[0], expected to be the <start> token
        x = target[0]

        step_scores = []
        for t in range(1, target_len):
            # The state starts as the encoder context and is then threaded
            # through the decoder from step to step
            output_t, hidden, cell = self.decoder(x, hidden, cell)
            step_scores.append(output_t)

            # Index of the highest-scoring vocabulary entry for each batch element
            best_guess = output_t.argmax(1)

            x = target[t] if random.random() < teacher_force_ratio else best_guess

        if not step_scores:
            # Nothing to decode (target_len <= 1): return the zero placeholder only
            return torch.zeros(
                target_len, batch_size, len(self.dec_out_vocab), device=source.device
            )

        stacked = torch.stack(step_scores, dim=0)
        # Prepend the zero row for position 0 so that output[t] lines up with target[t]
        return torch.cat([torch.zeros_like(stacked[:1]), stacked], dim=0)
             

In [None]:
# Model hyperparameters
load_model = False
num_layers = 3
learning_rate = 0.001
num_epochs = 10

# Vocabulary-dependent sizes
input_size_encoder = len(tk_inp)
input_size_decoder = len(tk_out)
# The decoder's final linear layer emits `output_size` scores, so this must
# stay tied to the target vocabulary. It used to be overwritten with 35 a few
# lines below, which shrank the decoder head to 35 classes.
output_size = len(tk_out)

encoder_embedding_size = 150
decoder_embedding_size = 150
hidden_size = 256 # encoding units, same for each LSTM block

batch_size = 256
input_size = 35
sequence_length = 35

enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# Candidate Adam coefficients.
# NOTE(review): these are not passed to any optimizer in the visible cells, and
# 0.1 is far from PyTorch's Adam defaults (0.9, 0.999) -- confirm before use.
beta1 = 0.1  # coefficient of the running average of the gradient
beta2 = 0.1  # coefficient of the running average of the squared gradient


In [None]:
# Model Seq2Seq instance

# -> Encoder network
encoder_net = EncoderSS2S(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

# -> Decoder network
decoder_net = DecoderSS2S(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

# -> S2S network
# The third argument of SSeq2Seq is the *output* vocabulary (it is stored as
# dec_out_vocab and its length is used as the target vocabulary size), so the
# decoder-side vocabulary tk_out is passed here rather than tk_inp.
model_SS2S = SSeq2Seq(encoder_net, decoder_net, tk_out).to(device)


In [None]:
# Loss and optimizer

# Positions whose label is the "<pad>" index are excluded from the loss
pad_idx = tk_out.get_stoi()["<pad>"]
criterionSS2S = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Adam over all model parameters; only the learning rate is overridden
optimizerSS2S = torch.optim.Adam(model_SS2S.parameters(), lr=learning_rate)

#### 3.2.2.1-Training procedure <a class="anchor" id="1-bullet"></a> 

In [None]:
# Training procedure for model_SS2S on train_loader

sentence = "i was will be there"
train_DL = train_loader

if load_model:
    # NOTE(review): `load_checkpoint` is assumed to be the counterpart of the
    # `save_checkpoint` helper used below (the original call was misspelled
    # and passed the undefined names `model`/`optimizer`) -- confirm the helper.
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model_SS2S, optimizerSS2S)


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Snapshot model and optimizer state at the start of every epoch
    checkpoint = {
        "state_dict": model_SS2S.state_dict(),
        "optimizer": optimizerSS2S.state_dict(),
    }
    save_checkpoint(checkpoint)

    model_SS2S.train()

    for batch_idx, batch_data in enumerate(train_DL):

        # Get input and targets and move them to the device
        enc_input, dec_input, dec_output = batch_data
        enc_input, dec_input, dec_output = enc_input.to(device), dec_input.to(device), dec_output.to(device)

        # NOTE(review): the loader appears to yield batch-first tensors
        # (batch, seq) while EncoderSS2S/DecoderSS2S use batch_first=False, so
        # both are transposed to (seq, batch) -- confirm the loader layout.
        # NOTE(review): dec_input is not used; dec_output serves both as the
        # teacher-forcing input and as the label sequence -- confirm intent.
        inputs = enc_input.transpose(0, 1)
        target = dec_output.transpose(0, 1)

        # Forward prop: output is (steps, batch, vocab)
        output = model_SS2S(inputs, target)

        # Position 0 of the target is never predicted. Keep the trailing
        # target_len - 1 decoder steps, so a leading placeholder row (if the
        # model emits one) is dropped, then flatten for the loss.
        steps = target.shape[0] - 1
        output = output[-steps:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizerSS2S.zero_grad()
        loss = criterionSS2S(output, target)

        # Back prop
        loss.backward()

        # Avoid exploding gradients: rescale so the total gradient norm is at
        # most 1 (clip_grad_norm_ is the in-place successor of clip_grad_norm)
        torch.nn.utils.clip_grad_norm_(model_SS2S.parameters(), max_norm=1)

        # Gradient descent step
        optimizerSS2S.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

# NOTE(review): `bleu`, `test_data`, `german` and `english` are not defined in
# the visible part of this notebook -- confirm they exist before running.
score = bleu(test_data[1:100], model_SS2S, german, english, device)
print(f"Bleu score {score*100:.2f}")

In [None]:
# Training procedure for model_SS2S on an iterator whose batches expose .src/.trg

sentence = "i was will be there"

if load_model:
    # NOTE(review): `load_checkpoint` is assumed to be the counterpart of the
    # `save_checkpoint` helper used below (the original call was misspelled) -- confirm.
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model_SS2S, optimizerSS2S)


for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Snapshot model and optimizer state at the start of every epoch
    checkpoint = {
        "state_dict": model_SS2S.state_dict(),
        "optimizer": optimizerSS2S.state_dict(),
    }
    save_checkpoint(checkpoint)

    model_SS2S.train()

    # NOTE(review): `train_iterator` is not defined in the visible part of the
    # notebook; batches are assumed to be sequence-first -- confirm.
    for batch_idx, batch in enumerate(train_iterator):

        # Get input and targets and move them to the device
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model_SS2S(inp_data, target)

        # Position 0 of the target is never predicted. Keep the trailing
        # target_len - 1 decoder steps, so a leading placeholder row (if the
        # model emits one) is dropped, then flatten for the loss.
        steps = target.shape[0] - 1
        output = output[-steps:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # The model being trained is model_SS2S, so its own optimizer and
        # criterion are used throughout (the original mixed in `optimizer`,
        # `criterion` and `model`, which belong to other cells).
        optimizerSS2S.zero_grad()
        loss = criterionSS2S(output, target)

        # Back prop
        loss.backward()

        # Avoid exploding gradients: rescale so the total gradient norm is at most 1
        torch.nn.utils.clip_grad_norm_(model_SS2S.parameters(), max_norm=1)

        # Gradient descent step
        optimizerSS2S.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

# NOTE(review): `bleu`, `test_data`, `german` and `english` are not defined in
# the visible part of this notebook -- confirm they exist before running.
score = bleu(test_data[1:100], model_SS2S, german, english, device)
print(f"Bleu score {score*100:.2f}")

### 3.2.2-Seq2Seq LSTM with Attention model <a class="anchor" id="1-bullet"></a> 

#### 3.1-Encoder - Decoder Layers

In [None]:
# Encoder Class

class Encoder(nn.Module):
    """Embedding layer followed by a single-layer bidirectional, batch-first LSTM."""

    def __init__(self, vocab_size, embedding_dim, enc_units, input_len):
        """
        Args:
            vocab_size: number of entries in the embedding table.
            embedding_dim: width of each embedding vector.
            enc_units: hidden size of each LSTM direction.
            input_len: stored on the instance; not used by any layer here.
        """
        super().__init__()

        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.enc_units, self.input_len = enc_units, input_len

        # Token index -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Reads the embedded sequence in both directions
        self.lstm_bi = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=enc_units,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, input):
        """Return the per-step LSTM outputs and the final hidden and cell states."""
        emb = self.embedding(input)
        enc_output, final_states = self.lstm_bi(emb)
        state_h, state_c = final_states
        return enc_output, state_h, state_c

In [None]:
# Decoder Class

class Decoder(nn.Module):
    """Embedding layer followed by a single-layer unidirectional, batch-first LSTM."""

    def __init__(self, vocab_size, embedding_dim, dec_unit, input_len):
        """
        Args:
            vocab_size: number of entries in the embedding table.
            embedding_dim: width of each embedding vector.
            dec_unit: hidden size of the LSTM.
            input_len: stored on the instance; not used by any layer here.
        """
        super().__init__()

        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.dec_unit, self.input_len = dec_unit, input_len

        # Token index -> dense vector, then the recurrent layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=dec_unit,
            batch_first=True,
        )

    def forward(self, input, state):
        """Run the LSTM over the embedded input, starting from ``state``.

        Returns:
            (dec_out, state_h, state_c): per-step outputs and the final states.
        """
        emb = self.embedding(input)
        dec_out, final_states = self.lstm(emb, state)
        state_h, state_c = final_states
        return dec_out, state_h, state_c


#### 3.2-Model Architecture

In [None]:
# Creating the model

class Seq2Seq(nn.Module):
    """Bidirectional-LSTM encoder + LSTM decoder + softmax output layer.

    The encoder is bidirectional, so its final states have shape
    ``(2, batch, enc_units)``, whereas the single-layer unidirectional decoder
    needs an initial state of shape ``(1, batch, dec_units)``. The two
    directions are therefore concatenated on the feature axis, which requires
    ``dec_units == 2 * enc_units``.

    Args:
        vocab_size_in: size of the encoder embedding table.
        vocab_size_out: size of the decoder embedding table and of the output layer.
        embedding_dim: embedding width shared by encoder and decoder.
        enc_units: hidden size of each encoder direction.
        dec_units: hidden size of the decoder.
        input_len: forwarded to Encoder/Decoder, which only store it.

    Raises:
        ValueError: if ``dec_units != 2 * enc_units``.
    """

    def __init__(self, vocab_size_in, vocab_size_out, embedding_dim, enc_units, dec_units, input_len):
        super().__init__()

        if dec_units != 2 * enc_units:
            raise ValueError(
                "dec_units must equal 2 * enc_units so the concatenated "
                f"bidirectional encoder state fits the decoder (got enc_units={enc_units}, "
                f"dec_units={dec_units})"
            )

        self.vocab_size_in = vocab_size_in
        self.vocab_size_out = vocab_size_out
        self.embedding_dim = embedding_dim
        self.enc_units = enc_units
        self.dec_units = dec_units
        self.input_len = input_len

        # Sub-networks are built once here and reused on every forward pass
        self.encoder = Encoder(self.vocab_size_in, self.embedding_dim, self.enc_units, self.input_len)
        self.decoder = Decoder(self.vocab_size_out, self.embedding_dim, self.dec_units, self.input_len)

        # Output projection and softmax over the vocabulary axis
        self.dense = nn.Linear(dec_units, vocab_size_out)
        self.softmax = nn.Softmax(dim=2)

    @staticmethod
    def _merge_directions(state):
        """Turn a (2, batch, units) bidirectional state into (1, batch, 2 * units)."""
        return torch.cat((state[0], state[1]), dim=-1).unsqueeze(0)

    def forward(self, enc_input, dec_input):
        """Encode ``enc_input`` and decode ``dec_input`` in a single pass.

        Returns:
            Softmax probabilities over the output vocabulary, one row per
            decoder position (last axis has ``vocab_size_out`` entries).
        """
        # Only the final encoder states are used; the per-step outputs are ignored
        _, enc_state_h, enc_state_c = self.encoder(enc_input)

        # Initial decoder state: forward and backward encoder states joined
        enc_state = (
            self._merge_directions(enc_state_h),
            self._merge_directions(enc_state_c),
        )

        dec_output, _, _ = self.decoder(dec_input, enc_state)

        # Project to the vocabulary and normalize with softmax
        dense_output = self.softmax(self.dense(dec_output))

        return dense_output

# Model initialization: vocabulary sizes are the lengths of tk_inp / tk_out
vocab_size_in, vocab_size_out = len(tk_inp), len(tk_out)
embedding_dim = 300
enc_units, dec_units = 256, 512
input_len = 35

# Arguments are passed positionally, in the order of Seq2Seq.__init__
model = Seq2Seq(
    vocab_size_in,
    vocab_size_out,
    embedding_dim,
    enc_units,
    dec_units,
    input_len,
)


In [None]:
# Shape sanity check: print the tensor shapes of every training batch and of
# the corresponding model output. Nothing is trained here, so gradient
# tracking is disabled to avoid building an autograd graph per batch.
with torch.no_grad():
    for enc_input, dec_input, target in train_loader:
        # Print the shapes of input tensors
        print("Encoder Input Shape:", enc_input.shape)
        print("Decoder Input Shape:", dec_input.shape)
        print("Target Shape:", target.shape)

        # Pass the data through the model
        output = model(enc_input, dec_input)

        # Print the shape of the output
        print("Output Shape:", output.shape)

#### 3.3-Training Procedure

In [None]:
# Training Callbacks of the model

class Train_Callback:
    """Training driver with validation, best-model checkpointing, TensorBoard
    logging and early stopping.

    Args:
        model: module called as ``model(enc_input, dec_input)``. Its output is
            treated as class *probabilities* over the last axis, which is what
            the Seq2Seq model in this notebook returns (it ends in a softmax).
        train_dataloader: sized iterable of ``(enc_input, dec_input, target)`` batches.
        val_dataloader: sized iterable of ``(enc_input, dec_input, target)`` batches.
        checkpoint_path: file the best ``state_dict`` is saved to.
        log_path: directory handed to ``SummaryWriter``.
        patience: number of consecutive worsening epochs tolerated before stopping.
        min_delta: how much worse than the best validation loss an epoch must
            be to count as worsening.
    """

    def __init__(self, model, train_dataloader, val_dataloader, checkpoint_path, log_path, patience=5, min_delta=0.0001):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.checkpoint_path = checkpoint_path
        self.log_path = log_path
        self.patience = patience
        self.min_delta = min_delta

    def _batch_loss(self, criterion, enc_input, dec_input, target):
        """Forward one batch and return its loss.

        The model output is already softmax-normalized, so it is converted to
        log-probabilities for ``nn.NLLLoss``. Feeding probabilities to
        ``nn.CrossEntropyLoss`` would apply a second softmax. The clamp keeps
        ``log`` finite when a probability underflows to zero.
        """
        output = self.model(enc_input, dec_input)
        log_probs = torch.log(output.clamp_min(1e-12))
        return criterion(log_probs.reshape(-1, log_probs.size(-1)), target.reshape(-1))

    def train(self, num_epochs, vocab_size_in, vocab_size_out, embedding_dim, enc_units, dec_units, input_len):
        """Train for up to ``num_epochs`` epochs.

        After each epoch the average training and validation losses are logged
        to TensorBoard and the model is saved to ``checkpoint_path`` whenever
        the validation loss improves. Training stops early once the validation
        loss has been more than ``min_delta`` above the best value for
        ``patience`` consecutive epochs.

        ``vocab_size_in``, ``vocab_size_out``, ``embedding_dim``, ``enc_units``,
        ``dec_units`` and ``input_len`` are accepted for backward compatibility
        and are not used.
        """
        # TensorBoard writer for logging
        writer = SummaryWriter(self.log_path)

        # Number of batches per epoch, used to average the losses
        train_steps = len(self.train_dataloader)
        val_steps = len(self.val_dataloader)

        # Loss on log-probabilities (see _batch_loss) and optimizer
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(self.model.parameters())

        # Early-stopping bookkeeping
        best_val_loss = float('inf')
        counter = 0

        try:
            for epoch in range(num_epochs):
                # ---- training pass ----
                self.model.train()
                total_loss = 0.0

                for enc_input, dec_input, target in self.train_dataloader:
                    optimizer.zero_grad()
                    loss = self._batch_loss(criterion, enc_input, dec_input, target)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

                average_train_loss = total_loss / train_steps

                # ---- validation pass ----
                self.model.eval()
                val_loss = 0.0

                with torch.no_grad():
                    for enc_input, dec_input, target in self.val_dataloader:
                        val_loss += self._batch_loss(criterion, enc_input, dec_input, target).item()

                average_val_loss = val_loss / val_steps

                # Log first so the final (stopping) epoch is recorded as well
                writer.add_scalar("Loss/train", average_train_loss, epoch)
                writer.add_scalar("Loss/validation", average_val_loss, epoch)

                # Early stopping: count consecutive epochs that are clearly
                # worse than the best validation loss seen so far
                if average_val_loss - best_val_loss > self.min_delta:
                    counter += 1
                else:
                    counter = 0

                # Save the best model
                if average_val_loss < best_val_loss:
                    best_val_loss = average_val_loss
                    torch.save(self.model.state_dict(), self.checkpoint_path)

                if counter >= self.patience:
                    print("Early stopping reached. Training finished.")
                    break
        finally:
            # Close the writer exactly once, also on early stop or on error
            writer.close()




In [None]:
# Where the best model weights are written (a single file)
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Models/bidirectional_train.pth"
# TensorBoard log directory. It must not live underneath the checkpoint file
# path: a directory named ".../bidirectional_train.pth/logs" would turn
# ".../bidirectional_train.pth" into a directory and the checkpoint could no
# longer be saved there as a file.
log_path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Models/logs/bidirectional_train"
patience = 5

vocab_size_in = len(tk_inp)  # 70118
vocab_size_out = len(tk_out)  # 55474
embedding_dim = 300
enc_units = 256
dec_units = 512
input_len = 35

# Trainer instance initialization; patience is passed explicitly so the value
# configured above is the one actually used
trainer = Train_Callback(model, train_loader, val_loader, checkpoint_path, log_path, patience=patience)
# Train the model
num_epochs = 10
trainer.train(num_epochs=num_epochs,
              vocab_size_in=vocab_size_in,
              vocab_size_out=vocab_size_out,
              embedding_dim=embedding_dim,
              enc_units=enc_units,
              dec_units=dec_units,
              input_len=input_len)

In [None]:
# Stand-alone encoder used to inspect output shapes on random data
encoder = Encoder(vocab_size_in, embedding_dim, enc_units, input_len)

# Random token indices of shape (batch_size, input_len).
# Note: this rebinds the notebook-level `batch_size` to 512.
batch_size = 512
example_input = torch.randint(0, vocab_size_in, size=(batch_size, input_len))

# Forward pass through the encoder
enc_output, state_h, state_c = encoder(example_input)

# Report the shape of the outputs and of both final states
print("Encoder Output Shape:", enc_output.shape)
print("State_h Shape:", state_h.shape)
print("State_c Shape:", state_c.shape)

In [None]:
# Shape sanity check over the training batches. Nothing is trained here, so
# gradient tracking is disabled to avoid building an autograd graph per batch.
with torch.no_grad():
    for enc_input, dec_input, target in train_loader:
        # Pass the data through the model
        output = model(enc_input, dec_input)

        # Print the shape of the output
        print("Output Shape:", output.shape)

In [None]:
# Largest index found in each vocabulary's string-to-index mapping
inp_indices = tk_inp.get_stoi().values()
out_indices = tk_out.get_stoi().values()
max_index_inp = max(inp_indices)
max_index_out = max(out_indices)
print(f"Maximum token index (encoder input): {max_index_inp}")
print(f"Maximum token index (decoder input): {max_index_out}")

In [None]:
# Largest token index present in the most recently loaded batch tensors
max_index_enc_input = enc_input.max()
max_index_dec_input = dec_input.max()
max_index_target = target.max()

# Passed as separate print arguments so the tensors keep their default repr
print("Max Index Encoder Input:", max_index_enc_input)
print("Max Index Decoder Input:", max_index_dec_input)
print("Max Index Target:", max_index_target)

# 3.1-Grammar error detection (inference)

## 3.2-Grammar error correction


# 4)-Training

# 5)-Evaluation

# 6)-Inference
