#1)-Importing Dependencies

In [47]:
# Data handling and traditional algebraic operations
import math
import numpy as np
import pandas as pd
# ML dependencies
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
# DL dependencies
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset as TorchDataset
from torch.utils.tensorboard import SummaryWriter

import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk import pos_tag
nltk.download('punkt')

# Text manipulation tools
import re
import string
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#2)-Data Preprocessing

##2.1-Data importing

First, we are going to use two standard datasets for grammatical error detection:

*   Lang-8
*   NUS Social Media Text Normalization and Translation Corpus.



###Lang-8

In [92]:
# Lang-8 loading and extraction of correct and incorrect sentences
path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Data/entries.train"
f1 = open(path)
lines1 = f1.readlines()
inp1 = [] # list for incorrect sentences
tgt1 = [] # storing

for i in lines1:
    lst = i.split("\t")

# IF LENGTH OF THE LIST IS GREATER THAN 5 THEN CORRECT SENTTENCE EXISTS OTHERWISE ONLY INCORRECT SENTENCE IS PRESENT
    if len(lst)>5  :     #IF LENGTH IS GREATER THAN 5
        inp1.append(lst[-2]) # APPEND SECONG LAST ITEM IN LIST WHICH IS INCORRECT SENTENCE
        tgt1.append(lst[-1]) # APPEND LAST ITEM IN THE LIST WHICH IS

In [93]:
df = pd.DataFrame()
df['y'] = list('1'*len(inp1))
df['input'] = inp1
df['output'] = tgt1


In [7]:
df.head()

Unnamed: 0,y,input,output
0,1,And he took in my favorite subject like soccer .,And he took in my favorite subjects like socce...
1,1,"Actually , who let me know about Lang - 8 was ...","Actually , he was the one who let me know abou..."
2,1,His Kanji 's ability is much better than me .,His Kanji ability is much better than mine .\n
3,1,"We 've known each other for only half a year ,...","We 've known each other for only half a year ,..."
4,1,I heard a sentence last night when I watched TV .,I heard a sentence last night when I was watch...


In [8]:
df.describe()

Unnamed: 0,y,input,output
count,509163,509163,509163
unique,1,504476,500801
top,1,thank you .,Hello !\n
freq,509163,40,90


###NUS Social Media Text Normalization and Translation Corpus

In [None]:
path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Data/10gec_annotations/en2cn-2k.en2nen2cn"
f2 = open(path,"r",encoding="UTF-8")


#f2 = open("/content/drive/MyDrive/Colab Notebooks/cs2/text_sms/release/en2cn-2k.en2nen2cn","r",encoding="UTF-8") # READING THE FILE

lines2 = f2.readlines() # STORING ALL THE LINES IN A VARIABLE
inp2 = [] # LIST FOR STORING INCORRECT SENTENCES
tgt2 = [] # LIST FOR STORING CORRECT SENTENCES

# THE DASET CONTAINS 2000 DATAPOINTS, THEREFORE RUNNING THE LOOP FOR 2000 TIMES
for i in range(2000):
    inp2.append(lines2[i*3]) #APPEDING FIRST ROW FOR EACH DATAPOINT
    tgt2.append(lines2[i*3+1]) # APPENDING SECOND ROW FOR EACH DATAPOINT

FileNotFoundError: ignored

##2.2-Data cleaning

###2.2.1-Data formating

In [94]:
def remove_spaces(text):
    text = re.sub(r" '(\w)",r"'\1",text)
    text = re.sub(r" \,",",",text)
    text = re.sub(r" \.+",".",text)
    text = re.sub(r" \!+","!",text)
    text = re.sub(r" \?+","?",text)
    text = re.sub(" n't","n't",text)
    text = re.sub("[\(\)\;\_\^\`\/]","",text)
    return text

def decontract(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

def preprocess(text):
    text = re.sub("\n","",text)
    text = remove_spaces(text)
    text = re.sub(r"\.+",".",text)
    text = re.sub(r"\!+","!",text)
    text = decontract(text)
    text = re.sub("[^A-Za-z0-9 ]+","",text)
    text = text.lower()
    return text

In [95]:
df["enc_input"] = df.input.apply(preprocess)
df["dec_input"] = df.input.apply(preprocess)
df["dec_output"] = df.output.apply(preprocess)
df =df.drop(["input","output"],axis=1)
df = df[df.enc_input.notnull()]
df = df[df.dec_input.notnull()]
df = df[df.dec_output.notnull()]
df = df.drop_duplicates()

In [96]:
df.head()

Unnamed: 0,y,enc_input,dec_input,dec_output
0,1,and he took in my favorite subject like soccer,and he took in my favorite subject like soccer,and he took in my favorite subjects like soccer
1,1,actually who let me know about lang 8 was him,actually who let me know about lang 8 was him,actually he was the one who let me know about ...
2,1,his kanji is ability is much better than me,his kanji is ability is much better than me,his kanji ability is much better than mine
3,1,we have known each other for only half a year ...,we have known each other for only half a year ...,we have known each other for only half a year ...
4,1,i heard a sentence last night when i watched tv,i heard a sentence last night when i watched tv,i heard a sentence last night when i was watch...


###2.2.2-Removing null values and duplicates

In [12]:
df.shape

(503901, 4)

In [13]:
df = df.drop_duplicates()
df = df[df.enc_input.notnull()]
df = df[df.dec_input.notnull()]
df = df[df.dec_output.notnull()]
print(df.shape)
df.head()

(503901, 4)


Unnamed: 0,y,enc_input,dec_input,dec_output
0,1,and he took in my favorite subject like soccer,and he took in my favorite subject like soccer,and he took in my favorite subjects like soccer
1,1,actually who let me know about lang 8 was him,actually who let me know about lang 8 was him,actually he was the one who let me know about ...
2,1,his kanji is ability is much better than me,his kanji is ability is much better than me,his kanji ability is much better than mine
3,1,we have known each other for only half a year ...,we have known each other for only half a year ...,we have known each other for only half a year ...
4,1,i heard a sentence last night when i watched tv,i heard a sentence last night when i watched tv,i heard a sentence last night when i was watch...


##2.2-Tokenization

We crate a tokenizer that takes into account the special ``<start>`` and ``<end>`` tokens. So we first add these tokens to the data to ensure correct formating.

In [14]:
# Add <start> and <end> tokens to the dec_input column
df["dec_input"] = df["dec_input"].apply(lambda x: "<start> " + x + " <end>")

# Add <start> and <end> tokens to the dec_output column
df["dec_output"] = df["dec_output"].apply(lambda x: "<start> " + x + " <end>")

df.head()

Unnamed: 0,y,enc_input,dec_input,dec_output
0,1,and he took in my favorite subject like soccer,<start> and he took in my favorite subject lik...,<start> and he took in my favorite subjects li...
1,1,actually who let me know about lang 8 was him,<start> actually who let me know about lang 8...,<start> actually he was the one who let me kno...
2,1,his kanji is ability is much better than me,<start> his kanji is ability is much better th...,<start> his kanji ability is much better than ...
3,1,we have known each other for only half a year ...,<start> we have known each other for only half...,<start> we have known each other for only half...
4,1,i heard a sentence last night when i watched tv,<start> i heard a sentence last night when i w...,<start> i heard a sentence last night when i w...


In [15]:
print(df["dec_input"].iloc[1])
print(df["dec_output"].iloc[1])

<start> actually who let me know about lang  8 was him <end>
<start> actually he was the one who let me know about lang  8 <end>


In [None]:
# DON´T RUN THIS CELL

# Adding <start> and <end> token
df["dec_input"] = "<start>"+ df["dec_input"]
df["dec_input"].iloc[0] = df["dec_input"].iloc[0] + "<end>"
df["dec_output"].iloc[0] = df["dec_output"].iloc[0] + "<end>"
df.head()

Unnamed: 0,y,enc_input,dec_input,dec_output
0,1,and he took in my favorite subject like soccer,<start>and he took in my favorite subject like...,and he took in my favorite subjects like socce...
1,1,actually who let me know about lang 8 was him,<start>actually who let me know about lang 8 ...,actually he was the one who let me know about ...
2,1,his kanji is ability is much better than me,<start>his kanji is ability is much better tha...,his kanji ability is much better than mine
3,1,we have known each other for only half a year ...,<start>we have known each other for only half ...,we have known each other for only half a year ...
4,1,i heard a sentence last night when i watched tv,<start>i heard a sentence last night when i wa...,i heard a sentence last night when i was watch...


We then split the original dataset into training, validation and test sets.

In [16]:
# Train, validation and test split
df_train ,df_val_train = train_test_split(df, test_size=0.3,random_state = 16, stratify = df.y )
df_val, df_test = train_test_split(df_val_train, test_size=0.5, random_state = 16, stratify = df_val_train.y)

In [17]:
print("Train Shape =",df_train.shape)
print("Val Shape =",df_val.shape)
print("Test Shape =",df_test.shape)

Train Shape = (352730, 4)
Val Shape = (75585, 4)
Test Shape = (75586, 4)


Finally, we run the tokenizer using ```get_tokenizer``, perform a word counter for future uses using the ``Counter()`` cosntructor and generate the input and output training vocabulary for the encoder using ```build_vocab_from_iterator``.

In [18]:
from collections import Counter

def count_token_occurrences(tokens_list):
  counter = Counter()
  for sentence_tokens in tokens_list:
    counter.update(sentence_tokens)
  return counter

# Define the tokenizer function (use basic_english tokenizer)
tokenizer = get_tokenizer('basic_english')
traindata_in = df_train.dec_input.apply(str).tolist()
traindata_out = df_train.dec_output.apply(str).tolist()

# Tokenization and preprocessing for encoder input
enc_input_tokens = [tokenizer(sentence) for sentence in traindata_in]

# Tokenization and preprocessing for decoder input
dec_input_tokens = [tokenizer(sentence) for sentence in traindata_out]

# Build vocabulary for encoder input
counter_enc = count_token_occurrences(enc_input_tokens)
tk_inp = build_vocab_from_iterator(enc_input_tokens,specials=['<pad>'])
# Build vocabulary for decoder input
counter_dec = count_token_occurrences(dec_input_tokens)
tk_out = build_vocab_from_iterator(dec_input_tokens, specials=['<pad>', '<start>', '<end>'])


In [19]:
print(type(tk_out.get_stoi()))

<class 'dict'>


###3.3-Text data into integer sequences

We now try to convert the text data into integer sequences wich also has a padding. This padding of sequences is necessary to ensure that all sequences in a batch have the same length. Padding adds special tokens (pad token) to the sequences to that all sequences pocesses the same amount of tokens.

In [20]:
class conv_dataset(TorchDataset):
    def __init__(self, data, tk_inp, tk_out, max_len):
        self.encoder_in = data["enc_input"].apply(str).values
        self.decoder_in = data["dec_input"].apply(str).values
        self.decoder_out = data["dec_output"].apply(str).values
        self.tk_inp = tk_inp.get_stoi()
        self.tk_out = tk_out.get_stoi()
        self.tokenizer = get_tokenizer('basic_english')
        self.max_len = max_len

    def __getitem__(self, i):
        # Input sequences
        encoder_seq = self.encoder_in[i]
        encoder_tokens = self.tokenizer(encoder_seq)
        encoder_indices = [self.tk_inp[token] for token in encoder_tokens]
        encoder_tensor = torch.tensor(encoder_indices)

        # Input encoder sequences
        decoder_in_seq = self.decoder_in[i]
        decoder_in_tokens = self.tokenizer(decoder_in_seq)
        # Special handling for <start> and <end> tokens
        decoder_in_indices = [self.tk_inp[token] for token in decoder_in_tokens]
        decoder_in_tensor = torch.tensor(decoder_in_indices)

        # Input decoder sequences
        decoder_out_seq = self.decoder_out[i]
        decoder_tokens = self.tokenizer(decoder_out_seq)
        # Special handling for <start> and <end> tokens
        decoder_out_indices = [self.tk_out[token] for token in decoder_tokens]
        decoder_out_tensor = torch.tensor(decoder_out_indices)

        # Tokenizer padding
        encoder_tensor = F.pad(encoder_tensor, pad=(0, self.max_len - len(encoder_tensor)))
        decoder_in_tensor = F.pad(decoder_in_tensor, pad=(0, self.max_len - len(decoder_in_tensor)))
        decoder_out_tensor = F.pad(decoder_out_tensor, pad=(0, self.max_len - len(decoder_out_tensor)))

        return encoder_tensor, decoder_in_tensor, decoder_out_tensor

    def __len__(self):
        return len(self.encoder_in)

In [111]:
class Dataloader(DataLoader):
  def __init__(self, batch_size, dataset):
    self.dataset = dataset
    self.batch_size = batch_size
    self.total_points = self.dataset.encoder_in.shape[0]

  def __iter__(self):
    for i in range(len(self)):
      yield self.dataset[i]

  def __getitem__(self,i):
    start = i * self.batch_size
    stop = (i+1) * self.batch_size

    batch_enc = []
    batch_dec_input = []
    batch_dec_out = []

    for j in range(start, stop):
      a, b, c = self.dataset[j]
      batch_enc.append(a[0])
      batch_dec_input.append(b[0])
      batch_dec_out.append(c[0])

    batch_enc = torch.tensor(batch_enc)
    batch_dec_input = torch.tensor(batch_dec_input)
    batch_dec_out = torch.tensor(batch_dec_out)

    return [batch_enc, batch_dec_input], batch_dec_out

  def __len__(self):
    #return math.ceil(self.total_points / self.batch_size)
    return math.ceil(len(self.dataset) / self.batch_size)

In [112]:
# Train processed data
train_data = conv_dataset(df_train, tk_inp, tk_out, 35)
train_loader = DataLoader(batch_size=512, dataset=train_data, shuffle=True)

# Validation processed data
val_data = conv_dataset(df_val, tk_inp, tk_out, 35)
val_loader = DataLoader(batch_size=512, dataset=val_data, shuffle=True)

# Test processed data
test_data = conv_dataset(df_test, tk_inp, tk_out, 35)
test_loader = DataLoader(batch_size=512, dataset=test_data, shuffle=True)

To check that the vocabulary is correctly generated, we perform a small hand-made tokenisation of a phrase from the dataset and compare its tokens with respecto to the kyes of the dictionary containing the vocabulary, to see if the elements match in apossible assigment made from the PyTorch vocabulary. However, we have to perform a re-organization of the keys that match the values(indices) we are looking for with respect to the sample phrase, because the output of this search is not an organsed list.

In [23]:
# vocabularies to test
encoder_vocab = tk_inp.get_stoi()
decorer_in_vocab = tk_inp.get_stoi()
decoder_out_vocab = tk_out.get_stoi()
vocabs = [encoder_vocab, decorer_in_vocab, decoder_out_vocab]

# examples to test
sample_index = 2
vocab_ref = 2

# manually constructed tokens to test the vocabulary
the_phrase = df_train["dec_input"].iloc[sample_index]
the_splited_phrase = the_phrase.split()

# manually extracted indices for the sampled phrase
check_list_encoder = [encoder_vocab[word] for word in the_splited_phrase]
check_list_in_decoder = [decorer_in_vocab[word] for word in the_splited_phrase]
check_list_out_decoder = [decoder_out_vocab[word] for word in the_splited_phrase]

# sample from the generated vocabulary
sample = train_data[sample_index]

# Extract the keys that are present in the big dictionary and also in the values_list
filtered_keys = [key for key in vocabs[vocab_ref].keys() if vocabs[vocab_ref][key] in sample[vocab_ref].numpy()]

# Sort the filtered keys based on the order of values_list
values_list = list(sample[vocab_ref].numpy())
sorted_keys = sorted(filtered_keys, key=lambda x: values_list.index(vocabs[vocab_ref][x]))

print("*"*60)
print("Phrase to check =", the_phrase)
print("*"*60)
print("Phrase according to the encoder_vocab, dec_in_vocab and dec_out_vocab =")
print("manual encoder_view = ",check_list_encoder)
print("manual in_decoder_view = ",check_list_in_decoder)
print("manual out_decoder_view =",check_list_out_decoder)
print("*"*60)
print("Reconstructed phrase according to the selected vocabulary")
print(sorted_keys)
print("*"*60)
print("torch tensor numerization of selected phrase")
sample

************************************************************
Phrase to check = <start> actually i get up late every day <end>
************************************************************
Phrase according to the encoder_vocab, dec_in_vocab and dec_out_vocab =
manual encoder_view =  [2, 264, 3, 79, 78, 478, 141, 60, 1]
manual in_decoder_view =  [2, 264, 3, 79, 78, 478, 141, 60, 1]
manual out_decoder_view = [1, 266, 3, 84, 81, 477, 145, 61, 2]
************************************************************
Reconstructed phrase according to the selected vocabulary
['<start>', 'actually', 'i', 'get', 'up', 'late', 'every', 'day', '<end>', '<pad>']
************************************************************
torch tensor numerization of selected phrase


(tensor([264,   3,  79,  78, 478, 141,  60,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0]),
 tensor([  2, 264,   3,  79,  78, 478, 141,  60,   1,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0]),
 tensor([  1, 266,   3,  84,  81, 477, 145,  61,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0]))

In [91]:
encoder_vocab = tk_inp.get_stoi()
decorer_in_vocab = tk_inp.get_stoi()
decoder_out_vocab = tk_out.get_stoi()

print(encoder_vocab['<end>'])
print(decorer_in_vocab['<end>'])
print(decoder_out_vocab['<end>'])

1
1
2


#3)-Model Architecture

###3.1-Encoder - Decoder Layers

In [113]:
# Encoder Class

class Encoder(nn.Module):

  def __init__(self, vocab_size, embedding_dim, enc_units, input_len):
    super(Encoder, self).__init__()

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.enc_units = enc_units
    self.input_len = input_len

    # Embedding layer initialization
    self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)

    # Bidirectional LSTM layer initialization
    self.lstm_bi = nn.LSTM(self.embedding_dim, self.enc_units, bidirectional=True, batch_first=True)

  def forward(self, input):
    # Convert input to embedded vectors
    emb = self.embedding(input)

    # Passing through the Bidirectional LSTM layer
    enc_output, (state_h, state_c) = self.lstm_bi(emb)

    return enc_output, state_h, state_c

In [63]:
# Decoder Class

class Decoder(nn.Module):

  def __init__(self, vocab_size, embedding_dim, dec_unit, input_len):
    super(Decoder, self).__init__()

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dec_unit = dec_unit
    self.input_len = input_len

    # Embedding and LSTM layer initialization

    self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
    self.lstm = nn.LSTM(self.embedding_dim, self.dec_unit, batch_first=True)

  def forward(self, input, state):
    # Embedded vectors
    emb = self.embedding(input)
    # LSTM output
    dec_out, (state_h, state_c) = self.lstm(emb,state)

    return dec_out, state_h, state_c


###3.2-Model Architecture

In [114]:
# Creating the model

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size_in, vocab_size_out, embedding_dim, enc_units, dec_units, input_len):
        super(Seq2Seq, self).__init__()

        self.vocab_size_in = vocab_size_in
        self.vocab_size_out = vocab_size_out
        self.embedding_dim = embedding_dim
        self.enc_units = enc_units
        self.dec_units = dec_units
        self.input_len = input_len

        # Initialize the Encoder and Decoder inside the forward pass
        self.encoder = Encoder(self.vocab_size_in, self.embedding_dim, self.enc_units, self.input_len)
        self.decoder = Decoder(self.vocab_size_out, self.embedding_dim, self.dec_units, self.input_len)

        # Initializing the Dense Layer with Softmax activation
        self.dense = nn.Linear(dec_units, vocab_size_out)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, enc_input, dec_input):
        # Getting the Encoder output and states
        enc_output, enc_state_h, enc_state_c = self.encoder(enc_input)
        print(enc_output.shape)
        print(dec_input.shape)
        # Storing the Encoder states in a variable
        enc_state = [enc_state_h, enc_state_c]

        # Getting the Decoder output and states
        dec_output, _, _ = self.decoder(dec_input, enc_state)

        # Applying Softmax activation to the dense layer
        dense_output = self.softmax(self.dense(dec_output))

        return dense_output

# Model Initialization
vocab_size_in = len(tk_inp)
vocab_size_out = len(tk_out)
embedding_dim = 300
enc_units = 256
dec_units = 512
input_len = 35
model = Seq2Seq(vocab_size_in=vocab_size_in,
                vocab_size_out=vocab_size_out,
                embedding_dim=embedding_dim,
                enc_units=enc_units,
                dec_units=dec_units,
                input_len=input_len)


In [115]:
for batch_idx, (enc_input, dec_input, target) in enumerate(train_loader):
    # Print the shapes of input tensors
    print("Encoder Input Shape:", enc_input.shape)
    print("Decoder Input Shape:", dec_input.shape)
    print("Target Shape:", target.shape)

    # Pass the data through the model
    output = model(enc_input, dec_input)

    # Print the shape of the output
    print("Output Shape:", output.shape)

Encoder Input Shape: torch.Size([512, 35])
Decoder Input Shape: torch.Size([512, 35])
Target Shape: torch.Size([512, 35])
torch.Size([512, 35, 512])
torch.Size([512, 35])


IndexError: ignored

###3.3-Training Procedure

In [83]:
# Training Callbacks of the model

class Train_Callback:

  def __init__(self, model, train_dataloader, val_dataloader, checkpoint_path, log_path, patience=5, min_delta=0.0001):
    self.model = model
    self.train_dataloader = train_dataloader
    self.val_dataloader = val_dataloader
    self.checkpoint_path = checkpoint_path
    self.log_path = log_path
    self.patience = patience
    self.min_delta = min_delta

  def train(self, num_epochs, vocab_size_in, vocab_size_out, embedding_dim, enc_units, dec_units, input_len):

    # TensorBoard writer for logging
    writer = SummaryWriter(self.log_path)

    # Training and validation steps for one epoch
    train_steps = len(self.train_dataloader)
    val_steps = len(self.val_dataloader)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(self.model.parameters())

    # Initialization of early stoping variables
    best_val_los = float('inf')
    counter = 0

    # Training loop
    for epoch in range(num_epochs):
      self.model.train()
      total_loss = 0.0

      for batch_idx, (enc_input, dec_input, target) in enumerate(self.train_dataloader):
        # Zero the gradients
        optimizer.zero_grad()
        # Fordward propagation
        output = self.model(enc_input, dec_input)
        # Loss computation
        loss = criterion(output.view(-1, output.size(-1)), target.view(-1))
        # BackPropagation
        loss.backward()
        # Weight update
        optimizer.step()
        # Total loss accumulation
        total_loss +=loss.item()

      # Average training loss for the epoch
      average_train_loss = total_loss/train_steps

      # Validation loop
      self.model.eval()
      val_loss = 0.0

      with torch.no_grad():
        for batch_idx, (enc_input, dec_input, target) in enumerate(self.val_dataloader):
           # Fordward propagation
           output = self.model(enc_input, dec_input)
           # Loss term
           loss = criterion(output.view(-1, output.size(-1)), target.view(-1))
           # Loss accumulation
           val_loss += loss.item()
        # Validation average loss
        average_val_loss = val_loss / val_steps

        # Early stoping
        if average_val_loss - best_val_loss > self.min_delta:
          counter += 1
        else:
          counter = 0

        if counter >= self.patience:
          print("Early stopping reached. Training finished.")
          break

        # Save the best model
        if average_val_loss < best_val_loss:
          best_val_loss = average_val_loss
          torch.save(self.model.state_dict(), self.checkpoint_path)

        # Log the losses in TensorBoard
        writer.add_scalar("Loss/train", average_train_loss, epoch)
        writer.add_scalar("Loss/validation", average_val_loss, epoch)

      # Close the TensorBoard writer
      writer.close()




In [None]:
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Models/bidirectional_train.pth"
log_path = "/content/drive/MyDrive/Colab Notebooks/2)-Machine Learning/4)-NeuroMatch Academy -DL/NLP - Project/Models/bidirectional_train.pth/logs"
patience = 5

vocab_size_in = len(tk_inp)  # 70118
vocab_size_out = len(tk_out)  # 55474
embedding_dim = 300
enc_units = 256
dec_units = 512
input_len = 35

# Trainer instance initializatin
trainer = Train_Callback(model, train_loader, val_loader, checkpoint_path, log_path)
# Train the model
num_epochs = 10
trainer.train(num_epochs=num_epochs,
              vocab_size_in=vocab_size_in,
              vocab_size_out=vocab_size_out,
              embedding_dim=embedding_dim,
              enc_units=enc_units,
              dec_units=dec_units,
              input_len=input_len)

In [97]:
encoder = Encoder(vocab_size_in, embedding_dim, enc_units, input_len)

# Create some example data for the encoder input
# Note: The input should be a tensor with shape (batch_size, input_len)
# Here, we are assuming batch_size=2 for demonstration purposes
batch_size = 512
example_input = torch.randint(0, vocab_size_in, size=(batch_size, input_len))

# Pass the input through the encoder
enc_output, state_h, state_c = encoder(example_input)

# Print the shapes of the encoder output and states
print("Encoder Output Shape:", enc_output.shape)
print("State_h Shape:", state_h.shape)
print("State_c Shape:", state_c.shape)

Encoder Output Shape: torch.Size([512, 35, 512])
State_h Shape: torch.Size([2, 512, 256])
State_c Shape: torch.Size([2, 512, 256])


In [None]:
for batch_idx, (enc_input, dec_input, target) in enumerate(train_loader):
    # Pass the data through the model
    output = model(enc_input, dec_input)

    # Print the shape of the output
    print("Output Shape:", output.shape)

In [99]:
max_index_inp = max(tk_inp.get_stoi().values())
max_index_out = max(tk_out.get_stoi().values())
print("Maximum token index (encoder input):", max_index_inp)
print("Maximum token index (decoder input):", max_index_out)

Maximum token index (encoder input): 70117
Maximum token index (decoder input): 55473


In [98]:
max_index_enc_input = torch.max(enc_input)
max_index_dec_input = torch.max(dec_input)
max_index_target = torch.max(target)

print("Max Index Encoder Input:", max_index_enc_input)
print("Max Index Decoder Input:", max_index_dec_input)
print("Max Index Target:", max_index_target)

Max Index Encoder Input: tensor(70071)
Max Index Decoder Input: tensor(70071)
Max Index Target: tensor(55434)


#3.1-Grammar error detection

##3.2-Grammar error correction


#4)-Training

#5)-Evaluation

#6)-Inference
