<a href="https://colab.research.google.com/github/Adithyan-mp/Sequence_Model/blob/main/EncoderDecoderRnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [44]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
import torch.functional as F
from torch.optim import adam
from nltk import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

In [45]:
df = pd.read_csv('/content/eng_-french.csv')
df.tail()

Unnamed: 0,English words/sentences,French words/sentences
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
175620,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [46]:
print(df.info())
print(f" \n null value : {df.isna().sum()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB
None
 
 null value : English words/sentences    0
French words/sentences     0
dtype: int64


In [47]:
x=df['English words/sentences'][:500]
y=df['French words/sentences'][:500]
print(f"length of the feature {len(x)}")
print(f"length of the target {len(y)}")

length of the feature 500
length of the target 500


In [48]:
x_tokenized_list = [word_tokenize(word) for word in x]
print(x_tokenized_list[0:5])
y_tokenized_list = [word_tokenize(word) for word in y]
print(y_tokenized_list[0:5])

[['Hi', '.'], ['Run', '!'], ['Run', '!'], ['Who', '?'], ['Wow', '!']]
[['Salut', '!'], ['Cours', '!'], ['Courez', '!'], ['Qui', '?'], ['Ça', 'alors', '!']]


In [49]:
def collate_fn(batch):
    x_batch, y_batch = zip(*batch)  # Unzip list of tuples
    x_batch = [torch.tensor(x, dtype=torch.long) for x in x_batch]
    y_batch = [torch.tensor(y, dtype=torch.long) for y in y_batch]

    x_padded = pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_padded = pad_sequence(y_batch, batch_first=True, padding_value=0)

    return x_padded, y_padded

def get_vocab(x_tokenized_list,y_tokenized_list):

  x_tokenized = pd.Series([token for lists in x_tokenized_list for token in lists])
  y_tokenized = pd.Series([token for lists in y_tokenized_list for token in lists])
  count_x = x_tokenized.value_counts(ascending=False)
  count_y = y_tokenized.value_counts(ascending=False)
  source_vocab = {key:idx+4 for idx,(key,value) in enumerate(count_x.items()) if idx<100000}
  target_vocab = {key:idx+4 for idx,(key,value) in enumerate(count_y.items()) if idx<100000}
  source_vocab = {'<PAD>':0,'<EOS>':1,"<SOS>":2,"<UNK>":3,**source_vocab}
  target_vocab = {'<PAD>':0,'<EOS>':1,"<SOS>":2,"<UNK>":3,**target_vocab}

  return source_vocab,target_vocab

def get_dataset(source_vocab,target_vocab,x_tokenized_list,y_tokenized_list):
  x = []
  y = []

  for lists in x_tokenized_list :
    temp = []
    for token in lists:
      temp.append(source_vocab.get(token.lower(),source_vocab['<UNK>']))
    temp.append(source_vocab['<EOS>'])
    x.append(temp)

  for lists in y_tokenized_list :
    temp = []
    temp.append(target_vocab['<SOS>'])
    for token in lists:
      temp.append(target_vocab.get(token.lower(),target_vocab['<UNK>']))
    temp.append(target_vocab['<EOS>'])
    y.append(temp)
  return x,y

class CustomDataset(Dataset):
  def __init__(self,x,y,transform=None) -> None:
    super().__init__()
    self.x = x
    self.y = y
    self.transform = transform
  def __len__(self):
    return len(self.x)
  def __getitem__(self, index) :
    x_i,y_i = self.x[index],self.y[index]

    if self.transform:
      x_i = self.transform(x_i)
      y_i = self.transform(y_i)
    return x_i,y_i

class ToTensor:
  def __call__(self, array) :
    return torch.tensor(array,dtype=torch.long)

class Encoder(nn.Module):
    def __init__(self, vocab_size=100000, embedding_dim=300, hidden_size=100):
        super().__init__()
        self.embed_layer = nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    def forward(self, input):
        batch_size, _ = input.size()
        device = input.device  # ensure tensor compatibility
        embedded = self.embed_layer(input)
        h0 = torch.zeros((1, batch_size, 100), device=device)  # move to same device
        _, hn = self.rnn(embedded, h0)
        return hn  # shape: [1, batch_size, hidden]


class Decoder(nn.Module):
    def __init__(self, vocab_size=100000, embedding_dim=300, hidden_size=100):
        super().__init__()
        self.embed_layer = nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, hn_e, max_len=20, return_logits=False):
        batch_size = hn_e.size(1)
        device = hn_e.device
        input = torch.full((batch_size, 1), 2, dtype=torch.long, device=device)  # 2 = <SOS>
        outputs = []
        logits_list = []
        hn = hn_e

        for _ in range(max_len):
            # [batch_size, 1, embedding_dim]
            # out: [batch_size, 1, hidden]
            # logits: [batch_size, 1, vocab_size]
            # [batch_size, 1]

            embedded = self.embed_layer(input)
            out, hn = self.rnn(embedded, hn)
            logits = self.fc_out(out)
            pred = torch.argmax(logits, dim=2)

            logits_list.append(logits)
            outputs.append(pred)
            input = pred

            if torch.all(pred == 1):  # 1 = <EOS>
                break

        if return_logits:
            return torch.cat(logits_list, dim=1)  # shape: [batch_size, max_len, vocab_size]
        else:
            return torch.cat(outputs, dim=1)      # shape: [batch_size, max_len]


class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, input, return_logits=False):
        hn = self.encoder(input)
        output = self.decoder(hn, return_logits=return_logits)
        return output


In [50]:
# Example: Assuming you already have tokenized input/output lists
# x_tokenized_list = [...]
# y_tokenized_list = [...]

source_vocab, target_vocab = get_vocab(x_tokenized_list, y_tokenized_list)
x, y = get_dataset(source_vocab, target_vocab, x_tokenized_list, y_tokenized_list)

transform = ToTensor()
dataset = CustomDataset(x, y, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(target_vocab)

model = Model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore <PAD>


def train(model, dataloader, optimizer, loss_fn, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for x_batch, y_batch in dataloader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            output = model(x_batch, return_logits=True)  # [batch, seq_len, vocab]


            # Match shapes for loss
            if output.size(1) < y_batch.size(1):
                pad = torch.full((output.size(0), y_batch.size(1) - output.size(1),100000), 0, dtype=torch.float, device=device)
                output = torch.cat([output, pad], dim=1)
            elif output.size(1) > y_batch.size(1):
                output = output[:, :y_batch.size(1),:]
                # print(output.size())

            output = output.reshape(-1, output.size(-1))  # shape: [batch_size * sequence_len, vocab_size]
            y_batch = y_batch.view(-1)                    # [batch * seq_len]
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

train(model, dataloader, optimizer, loss_fn, num_epochs=10)




  x_batch = [torch.tensor(x, dtype=torch.long) for x in x_batch]
  y_batch = [torch.tensor(y, dtype=torch.long) for y in y_batch]


Epoch 1, Loss: 10.0941
Epoch 2, Loss: 7.4926
Epoch 3, Loss: 5.3778
Epoch 4, Loss: 5.3240
Epoch 5, Loss: 4.7210
Epoch 6, Loss: 4.8924
Epoch 7, Loss: 4.5330
Epoch 8, Loss: 4.4796
Epoch 9, Loss: 4.4701
Epoch 10, Loss: 4.4333
