<a href="https://colab.research.google.com/github/DrMiracle/Colab-Projects/blob/main/NLP/NLP_lab_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the RUCAIBox/Style-Transfer dataset repository from the Hugging Face Hub
# into ./Style-Transfer (shell escape; runs in the notebook's working directory).
!git clone https://huggingface.co/datasets/RUCAIBox/Style-Transfer

Cloning into 'Style-Transfer'...
remote: Enumerating objects: 11, done.[K
remote: Total 11 (delta 0), reused 0 (delta 0), pack-reused 11 (from 1)[K
Unpacking objects: 100% (11/11), 1.62 KiB | 332.00 KiB/s, done.


In [None]:
# Unpack the two GYAFC archives shipped in the cloned repository.
# Each one extracts to its own directory (gyafc_em/, gyafc_fr/) holding
# train/valid/test files with .src and .tgt suffixes.
!tar -xvzf Style-Transfer/gyafc_em.tgz
!tar -xvzf Style-Transfer/gyafc_fr.tgz

gyafc_em/
gyafc_em/valid.src
gyafc_em/test.tgt
gyafc_em/train.tgt
gyafc_em/train.src
gyafc_em/.DS_Store
gyafc_em/valid.tgt
gyafc_em/test.src
gyafc_fr/
gyafc_fr/valid.src
gyafc_fr/test.tgt
gyafc_fr/train.tgt
gyafc_fr/train.src
gyafc_fr/.DS_Store
gyafc_fr/valid.tgt
gyafc_fr/test.src


In [None]:
# Show the first line of the gyafc_fr training source and target files.
# readline() keeps the trailing newline, so each printed sentence is followed
# by an empty line.
for heading, path in (("Input:", "gyafc_fr/train.src"), ("Target:", "gyafc_fr/train.tgt")):
  with open(path, "r") as f:
    print(heading)
    print(f.readline())

Input:
Sure, it's ok, but I always have let the guy ask me.

Target:
I prefer to let the guy ask me.



In [None]:
# NOTE(review): torchtext is not imported anywhere in the visible cells of this
# notebook — confirm whether this install is still needed.
!pip install torchtext

In [None]:
import ast
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.translate.bleu_score import corpus_bleu
from tokenizers.processors import TemplateProcessing
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer

In [None]:
# Load the pretrained "t5-small" tokenizer and register "<s>" as its
# beginning-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained("t5-small", bos_token = "<s>")

# Replace the post-processor so every encoded sequence is wrapped as
# "<s> ... </s>" (and each half of a pair likewise).
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[("</s>", tokenizer.eos_token_id), ("<s>", tokenizer.bos_token_id)]
)

# Ids of the special tokens used by the model and the loss.
BOS_INDEX = tokenizer.bos_token_id
PAD_INDEX = tokenizer.pad_token_id
EOS_INDEX = tokenizer.eos_token_id

# Size of the embedding / output layers. len(tokenizer) counts the base
# vocabulary plus every added token, so it stays correct if more special tokens
# are registered later (the previous `vocab_size + 1` hard-coded exactly one).
VOCAB_SIZE = len(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Hyperparameters shared by the cells below.
HIDDEN_SIZE = 512       # hidden size passed to both Encoder and Decoder
LEARNING_RATE = 0.001   # learning rate of the two Adam optimizers
NUM_EPOCHS = 10         # number of train/validate rounds in the training loop
BATCH_SIZE = 64         # batch size of every DataLoader
MAX_LENGTH = 50         # tokenizer pad/truncate length; also the number of decoding steps

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def read_dataframe(source, target, train = True):
    """Read a parallel source/target file pair into a DataFrame.

    The two files are read line by line in lockstep (stopping at the shorter
    one). Lines are stored as read, i.e. including their trailing newline.

    Args:
        source: path of the file with one input sentence per line.
        target: path of the file with one target per line.
        train: when True the target line is kept as a plain string. When False
            each target line is parsed as a Python literal (e.g. a list of
            reference sentences).

    Returns:
        pandas.DataFrame with the columns "source" and "target".

    Raises:
        ValueError, SyntaxError: if ``train`` is False and a target line is not
            a valid Python literal.
    """
    # literal_eval only accepts literals, so a crafted data file cannot execute
    # arbitrary code the way eval() would.
    parse_target = (lambda line: line) if train else ast.literal_eval

    with open(source, "r", encoding="utf-8") as src, \
         open(target, "r", encoding="utf-8") as tgt:
        records = [
            {"source": s, "target": parse_target(t)}
            for s, t in zip(src, tgt)
        ]

    # Naming the columns keeps the schema stable even when the files are empty.
    return pd.DataFrame(records, columns=["source", "target"])

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes the "source" and "target" columns of a DataFrame.

    Every item is a pair ``(input_ids, target_ids)`` padded/truncated to
    ``MAX_LENGTH`` tokens. When a target cell holds a list of several strings,
    ``target_ids`` keeps one row per string.

    Args:
        dataframe: DataFrame with "source" and "target" columns.
        tokenizer: callable tokenizer returning a mapping with "input_ids".
        train: stored on the instance; not consulted by this class.
    """

    def __init__(self, dataframe, tokenizer, train = True):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.train = train

    def __len__(self):
        return len(self.dataframe)

    def _encode(self, text):
        """Tokenize ``text`` and return its ids with a leading batch dimension."""
        return self.tokenizer(text,
                              max_length=MAX_LENGTH,
                              padding="max_length",
                              truncation=True,
                              return_tensors="pt")["input_ids"]

    def __getitem__(self, idx):
        # Positional lookup: works for any DataFrame index (e.g. after the
        # frame was filtered or sampled), not only the default 0..n-1 labels.
        source = self.dataframe["source"].iloc[idx]
        target = self.dataframe["target"].iloc[idx]

        # squeeze(0) drops only the batch dimension added by the tokenizer for
        # a single string; a multi-row target is left untouched.
        input_ids = self._encode(source).squeeze(0)
        target_ids = self._encode(target).squeeze(0)
        return input_ids, target_ids

class Encoder(nn.Module):
    """Embeds token ids and feeds them through a batch-first GRU.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: embedding width and GRU hidden width.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-modules are created in the same order as before so that seeded
        # initialisation and parameter ordering are unchanged.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Return the GRU's ``(output, hidden)`` pair for a batch of token ids."""
        token_vectors = self.embedding(input)
        return self.gru(self.dropout(token_vectors))

class Attention(nn.Module):
    """Additive attention: scores are ``Va(tanh(Wa(query) + Ua(keys)))``.

    Args:
        hidden_size: feature width of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` for ``query`` attending over ``keys``."""
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Va leaves one score per key in a trailing singleton dimension; drop
        # it and add a dimension at position 1 so the weights can be
        # batch-multiplied with the keys.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)

        # Normalise the scores over the keys.
        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

class Decoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: embedding width and GRU hidden width.
        output_size: number of rows in the embedding table and width of the
            output projection.
        dropout_p: dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size)
        # The GRU input is the token embedding concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps starting from the BOS token.

        Args:
            encoder_outputs: encoder outputs that are attended over.
            encoder_hidden: encoder hidden state, used as the initial decoder
                hidden state.
            target_tensor: optional target ids; when given, column ``i`` is fed
                as the input of step ``i + 1`` (teacher forcing). Otherwise the
                decoder feeds back its own highest-scoring token.

        Returns:
            Tuple ``(log_probs, hidden, attentions)``: per-step log-softmax
            outputs concatenated along dim 1, the final hidden state, and the
            per-step attention weights concatenated along dim 1.
        """
        batch_size = encoder_outputs.size(0)
        # Create the start tokens on the same device as the encoder outputs so
        # the decoder does not rely on a module-level `device` matching the
        # device the inputs actually live on.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(BOS_INDEX)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target token as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # No teacher forcing: feed back the model's own best guess.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step; returns ``(logits, hidden, attn_weights)``."""
        embedded =  self.dropout(self.embedding(input))

        # Swap the first two dimensions of the hidden state to form the
        # attention query (presumably (layers, batch, hidden) -> (batch, layers,
        # hidden) — confirm against the GRU configuration).
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights


In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Train the encoder/decoder for one pass over ``dataloader``.

    The decoder is called with the target batch, i.e. with teacher forcing.

    Args:
        dataloader: iterable of ``(input, target)`` id batches.
        encoder, decoder: the two model halves.
        encoder_optimizer, decoder_optimizer: one optimizer per model half.
        criterion: loss applied to the flattened decoder outputs and targets.

    Returns:
        The mean of the per-batch losses.
    """
    # Make sure dropout is active even if an earlier evaluation step left the
    # models in eval mode.
    encoder.train()
    decoder.train()

    total_loss = 0
    for input, target in tqdm(dataloader):
        input, target = input.to(device), target.to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target)

        # Flatten to (tokens, vocab) vs (tokens,) for the token-level loss.
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
def evaluate_bleu(dataloader, encoder, decoder):
    """Compute word-level corpus BLEU of greedy decodes over ``dataloader``.

    Each decoded sequence (hypothesis and references alike) is cut at its first
    end-of-sequence token, stripped of special tokens, and split on whitespace
    before scoring. Passing raw strings to ``corpus_bleu`` would score
    characters, and padding / post-EOS tokens would distort the n-gram counts.
    A single corpus-level BLEU is computed over all batches instead of
    averaging per-batch scores, so a short final batch is not over-weighted.

    The models are switched to eval mode (dropout off) for the duration of the
    call and restored to their previous mode afterwards.

    Args:
        dataloader: iterable of ``(input, target)`` id batches. ``target`` may
            hold several references per example (3-D) or a single one (2-D).
        encoder, decoder: the two model halves.

    Returns:
        Corpus BLEU as a float; ``0.0`` when the dataloader yields nothing.
    """
    eos_id = tokenizer.eos_token_id

    def to_words(ids):
        """Decode one id sequence into a list of whitespace-separated words."""
        ids = ids.tolist()
        if eos_id in ids:
            ids = ids[:ids.index(eos_id)]
        return tokenizer.decode(ids, skip_special_tokens=True).split()

    all_references = []
    all_hypotheses = []

    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for input, target in tqdm(dataloader):
                input, target = input.to(device), target.to(device)

                encoder_outputs, encoder_hidden = encoder(input)
                decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

                _, topi = decoder_outputs.topk(1)
                decoded_ids = topi.reshape(input.shape[0], -1)

                # Treat a single-reference target batch as one reference each.
                if target.dim() == 2:
                    target = target.unsqueeze(1)

                all_hypotheses.extend(to_words(ids) for ids in decoded_ids)
                all_references.extend(
                    [to_words(ref) for ref in refs] for refs in target
                )
    finally:
        # Leave the models in whatever mode the caller had them in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    if not all_hypotheses:
        return 0.0
    return corpus_bleu(all_references, all_hypotheses)

In [None]:
# --- Data: the gyafc_em splits. Validation/test targets are parsed as Python
# literals (train = False); training targets stay plain strings.
train_df = read_dataframe("gyafc_em/train.src", "gyafc_em/train.tgt")
valid_df = read_dataframe("gyafc_em/valid.src", "gyafc_em/valid.tgt", train = False)
test_df = read_dataframe("gyafc_em/test.src", "gyafc_em/test.tgt", train = False)

train_dataset = TextDataset(train_df, tokenizer)
valid_dataset = TextDataset(valid_df, tokenizer)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: encoder first, then decoder, each with its own Adam optimizer.
encoder = Encoder(VOCAB_SIZE, HIDDEN_SIZE).to(device)
decoder = Decoder(HIDDEN_SIZE, VOCAB_SIZE).to(device)

en_optimizer = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
de_optimizer = optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

# Padding positions do not contribute to the loss.
criterion = nn.NLLLoss(ignore_index=PAD_INDEX)

# --- Training loop: one training pass and one validation BLEU per epoch; the
# per-epoch values are kept for the plots below.
losses, bleus = [], []
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(train_loader, encoder, decoder, en_optimizer, de_optimizer, criterion)
    bleu = evaluate_bleu(valid_loader, encoder, decoder)

    losses.append(train_loss)
    bleus.append(bleu)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Bleu: {bleu:.4f}")

100%|██████████| 411/411 [04:44<00:00,  1.45it/s]
100%|██████████| 23/23 [00:14<00:00,  1.62it/s]


Epoch 1/10, Train Loss: 4.6641, Bleu: 0.1197


100%|██████████| 411/411 [04:44<00:00,  1.45it/s]
100%|██████████| 23/23 [00:13<00:00,  1.65it/s]


Epoch 2/10, Train Loss: 3.4419, Bleu: 0.1512


100%|██████████| 411/411 [04:45<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.58it/s]


Epoch 3/10, Train Loss: 2.8031, Bleu: 0.1746


100%|██████████| 411/411 [04:46<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.62it/s]


Epoch 4/10, Train Loss: 2.3385, Bleu: 0.1865


100%|██████████| 411/411 [04:44<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.59it/s]


Epoch 5/10, Train Loss: 1.9859, Bleu: 0.1921


100%|██████████| 411/411 [04:45<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.61it/s]


Epoch 6/10, Train Loss: 1.7179, Bleu: 0.1998


100%|██████████| 411/411 [04:44<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.58it/s]


Epoch 7/10, Train Loss: 1.5145, Bleu: 0.1968


100%|██████████| 411/411 [04:44<00:00,  1.44it/s]
100%|██████████| 23/23 [00:14<00:00,  1.60it/s]


Epoch 8/10, Train Loss: 1.3551, Bleu: 0.2047


100%|██████████| 411/411 [04:44<00:00,  1.45it/s]
100%|██████████| 23/23 [00:14<00:00,  1.61it/s]


Epoch 9/10, Train Loss: 1.2293, Bleu: 0.2058


100%|██████████| 411/411 [04:44<00:00,  1.45it/s]
100%|██████████| 23/23 [00:14<00:00,  1.60it/s]

Epoch 10/10, Train Loss: 1.1247, Bleu: 0.2013





In [None]:
# Training loss per epoch (x axis is the 0-based epoch index).
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(losses, marker='o', linestyle='-', color='b', label='Loss')
ax.set_title("Loss Curve", fontsize=14)
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Loss", fontsize=12)
ax.legend(fontsize=10)
plt.show()

In [None]:
# Validation BLEU per epoch (x axis is the 0-based epoch index).
plt.figure(figsize=(8, 5))
# The legend entry names the plotted series: this curve is BLEU, not the loss.
plt.plot(bleus, marker='o', linestyle='-', color='b', label='Bleu')
plt.title("Bleu Curve", fontsize=14)
plt.xlabel("Epoch", fontsize=12)
plt.ylabel("Bleu", fontsize=12)
plt.legend(fontsize=10)
plt.show()

In [None]:
# Score the trained model on the held-out test split.
test_dataset = TextDataset(test_df, tokenizer)
# The loader must wrap the test dataset; wrapping valid_dataset here would
# report the validation score a second time under the name "test".
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

evaluate_bleu(test_loader, encoder, decoder)

In [None]:
def evaluate(encoder, decoder, sentence):
    """Greedily decode a single sentence.

    The models are switched to eval mode (dropout off) while decoding and
    restored to their previous mode afterwards.

    Args:
        encoder, decoder: the two model halves.
        sentence: raw input string.

    Returns:
        Tuple ``(decoded_words, decoder_attn)``: the decoded tokens as strings,
        ending with '</s>' if the end-of-sequence token was produced, and the
        attention weights returned by the decoder.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            # Tokenizing a one-element list already yields a batch of size one.
            input_tensor = tokenizer(
                [sentence],
                max_length=MAX_LENGTH,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )["input_ids"].to(device)

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

            _, topi = decoder_outputs.topk(1)
            decoded_ids = topi.squeeze()

            decoded_words = []
            for idx in decoded_ids:
                # Stop at the first end-of-sequence token, keeping its marker.
                if idx.item() == EOS_INDEX:
                    decoded_words.append('</s>')
                    break
                decoded_words.append(tokenizer.decode(idx))
    finally:
        # Leave the models in whatever mode the caller had them in.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly sampled test examples next to the model's output.

    For each sample the source is printed after '>', the reference target(s)
    after '=', and the space-joined decoded tokens after '<'.
    """
    for _ in range(n):
        sampled = test_df.sample()
        source = sampled['source'].iloc[0]
        reference = sampled['target'].iloc[0]

        # The source still carries its trailing newline, hence end=''.
        print('>', source, end='')
        print('=', reference)

        output_words, _attn = evaluate(encoder, decoder, source)
        print('<', ' '.join(output_words))
        print('')

In [None]:
# Print three randomly sampled test examples together with the model's output.
evaluateRandomly(encoder, decoder, n=3)

> u culd try researching it on google or try bestbuy and blockbuster
= ['You could try researching it on Google.  You could also try Best Buy and Blockbuster.', 'Take a look at either Google, Best Buy, or Blockbuster.', 'You might consider a research attempt on Google or explore Best Buy and Blockbuster.', 'You could research it on Google, or try a store like Best Buy or Blockbuster.']
< <s> You could try downloading it on Google or Soul ter from Block bus ter . </s>

> and the blonde in the boat says, this is, like, a sea of wheat, duh.
= ['The blonde who was in the boat stated that it was "like a sea of wheat".', 'The blonde in the boat calls this a sea of wheat.', 'The blonde in the boat then replies, "This is a sea of wheat!"', 'And the reply from the blonde in the boat is "this is like a sea of wheat."']
< <s> The blonde in the boat , and this is the one who keep in the boat ,  a picture of  a rifle . </s>

> name your favorite movie actress or actor
= ['Name your favorite movie a