# Laboratory: transformers for sentence english to spanish translation

Authors:
Diego Villacreses

## Load libraries

In [106]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import time
import os
import math

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import softmax

## lightning
import pytorch_lightning as pl

##
import numpy as np

import matplotlib.pyplot as plt

In [7]:
os.chdir('/home/dvillacreses/nlp')

In [8]:
device = torch.device('cuda')

## Data Loading

In [116]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 16

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1



def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

def filterPair(p):
    try:
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH #and \
#            p[0].startswith(eng_prefixes)
    except:
        print(p)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, file):
    text = open(file, encoding='utf-8').read().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')][:2] for l in text ]
    pairs = [pair for pair in pairs if len(pair) == 2]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    pairs = filterPairs(pairs)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

file_dir = 'spa.txt'
input_lang, output_lang, pairs = prepareData('eng', 'spa', file_dir)

Counted words:
eng 13526
spa 26437


In [117]:
from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))

print(take(10, input_lang.index2word.items()))
print(take(10, output_lang.index2word.items()))

[(0, 'SOS'), (1, 'EOS'), (2, 'go'), (3, 'hi'), (4, 'run'), (5, '!'), (6, 'who'), (7, '?'), (8, 'wow'), (9, 'duck')]
[(0, 'SOS'), (1, 'EOS'), (2, 've'), (3, 'vete'), (4, 'vaya'), (5, 'vayase'), (6, 'hola'), (7, 'corre'), (8, '!'), (9, 'corran')]


## Training step by step transformer

In [118]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
import math
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Define special tokens
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 16

# Positional Encoding function
def positional_encoding(max_len, hidden_dim):
    pe = torch.zeros(max_len, hidden_dim)
    for pos in range(max_len):
        for i in range(0, hidden_dim, 2):
            pe[pos, i] = math.sin(pos / (10000 ** (i / hidden_dim)))
            pe[pos, i + 1] = math.cos(pos / (10000 ** (i / hidden_dim)))
    return pe.unsqueeze(0)

# Multihead Attention implementation
class MultiheadAttention(nn.Module):
    def __init__(self, model_dim, num_heads):
        super(MultiheadAttention, self).__init__()
        assert model_dim % num_heads == 0, "model_dim must be divisible by num_heads"

        self.num_heads = num_heads
        self.head_dim = model_dim // num_heads

        # Linear layers to project input into Q, K, V
        self.q_proj = nn.Linear(model_dim, model_dim)
        self.k_proj = nn.Linear(model_dim, model_dim)
        self.v_proj = nn.Linear(model_dim, model_dim)
        self.out_proj = nn.Linear(model_dim, model_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Linear projections and reshape to split heads
        q = self.q_proj(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn_weights, v)

        # Concatenate heads and apply output projection
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)
        output = self.out_proj(context)
        return output

# Encoder block with multi-head attention and feedforward network
class EncoderBlock(nn.Module):
    def __init__(self, model_dim, num_heads, feedforward_dim, dropout=0.1):
        super(EncoderBlock, self).__init__()
        self.attention = MultiheadAttention(model_dim, num_heads)
        self.feedforward = nn.Sequential(
            nn.Linear(model_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, model_dim),
        )
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention block
        attn_out = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feedforward block
        ff_out = self.feedforward(x)
        x = self.norm2(x + self.dropout(ff_out))

        return x

# Decoder block with masked multi-head attention, encoder-decoder attention, and feedforward network
class DecoderBlock(nn.Module):
    def __init__(self, model_dim, num_heads, feedforward_dim, dropout=0.1):
        super(DecoderBlock, self).__init__()
        self.self_attention = MultiheadAttention(model_dim, num_heads)
        self.enc_dec_attention = MultiheadAttention(model_dim, num_heads)
        self.feedforward = nn.Sequential(
            nn.Linear(model_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, model_dim),
        )
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.norm3 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Masked self-attention for decoder
        self_attn_out = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_out))

        # Encoder-decoder attention
        enc_dec_attn_out = self.enc_dec_attention(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(enc_dec_attn_out))

        # Feedforward block
        ff_out = self.feedforward(x)
        x = self.norm3(x + self.dropout(ff_out))

        return x

# Transformer Encoder with multiple EncoderBlocks
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, model_dim, num_heads, feedforward_dim, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([EncoderBlock(model_dim, num_heads, feedforward_dim, dropout) for _ in range(num_layers)])

    def forward(self, src, mask=None):
        for layer in self.layers:
            src = layer(src, mask)
        return src

# Transformer Decoder with multiple DecoderBlocks
class TransformerDecoder(nn.Module):
    def __init__(self, num_layers, model_dim, num_heads, feedforward_dim, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.layers = nn.ModuleList([DecoderBlock(model_dim, num_heads, feedforward_dim, dropout) for _ in range(num_layers)])

    def forward(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        for layer in self.layers:
            tgt = layer(tgt, enc_output, src_mask, tgt_mask)
        return tgt

# Complete Transformer-based Translator model (encoder-decoder)
class TransformerTranslator(pl.LightningModule):
    def __init__(self, input_vocab_size, output_vocab_size, model_dim, num_heads, num_layers, feedforward_dim, dropout=0.1, lr=1e-4):
        super(TransformerTranslator, self).__init__()
        self.save_hyperparameters()

        self.model_dim = model_dim
        self.lr = lr

        # Embedding layers for source and target
        self.input_embedding = nn.Embedding(input_vocab_size, model_dim)
        self.output_embedding = nn.Embedding(output_vocab_size, model_dim)

        # Positional encoding
        self.positional_encoding = positional_encoding(MAX_LENGTH, model_dim)

        # Transformer encoder and decoder
        self.encoder = TransformerEncoder(num_layers, model_dim, num_heads, feedforward_dim, dropout)
        self.decoder = TransformerDecoder(num_layers, model_dim, num_heads, feedforward_dim, dropout)

        # Final output layer to predict tokens
        self.output_layer = nn.Linear(model_dim, output_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # Get the current device (either CPU or GPU)
        device = src.device

        # Apply embedding and positional encoding, move positional encoding to the same device as src/tgt
        src = self.input_embedding(src) + self.positional_encoding[:, :src.size(1), :].to(device)
        tgt = self.output_embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :].to(device)

        # Pass through encoder and decoder
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)

        # Output projection
        output = self.output_layer(dec_output)

        return output


    def training_step(self, batch, batch_idx):
        src, tgt = batch

        # Prepare input and target for decoder
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        # Forward pass
        output = self(src, tgt_input)

        # Compute loss
        loss = nn.CrossEntropyLoss()(output.reshape(-1, self.hparams.output_vocab_size), tgt_output.reshape(-1))
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    # Hook to print the average loss per epoch
    def on_train_epoch_end(self):
        avg_loss = self.trainer.callback_metrics["train_loss"].item()
        print(f"Epoch {self.current_epoch+1} training loss: {avg_loss}")


# Dataset preparation
def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Custom collate function for DataLoader
def collate_fn(batch):
    # Separate source and target sequences
    src_batch, tgt_batch = zip(*batch)

    # Pad the source and target sequences
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=0)  # padding_value=0 for padding tokens
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=0)

    return src_batch, tgt_batch

# Dummy Dataset (replace with your dataset)
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return tensorsFromPair(self.pairs[idx])

# Hyperparameters
hidden_dim = 256
nheads = 8
num_layers = 2
feedforward_dim = 512
dropout = 0.2
learning_rate = 0.001
batch_size = 64
n_epochs = 5

# Dummy Language Class for Mapping Words to Indexes and Vice Versa
class Lang:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
        self.n_words = 0  # Count of words

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

# Dummy data for demonstration
input_lang = Lang()
output_lang = Lang()

for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

input_size = input_lang.n_words
output_size = output_lang.n_words

# Create dataset and dataloader
train_dataset = TranslationDataset(pairs)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Initialize the model
model = TransformerTranslator(input_vocab_size=input_size, output_vocab_size=output_size,
                              model_dim=hidden_dim, num_heads=nheads, num_layers=num_layers,
                              feedforward_dim=feedforward_dim, dropout=dropout, lr=learning_rate)

# Initialize the PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=n_epochs)
trainer.fit(model, train_loader)


INFO:pytorch_lightning.utilities.rank_zero:Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name             | Type               | Params | Mode 
----------------------------------------------------------------
0 | input_embedding  | Embedding          | 3.5 M  | train
1 | output_embedding | Embedding          | 6.8 M  | train
2 | encoder          | TransformerEncoder | 1.1 M  | trai

Training: |          | 0/? [00:00<?, ?it/s]

Epoch 1 training loss: 0.5996176600456238
Epoch 2 training loss: 0.08028876036405563
Epoch 3 training loss: 0.027580171823501587
Epoch 4 training loss: 0.01449747383594513
Epoch 5 training loss: 0.011489653028547764


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [129]:
# Function to convert a sentence into a tensor of word indexes
def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).unsqueeze(0)  # Shape [1, seq_len]

# Function to convert word indexes back to a sentence
def sentenceFromTensor(lang, tensor):
    return ' '.join([lang.index2word[idx.item()] for idx in tensor])

# Function to translate a phrase using the trained model
def translate_sentence(model, input_sentence, input_lang, output_lang, max_length=MAX_LENGTH):
    # Prepare the input tensor
    input_tensor = tensorFromSentence(input_lang, input_sentence)

    # Initialize the target sentence with the SOS token
    decoder_input = torch.tensor([[SOS_token]], dtype=torch.long)

    # Move tensors to the same device as the model
    device = model.device
    input_tensor = input_tensor.to(device)
    decoder_input = decoder_input.to(device)

    # Run the encoder and get the encoder output
    with torch.no_grad():
        encoder_output = model.input_embedding(input_tensor) + model.positional_encoding[:, :input_tensor.size(1), :].to(device)
        encoder_output = model.encoder(encoder_output)

    decoded_words = []

    # Greedy decoding (one word at a time)
    for _ in range(max_length):
        with torch.no_grad():
            decoder_output = model.output_embedding(decoder_input) + model.positional_encoding[:, :decoder_input.size(1), :].to(device)
            transformer_output = model.decoder(decoder_output, encoder_output)
            output_logits = model.output_layer(transformer_output)

        # Get the most likely next word (greedy decoding)
        next_token = output_logits[:, -1].argmax(dim=1)
        next_word = output_lang.index2word[next_token.item()]

        # Stop if EOS token is generated
        if next_token.item() == EOS_token:
            break

        # Append word to the decoded sentence
        decoded_words.append(next_word)

        # Update the decoder input with the new word
        decoder_input = torch.cat((decoder_input, next_token.unsqueeze(0)), dim=1)

    return ' '.join(decoded_words)

# Example phrases for translation
for phrase in ['hello', 'she is my sister', 'i am cleaning my house', 'i m scared', 'what is my name ?']:
    input_sentence = phrase
    output_sentence = translate_sentence(model, input_sentence, input_lang, output_lang)
    print(f"Translated sentence from '{input_sentence}' to: '{output_sentence}'")


Translated sentence from 'hello' to: 've ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve'
Translated sentence from 'she is my sister' to: 've ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve'
Translated sentence from 'i am cleaning my house' to: 've ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve'
Translated sentence from 'i m scared' to: 've ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve'
Translated sentence from 'what is my name ?' to: 've ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve'


## traning with nn.Transformer

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
import math
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Define special tokens
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 16

# Positional Encoding function
def positional_encoding(max_len, hidden_dim):
    pe = torch.zeros(max_len, hidden_dim)
    for pos in range(max_len):
        for i in range(0, hidden_dim, 2):
            pe[pos, i] = math.sin(pos / (10000 ** (i / hidden_dim)))
            pe[pos, i + 1] = math.cos(pos / (10000 ** (i / hidden_dim)))
    return pe.unsqueeze(0)

# Transformer-based Translator model
class TransformerTranslator(pl.LightningModule):
    def __init__(self, input_vocab_size, output_vocab_size, model_dim, num_heads, num_layers, dropout=0.1, lr=1e-4):
        super(TransformerTranslator, self).__init__()
        self.save_hyperparameters()

        self.model_dim = model_dim
        self.lr = lr

        # Embedding layers for source and target
        self.input_embedding = nn.Embedding(input_vocab_size, model_dim)
        self.output_embedding = nn.Embedding(output_vocab_size, model_dim)

        # Positional encoding
        self.positional_encoding = positional_encoding(MAX_LENGTH, model_dim)

        # Transformer model
        self.transformer = nn.Transformer(d_model=model_dim, nhead=num_heads, num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers, dropout=dropout, batch_first=True)

        # Final output layer to predict tokens
        self.output_layer = nn.Linear(model_dim, output_vocab_size)

    def forward(self, src, tgt):
        # Get the current device (either CPU or GPU)
        device = src.device

        # Apply embedding and positional encoding, move positional encoding to the same device as src/tgt
        src = self.input_embedding(src) + self.positional_encoding[:, :src.size(1), :].to(device)
        tgt = self.output_embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :].to(device)

        # Pass through transformer and output layer
        output = self.transformer(src, tgt)
        output = self.output_layer(output)

        return output

    def training_step(self, batch, batch_idx):
        src, tgt = batch

        # Prepare input and target for decoder
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        # Forward pass
        output = self(src, tgt_input)

        # Compute loss using reshape instead of view
        loss = nn.CrossEntropyLoss()(output.reshape(-1, self.hparams.output_vocab_size), tgt_output.reshape(-1))

        # Log batch loss
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)

        return loss


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    # Hook to print the average loss per epoch
    def on_train_epoch_end(self):
        avg_loss = self.trainer.callback_metrics["train_loss"].item()
        print(f"Epoch {self.current_epoch+1} training loss: {avg_loss}")

# Dummy Language Class for Mapping Words to Indexes and Vice Versa
class Lang:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}
        self.n_words = 0  # Count of words

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

# Create input and output language objects
input_lang = Lang()
output_lang = Lang()


# Populate the language objects with the example data
for pair in pairs:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

# Dataset preparation
def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long)  # No need to add batch dimension here

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# Custom Dataset for translation
class TranslationDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return tensorsFromPair(self.pairs[idx])

# Custom collate function for padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    # Pad sequences to ensure uniform length
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=0)  # Padding token = 0
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=0)

    return src_batch, tgt_batch

# Hyperparameters
hidden_dim = 256
nheads = 8
num_layers = 2
dropout = 0.2
learning_rate = 0.001
batch_size = 64
n_epochs = 5


input_size = input_lang.n_words
output_size = output_lang.n_words

# Create dataset and dataloader
train_dataset = TranslationDataset(pairs)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Initialize the model
model = TransformerTranslator(input_vocab_size=input_size, output_vocab_size=output_size,
                              model_dim=hidden_dim, num_heads=nheads, num_layers=num_layers, dropout=dropout, lr=learning_rate)

# Initialize the PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=n_epochs)  # Set gpus=0 to use CPU, adjust if GPU is available
trainer.fit(model, train_loader)


INFO:pytorch_lightning.utilities.rank_zero:Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name             | Type        | Params | Mode 
---------------------------------------------------------
0 | input_embedding  | Embedding   | 3.5 M  | train
1 | output_embedding | Embedding   | 6.8 M  | train
2 | transformer      | Transformer | 5.8 M  | train
3 | output_layer     | Linear    

Training: |          | 0/? [00:00<?, ?it/s]

Epoch 1 training loss: 0.7338055968284607
Epoch 2 training loss: 0.117411307990551
Epoch 3 training loss: 0.04921085014939308
Epoch 4 training loss: 0.027806900441646576
Epoch 5 training loss: 0.02332763560116291


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [105]:
# Function to convert a sentence into a tensor of word indexes
def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).unsqueeze(0)  # Shape [1, seq_len]

# Function to convert word indexes back to a sentence
def sentenceFromTensor(lang, tensor):
    return ' '.join([lang.index2word[idx.item()] for idx in tensor])

# Function to translate a phrase using the trained model
def translate_sentence(model, input_sentence, input_lang, output_lang, max_length=MAX_LENGTH):
    # Prepare the input tensor
    input_tensor = tensorFromSentence(input_lang, input_sentence)

    # Initialize the target sentence with the SOS token
    decoder_input = torch.tensor([[SOS_token]], dtype=torch.long)

    # Move tensors to the same device as the model
    input_tensor = input_tensor.to(model.device)
    decoder_input = decoder_input.to(model.device)

    # Run the encoder and get the encoder output
    with torch.no_grad():
        encoder_output = model.input_embedding(input_tensor) + model.positional_encoding[:, :input_tensor.size(1), :]
        encoder_output = model.transformer.encoder(encoder_output)

    decoded_words = []

    # Greedy decoding (one word at a time)
    for _ in range(max_length):
        with torch.no_grad():
            decoder_output = model.output_embedding(decoder_input) + model.positional_encoding[:, :decoder_input.size(1), :]
            transformer_output = model.transformer.decoder(
                decoder_output, encoder_output
            )
            output_logits = model.output_layer(transformer_output)

        # Get the most likely next word (greedy decoding)
        next_token = output_logits[:, -1].argmax(dim=1)
        next_word = output_lang.index2word[next_token.item()]

        # Stop if EOS token is generated
        if next_token.item() == EOS_token:
            break

        # Append word to the decoded sentence
        decoded_words.append(next_word)

        # Update the decoder input with the new word
        decoder_input = torch.cat((decoder_input, next_token.unsqueeze(0)), dim=1)

    return ' '.join(decoded_words)


for phrase in ['hello','she is my sister','i am cleaning my house','i m scared','what is my name ?']:
    input_sentence = phrase
    output_sentence = translate_sentence(model, input_sentence, input_lang, output_lang)
    print(f"Translated sentence from {input_sentence} to: {output_sentence}")


Translated sentence from hello to: ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve
Translated sentence from she is my sister to: ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve
Translated sentence from i am cleaning my house to: ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve
Translated sentence from i m scared to: repruebas repruebas repruebas repruebas repruebas repruebas repruebas repruebas repruebas repruebas repruebas
Translated sentence from what is my name ? to: ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve ve
