In [None]:
import math
import time
import spacy
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader



In [None]:
import os
import torch
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset, random_split

# Function to read your custom dataset
def read_telugu_english_data(file_path):
    """Load sentence pairs from a text file holding one delimited pair per line.

    Args:
        file_path: path to a UTF-8 text file. Every non-blank line must contain
            two sentences separated by the literal delimiter ``++++$++++``.

    Returns:
        A list of ``(first_sentence, second_sentence)`` tuples in file order.
        Leading/trailing whitespace of each line is stripped before splitting.
        NOTE(review): the preview cell later in this notebook prints English
        first and Telugu second, so "first" is presumably English -- confirm.

    Raises:
        ValueError: if a non-blank line does not split into exactly two parts.
    """
    delimiter = '++++$++++'
    raw_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no pair;
                # skipping them avoids an unpacking error on an otherwise valid file.
                continue
            parts = line.split(delimiter)
            if len(parts) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number} has {len(parts)} field(s), "
                    f"expected 2 separated by {delimiter!r}"
                )
            raw_data.append((parts[0], parts[1]))
    return raw_data

# Splitting the dataset
def split_dataset(data, train_split=0.7, val_split=0.15, test_split=0.15, generator=None):
    """Randomly partition ``data`` into train / validation / test lists.

    Args:
        data: sized, indexable collection of examples.
        train_split: fraction of examples assigned to the training set.
        val_split: fraction of examples assigned to the validation set.
        test_split: accepted for interface compatibility only; the test set
            always receives whatever is left after the train and validation
            sets have been sized, so no example is dropped by rounding.
        generator: optional ``torch.Generator`` forwarded to ``random_split``
            so the partition can be reproduced. When ``None`` the global torch
            RNG is used, exactly as before.

    Returns:
        ``(train, val, test)`` as three plain lists of examples.

    Raises:
        ValueError: if the fractions produce a negative size for any subset.
    """
    total_size = len(data)
    train_size = int(total_size * train_split)
    val_size = int(total_size * val_split)
    test_size = total_size - train_size - val_size
    if train_size < 0 or val_size < 0 or test_size < 0:
        raise ValueError(
            f"invalid split fractions train={train_split}, val={val_split}: "
            f"computed sizes ({train_size}, {val_size}, {test_size}) for {total_size} examples"
        )

    # Only forward the generator when one was supplied so the default call path
    # is identical to a plain random_split(dataset, lengths).
    split_kwargs = {} if generator is None else {"generator": generator}
    train_data, remaining_data = random_split(
        data, [train_size, total_size - train_size], **split_kwargs)
    val_data, test_data = random_split(
        remaining_data, [val_size, test_size], **split_kwargs)
    return list(train_data), list(val_data), list(test_data)

# Custom Dataset class
class CustomDataset(Dataset):
    """Map-style dataset that wraps an in-memory sequence of (source, target) pairs."""

    def __init__(self, data):
        # The sequence is stored by reference; no copy is made.
        self.data = data

    def __len__(self):
        """Number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair stored at position ``idx``."""
        return self.data[idx]

    def get_raw_texts(self):
        """Return all pairs as a new list of 2-tuples, in storage order."""
        pairs = []
        for src, trg in self.data:
            pairs.append((src, trg))
        return pairs

# Define tokenizers. Both are torchtext's 'basic_english' tokenizer.
# NOTE(review): generate_batch applies tokenizer_te to the first sentence of each
# pair and tokenizer_en to the second. The preview output later in this notebook
# prints English first and Telugu second, so the two names look swapped relative
# to the data -- confirm against the file before swapping in a Telugu tokenizer.
tokenizer_te = get_tokenizer('basic_english')  # Replace with a suitable tokenizer for Telugu
tokenizer_en = get_tokenizer('basic_english')  # Suitable tokenizer for English

# Build vocabulary function
def build_vocabulary(tokenizer, dataset, min_freq=2, column=None):
    """Build a torchtext vocabulary from the sentence pairs of ``dataset``.

    Args:
        tokenizer: callable mapping a sentence string to a list of tokens.
        dataset: object exposing ``get_raw_texts()`` that returns an iterable
            of ``(src, trg)`` sentence pairs.
        min_freq: minimum token frequency, forwarded to
            ``build_vocab_from_iterator``.
        column: which sentence of each pair contributes tokens. ``None``
            (default, the historical behaviour) uses both sentences, which
            yields one joint vocabulary; ``0`` uses only the first sentence and
            ``1`` only the second, giving a language-specific vocabulary.

    Returns:
        A vocabulary whose first four entries are ``<unk>``, ``<pad>``,
        ``<bos>``, ``<eos>`` and whose default index is that of ``<unk>``.

    Raises:
        ValueError: if ``column`` is not ``None``, ``0`` or ``1``.
    """
    if column not in (None, 0, 1):
        raise ValueError(f"column must be None, 0 or 1, got {column!r}")

    def yield_tokens(data):
        for src, trg in data:
            if column is None or column == 0:
                yield tokenizer(src)
            if column is None or column == 1:
                yield tokenizer(trg)

    vocab = build_vocab_from_iterator(yield_tokens(dataset.get_raw_texts()), specials=["<unk>", "<pad>", "<bos>", "<eos>"], min_freq=min_freq)
    vocab.set_default_index(vocab['<unk>'])  # Out-of-vocabulary tokens map to <unk>
    return vocab

# Read the dataset
# NOTE(review): '/content/...' is an absolute path (looks like a Colab mount) --
# adjust when running elsewhere.
file_path = '/content/english_telugu_data.txt'
raw_data = read_telugu_english_data(file_path)
# Default 70 / 15 / 15 split. No RNG seed is set in the visible cells, so the
# partition presumably differs between kernel runs.
train_data_raw, val_data_raw, test_data_raw = split_dataset(raw_data)

# Create datasets
train_dataset = CustomDataset(train_data_raw)
valid_dataset = CustomDataset(val_data_raw)
test_dataset = CustomDataset(test_data_raw)

# Load vocabularies (from the training split only).
# NOTE(review): build_vocabulary is called here without restricting it to one
# side of the pair, so it tokenizes both sentences of every pair; with the same
# tokenizer on both calls, vocab_src and vocab_trg end up holding the same tokens.
vocab_src = build_vocabulary(tokenizer_te, train_dataset)
vocab_trg = build_vocabulary(tokenizer_en, train_dataset)

# Batch generation function
def generate_batch(data_batch, max_len=20):
    """Collate raw sentence pairs into two fixed-width index tensors.

    Each sentence is tokenized, mapped to vocabulary indices, wrapped in
    ``<bos>`` / ``<eos>`` and right-padded with ``<pad>`` to ``max_len``.
    Sentences that are too long are truncated *before* the markers are added,
    so every row still ends its content with ``<eos>`` (previously the tail,
    including ``<eos>``, was cropped off).

    Args:
        data_batch: iterable of ``(source_text, target_text)`` string pairs.
        max_len: width of every output row, markers included (default 20,
            the value this function always used). Must be at least 2.

    Returns:
        ``(src_batch, trg_batch)``: two ``torch.long`` tensors of shape
        ``(len(data_batch), max_len)`` on CUDA when available, else on CPU.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError(f"max_len must be >= 2 to hold <bos> and <eos>, got {max_len}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def encode(text, tokenizer, vocab):
        # Reserve two slots for <bos>/<eos>, then pad the remainder.
        indices = [vocab[token] for token in tokenizer(text)][:max_len - 2]
        row = [vocab['<bos>']] + indices + [vocab['<eos>']]
        return row + [vocab['<pad>']] * (max_len - len(row))

    src_rows, trg_rows = [], []
    for src_text, trg_text in data_batch:
        src_rows.append(encode(src_text, tokenizer_te, vocab_src))
        trg_rows.append(encode(trg_text, tokenizer_en, vocab_trg))

    # Build each batch on the CPU and transfer it once instead of moving every
    # sentence to the device separately.
    src_batch = torch.tensor(src_rows, dtype=torch.long).to(device)
    trg_batch = torch.tensor(trg_rows, dtype=torch.long).to(device)
    return src_batch, trg_batch


# DataLoader setup
BATCH_SIZE = 128
MAX_PADDING = 20  # informational: generate_batch pads to a fixed length of 20 on its own

# Training batches are shuffled and the ragged last batch is dropped.
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batch)
# Evaluation must see every example exactly once: no shuffling and no dropped
# tail. With drop_last=True a split smaller than BATCH_SIZE yields zero batches,
# which makes the per-batch average in evaluate() divide by zero.
valid_iter = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=generate_batch)
test_iter = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=generate_batch)

# Special-token indices of the target vocabulary.
BOS_IDX = vocab_trg['<bos>']
EOS_IDX = vocab_trg['<eos>']
PAD_IDX = vocab_trg['<pad>']


# The Embedding Layer


In [None]:
class Embeddings(nn.Module):
  """Token-embedding lookup whose output is scaled by sqrt(d_model)."""

  def __init__(self, vocab_size: int, d_model: int):
    """
    Args:
      vocab_size:    number of rows in the embedding table
      d_model:       width of every embedding vector
    """
    super().__init__()
    self.lut = nn.Embedding(vocab_size, d_model)
    self.d_model = d_model

  def forward(self, x):
    """
    Args:
      x:        integer token indices, e.g. (batch_size, seq_length)

    Returns:
      the looked-up embeddings multiplied by sqrt(d_model); one trailing
      dimension of size d_model is appended to the shape of x
    """
    scale = math.sqrt(self.d_model)
    looked_up = self.lut(x)
    return looked_up * scale

# Positional Encoding


In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to a batch of embeddings."""

  def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000,
               base: float = 100.0):
    """
    Args:
      d_model:     dimension of embeddings
      dropout:     probability of zeroing an element of the output
      max_length:  number of positions in the precomputed table
      base:        base of the wavelength progression; position k, pair i uses
                   the angle k / base**(2i / d_model). Defaults to 100, the
                   value this notebook has always used.
                   NOTE(review): the original Transformer paper uses 10000 --
                   confirm whether 100 is intentional before changing it.
    """
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)

    # Number of complete (sin, cos) column pairs; with an odd d_model the last
    # column stays zero.
    n_pairs = d_model // 2

    # Angles are computed in float64 and then stored in the default-dtype
    # table, matching the precision of a scalar math.sin/math.cos loop.
    positions = torch.arange(max_length, dtype=torch.float64).unsqueeze(1)     # (max_length, 1)
    exponents = 2 * torch.arange(n_pairs, dtype=torch.float64) / d_model       # (n_pairs,)
    divisors = torch.pow(torch.tensor(float(base), dtype=torch.float64), exponents)
    theta = positions / divisors                                                # (max_length, n_pairs)

    pe = torch.zeros(max_length, d_model)
    pe[:, 0:2 * n_pairs:2] = torch.sin(theta).to(pe.dtype)   # even columns
    pe[:, 1:2 * n_pairs:2] = torch.cos(theta).to(pe.dtype)   # odd columns

    # Register exactly once, and unconditionally: a buffer follows .to()/.cuda()
    # and is saved in the state_dict, but is not a trainable parameter.
    self.register_buffer("pe", pe)

  def forward(self, x):
    """
    Args:
      x:        embeddings (batch_size, seq_length, d_model); seq_length must
                not exceed max_length

    Returns:
      embeddings + positional encodings, after dropout
      (batch_size, seq_length, d_model)
    """
    # The (seq_length, d_model) slice broadcasts over the batch dimension.
    x = x + self.pe[:x.size(1)]
    return self.dropout(x)


# Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
  """Scaled dot-product attention computed in parallel over several heads."""

  def __init__(self, d_model, n_heads, dropout: float = 0.1):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of attention heads; must divide d_model
      dropout:      probability of dropout applied to the attention weights

    Raises:
      ValueError:   if n_heads is not positive or does not divide d_model.
                    Without this check the head split in forward() either
                    fails or silently reshapes into the wrong sequence length.
    """
    super().__init__()
    if n_heads <= 0 or d_model % n_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})")

    self.d_model = d_model
    self.n_heads = n_heads
    self.d_key = d_model // n_heads

    # query, key, value and output projections
    self.Wq = nn.Linear(d_model, d_model)
    self.Wk = nn.Linear(d_model, d_model)
    self.Wv = nn.Linear(d_model, d_model)
    self.Wo = nn.Linear(d_model, d_model)

    self.dropout = nn.Dropout(p = dropout)

  def _split_heads(self, x, batch_size):
    """(batch, length, d_model) -> (batch, n_heads, length, d_key)."""
    return x.view(batch_size, -1, self.n_heads, self.d_key).permute(0, 2, 1, 3)

  def forward(self, query, key, value, mask = None):
    """
    Args:
      query:    query vectors (batch_size, q_length, d_model)
      key:      key vectors   (batch_size, k_length, d_model)
      value:    value vectors (batch_size, k_length, d_model)
      mask:     optional tensor broadcastable to
                (batch_size, n_heads, q_length, k_length); positions where it
                equals 0 are excluded from attention

    Returns:
      output:      attention output (batch_size, q_length, d_model)
      attn_probs:  softmax weights before dropout
                   (batch_size, n_heads, q_length, k_length)
    """
    batch_size = query.shape[0]

    # project, then split the model dimension into heads
    Q = self._split_heads(self.Wq(query), batch_size)
    K = self._split_heads(self.Wk(key), batch_size)
    V = self._split_heads(self.Wv(value), batch_size)

    # scaled dot product: Q K^T / sqrt(d_key) -> (batch, n_heads, q_length, k_length)
    scaled_dot_prod = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.d_key)

    # a large negative score makes masked positions vanish after the softmax
    if mask is not None:
      scaled_dot_prod = scaled_dot_prod.masked_fill(mask == 0, -1e10)

    attn_probs = torch.softmax(scaled_dot_prod, dim = -1)

    # weighted sum of the values; dropout acts on the weights only
    A = torch.matmul(self.dropout(attn_probs), V)

    # merge the heads back: (batch, n_heads, q_length, d_key) -> (batch, q_length, d_model)
    A = A.permute(0, 2, 1, 3).contiguous()
    A = A.view(batch_size, -1, self.n_heads * self.d_key)

    output = self.Wo(A)

    return output, attn_probs



# Position-Wise Feed Forward Network (FFN)


In [None]:
class PositionwiseFeedForward(nn.Module):
  """Two-layer feed-forward network applied independently at every position."""

  def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      d_model:      input and output width
      d_ffn:        width of the hidden layer
      dropout:      dropout probability applied to the hidden activations
    """
    super().__init__()
    self.linear_layer_1 = nn.Linear(d_model, d_ffn)
    self.linear_layer_2 = nn.Linear(d_ffn, d_model)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, x):
    """
    Args:
      x:        tensor whose last dimension is d_model,
                e.g. (batch_size, seq_length, d_model)

    Returns:
      tensor of the same shape: expand to d_ffn, ReLU, dropout, contract to d_model
    """
    hidden = torch.relu(self.linear_layer_1(x))
    hidden = self.dropout(hidden)
    return self.linear_layer_2(hidden)



# The Encoder

In [None]:
class EncoderLayer(nn.Module):
  """One encoder block: self-attention and a feed-forward network, each followed
  by dropout, a residual connection and layer normalisation."""

  def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of attention heads
      d_ffn:        hidden width of the feed-forward network
      dropout:      dropout probability
    """
    super().__init__()
    # Registration order is kept stable: it fixes parameter and state_dict order.
    self.attention = MultiHeadAttention(d_model, n_heads, dropout)
    self.attn_layer_norm = nn.LayerNorm(d_model)
    self.positionwise_fnn = PositionwiseFeedForward(d_model, d_ffn, dropout)
    self.fnn_layer_norm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def _residual(self, norm, residual, sublayer_out):
    """Apply dropout to the sub-layer output, add the residual, then normalise."""
    return norm(residual + self.dropout(sublayer_out))

  def forward(self, src, src_mask):
    """
    Args:
      src:      positionally embedded sequences (batch_size, seq_length, d_model)
      src_mask: mask for the sequences (batch_size, 1, 1, seq_length)

    Returns:
      src:        sequences after the block (batch_size, seq_length, d_model)
      attn_probs: self-attention weights returned by the attention module
    """
    attended, attn_probs = self.attention(src, src, src, src_mask)
    src = self._residual(self.attn_layer_norm, src, attended)

    transformed = self.positionwise_fnn(src)
    src = self._residual(self.fnn_layer_norm, src, transformed)

    return src, attn_probs


class Encoder(nn.Module):
  """Stack of ``n_layers`` EncoderLayer blocks applied in sequence."""

  def __init__(self, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      d_model:      dimension of embeddings
      n_layers:     number of encoder layers
      n_heads:      number of heads
      d_ffn:        dimension of feed-forward network
      dropout:      probability of dropout occurring
    """
    super().__init__()

    # create n_layers encoder blocks
    self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])

    # Not used in forward(); kept so the module's attributes and repr are unchanged.
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_mask):
    """
    Args:
      src:      positionally embedded sequences (batch_size, seq_length, d_model)
      src_mask: mask for the sequences (batch_size, 1, 1, seq_length)

    Returns:
      src:      sequences after every encoder layer (batch_size, seq_length, d_model)

    Side effect:
      stores the last layer's attention weights in self.attn_probs
      (None when the stack has no layers).
    """
    # Initialised up front so a stack with zero layers does not hit an unbound name.
    attn_probs = None

    # pass the sequence through each encoder layer in order
    for layer in self.layers:
      src, attn_probs = layer(src, src_mask)

    self.attn_probs = attn_probs
    return src


# The Decoder

In [None]:
class DecoderLayer(nn.Module):
  """One decoder block: masked self-attention, attention over the encoder output
  and a feed-forward network, each followed by dropout, a residual connection
  and layer normalisation."""

  def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of attention heads
      d_ffn:        hidden width of the feed-forward network
      dropout:      dropout probability
    """
    super().__init__()
    # Registration order is kept stable: it fixes parameter and state_dict order.
    self.masked_attention = MultiHeadAttention(d_model, n_heads, dropout)
    self.masked_attn_layer_norm = nn.LayerNorm(d_model)
    self.attention = MultiHeadAttention(d_model, n_heads, dropout)
    self.attn_layer_norm = nn.LayerNorm(d_model)
    self.positionwise_fnn = PositionwiseFeedForward(d_model, d_ffn, dropout)
    self.fnn_layer_norm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def _residual(self, norm, residual, sublayer_out):
    """Apply dropout to the sub-layer output, add the residual, then normalise."""
    return norm(residual + self.dropout(sublayer_out))

  def forward(self, trg, src, trg_mask, src_mask):
    """
    Args:
      trg:          embedded target sequences (batch_size, trg_seq_length, d_model)
      src:          encoder output (batch_size, src_seq_length, d_model)
      trg_mask:     target mask (batch_size, 1, trg_seq_length, trg_seq_length)
      src_mask:     source mask (batch_size, 1, 1, src_seq_length)

    Returns:
      trg:          sequences after the block (batch_size, trg_seq_length, d_model)
      attn_probs:   weights of the second attention (target queries over source
                    keys); the masked self-attention weights are discarded
    """
    # 1) masked self-attention over the target
    self_attended, _ = self.masked_attention(trg, trg, trg, trg_mask)
    trg = self._residual(self.masked_attn_layer_norm, trg, self_attended)

    # 2) attention from the target onto the encoder output
    cross_attended, attn_probs = self.attention(trg, src, src, src_mask)
    trg = self._residual(self.attn_layer_norm, trg, cross_attended)

    # 3) position-wise feed-forward network
    transformed = self.positionwise_fnn(trg)
    trg = self._residual(self.fnn_layer_norm, trg, transformed)

    return trg, attn_probs

class Decoder(nn.Module):
  """Stack of ``n_layers`` DecoderLayer blocks followed by a projection to the
  target vocabulary."""

  def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      vocab_size:     size of the target vocabulary
      d_model:        dimension of embeddings
      n_layers:       number of decoder layers
      n_heads:        number of heads
      d_ffn:          dimension of feed-forward network
      dropout:        probability of dropout occurring
    """
    super().__init__()

    # create n_layers decoder blocks
    self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])

    # Not used in forward(); kept so the module's attributes and repr are unchanged.
    self.dropout = nn.Dropout(dropout)

    # output projection: d_model -> vocabulary logits
    self.Wo = nn.Linear(d_model, vocab_size)

  def forward(self, trg, src, trg_mask, src_mask):
      """
      Args:
        trg:          embedded target sequences (batch_size, trg_seq_length, d_model)
        src:          encoder output (batch_size, src_seq_length, d_model)
        trg_mask:     target mask (batch_size, 1, trg_seq_length, trg_seq_length)
        src_mask:     source mask (batch_size, 1, 1, src_seq_length)

      Returns:
        output:       unnormalised logits (batch_size, trg_seq_length, vocab_size)

      Side effect:
        stores the last layer's returned attention weights in self.attn_probs
        (None when the stack has no layers).
      """
      # Initialised up front so a stack with zero layers does not hit an unbound name.
      attn_probs = None

      # pass the sequences through each decoder layer in order
      for layer in self.layers:
        trg, attn_probs = layer(trg, src, trg_mask, src_mask)

      self.attn_probs = attn_probs
      return self.Wo(trg)



# The Transformer

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder wrapper that builds the padding / causal masks and runs
  the embedding, encoder and decoder modules it is given."""

  def __init__(self, encoder: nn.Module, decoder: nn.Module, src_embed: nn.Module,
               trg_embed: nn.Module, src_pad_idx: int, trg_pad_idx: int, device):
    """
    Args:
      encoder:        encoder stack, called as encoder(embedded_src, src_mask)
      decoder:        decoder stack, called as
                      decoder(embedded_trg, encoded_src, trg_mask, src_mask)
      src_embed:      module mapping source indices to embeddings (make_model
                      passes an nn.Sequential of embeddings + positional encoding)
      trg_embed:      module mapping target indices to embeddings
      src_pad_idx:    padding index of the source vocabulary
      trg_pad_idx:    padding index of the target vocabulary
      device:         stored as self.device for callers; masks are created on
                      the device of the input tensors, not on this one
    """
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.trg_embed = trg_embed
    self.device = device
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx

  def make_src_mask(self, src):
    """
    Args:
      src:        raw sequence with padding     (batch_size, seq_length)

    Returns:
      src_mask:   boolean mask, True on non-padding tokens
                  (batch_size, 1, 1, seq_length)
    """
    # the two singleton dimensions broadcast over heads and query positions
    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    return src_mask

  def make_trg_mask(self, trg):
    """
    Args:
      trg:        raw sequence with padding     (batch_size, seq_length)

    Returns:
      trg_mask:   boolean mask, True where a query position may attend to a
                  key position: the key is not padding and is not in the future
                  (batch_size, 1, seq_length, seq_length)
    """
    seq_length = trg.shape[1]

    # True on non-padding key positions: (batch_size, 1, 1, seq_length)
    trg_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

    # Lower-triangular "no peeking ahead" mask. It is built on the input's own
    # device so it always matches trg, even if the module was moved after
    # construction or self.device names a different device.
    trg_sub_mask = torch.tril(torch.ones((seq_length, seq_length), device = trg.device)).bool()

    # combine the padding and causal masks
    trg_mask = trg_mask & trg_sub_mask
    return trg_mask

  def forward(self, src, trg):
    """
    Args:
      src:        raw source sequences (batch_size, src_seq_length)
      trg:        raw target sequences (batch_size, trg_seq_length)

    Returns:
      output:     whatever the decoder returns; with the Decoder defined in this
                  notebook, logits of shape (batch_size, trg_seq_length, vocab_size)
    """
    # create source and target masks
    src_mask = self.make_src_mask(src)    # (batch_size, 1, 1, src_seq_length)
    trg_mask = self.make_trg_mask(trg)    # (batch_size, 1, trg_seq_length, trg_seq_length)

    # embed and encode the source
    src = self.encoder(self.src_embed(src), src_mask)   # (batch_size, src_seq_length, d_model)

    # embed the target and decode against the encoded source
    output = self.decoder(self.trg_embed(trg), src, trg_mask, src_mask)

    return output


# Generating a Model

In [None]:
def make_model(device, src_vocab, trg_vocab, n_layers: int = 3, d_model: int = 256,
               d_ffn: int = 2048, n_heads: int = 8, dropout: float = 0.1,
               max_length: int = 5000):
  """
    Assemble a Transformer from hyperparameters and two vocabularies.

    Args:
      device:         stored on the returned model; the model is NOT moved to it here
      src_vocab:      source vocabulary (needs len() and get_stoi())
      trg_vocab:      target vocabulary (needs len() and get_stoi())
      n_layers:       number of encoder layers and of decoder layers
      d_model:        dimension of embeddings
      d_ffn:          dimension of feed-forward network
      n_heads:        number of heads
      dropout:        probability of dropout occurring
      max_length:     maximum sequence length for positional encodings

    Returns:
      Transformer whose parameters with more than one dimension have been
      re-initialised with Xavier/Glorot uniform
  """
  # Sub-modules are created in a fixed order so random initialisation consumes
  # the RNG in the same sequence every time.
  encoder_stack = Encoder(d_model, n_layers, n_heads, d_ffn, dropout)
  decoder_stack = Decoder(len(trg_vocab), d_model, n_layers, n_heads, d_ffn, dropout)
  src_lookup = Embeddings(len(src_vocab), d_model)
  trg_lookup = Embeddings(len(trg_vocab), d_model)
  positional = PositionalEncoding(d_model, dropout, max_length)

  # A single positional-encoding module is shared by both embedding pipelines.
  src_pipeline = nn.Sequential(src_lookup, positional)
  trg_pipeline = nn.Sequential(trg_lookup, positional)
  src_pad = src_vocab.get_stoi()["<pad>"]
  trg_pad = trg_vocab.get_stoi()["<pad>"]

  model = Transformer(encoder_stack, decoder_stack, src_pipeline, trg_pipeline,
                      src_pad_idx = src_pad,
                      trg_pad_idx = trg_pad,
                      device = device)

  # Xavier/Glorot for weight matrices; 1-D parameters (biases, layer-norm
  # scales) keep their default initialisation.
  weight_matrices = (param for param in model.parameters() if param.dim() > 1)
  for weight in weight_matrices:
    nn.init.xavier_uniform_(weight)

  return model



## Pre-processing the data



In [None]:
# Preview the first few sentence pairs of the raw file.
PREVIEW_PAIRS = 4
raw = []
with open('/content/english_telugu_data.txt', 'r', encoding='utf-8') as file:
  for line in file:
    if len(raw) == PREVIEW_PAIRS:
      break
    # The printed preview shows English first and Telugu second on every line.
    english_sentence, telugu_sentence = line.strip().split('++++$++++')
    raw.append((english_sentence, telugu_sentence))

print(raw)


[('His legs are long.', 'అతని కాళ్ళు పొడవుగా ఉన్నాయి.'), ('Who taught Tom how to speak French?', 'టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?'), ('I swim in the sea every day.', 'నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.'), ('Tom popped into the supermarket on his way home to buy some milk.', 'టామ్ కొంచెం పాలు కొనడానికి ఇంటికి వెళ్ళేటప్పుడు సూపర్ మార్కెట్లోకి ప్రవేశించాడు.')]


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = make_model(device, vocab_src, vocab_trg,
                   n_layers=3, n_heads=8, d_model=256,
                   d_ffn=512, max_length=50)

# Move the model to the device selected above. An unconditional model.cuda()
# raises on a machine without a GPU even though `device` already falls back to
# the CPU. .to() returns the model, so the cell still displays its structure.
model.to(device)


Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (attention): MultiHeadAttention(
          (Wq): Linear(in_features=256, out_features=256, bias=True)
          (Wk): Linear(in_features=256, out_features=256, bias=True)
          (Wv): Linear(in_features=256, out_features=256, bias=True)
          (Wo): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (positionwise_fnn): PositionwiseFeedForward(
          (linear_layer_1): Linear(in_features=256, out_features=512, bias=True)
          (linear_layer_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (fnn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, 

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 23,393,984 trainable parameters


In [None]:
# Fixed learning rate; no scheduler or warm-up is configured in this cell.
LEARNING_RATE = 0.0005

# Adam over every model parameter.
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# Targets equal to PAD_IDX are ignored, so padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
  """Run one training epoch with teacher forcing.

  Args:
    model:      callable as model(src, trg_input) returning logits whose last
                dimension is the vocabulary size
    iterator:   sized iterable of (src, trg) batches
    optimizer:  optimizer over the model's parameters
    criterion:  loss taking (flattened logits, flattened target indices)
    clip:       maximum gradient norm passed to clip_grad_norm_

  Returns:
    mean of the per-batch losses (a Python float)
  """
  model.train()

  running_loss = 0

  for src, trg in iterator:
    optimizer.zero_grad()

    # The decoder input drops the last token and the expected output drops the
    # first, so position t is trained to predict token t + 1.
    decoder_input = trg[:, :-1]
    expected_output = trg[:, 1:]

    logits = model(src, decoder_input)

    # flatten to (batch * positions, vocab) and (batch * positions,)
    flat_logits = logits.contiguous().view(-1, logits.shape[-1])
    flat_targets = expected_output.contiguous().view(-1)
    loss = criterion(flat_logits, flat_targets)

    loss.backward()

    # clip the gradient norm (not the weights) before the update
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()

    running_loss += loss.item()

  # average over the number of batches
  return running_loss / len(iterator)

In [None]:

def evaluate(model, iterator, criterion):
  """Compute the mean per-batch loss without updating the model.

  Args:
    model:      callable as model(src, trg_input) returning logits whose last
                dimension is the vocabulary size
    iterator:   sized iterable of (src, trg) batches
    criterion:  loss taking (flattened logits, flattened target indices)

  Returns:
    mean of the per-batch losses (a Python float)
  """
  # evaluation mode disables dropout
  model.eval()

  running_loss = 0

  # no gradients are needed for evaluation
  with torch.no_grad():
    for src, trg in iterator:
      # same one-token shift between decoder input and expected output as in training
      decoder_input = trg[:, :-1]
      expected_output = trg[:, 1:]

      logits = model(src, decoder_input)

      flat_logits = logits.contiguous().view(-1, logits.shape[-1])
      flat_targets = expected_output.contiguous().view(-1)
      batch_loss = criterion(flat_logits, flat_targets)

      running_loss += batch_loss.item()

  # average over the number of batches
  return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
  """Split the interval between two timestamps (in seconds) into whole minutes
  and whole remaining seconds, both truncated toward zero.

  Returns:
    (minutes, seconds) as a tuple of ints
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover = duration - (whole_minutes * 60)
  return whole_minutes, int(leftover)

In [None]:
!nvidia-smi

Mon Aug 12 16:45:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0              31W /  70W |    197MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Number of passes over the training data and the gradient-norm clipping threshold.
N_EPOCHS =10
CLIP = 1

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

# loop through each epoch
for epoch in range(N_EPOCHS):

  start_time = time.time()

  # calculate the train loss and update the parameters
  train_loss = train(model, train_iter, optimizer, criterion, CLIP)

  # calculate the loss on the validation set
  valid_loss = evaluate(model, valid_iter, criterion)

  end_time = time.time()

  # calculate how long the epoch took (training and validation together)
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # checkpoint only the weights (state_dict) whenever the validation loss improves
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'transformer-model_tel.pt')

  # perplexity is reported as exp(loss)
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 30s
	Train Loss: 4.020 | Train PPL:  55.702
	 Val. Loss: 2.700 |  Val. PPL:  14.884
Epoch: 02 | Time: 1m 30s
	Train Loss: 2.369 | Train PPL:  10.689
	 Val. Loss: 1.763 |  Val. PPL:   5.830
Epoch: 03 | Time: 1m 31s
	Train Loss: 1.613 | Train PPL:   5.019
	 Val. Loss: 1.354 |  Val. PPL:   3.871
Epoch: 04 | Time: 1m 31s
	Train Loss: 1.185 | Train PPL:   3.272
	 Val. Loss: 1.118 |  Val. PPL:   3.058
Epoch: 05 | Time: 1m 30s
	Train Loss: 0.932 | Train PPL:   2.539
	 Val. Loss: 1.023 |  Val. PPL:   2.781
Epoch: 06 | Time: 1m 30s
	Train Loss: 0.767 | Train PPL:   2.154
	 Val. Loss: 0.950 |  Val. PPL:   2.586
Epoch: 07 | Time: 1m 30s
	Train Loss: 0.656 | Train PPL:   1.926
	 Val. Loss: 0.916 |  Val. PPL:   2.500
Epoch: 08 | Time: 1m 30s
	Train Loss: 0.575 | Train PPL:   1.777
	 Val. Loss: 0.903 |  Val. PPL:   2.468
Epoch: 09 | Time: 1m 31s
	Train Loss: 0.513 | Train PPL:   1.671
	 Val. Loss: 0.891 |  Val. PPL:   2.437
Epoch: 10 | Time: 1m 31s
	Train Loss: 0.467 | Train PPL

In [None]:
# Load the best checkpoint written during training. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on a GPU can
# also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('transformer-model_tel.pt', map_location=device))

# calculate the loss on the test set
test_loss = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')

Test Loss: 0.881 | Test PPL:   2.414


In [None]:
import torch

def translate_sentence(sentence, model, device, vocab_src, vocab_trg, tokenizer_te, max_length=50):
    """Greedily decode a translation for a single sentence.

    Args:
        sentence: raw string (tokenized with ``tokenizer_te`` and lower-cased)
            or an already tokenized sequence of source tokens (used as given).
        model: callable as ``model(src, trg)`` returning logits of shape
            ``(1, trg_length, target_vocab_size)``.
        device: device on which the index tensors are created.
        vocab_src: source vocabulary; must support ``token in vocab`` and
            ``vocab[token]``.
        vocab_trg: target vocabulary; must support ``vocab[token]`` and
            ``lookup_token(index)``.
        tokenizer_te: tokenizer applied to a string ``sentence``.
        max_length: bound on the decoded sequence including ``<bos>``, so at
            most ``max_length - 1`` tokens are generated.

    Returns:
        ``(src_tokens, trg_tokens)``: the source tokens including
        ``<bos>``/``<eos>``, and the generated target tokens without
        ``<bos>`` and without ``<eos>``. Always a tuple, also when
        ``max_length`` leaves no room to generate anything.
    """
    model.eval()

    if isinstance(sentence, str):
        source_tokens = [token.lower() for token in tokenizer_te(sentence)]
    else:
        source_tokens = list(sentence)
    src = ['<bos>'] + source_tokens + ['<eos>']

    # Tokens missing from the source vocabulary map to <unk>.
    unk_index = vocab_src['<unk>']
    src_indexes = [vocab_src[token] if token in vocab_src else unk_index for token in src]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(0)

    eos_index = vocab_trg['<eos>']
    trg_indexes = [vocab_trg['<bos>']]

    with torch.no_grad():
        # Each step feeds everything generated so far and keeps only the
        # prediction for the last position.
        for _ in range(max_length - 1):
            trg_tensor = torch.tensor(trg_indexes, dtype=torch.long, device=device).unsqueeze(0)
            output = model(src_tensor, trg_tensor)
            pred_token = output[:, -1].argmax(-1).item()

            if pred_token == eos_index:
                break
            trg_indexes.append(pred_token)

    # Skip the leading <bos> when converting indices back to tokens.
    trg_tokens = [vocab_trg.lookup_token(index) for index in trg_indexes[1:]]
    return src, trg_tokens

# Example usage
src_text = "how was it?"
# Keep `device` a torch.device, as where the model was created, and fall back
# to the CPU when no GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
src, trg_tokens = translate_sentence(src_text, model, device, vocab_src, vocab_trg, tokenizer_te)
print(f'source --> {" ".join(src[1:-1])}')
print(f'target translation --> {" ".join(trg_tokens)}')


source --> how was it ?
target translation --> అది ఎలా ఉంది ?


In [None]:
# Translate a second example; src[1:-1] strips the <bos>/<eos> markers for display.
src_text = "The technology is increasing rapidly"
src, trg_tokens = translate_sentence(src_text, model, device, vocab_src, vocab_trg, tokenizer_te)
print(f'source --> {" ".join(src[1:-1])}')
print(f'target translation --> {" ".join(trg_tokens)}')

source --> the technology is increasing rapidly
target translation --> సాంకేతిక సాంకేతిక శక్తి వేగంగా పెరుగుతోంది .


In [None]:
# Translate a third example; src[1:-1] strips the <bos>/<eos> markers for display.
src_text = "i like to learn new things"
src, trg_tokens = translate_sentence(src_text, model, device, vocab_src, vocab_trg, tokenizer_te)
print(f'source --> {" ".join(src[1:-1])}')
print(f'target translation --> {" ".join(trg_tokens)}')

source --> i like to learn new things
target translation --> నేను క్రొత్త పనులు నేర్చుకోవడం ఇష్టం .
