In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field
from torchtext.data.dataset import TabularDataset
from torchtext.data.iterator import BucketIterator, Iterator
import numpy as np
import spacy
from tqdm import tqdm
import pandas as pd
import spacy
import random

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
import codecs
import re

class Tokenizer:
    '''class for tokenizer'''

    def __init__(self, text=None):
        if text is not None:
            self.text = text  # Removed .decode('utf-8') as it's unnecessary in Python 3
            self.clean_text()
        else:
            self.text = None
        self.sentences = []
        self.tokens = []
        self.stemmed_word = []
        self.final_list = []

    def read_from_file(self, filename):
        f = codecs.open(filename, encoding='utf-8')
        self.text = f.read()
        self.clean_text()

    def generate_sentences(self):
        '''generates a list of sentences'''
        text = self.text
        self.sentences = text.split(u"।")

    def clean_text(self):
        '''cleans up the text by removing unwanted characters'''
        text = self.text
        text = re.sub(r'(\d+)', r'', text)
        text = text.replace(u',', '')
        text = text.replace(u'"', '')
        text = text.replace(u'(', '')
        text = text.replace(u')', '')
        text = text.replace(u':', '')
        text = text.replace(u"'", '')
        text = text.replace(u"‘‘", '')
        text = text.replace(u"’’", '')
        text = text.replace(u"''", '')
        text = text.replace(u".", '')
        self.text = text

    def remove_only_space_words(self):
        self.tokens = list(filter(lambda tok: tok.strip(), self.tokens))

    def hyphenated_tokens(self):
        for each in self.tokens:
            if '-' in each:
                tok = each.split('-')
                self.tokens.remove(each)
                self.tokens.extend(tok)

    def tokenize(self):
        '''tokenizes the text into words'''
        if not self.sentences:
            self.generate_sentences()

        sentences_list = self.sentences
        tokens = []
        for each in sentences_list:
            word_list = each.split(' ')
            tokens += word_list
        self.tokens = tokens
        self.remove_only_space_words()
        self.hyphenated_tokens()

    def tokens_count(self):
        return len(self.tokens)

    def sentence_count(self):
        return len(self.sentences)

    def len_text(self):
        return len(self.text)

    def concordance(self, word):
        if not self.sentences:
            self.generate_sentences()
        concordance_sent = [each for each in self.sentences if word in each]
        return concordance_sent

    def generate_freq_dict(self):
        freq = {}
        if not self.tokens:
            self.tokenize()
        for each in self.tokens:
            freq[each] = freq.get(each, 0) + 1
        return freq

    def generate_stem_words(self, word):
        suffixes = {
            1: [u"ो", u"े", u"ू", u"ु", u"ी", u"ि", u"ा"],
            2: [u"कर", u"ाओ", u"िए", u"ाई", u"ाए", u"ने", u"नी", u"ना", u"ते", u"ीं", u"ती", u"ता", u"ाँ", u"ां", u"ों", u"ें"],
            3: [u"ाकर", u"ाइए", u"ाईं", u"ाया", u"ेगी", u"ेगा", u"ोगी", u"ोगे", u"ाने", u"ाना", u"ाते", u"ाती", u"ाता", u"तीं", u"ाओं", u"ाएं", u"ुओं", u"ुएं", u"ुआं"],
            4: [u"ाएगी", u"ाएगा", u"ाओगी", u"ाओगे", u"एंगी", u"ेंगी", u"एंगे", u"ेंगे", u"ूंगी", u"ूंगा", u"ातीं", u"नाओं", u"नाएं", u"ताओं", u"ताएं", u"ियाँ", u"ियों", u"ियां"],
            5: [u"ाएंगी", u"ाएंगे", u"ाऊंगी", u"ाऊंगा", u"ाइयाँ", u"ाइयों", u"ाइयां"],
        }
        for L in 5, 4, 3, 2, 1:
            if len(word) > L + 1:
                for suf in suffixes[L]:
                    if word.endswith(suf):
                        return word[:-L]
        return word

    def generate_stem_dict(self):
        stem_word = {}
        if not self.tokens:
            self.tokenize()
        for each_token in self.tokens:
            temp = self.generate_stem_words(each_token)
            stem_word[each_token] = temp
            self.stemmed_word.append(temp)
        return stem_word

    def remove_stop_words(self):
        f = codecs.open("rss.txt", encoding='utf-8')
        if not self.stemmed_word:
            self.generate_stem_dict()
        stopwords = [x.strip() for x in f.readlines()]
        tokens = [i for i in self.stemmed_word if i not in stopwords]
        self.final_tokens = tokens
        return tokens

In [4]:
def hindi_tokenizer(text):
    t=Tokenizer(text)
    t.tokenize()
    return t.tokens

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
def english_tokenizer(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [7]:
english = Field(sequential=True, use_vocab=True, lower=True, tokenize=english_tokenizer)
hindi = Field(sequential=True, use_vocab=True, tokenize=hindi_tokenizer)

In [8]:
fields = {'hindi': ('hin', hindi), 'english': ('eng', english)}

In [9]:
train_data, validation_data, test_data = TabularDataset.splits(
    path = 'B:\Pytorch\From_Scratch_Implementations\hin_to_eng_dataset',
    train='hin_eng_train.json',
    validation='hin_eng_val.json',
    test='hin_eng_test.json',
    format='json',
    fields=fields
)

In [10]:
print(vars(train_data[0]))
print(vars(test_data[0]))

{'hin': ['सलीक़ा'], 'eng': ['benignancy']}
{'hin': ['वह', 'स्थिति', 'जिसमें', 'वित्त', 'सम्बन्धी', 'दशाओं', 'अथवा', 'वित्त', 'का', 'प्रबन्धन', 'किया', 'जा', 'रहा', 'हो'], 'eng': ['the', 'way', 'in', 'which', 'finances', 'are', 'placed', 'or', 'arranged', '.']}


In [11]:
english.build_vocab(train_data, max_size=10000, min_freq=2)
hindi.build_vocab(train_data, max_size=10000, min_freq=2)

In [12]:
train_iterator, val_iterator, test_iterator = BucketIterator.splits((train_data, validation_data, test_data),batch_size=32, sort_within_batch=True, sort_key = lambda x: len(x.hin), device=device)

In [13]:
# for idx, batch in enumerate(train_iterator):
#     if idx==4:
#         break
#     print(batch)

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(drop_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_p)

    def forward(self, x):
        # x.shape-->(seq_len, N)
        embedding = self.dropout(self.embedding(x))
        # embedding.shape--> (seq_len, N, embedding_size)
        output, (hidden, cell) = self.rnn(embedding)

        return hidden, cell
    
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, 
                 num_layers, drop_p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(drop_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=drop_p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # shape of x: (N), but we want it to be (1, N) as we process one word at a time
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))
        # embedding shape:(1, N, embedding_size)
        output, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape of output: (1, N, hidden_size)
        predictions = self.fc(output)
        # shape of predictions = (1, N, length_of_vocab)
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell
    

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        # source shape --> (target, N)
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, requires_grad=True).to(device)
        hidden, cell = self.encoder(source)

        # Grad start token
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output

            # output--> (N, english_voacb)
            best_guess = output.argmax(1)

            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [None]:
num_epochs = 2
learning_rates = 3e-4
batch_size = 32

load_model = Fals,e
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(hindi.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 512
decoder_embedding_size = 512
hidden_size = 1024
num_layers = 6
enc_dropout = 0.3
dec_dropout = 0.3

In [16]:
print("\033[1mConfiguration:\033[0m")
print(f"{'Number of Epochs:':<25} \033[92m{num_epochs}\033[0m")
print(f"{'Learning Rate:':<25} \033[92m{learning_rates}\033[0m")
print(f"{'Batch Size:':<25} \033[92m{batch_size}\033[0m")
print(f"{'Load Model:':<25} \033[92m{load_model}\033[0m")
print(f"{'Device:':<25} \033[92m{device}\033[0m")
print(f"{'Input Size (Encoder):':<25} \033[92m{input_size_encoder}\033[0m")
print(f"{'Input Size (Decoder):':<25} \033[92m{input_size_decoder}\033[0m")
print(f"{'Output Size:':<25} \033[92m{output_size}\033[0m")
print(f"{'Encoder Embedding Size:':<25} \033[92m{encoder_embedding_size}\033[0m")
print(f"{'Decoder Embedding Size:':<25} \033[92m{decoder_embedding_size}\033[0m")
print(f"{'Hidden Size:':<25} \033[92m{hidden_size}\033[0m")
print(f"{'Number of Layers:':<25} \033[92m{num_layers}\033[0m")
print(f"{'Encoder Dropout:':<25} \033[92m{enc_dropout}\033[0m")
print(f"{'Decoder Dropout:':<25} \033[92m{dec_dropout}\033[0m")

[1mConfiguration:[0m
Number of Epochs:         [92m2[0m
Learning Rate:            [92m0.0003[0m
Batch Size:               [92m32[0m
Load Model:               [92mFalse[0m
Device:                   [92mcuda[0m
Input Size (Encoder):     [92m10002[0m
Input Size (Decoder):     [92m10002[0m
Output Size:              [92m10002[0m
Encoder Embedding Size:   [92m512[0m
Decoder Embedding Size:   [92m512[0m
Hidden Size:              [92m1024[0m
Number of Layers:         [92m6[0m
Encoder Dropout:          [92m0.3[0m
Decoder Dropout:          [92m0.3[0m


In [17]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)

In [18]:
for param in model.parameters():
    if param.requires_grad == False:
        print("Found parameters not requiring gradients!")

In [19]:
english.vocab.stoi['<PAD>']

0

In [20]:
pad_idx = english.vocab.stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rates)

In [21]:
for epoch in range(num_epochs):
    print(f"Epoch [{epoch} / {num_epochs}]")
    # Training phase
    model.train()
    train_loss = 0
    
    # Wrap train_iterator with tqdm
    train_bar = tqdm(enumerate(train_iterator), 
                    total=len(train_iterator),
                    desc='Training',
                    leave=True)
    
    for batch_idx, batch in train_bar:
        input_data = batch.hin.to(device).long() 
        target_data = batch.eng.to(device).long()
        optimizer.zero_grad()
        output = model(input_data, target_data)
        output = output[1:].reshape(-1, output.shape[2])
        target = target_data[1:].reshape(-1)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        train_loss += loss.item()
        
        # Update progress bar description with current loss
        train_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_train_loss = train_loss / len(train_iterator)
    print(f"Training Loss: {avg_train_loss:.4f}")
    
    # Validation phase
    model.eval()
    val_loss = 0
    
    # Wrap val_iterator with tqdm
    val_bar = tqdm(enumerate(val_iterator), 
                  total=len(val_iterator),
                  desc='Validation',
                  leave=True)
    
    with torch.no_grad():
        for batch_idx, batch in val_bar:
            input_data = batch.hin.to(device).long() 
            target_data = batch.eng.to(device).long() 
            output = model(input_data, target_data, teacher_force_ratio=0) 
            output = output[1:].reshape(-1, output.shape[2])
            target = target_data[1:].reshape(-1)
            loss = criterion(output, target)
            val_loss += loss.item()
            
            # Update progress bar description with current loss
            val_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_val_loss = val_loss / len(val_iterator)
    print(f"Validation Loss: {avg_val_loss:.4f}")

Epoch [0 / 2]


Training: 100%|██████████| 563/563 [05:57<00:00,  1.57it/s, loss=4.5144]


Training Loss: 3.1648


Validation: 100%|██████████| 188/188 [00:15<00:00, 11.94it/s, loss=3.8406]


Validation Loss: 2.8276
Epoch [1 / 2]


Training: 100%|██████████| 563/563 [06:09<00:00,  1.52it/s, loss=0.6588] 


Training Loss: 2.9236


Validation: 100%|██████████| 188/188 [00:17<00:00, 10.56it/s, loss=3.6920]

Validation Loss: 2.8070





In [22]:
def translate_sentence(model, sentence, hindi_tokenizer, hindi_vocab, english_vocab, device, max_length=50):
    model.eval()  # Set model to evaluation mode

    # Tokenize and numericalize the sentence
    tokens = hindi_tokenizer(sentence)  # Tokenize Hindi sentence
    tokens = [hindi_vocab.stoi["<sos>"]] + [hindi_vocab.stoi.get(token, hindi_vocab.stoi["<unk>"]) for token in tokens] + [hindi_vocab.stoi["<eos>"]]

    # Convert to tensor and add batch dimension
    source = torch.LongTensor(tokens).unsqueeze(1).to(device)
    
    with torch.no_grad():
        hidden, cell = model.encoder(source)

    # Prepare for decoding
    outputs = []
    x = torch.LongTensor([english_vocab.stoi["<sos>"]]).to(device)  # Start with <SOS> token

    for _ in range(max_length):
        with torch.no_grad():
            output, hidden, cell = model.decoder(x, hidden, cell)
        
        # Get the token with the highest probability
        best_guess = output.argmax(1).item()
        outputs.append(best_guess)

        # Break if <EOS> is generated
        if best_guess == english_vocab.stoi["<eos>"]:
            break

        # Set x to best_guess for next iteration
        x = torch.LongTensor([best_guess]).to(device)

    # Convert token indices back to words
    translated_sentence = [english_vocab.itos[idx] for idx in outputs if idx not in {english_vocab.stoi["<sos>"], english_vocab.stoi["<eos>"]}]
    return " ".join(translated_sentence)

In [23]:
sentence = "मेरा नाम भूषण है"  # Input Hindi sentence as a string
translated_sentence = translate_sentence(model, sentence, hindi_tokenizer, hindi.vocab, english.vocab, device)
print(translated_sentence)

of <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Example usage:
print(f"The model has {count_parameters(model):,} trainable parameters.")

The model has 117,061,394 trainable parameters.
