In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Installing Libraries

In [None]:
!pip install datasets transformers
!pip install huggingface_hub



## Importing necessary Libraries

In [37]:
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import math
import os
import json
from transformers import PretrainedConfig
from huggingface_hub import HfApi, Repository
from transformers import PreTrainedTokenizer, PreTrainedModel
from transformers import BertTokenizer

## Dataset Loading (found on HuggingFace Datasets)

In [None]:
dataset = load_dataset("Ketan3101/English-Hindi-Translation")
subset_size = 200000  # Custom Size
train_data = dataset['train'].select(range(subset_size))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/45.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/348768 [00:00<?, ? examples/s]

## Defining Tokenizers and vocabs

In [None]:
def simple_tokenizer(text):
    return text.split()

def build_vocab(data_iter, lang):
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for data in data_iter:
        for token in simple_tokenizer(data[lang].lower()):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

eng_vocab = build_vocab(train_data, 'english')
hindi_vocab = build_vocab(train_data, 'hindi')

## Creating the Dataset class

In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data, eng_vocab, hindi_vocab, max_len=50):
        self.data = data
        self.eng_vocab = eng_vocab
        self.hindi_vocab = hindi_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        eng_text = self.data[idx]['english']
        hindi_text = self.data[idx]['hindi']

        eng_tokens = simple_tokenizer(eng_text.lower())
        hindi_tokens = simple_tokenizer(hindi_text.lower())

        eng_ids = [self.eng_vocab.get(token, self.eng_vocab['<unk>']) for token in eng_tokens]
        hindi_ids = [self.hindi_vocab.get(token, self.hindi_vocab['<unk>']) for token in hindi_tokens]

        eng_ids = [self.eng_vocab['<sos>']] + eng_ids[:self.max_len-2] + [self.eng_vocab['<eos>']]
        hindi_ids = [self.hindi_vocab['<sos>']] + hindi_ids[:self.max_len-2] + [self.hindi_vocab['<eos>']]

        eng_ids += [self.eng_vocab['<pad>']] * (self.max_len - len(eng_ids))
        hindi_ids += [self.hindi_vocab['<pad>']] * (self.max_len - len(hindi_ids))

        return torch.tensor(eng_ids), torch.tensor(hindi_ids)

## Preparing the Dataset for training

In [None]:
train_dataset = TranslationDataset(train_data, eng_vocab, hindi_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

## Create Positional Encoding Class

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000): # create matrix encodings
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :] # Add positional encodings to the input embeddings
        return x

## Define the main **Transformer model** for translation

In [None]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, dropout=0.1):
        super(Transformer, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, d_model) # embedding layers
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model) # nn embedding
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length) # final linear layer

        # Core nn Tranformer model with custom configurations
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )

        self.fc = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):

        # Embeds the input sequences, add positional encoding
        src_embedded = self.positional_encoding(self.src_embedding(src))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt))

        # Generate attention masks
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        src_padding_mask = (src == 0).to(src.device)
        tgt_padding_mask = (tgt == 0).to(tgt.device)

        # pass through transformer
        output = self.transformer(src_embedded, tgt_embedded, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask)
        return self.fc(output)

    def generate_square_subsequent_mask(self, sz): # mask for the decoder
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

## Model Initialization

In [None]:
# Defining the model Hyperparameters :
src_vocab_size = len(eng_vocab)
tgt_vocab_size = len(hindi_vocab)
d_model = 192
nhead = 6
num_encoder_layers = 4
num_decoder_layers = 4
dim_feedforward = 768
max_seq_length = 100

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length)

# loss function
criterion = nn.CrossEntropyLoss(ignore_index=0)

# adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# gradient clipping
clip_value = 1.0
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

tensor(0.)

# **Training Loop**

In [11]:
num_epochs = 10 # set epochs (iterations)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        output = model(src, tgt_input)

        output_flat = output.contiguous().view(-1, output.size(-1))
        tgt_output_flat = tgt_output.contiguous().view(-1)

        loss = criterion(output_flat, tgt_output_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(output_flat, dim=1)
        correct_predictions += (predicted == tgt_output_flat).sum().item()
        total_predictions += tgt_output_flat.size(0)

    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = correct_predictions / total_predictions

    print(f"Epoch: {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")



Epoch: 1, Loss: 6.4192, Accuracy: 0.0513
Epoch: 2, Loss: 5.4124, Accuracy: 0.0688
Epoch: 3, Loss: 4.9788, Accuracy: 0.0765
Epoch: 4, Loss: 4.6702, Accuracy: 0.0820
Epoch: 5, Loss: 4.4259, Accuracy: 0.0863
Epoch: 6, Loss: 4.2194, Accuracy: 0.0900
Epoch: 7, Loss: 4.0417, Accuracy: 0.0931
Epoch: 8, Loss: 3.8852, Accuracy: 0.0959
Epoch: 9, Loss: 3.7437, Accuracy: 0.0984
Epoch: 10, Loss: 3.6164, Accuracy: 0.1008


## Function to Translate

In [12]:
def translate(model, sentence, eng_vocab, hindi_vocab, device, max_length=50):
    model.eval()

    # tokenize sentence
    tokens = sentence.lower().split()
    src_indices = [eng_vocab.get(token, eng_vocab['<unk>']) for token in tokens]
    src_indices = [eng_vocab['<sos>']] + src_indices + [eng_vocab['<eos>']]
    src_indices += [eng_vocab['<pad>']] * (max_length - len(src_indices))
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)

    tgt_tensor = torch.LongTensor([[hindi_vocab['<sos>']]])

    for _ in range(max_length):
        tgt_tensor = tgt_tensor.to(device)
        output = model(src_tensor, tgt_tensor) # Generate predictions using the trained model

        next_word = output.argmax(2)[:, -1].item()
        tgt_tensor = torch.cat([tgt_tensor, torch.LongTensor([[next_word]]).to(device)], dim=1)

        if next_word == hindi_vocab['<eos>']:
            break

    hindi_vocab_inv = {v: k for k, v in hindi_vocab.items()}
    translated_tokens = [hindi_vocab_inv[idx.item()] for idx in tgt_tensor[0][1:]]

    return ' '.join(translated_tokens[:-1])

## Testing with Custom input

In [15]:
english_sentence = "What is this?"
hindi_translation = translate(model, english_sentence, eng_vocab, hindi_vocab, device)

print(f"English: {english_sentence}")
print(f"Hindi: {hindi_translation}")

English: What is this?
Hindi: यह क्या है?


In [16]:
english_sentence = "Who are you sir?"
hindi_translation = translate(model, english_sentence, eng_vocab, hindi_vocab, device)

print(f"English: {english_sentence}")
print(f"Hindi: {hindi_translation}")

English: Who are you sir?
Hindi: तुम कौन हो ?


In [18]:
english_sentence = "What can i do?"
hindi_translation = translate(model, english_sentence, eng_vocab, hindi_vocab, device)

print(f"English: {english_sentence}")
print(f"Hindi: {hindi_translation}")

English: What can i do?
Hindi: मैं क्या कर सकता हूं?


In [27]:
english_sentence = "Daughter is not here "
hindi_translation = translate(model, english_sentence, eng_vocab, hindi_vocab, device)

print(f"English: {english_sentence}")
print(f"Hindi: {hindi_translation}")

English: Daughter is not here 
Hindi: बेटी नहीं है


## Saving the Model Offline

In [44]:
save_dir = '/content/Saved_transformer_model'
os.makedirs(save_dir, exist_ok=True)

### Main model:

In [45]:
model_path = os.path.join(save_dir, 'transformer_model.pth')
torch.save(model.state_dict(), model_path)

### vocabularies:

In [46]:
with open(os.path.join(save_dir, 'eng_vocab.json'), 'w') as f:
    json.dump(eng_vocab, f)

with open(os.path.join(save_dir, 'hindi_vocab.json'), 'w') as f:
    json.dump(hindi_vocab, f)

## Preparing to Push into Hugging Face

In [31]:
class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        self.vocab = vocab
        self.ids_to_tokens = {v: k for k, v in vocab.items()}

        self.unk_token = '<unk>'
        self.pad_token = '<pad>'
        self.eos_token = '<eos>'
        self.sos_token = '<sos>'

        for token in [self.unk_token, self.pad_token, self.eos_token, self.sos_token]:
            if token not in self.vocab:
                raise ValueError(f"Special token {token} not found in vocabulary.")

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        return text.lower().split()

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, self.unk_token)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def save_vocabulary(self, save_directory):
        vocab_file = os.path.join(save_directory, "vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False)
        return (vocab_file,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        return cls(vocab_file, *init_inputs, **kwargs)

## Wrapper class to Fit into HuggingFace Library

In [32]:
class HFTransformer(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = Transformer(
            config.src_vocab_size,
            config.tgt_vocab_size,
            config.d_model,
            config.nhead,
            config.num_encoder_layers,
            config.num_decoder_layers,
            config.dim_feedforward,
            config.max_seq_length
        )

    def forward(self, input_ids, decoder_input_ids):
        return self.transformer(input_ids, decoder_input_ids)

## BERT Tokenizer

In [33]:
def create_tokenizer(vocab_file):
    with open(vocab_file, 'r') as f:
        vocab = json.load(f)

    temp_vocab_file = os.path.join(save_dir, 'temp_vocab.txt')
    with open(temp_vocab_file, 'w') as f:
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            f.write(f"{token}\n")

    tokenizer = BertTokenizer(temp_vocab_file, do_lower_case=True)

    os.remove(temp_vocab_file)

    return tokenizer

### Calling the tokenizer function and saving the two separate tokenizers

In [34]:
# create
src_tokenizer = create_tokenizer(os.path.join(save_dir, 'eng_vocab.json'))
tgt_tokenizer = create_tokenizer(os.path.join(save_dir, 'hindi_vocab.json'))

# Save
src_tokenizer.save_pretrained(os.path.join(save_dir, 'src_tokenizer'))
tgt_tokenizer.save_pretrained(os.path.join(save_dir, 'tgt_tokenizer'))

('/content/Saved_transformer_model/tgt_tokenizer/tokenizer_config.json',
 '/content/Saved_transformer_model/tgt_tokenizer/special_tokens_map.json',
 '/content/Saved_transformer_model/tgt_tokenizer/vocab.txt',
 '/content/Saved_transformer_model/tgt_tokenizer/added_tokens.json')

## HuggingFace Compatible Configuration

In [40]:
config = PretrainedConfig(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    max_seq_length=max_seq_length
)

## Save Model

In [41]:
hf_model = HFTransformer(config)
hf_model.transformer.load_state_dict(model.state_dict())
hf_model.save_pretrained(save_dir)

## Push Model and tokenizers and configs

In [42]:
api = HfApi()

hf_username = "Dharinesh"
model_name = "Transformer-English-to-Hindi-translation"
repo_name = f"{hf_username}/{model_name}"

api.create_repo(repo_id=repo_name, exist_ok=True)

# Upload
api.upload_folder(
    folder_path=save_dir,
    repo_id=repo_name,
    repo_type="model",
)

print(f"Model pushed to Hugging Face Hub: https://huggingface.co/{repo_name}")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

transformer_model.pth:   0%|          | 0.00/309M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Model pushed to Hugging Face Hub: https://huggingface.co/Dharinesh/Transformer-English-to-Hindi-translation


#### Upload in Gdrive and download model From there

In [43]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
