In [1]:

import os
import time
from tqdm import tqdm


# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"
os.environ["HF_HOME"] = "~/scratch/hf-cache"
token=""
print(os.environ['WANDB_DISABLED'])  # Should output "true"
print(os.environ['HF_HOME'])  # Should output "~/scratch/hf-cache"

true
~/scratch/hf-cache


In [2]:
import re
import numpy as np 
import pandas as pd 
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)

from trl import SFTTrainer
import torch
from pynvml import *
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import HfApi, login
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

  from .autonotebook import tqdm as notebook_tqdm
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [3]:
# Specify the download directory for NLTK data
nltk.data.path.append('./nltk_data')
nltk.download('all', download_dir='./nltk_data')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to ./nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to ./nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     ./nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     ./nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     ./nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]   

True

In [4]:
def read_token_and_login(token_file):
    with open(token_file, 'r') as file:
        token = file.read().strip()
    api = HfApi()
    login(token=token)
    return api


In [5]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [6]:
def get_pretrained_mbart_large_50_many_to_many_mmt():
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model

In [7]:
def filter_sentences(example):
    # Check sentence length
    if not (3 < len(example['translation']['en'].split()) < 30):
        return False
    if not (3 < len(example['translation']['hi'].split()) < 30):
        return False
    
    # Check for non-ASCII non-Unicode characters in Hindi text
    if re.search(r'[^\u0000-\u007F\u0900-\u097F]', example['translation']['hi']):
        return False
    
    # Hook for further restrictions (can be customized)
    # Example: if 'specific_word' in example['translation']['en']:
    #     return False
    
    return True

In [8]:
def get_reduced_dataset(dataset_name, train_size=14000, val_size=2000, test_size=4000):
    orig_data_set = load_dataset(dataset_name)
    print(orig_data_set)
    # Filter the dataset based on the criteria
    filtered_dataset = orig_data_set['train'].filter(filter_sentences)
    print(filtered_dataset)
    
    # Split the filtered dataset into train, validation, and test sets
    train_val_test_split = filtered_dataset.train_test_split(test_size=val_size + test_size, seed=42)
    val_test_split = train_val_test_split['test'].train_test_split(test_size=test_size, seed=42)
    
    small_data_set = DatasetDict({
        'train': train_val_test_split['train'].select(range(train_size)),
        'validation': val_test_split['train'],
        'test': val_test_split['test']
    })

    # Verify the size of the new dataset
    print(small_data_set)
    print(f"New train set size: {len(small_data_set['train'])}")
    print(f"New validation set size: {len(small_data_set['validation'])}")
    print(f"New test set size: {len(small_data_set['test'])}")
    
    return small_data_set


In [9]:
def preprocess_function(examples, tokenizer):
    global last_print_time
    current_time = time.time()
    
    if current_time - last_print_time >= 10:
        print("Examples:", examples['translation'][:2])
        last_print_time = current_time
    
    inputs = [ex['en'] for ex in examples['translation'] if ex['en'] is not None]
    targets = [ex['hi'] for ex in examples['translation'] if ex['hi'] is not None]
    
    if current_time - last_print_time >= 10:
        print("Inputs:", inputs[:2])
        print("Targets:", targets[:2])
    
    if len(inputs) == 0 or len(targets) == 0:
        return {}
    
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    
    if current_time - last_print_time >= 10:
        print("Model Inputs:", {k: v[:2] for k, v in model_inputs.items()})
    
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
    
    if current_time - last_print_time >= 10:
        print("Labels:", {k: v[:2] for k, v in labels.items()})
    
    if "input_ids" not in labels or len(labels["input_ids"]) == 0:
        print("Labels are empty or not properly structured")
        return {}
    
    model_inputs["labels"] = labels["input_ids"]
    
    if current_time - last_print_time >= 10:
        print("Final Model Inputs:", {k: v[:2] for k, v in model_inputs.items()})
    
    return model_inputs

In [10]:
def prepare_model_for_training(model, tokenizer, tokenized_datasets, output_dir="./results", learning_rate=2e-5, batch_size=16, num_train_epochs=5, gradient_accumulation_steps=4):
    # Set up training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_train_epochs,
        predict_with_generate=True,
        gradient_accumulation_steps=gradient_accumulation_steps,
        lr_scheduler_type="linear",
        warmup_steps=500,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize the Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer
    )

    # Freeze all layers except the last few layers
    for param in model.parameters():
        param.requires_grad = False

    # Unfreeze the last few layers
    for param in model.model.decoder.layers[-2:].parameters():
        param.requires_grad = True

    # Unfreeze the classification head
    for param in model.lm_head.parameters():
        param.requires_grad = True

    return trainer

In [11]:
def fine_tune_and_save(trainer, model, tokenizer, output_dir="./trained_model"):
    # Train the model
    trainer.train()

    # Save the trained model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [12]:
def load_fine_tuned_model(output_dir="./trained_model"):
    model = MBartForConditionalGeneration.from_pretrained(output_dir)
    tokenizer = MBart50TokenizerFast.from_pretrained(output_dir)
    return model, tokenizer

In [13]:
def translate_text(model, tokenizer, input_text, src_lang="en_XX", tgt_lang="hi_IN"):
    # Tokenize the input text
    tokenizer.src_lang = src_lang
    encoded_input = tokenizer(input_text, return_tensors="pt")

    # Generate translation
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]
    )

    # Decode the generated tokens
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    print("Translated text:", translated_text)
    return translated_text



In [14]:
def prepare_test_data(small_data_set, tokenizer, num_examples=200):
    # Load the test data
    test_data = small_data_set['test']
    print(test_data['translation'][0])
    print(len(test_data['translation']))
    
    # Select a subset of the test data
    test_data = test_data.select(range(num_examples))
    print(test_data['translation'][0])
    print(len(test_data['translation']))

    # Preprocess the test data
    def preprocess_test_data(examples):
        inputs = [ex['en'] for ex in examples['translation'] if ex['en'] is not None]
        model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
        return model_inputs

    tokenized_test_data = test_data.map(preprocess_test_data, batched=True, remove_columns=["translation"])
    
    return test_data, tokenized_test_data

#count = 0

In [15]:
def perform_translation_testing(model, tokenizer, test_data, tokenized_test_data, src_lang="en_XX", tgt_lang="hi_IN"):
    count = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    count = 0

    def generate_translation(batch):
        nonlocal count
        # Ensure input_ids and attention_mask are tensors
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)
        
        count += 1
        print(f"Processing batch {count}")
        
        # Generate translation
        generated_tokens = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]
            #max_length=128,
            #num_beams=5,
            #early_stopping=True
        )
        # Decode the generated tokens
        batch["translation"] = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        return batch

    translated_test_data = tokenized_test_data.map(generate_translation, batched=True)

    # Extract test_data from small_data_set
    #test_data = small_data_set['test']

    # Print the first 5 translations for inspection
    for i in range(5):
        print(f"Original: {test_data[i]['translation']['en']}")
        print(f"Translated: {translated_test_data[i]['translation']}")
        print(f"Reference: {test_data[i]['translation']['hi']}")
        print()

    return translated_test_data

In [16]:


def evaluate_translations_bertscore(test_data, translated_test_data):
    references = [test_data[i]['translation']['hi'] for i in range(len(test_data))]
    translations = [translated_test_data[i]['translation'] for i in range(len(test_data))]
    
    P, R, F1 = score(translations, references, lang="hi", verbose=True)
    
    # Print BERTScore for each example
    for i in range(len(test_data)):
        print(f"Original: {test_data[i]['translation']['en']}")
        print(f"Translated: {translated_test_data[i]['translation']}")
        print(f"Reference: {test_data[i]['translation']['hi']}")
        print(f"BERTScore F1: {F1[i].item():.4f}")
        print()
    
    print(f"Average BERTScore F1: {F1.mean().item():.4f}")



# ACTUAL CODE FLOW STARTS NOW!!!

In [17]:
api = read_token_and_login('hf_token')

In [18]:

original_tokenizer, original_model = get_pretrained_mbart_large_50_many_to_many_mmt()

In [19]:

dataset_name = "cfilt/iitb-english-hindi"
#small_data_set = get_reduced_dataset(dataset_name)
small_data_set = get_reduced_dataset(dataset_name, train_size = 140000, val_size=20000, test_size=40000)
# Initialize a global variable to keep track of the last print time
last_print_time = time.time()

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})
Dataset({
    features: ['translation'],
    num_rows: 839876
})
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 140000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 40000
    })
})
New train set size: 140000
New validation set size: 20000
New test set size: 40000


In [20]:
tokenized_datasets = small_data_set.map(lambda examples: preprocess_function(examples, original_tokenizer), batched=True, remove_columns=["translation"])

Map:  32%|███▏      | 45000/140000 [00:10<00:23, 4115.57 examples/s]

Examples: [{'en': 'Undefined identifier "% 1".', 'hi': 'अपारिभाषित पहचानकर्ता "% 1". '}, {'en': 'My sincere regards to every person who has raised this institution to this height; I congratulate them.', 'hi': 'इस संस्था को इस ऊंचाई पर पहुंचाने वाले प्रत्येक व्यक्ति को मैं आदरपूवर्क नमन करता हूं, उनका अभिनंदन करता हूं।'}]


Map:  65%|██████▌   | 91000/140000 [00:20<00:10, 4694.39 examples/s]

Examples: [{'en': 'Secretary-General of the United Nations', 'hi': 'संयुक्त राष्ट्र के महासचिव '}, {'en': 'are going to be y is equal to plus or minus b over ax, so', 'hi': 'y है बी शून्य या अधिक के लिए बराबर कुल्हाड़ी से अधिक होने जा रहे हैं तो'}]


Map:  97%|█████████▋| 136000/140000 [00:30<00:00, 4300.93 examples/s]

Examples: [{'en': 'STARTTLS command failed:% s', 'hi': 'STARTTLS कमांड असफलः% s'}, {'en': 'Please update the following fields:', 'hi': 'कृपया निम्नलिखित क्षेत्र का अद्यतन करेंः'}]


Map: 100%|██████████| 140000/140000 [00:31<00:00, 4456.02 examples/s]
Map: 100%|██████████| 20000/20000 [00:04<00:00, 4550.79 examples/s]
Map:  55%|█████▌    | 22000/40000 [00:04<00:03, 4721.15 examples/s]

Examples: [{'en': 'Unable to open URI', 'hi': 'URI खोलने में असमर्थ'}, {'en': 'Tests fundamental GUI application accessibility', 'hi': 'मूलभूत जीयूआई अनुप्रयोग पहुंचनीयता का परीक्षण करता है'}]


Map: 100%|██████████| 40000/40000 [00:08<00:00, 4552.89 examples/s]


In [21]:
#Testing original untuned model
test_data, tokenized_test_data = prepare_test_data(small_data_set, original_tokenizer)

translated_test_data_untuned = perform_translation_testing(original_model, original_tokenizer, test_data, tokenized_test_data)

evaluate_translations_bertscore(test_data, translated_test_data_untuned)

{'en': 'But this disillusionment was with the policies of Stalin and the system prevailing in the Soviet Union, not with Marxism as such.', 'hi': 'परंतु यह मोहभंग स्टालिन की नीतियों और सोवियत संघ की तत्कालीन व्यवस्था से था, मार्क्सवाद से नहीं। '}
40000
{'en': 'But this disillusionment was with the policies of Stalin and the system prevailing in the Soviet Union, not with Marxism as such.', 'hi': 'परंतु यह मोहभंग स्टालिन की नीतियों और सोवियत संघ की तत्कालीन व्यवस्था से था, मार्क्सवाद से नहीं। '}
200


Map: 100%|██████████| 200/200 [00:00<00:00, 6038.62 examples/s]
Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing batch 1


Map: 100%|██████████| 200/200 [00:15<00:00, 12.65 examples/s]


Original: But this disillusionment was with the policies of Stalin and the system prevailing in the Soviet Union, not with Marxism as such.
Translated: लेकिन यह निराशा स्टालिन की नीतियों और सोवियत संघ में प्रचलित प्रणाली से थी, मार्क्सवाद के रूप में नहीं।
Reference: परंतु यह मोहभंग स्टालिन की नीतियों और सोवियत संघ की तत्कालीन व्यवस्था से था, मार्क्सवाद से नहीं। 

Original: 2. Brief history of the electronics company including products being made, capacities, related collaborators, achievements, capabilities etc. may be provided (including recent annual reports and company brochure)
Translated: 2. इलेक्ट्रॉनिक कंपनी का संक्षिप्त इतिहास जिसमें उत्पादों का निर्माण किया जा रहा है, क्षमताएं, संबद्ध सहयोगियों, उपलब्धियां, क्षमताएं आदि उपलब्ध हो सकती हैं (जिसके अंतर्गत हाल के वार्षिक प्रतिवेदन और कंपनी पुस्तिका भी उपलब्ध हो सकती है)
Reference: 6. बनाए जाने वाले उत्पा) दक, क्षमताओं, संबंधित सहयोगियों, उपलब्धिषयों, क्षमताओं आदि सहित इलेक्ट्रॉानिक्स् कंपनी का संक्षिप्त, इतिहास उपलब्ध कराया जाए (

100%|██████████| 7/7 [00:01<00:00,  4.28it/s]


computing greedy matching.


100%|██████████| 4/4 [00:01<00:00,  3.73it/s]

done in 2.72 seconds, 73.64 sentences/sec
Original: But this disillusionment was with the policies of Stalin and the system prevailing in the Soviet Union, not with Marxism as such.
Translated: लेकिन यह निराशा स्टालिन की नीतियों और सोवियत संघ में प्रचलित प्रणाली से थी, मार्क्सवाद के रूप में नहीं।
Reference: परंतु यह मोहभंग स्टालिन की नीतियों और सोवियत संघ की तत्कालीन व्यवस्था से था, मार्क्सवाद से नहीं। 
BERTScore F1: 0.9028

Original: 2. Brief history of the electronics company including products being made, capacities, related collaborators, achievements, capabilities etc. may be provided (including recent annual reports and company brochure)
Translated: 2. इलेक्ट्रॉनिक कंपनी का संक्षिप्त इतिहास जिसमें उत्पादों का निर्माण किया जा रहा है, क्षमताएं, संबद्ध सहयोगियों, उपलब्धियां, क्षमताएं आदि उपलब्ध हो सकती हैं (जिसके अंतर्गत हाल के वार्षिक प्रतिवेदन और कंपनी पुस्तिका भी उपलब्ध हो सकती है)
Reference: 6. बनाए जाने वाले उत्पा) दक, क्षमताओं, संबंधित सहयोगियों, उपलब्धिषयों, क्षमताओं आदि सहित




In [22]:
trainer = prepare_model_for_training(original_model, original_tokenizer, tokenized_datasets)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [23]:
fine_tune_and_save(trainer, original_model, original_tokenizer)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
model, tokenizer = load_fine_tuned_model()

In [None]:
# Example usage
input_text = "Stop in the name of the law"
translated_text = translate_text(model, tokenizer, input_text)

In [None]:

test_data, tokenized_test_data = prepare_test_data(small_data_set, tokenizer)

In [None]:
translated_test_data = perform_translation_testing(model, tokenizer, test_data, tokenized_test_data)

In [None]:
evaluate_translations_bertscore(test_data, translated_test_data)

In [None]:
#original_tokenizer, original_model = get_pretrained_mbart_large_50_many_to_many_mmt()

In [None]:
#translated_test_data_untuned = perform_translation_testing(original_model, original_tokenizer, test_data, tokenized_test_data)

In [None]:
#evaluate_translations_bertscore(test_data, translated_test_data_untuned)