In [None]:
#Installing the required packages
!pip install datasets==2.15.0
!pip install transformers[torch]
!pip install nltk 
!pip install accelerate -U
!pip install torch
!pip install sentencepiece
!pip install matplotlib
!pip install sacrebleu

In [None]:
#Importing the required packages
import os

import numpy as np
import sacrebleu
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers import MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Checkpoint that is loaded for both the tokenizer and the model.
# model_checkpoint = 'google/mt5-base'
model_checkpoint = 'Chhabi/mt5-small-finetuned-Nepali-Health-50k-2'

# Hub identifier of the dataset passed to load_dataset
task = "NepaliAI/Nepali-Health-Fact"

# Load the dataset
from datasets import load_dataset
raw_datasets = load_dataset(task)

# Split the dataset into train (90%) and test (10%) sets.
# The seed is fixed so that a kernel restart reproduces the same split;
# without it every run trains and evaluates on different examples and the
# reported metrics are not comparable between runs.
split_seed = 42
splitted_datasets = raw_datasets['train'].train_test_split(test_size=0.1, seed=split_seed)

# Text prepended to every question before tokenization. The listed mT5
# checkpoints get the "answer: " task prefix; any other checkpoint gets none.
if model_checkpoint in ['google/mt5-small','google/mt5-base','Chhabi/mt5-small-finetuned-Nepali-Health-50k-2']:
    prefix = "answer: "
else:
    prefix = ""


# Tokenize the data
# Maximum number of tokens kept for the input (question) and the target (answer);
# longer sequences are truncated by the tokenizer calls in preprocess_function.
max_input_length = 512
max_target_length = 512

# Load the tokenizer that matches the checkpoint
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define a function to preprocess the data 
def preprocess_function(examples):
    """Tokenize one batch of question/answer pairs.

    Args:
        examples: batch mapping with a "Question" and an "Answer" column,
            each a list of strings.

    Returns:
        The tokenizer output for the prefixed questions, with the token ids
        of the answers stored under the "labels" key.
    """
    questions = examples["Question"]
    answers = examples["Answer"]

    # Encoder side: every question gets the module-level task prefix.
    prefixed_questions = [prefix + question for question in questions]
    encoded = tokenizer(
        prefixed_questions,
        max_length=max_input_length,
        truncation=True,
    )

    # Decoder side: answers are tokenized as targets.
    encoded_answers = tokenizer(
        text_target=answers,
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = encoded_answers["input_ids"]

    return encoded
# Tokenize both splits batch-wise; the raw text columns are dropped because
# only the tokenized fields are needed from here on.
tokenized_datasets = splitted_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=["Question", "Answer"],
)
# Return PyTorch tensors for the three fields used during training
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

In [None]:
# Display the tokenized train/test splits as a quick sanity check
tokenized_datasets

In [None]:
# Define a function to post-process the predicted and reference texts
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        A pair ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each
        holding one stripped reference string.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per example, wrapped in its own list
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

# Define a function to compute BLEU score and generation length metrics
def compute_metrics(eval_preds):
    """Compute corpus-level BLEU and the mean generated length.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with the keys "bleu" (sacrebleu corpus score) and "gen_len"
        (mean number of non-padding tokens per prediction), both rounded to
        four decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions that must be ignored; map them to the pad token so
    # the tokenizer can decode the arrays.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # postprocess_text returns one single-element list per example.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # sacrebleu.corpus_bleu takes a list of reference *streams*, each stream
    # aligned one-to-one with the hypotheses. There is a single reference per
    # example, so all references are gathered into one stream. Passing the
    # per-example lists directly would be read as N streams of length 1.
    reference_stream = [refs[0] for refs in decoded_labels]
    bleu_score = sacrebleu.corpus_bleu(decoded_preds, [reference_stream]).score

    # Average number of non-padding tokens per generated sequence
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    gen_len = np.mean(prediction_lens)

    result = {"bleu": bleu_score, "gen_len": gen_len}
    return {k: round(v, 4) for k, v in result.items()}


In [None]:
# Load the checkpoint as a conditional-generation (seq2seq) model
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Per-device batch size, shared by training and evaluation
batch_size = 2

# Training configuration
args = Seq2SeqTrainingArguments(
    output_dir="NFT",  # checkpoints and logs are written here

    # --- schedule, checkpointing -------------------------------------------
    num_train_epochs=5,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    save_total_limit=3,  # keep at most three checkpoints on disk
    load_best_model_at_end=True,  # restore the best checkpoint after training

    # --- optimisation --------------------------------------------------------
    optim="adafactor",
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 examples per optimizer step and device
    fp16=False,  # mixed precision disabled

    # --- generation-based evaluation ----------------------------------------
    predict_with_generate=True,  # evaluate on generated sequences
    generation_max_length=256,  # cap on generated length during evaluation

    # --- logging -------------------------------------------------------------
    report_to="tensorboard",
)


In [None]:
# Collator that batches the tokenized seq2seq examples for this model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Assemble the trainer from the model, its arguments, both dataset splits
# and the helper objects defined above.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    # Metrics need generated text, so they are only wired in when
    # generation is enabled for evaluation.
    compute_metrics=compute_metrics if args.predict_with_generate else None,
    # Early stopping with a patience of 3
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


In [None]:
# Start training the model with the trainer configured above
# (per-epoch evaluation and checkpointing, early stopping with patience 3).
trainer.train()

In [None]:
# Save the model and the tokenizer to the local "NFT" directory
# (the same directory that was given to the training arguments).
model.save_pretrained("NFT")
tokenizer.save_pretrained("NFT")

In [None]:
# Load the trained model back from the local "NFT" directory
model = MT5ForConditionalGeneration.from_pretrained("NFT")

# Load the matching tokenizer from the same directory for generating new output
tokenizer = AutoTokenizer.from_pretrained("NFT")

In [None]:
# Move the model to the GPU when one is available; otherwise keep it on the
# CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
# Question to answer, carrying the same "answer: " prefix that is prepended
# to the training questions for this checkpoint.
input_text = "answer: ब्रोन्काइटिस र नाक फुक्दा मेरो कान बन्द हुनु र दाँत सुन्न हुनु सामान्य हो?"

# Tokenize the question and place the tensors on whatever device the model
# currently lives on (instead of assuming CUDA), so inputs and weights
# always match.
inputs = tokenizer(input_text, return_tensors='pt', max_length=256, truncation=True).to(model.device)

# Print the input text and tokenized inputs
print(f'input_text: {input_text}')
print(f'tokenized_inputs: {inputs}')

# Generate text based on the input using the model (sampling combined with beam search).
# NOTE(review): min_length equals max_length here, which presumably pins the
# output to 256 tokens; the hyperparameters defined in the push-to-hub cell
# use min_length=128 -- confirm which one is intended.
generated_text = model.generate(
    **inputs,
    max_length=256,
    min_length=256,
    length_penalty=4.0,
    num_beams=5,
    top_p=0.95,
    top_k=100,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=1,
    no_repeat_ngram_size=4
)

# Decode the single returned sequence, then drop any whitespace-separated
# piece that starts with "<extra_id_" before printing the answer.
generated_response = tokenizer.batch_decode(generated_text, skip_special_tokens=True)[0]
tokens = generated_response.split(" ")
filtered_tokens = [token for token in tokens if not token.startswith("<extra_id_")]
print(' '.join(filtered_tokens))


In [None]:
# Decoding settings that should ship with the published model
generation_hyperparameters = {
    "max_length": 256,
    "min_length": 128,
    "length_penalty": 4.0,
    "num_beams": 5,
    "top_p": 0.95,
    "top_k": 150,
    "do_sample": True,
    "temperature": 0.7,
    "num_return_sequences": 1,
    "no_repeat_ngram_size": 3,
}
# Kept for reference; not sent to the Hub directly (see below).
metadata = {
    "hyperparameters": generation_hyperparameters,
}

# push_to_hub has no `metadata` parameter, so passing the dict there does not
# publish it. Writing the values into the model's generation config makes
# them part of what is saved and uploaded with the model.
for option_name, option_value in generation_hyperparameters.items():
    setattr(model.generation_config, option_name, option_value)

# Target repository on the Hugging Face Hub
hub_repo_id = "Chhabi/mt5-small-finetuned-Nepali-Health-50k-2"

# Never hard-code the write token in the notebook: read it from the HF_TOKEN
# environment variable. When it is unset, None is passed and the library
# falls back to the credentials stored by a previous login.
hub_token = os.environ.get("HF_TOKEN")

# Push the model (including its generation config) to the Hugging Face Model Hub
model.push_to_hub(hub_repo_id, token=hub_token, commit_message="Fine-tuned model with generation hyperparameters")

# Push the tokenizer to the same repository
tokenizer.push_to_hub(hub_repo_id, token=hub_token, commit_message="Fine-tuned tokenizer with generation hyperparameters")