In [1]:
%pip install transformers==4.32 -q
%pip install datasets -q
%pip install scikit-learn -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kaggle-environments 1.14.15 requires transformers>=4.33.1, but you have transformers 4.32.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# <font color='grey'>**Step 1: Preparing the Dataset for Tokenization**</font>

In [2]:
import pandas as pd
import logging
import datasets

from typing import Tuple
from typing_extensions import Annotated
from sklearn.model_selection import train_test_split
from datasets import Dataset
logging.basicConfig(level=logging.INFO, force=True)

> <font color='red'>**Data Pipeline**

In [3]:
def load_data(path: str) -> Annotated[pd.DataFrame, "data"]:
    """
    Loads the .csv input data.
    Args:
        path: path to the .csv file.
    Returns:
        data: A pandas dataframe.
    """
    logging.info("Loading the data.")
    data= pd.read_csv(path)
    return data

In [4]:
def split_data(data: pd.DataFrame) -> Tuple[
    Annotated[pd.DataFrame,"train_dataset"],
    Annotated[pd.DataFrame,"validation_dataset"],
    Annotated[pd.DataFrame,"test_dataset"],
]:
    """
    Splits the pandas dataframe into train-val-test splits.
    Args:
        data: The pandas dataframe.
    Returns:
        train_dataset,
        validation_dataset, 
        test_dataset
    """
    logging.info("Preparing the train-val-test split.")
    train_val_df, test_dataset= train_test_split(data, test_size=0.1, random_state=42)
    train_dataset, validation_dataset= train_test_split(train_val_df, test_size=0.1, random_state=42)
    return train_dataset, validation_dataset, test_dataset

In [5]:
def torch_dataset(train_dataset: pd.DataFrame, validation_dataset: pd.DataFrame, test_dataset: pd.DataFrame) -> Tuple[
    Annotated[Dataset,"train_dataset"],
    Annotated[Dataset,"validation_dataset"],
    Annotated[Dataset,"test_dataset"],
]:
    """
    Convert the train, val and, test pandas dataset into hugging face datasets.
    Args:
        train_dataset, validation_dataset, test_dataset
    Returns:
        train_dataset: Type= Dataset
        validation_dataset: Type= Dataset
        test_dataset: Type= Dataset
    """
    logging.info("Converting to Huggingface datasets.")
    train_dataset= Dataset.from_pandas(train_dataset)
    validation_dataset= Dataset.from_pandas(validation_dataset)
    test_dataset= Dataset.from_pandas(test_dataset)
    
    logging.info("Removing unnecessary columns from the dataset.")
    train_dataset= train_dataset.remove_columns(["Hindi", "__index_level_0__"])
    validation_dataset= validation_dataset.remove_columns(["Hindi", "__index_level_0__"])
    test_dataset= test_dataset.remove_columns(["Hindi", "__index_level_0__"])
    
    return train_dataset, validation_dataset, test_dataset

In [6]:
def data_pipeline() -> Annotated[datasets.DatasetDict, "main_dataset"]:
    """
    Pipeline for loading the data
    """
    data= load_data("/kaggle/input/english-kangri-dataset/Kangri_final.csv")
    train_dataset, validation_dataset, test_dataset= split_data(data)
    train_dataset, validation_dataset, test_dataset= torch_dataset(train_dataset, validation_dataset, test_dataset) 
    main_dataset= datasets.DatasetDict({
        "Train": train_dataset,
        "Validation": validation_dataset,
        "Test": test_dataset
    })
    return main_dataset

In [7]:
main_dataset= data_pipeline()
main_dataset

INFO:root:Loading the data.
INFO:root:Preparing the train-val-test split.
INFO:root:Converting to Huggingface datasets.
INFO:root:Removing unnecessary columns from the dataset.


DatasetDict({
    Train: Dataset({
        features: ['English', 'Kangri'],
        num_rows: 21695
    })
    Validation: Dataset({
        features: ['English', 'Kangri'],
        num_rows: 2411
    })
    Test: Dataset({
        features: ['English', 'Kangri'],
        num_rows: 2679
    })
})

# <font color='grey'>**Step 2: Tokenizing the Dataset**</font>

In [8]:
from transformers import AutoModelForSeq2SeqLM
from transformers import NllbTokenizer
from transformers import DataCollatorForSeq2Seq

import torch

In [9]:
base_model= AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")



config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location=map_location)


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

> <font color='red'>**Tokenizer pipeline**</font>

In [10]:
def load_tokenizer(src_lang: str, tgt_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Loads the NllbTokenizer.
    Args:
        src_lang: The language to be passed in as input to the tokenizer.
        tgt_lang: The language the tokenizer is to generate.
    Returns:
        tokenizer: NllbTokenizer with the src_lang and tgt_lang specified.
    """
    logging.info("Loading NllbTokenizer.")
    
    tokenizer= NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang=src_lang, tgt_lang=tgt_lang)
    return tokenizer

In [11]:
def fix_tokenizer(tokenizer: NllbTokenizer, new_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Introduces the language code for the new_lang in the NllbTokenizer.
    Args:
        tokenizer: The original NllbTokenizer.
        new_lang: The language code to be added.
    Returns:
        tokenizer: The updated tokenizer.
    """
    logging.info("Fixing the tokenizer.")
    
    old_len= len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    tokenizer.lang_code_to_id[new_lang]= old_len-1
    tokenizer.id_to_lang_code[old_len-1]= new_lang

    tokenizer.fairseq_tokens_to_ids["<mask>"]= len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens= {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    tokenizer.added_tokens_encoder= {}
    tokenizer.added_tokens_decoder= {}
    
    return tokenizer

In [12]:
def adjusting_model(tokenizer: NllbTokenizer, added_id: str, similar_id: str):
    """
    Adjust the base_model's embedding layer to accomodate the new language code.
    Args:
        tokenizer: The adjusted NllbTokenizer.
        added_id: The new language code i.e.kangri_Deva.
        similar_id: The code of the language similar to the new language code i.e. hin_Deva.
    Returns:
        None.
    """
    logging.info("Adjusting the model.")
    
    base_model.resize_token_embeddings(len(tokenizer))
    added_token_id= tokenizer.convert_tokens_to_ids(added_id)
    similar_lang_id= tokenizer.convert_tokens_to_ids(similar_id)
    embeds= base_model.model.shared.weight.data
    # moving the embedding for "mask" to its new position
    embeds[added_token_id+1]= embeds[added_token_id]
    # initializing new language token with a token of a similar language
    embeds[added_token_id]= embeds[similar_lang_id]

In [13]:
def tokenizer_pipeline(src_lang: str, tgt_lang: str, similar_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Pipeline for Fixing the tokenizer.
    """
    tokenizer= load_tokenizer(src_lang= src_lang, tgt_lang=tgt_lang)
    tokenizer= fix_tokenizer(tokenizer, new_lang=tgt_lang)
    adjusting_model(tokenizer,added_id=tgt_lang,similar_id= similar_lang)
    return tokenizer

In [14]:
tokenizer= tokenizer_pipeline(src_lang="eng_Latn",tgt_lang= "kangri_Deva", similar_lang="hin_Deva")

INFO:root:Loading NllbTokenizer.


sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

INFO:root:Fixing the tokenizer.
INFO:root:Adjusting the model.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 256205. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


> **<font color= 'red'>Collation pipeline</font>**

**NOTE: The reason for not utilizing the <font color= 'cyan'>tokenizer_function</font> inside the <font color= 'cyan'>'tokenizer_fix'</font> pipeline above is because the .map function utilized in cell below passes in the main_dataset in the form of examples to the <font color= 'cyan'>tokenizer_function</font>. Any value that is not pre-filled will result in an error as such the tokenizer cannot be passed to generate the tokenized dataset.**

In [15]:
from torch.utils.data import DataLoader

In [16]:
def tokenizer_function(examples):
    """
    Tokenizes the main_dataset.
    """
    inputs= [ex for ex in examples["English"]]
    targets= [ex for ex in examples["Kangri"]]
    model_inputs= tokenizer(inputs, text_target= targets, max_length=128, truncation= True)
    return model_inputs

In [17]:
def data_collation(tokenized_dataset: datasets.DatasetDict)-> Tuple[
    Annotated[DataLoader,"Train_dataloader"],
    Annotated[DataLoader,"Validation_dataloader"], 
]:
    """
    Utilized to prepare batches of data for training and evaluation ensuring that sentences within the same batch are 
    of the same length.
    Args:
        tokenized_dataset: The tokenized dataset.
    Returns:
        Train_dataloader: Utilized to train the model.
        Validation_dataloader: Utilized to evaluate the model.
    """
    logging.info("Performing data collation.")
    data_collator= DataCollatorForSeq2Seq(tokenizer, model=base_model)

    tokenized_dataset.set_format("torch")
    Train_dataloader= DataLoader(
        tokenized_dataset["Train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=6,
    )
    Validation_dataloader= DataLoader(
        tokenized_dataset["Validation"], collate_fn=data_collator, batch_size=6
    )
    
    return Train_dataloader, Validation_dataloader

In [18]:
def collation_pipeline()-> Tuple[
    Annotated[DataLoader,"Train_dataloader"],
    Annotated[DataLoader,"Validation_dataloader"],
]:
    """
    Data collation pipeline.
    Returns:
        Train_dataloader: Utilized to train the model.
        Validation_dataloader: Utilized to evaluate the model.
    """
    logging.info("Tokenizing the dataset.")
    tokenized_dataset= main_dataset.map(tokenizer_function, batched=True)
    logging.info("Removing unnecessary columns")
    tokenized_dataset= tokenized_dataset.remove_columns(["English", "Kangri"])
    print("Tokenised_Dataset: ", tokenized_dataset)
    
    Train_dataloader, Validation_dataloader= data_collation(tokenized_dataset)
    return Train_dataloader, Validation_dataloader

In [19]:
Train_dataloader, Validation_dataloader= collation_pipeline()

INFO:root:Tokenizing the dataset.


Map:   0%|          | 0/21695 [00:00<?, ? examples/s]

Map:   0%|          | 0/2411 [00:00<?, ? examples/s]

Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

INFO:root:Removing unnecessary columns
INFO:root:Performing data collation.


Tokenised_Dataset:  DatasetDict({
    Train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21695
    })
    Validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2411
    })
    Test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2679
    })
})


# <font color='grey'>**Step 3: Fine-tuning facebook/nllb-200-distilled-600M**</font>

In [20]:
%pip install sacrebleu -q
%pip install evaluate -q
%pip install accelerate -q

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [21]:
import evaluate
import numpy as np

from transformers import AdamW
from transformers import get_scheduler
from accelerate import Accelerator
from tqdm.auto import tqdm

> <font color='red'>**Setting up the metric, optimizer and accelerator**

In [22]:
metric= evaluate.load("sacrebleu")
optimizer= torch.optim.AdamW(base_model.parameters(), lr=2e-5)

accelerator = Accelerator()
base_model, optimizer, Train_dataloader, Validation_dataloader= accelerator.prepare(
    base_model, optimizer, Train_dataloader, Validation_dataloader
)

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [23]:
def compute_metrics(eval_preds):
    preds, labels= eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds= tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels= np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels= tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds= [pred.strip() for pred in decoded_preds]
    decoded_labels= [[label.strip()] for label in decoded_labels]

    result= metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

> <font color='red'>**Specifying the number of training epochs and setting up the learning rate scheduler.**

In [24]:
num_train_epochs= 10
num_update_steps_per_epoch= len(Train_dataloader)
num_training_steps= num_train_epochs * num_update_steps_per_epoch

lr_scheduler= get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [25]:
def postprocess(predictions, labels):
    predictions= predictions.cpu().numpy()
    labels= labels.cpu().numpy()

    decoded_preds= tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels= np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels= tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds= [pred.strip() for pred in decoded_preds]
    decoded_labels= [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

> <font color='red'>**Setting up the training loop.**

In [26]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    base_model.train()
    for batch in Train_dataloader:
        outputs= base_model(**batch)
        loss= outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    base_model.eval()
    for batch in tqdm(Validation_dataloader):
        with torch.no_grad():
            generated_tokens= accelerator.unwrap_model(base_model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels= batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens= accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels= accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered= accelerator.gather(generated_tokens)
        labels_gathered= accelerator.gather(labels)

        decoded_preds, decoded_labels= postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results= metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(base_model)
    unwrapped_model.save_pretrained('/kaggle/working/out', save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained('/kaggle/working/out')

  0%|          | 0/36160 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

epoch 0, BLEU score: 1.06


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 1, BLEU score: 1.64


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 2, BLEU score: 2.06


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 3, BLEU score: 2.11


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 4, BLEU score: 2.26


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 5, BLEU score: 2.23


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 6, BLEU score: 2.33


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 7, BLEU score: 2.56


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 8, BLEU score: 2.45


  0%|          | 0/402 [00:00<?, ?it/s]

epoch 9, BLEU score: 2.48
