In [1]:
%pip install transformers==4.32 -q
%pip install datasets  -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

kaggle-environments 1.14.15 requires transformers>=4.33.1, but you have transformers 4.32.0 which is incompatible.[0m[31m

[0mNote: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.


# <font color='grey'>Step 1: Loading facebook/nllb-200-distilled-600M</font>

In [2]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained NLLB-200 distilled 600M seq2seq checkpoint (fetched from the Hugging Face Hub when not cached locally).
base_model= AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")




config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]


  torch.utils._pytree._register_pytree_node(


  torch.utils._pytree._register_pytree_node(




pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]


  return torch.load(checkpoint_file, map_location=map_location)


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

> <font color = 'red'>**IMPORTANT**:</font>

**1. Source_language = Hindi**

**2. Target_language = English**

<font color = 'red'>**The objective here is to generate English sentences from the provided Hindi sentences in the parallel corpora. This will allow us to align the English sentences, at the sentence level, with the Kangri and Kinnauri sentences, which will then be used for fine-tuning the LLM.**</font>

In [3]:
from transformers import NllbTokenizer
src_lang= 'hin_Deva' # Language code for Hindi
tgt_lang= 'eng_Latn' # Language code for English
# Build the NLLB tokenizer for the same checkpoint, configured with the source/target language codes above.
tokenizer= NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang=src_lang, tgt_lang=tgt_lang)

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

# <font color='grey'>Step 2: Loading the parallel dataset</font>

In [4]:
from typing_extensions import Annotated
from typing import Tuple
import logging
# force=True removes any handlers already attached to the root logger before
# configuring it, so the INFO level takes effect even if logging was set up earlier.
logging.basicConfig(level=logging.INFO, force=True)

In [5]:
def prepare_dataset(path: str) -> Annotated[list, "sentences"]:
    """
    Read a UTF-8 text file and collect its non-blank lines.
    Args:
        path: The path to the .txt file
    Returns:
        A list with one entry per non-empty line, each stripped of leading
        and trailing whitespace
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Strip lazily, then keep only the lines that still have content.
        stripped_lines = (raw.strip() for raw in handle)
        return [text for text in stripped_lines if text]

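**TODO: The alignment implementation needs a more robust solution. It currently just truncates both lists to the length of the shorter one, so a missing or extra line can leave sentence pairs mismatched. Manual inspection might be required to identify the misaligned pairs.**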

In [6]:
def align_kangri(Kangri_hindi: list, Kangri: list) -> Tuple[
    Annotated[list, "aligned_hindi"],
    Annotated[list, "aligned_kangri"],
]:
    """
    Trims both sides of the Hindi-Kangri dataset to a common length.

    The longer list loses its trailing entries, so the two returned lists
    always hold the same number of sentences. No content-based matching is
    performed; pairing is purely positional.
    Args:
        Kangri_hindi: Hindi sentences in the Hindi-Kangri dataset
        Kangri: Kangri sentences in the Hindi-Kangri dataset
    Returns:
        A tuple of (Hindi sentences, Kangri sentences), both new lists
    """
    # One slice object, applied to both sides, keeps the leading shared portion.
    shared = slice(0, min(len(Kangri_hindi), len(Kangri)))
    return Kangri_hindi[shared], Kangri[shared]

In [7]:
def load_dataset() -> Tuple[
    Annotated[list, "Kinnauri_hindi"],
    Annotated[list, "Kinnauri"],
    Annotated[list, "Kangri_hindi"],
    Annotated[list, "Kangri"],
]:
    """
    Loads both parallel corpora utilizing the prepare_dataset function.

    The Hindi-Kangri pair is additionally passed through align_kangri, which
    truncates both sides to the shorter length. The length comparison for that
    pair is made on the raw (pre-alignment) counts, because after alignment
    the two lists are equal in length by construction.
    Args:
        None
    Returns:
        Tuple of four sentence lists:
        (Kinnauri_hindi, Kinnauri, Kangri_hindi, Kangri)
    """
    logging.info("Loading the Hindi-Kinnauri dataset..")
    Kinnauri_hindi= prepare_dataset(path= '/kaggle/input/parallel-kinnauir-data/Parallel_data_Hi.txt')
    Kinnauri= prepare_dataset(path= '/kaggle/input/parallel-kinnauir-data/Parallel_data_KP.txt')

    # A mismatch is only reported here; both lists are returned unchanged.
    if len(Kinnauri_hindi) != len(Kinnauri):
        logging.error("The number of sentences in Hindi-Kinnauri dataset do not match!")
    else:
        logging.info("Successfully loaded Hindi-Kinnauri dataset!")

    logging.info("Loading the Hindi-kangri dataset..")
    Kangri_hindi= prepare_dataset(path= '/kaggle/input/parallel-hindi-kangri-dataset/Kr_4_Hindi.txt')
    Kangri= prepare_dataset(path= '/kaggle/input/parallel-hindi-kangri-dataset/Kr_4_kangri.txt')

    # Record the raw counts BEFORE aligning: once align_kangri has truncated
    # both lists, comparing their lengths can never detect a mismatch.
    hindi_count, kangri_count = len(Kangri_hindi), len(Kangri)
    Kangri_hindi, Kangri= align_kangri(Kangri_hindi, Kangri)

    if hindi_count != kangri_count:
        logging.error(
            "The number of sentences in Hindi-Kangri dataset do not match! "
            "(%d Hindi vs %d Kangri; truncated to %d pairs)",
            hindi_count, kangri_count, len(Kangri),
        )
    else:
        logging.info("Successfully loaded Hindi-Kangri dataset!")

    return Kinnauri_hindi, Kinnauri, Kangri_hindi, Kangri

In [8]:
# Load both parallel corpora: the Hindi/Kinnauri and Hindi/Kangri sentence lists.
Kinnauri_hindi, Kinnauri, Kangri_hindi, Kangri= load_dataset()

INFO:root:Loading the Hindi-Kinnauri dataset..

INFO:root:Successfully loaded Hindi-Kinnauri dataset!

INFO:root:Loading the Hindi-kangri dataset..

INFO:root:Successfully loaded Hindi-Kangri dataset!


# <font color='grey'>Step 3: Preparing the Translator pipeline</font>

In [9]:
from transformers import pipeline
import pandas as pd
import csv

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


> <font color= 'red'>**NOTE:**</font>

**src_lang and tgt_lang already defined under Step 1**

In [10]:
# Wrap the model and tokenizer loaded in Step 1 in a Hindi -> English translation pipeline.
# device= 0 selects the first CUDA device, so this cell expects a GPU runtime.
translator= pipeline('translation', model=base_model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, device= 0)

In [11]:
def prepare_translations(
    Hindi: list,
    max_length: int = 256,
    batch_size: int = 1,
) -> Annotated[list, "English"]:
    """
    Utilizes the module-level `translator` pipeline to generate English translations for the input Hindi sentences.

    All sentences are handed to the pipeline in a single call so that it can
    iterate (and optionally batch) internally, instead of being invoked once
    per sentence from a Python loop.
    Args:
        Hindi: The Hindi sentences from both Kangri and Kinnauri parallel corpora.
        max_length: Maximum generation length forwarded to the pipeline.
            Defaults to 256, the value that was previously hard-coded.
        batch_size: Number of sentences the pipeline processes per forward
            pass. The default of 1 processes one sentence at a time, as before.
    Returns:
        A list of English sentences, in the same order as the input.
    """
    sentences = list(Hindi)
    if not sentences:
        # Nothing to translate; avoid calling the pipeline with an empty batch.
        return []

    outputs = translator(sentences, max_length= max_length, batch_size= batch_size)

    English= []
    for item in outputs:
        # For a list of strings the pipeline is expected to return one dict per
        # sentence; unwrap defensively in case it returns a one-element list instead.
        first = item[0] if isinstance(item, list) else item
        English.append(first['translation_text'])

    return English

In [12]:
def generate_translations(Kinnauri_hindi: list, Kangri_hindi: list) -> Tuple[
    Annotated[list, "Kinnauri_dataset"],
    Annotated[list, "Kangri_dataset"],
]:
    """
    Translates the Hindi side of each corpus with 'prepare_translations' and pairs every Hindi sentence with its English output.
    Args:
        Kinnauri_hindi: List of Hindi sentences in Hindi-Kinnauri dataset.
        Kangri_hindi: List of Hindi sentences in Hindi-Kangri dataset.
    Returns:
        Tuple of two lists of (Hindi, English) pairs: the first built from the
        Hindi-Kinnauri sentences, the second from the Hindi-Kangri sentences.
    """
    def _pair_with_english(hindi_sentences):
        # Translate, then zip each source sentence with its translation.
        english_sentences = prepare_translations(Hindi= hindi_sentences)
        return [(hindi, english) for hindi, english in zip(hindi_sentences, english_sentences)]

    Kinnauri_dataset = _pair_with_english(Kinnauri_hindi)
    Kangri_dataset = _pair_with_english(Kangri_hindi)
    return Kinnauri_dataset, Kangri_dataset

In [13]:
def prepare_csv(filename: str, dataset: list):
    """
    Saves the dataset to a csv file with a 'Hindi,English' header row.
    Args:
        filename: The name of the csv file to be constructed
        dataset: Iterable of (Hindi, English) pairs
    Returns:
        None
    """
    # newline='' is required by the csv module: the writer emits its own
    # '\r\n' row terminators, and without it platforms that translate newlines
    # would produce '\r\r\n' (blank rows when the file is read back).
    with open(filename, 'w', encoding='utf-8', newline='') as file:
        writer= csv.writer(file)
        writer.writerow(['Hindi', 'English'])
        # Unpack each entry so that anything other than a 2-item pair still raises.
        writer.writerows([Hindi, English] for Hindi, English in dataset)

In [14]:
%%time
# Translate the Hindi side of both corpora to English, then write each list of
# (Hindi, English) pairs to its own CSV file in the working directory.
Kinnauri_dataset, Kangri_dataset= generate_translations(Kinnauri_hindi, Kangri_hindi)
prepare_csv(filename= "Kinnauri_dataset.csv", dataset= Kinnauri_dataset)
prepare_csv(filename= "Kangri_dataset.csv", dataset= Kangri_dataset)



Your input_length: 242 is bigger than 0.9 * max_length: 256. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


CPU times: user 4h 19min 53s, sys: 3min 16s, total: 4h 23min 9s

Wall time: 4h 22min 58s


**NOTE:**

**After a bit of research, one possible alternative would have been to convert the dataset into the Hugging Face `datasets` format to save some time during the translation phase. However, the translation is already complete, so this approach can be explored later.**

> <font color= 'red'>**Hence, data ingestion is complete and fine-tuning can be performed!**</font>