In [None]:
# Notebook dependencies: transformers is pinned to 4.32, datasets and scikit-learn are unpinned.
# NOTE(review): fix_tokenizer further down writes to NllbTokenizer internals, so the
# transformers pin is presumably deliberate -- confirm before upgrading.
%pip install transformers==4.32 -q
%pip install datasets -q
%pip install scikit-learn -q

# <font color='grey'>**Step 1: Rebuild the data and tokenizer pipelines exactly as in the fine-tuning process, up to (but not including) the collation pipeline**</font>

In [1]:
import pandas as pd
import logging
import datasets

from typing import Tuple
from typing_extensions import Annotated
from sklearn.model_selection import train_test_split
from datasets import Dataset
# force=True makes basicConfig replace any handlers already attached to the root logger,
# so the INFO messages emitted by the pipeline functions below are shown.
logging.basicConfig(level=logging.INFO, force=True)

> <font color = 'red'>**Data pipeline: Same as in the fine-tuning phase.**</font>

In [2]:
def load_data(path: str) -> Annotated[pd.DataFrame, "data"]:
    """
    Read the .csv input file into memory.
    Args:
        path: Location of the .csv file to read.
    Returns:
        data: The file contents as a pandas dataframe.
    """
    logging.info("Loading the data.")
    return pd.read_csv(path)

In [3]:
def split_data(data: pd.DataFrame, test_size: float = 0.1, validation_size: float = 0.1, random_state: int = 42) -> Tuple[
    Annotated[pd.DataFrame,"train_dataset"],
    Annotated[pd.DataFrame,"validation_dataset"],
    Annotated[pd.DataFrame,"test_dataset"],
]:
    """
    Splits the pandas dataframe into train-val-test splits.

    The test split is carved off first; the validation split is then taken
    from the rows that remain, so ``validation_size`` is a fraction of the
    remaining rows, not of the whole dataframe. The defaults reproduce the
    split that was previously hard-coded (0.1 / 0.1 with seed 42), which
    must stay in sync with the split used during fine-tuning so that the
    test rows are the ones the model never saw.
    Args:
        data: The pandas dataframe.
        test_size: Fraction of ``data`` held out as the test split.
        validation_size: Fraction of the non-test rows held out for validation.
        random_state: Seed passed to both ``train_test_split`` calls.
    Returns:
        train_dataset,
        validation_dataset,
        test_dataset
    """
    logging.info("Preparing the train-val-test split.")
    train_val_df, test_dataset = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    train_dataset, validation_dataset = train_test_split(
        train_val_df, test_size=validation_size, random_state=random_state
    )
    return train_dataset, validation_dataset, test_dataset

In [4]:
def torch_dataset(train_dataset: pd.DataFrame, validation_dataset: pd.DataFrame, test_dataset: pd.DataFrame) -> Tuple[
    Annotated[Dataset,"train_dataset"],
    Annotated[Dataset,"validation_dataset"],
    Annotated[Dataset,"test_dataset"],
]:
    """
    Convert the train, val and, test pandas dataset into hugging face datasets.

    After conversion the "Hindi" column and the "__index_level_0__" column are
    dropped from every split. Only the columns that are actually present are
    removed, so a dataframe that lacks one of them (for example one that does
    not produce an "__index_level_0__" column) no longer raises.
    Args:
        train_dataset, validation_dataset, test_dataset
    Returns:
        train_dataset: Type= Dataset
        validation_dataset: Type= Dataset
        test_dataset: Type= Dataset
    """
    unwanted_columns = ("Hindi", "__index_level_0__")

    logging.info("Converting to Huggingface datasets.")
    converted = [
        Dataset.from_pandas(frame)
        for frame in (train_dataset, validation_dataset, test_dataset)
    ]

    logging.info("Removing unnecessary columns from the dataset.")
    cleaned = []
    for hf_dataset in converted:
        # remove_columns raises on unknown names, so filter against the real schema first.
        present = [name for name in unwanted_columns if name in hf_dataset.column_names]
        cleaned.append(hf_dataset.remove_columns(present) if present else hf_dataset)

    train_hf, validation_hf, test_hf = cleaned
    return train_hf, validation_hf, test_hf

In [5]:
def data_pipeline(path: str = "/kaggle/input/english-kinnauri-dataset/Kinnauri_final.csv") -> Annotated[datasets.DatasetDict, "main_dataset"]:
    """
    Pipeline for loading the data.

    Runs load_data -> split_data -> torch_dataset and bundles the three
    resulting splits into a single DatasetDict.
    Args:
        path: Location of the .csv file. Defaults to the Kaggle input path
            this notebook was written against, so calling ``data_pipeline()``
            with no argument behaves as before.
    Returns:
        main_dataset: DatasetDict with the keys "Train", "Validation" and "Test".
    """
    data = load_data(path)
    train_df, validation_df, test_df = split_data(data)
    train_split, validation_split, test_split = torch_dataset(train_df, validation_df, test_df)
    return datasets.DatasetDict({
        "Train": train_split,
        "Validation": validation_split,
        "Test": test_split,
    })

In [6]:
# Build the train/validation/test DatasetDict once; a later cell reads main_dataset["Test"].
main_dataset= data_pipeline()
# Bare last expression so the notebook renders the DatasetDict summary.
main_dataset

INFO:root:Loading the data.
INFO:root:Preparing the train-val-test split.
INFO:root:Converting to Huggingface datasets.
INFO:root:Removing unnecessary columns from the dataset.


DatasetDict({
    Train: Dataset({
        features: ['English', 'Kinnauri'],
        num_rows: 16448
    })
    Validation: Dataset({
        features: ['English', 'Kinnauri'],
        num_rows: 1828
    })
    Test: Dataset({
        features: ['English', 'Kinnauri'],
        num_rows: 2031
    })
})

> <font color = 'red'>**Tokenization pipeline: The adjusted_model function has been removed for the evaluation phase.**</font>

In [28]:
from transformers import NllbTokenizer

import torch

In [29]:
def load_tokenizer(src_lang: str, tgt_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Fetch the pretrained NllbTokenizer configured for one translation direction.
    Args:
        src_lang: Language code of the text that is fed to the tokenizer.
        tgt_lang: Language code of the text that is to be generated.
    Returns:
        tokenizer: NllbTokenizer loaded from "facebook/nllb-200-distilled-600M"
            with src_lang and tgt_lang set.
    """
    logging.info("Loading NllbTokenizer.")

    checkpoint = "facebook/nllb-200-distilled-600M"
    return NllbTokenizer.from_pretrained(checkpoint, src_lang=src_lang, tgt_lang=tgt_lang)

In [30]:
def fix_tokenizer(tokenizer: NllbTokenizer, new_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Introduces the language code for the new_lang in the NllbTokenizer.

    The tokenizer is modified in place (its id/token lookup tables are
    rewritten) and the same object is returned.
    NOTE(review): every attribute touched here is a tokenizer internal; the
    notebook installs transformers==4.32 -- verify these attributes still
    exist before changing that version.
    Args:
        tokenizer: The original NllbTokenizer.
        new_lang: The language code to be added.
    Returns:
        tokenizer: The updated tokenizer.
    """
    logging.info("Fixing the tokenizer.")
    
    # Tokenizer length, minus one when new_lang is already listed in added_tokens_encoder
    # (presumably so that calling this again yields the same id -- TODO confirm).
    old_len= len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    # Register new_lang under id old_len-1 in both directions of the language-code lookup.
    tokenizer.lang_code_to_id[new_lang]= old_len-1
    tokenizer.id_to_lang_code[old_len-1]= new_lang

    # Recompute the id of "<mask>" from the sentencepiece size, the (now larger) number of
    # language codes and the fairseq offset, i.e. one position past the language codes.
    tokenizer.fairseq_tokens_to_ids["<mask>"]= len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    # Copy the language-code ids into the token->id table, then rebuild the id->token table
    # as its exact inverse.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens= {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    # List new_lang among the additional special tokens, once.
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    # Reset both added-token tables to empty dicts.
    # NOTE(review): presumably so new_lang is resolved only through the tables above and
    # not through the added-tokens path -- confirm.
    tokenizer.added_tokens_encoder= {}
    tokenizer.added_tokens_decoder= {}
    
    return tokenizer

In [31]:
def tokenizer_pipeline(src_lang: str, tgt_lang: str, similar_lang: str)-> Annotated[NllbTokenizer, "tokenizer"]:
    """
    Pipeline for Fixing the tokenizer.

    Loads the NllbTokenizer for the src_lang -> tgt_lang direction and then
    registers tgt_lang as a new language code on it.
    Args:
        src_lang: Language code of the input text.
        tgt_lang: Language code to generate; also the code that gets added.
        similar_lang: Accepted for signature compatibility; not used in this function.
    Returns:
        tokenizer: The loaded and fixed NllbTokenizer.
    """
    return fix_tokenizer(
        load_tokenizer(src_lang=src_lang, tgt_lang=tgt_lang),
        new_lang=tgt_lang,
    )

In [32]:
# Build the tokenizer with "eng_Latn" as the source language and register "kang_Deva" as the
# target language code. similar_lang is passed, but tokenizer_pipeline does not use it.
tokenizer= tokenizer_pipeline(src_lang="eng_Latn",tgt_lang= "kang_Deva", similar_lang="hin_Deva")

INFO:root:Loading NllbTokenizer.
INFO:root:Fixing the tokenizer.


# <font color='grey'>**Step 2: Loading the Fine-tuned model and preparing the translation pipeline**</font>

In [15]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM

In [17]:
# Load the fine-tuned seq2seq checkpoint from the Kaggle input directory.
fine_tuned_model= AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/english-kinnauri/pytorch/default/1")

  return torch.load(checkpoint_file, map_location=map_location)


In [40]:
# Translation pipeline around the fine-tuned model and the fixed tokenizer.
# device: use the first GPU when CUDA is available, otherwise fall back to the CPU (-1)
# instead of failing on a runtime without an accelerator.
translator= pipeline(
    'translation',
    model=fine_tuned_model,
    tokenizer=tokenizer,
    src_lang='eng_Latn',
    tgt_lang='kang_Deva',
    device=0 if torch.cuda.is_available() else -1,
)

# <font color='grey'>**Step 3: Evaluating the fine-tuned model**</font>

In [49]:
# Evaluation-only dependencies (BLEU scoring); neither version is pinned.
%pip install sacrebleu -q
%pip install evaluate -q

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [50]:
import evaluate
# Load the "sacrebleu" metric through the evaluate library; it scores the translations below.
metric= evaluate.load("sacrebleu")

In [35]:
# Select the "Test" split of the DatasetDict for evaluation.
test_dataset= main_dataset["Test"]
# Bare last expression so the notebook renders the split summary.
test_dataset

Dataset({
    features: ['English', 'Kinnauri'],
    num_rows: 2031
})

In [42]:
# Materialise the test split as a dataframe so its two text columns can be walked in parallel.
df = pd.DataFrame(test_dataset)

# One record per test sentence: the English source, the reference Kinnauri text and the
# model output. The translator is called once per sentence, capped at 128 tokens.
translations = [
    {
        'English': source_sentence,
        'Target Kinnauri': reference_sentence,
        'Generated Kinnauri': translator(source_sentence, max_length=128)[0]['translation_text'],
    }
    for source_sentence, reference_sentence in zip(df['English'], df['Kinnauri'])
]



In [44]:
# Persist source / reference / generated sentences so scoring can be re-run without
# repeating the translation loop.
results_df= pd.DataFrame(translations)
# index=False: the positional index carries no information and would otherwise come back
# as a stray "Unnamed: 0" column when the file is read again.
results_df.to_csv("/kaggle/working/English-Kinnauri-Translations.csv", index=False)

**Computing the BLEU score on the generated translations**

In [59]:
# keep_default_na=False: without it the CSV round-trip turns an empty translation (or
# literal text such as "NA" / "null") into a float NaN, which is not a valid sentence.
df= pd.read_csv("/kaggle/working/English-Kinnauri-Translations.csv", keep_default_na=False)
# Pass plain Python strings to the metric: one hypothesis per sentence, and a list of
# references per sentence (here exactly one reference each).
predictions= df["Generated Kinnauri"].astype(str).tolist()
references= [[reference] for reference in df["Target Kinnauri"].astype(str)]
results = metric.compute(predictions=predictions, references=references)

In [60]:
# Print the full result dictionary returned by metric.compute.
print(results)

{'score': 8.435075362674903, 'counts': [5013, 1690, 689, 323], 'totals': [16942, 14911, 12889, 10921], 'precisions': [29.58918663676071, 11.333914559721011, 5.345643572038172, 2.957604614962], 'bp': 0.9884977801312381, 'sys_len': 16942, 'ref_len': 17138}
