In [1]:
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    pipeline, DataCollatorForSeq2Seq, EarlyStoppingCallback
)
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Read the CSV file
data = pd.read_csv("/kaggle/input/eng-naga/eng-naga.csv")

# Verify the data
print(data.head())


                                             English  \
0  Paul, Silvanus, and Timothy to the church of t...   
1  We always give thanks to God for all of you as...   
2  We remember before our God and Father your wor...   
3  Brothers loved by God, we know he has chosen you,   
4  because our gospel came to you not in word onl...   

                                            Nagamese  
0  Paul aru Silvanus aru Timothy pora Isor aru Pr...  
1  Amikhan hodai apnikhan nimite Isor ke dhanyaba...  
2  Amikhan pora apni khan laga biswas laga kaam, ...  
3  Isor pora morom kora bhai khan, amikhan jane T...  
4  kilemane Isor laga kotha apni khan logote khal...  


In [3]:
# data=data[:100]

In [4]:
# Load model and tokenizer
model_name = "facebook/m2m100_418M"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

In [5]:
# Get the next available token ID
new_lang_id = len(tokenizer.lang_code_to_id)

# Register 'ng' as a new language
tokenizer.lang_code_to_id["ng"] = new_lang_id

print(f"Added Nagamese ('ng') with ID {new_lang_id}")

Added Nagamese ('ng') with ID 100


In [6]:
source_texts = list(data["English"])  
target_texts = list(data["Nagamese"]) 



# Split the data into training and validation sets
train_source_texts, val_source_texts, train_target_texts, val_target_texts = train_test_split(
    source_texts, target_texts, test_size=0.20, random_state=40
)


print(type(train_source_texts))
print(type(train_target_texts))
print(type(val_source_texts))
print(type(val_target_texts))


<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


In [7]:
from datasets import Dataset

# Tokenize function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["source_texts"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["target_texts"], max_length=128, truncation=True, padding="max_length")["input_ids"]
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    model_inputs["labels"] = labels
    return model_inputs

# Convert lists to Hugging Face Dataset
train_dataset = Dataset.from_dict({"source_texts": train_source_texts, "target_texts": train_target_texts})
val_dataset = Dataset.from_dict({"source_texts": val_source_texts, "target_texts": val_target_texts})

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/6360 [00:00<?, ? examples/s]

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

In [8]:
!pip install transformers sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [9]:
import sacrebleu

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode tokenized outputs into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects reference translations as a **list of lists**
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])

    return {"bleu": bleu.score}

In [10]:
# # Define Data Collator
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps", 
    save_strategy="steps",
    eval_steps=200,
    save_steps=200,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=200,
    seed = 40,
    weight_decay=0.01,
    save_total_limit=5,
    num_train_epochs=10,
    predict_with_generate=True,
    optim="adafactor", 
    report_to=[],
    fp16=True,
    logging_dir="./logs",
    logging_steps=200,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # tokenizer=tokenizer,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=4)],
    compute_metrics=compute_metrics,
    # data_collator=data_collator,
)

# Start Training
trainer.train()


Step,Training Loss,Validation Loss,Bleu
200,4.0059,2.228443,7.44932
400,1.8774,1.59553,19.164946
600,1.4244,1.460221,20.437098
800,1.1927,1.399927,22.184199
1000,1.0309,1.383815,22.366571
1200,0.9024,1.370413,23.013627
1400,0.8017,1.391551,23.060172
1600,0.7219,1.396016,23.569916
1800,0.6557,1.407653,23.449086


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=1990, training_loss=1.3263672028354665, metrics={'train_runtime': 11349.399, 'train_samples_per_second': 5.604, 'train_steps_per_second': 0.175, 'total_flos': 1.72284815867904e+16, 'train_loss': 1.3263672028354665, 'epoch': 10.0})

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [12]:
model = model.to(device)

In [13]:
source_texts = val_source_texts
reference_texts = val_target_texts

In [14]:
# Translate source texts using the model
model.eval()
machine_translations = []

for text in source_texts:
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["ng"])
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    machine_translations.append(translated_text)

In [15]:

# Ensure reference_texts is in the required format
wrapped_reference_texts = [reference_texts] 

# Compute BLEU score
bleu = sacrebleu.corpus_bleu(machine_translations, wrapped_reference_texts)
print("BLEU Score:", bleu.score)

BLEU Score: 21.249111194570453
