In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Malayalam to English Transilation
Machine transilation has become an essential tool for bridging language barriers in our increasingly interconnected world.The main challenge of transilation between Malayalam and English  is distinct structure and cultural contexts.We use Hugging face to tackle this challenge

## Import Libraries




In [77]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import os
import re
# from sacrebleu import corpus_bleu
os.environ["WANDB_DISABLED"] = "true"



## Load Dataset
we use the dataset Hemanth-thunder/english-to-malayalam-mt from hugging face dataset.

In [78]:
raw_dataset = load_dataset("Hemanth-thunder/english-to-malayalam-mt")
raw_dataset 

DatasetDict({
    train: Dataset({
        features: ['en', 'ml'],
        num_rows: 5924426
    })
})

In [79]:

raw_dataset = DatasetDict({
    'train': raw_dataset['train'].select(range(200000))
})
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ml'],
        num_rows: 200000
    })
})

## Preparing the  Data

In [80]:
raw_dataset['train'][0]

{'en': 'The plot of the movie revolves around the life of two cancer patients Kizie and Manny.',
 'ml': 'ക്യാന്\u200dസറിനോട് പോരാടുന്ന കിസി, മാനി എന്നിവരുടെ ജീവിതമാണ് ചിത്രം പറയുന്നത്.'}

In [81]:

def clean_malayalam_text(example):
    pattern = r"[\u2000-\u200F]+" #this will remove the zwj
    text = re.sub(pattern, '', example['ml'])
    text=text.lower()
    text=re.sub("'",'',text)
    text = re.sub('[^\u0D00-\u0D7F]+', ' ', text) #all other character than malayalam
    
    example['ml']=text
    return example

raw_dataset['train'] = raw_dataset['train'].map(clean_malayalam_text)


In [82]:
raw_dataset['train']['ml'][0]

'ക്യാന്സറിനോട് പോരാടുന്ന കിസി മാനി എന്നിവരുടെ ജീവിതമാണ് ചിത്രം പറയുന്നത് '

In [83]:
raw_dataset['train']['en'][0]

'The plot of the movie revolves around the life of two cancer patients Kizie and Manny.'

In [84]:
def clean_english(example):
    text = re.sub("'", '', example['en'])
    text=text.lower()
    text=re.sub('[^a-z]+',' ',text)
    example['en']=text
    return example
raw_dataset['train'] = raw_dataset['train'].map(clean_english)


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

In [85]:
raw_dataset['train']['en'][0]

'the plot of the movie revolves around the life of two cancer patients kizie and manny '

use the pretrained model *Helsinki-NLP/opus-mt-ml-en*,this model will help to tokenize and transilate to sequence to sequence task


In [87]:
model_name = "Helsinki-NLP/opus-mt-ml-en"

In [88]:
from transformers import AutoTokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/818k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.72M [00:00<?, ?B/s]



In [89]:
raw_dataset=raw_dataset['train']
raw_dataset

Dataset({
    features: ['en', 'ml'],
    num_rows: 200000
})

In [92]:
max_length=128
def preprocess(example):
    text=[ml for ml in example['ml']]
    labels=[en for en in example['en']]
    model_input=tokenizer(text,max_length=max_length)
    with tokenizer.as_target_tokenizer():
        label=tokenizer(labels,max_length=max_length)
    model_input['labels']=label['input_ids']
    return model_input
    

In [93]:
tokenized_dataset=raw_dataset.map(preprocess,batched=True)
tokenized_dataset

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Dataset({
    features: ['en', 'ml', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 200000
})

In [95]:
dataset=tokenized_dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ml', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 160000
    })
    test: Dataset({
        features: ['en', 'ml', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40000
    })
})

In [96]:
train_dataset=dataset['train']
test_dataset=dataset['test']

In [98]:
from transformers import AutoModelForSeq2SeqLM
model=AutoModelForSeq2SeqLM.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

## Define Training configuration

In [99]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./malayalam_to_english_results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    save_total_limit=2,  # Limit saved checkpoints
    generation_max_length=128
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [100]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model, return_tensors = "pt", pad_to_multiple_of=128)


In [101]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [102]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.3784,3.20176
2,3.1399,3.014739
3,2.9856,2.967042


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63223]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63223]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63223]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63223]], 'forced_eos_token_id': 0}


TrainOutput(global_step=15000, training_loss=3.2898853190104167, metrics={'train_runtime': 7636.6443, 'train_samples_per_second': 62.855, 'train_steps_per_second': 1.964, 'total_flos': 1.627121516544e+16, 'train_loss': 3.2898853190104167, 'epoch': 3.0})

## Inferencing Model
To perform inference with your model, you'll need to convert your input text into numerical vectors using the tokenizer . Once your input is prepared, you can use the generate function to obtain predictions from the model

In [151]:
def translate_new_input(model,tokenizer,input_text,max_length=128):
    device=model.device
    model.to(device)  
    inputs=tokenizer(input_text,return_tensors="pt",truncation=True, max_length=max_length).to(device)
    outputs=model.generate(inputs["input_ids"], max_length=max_length)
    translated_text=tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Example usage
new_input =" സുഖമാണോ"  # Malayalam input
translated_output = translate_new_input(model, tokenizer, new_input)
print(f"Input: {new_input}")
print(f"Translated Output: {translated_output}")

Input:  സുഖമാണോ
Translated Output: how are you


In [121]:
# After training

# Save the model
output_dir = "/kaggle/working/my_model"  # Directory in the Kaggle environment where you can save files

trainer.save_model(output_dir)

# Optionally, save the tokenizer
tokenizer.save_pretrained(output_dir)


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[63223]], 'forced_eos_token_id': 0}


('/kaggle/working/my_model/tokenizer_config.json',
 '/kaggle/working/my_model/special_tokens_map.json',
 '/kaggle/working/my_model/vocab.json',
 '/kaggle/working/my_model/source.spm',
 '/kaggle/working/my_model/target.spm',
 '/kaggle/working/my_model/added_tokens.json')

In [122]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load model and tokenizer
model1 = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/my_model")
tokenizer1 = AutoTokenizer.from_pretrained("/kaggle/working/my_model")




In [124]:
def translate_new_input(model,tokenizer,input_text,max_length=128):
    device=model1.device
    model1.to(device)  
    inputs=tokenizer1(input_text,return_tensors="pt",truncation=True, max_length=max_length).to(device)
    outputs=model1.generate(inputs["input_ids"], max_length=max_length)
    translated_text=tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Example usage
new_input = "റോസാദളങ്ങളാൽ പൊതിഞ്ഞ ഒരു ശരീരം നിങ്ങളുടെ വായ പൂവിട്ടത് എവിടെയാണെന്ന് അടയാളപ്പെടുത്തുന്നു"  # Malayalam input
translated_output = translate_new_input(model, tokenizer, new_input)
print(f"Input: {new_input}")
print(f"Translated Output: {translated_output}")

Input: റോസാദളങ്ങളാൽ പൊതിഞ്ഞ ഒരു ശരീരം നിങ്ങളുടെ വായ പൂവിട്ടത് എവിടെയാണെന്ന് അടയാളപ്പെടുത്തുന്നു
Translated Output: a body covered with rock has been marked by your mouth
