<a href="https://colab.research.google.com/github/ArunVignesh75/Machine-Learning/blob/main/MBart_Translation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install transformers[sentencepiece]



In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
import re
import string
from string import digits

In [23]:
# Read the parallel English/Hindi corpus from the CSV in the working directory
# and show the resulting frame.
lines = pd.read_csv(filepath_or_buffer="Hindi_English_Truncated_Corpus.csv")
lines

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...,...
127602,indic2012,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
127605,tides,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [24]:
# Count the missing values in each column.
lines.isnull().sum(axis=0)

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [25]:
# Drop, in place, every row that has a missing value in any column.
lines.dropna(axis=0, how="any", inplace=True)

In [26]:
# Re-count missing values per column to confirm none remain.
lines.isnull().sum(axis=0)

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [27]:
# Longest English sentence, measured in whitespace-separated tokens
english_token_counts = lines['english_sentence'].str.split().str.len()
max_length_english = english_token_counts.max()

print(f"Maximum length of English sentences: {max_length_english}")


Maximum length of English sentences: 398


In [28]:
# Longest Hindi sentence, measured in whitespace-separated tokens
hindi_token_counts = lines['hindi_sentence'].str.split().str.len()
max_length_hindi = hindi_token_counts.max()

print(f"Maximum length of Hindi sentences: {max_length_hindi}")

Maximum length of Hindi sentences: 418


In [29]:
# Pre-processing, step 1: lowercase the English side of the corpus.
lines['english_sentence'] = lines['english_sentence'].str.lower()

In [30]:
# Show the English column after lowercasing.
lines.loc[:, 'english_sentence']

0         politicians do not have permission to do what ...
1                i'd like to tell you about one such child,
2         this percentage is even greater than the perce...
3         what we really mean is that they're bad at not...
4         .the ending portion of these vedas is called u...
                                ...                        
127602    examples of art deco construction can be found...
127603                            and put it in our cheeks.
127604    as for the other derivatives of sulphur , the ...
127605    its complicated functioning is defined thus in...
127606    they've just won four government contracts to ...
Name: english_sentence, Length: 127605, dtype: object

In [31]:
# Remove every apostrophe character from both languages (literal match, no regex).
lines['english_sentence'] = lines['english_sentence'].str.replace("'", '', regex=False)
lines['hindi_sentence'] = lines['hindi_sentence'].str.replace("'", '', regex=False)

In [32]:
# Characters to discard: everything listed in string.punctuation.
exclude = set(string.punctuation)


def _drop_excluded(text):
    """Return `text` with every character found in `exclude` removed."""
    return ''.join(filter(lambda ch: ch not in exclude, text))


lines['english_sentence'] = lines['english_sentence'].map(_drop_excluded)
lines['hindi_sentence'] = lines['hindi_sentence'].map(_drop_excluded)


In [33]:
# Translation table that deletes every character in string.digits.
remove_digits = str.maketrans('', '', digits)
lines['english_sentence'] = lines['english_sentence'].str.translate(remove_digits)
lines['hindi_sentence'] = lines['hindi_sentence'].str.translate(remove_digits)

In [34]:
# Delete the Devanagari digit characters listed in the character class from the Hindi text.
lines['hindi_sentence'] = lines['hindi_sentence'].str.replace("[२३०८१५७९४६]", "", regex=True)


In [35]:
# Trim leading and trailing whitespace from both columns.
lines['english_sentence'] = lines['english_sentence'].str.strip()
lines['hindi_sentence'] = lines['hindi_sentence'].str.strip()

In [36]:
# Squeeze every run of consecutive spaces down to a single space.
lines['english_sentence'] = lines['english_sentence'].str.replace(" +", " ", regex=True)
lines['hindi_sentence'] = lines['hindi_sentence'].str.replace(" +", " ", regex=True)

# Wrap each Hindi target sentence in the literal START_ / _END marker tokens.
# Note: re-running this cell wraps the sentences again.
lines['hindi_sentence'] = 'START_ ' + lines['hindi_sentence'] + ' _END'


In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    lines,
    random_state=42,
    test_size=0.2,
)

In [38]:
# Build the translation pipeline; the model weights and the tokenizer are both
# loaded from the same pretrained checkpoint.
mbart_checkpoint = "facebook/mbart-large-50-one-to-many-mmt"
translation_pipeline = pipeline(
    task="translation",
    model=MBartForConditionalGeneration.from_pretrained(mbart_checkpoint),
    tokenizer=MBart50TokenizerFast.from_pretrained(mbart_checkpoint),
)



In [None]:
# Send each English test sentence through the pipeline (en_XX -> hi_IN), one
# call per sentence, and keep the 'translation_text' of the first result.
translated_texts = []
for text in test_df['english_sentence']:
    outputs = translation_pipeline(text, max_length=500, src_lang="en_XX", tgt_lang="hi_IN")
    translated_texts.append(outputs[0]['translation_text'])



In [None]:
# Evaluate the translations
# Attach the raw model output as a new column. `assign` returns a fresh frame,
# so this does not write into the slice produced by train_test_split (which may
# raise SettingWithCopyWarning) and gives the same result when the cell is re-run.
test_df = test_df.assign(predicted_hindi=translated_texts)


def _normalize_prediction(text):
    """Clean a model translation the same way the reference column was cleaned.

    Removes ASCII punctuation (apostrophes included), ASCII digits and the
    Devanagari digits, then trims the ends and collapses runs of spaces.

    Args:
        text: Raw translated sentence returned by the pipeline.

    Returns:
        The cleaned sentence as a string.
    """
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.translate(str.maketrans('', '', digits))
    text = re.sub("[२३०८१५७९४६]", "", text)
    return re.sub(" +", " ", text.strip())


# The reference sentences were wrapped in 'START_ ' / ' _END' markers during
# pre-processing. The model never emits those markers, so comparing against the
# wrapped text makes every pair differ. Strip them before scoring; the pattern
# is a no-op when the markers are absent.
reference_hindi = test_df['hindi_sentence'].str.replace(r"^START_ | _END$", "", regex=True)
predicted_clean = test_df['predicted_hindi'].map(_normalize_prediction)

# Exact sentence-level match between cleaned reference and cleaned prediction.
# This is a very strict measure for translation; a corpus-level metric such as
# BLEU or chrF would be more informative if an extra dependency is acceptable.
accuracy = accuracy_score(reference_hindi, predicted_clean)

# Print the accuracy
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Optionally persist the test rows, including the predictions, to a CSV file
# (the DataFrame index is not written).
test_df.to_csv(path_or_buf='translated_results.csv', index=False)