In [28]:
import string
import re

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf


In [29]:
from datasets import load_dataset

# Load the dataset
raw_datasets = load_dataset("cfilt/iitb-english-hindi")



In [30]:
def preprocess(text):
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.lower()
    text = re.sub(r'\d','',text)
    text = re.sub(r'\s+',' ',text)  #Remove extra spaces
    text = text.strip()
    return text

In [31]:
data = raw_datasets["train"]["translation"]
    
eng_sen = []
hin_sen = []

# rows = [{"en": item["en"], "hi": item["hi"]} for item in data]
for item in data:
    eng_sen.append(item['en'])
    hin_sen.append(item['hi'])
    
eng_sen = [preprocess(en) for en in eng_sen]
hin_sen = [' ' + re.sub('[a-zA-Z]', '', preprocess(hi)) + ' ' for hi in hin_sen]

#Remove duplicate sentences
english_unique = set()
english_sentences_temp = []
hindi_sentences_temp = []
l = len(eng_sen)
for i in range(l):
    if eng_sen[i] not in english_unique:
        english_unique.add(eng_sen[i])
        english_sentences_temp.append(eng_sen[i])
        hindi_sentences_temp.append(hin_sen[i])

eng_sen = english_sentences_temp
hin_sen = hindi_sentences_temp

In [32]:
print(len(eng_sen), len(hin_sen))
print()
eng_sen[:3], hin_sen[:3]

1044136 1044136



(['give your application an accessibility workout',
  'accerciser accessibility explorer',
  'the default plugin layout for the bottom panel'],
 [' अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें ',
  ' एक्सेर्साइसर पहुंचनीयता अन्वेषक ',
  ' निचले पटल के लिए डिफोल्ट प्लगइन खाका '])

In [33]:
vocab_size = 100000
total_sentences = 50
maxlen = 10

In [34]:
en_data = []
hi_data = []

cnt = 0

for (en, hi) in zip(eng_sen, hin_sen):
    l = min(len(en.split()), len(hi.split()))
    if l <= maxlen:
        en_data.append(en)
        hi_data.append(hi)
        cnt += 1
    if cnt == total_sentences:
        break

In [43]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

def translator(text):
  # function to translate english text to hindi
  input_ids = tokenizer.encode(text, return_tensors="pt", padding=True)
  outputs = model.generate(input_ids)
  decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  
  return decoded_text

#text you want translate
texts = en_data
hi_translated = []

for text in texts:
    hi_translated.append(translator(text))

In [44]:
import nltk

In [45]:
def remove_special_tokens(sentence):
    # Remove punctuation and special tokens
    cleaned_sentence = re.sub(r'[^\w\s]', '', sentence)
    return cleaned_sentence
cleaned_predictions = [remove_special_tokens(sentence) for sentence in hi_translated]
hi_translated = [nltk.word_tokenize(sentence) for sentence in cleaned_predictions]

hi_data = [nltk.word_tokenize(sentence) for sentence in hi_data]

In [50]:
smoothie = SmoothingFunction().method1
# bleu_score = corpus_bleu(hi_data_cleaned, hi_translated_cleaned, smoothing_function=smoothie)
bleu_score = corpus_bleu(hi_data, hi_translated, smoothing_function=smoothie)

print("\nBLEU score =", bleu_score)

KeyError: ('तरछ',)

In [51]:
hi_translated

[['अपन', 'अनपरयग', 'क', 'पहचनयत', 'वययम', 'क', 'लभ', 'द'],
 ['एकसरसइसर', 'पहचनयत', 'अनवषक'],
 ['नचल', 'पटल', 'क', 'लए', 'डफलट', 'पलगइन', 'खक'],
 ['ऊपर', 'पटल', 'क', 'लए', 'डफलट', 'पलगइन', 'खक'],
 ['उन',
  'पलगइन',
  'क',
  'सच',
  'जनह',
  'डफलट',
  'रप',
  'स',
  'नषकरय',
  'कय',
  'गय',
  'ह'],
 ['हइलइट', 'अवध'],
 ['पहचनय', 'आसध', 'नड', 'क', 'चनत', 'समय', 'हइलइट', 'बकस', 'क', 'अवध'],
 ['समत', 'बरडर', 'क', 'रग', 'क', 'हइलइट', 'कर'],
 ['हइलइट', 'कए', 'गए', 'समत', 'क', 'रग', 'और', 'अपरदरशत'],
 ['भरई', 'क', 'रग', 'क', 'हइलइट', 'कर'],
 ['हइलइट', 'कय', 'गय', 'भरई', 'क', 'रग', 'और', 'परदरशत'],
 ['एक', 'समनय', 'बरउजर'],
 ['इस',
  'समय',
  'जस',
  'परपत',
  'कय',
  'गय',
  'ह',
  'उसक',
  'वभनन',
  'वधय',
  'मथड',
  'म',
  'बरउज',
  'कर'],
 ['नज', 'गण', 'क', 'छपए'],
 ['वध'],
 ['गण'],
 ['मन'],
 ['आईपयथन', 'कसल'],
 ['इस',
  'समय',
  'चन',
  'गए',
  'एकससबल',
  'स',
  'कम',
  'लन',
  'क',
  'लए',
  'अतरकरयतमक',
  'कनसल'],
 ['घटन', 'मनटर'],
 ['घटनओ', 'क', 'मनटर', 'कर'],
 ['सफ', 'कर', 'चयन'],
 ['स