<a href="https://colab.research.google.com/github/Amankp1/LEGAL_TERMINOLOGY_MT/blob/main/Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string

In [None]:
def preprocess(text):
  """Normalise a raw sentence before tokenisation.

  Removes every character listed in ``string.punctuation``, lower-cases the
  text, deletes digit characters, squeezes each whitespace run into a single
  space and trims both ends.
  """
  without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
  lowered = without_punctuation.lower()
  without_digits = re.sub(r'\d', '', lowered)
  return re.sub(r'\s+', ' ', without_digits).strip()

In [None]:
#Extract dataset and preprocess
# Loads the cached (english_sentences, hindi_sentences) pair when it exists;
# otherwise reads the IITB parallel corpus, cleans it, de-duplicates it and
# writes the cache for the next run.
dataset_root = "/content/"

if os.path.exists(dataset_root + "preprocessed_data.pickle"):
  # NOTE(review): pickle.load can execute arbitrary code; only reuse this cache
  # when the file was written by this notebook.
  with open(dataset_root + "preprocessed_data.pickle", 'rb') as f:
    english_sentences, hindi_sentences = pickle.load(f)
else:
  if not os.path.exists(dataset_root + "IITB.en-hi.en"):
    os.system("tar -xzf " + dataset_root + "parallel.tgz -C " + dataset_root)

  # Both sides are read with an explicit UTF-8 encoding so the result does not
  # depend on the platform's default encoding.
  with open(dataset_root + "IITB.en-hi.en",'r', encoding='utf-8') as f:
    english_sentences = f.read().split('\n')

  with open(dataset_root + "IITB.en-hi.hi",'r', encoding='utf-8') as f:
    hindi_sentences = f.read().split('\n')

  english_sentences = [preprocess(en) for en in english_sentences]
  # Latin letters are stripped from the Hindi side; the <START>/<END> markers
  # are added after preprocessing so their angle brackets are not removed.
  hindi_sentences = ['<START> ' + re.sub('[a-zA-Z]','',preprocess(hi)) + ' <END>' for hi in hindi_sentences]

  #Remove duplicate sentences (keyed on the English side; first occurrence wins)
  english_unique = set()
  english_sentences_temp = []
  hindi_sentences_temp = []
  # zip stops at the shorter list, so a length mismatch cannot raise IndexError
  for en, hi in zip(english_sentences, hindi_sentences):
    if en not in english_unique:
      english_unique.add(en)
      english_sentences_temp.append(en)
      hindi_sentences_temp.append(hi)

  english_sentences = english_sentences_temp
  hindi_sentences = hindi_sentences_temp

  with open(dataset_root + "preprocessed_data.pickle",'wb') as f:
    pickle.dump((english_sentences, hindi_sentences), f)

In [None]:
# Report how many sentences each language holds, leave a blank line, then
# preview the first three entries of both lists as the cell output.
sentence_counts = (len(english_sentences), len(hindi_sentences))
print(*sentence_counts, end='\n\n')
english_sentences[:3], hindi_sentences[:3]

49543 49543



(['give your application an accessibility workout',
  'accerciser accessibility explorer',
  'the default plugin layout for the bottom panel'],
 ['<START> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें <END>',
  '<START> एक्सेर्साइसर पहुंचनीयता अन्वेषक <END>',
  '<START> निचले पटल के लिए डिफोल्ट प्लगइन खाका <END>'])

In [None]:
#Some parameters
vocab_size = 10000  # not referenced by any other cell visible in this notebook
total_sentences = 25000  # number of sentence pairs to select; also used to compute the train/test split index
maxlen = 10  # word-count limit when selecting pairs, padded sequence length, and decoding step limit
epochs = 70  # passed to model.fit
validation_split = 0.05  # passed to model.fit

In [None]:
# Select up to `total_sentences` sentence pairs in which BOTH sides fit into
# `maxlen` whitespace-separated tokens. The Hindi count includes the
# <START>/<END> markers that were added during preprocessing.
en_data = []
hi_data = []

cnt = 0

for (en,hi) in zip(english_sentences, hindi_sentences):
  # Compare the LONGER side with maxlen. With min() a pair was accepted as soon
  # as one side was short enough, so the other side could exceed maxlen and be
  # truncated when the sequences are padded to maxlen later on.
  longest = max(len(en.split()), len(hi.split()))
  if longest <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

In [None]:
import tensorflow as tf

#Fit one word-level tokenizer per language and encode the sentences as id sequences
def _fit_word_tokenizer(texts):
  """Return a Keras Tokenizer fitted on `texts` plus the id sequences of `texts`."""
  # filters='' and lower=False: no character filtering and no lower-casing here
  fitted = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
  fitted.fit_on_texts(texts)
  return fitted, fitted.texts_to_sequences(texts)

en_tokenizer, en_sequences = _fit_word_tokenizer(en_data)
hi_tokenizer, hi_sequences = _fit_word_tokenizer(hi_data)

# One extra slot on top of the number of entries in each word index
english_vocab_size = 1 + len(en_tokenizer.word_index)
hindi_vocab_size = 1 + len(hi_tokenizer.word_index)
for label, size in (("English Vocab Size: ", english_vocab_size),
                    ("Hindi Vocab Size: ", hindi_vocab_size)):
  print(label, size)

English Vocab Size:  8020
Hindi Vocab Size:  9395


In [None]:
#Prepare encoder data
# Pads every English id sequence at the end (padding='post') to a fixed length of `maxlen`.
# NOTE(review): `truncating` is left at its default, so sequences longer than maxlen are cut — confirm which end in the Keras docs.
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [None]:
#Prepare decoder data
# The decoder input is each Hindi sequence without its last id and the
# expected output is the same sequence without its first id, so position t of
# the input lines up with position t+1 of the original sequence.
decoder_inputs = [sequence[:-1] for sequence in hi_sequences]
decoder_outputs = [sequence[1:] for sequence in hi_sequences]

# Both are padded at the end to the common length `maxlen`
pad_to_maxlen = tf.keras.preprocessing.sequence.pad_sequences
decoder_inputs = pad_to_maxlen(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = pad_to_maxlen(decoder_outputs, maxlen=maxlen, padding='post')

In [None]:
# Training and Testing split
# 95%, 5%
# The split index is derived from the number of pairs actually selected, so the
# ratio still holds when fewer than `total_sentences` pairs were available.
split = int(0.95 * len(en_data))

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score.
# These are the held-out sentences AFTER the split index; slicing [:split]
# here would score the model on the sentences it was trained on.
X_test = en_data[split:]
y_test = hi_data[split:]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(23750, 10) (23750, 10) (23750, 10)


In [None]:
#Define LSTM model
# Encoder-decoder network: the encoder LSTM's final hidden/cell states are used
# as the initial state of the decoder LSTM.
# NOTE(review): a later cell fetches layers from the saved model by the names
# 'input_layer', 'lstm', 'input_layer_1', 'embedding_1', 'lstm_1' and 'dense'.
# These look like auto-generated names, so adding or reordering layers here may
# break that lookup — confirm before changing this cell.
d_model = 256  # size used for both embeddings and both LSTMs

#Encoder
inputs = tf.keras.layers.Input(shape=(None,))
# mask_zero=True: id 0 is treated as a mask value by the embedding
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
# Only the returned states are kept; the LSTM output itself is discarded.
# NOTE(review): activation='relu' overrides the LSTM's default activation — confirm this is intended.
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
targets = tf.keras.layers.Input(shape=(None,))
# The embedding, LSTM and dense layers are kept in variables because the
# inference cell reuses them.
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
# The decoder starts from the encoder's states; its own states are not needed for training
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
# Softmax over the Hindi vocabulary at every decoder position
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

# Training model: takes [encoder input, decoder input] and outputs the per-position distribution
model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

# Sparse loss: the targets passed to fit are integer ids rather than one-hot vectors
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

In [None]:
#Save model after each epoch
# The checkpoint is written to the path that the reload cell later reads.
# NOTE(review): save_best_only is not set here; `monitor`/`mode` presumably only
# matter when it is True — confirm against the Keras docs whether keeping only
# the best model was intended.
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/en-hi.keras',
    monitor='val_accuracy',
    mode='max'
)

In [None]:
# Train with checkpointing; TerminateOnNaN is attached alongside the checkpoint callback.
training_callbacks = [save_model_callback, tf.keras.callbacks.TerminateOnNaN()]
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=training_callbacks,
)

Epoch 1/70
[1m294/706[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m6:35[0m 960ms/step - accuracy: 0.0964 - loss: 7.3083

KeyboardInterrupt: 

In [None]:
#Reload the checkpoint from disk and recover the tensors/layers needed for inference
saved_model = tf.keras.models.load_model('/content/en-hi.keras')

saved_model.summary()

# Encoder side: the source input tensor and the states produced by the layer named 'lstm'
encoder_lstm = saved_model.get_layer('lstm')
inputs = saved_model.get_layer('input_layer').output
_, state_h, state_c = encoder_lstm.output

# Decoder side: the target input tensor and the three layers reused at inference time
targets = saved_model.get_layer('input_layer_1').output
embedding_layer, decoder_lstm, dense1 = (
    saved_model.get_layer(layer_name)
    for layer_name in ('embedding_1', 'lstm_1', 'dense')
)

In [None]:
#Inference Model

#Encoder: maps a source sequence to the encoder LSTM's hidden and cell states
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder: takes target ids plus the previous states and returns the softmax
#output together with the updated states, so it can be called step by step
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
decoder_states_in = [decoder_input_h, decoder_input_c]

x, decoder_output_h, decoder_output_c = decoder_lstm(
    embedding_layer(targets),
    initial_state=decoder_states_in,
)
x = dense1(x)

decoder = tf.keras.models.Model(
    [targets, decoder_input_h, decoder_input_c],
    [x, decoder_output_h, decoder_output_c],
)

In [None]:
import numpy as np

def predict_sentence(en_input):
    """Greedily translate one English sentence with the inference models.

    Relies on the notebook globals ``en_tokenizer``, ``hi_tokenizer``,
    ``encoder``, ``decoder`` and ``maxlen``.

    Args:
        en_input: English sentence as a whitespace-separated string; it is
            converted to ids with ``en_tokenizer``.

    Returns:
        The predicted Hindi words as a single string in which every word is
        preceded by one space (so a non-empty result starts with a space).
        Decoding stops at ``<END>``, at an id that has no entry in
        ``hi_tokenizer.index_word`` (e.g. id 0, which the embeddings mask via
        ``mask_zero=True``), or after ``maxlen`` words.
    """
    # Encode the sentence as a batch containing one id sequence
    input_seq = np.array(en_tokenizer.texts_to_sequences([en_input]))

    # The encoder states seed the decoder; verbose=0 silences the per-call progress output
    next_h, next_c = encoder.predict(input_seq, verbose=0)

    # Decoding starts from the <START> token, shaped (batch_size, sequence_length) = (1, 1)
    curr_token = np.zeros((1, 1))
    curr_token[0, 0] = hi_tokenizer.word_index['<START>']

    pred_words = []

    for _ in range(maxlen):
        # One decoder step: feed the previous token and states, get the next distribution
        output, next_h, next_c = decoder.predict([curr_token, next_h, next_c], verbose=0)

        # Greedy choice: the id with the highest probability
        next_token = int(np.argmax(output[0, 0, :]))

        # .get() instead of [] so an id without a vocabulary entry ends the
        # sentence instead of raising KeyError
        next_word = hi_tokenizer.index_word.get(next_token)
        if next_word is None or next_word == '<END>':
            break

        pred_words.append(next_word)
        curr_token[0, 0] = next_token  # the chosen token is the next step's input

    return ''.join(' ' + word for word in pred_words)


In [None]:
!pip install englisttohindi

In [None]:
# Try the englisttohindi package on a short greeting and print the result.
from englisttohindi.englisttohindi import EngtoHindi
trans = EngtoHindi(message="Hello  Everyone  ")
# NOTE(review): `convert` is read as an attribute, not called — presumably a property that returns the Hindi text; confirm.
print(trans.convert)

In [None]:
#Testing and Analysis
# Translates up to 20 test sentences, prints each prediction next to two
# references (the EngtoHindi output and the dataset's Hindi sentence) and
# reports the corpus-level BLEU score over all of them.
import nltk

candidates = []
references = []

ctr = 20
i = 0

# Stop after 20 scored sentences OR when the test set is exhausted; without
# the second condition the loop raised IndexError when fewer than 20 test
# sentences qualified.
while ctr > 0 and i < len(X_test):
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences with at most maxlen words
    pred_sentence = predict_sentence(X_test[i])
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    trans = EngtoHindi(message=X_test[i])
    google_translated_sentence = trans.convert

    print("Google Translated Reference: ", google_translated_sentence)
    # [1:-1] drops the <START>/<END> markers from the dataset sentence
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    # Two tokenised references per candidate: dataset sentence and EngtoHindi output
    references.append([y_test[i].split()[1:-1], google_translated_sentence.split()])

    ctr -= 1
  i += 1

print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

In [None]:
#install transformers library
!pip install transformers -U -q

# install sentencepiece library
!pip install sentencepiece

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the pretrained "facebook/mbart-large-50-one-to-many-mmt" checkpoint by name.
# NOTE(review): this rebinds `model`, the name the Keras seq2seq network was built under in an earlier cell.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
# Load the matching tokenizer with en_XX as the source language
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")


In [None]:
# input sentences (English); the later reference-collection cell reads the same list
input_text = ["Elon Musk sells $8.5 billion in Tesla stock",
              "I'm a professional academic and research writer.",
              "Get a job in US and work in Germany"]

# convert sentences to tensors
# All three sentences are tokenised in one call with padding and truncation enabled;
# return_tensors="pt" presumably selects PyTorch tensors — confirm in the tokenizer docs.
model_inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)


In [None]:
# Generate Hindi output: the tokenizer's id for the "hi_IN" language code is
# forced as the first generated token.
hindi_bos_token_id = tokenizer.lang_code_to_id["hi_IN"]
generated_tokens = model.generate(**model_inputs, forced_bos_token_id=hindi_bos_token_id)

# Turn the generated ids back into text, leaving out special tokens
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

print(translation)

In [None]:
# Collect one reference translation (a plain string) per input sentence from EngtoHindi.
references = []

# Iterate over the sentences themselves instead of a hard-coded count of 3, so
# the loop stays correct when `input_text` grows or shrinks.
for sentence in input_text:
  trans = EngtoHindi(message=sentence)
  google_translated_sentence = trans.convert

  print("Google Translated Reference: ", google_translated_sentence)
  print()
  references.append(google_translated_sentence)

In [None]:
import nltk

In [None]:
# Display the collected reference translations as the cell output
references

In [None]:
# Display the decoded mBART translations as the cell output
translation

In [None]:
# corpus_bleu works on tokens: it expects, for every hypothesis, a LIST of
# tokenised references, and every hypothesis as a list of tokens. `references`
# and `translation` hold plain strings here, so they are split into words
# first; passing the raw strings would compare characters instead of words.
tokenized_references = [[reference.split()] for reference in references]
tokenized_hypotheses = [hypothesis.split() for hypothesis in translation]
print(nltk.translate.bleu_score.corpus_bleu(tokenized_references, tokenized_hypotheses))

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Reference sentences, already split into words (one word list per sentence)
references = [
    ['एलन', 'मस्क', 'ने', 'टेस्ला', 'के', '8.5', 'अरब', 'डॉलर', 'के', 'शेयर', 'बेचे'],
    ['मैं', 'एक', 'पेशेवर', 'अकादमिक', 'और', 'शोध', 'लेखक', 'हूं।'],
    ['अमेरिका', 'में', 'नौकरी', 'पाएं', 'और', 'जर्मनी', 'में', 'काम', 'करें'],
]

# Candidate translations in the same order, also as word lists
translations = [
    ['एलॉन', 'मस्क', 'ने', 'टेस्ला', 'स्टॉक', 'में', '8.5', 'बिलियन', 'डालर', 'की', 'बिक्री', 'की'],
    ['मैं', 'एक', 'पेशेवर', 'अकादमिक', 'और', 'अनुसंधान', 'लेखक', 'हूं।'],
    ['अमेरिका', 'में', 'नौकरी', 'और', 'जर्मनी', 'में', 'काम'],
]

# Score each candidate against its single reference and print it with a 1-based sentence number
sentence_pairs = zip(references, translations)
for sentence_number, (reference_words, candidate_words) in enumerate(sentence_pairs, start=1):
    pair_score = sentence_bleu([reference_words], candidate_words)
    print(f"BLEU score for sentence {sentence_number}: {pair_score:.4f}")


BLEU score for sentence 1: 0.0000
BLEU score for sentence 2: 0.5946
BLEU score for sentence 3: 0.4468


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Define the reference and candidate sentences as lists of sentences.
# Each entry of `references` is the list of reference strings for one candidate.
references = [['एलन मस्क ने टेस्ला के 8.5 अरब डॉलर के शेयर बेचे'],
              ['मैं एक पेशेवर अकादमिक और शोध लेखक हूं।'],
              ['अमेरिका में नौकरी पाएं और जर्मनी में काम करें']]

translations = ['एलॉन मस्क ने टेस्ला स्टॉक में 8.5 बिलियन डालर की बिक्री की',
                'मैं एक पेशेवर अकादमिक और अनुसंधान लेखक हूं।',
                'अमेरिका में नौकरी और जर्मनी में काम']

# Calculate BLEU score for each pair of reference and candidate
bleu_scores = []
for ref, trans in zip(references, translations):
    print("Reference:", ref[0])
    print("Translation:", trans)
    # sentence_bleu compares token lists. The sentences are split into words
    # here: a raw reference string is iterated character by character, and a
    # candidate wrapped as [trans] counts as ONE token, which made every score 0.
    reference_tokens = [reference.split() for reference in ref]
    score = sentence_bleu(reference_tokens, trans.split())
    print("BLEU Score:", score)
    print()
    bleu_scores.append(score)

# Calculate average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", average_bleu_score)



Reference: एलन मस्क ने टेस्ला के 8.5 अरब डॉलर के शेयर बेचे
Translation: एलॉन मस्क ने टेस्ला स्टॉक में 8.5 बिलियन डालर की बिक्री की
BLEU Score: 0

Reference: मैं एक पेशेवर अकादमिक और शोध लेखक हूं।
Translation: मैं एक पेशेवर अकादमिक और अनुसंधान लेखक हूं।
BLEU Score: 0

Reference: अमेरिका में नौकरी पाएं और जर्मनी में काम करें
Translation: अमेरिका में नौकरी और जर्मनी में काम
BLEU Score: 0

Average BLEU Score: 0.0


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# These two strings are not used by the rest of this cell.
actual: str = "Elon Musk sells $8.5 billion in Tesla stock"
predicted: str = "Elon Musk sells $8.5 billion in Tesla stock"

# Define the reference and candidate sentences as lists of sentences.
# Each entry of `references` is the list of reference strings for one candidate.
references = [['एलन मस्क ने टेस्ला के 8.5 अरब डॉलर के शेयर बेचे'],
              ['मैं एक पेशेवर अकादमिक और शोध लेखक हूं।'],
              ['अमेरिका में नौकरी पाएं और जर्मनी में काम करें']]

translations = ['एलॉन मस्क ने टेस्ला स्टॉक में 8.5 बिलियन डालर की बिक्री की',
                'मैं एक पेशेवर अकादमिक और अनुसंधान लेखक हूं।',
                'अमेरिका में नौकरी और जर्मनी में काम']

# Calculate BLEU score for each pair of reference and candidate
bleu_scores = []
for ref, trans in zip(references, translations):
    print("Reference:", ref[0])
    print("Translation:", trans)
    # sentence_bleu compares token lists. The sentences are split into words
    # here: a raw reference string is iterated character by character, and a
    # candidate wrapped as [trans] counts as ONE token, which made every score 0.
    reference_tokens = [reference.split() for reference in ref]
    score = sentence_bleu(reference_tokens, trans.split())
    print("BLEU Score:", score)
    print()
    bleu_scores.append(score)

# Calculate average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", average_bleu_score)

