<a href="https://colab.research.google.com/github/Amankp1/LEGAL_TERMINOLOGY_MT/blob/main/Legal_sentence_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string

In [None]:
!pip install englisttohindi

Collecting englisttohindi
  Downloading englisttohindi-4.1.0-py3-none-any.whl.metadata (2.0 kB)
Collecting bs4 (from englisttohindi)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading englisttohindi-4.1.0-py3-none-any.whl (15 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, englisttohindi
Successfully installed bs4-0.0.2 englisttohindi-4.1.0


In [None]:
# Smoke-test the englisttohindi package on a short phrase; `convert` holds the Hindi text.
from englisttohindi.englisttohindi import EngtoHindi
trans = EngtoHindi(message="Hello  Everyone  ")
print(trans.convert)

सभी को नमस्कार


In [None]:
def preprocess(text):
  """Normalize a sentence before tokenization.

  Drops every character listed in ``string.punctuation``, lower-cases the
  text, deletes all digits, collapses each whitespace run to one space and
  trims both ends.

  Args:
    text (str): Raw sentence.

  Returns:
    str: The normalized sentence.
  """
  without_punct = text.translate(str.maketrans('', '', string.punctuation))
  lowered = without_punct.lower()
  without_digits = re.sub(r'\d', '', lowered)
  # split() + join collapses whitespace runs and trims the ends in one step
  return ' '.join(without_digits.split())

In [None]:
#Extract dataset and preprocess
dataset_root = "/content/"

# A previous run of this cell caches the (english_sentences, hindi_sentences) tuple as a pickle.
# NOTE(review): pickle.load can execute arbitrary code from the file; only rely on this
# shortcut for a cache that this notebook wrote itself.
if os.path.exists(dataset_root + "preprocessed_data.pickle"):
  with open(dataset_root + "preprocessed_data.pickle", 'rb') as f:
    english_sentences, hindi_sentences = pickle.load(f)
else:
  # Unpack the archive only when the extracted corpus file is not there yet
  if not os.path.exists(dataset_root + "IITB.en-hi.en"):
    os.system("tar -xzf " + dataset_root + "parallel.tgz -C " + dataset_root)

  # Both sides are opened with an explicit UTF-8 encoding so the result does not
  # depend on the platform's default text encoding.
  with open(dataset_root + "IITB.en-hi.en", 'r', encoding='utf-8') as f:
    english_sentences = f.read().split('\n')

  with open(dataset_root + "IITB.en-hi.hi", 'r', encoding='utf-8') as f:
    hindi_sentences = f.read().split('\n')

  english_sentences = [preprocess(en) for en in english_sentences]
  # Latin letters are removed from the Hindi side; <START>/<END> mark the sequence boundaries
  hindi_sentences = ['<START> ' + re.sub('[a-zA-Z]','',preprocess(hi)) + ' <END>' for hi in hindi_sentences]

  #Remove duplicate sentences (the first occurrence of each English sentence wins)
  english_unique = set()
  english_sentences_temp = []
  hindi_sentences_temp = []
  # zip stops at the shorter list, so a length mismatch between the two files cannot raise IndexError
  for en, hi in zip(english_sentences, hindi_sentences):
    if en not in english_unique:
      english_unique.add(en)
      english_sentences_temp.append(en)
      hindi_sentences_temp.append(hi)

  english_sentences = english_sentences_temp
  hindi_sentences = hindi_sentences_temp

  # Cache the cleaned pairs so later runs can skip the preprocessing above
  with open(dataset_root + "preprocessed_data.pickle",'wb') as f:
    pickle.dump((english_sentences, hindi_sentences), f)

In [None]:
# Number of English sentences left after de-duplication
len(english_sentences)

36764

In [None]:
# Number of Hindi sentences; must equal the English count for the pairs to stay aligned
len(hindi_sentences)

36764

In [None]:
import json

# Extra sentence pairs, stored as two parallel lists under "answers" and "translated_answers"
with open('output_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract answers and translated_answers into separate lists
answers = data["answers"]
translated_answers = data["translated_answers"]

# The two lists are paired by position further down, so a length mismatch
# would silently pair sentences with the wrong translations.
if len(answers) != len(translated_answers):
    raise ValueError(
        f"output_data.json holds {len(answers)} answers but "
        f"{len(translated_answers)} translated_answers; the lists must be parallel"
    )

# NOTE(review): in the cells shown here these entries are appended to the corpus without
# passing through preprocess() and without being wrapped in <START>/<END> — confirm that
# output_data.json already stores them in that form.

In [None]:
# Number of extra English sentences loaded from output_data.json
len(answers)

4082

In [None]:
# Number of extra Hindi sentences; should match len(answers)
len(translated_answers)

4082

In [None]:
# Append the extra pairs to the END of the parallel corpus.
# NOTE: this cell is not idempotent — running it again appends the same pairs a second time;
# re-run the dataset-loading cell first to reset both lists.
# NOTE(review): the selection loop further down walks the corpus in order and stops after
# `total_sentences` pairs, so pairs appended here may never be reached — confirm.
english_sentences = english_sentences + answers
hindi_sentences = hindi_sentences + translated_answers

In [None]:
# Corpus size after appending the extra pairs
len(english_sentences)

40846

In [None]:
# Spot-check: English side of the pair at index 6011
english_sentences[6011]

'bug buddy'

In [None]:
# Spot-check: Hindi side of the same pair (index 6011)
hindi_sentences[6011]

'<START> बग बड्डी <END>'

In [None]:
# Sanity check: both lists must have the same length; then display the first three pairs
print(len(english_sentences), len(hindi_sentences))
print()
english_sentences[:3], hindi_sentences[:3]

40846 40846



(['give your application an accessibility workout',
  'accerciser accessibility explorer',
  'the default plugin layout for the bottom panel'],
 ['<START> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें <END>',
  '<START> एक्सेर्साइसर पहुंचनीयता अन्वेषक <END>',
  '<START> निचले पटल के लिए डिफोल्ट प्लगइन खाका <END>'])

In [None]:
#Some parameters
vocab_size = 10000       # NOTE(review): not referenced by any other cell shown here (the tokenizers are built without a word limit)
total_sentences = 25000  # maximum number of sentence pairs selected for the model
maxlen = 10              # length threshold (whitespace tokens) for selecting pairs; also the padded length and the decoding step limit
epochs = 70              # number of epochs passed to model.fit
validation_split = 0.05  # passed to model.fit as its validation_split argument

In [None]:
# Select up to `total_sentences` pairs, in corpus order, whose sequences fit into `maxlen`.
en_data = []
hi_data = []

cnt = 0

for (en,hi) in zip(english_sentences, hindi_sentences):
  en_len = len(en.split())
  # The decoder input and the decoder target each drop one token of the Hindi sequence,
  # so the padded decoder arrays need one slot fewer than the full token count.
  hi_len = len(hi.split()) - 1
  # BOTH sides must fit. Testing only the shorter side (min) lets over-long sentences
  # through, and pad_sequences then silently truncates them.
  if max(en_len, hi_len) <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

In [None]:
import tensorflow as tf

def _fit_tokenizer(texts):
  """Fit a whitespace tokenizer on `texts` and return it with the encoded sequences.

  No characters are filtered and case is preserved, so markers such as
  <START>/<END> survive as ordinary tokens; unseen words map to '<OOV>'.
  """
  tokenizer_ = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
  tokenizer_.fit_on_texts(texts)
  return tokenizer_, tokenizer_.texts_to_sequences(texts)

#Tokenize the texts and convert to sequences
en_tokenizer, en_sequences = _fit_tokenizer(en_data)
hi_tokenizer, hi_sequences = _fit_tokenizer(hi_data)

# +1 because id 0 is left free for padding
english_vocab_size = 1 + len(en_tokenizer.word_index)
hindi_vocab_size = 1 + len(hi_tokenizer.word_index)
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  8020
Hindi Vocab Size:  9395


In [None]:
#Prepare encoder data
# Right-pad every English id sequence with zeros to exactly `maxlen` entries.
# NOTE(review): sequences longer than maxlen are cut by pad_sequences; with Keras' default
# truncating='pre' it would be the leading tokens that are dropped — confirm.
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [None]:
#Prepare decoder data
# The decoder is fed each Hindi sequence without its last id and is trained to emit
# the same sequence shifted left by one position (i.e. without its first id).
decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    [seq[:-1] for seq in hi_sequences], maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(
    [seq[1:] for seq in hi_sequences], maxlen=maxlen, padding='post')

In [None]:
# Training and Testing split
# 95%, 5%
# Split on the number of pairs actually collected, which can be smaller than total_sentences.
split = int(0.95 * len(encoder_inputs))

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
# Held-out tail: everything from `split` onwards, so no test sentence was used for training.
# (Slicing [:split] here would evaluate on the training sentences themselves.)
X_test = en_data[split:]
y_test = hi_data[split:]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(23750, 10) (23750, 10) (23750, 10)


In [None]:
#Define LSTM model
# NOTE(review): no layer is given an explicit name, and a later cell looks layers up by
# names such as 'lstm' / 'embedding_1' — presumably Keras' auto-generated names, which
# would depend on the creation order used here; verify before reordering anything.
d_model = 256  # size of the embeddings and of the LSTM hidden/cell state

#Encoder
# Variable-length sequence of English token ids; mask_zero=True makes id 0 act as padding
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
# Only the final hidden and cell state are kept; they initialize the decoder below.
# NOTE(review): activation='relu' overrides the LSTM's default activation — confirm this is
# intended (training is run with a TerminateOnNaN callback).
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
# Layers are kept in variables so they can be reused when assembling an inference-time decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own returned states are not needed for training
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
# Per-timestep probability distribution over the Hindi vocabulary
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

# Sparse variant: the targets are integer token ids rather than one-hot vectors
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

In [None]:
#Keep the checkpoint with the best validation accuracy seen so far
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/en-hi.keras',
    monitor='val_accuracy',
    mode='max',
    # Without save_best_only, monitor/mode have no effect and the file is simply
    # overwritten after every epoch, leaving whatever the last epoch produced.
    save_best_only=True
)

In [None]:
# Train on X_train/y_train; `validation_split` is handed to Keras to produce the val_* metrics.
# Callbacks: the checkpoint writer defined above, plus TerminateOnNaN.
model.fit(X_train, y_train, epochs=epochs, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.1014 - loss: 6.6795 - val_accuracy: 0.1045 - val_loss: 5.8288
Epoch 2/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.1233 - loss: 5.2679 - val_accuracy: 0.1093 - val_loss: 5.7617
Epoch 3/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.1442 - loss: 4.8565 - val_accuracy: 0.1234 - val_loss: 5.5972
Epoch 4/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1590 - loss: 4.5483 - val_accuracy: 0.1279 - val_loss: 5.4504
Epoch 5/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.1760 - loss: 4.2529 - val_accuracy: 0.1327 - val_loss: 5.4890
Epoch 6/70
[1m706/706[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.1882 - loss: 4.0243 - val_accuracy: 0.1335 - val_loss: 5.4291
Epoch 7/70
[1m706/7

<keras.src.callbacks.history.History at 0x7f5246abb0d0>

In [None]:
#Retrieve previously saved stuff
# Reload the checkpoint file written during training
saved_model = tf.keras.models.load_model('/content/en-hi.keras')

saved_model.summary()

# Pull out the tensors and layers needed to assemble separate inference models.
# NOTE(review): the names below look like Keras' auto-generated layer names; they must match
# what summary() prints for the loaded model — verify after any re-run of the model cell.
inputs = saved_model.get_layer('input_layer').output
# Three outputs are unpacked from the 'lstm' layer; only the last two (the states) are kept
_,state_h,state_c = saved_model.get_layer('lstm').output
targets = saved_model.get_layer('input_layer_1').output
embedding_layer = saved_model.get_layer('embedding_1')
decoder_lstm = saved_model.get_layer('lstm_1')
dense1 = saved_model.get_layer('dense')

In [None]:
#Inference Model

#Encoder
# Maps an English id sequence to the pair of states that seeds the decoder
encoder = tf.keras.models.Model(inputs=inputs, outputs=[state_h, state_c])

#Decoder
# The recurrent state is exposed as explicit inputs and outputs so that it can be
# threaded through the decoder by the caller, one call after another.
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
decoder_states_in = [decoder_input_h, decoder_input_c]

x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=decoder_states_in)
x = dense1(x)
decoder = tf.keras.models.Model(
    inputs=[targets, decoder_input_h, decoder_input_c],
    outputs=[x, decoder_output_h, decoder_output_c],
)

In [None]:
import numpy as np

def predict_sentence(en_input):
    """Greedily translate one English sentence to Hindi with the seq2seq model.

    Relies on the notebook-level ``en_tokenizer``, ``hi_tokenizer``,
    ``encoder``, ``decoder`` and ``maxlen``.

    Args:
        en_input (str): English sentence, normalized like the training data.

    Returns:
        str: The predicted Hindi words joined by single spaces, without the
        <START>/<END> markers. Empty when the input has no tokens or the first
        prediction already ends the sentence.

    Raises:
        KeyError: If '<START>' is missing from the Hindi tokenizer vocabulary.
    """
    # Convert input to a (1, n_tokens) id array
    input_seq = np.array(en_tokenizer.texts_to_sequences([en_input]))
    if input_seq.size == 0:
        # Empty / whitespace-only input: there is nothing to encode
        return ''

    # verbose=0 stops Keras from printing a progress bar for every single-step call
    next_h, next_c = encoder.predict(input_seq, verbose=0)

    # Decoder input of shape (batch_size, sequence_length) = (1, 1), starting at <START>
    curr_token = np.zeros((1, 1))
    curr_token[0, 0] = hi_tokenizer.word_index['<START>']

    words = []
    for _ in range(maxlen):
        # One decoding step; the returned states feed the next step
        output, next_h, next_c = decoder.predict([curr_token, next_h, next_c], verbose=0)

        # Greedy choice: the id with the highest probability
        next_token = int(np.argmax(output[0, 0, :]))
        # Id 0 is the padding id and has no entry in index_word; treat it like <END>
        next_word = hi_tokenizer.index_word.get(next_token)
        if next_word is None or next_word == '<END>':
            break

        words.append(next_word)
        curr_token[0, 0] = next_token

    return ' '.join(words)


In [None]:
#Testing and Analysis
import nltk

candidates = []
references = []

ctr = 20  # number of test sentences still to evaluate
i = 0

# Bounded by len(X_test): running out of test sentences ends the loop instead of raising IndexError
while ctr>0 and i<len(X_test):
  l = len(X_test[i].split())
  if l<=maxlen:   #Skip inputs with more than maxlen tokens
    pred_sentence = predict_sentence(X_test[i])
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # Second reference: translation produced by the englisttohindi package
    trans = EngtoHindi(message=X_test[i])
    python_translated_sentence = trans.convert

    print("Google Translated Reference: ", python_translated_sentence)
    # [1:-1] strips the <START>/<END> markers from the dataset translation
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1], python_translated_sentence.split()])

    ctr -= 1
  i += 1

# Corpus-level BLEU with two references per candidate
print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 627ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 797ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Input:  give your application an accessibility workout
Prediction:   अपने अनुप्रयोग के शब्द पहचान
Google Translated Reference:  अपने एप्लिकेशन को एक एक्सेसिबिलिटी वर्कआउट दें
Dataset Reference:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

In [None]:
import numpy as np

# NOTE: this cell redefines predict_sentence; from here on this version replaces the earlier one.
def predict_sentence(en_input):
    """Greedily translate one English sentence to Hindi with the seq2seq model.

    Relies on the notebook-level ``en_tokenizer``, ``hi_tokenizer``,
    ``encoder``, ``decoder`` and ``maxlen``.

    Args:
        en_input (str): English sentence, normalized like the training data.

    Returns:
        str: The predicted Hindi words joined by single spaces, without the
        <START>/<END> markers. Empty when the input has no tokens or the first
        prediction already ends the sentence.
    """
    # Step 1: Convert input text to a (1, n_tokens) id array using the English tokenizer
    input_seq = np.array(en_tokenizer.texts_to_sequences([en_input]))
    if input_seq.size == 0:
        # Empty / whitespace-only input: there is nothing to encode
        return ''

    # Step 2: Predict encoder hidden and cell states
    # (verbose=0 stops Keras from printing a progress bar for every call)
    next_h, next_c = encoder.predict(input_seq, verbose=0)

    # Step 3: Initialize the first token as <START> for the decoder
    curr_token = np.zeros((1, 1))  # Shape: (batch_size, sequence_length)
    curr_token[0, 0] = hi_tokenizer.word_index.get('<START>', 1)  # Default to 1 if <START> not found

    words = []
    for _ in range(maxlen):
        # Step 4: Predict the next word; the returned states feed the next step
        output, next_h, next_c = decoder.predict([curr_token, next_h, next_c], verbose=0)

        # Step 5: Identify the word with the highest probability
        next_token = int(np.argmax(output[0, 0, :]))
        next_word = hi_tokenizer.index_word.get(next_token)

        # Step 6: Stop on <END>, and also on an id without a word (e.g. padding id 0):
        # appending an empty word would leave double spaces and feed padding back in
        if next_word is None or next_word == '<END>':
            break

        words.append(next_word)
        curr_token[0, 0] = next_token  # Update current token for next prediction

    return ' '.join(words)


In [None]:
# Interactive demo: read a sentence, normalize it with the same preprocess() used for the corpus, then translate.
# NOTE: predict_sentence runs at most `maxlen` decoding steps, so a long input cannot be translated in full.
user_input = input("Enter an English sentence to translate to Hindi: ")
preprocessed_input = preprocess(user_input)  # Preprocess input
translated_sentence = predict_sentence(preprocessed_input)
print("Translated Hindi sentence:", translated_sentence)

Enter an English sentence to translate to Hindi: the jury deliberated on charges of the criminal act of deliberately setting fire to property . led to extensive the name for anything which can be owned.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Translated Hindi sentence: सीवीएस है के लिए यह सर्वर चाहिये निर्देशिका


In [None]:
#install transformers library
# -U upgrades to the newest release, -q silences pip's output.
# NOTE(review): no version is pinned, so the installed release can change between runs.
!pip install transformers -U -q

# install sentencepiece library
!pip install sentencepiece



In [None]:
import nltk
# Tokenizer data, presumably what word_tokenize in the BLEU cells below relies on
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# download and save model
# NOTE: this rebinds `model`, which up to here referred to the Keras seq2seq model built earlier
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
# import tokenizer
# src_lang="en_XX" sets English as the source language for the tokenizer
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

In [None]:
def translate_to_hindi(text):
    """
    Translate one English string to Hindi with the notebook-level mBART
    ``model`` and ``tokenizer``.

    Args:
    text (str): English text to translate.

    Returns:
    str: Hindi translation of ``text``.
    """
    # Language id that forces generation to start in Hindi
    hindi_lang_id = tokenizer.lang_code_to_id["hi_IN"]

    # Encode as a batch of one; over-long input is truncated by the tokenizer
    encoded = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    output_ids = model.generate(**encoded, forced_bos_token_id=hindi_lang_id)

    # Special tokens (language code, end-of-sequence, padding) are dropped when decoding
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

# Example usage: compare the mBART output with the englisttohindi translation of the same text
user_text = input("Enter a sentence in English to translate to Hindi: ")
trans = EngtoHindi(message=user_text)

mbart_translation = translate_to_hindi(user_text)
print("Translation in Hindi:", mbart_translation)

# Read the second system's translation only after the mBART result has been printed
python_translated_sentence = trans.convert
print("Python Translated Reference: ", python_translated_sentence)

Enter a sentence in English to translate to Hindi: The (the criminal act of deliberately setting fire to property.) led to extensive (the name for anything which can be owned.) damage .<n>The defendant 's legal team argued for an (the court's decision that a person is innocent of the crime they were charged with
Translation in Hindi: प्रतिवादी के कानूनी दल ने एक (न्यायालय के निर्णय के लिए तर्क दिया कि एक व्यक्ति उन अपराधों से निर्दोष है जिन पर वे आरोपित थे
Python Translated Reference:  (जानबूझकर संपत्ति में आग लगाने का आपराधिक कृत्य) से व्यापक क्षति हुई (किसी भी चीज़ का नाम जिसका स्वामित्व हो सकता है।) प्रतिवादी की कानूनी टीम ने अदालत के फैसले के लिए तर्क दिया कि एक व्यक्ति निर्दोष है 


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

def translate_to_hindi(text):
    """
    Run English text through the notebook-level mBART ``model`` / ``tokenizer``
    and return the Hindi output.

    Args:
    text (str): English text to translate.

    Returns:
    str: Hindi translation of ``text``.
    """
    # Generation is forced to begin with the Hindi language code
    hindi_lang_id = tokenizer.lang_code_to_id["hi_IN"]

    # Single-element batch; the tokenizer truncates input that is too long
    batch = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    hindi_ids = model.generate(**batch, forced_bos_token_id=hindi_lang_id)

    # Decode without special tokens and unwrap the single result
    return tokenizer.batch_decode(hindi_ids, skip_special_tokens=True)[0]

# Example usage: score the mBART translation against the englisttohindi translation
user_text = input("Enter a sentence in English to translate to Hindi: ")
generated_translation = translate_to_hindi(user_text)

# The englisttohindi output serves as the reference translation
trans = EngtoHindi(message=user_text)
python_translated_sentence = trans.convert

# BLEU works on token lists, so both strings are word-tokenized first
generated_tokens = word_tokenize(generated_translation)
reference_tokens = word_tokenize(python_translated_sentence)

# Sentence-level BLEU against a single reference
bleu_score = sentence_bleu([reference_tokens], generated_tokens)
for label, value in (
    ("Translation in Hindi:", generated_translation),
    ("Python Translated Reference:", python_translated_sentence),
    ("BLEU Score:", bleu_score),
):
    print(label, value)


Enter a sentence in English to translate to Hindi: the jury deliberated on charges of the criminal act of deliberately setting fire to property . led to extensive the name for anything which can be owned .'
Translation in Hindi: जौरी ने जानबूझकर संपत्ति पर आग लगाने के अपराध के आरोपों पर विचार-विमर्श किया. किसी भी संपत्ति के लिए विस्तृत नाम बनाने के लिए प्रेरित किया. '
Python Translated Reference: जूरी ने जानबूझकर संपत्ति में आग लगाने के आपराधिक कृत्य के आरोपों पर विचार-विमर्श किया। 
BLEU Score: 0.14887309680192354


In [None]:
# Same evaluation as the previous cell, run on a new input sentence
user_text = input("Enter a sentence in English to translate to Hindi: ")
generated_translation = translate_to_hindi(user_text)

# Reference translation comes from the englisttohindi package
trans = EngtoHindi(message=user_text)
python_translated_sentence = trans.convert

# Word-level tokens for hypothesis and reference
generated_tokens = word_tokenize(generated_translation)
reference_tokens = word_tokenize(python_translated_sentence)

# Sentence-level BLEU with one reference
bleu_score = sentence_bleu([reference_tokens], generated_tokens)
report = (
    ("Translation in Hindi:", generated_translation),
    ("Python Translated Reference:", python_translated_sentence),
    ("BLEU Score:", bleu_score),
)
for label, value in report:
    print(label, value)

Enter a sentence in English to translate to Hindi: The (the criminal act of deliberately setting fire to property.) led to extensive (the name for anything which can be owned.) damage .<n>The defendant 's legal team argued for an (the court's decision that a person is innocent of the crime they were charged with).
Translation in Hindi: प्रतिवादी के कानूनी दल ने एक (न्यायालय के निर्णय के लिए तर्क दिया कि कोई व्यक्ति उन अपराधों से निर्दोष है जिन पर वह आरोपित किया गया था) के लिए तर्क दिया।
Python Translated Reference: (जानबूझकर संपत्ति में आग लगाने का आपराधिक कृत्य) से व्यापक क्षति हुई (किसी भी चीज़ का नाम जिसका स्वामित्व हो सकता है।) प्रतिवादी की कानूनी टीम ने अदालत के फैसले के लिए तर्क दिया कि एक व्यक्ति निर्दोष है 
BLEU Score: 0.11059204097669541
