# Fine-Tune a Model: Description: Load a pre-trained LSTM-based NMT model and use it to translate asentence from one language to another. Description: GUI is not necessary. It will be evaluated on the basis of accuracy score.

In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to C:\Users\Aryan
[nltk_data]     raina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_data(path):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    return data.split('\n')

english_sentences = load_data('english.txt')
french_sentences = load_data('french.txt')

In [3]:
# Example data (replace with your actual data)
source_texts = english_sentences
target_texts = french_sentences

In [4]:
def clean_text(text):
    if isinstance(text, float) or isinstance(text, int):
        return ''
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator).lower()

In [5]:
# Tokenize the source language
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [6]:
def pad(x, length=None):
    if length is None:
        # Calculate the maximum length of sequences if not provided
        length = max([len(sentence) for sentence in x])
    # Pad sequences to the specified length
    return pad_sequences(x, maxlen=length, padding='post')

In [7]:
def preprocess(x,y):
    x=[clean_text(str(text)) for text in x]
    y=[clean_text(str(text)) for text in y]
    preprocess_x, x_tk=tokenize(x)
    preprocess_y, y_tk=tokenize(y)
    preprocess_x=pad(preprocess_x)
    preprocess_y=pad(preprocess_y)
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(source_texts,target_texts)

In [8]:
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [9]:
print(preproc_english_sentences.shape)
print(preproc_french_sentences.shape)

(137861, 15)
(137861, 21, 1)


In [10]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional,TimeDistributed,Dropout,GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):

    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size+1, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(LSTM(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size+1, activation='softmax')))

    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(0.005),
                  metrics = ['accuracy'])

    return model

In [11]:
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

In [13]:
tmp_x.shape

(137861, 21)

In [14]:
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

In [16]:
tmp_x.shape

(137861, 21)

In [18]:
preproc_french_sentences.shape

(137861, 21, 1)

In [19]:
# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
print(embed_rnn_model.summary())
embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

  super().__init__(**kwargs)


None
Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 2s/step - accuracy: 0.5654 - loss: 2.1899 - val_accuracy: 0.8762 - val_loss: 0.4106
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 2s/step - accuracy: 0.8833 - loss: 0.3766 - val_accuracy: 0.9316 - val_loss: 0.2073
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 2s/step - accuracy: 0.9329 - loss: 0.2097 - val_accuracy: 0.9526 - val_loss: 0.1464
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 2s/step - accuracy: 0.9534 - loss: 0.1457 - val_accuracy: 0.9658 - val_loss: 0.1083
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 2s/step - accuracy: 0.9640 - loss: 0.1134 - val_accuracy: 0.9719 - val_loss: 0.0892
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 2s/step - accuracy: 0.9715 - loss: 0.0887 - val_accuracy: 0.9761 - val_loss: 0.0774
Epoch 7/10
[1m10

<keras.src.callbacks.history.History at 0x25eabece990>

In [24]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [25]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:100])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
new jersey est parfois calme pendant l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [26]:
embed_rnn_model.save('pretrained.keras')

In [23]:
print(embed_rnn_model.get_weights())

[array([[-1.1218866 , -0.23652576, -2.0734694 , ..., -2.282421  ,
         0.91584706,  0.16769366],
       [ 1.1287884 , -0.8447383 ,  0.20237197, ...,  0.41619843,
        -0.04619716, -0.33577126],
       [ 0.4399611 ,  0.37385496, -0.17300269, ..., -0.09677164,
        -0.43154332, -0.03003852],
       ...,
       [ 0.02575642, -0.24403514, -0.36345032, ...,  0.04278513,
         0.08664688,  0.0902779 ],
       [-0.22540173, -0.00858425,  0.00432144, ..., -0.06722135,
        -0.21301967,  0.1878747 ],
       [-0.03822861, -0.25390726,  0.02841015, ...,  0.21336497,
         0.18466514,  0.18843253]], dtype=float32), array([[ 0.16820714, -0.21916549,  0.22639371, ..., -0.00711458,
        -0.05878118,  0.04057365],
       [-0.14422204, -0.0783029 ,  0.10437075, ..., -0.01230261,
         0.02979372, -0.03530535],
       [ 0.00612178, -0.0238396 ,  0.09179616, ..., -0.04576714,
        -0.01551104, -0.06570072],
       ...,
       [ 0.11911643,  0.05388101,  0.00492679, ..., -0.096