# Describe the model: input-output

**ใช้ Dataset English-French จาก kaggle**
https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset <br/>


**input** เป็น English-Text <br/>
**output** เป็น French-Text<br/>
ทำการแปลงภาษาจาก English -> French โดยใช้โมเดล **Bi-directional RNNs** และ Embeddings

# Show the code for running the model

**Import all library**

**ก่อนหน้านี้ Run ใน Colab แล้ว runtime มันเต็ม ผมเลยเอามารันใน VS Code**

In [None]:
import pandas as pd
#from google.colab import drive
import collections
import numpy as np

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import random
from tabulate import tabulate


In [None]:
#drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


**นำข้อมูลมาใช้จากไดรฟ์**

In [None]:
# Raw string keeps the Windows backslashes literal. The plain literal only
# worked because \d, \R and \e are not recognised escapes, and such sequences
# trigger an invalid-escape warning on current Python versions.
df = pd.read_csv(r'D:\deep-learning\RN\eng_-french-50000.csv')
df.head(5)

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [None]:
# split a text into sentences
def to_lines(text):
    """Split raw text into rows of tab-separated fields.

    Surrounding whitespace is stripped from ``text`` first; each
    newline-delimited line is then split on tab characters.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [None]:
# Pull the English and French sentence columns out of the loaded frame.
eng, fr = df['English words/sentences'], df['French words/sentences']

In [None]:
# Count how often each whitespace-separated token occurs in each language.
english_words_counter = collections.Counter(
    word for sentence in eng for word in sentence.split())
french_words_counter = collections.Counter(
    word for sentence in fr for word in sentence.split())

# Report corpus size, vocabulary size and the most frequent tokens.
# Totals come from the counters, so the corpus is not tokenised a second time,
# and an empty corpus prints an empty list instead of raising IndexError.
word_stats = (('English', english_words_counter),
              ('French', french_words_counter))
for position, (language, counter) in enumerate(word_stats):
    if position:
        print()  # blank line between the two language reports
    top_words = [word for word, _ in counter.most_common(10)]
    print(f'{sum(counter.values())} {language} words.')
    print(f'{len(counter)} unique {language} words.')
    print(f'10 Most common words in the {language} dataset:')
    print('"' + '" "'.join(top_words) + '"')

192358 English words.
9946 unique English words.
10 Most common words in the English dataset:
"I" "a" "you" "is" "to" "the" "I'm" "He" "Tom" "was"

221631 French words.
17389 unique French words.
10 Most common words in the French dataset:
"Je" "?" "pas" "de" "Il" "!" "est" "ne" "le" "suis"


In [None]:
# Tokenize the texts
def tokenize(x):
    """Fit a new ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Returns a ``(sequences, tokenizer)`` pair: the encoded texts and the
    fitted tokenizer that produced them.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

In [None]:
def pad(x, length=None):
    """Post-pad the sequences in ``x`` to a common length.

    When ``length`` is None, the length of the longest sequence in ``x`` is
    used as the target length.
    """
    target_length = length if length is not None else max(len(sentence) for sentence in x)
    return pad_sequences(x, maxlen=target_length, padding='post')

In [None]:
def preprocess(x, y):
    """Tokenize and pad the feature texts ``x`` and the label texts ``y``.

    Returns ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``, where
    ``padded_y`` carries an extra trailing axis of size 1.
    """
    encoded_x, x_tk = tokenize(x)
    encoded_y, y_tk = tokenize(y)

    padded_x = pad(encoded_x)
    padded_y = pad(encoded_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

In [None]:
# Encode and pad both corpora; keep the fitted tokenizers for later decoding.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(eng, fr)

In [None]:
# Sequence lengths are the second axis of the padded arrays; vocabulary sizes
# are the number of entries in each tokenizer's word index.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

for label, value in (
        ("Max English sentence length:", max_english_sequence_length),
        ("Max French sentence length:", max_french_sequence_length),
        ("English vocabulary size:", english_vocab_size),
        ("French vocabulary size:", french_vocab_size)):
    print(label, value)

Max English sentence length: 7
Max French sentence length: 14
English vocabulary size: 6004
French vocabulary size: 12902


In [None]:
def logits_to_text(logits, tokenizer):
    """Decode per-position scores into a space-joined string of words.

    The highest-scoring index along axis 1 of ``logits`` is looked up in the
    inverse of ``tokenizer.word_index``; index 0 is rendered as a single space.
    """
    vocabulary = dict((index, word) for word, index in tokenizer.word_index.items())
    vocabulary[0] = ' '

    best_indices = np.argmax(logits, 1)
    return ' '.join(vocabulary[index] for index in best_indices)



In [None]:
# First model tried in the experiments
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + bidirectional-GRU model.

    input_shape: shape tuple of the padded input array; index 1 is used as
        the input sequence length.
    output_sequence_length: accepted but not used by this function.
    english_vocab_size: input vocabulary size of the embedding layer.
    french_vocab_size: number of units in the final softmax layer.

    Returns the compiled ``Sequential`` model.
    """
    learning_rate = 0.001

    layers = [
        Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
        Bidirectional(GRU(256, return_sequences=True)),
        AttentionLayer(),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    model = Sequential(layers)

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

**Train Model**

In [None]:
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    """Pass-through layer standing in for an attention mechanism.

    No attention is computed: ``call`` returns its inputs unchanged and the
    reported output shape equals the input shape.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        super().build(input_shape)

    def call(self, inputs):
        # Placeholder: an attention mechanism would be implemented here.
        return inputs

    def compute_output_shape(self, input_shape):
        return input_shape


In [None]:
# Pad the English inputs out to the French sequence length, then reshape to
# (rows, French sequence length).
french_timesteps = preproc_french_sentences.shape[1]
tmp_x = pad(preproc_english_sentences, french_timesteps).reshape(
    (-1, preproc_french_sentences.shape[-2]))

# Build the model; each vocabulary size is the word-index length plus one.
model = bd_model(
    tmp_x.shape,
    french_timesteps,
    len(english_tokenizer.word_index) + 1,
    len(french_tokenizer.word_index) + 1,
)
model.summary()

# Train, holding out the last 20% of the rows for validation.
history = model.fit(
    tmp_x,
    preproc_french_sentences,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 14, 256)           1537280   
                                                                 
 bidirectional (Bidirection  (None, 14, 512)           789504    
 al)                                                             
                                                                 
 attention_layer (Attention  (None, 14, 512)           0         
 Layer)                                                          
                                                                 
 time_distributed (TimeDist  (None, 14, 1024)          525312    
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 14, 1024)          0         
                                                       

In [None]:
# Write the trained model to bd_model.h5 (relative to the working directory).
model.save("bd_model.h5")

  saving_api.save_model(


In [None]:
i = 150  # dataset row to inspect
divider = "\n-------------"

print(divider, "\nOriginal text:", eng[i], sep="\n")
print(divider, "\nPrediction:", sep="\n")
# tmp_x[[i]] keeps the leading axis, so predict() receives a one-row batch.
print(logits_to_text(model.predict(tmp_x[[i]])[0], french_tokenizer))
print(divider, "\nCorrect Translation:", fr[i], sep="\n")


-------------

Original text:
I dozed.

-------------

Prediction:
je me suis                      

-------------

Correct Translation:
Je me suis assoupi.


In [None]:
# Pick up to 5 distinct rows at random (fewer when the dataset is smaller,
# where random.sample(..., 5) would raise ValueError).
random_indices = random.sample(range(len(eng)), min(5, len(eng)))

# Predict every sampled row in a single batch instead of calling
# model.predict() once per row inside the loop.
batch_predictions = model.predict(tmp_x[random_indices]) if random_indices else []

# One table row per sample: source sentence, reference translation, prediction.
table_data = [
    [eng[i], fr[i], logits_to_text(logits, french_tokenizer)]
    for i, logits in zip(random_indices, batch_predictions)
]

# Define column headers
headers = ["Original text", "Correct Translation", "Prediction"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+-------------------------+--------------------------------+----------------------------+
| Original text           | Correct Translation            | Prediction                 |
| I'm often in trouble.   | Je suis souvent en difficulté. | je suis souvent de         |
+-------------------------+--------------------------------+----------------------------+
| They're here.           | Ils sont là.                   | ils sont là                |
+-------------------------+--------------------------------+----------------------------+
| It had started to rain. | Il avait commencé à pleuvoir.  | il m'a commencé à pleuvoir |
+-------------------------+--------------------------------+----------------------------+
| They'll let us know.    | Elles nous le feront savoir.   | ils nous nous              |
+-------------------------+--------------------------------+----------------------------+
| Do you feel guilty?     | Vous sentez-vous coupable ?    | vous vous tu coupable      |
+---------

In [None]:
# Try a hand-written input sentence
input_text = "Hello how are you"

# Encode with the English tokenizer, then post-pad to the French sequence length
encoded_input = english_tokenizer.texts_to_sequences([input_text])
input_sequence = pad_sequences(
    encoded_input,
    maxlen=preproc_french_sentences.shape[1],
    padding='post',
)

# Run the model and decode the first (only) row with the French tokenizer
predicted_logits = model.predict(input_sequence)
predicted_translation = logits_to_text(predicted_logits[0], french_tokenizer)

# Show the result
print("Predicted Translation:", predicted_translation, sep="\n")


Predicted Translation:
salut comment êtes tu                    


# Show a custom dataset for fine tuning

In [None]:
# Raw string keeps the Windows backslashes literal. The plain literal only
# worked because \d, \R and \e are not recognised escapes, and such sequences
# trigger an invalid-escape warning on current Python versions.
dfTune = pd.read_csv(r'D:\deep-learning\RN\eng_-french-Tune.csv')
dfTune.head(5)

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [None]:
# Rebind eng/fr to the sentence columns of the tuning dataset.
eng, fr = dfTune['English words/sentences'], dfTune['French words/sentences']


In [None]:
# Count how often each whitespace-separated token occurs in each language.
english_words_counter = collections.Counter(
    word for sentence in eng for word in sentence.split())
french_words_counter = collections.Counter(
    word for sentence in fr for word in sentence.split())

# Report corpus size, vocabulary size and the most frequent tokens.
# Totals come from the counters, so the corpus is not tokenised a second time,
# and an empty corpus prints an empty list instead of raising IndexError.
word_stats = (('English', english_words_counter),
              ('French', french_words_counter))
for position, (language, counter) in enumerate(word_stats):
    if position:
        print()  # blank line between the two language reports
    top_words = [word for word, _ in counter.most_common(10)]
    print(f'{sum(counter.values())} {language} words.')
    print(f'{len(counter)} unique {language} words.')
    print(f'10 Most common words in the {language} dataset:')
    print('"' + '" "'.join(top_words) + '"')

464941 English words.
15788 unique English words.
10 Most common words in the English dataset:
"I" "you" "to" "a" "the" "is" "Tom" "He" "I'm" "was"

520023 French words.
27315 unique French words.
10 Most common words in the French dataset:
"Je" "?" "pas" "de" "que" "ne" "Il" "à" "le" "Tom"


In [None]:
# Re-run preprocessing on the tuning dataset; this refits both tokenizers.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(eng, fr)

In [None]:
# Sequence lengths are the second axis of the padded arrays; vocabulary sizes
# are the number of entries in each tokenizer's word index.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

for label, value in (
        ("Max English sentence length:", max_english_sequence_length),
        ("Max French sentence length:", max_french_sequence_length),
        ("English vocabulary size:", english_vocab_size),
        ("French vocabulary size:", french_vocab_size)):
    print(label, value)

Max English sentence length: 9
Max French sentence length: 15
English vocabulary size: 8907
French vocabulary size: 19582


# Show the code for fine-tuning the model

In [None]:
def bd_model_Tune(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the wider embedding + bidirectional-GRU model.

    This constructs a new ``Sequential`` model; no weights from a previously
    trained model are loaded here.

    input_shape: shape tuple of the padded input array; index 1 is used as
        the input sequence length.
    output_sequence_length: accepted but not used by this function.
    english_vocab_size: input vocabulary size of the embedding layer.
    french_vocab_size: number of units in the final softmax layer.

    Returns the compiled model.
    """
    learning_rate = 0.001

    layers = [
        Embedding(english_vocab_size, 512, input_length=input_shape[1], input_shape=input_shape[1:]),
        Bidirectional(GRU(512, return_sequences=True)),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    model = Sequential(layers)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model


In [None]:
# Pad the English inputs out to the French sequence length, then reshape to
# (rows, French sequence length).
french_timesteps = preproc_french_sentences.shape[1]
tmp_x = pad(preproc_english_sentences, french_timesteps).reshape(
    (-1, preproc_french_sentences.shape[-2]))

# Build the model; each vocabulary size is the word-index length plus one.
model = bd_model_Tune(
    tmp_x.shape,
    french_timesteps,
    len(english_tokenizer.word_index) + 1,
    len(french_tokenizer.word_index) + 1,
)
model.summary()

# Train, holding out the last 30% of the rows for validation.
history = model.fit(
    tmp_x,
    preproc_french_sentences,
    batch_size=64,
    epochs=5,
    validation_split=0.3,
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 512)           4560896   
                                                                 
 bidirectional_1 (Bidirecti  (None, 15, 1024)          3151872   
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 15, 1024)          1049600   
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 15, 1024)          0         
                                                                 
 time_distributed_3 (TimeDi  (None, 15, 19583)         20072575  
 stributed)                                                      
                                                      

In [None]:
# Save under its own file name so this model does not overwrite the
# bd_model.h5 file written for the first model earlier in the notebook.
model.save("bd_model_tune.h5")

  saving_api.save_model(


In [None]:
i = 275  # dataset row to inspect

print("Prediction:")
# tmp_x[[i]] keeps the leading axis, so predict() receives a one-row batch.
print(logits_to_text(model.predict(tmp_x[[i]])[0], french_tokenizer))

print("\nCorrect Translation:", fr[i], sep="\n")

print("\nOriginal text:", eng[i], sep="\n")

Prediction:
bon boulot                          

Correct Translation:
Bien joué !

Original text:
Good job!


In [None]:
# Pick up to 5 distinct rows at random (fewer when the dataset is smaller,
# where random.sample(..., 5) would raise ValueError).
random_indices = random.sample(range(len(eng)), min(5, len(eng)))

# Predict every sampled row in a single batch instead of calling
# model.predict() once per row inside the loop.
batch_predictions = model.predict(tmp_x[random_indices]) if random_indices else []

# One table row per sample: source sentence, reference translation, prediction.
table_data = [
    [eng[i], fr[i], logits_to_text(logits, french_tokenizer)]
    for i, logits in zip(random_indices, batch_predictions)
]

# Define column headers
headers = ["Original text", "Correct Translation", "Prediction"]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+-------------------------------+-----------------------------------+---------------------------+
| Original text                 | Correct Translation               | Prediction                |
| We must carry out that plan.  | Il faut exécuter ce plan.         | nous nous faut ce ce      |
+-------------------------------+-----------------------------------+---------------------------+
| He kept his hat on.           | Il garda son chapeau sur la tête. | il a son son chapeau      |
+-------------------------------+-----------------------------------+---------------------------+
| That could've been prevented. | Cela aurait pu être évité.        | ça aurait pu être         |
+-------------------------------+-----------------------------------+---------------------------+
| What a beautiful scene!       | Quel beau tableau !               | quel belle                |
+-------------------------------+-----------------------------------+---------------------------+
| It's still in good

In [None]:
# Try a hand-written input sentence
input_text = "Hello how are you"

# Encode with the English tokenizer, then post-pad to the French sequence length
encoded_input = english_tokenizer.texts_to_sequences([input_text])
input_sequence = pad_sequences(
    encoded_input,
    maxlen=preproc_french_sentences.shape[1],
    padding='post',
)

# Run the model and decode the first (only) row with the French tokenizer
predicted_logits = model.predict(input_sequence)
predicted_translation = logits_to_text(predicted_logits[0], french_tokenizer)

# Show the result
print("Predicted Translation:", predicted_translation, sep="\n")


Predicted Translation:
bonjour comment vous vous                      


# Compare the performance before and after fine-tuning

**หลังจากทดสอบ** <br/>
**Model ก่อน Fine-Tuning** พบว่า มี Accuracy ที่ดีที่สุดอยู่ที่ accuracy: 0.8290 <br/>
**และหลังจากทำการ Fine-Tuning** accuracy: 0.8346 <br/>
โมเดลสามารถใช้งาน predict ภาษา French จากการ input ภาษาอังกฤษได้แต่อาจมีความคลาดเคลื่อนบ้าง

# Build a model from scratch, train the model with the custom dataset

ทำการเลือก Dataset มาใช้ และทำการ Train model โดยใช้ seq2seq

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Raw string keeps the Windows backslashes literal. The plain literal only
# worked because \d, \R and \e are not recognised escapes, and such sequences
# trigger an invalid-escape warning on current Python versions.
df = pd.read_csv(r'D:\deep-learning\RN\eng_-french-50000.csv')
english_sentences = df["English words/sentences"].tolist()
french_sentences = df["French words/sentences"].tolist()


In [None]:
# Fit one tokenizer per language and encode the sentences with it
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_fr = Tokenizer()
tokenizer_fr.fit_on_texts(french_sentences)

eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)
fr_seq = tokenizer_fr.texts_to_sequences(french_sentences)

# Vocabulary sizes are the word-index length plus one
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_fr = 1 + len(tokenizer_fr.word_index)

# Post-pad both languages to the longest sequence found in either of them
max_length = max(len(seq) for seqs in (eng_seq, fr_seq) for seq in seqs)
eng_seq_padded, fr_seq_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (eng_seq, fr_seq)
)

In [None]:
# Encoder-decoder LSTM model built with the Keras functional API.
embedding_dim = 256
units = 512

# Encoder
# return_state=True makes the LSTM return its final hidden and cell states,
# which are handed to the decoder below.
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
# The decoder embeds its own input with the French vocabulary size, starts
# from the encoder's final states, and emits a softmax over the French
# vocabulary at every position.
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_fr, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_fr, activation='softmax')
output = decoder_dense(decoder_outputs)

# Model
# Two inputs (encoder tokens, decoder tokens) mapped to one output.
model = Model([encoder_inputs, decoder_inputs], output)


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])




In [None]:
# Hold out 20% of the padded sentence pairs for validation.
X_train, X_val, y_train, y_val = train_test_split(
    eng_seq_padded, fr_seq_padded, test_size=0.2)

# NOTE(review): both model inputs (encoder and decoder) receive the same
# English sequences here; the French sequences are used only as targets.
# Confirm this is intended rather than teacher forcing with shifted targets.
model.fit(
    [X_train, X_train],
    y_train,
    validation_data=([X_val, X_val], y_val),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x228d7186810>

In [None]:
# Write the trained seq2seq model to my_model.h5 (relative to the working directory).
model.save("my_model.h5")

  saving_api.save_model(


In [None]:
#model = tf.keras.models.load_model("seq2seq_translation_v3.h5")

In [None]:
def translate_sentence(sentence):
    """Translate one English sentence with the trained seq2seq model.

    The sentence is encoded with ``tokenizer_eng``, post-padded to
    ``max_length`` and passed to the model as both of its inputs. The
    highest-scoring index at each position is mapped back to a word through
    ``tokenizer_fr.index_word``; indices missing from that mapping become a
    single space. The words are returned joined by spaces.
    """
    encoded = tokenizer_eng.texts_to_sequences([sentence])
    model_input = pad_sequences(encoded, maxlen=max_length, padding='post')
    scores = model.predict([model_input, model_input])
    best_indices = np.argmax(scores, axis=-1)[0]

    index_to_word = tokenizer_fr.index_word
    return ' '.join(index_to_word.get(index, ' ') for index in best_indices)

# Translate a sample sentence and show it next to the model output.
input_sentence = "Hi! I am tired ."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated_sentence}", sep="\n")

Input: Hi! I am tired .
Translated: salut je suis                      


In [None]:
import random

def translate_sentence(sentence):
    """Return the model's French output for one English sentence.

    Steps: encode with ``tokenizer_eng``, post-pad to ``max_length``, feed the
    padded array to the model as both inputs, take the arg-max index at each
    position, and map each index to a word via ``tokenizer_fr.index_word``
    (a single space when the index is not in that mapping).
    """
    token_ids = tokenizer_eng.texts_to_sequences([sentence])
    padded_ids = pad_sequences(token_ids, maxlen=max_length, padding="post")
    predicted_ids = np.argmax(model.predict([padded_ids, padded_ids]), axis=-1)

    lookup = tokenizer_fr.index_word
    words = [lookup.get(token_id, " ") for token_id in predicted_ids[0]]
    return " ".join(words)

# Sample from the sentence lists that this section's tokenizers were fitted on
# and its model trained on, rather than from the `eng`/`fr` globals, which at
# this point still hold the tuning CSV loaded earlier in the notebook.
# Take at most 5 rows so a smaller dataset does not raise ValueError.
sample_size = min(5, len(english_sentences))
random_indices = random.sample(range(len(english_sentences)), sample_size)

# Define column headers
headers = ["Original Text", "Correct Translation", "Prediction"]

# One table row per sample: source sentence, reference translation, prediction.
table_data = [
    [
        english_sentences[i],
        french_sentences[i],
        translate_sentence(english_sentences[i]),
    ]
    for i in random_indices
]

# Print the table
print(tabulate(table_data, headers=headers, tablefmt="grid"))

+-------------------------------+-------------------------------------+----------------------+
| Original Text                 | Correct Translation                 | Prediction           |
| Don't stay out all night.     | Ne reste pas dehors toute la nuit ! | ne ne pas pas la la  |
+-------------------------------+-------------------------------------+----------------------+
| I get off there, too.         | Je descends ici aussi.              | je vous à là         |
+-------------------------------+-------------------------------------+----------------------+
| Now is your chance.           | C'est maintenant ta chance.         | maintenant est votre |
+-------------------------------+-------------------------------------+----------------------+
| I've already read that novel. | J'ai déjà lu ce roman.              | je déjà déjà ça ça   |
+-------------------------------+-------------------------------------+----------------------+
| He seems to think so.         | Il semble penser

# Conclusion

หลังจากการทดลองใช้ Model และ สร้างโมเดล<br/>
พบว่า Model ที่มีมาให้ **ให้ค่า Accuracy ดีที่สุดอยู่ที่ 0.8346**
โดยหากเพิ่มจำนวนข้อมูลในการ train ให้มากกว่านี้ ค่า Accuracy ที่ได้ก็จะมากขึ้นตามไปด้วย<br/>
ส่วน Model ที่สร้างขึ้นมาเองก็ใช้ได้ดีเหมือนกัน แต่ไม่ดีเท่า Model ที่มีให้<br/>


**นั่งทำนานมากครับ สามารถ predict จาก Input ได้ด้วย ขอคะแนนพิศวาสหน่อยครับ T T**