In [1]:
#importing necessary libraries
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import numpy as np

batch_size = 64 #batch size for training
epochs = 100 #number of epochs for training
latent_dim = 256 #latent dimentionality of the encoding space
num_samples = 10000 # Number of samples to train on



In [2]:
# Initialize empty lists to store input, output, and unique characters
input_words = []
output_words = []
unique_input_chars = set()
unique_output_chars = set()

# Open the file in read mode ('r')
with open('/kaggle/input/english-to-french/fra.txt', 'r', encoding='utf-8') as file:
    # Read each line in the file
    for line in file:
        # Split the line into English word and French translation
        english_word, french_translation, _ = line.strip().split('\t')

        # Append English word to the input list
        input_words.append(english_word)

        # Modify French translation to include '\t' at the beginning and "\n" at the end
        modified_french_translation = '\t' + french_translation + '\n'
        output_words.append(modified_french_translation)

        # Update unique characters in the input and output
        unique_input_chars.update(set(english_word))
        unique_output_chars.update(set(modified_french_translation))

# Convert unique character sets to lists for easier manipulation
unique_input_chars = list(unique_input_chars)
unique_output_chars = list(unique_output_chars)


In [3]:
input_words = input_words[0:100001]
output_words = output_words[0:100001]

In [4]:
input_characters = sorted(list(unique_input_chars)) #Unique characters in all the english words or input
target_characters = sorted(list(unique_output_chars)) #Unique characters in all the french words or output
num_encoder_tokens = len(input_characters) #total length of unique characters of english words
num_decoder_tokens = len(target_characters) #total length of unique characters of french words
max_encoder_seq_length = max([len(word) for word in input_words])  #checking for the sentence or word with the longest length in english words
max_decoder_seq_length = max([len(word) for word in output_words]) #checking for the sentence or word with the longest length in french words

In [5]:
print(f'Number of samples: {len(input_words)}')
print(f'Number of unique input tokens: {num_encoder_tokens}')
print(f'Number of unique output tokens: {num_decoder_tokens}')
print(f'Max sequence length for inputs; {max_encoder_seq_length}')
print(f'Max sequence length for outputs: {max_decoder_seq_length}')

Number of samples: 100001
Number of unique input tokens: 90
Number of unique output tokens: 115
Max sequence length for inputs; 27
Max sequence length for outputs: 74


**One Hot Representation**

In [12]:
# Creating dictionaries to map characters to one-hot indices and vice versa
input_char_index = dict([(char,i) for i, char in enumerate(input_characters)])
output_char_index = dict([(char,i) for i, char in enumerate(target_characters)])

In [14]:
#one_hot encoding using numpy zeros to get the dimensions
encoder_input_data = np.zeros((len(input_words), max_encoder_seq_length, len(input_characters)), dtype = 'float32')
decoder_input_data = np.zeros((len(output_words), max_decoder_seq_length, len(target_characters)), dtype = 'float32')
decoder_output_data = np.zeros((len(output_words), max_decoder_seq_length, len(target_characters)), dtype = 'float32')

In [17]:
#assigning 1 to characters in english words
for i, (input_word, output_word) in enumerate(zip(input_words, output_words)):
    for t, char in enumerate(input_word):
        encoder_input_data[i, t, input_char_index[char]] = 1.0
    encoder_input_data[i, t+1:, input_char_index[' ']] = 1.0

In [18]:
for i, (input_word, output_word) in enumerate(zip(input_words, output_words)):
    for t, char in enumerate(output_word):
        decoder_input_data[i, t, output_char_index[char]] = 1.0
        if t > 0:
            decoder_output_data[i, t - 1, output_char_index[char]] = 1.0
    decoder_input_data[i, t + 1:, output_char_index[' ']] = 1.0
    decoder_output_data[i, t:, output_char_index[' ']] = 1.0

**Model Training**

In [19]:
#Importing necessary libraries for model training
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input

In [21]:
#Encoder Model
encoder_inputs = Input(shape = (None, len(input_characters)))
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [22]:
#Decoder Model
decoder_inputs = Input(shape = (None, len(target_characters)))
decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
decoder_outputs,_,_ = decoder_lstm(decoder_inputs, initial_state = encoder_states)
decoder_dense = Dense(len(unique_output_chars), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
#model compilation
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer = 'rmsprop',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [24]:
#model training
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=batch_size,
    epochs=50,
    validation_split=0.2  # You can adjust the validation split as needed.
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7abe8deed9f0>

In [26]:
#Saving my model
model.save('my_translation_model.h5')


  saving_api.save_model(


In [27]:
from tensorflow.keras.models import load_model

model = load_model('/kaggle/working/my_translation_model.h5')


In [25]:
reverse_input_char_index = dict([(i,char) for i, char in enumerate(unique_input_chars)])
print(reverse_input_char_index)
reverse_input_char_inde = dict((i,char) for i, char in input_char_index.items())
print(reverse_input_char_inde)
print(input_char_index)

{0: 'n', 1: ';', 2: 'Y', 3: 'ï', 4: 'D', 5: '0', 6: 'ú', 7: 'i', 8: 'a', 9: 'M', 10: '°', 11: '—', 12: '%', 13: '&', 14: '\xad', 15: '.', 16: 'W', 17: 'q', 18: 'z', 19: 'b', 20: 'E', 21: '?', 22: '7', 23: 'v', 24: 'w', 25: 'X', 26: 'N', 27: 'S', 28: ' ', 29: 'L', 30: 'º', 31: '1', 32: '3', 33: 'J', 34: 'A', 35: '\xa0', 36: ':', 37: 'o', 38: 'Z', 39: 'P', 40: 't', 41: 'g', 42: 'd', 43: '/', 44: '–', 45: 'H', 46: 'h', 47: 'u', 48: 'B', 49: 'V', 50: '€', 51: 'j', 52: 'O', 53: '‘', 54: '$', 55: 'x', 56: '"', 57: 'c', 58: '+', 59: 'm', 60: '9', 61: '!', 62: 'I', 63: 'é', 64: 's', 65: 'l', 66: '5', 67: 'C', 68: ',', 69: 'r', 70: 'U', 71: 'p', 72: 'f', 73: '8', 74: 'k', 75: "'", 76: '-', 77: 'y', 78: 'K', 79: 'T', 80: '’', 81: 'R', 82: '2', 83: 'G', 84: '6', 85: 'Q', 86: '₂', 87: 'F', 88: '4', 89: 'e'}
{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, '+': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':':

In [28]:
#Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape = (latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs,initial_state = decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states
)

#Reverse-lookup taken index to decode sequences back to something readable
reverse_input_char_index = dict([(i,char) for i, char in enumerate(input_characters)])
reverse_target_char_index = dict([(i,char) for i, char in enumerate(target_characters)])

def decode_sequence(input_seq):
    #encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    
    #generate empty target sequence of length 1
    target_seq = np.zeros((1,1,num_decoder_tokens))
    #populate the first character of target sequence with the start character
    target_seq[0,0, output_char_index['\t']] = 1
    
    #sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c= decoder_model.predict([target_seq] + states_value)
        
        #sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        
        #Exit condition: either hit max length or find stop character
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
            
        #update the target sequence (of length 1)
        target_seq = np.zeros((1,1, num_decoder_tokens))
        target_seq[0,0, sampled_token_index] = 1
        
        #update states
        states_value = [h,c]
    return decoded_sentence

for seq_index in range(100):
    #take one sequence (part of the training set) for trying out decoding
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sequence = decode_sequence(input_seq)
    print('_')
    print('Input sentence:', input_words[seq_index])
    print('Decoded sentence:', decoded_sequence)
    
        


_
Input sentence: Go.
Decoded sentence: Allez !

_
Input sentence: Go.
Decoded sentence: Allez !

_
Input sentence: Go.
Decoded sentence: Allez !

_
Input sentence: Go.
Decoded sentence: Allez !

_
Input sentence: Hi.
Decoded sentence: Salut.

_
Input sentence: Hi.
Decoded sentence: Salut.

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run!
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.
Decoded sentence: Fuit !

_
Input sentence: Run.

In [76]:
b = 78908
print(input_words[b])
print(output_words[b])

Where can I try this on?
	Où puis-je essayer cela ?



"\tL'as-tu abandonné ?\n"